From 8c3ba334f8588e1d5099f8602cf01897720e0eca Mon Sep 17 00:00:00 2001
From: Sasha Levin <levinsasha928@gmail.com>
Date: Mon, 18 Jul 2011 17:17:15 +0300
Subject: KVM: x86: Raise the hard VCPU count limit

The patch raises the hard limit of VCPU count to 254.

This will allow developers to easily work on scalability
and will allow users to test high VCPU setups easily without
patching the kernel.

To prevent possible issues with current setups, KVM_CAP_NR_VCPUS
now returns the recommended VCPU limit (which is still 64) - this
should be a safe value for everybody, while a new KVM_CAP_MAX_VCPUS
returns the hard limit which is now 254.

Cc: Avi Kivity <avi@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Suggested-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 include/linux/kvm.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index aace6b8691a2..206979877888 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -463,7 +463,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_VAPIC 6
 #define KVM_CAP_EXT_CPUID 7
 #define KVM_CAP_CLOCKSOURCE 8
-#define KVM_CAP_NR_VCPUS 9       /* returns max vcpus per vm */
+#define KVM_CAP_NR_VCPUS 9       /* returns recommended max vcpus per vm */
 #define KVM_CAP_NR_MEMSLOTS 10   /* returns max memory slots per vm */
 #define KVM_CAP_PIT 11
 #define KVM_CAP_NOP_IO_DELAY 12
@@ -553,6 +553,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_SPAPR_TCE 63
 #define KVM_CAP_PPC_SMT 64
 #define KVM_CAP_PPC_RMA	65
+#define KVM_CAP_MAX_VCPUS 66       /* returns max vcpus per vm */
 #define KVM_CAP_S390_GMAP 71
 
 #ifdef KVM_CAP_IRQ_ROUTING
-- 
cgit v1.2.3-59-g8ed1b


From 2b3c246a682c50f5415c71fc5387a114a6f0d643 Mon Sep 17 00:00:00 2001
From: Sasha Levin <levinsasha928@gmail.com>
Date: Wed, 20 Jul 2011 20:59:00 +0300
Subject: KVM: Make coalesced mmio use a device per zone

This patch changes coalesced mmio to create one mmio device per
zone instead of handling all zones in one device.

Doing so enables us to take advantage of existing locking and prevents
a race condition between coalesced mmio registration/unregistration
and lookups.

Suggested-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 include/linux/kvm_host.h  |   5 +-
 virt/kvm/coalesced_mmio.c | 118 ++++++++++++++++++----------------------------
 virt/kvm/coalesced_mmio.h |   7 ++-
 3 files changed, 53 insertions(+), 77 deletions(-)

(limited to 'include')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index eabb21a30c34..ff4d4062af9d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -63,7 +63,7 @@ extern struct kmem_cache *kvm_vcpu_cache;
  */
 struct kvm_io_bus {
 	int                   dev_count;
-#define NR_IOBUS_DEVS 200
+#define NR_IOBUS_DEVS 300
 	struct kvm_io_device *devs[NR_IOBUS_DEVS];
 };
 
@@ -256,8 +256,9 @@ struct kvm {
 	struct kvm_arch arch;
 	atomic_t users_count;
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
-	struct kvm_coalesced_mmio_dev *coalesced_mmio_dev;
 	struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
+	spinlock_t ring_lock;
+	struct list_head coalesced_zones;
 #endif
 
 	struct mutex irq_lock;
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index ae075dc0890d..2316ec1aadc4 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -24,23 +24,13 @@ static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev)
 static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
 				   gpa_t addr, int len)
 {
-	struct kvm_coalesced_mmio_zone *zone;
-	int i;
-
-	/* is it in a batchable area ? */
-
-	for (i = 0; i < dev->nb_zones; i++) {
-		zone = &dev->zone[i];
-
-		/* (addr,len) is fully included in
-		 * (zone->addr, zone->size)
-		 */
+	/* is it in a batchable area ?
+	 * (addr,len) is fully included in
+	 * (zone->addr, zone->size)
+	 */
 
-		if (zone->addr <= addr &&
-		    addr + len <= zone->addr + zone->size)
-			return 1;
-	}
-	return 0;
+	return (dev->zone.addr <= addr &&
+		addr + len <= dev->zone.addr + dev->zone.size);
 }
 
 static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev)
@@ -73,10 +63,10 @@ static int coalesced_mmio_write(struct kvm_io_device *this,
 	if (!coalesced_mmio_in_range(dev, addr, len))
 		return -EOPNOTSUPP;
 
-	spin_lock(&dev->lock);
+	spin_lock(&dev->kvm->ring_lock);
 
 	if (!coalesced_mmio_has_room(dev)) {
-		spin_unlock(&dev->lock);
+		spin_unlock(&dev->kvm->ring_lock);
 		return -EOPNOTSUPP;
 	}
 
@@ -87,7 +77,7 @@ static int coalesced_mmio_write(struct kvm_io_device *this,
 	memcpy(ring->coalesced_mmio[ring->last].data, val, len);
 	smp_wmb();
 	ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX;
-	spin_unlock(&dev->lock);
+	spin_unlock(&dev->kvm->ring_lock);
 	return 0;
 }
 
@@ -95,6 +85,8 @@ static void coalesced_mmio_destructor(struct kvm_io_device *this)
 {
 	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
 
+	list_del(&dev->list);
+
 	kfree(dev);
 }
 
@@ -105,7 +97,6 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = {
 
 int kvm_coalesced_mmio_init(struct kvm *kvm)
 {
-	struct kvm_coalesced_mmio_dev *dev;
 	struct page *page;
 	int ret;
 
@@ -113,31 +104,18 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)
 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (!page)
 		goto out_err;
-	kvm->coalesced_mmio_ring = page_address(page);
 
-	ret = -ENOMEM;
-	dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
-	if (!dev)
-		goto out_free_page;
-	spin_lock_init(&dev->lock);
-	kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
-	dev->kvm = kvm;
-	kvm->coalesced_mmio_dev = dev;
-
-	mutex_lock(&kvm->slots_lock);
-	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev);
-	mutex_unlock(&kvm->slots_lock);
-	if (ret < 0)
-		goto out_free_dev;
+	ret = 0;
+	kvm->coalesced_mmio_ring = page_address(page);
 
-	return ret;
+	/*
+	 * We're using this spinlock to sync access to the coalesced ring.
+	 * The list doesn't need it's own lock since device registration and
+	 * unregistration should only happen when kvm->slots_lock is held.
+	 */
+	spin_lock_init(&kvm->ring_lock);
+	INIT_LIST_HEAD(&kvm->coalesced_zones);
 
-out_free_dev:
-	kvm->coalesced_mmio_dev = NULL;
-	kfree(dev);
-out_free_page:
-	kvm->coalesced_mmio_ring = NULL;
-	__free_page(page);
 out_err:
 	return ret;
 }
@@ -151,51 +129,49 @@ void kvm_coalesced_mmio_free(struct kvm *kvm)
 int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
 					 struct kvm_coalesced_mmio_zone *zone)
 {
-	struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
+	int ret;
+	struct kvm_coalesced_mmio_dev *dev;
 
-	if (dev == NULL)
-		return -ENXIO;
+	dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+
+	kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
+	dev->kvm = kvm;
+	dev->zone = *zone;
 
 	mutex_lock(&kvm->slots_lock);
-	if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) {
-		mutex_unlock(&kvm->slots_lock);
-		return -ENOBUFS;
-	}
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev);
+	if (ret < 0)
+		goto out_free_dev;
+	list_add_tail(&dev->list, &kvm->coalesced_zones);
+	mutex_unlock(&kvm->slots_lock);
 
-	dev->zone[dev->nb_zones] = *zone;
-	dev->nb_zones++;
+	return ret;
 
+out_free_dev:
 	mutex_unlock(&kvm->slots_lock);
+
+	kfree(dev);
+
+	if (dev == NULL)
+		return -ENXIO;
+
 	return 0;
 }
 
 int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
 					   struct kvm_coalesced_mmio_zone *zone)
 {
-	int i;
-	struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
-	struct kvm_coalesced_mmio_zone *z;
-
-	if (dev == NULL)
-		return -ENXIO;
+	struct kvm_coalesced_mmio_dev *dev, *tmp;
 
 	mutex_lock(&kvm->slots_lock);
 
-	i = dev->nb_zones;
-	while (i) {
-		z = &dev->zone[i - 1];
-
-		/* unregister all zones
-		 * included in (zone->addr, zone->size)
-		 */
-
-		if (zone->addr <= z->addr &&
-		    z->addr + z->size <= zone->addr + zone->size) {
-			dev->nb_zones--;
-			*z = dev->zone[dev->nb_zones];
+	list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list)
+		if (coalesced_mmio_in_range(dev, zone->addr, zone->size)) {
+			kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dev->dev);
+			kvm_iodevice_destructor(&dev->dev);
 		}
-		i--;
-	}
 
 	mutex_unlock(&kvm->slots_lock);
 
diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h
index 8a5959e3535f..b280c20444d1 100644
--- a/virt/kvm/coalesced_mmio.h
+++ b/virt/kvm/coalesced_mmio.h
@@ -12,14 +12,13 @@
 
 #ifdef CONFIG_KVM_MMIO
 
-#define KVM_COALESCED_MMIO_ZONE_MAX 100
+#include <linux/list.h>
 
 struct kvm_coalesced_mmio_dev {
+	struct list_head list;
 	struct kvm_io_device dev;
 	struct kvm *kvm;
-	spinlock_t lock;
-	int nb_zones;
-	struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX];
+	struct kvm_coalesced_mmio_zone zone;
 };
 
 int kvm_coalesced_mmio_init(struct kvm *kvm);
-- 
cgit v1.2.3-59-g8ed1b


From 743eeb0b01d2fbf4154bf87bff1ebb6fb18aeb7a Mon Sep 17 00:00:00 2001
From: Sasha Levin <levinsasha928@gmail.com>
Date: Wed, 27 Jul 2011 16:00:48 +0300
Subject: KVM: Intelligent device lookup on I/O bus

Currently the method of dealing with an IO operation on a bus (PIO/MMIO)
is to call the read or write callback for each device registered
on the bus until we find a device which handles it.

Since the number of devices on a bus can be significant due to ioeventfds
and coalesced MMIO zones, this leads to a lot of overhead on each IO
operation.

Instead of registering devices, we now register ranges which points to
a device. Lookup is done using an efficient bsearch instead of a linear
search.

Performance test was conducted by comparing exit count per second with
200 ioeventfds created on one byte and the guest is trying to access a
different byte continuously (triggering usermode exits).
Before the patch the guest has achieved 259k exits per second, after the
patch the guest does 274k exits per second.

Cc: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/i8254.c      |   6 ++-
 arch/x86/kvm/i8259.c      | 108 ++++++++++++++++++++++++++++++++++++--------
 arch/x86/kvm/irq.h        |   4 +-
 arch/x86/kvm/x86.c        |   6 ++-
 include/linux/kvm_host.h  |  18 ++++----
 virt/kvm/coalesced_mmio.c |   3 +-
 virt/kvm/eventfd.c        |   3 +-
 virt/kvm/ioapic.c         |   3 +-
 virt/kvm/kvm_main.c       | 112 +++++++++++++++++++++++++++++++++++++++++-----
 9 files changed, 216 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index efad72385058..76e3f1cd0369 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -713,14 +713,16 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 	kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
 
 	kvm_iodevice_init(&pit->dev, &pit_dev_ops);
-	ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev);
+	ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
+				      KVM_PIT_MEM_LENGTH, &pit->dev);
 	if (ret < 0)
 		goto fail;
 
 	if (flags & KVM_PIT_SPEAKER_DUMMY) {
 		kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
 		ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
-						&pit->speaker_dev);
+					      KVM_SPEAKER_BASE_ADDRESS, 4,
+					      &pit->speaker_dev);
 		if (ret < 0)
 			goto fail_unregister;
 	}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 19fe855e7953..6b869ce0cc19 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -459,15 +459,9 @@ static int picdev_in_range(gpa_t addr)
 	}
 }
 
-static inline struct kvm_pic *to_pic(struct kvm_io_device *dev)
-{
-	return container_of(dev, struct kvm_pic, dev);
-}
-
-static int picdev_write(struct kvm_io_device *this,
+static int picdev_write(struct kvm_pic *s,
 			 gpa_t addr, int len, const void *val)
 {
-	struct kvm_pic *s = to_pic(this);
 	unsigned char data = *(unsigned char *)val;
 	if (!picdev_in_range(addr))
 		return -EOPNOTSUPP;
@@ -494,10 +488,9 @@ static int picdev_write(struct kvm_io_device *this,
 	return 0;
 }
 
-static int picdev_read(struct kvm_io_device *this,
+static int picdev_read(struct kvm_pic *s,
 		       gpa_t addr, int len, void *val)
 {
-	struct kvm_pic *s = to_pic(this);
 	unsigned char data = 0;
 	if (!picdev_in_range(addr))
 		return -EOPNOTSUPP;
@@ -525,6 +518,48 @@ static int picdev_read(struct kvm_io_device *this,
 	return 0;
 }
 
+static int picdev_master_write(struct kvm_io_device *dev,
+			       gpa_t addr, int len, const void *val)
+{
+	return picdev_write(container_of(dev, struct kvm_pic, dev_master),
+			    addr, len, val);
+}
+
+static int picdev_master_read(struct kvm_io_device *dev,
+			      gpa_t addr, int len, void *val)
+{
+	return picdev_read(container_of(dev, struct kvm_pic, dev_master),
+			    addr, len, val);
+}
+
+static int picdev_slave_write(struct kvm_io_device *dev,
+			      gpa_t addr, int len, const void *val)
+{
+	return picdev_write(container_of(dev, struct kvm_pic, dev_slave),
+			    addr, len, val);
+}
+
+static int picdev_slave_read(struct kvm_io_device *dev,
+			     gpa_t addr, int len, void *val)
+{
+	return picdev_read(container_of(dev, struct kvm_pic, dev_slave),
+			    addr, len, val);
+}
+
+static int picdev_eclr_write(struct kvm_io_device *dev,
+			     gpa_t addr, int len, const void *val)
+{
+	return picdev_write(container_of(dev, struct kvm_pic, dev_eclr),
+			    addr, len, val);
+}
+
+static int picdev_eclr_read(struct kvm_io_device *dev,
+			    gpa_t addr, int len, void *val)
+{
+	return picdev_read(container_of(dev, struct kvm_pic, dev_eclr),
+			    addr, len, val);
+}
+
 /*
  * callback when PIC0 irq status changed
  */
@@ -537,9 +572,19 @@ static void pic_irq_request(struct kvm *kvm, int level)
 	s->output = level;
 }
 
-static const struct kvm_io_device_ops picdev_ops = {
-	.read     = picdev_read,
-	.write    = picdev_write,
+static const struct kvm_io_device_ops picdev_master_ops = {
+	.read     = picdev_master_read,
+	.write    = picdev_master_write,
+};
+
+static const struct kvm_io_device_ops picdev_slave_ops = {
+	.read     = picdev_slave_read,
+	.write    = picdev_slave_write,
+};
+
+static const struct kvm_io_device_ops picdev_eclr_ops = {
+	.read     = picdev_eclr_read,
+	.write    = picdev_eclr_write,
 };
 
 struct kvm_pic *kvm_create_pic(struct kvm *kvm)
@@ -560,16 +605,39 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
 	/*
 	 * Initialize PIO device
 	 */
-	kvm_iodevice_init(&s->dev, &picdev_ops);
+	kvm_iodevice_init(&s->dev_master, &picdev_master_ops);
+	kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops);
+	kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops);
 	mutex_lock(&kvm->slots_lock);
-	ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev);
+	ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2,
+				      &s->dev_master);
+	if (ret < 0)
+		goto fail_unlock;
+
+	ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave);
+	if (ret < 0)
+		goto fail_unreg_2;
+
+	ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr);
+	if (ret < 0)
+		goto fail_unreg_1;
+
 	mutex_unlock(&kvm->slots_lock);
-	if (ret < 0) {
-		kfree(s);
-		return NULL;
-	}
 
 	return s;
+
+fail_unreg_1:
+	kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave);
+
+fail_unreg_2:
+	kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master);
+
+fail_unlock:
+	mutex_unlock(&kvm->slots_lock);
+
+	kfree(s);
+
+	return NULL;
 }
 
 void kvm_destroy_pic(struct kvm *kvm)
@@ -577,7 +645,9 @@ void kvm_destroy_pic(struct kvm *kvm)
 	struct kvm_pic *vpic = kvm->arch.vpic;
 
 	if (vpic) {
-		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev);
+		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_master);
+		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_slave);
+		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_eclr);
 		kvm->arch.vpic = NULL;
 		kfree(vpic);
 	}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 53e2d084bffb..2086f2bfba33 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -66,7 +66,9 @@ struct kvm_pic {
 	struct kvm *kvm;
 	struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
 	int output;		/* intr from master PIC */
-	struct kvm_io_device dev;
+	struct kvm_io_device dev_master;
+	struct kvm_io_device dev_slave;
+	struct kvm_io_device dev_eclr;
 	void (*ack_notifier)(void *opaque, int irq);
 	unsigned long irq_states[16];
 };
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6cb353c83a12..d28dff749dfd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3562,7 +3562,11 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			if (r) {
 				mutex_lock(&kvm->slots_lock);
 				kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
-							  &vpic->dev);
+							  &vpic->dev_master);
+				kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+							  &vpic->dev_slave);
+				kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+							  &vpic->dev_eclr);
 				mutex_unlock(&kvm->slots_lock);
 				kfree(vpic);
 				goto create_irqchip_unlock;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ff4d4062af9d..d0e42f30edf6 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -55,16 +55,16 @@ struct kvm;
 struct kvm_vcpu;
 extern struct kmem_cache *kvm_vcpu_cache;
 
-/*
- * It would be nice to use something smarter than a linear search, TBD...
- * Thankfully we dont expect many devices to register (famous last words :),
- * so until then it will suffice.  At least its abstracted so we can change
- * in one place.
- */
+struct kvm_io_range {
+	gpa_t addr;
+	int len;
+	struct kvm_io_device *dev;
+};
+
 struct kvm_io_bus {
 	int                   dev_count;
 #define NR_IOBUS_DEVS 300
-	struct kvm_io_device *devs[NR_IOBUS_DEVS];
+	struct kvm_io_range range[NR_IOBUS_DEVS];
 };
 
 enum kvm_bus {
@@ -77,8 +77,8 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val);
 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len,
 		    void *val);
-int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-			    struct kvm_io_device *dev);
+int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+			    int len, struct kvm_io_device *dev);
 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 			      struct kvm_io_device *dev);
 
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 2316ec1aadc4..a6ec206f36ba 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -141,7 +141,8 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
 	dev->zone = *zone;
 
 	mutex_lock(&kvm->slots_lock);
-	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev);
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, zone->addr,
+				      zone->size, &dev->dev);
 	if (ret < 0)
 		goto out_free_dev;
 	list_add_tail(&dev->list, &kvm->coalesced_zones);
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 73358d256fa2..f59c1e8de7a2 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -586,7 +586,8 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 
 	kvm_iodevice_init(&p->dev, &ioeventfd_ops);
 
-	ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev);
+	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
+				      &p->dev);
 	if (ret < 0)
 		goto unlock_fail;
 
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 8df1ca104a7f..3eed61eb4867 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -394,7 +394,8 @@ int kvm_ioapic_init(struct kvm *kvm)
 	kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
 	ioapic->kvm = kvm;
 	mutex_lock(&kvm->slots_lock);
-	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address,
+				      IOAPIC_MEM_LENGTH, &ioapic->dev);
 	mutex_unlock(&kvm->slots_lock);
 	if (ret < 0) {
 		kvm->arch.vioapic = NULL;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index aefdda390f5e..d9cfb782cb81 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -47,6 +47,8 @@
 #include <linux/srcu.h>
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/bsearch.h>
 
 #include <asm/processor.h>
 #include <asm/io.h>
@@ -2391,24 +2393,92 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 	int i;
 
 	for (i = 0; i < bus->dev_count; i++) {
-		struct kvm_io_device *pos = bus->devs[i];
+		struct kvm_io_device *pos = bus->range[i].dev;
 
 		kvm_iodevice_destructor(pos);
 	}
 	kfree(bus);
 }
 
+int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
+{
+	const struct kvm_io_range *r1 = p1;
+	const struct kvm_io_range *r2 = p2;
+
+	if (r1->addr < r2->addr)
+		return -1;
+	if (r1->addr + r1->len > r2->addr + r2->len)
+		return 1;
+	return 0;
+}
+
+int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
+			  gpa_t addr, int len)
+{
+	if (bus->dev_count == NR_IOBUS_DEVS)
+		return -ENOSPC;
+
+	bus->range[bus->dev_count++] = (struct kvm_io_range) {
+		.addr = addr,
+		.len = len,
+		.dev = dev,
+	};
+
+	sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range),
+		kvm_io_bus_sort_cmp, NULL);
+
+	return 0;
+}
+
+int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
+			     gpa_t addr, int len)
+{
+	struct kvm_io_range *range, key;
+	int off;
+
+	key = (struct kvm_io_range) {
+		.addr = addr,
+		.len = len,
+	};
+
+	range = bsearch(&key, bus->range, bus->dev_count,
+			sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
+	if (range == NULL)
+		return -ENOENT;
+
+	off = range - bus->range;
+
+	while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0)
+		off--;
+
+	return off;
+}
+
 /* kvm_io_bus_write - called under kvm->slots_lock */
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val)
 {
-	int i;
+	int idx;
 	struct kvm_io_bus *bus;
+	struct kvm_io_range range;
+
+	range = (struct kvm_io_range) {
+		.addr = addr,
+		.len = len,
+	};
 
 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	for (i = 0; i < bus->dev_count; i++)
-		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
+	idx = kvm_io_bus_get_first_dev(bus, addr, len);
+	if (idx < 0)
+		return -EOPNOTSUPP;
+
+	while (idx < bus->dev_count &&
+		kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
+		if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
 			return 0;
+		idx++;
+	}
+
 	return -EOPNOTSUPP;
 }
 
@@ -2416,19 +2486,33 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		    int len, void *val)
 {
-	int i;
+	int idx;
 	struct kvm_io_bus *bus;
+	struct kvm_io_range range;
+
+	range = (struct kvm_io_range) {
+		.addr = addr,
+		.len = len,
+	};
 
 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	for (i = 0; i < bus->dev_count; i++)
-		if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
+	idx = kvm_io_bus_get_first_dev(bus, addr, len);
+	if (idx < 0)
+		return -EOPNOTSUPP;
+
+	while (idx < bus->dev_count &&
+		kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
+		if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
 			return 0;
+		idx++;
+	}
+
 	return -EOPNOTSUPP;
 }
 
 /* Caller must hold slots_lock. */
-int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-			    struct kvm_io_device *dev)
+int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+			    int len, struct kvm_io_device *dev)
 {
 	struct kvm_io_bus *new_bus, *bus;
 
@@ -2440,7 +2524,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 	if (!new_bus)
 		return -ENOMEM;
 	memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
-	new_bus->devs[new_bus->dev_count++] = dev;
+	kvm_io_bus_insert_dev(new_bus, dev, addr, len);
 	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
 	synchronize_srcu_expedited(&kvm->srcu);
 	kfree(bus);
@@ -2464,9 +2548,13 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 
 	r = -ENOENT;
 	for (i = 0; i < new_bus->dev_count; i++)
-		if (new_bus->devs[i] == dev) {
+		if (new_bus->range[i].dev == dev) {
 			r = 0;
-			new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
+			new_bus->dev_count--;
+			new_bus->range[i] = new_bus->range[new_bus->dev_count];
+			sort(new_bus->range, new_bus->dev_count,
+			     sizeof(struct kvm_io_range),
+			     kvm_io_bus_sort_cmp, NULL);
 			break;
 		}
 
-- 
cgit v1.2.3-59-g8ed1b


From a15bd354f083f20f257db450488db52ac27df439 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 8 Aug 2011 17:17:09 +0200
Subject: KVM: PPC: Add support for explicit HIOR setting

Until now, we always set HIOR based on the PVR, but this is just wrong.
Instead, we should be setting HIOR explicitly, so user space can decide
what the initial HIOR value is - just like on real hardware.

We keep the old PVR based way around for backwards compatibility, but
once user space uses the SREGS based method, we drop the PVR logic.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm.h        |  8 ++++++++
 arch/powerpc/include/asm/kvm_book3s.h |  2 ++
 arch/powerpc/kvm/book3s_pr.c          | 14 ++++++++++++--
 arch/powerpc/kvm/powerpc.c            |  1 +
 include/linux/kvm.h                   |  1 +
 5 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index a4f6c85431f8..a6a253ee81bb 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -148,6 +148,12 @@ struct kvm_regs {
 #define KVM_SREGS_E_UPDATE_DEC		(1 << 2)
 #define KVM_SREGS_E_UPDATE_DBSR		(1 << 3)
 
+/*
+ * Book3S special bits to indicate contents in the struct by maintaining
+ * backwards compatibility with older structs. If adding a new field,
+ * please make sure to add a flag for that new field */
+#define KVM_SREGS_S_HIOR		(1 << 0)
+
 /*
  * In KVM_SET_SREGS, reserved/pad fields must be left untouched from a
  * previous KVM_GET_REGS.
@@ -173,6 +179,8 @@ struct kvm_sregs {
 				__u64 ibat[8]; 
 				__u64 dbat[8]; 
 			} ppc32;
+			__u64 flags; /* KVM_SREGS_S_ */
+			__u64 hior;
 		} s;
 		struct {
 			union {
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 37dd7486627b..472437b7b85d 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -90,6 +90,8 @@ struct kvmppc_vcpu_book3s {
 #endif
 	int context_id[SID_CONTEXTS];
 
+	bool hior_sregs;		/* HIOR is set by SREGS, not PVR */
+
 	struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
 	struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
 	struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 0c0d3f274437..78dcf659e120 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -150,13 +150,15 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
 #ifdef CONFIG_PPC_BOOK3S_64
 	if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
 		kvmppc_mmu_book3s_64_init(vcpu);
-		to_book3s(vcpu)->hior = 0xfff00000;
+		if (!to_book3s(vcpu)->hior_sregs)
+			to_book3s(vcpu)->hior = 0xfff00000;
 		to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
 	} else
 #endif
 	{
 		kvmppc_mmu_book3s_32_init(vcpu);
-		to_book3s(vcpu)->hior = 0;
+		if (!to_book3s(vcpu)->hior_sregs)
+			to_book3s(vcpu)->hior = 0;
 		to_book3s(vcpu)->msr_mask = 0xffffffffULL;
 	}
 
@@ -770,6 +772,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 		}
 	}
 
+	if (sregs->u.s.flags & KVM_SREGS_S_HIOR)
+		sregs->u.s.hior = to_book3s(vcpu)->hior;
+
 	return 0;
 }
 
@@ -806,6 +811,11 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	/* Flush the MMU after messing with the segments */
 	kvmppc_mmu_pte_flush(vcpu, 0, 0);
 
+	if (sregs->u.s.flags & KVM_SREGS_S_HIOR) {
+		to_book3s(vcpu)->hior_sregs = true;
+		to_book3s(vcpu)->hior = sregs->u.s.hior;
+	}
+
 	return 0;
 }
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index a107c9be0fb1..17a5c83e1ccc 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -188,6 +188,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PPC_BOOKE_SREGS:
 #else
 	case KVM_CAP_PPC_SEGSTATE:
+	case KVM_CAP_PPC_HIOR:
 #endif
 	case KVM_CAP_PPC_UNSET_IRQ:
 	case KVM_CAP_PPC_IRQ_LEVEL:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 206979877888..490b041aba45 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -554,6 +554,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_SMT 64
 #define KVM_CAP_PPC_RMA	65
 #define KVM_CAP_MAX_VCPUS 66       /* returns max vcpus per vm */
+#define KVM_CAP_PPC_HIOR 67
 #define KVM_CAP_S390_GMAP 71
 
 #ifdef KVM_CAP_IRQ_ROUTING
-- 
cgit v1.2.3-59-g8ed1b


From 930b412a005bde2ea3f05911eaaeeb10f11d79ab Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 8 Aug 2011 17:29:42 +0200
Subject: KVM: PPC: Enable the PAPR CAP for Book3S

Now that Book3S PV mode can also run PAPR guests, we can add a PAPR cap and
enable it for all Book3S targets. Enabling that CAP switches KVM into PAPR
mode.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kvm/powerpc.c | 5 +++++
 include/linux/kvm.h        | 1 +
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 17a5c83e1ccc..13bc798a4441 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -189,6 +189,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 #else
 	case KVM_CAP_PPC_SEGSTATE:
 	case KVM_CAP_PPC_HIOR:
+	case KVM_CAP_PPC_PAPR:
 #endif
 	case KVM_CAP_PPC_UNSET_IRQ:
 	case KVM_CAP_PPC_IRQ_LEVEL:
@@ -572,6 +573,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 		r = 0;
 		vcpu->arch.osi_enabled = true;
 		break;
+	case KVM_CAP_PPC_PAPR:
+		r = 0;
+		vcpu->arch.papr_enabled = true;
+		break;
 	default:
 		r = -EINVAL;
 		break;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 490b041aba45..68840544006d 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -555,6 +555,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_RMA	65
 #define KVM_CAP_MAX_VCPUS 66       /* returns max vcpus per vm */
 #define KVM_CAP_PPC_HIOR 67
+#define KVM_CAP_PPC_PAPR 68
 #define KVM_CAP_S390_GMAP 71
 
 #ifdef KVM_CAP_IRQ_ROUTING
-- 
cgit v1.2.3-59-g8ed1b


From bd80158aff71a80292f96d9baea1a65bc0ce87b3 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Mon, 12 Sep 2011 11:26:22 +0200
Subject: KVM: Clean up and extend rate-limited output

The use of printk_ratelimit is discouraged, replace it with
pr*_ratelimited or __ratelimit. While at it, convert remaining
guest-triggerable printks to rate-limited variants.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/i8259.c     | 15 ++++++++-------
 arch/x86/kvm/mmu_audit.c |  6 +++---
 arch/x86/kvm/vmx.c       | 13 ++++++-------
 include/linux/kvm_host.h |  8 +++-----
 4 files changed, 20 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 6b869ce0cc19..cac4746d7ffb 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -34,6 +34,9 @@
 #include <linux/kvm_host.h>
 #include "trace.h"
 
+#define pr_pic_unimpl(fmt, ...)	\
+	pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__)
+
 static void pic_irq_request(struct kvm *kvm, int level);
 
 static void pic_lock(struct kvm_pic *s)
@@ -306,10 +309,10 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 			}
 			s->init_state = 1;
 			if (val & 0x02)
-				printk(KERN_ERR "single mode not supported");
+				pr_pic_unimpl("single mode not supported");
 			if (val & 0x08)
-				printk(KERN_ERR
-				       "level sensitive irq not supported");
+				pr_pic_unimpl(
+					"level sensitive irq not supported");
 		} else if (val & 0x08) {
 			if (val & 0x04)
 				s->poll = 1;
@@ -467,8 +470,7 @@ static int picdev_write(struct kvm_pic *s,
 		return -EOPNOTSUPP;
 
 	if (len != 1) {
-		if (printk_ratelimit())
-			printk(KERN_ERR "PIC: non byte write\n");
+		pr_pic_unimpl("non byte write\n");
 		return 0;
 	}
 	pic_lock(s);
@@ -496,8 +498,7 @@ static int picdev_read(struct kvm_pic *s,
 		return -EOPNOTSUPP;
 
 	if (len != 1) {
-		if (printk_ratelimit())
-			printk(KERN_ERR "PIC: non byte read\n");
+		pr_pic_unimpl("non byte read\n");
 		return 0;
 	}
 	pic_lock(s);
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 2460a265be23..746ec259d024 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -121,16 +121,16 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 
 static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 {
+	static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
 	unsigned long *rmapp;
 	struct kvm_mmu_page *rev_sp;
 	gfn_t gfn;
 
-
 	rev_sp = page_header(__pa(sptep));
 	gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
 
 	if (!gfn_to_memslot(kvm, gfn)) {
-		if (!printk_ratelimit())
+		if (!__ratelimit(&ratelimit_state))
 			return;
 		audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
 		audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
@@ -141,7 +141,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 
 	rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
 	if (!*rmapp) {
-		if (!printk_ratelimit())
+		if (!__ratelimit(&ratelimit_state))
 			return;
 		audit_printk(kvm, "no rmap for writable spte %llx\n",
 			     *sptep);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 21217b65b129..a0d6bd9ad442 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2762,8 +2762,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
 
 	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
 	if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
-		printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
-		       __func__);
+		pr_debug_ratelimited("%s: tss fixup for long mode. \n",
+				     __func__);
 		vmcs_write32(GUEST_TR_AR_BYTES,
 			     (guest_tr_ar & ~AR_TYPE_MASK)
 			     | AR_TYPE_BUSY_64_TSS);
@@ -5634,8 +5634,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 		return 0;
 
 	if (unlikely(vmx->fail)) {
-		printk(KERN_INFO "%s failed vm entry %x\n",
-		       __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+		pr_info_ratelimited("%s failed vm entry %x\n", __func__,
+				    vmcs_read32(VM_INSTRUCTION_ERROR));
 		return 1;
 	}
 
@@ -6612,9 +6612,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	if (vmcs12->vm_entry_msr_load_count > 0 ||
 	    vmcs12->vm_exit_msr_load_count > 0 ||
 	    vmcs12->vm_exit_msr_store_count > 0) {
-		if (printk_ratelimit())
-			printk(KERN_WARNING
-			  "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__);
+		pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n",
+				    __func__);
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 		return 1;
 	}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d0e42f30edf6..2a414f66af28 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -18,6 +18,7 @@
 #include <linux/msi.h>
 #include <linux/slab.h>
 #include <linux/rcupdate.h>
+#include <linux/ratelimit.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
@@ -282,11 +283,8 @@ struct kvm {
 
 /* The guest did something we don't support. */
 #define pr_unimpl(vcpu, fmt, ...)					\
- do {									\
-	if (printk_ratelimit())						\
-		printk(KERN_ERR "kvm: %i: cpu%i " fmt,			\
-		       current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
- } while (0)
+	pr_err_ratelimited("kvm: %i: cpu%i " fmt,			\
+			   current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__)
 
 #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
 #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
-- 
cgit v1.2.3-59-g8ed1b


From 7460fb4a340033107530df19e7e125bd0969bfb2 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 20 Sep 2011 13:43:14 +0300
Subject: KVM: Fix simultaneous NMIs

If simultaneous NMIs happen, we're supposed to queue the second
and next (collapsing them), but currently we sometimes collapse
the second into the first.

Fix by using a counter for pending NMIs instead of a bool; since
the counter limit depends on whether the processor is currently
in an NMI handler, which can only be checked in vcpu context
(via the NMI mask), we add a new KVM_REQ_NMI to request recalculation
of the counter.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  5 +++--
 arch/x86/kvm/x86.c              | 48 ++++++++++++++++++++++++++---------------
 include/linux/kvm_host.h        |  1 +
 3 files changed, 35 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6ab4241c27cb..ab62711ccb78 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -413,8 +413,9 @@ struct kvm_vcpu_arch {
 	u32  tsc_catchup_mult;
 	s8   tsc_catchup_shift;
 
-	bool nmi_pending;
-	bool nmi_injected;
+	atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
+	unsigned nmi_pending; /* NMI queued after currently running handler */
+	bool nmi_injected;    /* Trying to inject an NMI this entry */
 
 	struct mtrr_state_type mtrr_state;
 	u32 pat;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6b37f18a1663..d51e40733fcb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -83,6 +83,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 				    struct kvm_cpuid_entry2 __user *entries);
+static void process_nmi(struct kvm_vcpu *vcpu);
 
 struct kvm_x86_ops *kvm_x86_ops;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -359,8 +360,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 {
-	kvm_make_request(KVM_REQ_EVENT, vcpu);
-	vcpu->arch.nmi_pending = 1;
+	atomic_inc(&vcpu->arch.nmi_queued);
+	kvm_make_request(KVM_REQ_NMI, vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 
@@ -2827,6 +2828,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 					       struct kvm_vcpu_events *events)
 {
+	process_nmi(vcpu);
 	events->exception.injected =
 		vcpu->arch.exception.pending &&
 		!kvm_exception_is_soft(vcpu->arch.exception.nr);
@@ -2844,7 +2846,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 			KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
 
 	events->nmi.injected = vcpu->arch.nmi_injected;
-	events->nmi.pending = vcpu->arch.nmi_pending;
+	events->nmi.pending = vcpu->arch.nmi_pending != 0;
 	events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
 	events->nmi.pad = 0;
 
@@ -2864,6 +2866,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 			      | KVM_VCPUEVENT_VALID_SHADOW))
 		return -EINVAL;
 
+	process_nmi(vcpu);
 	vcpu->arch.exception.pending = events->exception.injected;
 	vcpu->arch.exception.nr = events->exception.nr;
 	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -4763,7 +4766,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
 	kvm_set_rflags(vcpu, ctxt->eflags);
 
 	if (irq == NMI_VECTOR)
-		vcpu->arch.nmi_pending = false;
+		vcpu->arch.nmi_pending = 0;
 	else
 		vcpu->arch.interrupt.pending = false;
 
@@ -5572,7 +5575,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
 	/* try to inject new event if pending */
 	if (vcpu->arch.nmi_pending) {
 		if (kvm_x86_ops->nmi_allowed(vcpu)) {
-			vcpu->arch.nmi_pending = false;
+			--vcpu->arch.nmi_pending;
 			vcpu->arch.nmi_injected = true;
 			kvm_x86_ops->set_nmi(vcpu);
 		}
@@ -5604,10 +5607,26 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void process_nmi(struct kvm_vcpu *vcpu)
+{
+	unsigned limit = 2;
+
+	/*
+	 * x86 is limited to one NMI running, and one NMI pending after it.
+	 * If an NMI is already in progress, limit further NMIs to just one.
+	 * Otherwise, allow two (and we'll inject the first one immediately).
+	 */
+	if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
+		limit = 1;
+
+	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
+	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
-	bool nmi_pending;
 	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
 		vcpu->run->request_interrupt_window;
 
@@ -5647,6 +5666,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		}
 		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
 			record_steal_time(vcpu);
+		if (kvm_check_request(KVM_REQ_NMI, vcpu))
+			process_nmi(vcpu);
 
 	}
 
@@ -5654,19 +5675,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (unlikely(r))
 		goto out;
 
-	/*
-	 * An NMI can be injected between local nmi_pending read and
-	 * vcpu->arch.nmi_pending read inside inject_pending_event().
-	 * But in that case, KVM_REQ_EVENT will be set, which makes
-	 * the race described above benign.
-	 */
-	nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending);
-
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
 		inject_pending_event(vcpu);
 
 		/* enable NMI/IRQ window open exits if needed */
-		if (nmi_pending)
+		if (vcpu->arch.nmi_pending)
 			kvm_x86_ops->enable_nmi_window(vcpu);
 		else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
 			kvm_x86_ops->enable_irq_window(vcpu);
@@ -6374,7 +6387,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.nmi_pending = false;
+	atomic_set(&vcpu->arch.nmi_queued, 0);
+	vcpu->arch.nmi_pending = 0;
 	vcpu->arch.nmi_injected = false;
 
 	vcpu->arch.switch_db_regs = 0;
@@ -6649,7 +6663,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 		!vcpu->arch.apf.halted)
 		|| !list_empty_careful(&vcpu->async_pf.done)
 		|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
-		|| vcpu->arch.nmi_pending ||
+		|| atomic_read(&vcpu->arch.nmi_queued) ||
 		(kvm_arch_interrupt_allowed(vcpu) &&
 		 kvm_cpu_has_interrupt(vcpu));
 }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2a414f66af28..d52623199978 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -49,6 +49,7 @@
 #define KVM_REQ_EVENT             11
 #define KVM_REQ_APF_HALT          12
 #define KVM_REQ_STEAL_UPDATE      13
+#define KVM_REQ_NMI               14
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID	0
 
-- 
cgit v1.2.3-59-g8ed1b


From 7697e71f72b45a1bd0abe70918c383100fcc8514 Mon Sep 17 00:00:00 2001
From: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Date: Tue, 18 Oct 2011 12:27:15 +0200
Subject: KVM: s390: implement sigp external call

Implement sigp external call, which might be required for guests that
issue an external call instead of an emergency signal for IPI.

This fixes an issue with "KVM: unknown SIGP: 0x02" when booting
such an SMP guest.

Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/s390/include/asm/kvm_host.h |  7 +++++++
 arch/s390/kvm/interrupt.c        | 30 +++++++++++++++++++++++++++
 arch/s390/kvm/kvm-s390.c         |  2 ++
 arch/s390/kvm/sigp.c             | 45 +++++++++++++++++++++++++++++++++++++++-
 include/linux/kvm.h              |  1 +
 5 files changed, 84 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 00ff00dfb24c..1ca5de07ac36 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -119,6 +119,7 @@ struct kvm_vcpu_stat {
 	u32 instruction_lctlg;
 	u32 exit_program_interruption;
 	u32 exit_instr_and_program;
+	u32 deliver_external_call;
 	u32 deliver_emergency_signal;
 	u32 deliver_service_signal;
 	u32 deliver_virtio_interrupt;
@@ -138,6 +139,7 @@ struct kvm_vcpu_stat {
 	u32 instruction_stfl;
 	u32 instruction_tprot;
 	u32 instruction_sigp_sense;
+	u32 instruction_sigp_external_call;
 	u32 instruction_sigp_emergency;
 	u32 instruction_sigp_stop;
 	u32 instruction_sigp_arch;
@@ -174,6 +176,10 @@ struct kvm_s390_prefix_info {
 	__u32 address;
 };
 
+struct kvm_s390_extcall_info {
+	__u16 code;
+};
+
 struct kvm_s390_emerg_info {
 	__u16 code;
 };
@@ -186,6 +192,7 @@ struct kvm_s390_interrupt_info {
 		struct kvm_s390_ext_info ext;
 		struct kvm_s390_pgm_info pgm;
 		struct kvm_s390_emerg_info emerg;
+		struct kvm_s390_extcall_info extcall;
 		struct kvm_s390_prefix_info prefix;
 	};
 };
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index c9aeb4b4d0b8..87c16705b381 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -38,6 +38,11 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
 				      struct kvm_s390_interrupt_info *inti)
 {
 	switch (inti->type) {
+	case KVM_S390_INT_EXTERNAL_CALL:
+		if (psw_extint_disabled(vcpu))
+			return 0;
+		if (vcpu->arch.sie_block->gcr[0] & 0x2000ul)
+			return 1;
 	case KVM_S390_INT_EMERGENCY:
 		if (psw_extint_disabled(vcpu))
 			return 0;
@@ -98,6 +103,7 @@ static void __set_intercept_indicator(struct kvm_vcpu *vcpu,
 				      struct kvm_s390_interrupt_info *inti)
 {
 	switch (inti->type) {
+	case KVM_S390_INT_EXTERNAL_CALL:
 	case KVM_S390_INT_EMERGENCY:
 	case KVM_S390_INT_SERVICE:
 	case KVM_S390_INT_VIRTIO:
@@ -143,6 +149,28 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
 			exception = 1;
 		break;
 
+	case KVM_S390_INT_EXTERNAL_CALL:
+		VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call");
+		vcpu->stat.deliver_external_call++;
+		rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202);
+		if (rc == -EFAULT)
+			exception = 1;
+
+		rc = put_guest_u16(vcpu, __LC_CPU_ADDRESS, inti->extcall.code);
+		if (rc == -EFAULT)
+			exception = 1;
+
+		rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+			 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+		if (rc == -EFAULT)
+			exception = 1;
+
+		rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+			__LC_EXT_NEW_PSW, sizeof(psw_t));
+		if (rc == -EFAULT)
+			exception = 1;
+		break;
+
 	case KVM_S390_INT_SERVICE:
 		VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x",
 			   inti->ext.ext_params);
@@ -522,6 +550,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
 		break;
 	case KVM_S390_PROGRAM_INT:
 	case KVM_S390_SIGP_STOP:
+	case KVM_S390_INT_EXTERNAL_CALL:
 	case KVM_S390_INT_EMERGENCY:
 	default:
 		kfree(inti);
@@ -581,6 +610,7 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
 		break;
 	case KVM_S390_SIGP_STOP:
 	case KVM_S390_RESTART:
+	case KVM_S390_INT_EXTERNAL_CALL:
 	case KVM_S390_INT_EMERGENCY:
 		VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type);
 		inti->type = s390int->type;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 29635678b5ec..9610ba41b974 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -46,6 +46,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "instruction_lctlg", VCPU_STAT(instruction_lctlg) },
 	{ "instruction_lctl", VCPU_STAT(instruction_lctl) },
 	{ "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) },
+	{ "deliver_external_call", VCPU_STAT(deliver_external_call) },
 	{ "deliver_service_signal", VCPU_STAT(deliver_service_signal) },
 	{ "deliver_virtio_interrupt", VCPU_STAT(deliver_virtio_interrupt) },
 	{ "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) },
@@ -64,6 +65,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "instruction_stfl", VCPU_STAT(instruction_stfl) },
 	{ "instruction_tprot", VCPU_STAT(instruction_tprot) },
 	{ "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
+	{ "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
 	{ "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) },
 	{ "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) },
 	{ "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) },
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index d6a50c1fb2e6..f815118835f3 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -87,6 +87,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
 		return -ENOMEM;
 
 	inti->type = KVM_S390_INT_EMERGENCY;
+	inti->emerg.code = vcpu->vcpu_id;
 
 	spin_lock(&fi->lock);
 	li = fi->local_int[cpu_addr];
@@ -103,9 +104,47 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
 		wake_up_interruptible(&li->wq);
 	spin_unlock_bh(&li->lock);
 	rc = 0; /* order accepted */
+	VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
+unlock:
+	spin_unlock(&fi->lock);
+	return rc;
+}
+
+static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
+{
+	struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
+	struct kvm_s390_local_interrupt *li;
+	struct kvm_s390_interrupt_info *inti;
+	int rc;
+
+	if (cpu_addr >= KVM_MAX_VCPUS)
+		return 3; /* not operational */
+
+	inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+	if (!inti)
+		return -ENOMEM;
+
+	inti->type = KVM_S390_INT_EXTERNAL_CALL;
+	inti->extcall.code = vcpu->vcpu_id;
+
+	spin_lock(&fi->lock);
+	li = fi->local_int[cpu_addr];
+	if (li == NULL) {
+		rc = 3; /* not operational */
+		kfree(inti);
+		goto unlock;
+	}
+	spin_lock_bh(&li->lock);
+	list_add_tail(&inti->list, &li->list);
+	atomic_set(&li->active, 1);
+	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
+	if (waitqueue_active(&li->wq))
+		wake_up_interruptible(&li->wq);
+	spin_unlock_bh(&li->lock);
+	rc = 0; /* order accepted */
+	VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr);
 unlock:
 	spin_unlock(&fi->lock);
-	VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
 	return rc;
 }
 
@@ -267,6 +306,10 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
 		rc = __sigp_sense(vcpu, cpu_addr,
 				  &vcpu->arch.guest_gprs[r1]);
 		break;
+	case SIGP_EXTERNAL_CALL:
+		vcpu->stat.instruction_sigp_external_call++;
+		rc = __sigp_external_call(vcpu, cpu_addr);
+		break;
 	case SIGP_EMERGENCY:
 		vcpu->stat.instruction_sigp_emergency++;
 		rc = __sigp_emergency(vcpu, cpu_addr);
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 68840544006d..f47fcd30273d 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -371,6 +371,7 @@ struct kvm_s390_psw {
 #define KVM_S390_INT_VIRTIO		0xffff2603u
 #define KVM_S390_INT_SERVICE		0xffff2401u
 #define KVM_S390_INT_EMERGENCY		0xffff1201u
+#define KVM_S390_INT_EXTERNAL_CALL	0xffff1202u
 
 struct kvm_s390_interrupt {
 	__u32 type;
-- 
cgit v1.2.3-59-g8ed1b