From 2e5ad6b9c45d43cc4e7b8ac5ded1c55a7c4a3893 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 23 May 2012 12:53:11 -0400
Subject: xen/hvc: Collapse error logic.

All of the error paths are doing the same logic. In which
case we might as well collapse them in one path.

CC: stable@kernel.org
Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/tty/hvc/hvc_xen.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c
index d3d91dae065c..afc7fc27aa52 100644
--- a/drivers/tty/hvc/hvc_xen.c
+++ b/drivers/tty/hvc/hvc_xen.c
@@ -216,22 +216,16 @@ static int xen_hvm_console_init(void)
 		return 0;
 
 	r = hvm_get_parameter(HVM_PARAM_CONSOLE_EVTCHN, &v);
-	if (r < 0) {
-		kfree(info);
-		return -ENODEV;
-	}
+	if (r < 0)
+		goto err;
 	info->evtchn = v;
 	hvm_get_parameter(HVM_PARAM_CONSOLE_PFN, &v);
-	if (r < 0) {
-		kfree(info);
-		return -ENODEV;
-	}
+	if (r < 0)
+		goto err;
 	mfn = v;
 	info->intf = ioremap(mfn << PAGE_SHIFT, PAGE_SIZE);
-	if (info->intf == NULL) {
-		kfree(info);
-		return -ENODEV;
-	}
+	if (info->intf == NULL)
+		goto err;
 	info->vtermno = HVC_COOKIE;
 
 	spin_lock(&xencons_lock);
@@ -239,6 +233,9 @@ static int xen_hvm_console_init(void)
 	spin_unlock(&xencons_lock);
 
 	return 0;
+err:
+	kfree(info);
+	return -ENODEV;
 }
 
 static int xen_pv_console_init(void)
-- 
cgit v1.2.3-59-g8ed1b


From a32c88b9386ce3df87f28dd46bdc3776cd6edf75 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 23 May 2012 12:55:38 -0400
Subject: xen/hvc: Fix error cases around HVM_PARAM_CONSOLE_PFN

We weren't resetting the parameter to be passed in to a
known default. Nor were we checking the return value of
hvm_get_parameter.

CC: stable@kernel.org
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/tty/hvc/hvc_xen.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c
index afc7fc27aa52..3277f0eec4a7 100644
--- a/drivers/tty/hvc/hvc_xen.c
+++ b/drivers/tty/hvc/hvc_xen.c
@@ -219,7 +219,8 @@ static int xen_hvm_console_init(void)
 	if (r < 0)
 		goto err;
 	info->evtchn = v;
-	hvm_get_parameter(HVM_PARAM_CONSOLE_PFN, &v);
+	v = 0;
+	r = hvm_get_parameter(HVM_PARAM_CONSOLE_PFN, &v);
 	if (r < 0)
 		goto err;
 	mfn = v;
-- 
cgit v1.2.3-59-g8ed1b


From 5842f5768599094758931b74190cdf93641a8e35 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 23 May 2012 12:56:59 -0400
Subject: xen/hvc: Check HVM_PARAM_CONSOLE_[EVTCHN|PFN] for correctness.

We need to make sure that those parameters are setup to be correct.
As such the value of 0 is deemed invalid and we find that we
bail out. The hypervisor sets by default all of them to be zero
and when the hypercall is done does a simple:

 a.value = d->arch.hvm_domain.params[a.index];

Which means that if the Xen toolstack forgot to setup the proper
HVM_PARAM_CONSOLE_EVTCHN (or the PFN one), we would get the
default value of 0 and use that.

CC: stable@kernel.org
Fixes-Oracle-Bug: 14091238
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/tty/hvc/hvc_xen.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c
index 3277f0eec4a7..944eaeb8e0cf 100644
--- a/drivers/tty/hvc/hvc_xen.c
+++ b/drivers/tty/hvc/hvc_xen.c
@@ -214,14 +214,19 @@ static int xen_hvm_console_init(void)
 	/* already configured */
 	if (info->intf != NULL)
 		return 0;
-
+	/*
+	 * If the toolstack (or the hypervisor) hasn't set these values, the
+	 * default value is 0. Even though mfn = 0 and evtchn = 0 are
+	 * theoretically correct values, in practice they never are and they
+	 * mean that a legacy toolstack hasn't initialized the pv console correctly.
+	 */
 	r = hvm_get_parameter(HVM_PARAM_CONSOLE_EVTCHN, &v);
-	if (r < 0)
+	if (r < 0 || v == 0)
 		goto err;
 	info->evtchn = v;
 	v = 0;
 	r = hvm_get_parameter(HVM_PARAM_CONSOLE_PFN, &v);
-	if (r < 0)
+	if (r < 0 || v == 0)
 		goto err;
 	mfn = v;
 	info->intf = ioremap(mfn << PAGE_SHIFT, PAGE_SIZE);
-- 
cgit v1.2.3-59-g8ed1b


From 5e152e6c4b0da84c66cad56597b42c4ecedb0448 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 23 May 2012 13:28:44 -0400
Subject: xen/events: Add WARN_ON when quick lookup found invalid type.

All of the bind_XYZ_to_irq do a quick lookup to see if the
event exists. And if it does, then the initialized IRQ number
is returned instead of initializing a new IRQ number.

This patch adds an extra logic to check that the type returned
is proper one and that there is an IRQ handler setup for it.

This patch has the benefit of being able to find drivers that
are doing something naught.

[v1: Enhanced based on Stefano's review]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/xen/events.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index faae2f910ad2..af8b8fbd9d88 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -827,6 +827,9 @@ int bind_evtchn_to_irq(unsigned int evtchn)
 					      handle_edge_irq, "event");
 
 		xen_irq_info_evtchn_init(irq, evtchn);
+	} else {
+		struct irq_info *info = info_for_irq(irq);
+		WARN_ON(info == NULL || info->type != IRQT_EVTCHN);
 	}
 
 out:
@@ -862,6 +865,9 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
 		xen_irq_info_ipi_init(cpu, irq, evtchn, ipi);
 
 		bind_evtchn_to_cpu(evtchn, cpu);
+	} else {
+		struct irq_info *info = info_for_irq(irq);
+		WARN_ON(info == NULL || info->type != IRQT_IPI);
 	}
 
  out:
@@ -939,6 +945,9 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
 		xen_irq_info_virq_init(cpu, irq, evtchn, virq);
 
 		bind_evtchn_to_cpu(evtchn, cpu);
+	} else {
+		struct irq_info *info = info_for_irq(irq);
+		WARN_ON(info == NULL || info->type != IRQT_VIRQ);
 	}
 
 out:
-- 
cgit v1.2.3-59-g8ed1b


From 780dbcd0eedec6528d777b668019dabb36badf1a Mon Sep 17 00:00:00 2001
From: "Zhang, Yang Z" <yang.z.zhang@intel.com>
Date: Tue, 22 May 2012 08:40:16 +0000
Subject: xen/pci: Check for PCI bridge before using it.

Some SR-IOV devices may use more than one bus number, but there is no real bridges
because that have internal routing mechanism. So need to check whether the bridge is
existing before using it.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/xen/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
index b84bf0b6cc34..18fff88254eb 100644
--- a/drivers/xen/pci.c
+++ b/drivers/xen/pci.c
@@ -59,7 +59,7 @@ static int xen_add_device(struct device *dev)
 
 #ifdef CONFIG_ACPI
 		handle = DEVICE_ACPI_HANDLE(&pci_dev->dev);
-		if (!handle)
+		if (!handle && pci_dev->bus->bridge)
 			handle = DEVICE_ACPI_HANDLE(pci_dev->bus->bridge);
 #ifdef CONFIG_PCI_IOV
 		if (!handle && pci_dev->is_virtfn)
-- 
cgit v1.2.3-59-g8ed1b


From 58b7b53a36b0be8081fbfc91aeea24b83c20ca1b Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Tue, 29 May 2012 12:36:43 -0400
Subject: xen/balloon: Subtract from xen_released_pages the count that is
 populated.

We did not take into account that xen_released_pages would be
used outside the initial E820 parsing code. As such we would
did not subtract from xen_released_pages the count of pages
that we had populated back (instead we just did a simple
extra_pages = released - populated).

The balloon driver uses xen_released_pages to set the initial
current_pages count.  If this is wrong (too low) then when a new
(higher) target is set, the balloon driver will request too many pages
from Xen."

This fixes errors such as:

(XEN) memory.c:133:d0 Could not allocate order=0 extent: id=0 memflags=0 (51 of 512)
during bootup and
free_memory            : 0

where the free_memory should be 128.

Acked-by: David Vrabel <david.vrabel@citrix.com>
[v1: Per David's review made the git commit better]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/setup.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 3ebba0753d38..a4790bf22c59 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -371,7 +371,8 @@ char * __init xen_memory_setup(void)
 	populated = xen_populate_chunk(map, memmap.nr_entries,
 			max_pfn, &last_pfn, xen_released_pages);
 
-	extra_pages += (xen_released_pages - populated);
+	xen_released_pages -= populated;
+	extra_pages += xen_released_pages;
 
 	if (last_pfn > max_pfn) {
 		max_pfn = min(MAX_DOMAIN_PAGES, last_pfn);
-- 
cgit v1.2.3-59-g8ed1b


From 5e626254206a709c6e937f3dda69bf26c7344f6f Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@amd.com>
Date: Tue, 29 May 2012 13:07:31 +0200
Subject: xen/setup: filter APERFMPERF cpuid feature out

Xen PV kernels allow access to the APERF/MPERF registers to read the
effective frequency. Access to the MSRs is however redirected to the
currently scheduled physical CPU, making consecutive read and
compares unreliable. In addition each rdmsr traps into the hypervisor.
So to avoid bogus readouts and expensive traps, disable the kernel
internal feature flag for APERF/MPERF if running under Xen.
This will
a) remove the aperfmperf flag from /proc/cpuinfo
b) not mislead the power scheduler (arch/x86/kernel/cpu/sched.c) to
   use the feature to improve scheduling (by default disabled)
c) not mislead the cpufreq driver to use the MSRs

This does not cover userland programs which access the MSRs via the
device file interface, but this will be addressed separately.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Cc: stable@vger.kernel.org # v3.0+
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/enlighten.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index d1f9a0472d44..272ebd0ce326 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -208,6 +208,9 @@ static void __init xen_banner(void)
 	       xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
 }
 
+#define CPUID_THERM_POWER_LEAF 6
+#define APERFMPERF_PRESENT 0
+
 static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
 static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
 
@@ -241,6 +244,11 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
 		*dx = cpuid_leaf5_edx_val;
 		return;
 
+	case CPUID_THERM_POWER_LEAF:
+		/* Disabling APERFMPERF for kernel usage */
+		maskecx = ~(1 << APERFMPERF_PRESENT);
+		break;
+
 	case 0xb:
 		/* Suppress extended topology stuff */
 		maskebx = 0;
-- 
cgit v1.2.3-59-g8ed1b


From b9e0d95c041ca2d7ad297ee37c2e9cfab67a188f Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Wed, 23 May 2012 18:57:20 +0100
Subject: xen: mark local pages as FOREIGN in the m2p_override

When the frontend and the backend reside on the same domain, even if we
add pages to the m2p_override, these pages will never be returned by
mfn_to_pfn because the check "get_phys_to_machine(pfn) != mfn" will
always fail, so the pfn of the frontend will be returned instead
(resulting in a deadlock because the frontend pages are already locked).

INFO: task qemu-system-i38:1085 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
qemu-system-i38 D ffff8800cfc137c0     0  1085      1 0x00000000
 ffff8800c47ed898 0000000000000282 ffff8800be4596b0 00000000000137c0
 ffff8800c47edfd8 ffff8800c47ec010 00000000000137c0 00000000000137c0
 ffff8800c47edfd8 00000000000137c0 ffffffff82213020 ffff8800be4596b0
Call Trace:
 [<ffffffff81101ee0>] ? __lock_page+0x70/0x70
 [<ffffffff81a0fdd9>] schedule+0x29/0x70
 [<ffffffff81a0fe80>] io_schedule+0x60/0x80
 [<ffffffff81101eee>] sleep_on_page+0xe/0x20
 [<ffffffff81a0e1ca>] __wait_on_bit_lock+0x5a/0xc0
 [<ffffffff81101ed7>] __lock_page+0x67/0x70
 [<ffffffff8106f750>] ? autoremove_wake_function+0x40/0x40
 [<ffffffff811867e6>] ? bio_add_page+0x36/0x40
 [<ffffffff8110b692>] set_page_dirty_lock+0x52/0x60
 [<ffffffff81186021>] bio_set_pages_dirty+0x51/0x70
 [<ffffffff8118c6b4>] do_blockdev_direct_IO+0xb24/0xeb0
 [<ffffffff811e71a0>] ? ext3_get_blocks_handle+0xe00/0xe00
 [<ffffffff8118ca95>] __blockdev_direct_IO+0x55/0x60
 [<ffffffff811e71a0>] ? ext3_get_blocks_handle+0xe00/0xe00
 [<ffffffff811e91c8>] ext3_direct_IO+0xf8/0x390
 [<ffffffff811e71a0>] ? ext3_get_blocks_handle+0xe00/0xe00
 [<ffffffff81004b60>] ? xen_mc_flush+0xb0/0x1b0
 [<ffffffff81104027>] generic_file_aio_read+0x737/0x780
 [<ffffffff813bedeb>] ? gnttab_map_refs+0x15b/0x1e0
 [<ffffffff811038f0>] ? find_get_pages+0x150/0x150
 [<ffffffff8119736c>] aio_rw_vect_retry+0x7c/0x1d0
 [<ffffffff811972f0>] ? lookup_ioctx+0x90/0x90
 [<ffffffff81198856>] aio_run_iocb+0x66/0x1a0
 [<ffffffff811998b8>] do_io_submit+0x708/0xb90
 [<ffffffff81199d50>] sys_io_submit+0x10/0x20
 [<ffffffff81a18d69>] system_call_fastpath+0x16/0x1b

The explanation is in the comment within the code:

We need to do this because the pages shared by the frontend
(xen-blkfront) can be already locked (lock_page, called by
do_read_cache_page); when the userspace backend tries to use them
with direct_IO, mfn_to_pfn returns the pfn of the frontend, so
do_blockdev_direct_IO is going to try to lock the same pages
again resulting in a deadlock.

A simplified call graph looks like this:

pygrub                          QEMU
-----------------------------------------------
do_read_cache_page              io_submit
  |                              |
lock_page                       ext3_direct_IO
                                 |
                                bio_add_page
                                 |
                                lock_page

Internally the xen-blkback uses m2p_add_override to swizzle (temporarily)
a 'struct page' to have a different MFN (so that it can point to another
guest). It also can easily find out whether another pfn corresponding
to the mfn exists in the m2p, and can set the FOREIGN bit
in the p2m, making sure that mfn_to_pfn returns the pfn of the backend.

This allows the backend to perform direct_IO on these pages, but as a
side effect prevents the frontend from using get_user_pages_fast on
them while they are being shared with the backend.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/p2m.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index ffd08c414e91..64effdc6da94 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -706,6 +706,7 @@ int m2p_add_override(unsigned long mfn, struct page *page,
 	unsigned long uninitialized_var(address);
 	unsigned level;
 	pte_t *ptep = NULL;
+	int ret = 0;
 
 	pfn = page_to_pfn(page);
 	if (!PageHighMem(page)) {
@@ -741,6 +742,24 @@ int m2p_add_override(unsigned long mfn, struct page *page,
 	list_add(&page->lru,  &m2p_overrides[mfn_hash(mfn)]);
 	spin_unlock_irqrestore(&m2p_override_lock, flags);
 
+	/* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in
+	 * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other
+	 * pfn so that the following mfn_to_pfn(mfn) calls will return the
+	 * pfn from the m2p_override (the backend pfn) instead.
+	 * We need to do this because the pages shared by the frontend
+	 * (xen-blkfront) can be already locked (lock_page, called by
+	 * do_read_cache_page); when the userspace backend tries to use them
+	 * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so
+	 * do_blockdev_direct_IO is going to try to lock the same pages
+	 * again resulting in a deadlock.
+	 * As a side effect get_user_pages_fast might not be safe on the
+	 * frontend pages while they are being shared with the backend,
+	 * because mfn_to_pfn (that ends up being called by GUPF) will
+	 * return the backend pfn rather than the frontend pfn. */
+	ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
+	if (ret == 0 && get_phys_to_machine(pfn) == mfn)
+		set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(m2p_add_override);
@@ -752,6 +771,7 @@ int m2p_remove_override(struct page *page, bool clear_pte)
 	unsigned long uninitialized_var(address);
 	unsigned level;
 	pte_t *ptep = NULL;
+	int ret = 0;
 
 	pfn = page_to_pfn(page);
 	mfn = get_phys_to_machine(pfn);
@@ -821,6 +841,22 @@ int m2p_remove_override(struct page *page, bool clear_pte)
 	} else
 		set_phys_to_machine(pfn, page->index);
 
+	/* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present
+	 * somewhere in this domain, even before being added to the
+	 * m2p_override (see comment above in m2p_add_override).
+	 * If there are no other entries in the m2p_override corresponding
+	 * to this mfn, then remove the FOREIGN_FRAME_BIT from the p2m for
+	 * the original pfn (the one shared by the frontend): the backend
+	 * cannot do any IO on this page anymore because it has been
+	 * unshared. Removing the FOREIGN_FRAME_BIT from the p2m entry of
+	 * the original pfn causes mfn_to_pfn(mfn) to return the frontend
+	 * pfn again. */
+	mfn &= ~FOREIGN_FRAME_BIT;
+	ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
+	if (ret == 0 && get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) &&
+			m2p_find_override(mfn) == NULL)
+		set_phys_to_machine(pfn, mfn);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(m2p_remove_override);
-- 
cgit v1.2.3-59-g8ed1b