From 9674f35b1ec17577163897f052f405c1e9e5893d Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Fri, 3 Apr 2009 08:34:05 -0500
Subject: x86: UV BAU and nodes with no memory

This patch fixes BAU initialization for systems containing
nodes with no memory and for systems with non-consecutive
node numbers.

Fixes and clarifies situations where pnode should be used instead
of node id.

Tested on the UV hardware simulator.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
LKML-Reference: <E1LpjX3-00007N-12@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 108 ++++++++++++++++++++++++++---------------------
 1 file changed, 61 insertions(+), 47 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 79c073247284..b833bc634d17 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -31,6 +31,34 @@ static unsigned long		uv_mmask __read_mostly;
 static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
 static DEFINE_PER_CPU(struct bau_control, bau_control);
 
+/*
+ * Determine the first node on a blade.
+ */
+static int __init blade_to_first_node(int blade)
+{
+	int node, b;
+
+	for_each_online_node(node) {
+		b = uv_node_to_blade_id(node);
+		if (blade == b)
+			return node;
+	}
+	BUG();
+}
+
+/*
+ * Determine the apicid of the first cpu on a blade.
+ */
+static int __init blade_to_first_apicid(int blade)
+{
+	int cpu;
+
+	for_each_present_cpu(cpu)
+		if (blade == uv_cpu_to_blade_id(cpu))
+			return per_cpu(x86_cpu_to_apicid, cpu);
+	return -1;
+}
+
 /*
  * Free a software acknowledge hardware resource by clearing its Pending
  * bit. This will return a reply to the sender.
@@ -67,7 +95,7 @@ static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
 	msp = __get_cpu_var(bau_control).msg_statuses + msg_slot;
 	cpu = uv_blade_processor_id();
 	msg->number_of_cpus =
-	    uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
+		uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
 	this_cpu_mask = 1UL << cpu;
 	if (msp->seen_by.bits & this_cpu_mask)
 		return;
@@ -215,14 +243,14 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
  * Returns @flush_mask if some remote flushing remains to be done. The
  * mask will have some bits still set.
  */
-const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
+const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
 					     struct bau_desc *bau_desc,
 					     struct cpumask *flush_mask)
 {
 	int completion_status = 0;
 	int right_shift;
 	int tries = 0;
-	int blade;
+	int pnode;
 	int bit;
 	unsigned long mmr_offset;
 	unsigned long index;
@@ -265,8 +293,8 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
 	 * use the IPI method of shootdown on them.
 	 */
 	for_each_cpu(bit, flush_mask) {
-		blade = uv_cpu_to_blade_id(bit);
-		if (blade == this_blade)
+		pnode = uv_cpu_to_pnode(bit);
+		if (pnode == this_pnode)
 			continue;
 		cpumask_clear_cpu(bit, flush_mask);
 	}
@@ -308,16 +336,16 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 	struct cpumask *flush_mask = &__get_cpu_var(flush_tlb_mask);
 	int i;
 	int bit;
-	int blade;
+	int pnode;
 	int uv_cpu;
-	int this_blade;
+	int this_pnode;
 	int locals = 0;
 	struct bau_desc *bau_desc;
 
 	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
 
 	uv_cpu = uv_blade_processor_id();
-	this_blade = uv_numa_blade_id();
+	this_pnode = uv_hub_info->pnode;
 	bau_desc = __get_cpu_var(bau_control).descriptor_base;
 	bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
 
@@ -325,13 +353,13 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 
 	i = 0;
 	for_each_cpu(bit, flush_mask) {
-		blade = uv_cpu_to_blade_id(bit);
-		BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));
-		if (blade == this_blade) {
+		pnode = uv_cpu_to_pnode(bit);
+		BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1));
+		if (pnode == this_pnode) {
 			locals++;
 			continue;
 		}
-		bau_node_set(blade, &bau_desc->distribution);
+		bau_node_set(pnode, &bau_desc->distribution);
 		i++;
 	}
 	if (i == 0) {
@@ -349,7 +377,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 	bau_desc->payload.address = va;
 	bau_desc->payload.sending_cpu = cpu;
 
-	return uv_flush_send_and_wait(uv_cpu, this_blade, bau_desc, flush_mask);
+	return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask);
 }
 
 /*
@@ -481,8 +509,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
 			   stat->requestee, stat->onetlb, stat->alltlb,
 			   stat->s_retry, stat->d_retry, stat->ptc_i);
 		seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n",
-			   uv_read_global_mmr64(uv_blade_to_pnode
-					(uv_cpu_to_blade_id(cpu)),
+			   uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
 					UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
 			   stat->sflush, stat->dflush,
 			   stat->retriesok, stat->nomsg,
@@ -616,16 +643,18 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node)
  * finish the initialization of the per-blade control structures
  */
 static void __init
-uv_table_bases_finish(int blade, int node, int cur_cpu,
+uv_table_bases_finish(int blade,
 		      struct bau_control *bau_tablesp,
 		      struct bau_desc *adp)
 {
 	struct bau_control *bcp;
-	int i;
+	int cpu;
 
-	for (i = cur_cpu; i < cur_cpu + uv_blade_nr_possible_cpus(blade); i++) {
-		bcp = (struct bau_control *)&per_cpu(bau_control, i);
+	for_each_present_cpu(cpu) {
+		if (blade != uv_cpu_to_blade_id(cpu))
+			continue;
 
+		bcp = (struct bau_control *)&per_cpu(bau_control, cpu);
 		bcp->bau_msg_head	= bau_tablesp->va_queue_first;
 		bcp->va_queue_first	= bau_tablesp->va_queue_first;
 		bcp->va_queue_last	= bau_tablesp->va_queue_last;
@@ -648,8 +677,7 @@ uv_activation_descriptor_init(int node, int pnode)
 	struct bau_desc *adp;
 	struct bau_desc *ad2;
 
-	adp = (struct bau_desc *)
-	    kmalloc_node(16384, GFP_KERNEL, node);
+	adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node);
 	BUG_ON(!adp);
 
 	pa = __pa((unsigned long)adp);
@@ -666,8 +694,7 @@ uv_activation_descriptor_init(int node, int pnode)
 	for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
 		memset(ad2, 0, sizeof(struct bau_desc));
 		ad2->header.sw_ack_flag = 1;
-		ad2->header.base_dest_nodeid =
-		    uv_blade_to_pnode(uv_cpu_to_blade_id(0));
+		ad2->header.base_dest_nodeid = uv_cpu_to_pnode(0);
 		ad2->header.command = UV_NET_ENDPOINT_INTD;
 		ad2->header.int_both = 1;
 		/*
@@ -714,8 +741,9 @@ uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
 /*
  * Initialization of each UV blade's structures
  */
-static int __init uv_init_blade(int blade, int node, int cur_cpu)
+static int __init uv_init_blade(int blade)
 {
+	int node;
 	int pnode;
 	unsigned long pa;
 	unsigned long apicid;
@@ -723,16 +751,17 @@ static int __init uv_init_blade(int blade, int node, int cur_cpu)
 	struct bau_payload_queue_entry *pqp;
 	struct bau_control *bau_tablesp;
 
+	node = blade_to_first_node(blade);
 	bau_tablesp = uv_table_bases_init(blade, node);
 	pnode = uv_blade_to_pnode(blade);
 	adp = uv_activation_descriptor_init(node, pnode);
 	pqp = uv_payload_queue_init(node, pnode, bau_tablesp);
-	uv_table_bases_finish(blade, node, cur_cpu, bau_tablesp, adp);
+	uv_table_bases_finish(blade, bau_tablesp, adp);
 	/*
 	 * the below initialization can't be in firmware because the
 	 * messaging IRQ will be determined by the OS
 	 */
-	apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
+	apicid = blade_to_first_apicid(blade);
 	pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
 	if ((pa & 0xff) != UV_BAU_MESSAGE) {
 		uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
@@ -747,9 +776,7 @@ static int __init uv_init_blade(int blade, int node, int cur_cpu)
 static int __init uv_bau_init(void)
 {
 	int blade;
-	int node;
 	int nblades;
-	int last_blade;
 	int cur_cpu;
 
 	if (!is_uv_system())
@@ -758,29 +785,16 @@ static int __init uv_bau_init(void)
 	uv_bau_retry_limit = 1;
 	uv_nshift = uv_hub_info->n_val;
 	uv_mmask = (1UL << uv_hub_info->n_val) - 1;
-	nblades = 0;
-	last_blade = -1;
-	cur_cpu = 0;
-	for_each_online_node(node) {
-		blade = uv_node_to_blade_id(node);
-		if (blade == last_blade)
-			continue;
-		last_blade = blade;
-		nblades++;
-	}
+	nblades = uv_num_possible_blades();
+
 	uv_bau_table_bases = (struct bau_control **)
 	    kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
 	BUG_ON(!uv_bau_table_bases);
 
-	last_blade = -1;
-	for_each_online_node(node) {
-		blade = uv_node_to_blade_id(node);
-		if (blade == last_blade)
-			continue;
-		last_blade = blade;
-		uv_init_blade(blade, node, cur_cpu);
-		cur_cpu += uv_blade_nr_possible_cpus(blade);
-	}
+	for (blade = 0; blade < nblades; blade++)
+		if (uv_blade_nr_possible_cpus(blade))
+			uv_init_blade(blade);
+
 	alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
 	uv_enable_timeouts();
 
-- 
cgit v1.2.3-59-g8ed1b


From c4c4688f72e638708e5f6b5c259699de82a36fec Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Fri, 3 Apr 2009 08:34:32 -0500
Subject: x86: UV BAU messaging timeouts

This patch replaces a 'nop' uv_enable_timeouts() in the
UV TLB shootdown code. (somehow, long ago that function got
eviscerated)

If any cpu in the destination node does not get interrupted by the
message and post completion in a reasonable time the hardware
should respond to the sender with an error.  This function
enables such timeouts.

Tested on the UV hardware simulator.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
LKML-Reference: <E1LpjXU-00007e-Qh@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_mmrs.h |  5 ++++
 arch/x86/kernel/tlb_uv.c          | 56 +++++++++++++++++++++++++++++++--------
 2 files changed, 50 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index db68ac8a5ac2..2cae46c7c8a2 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -17,6 +17,11 @@
 /* ========================================================================= */
 /*                           UVH_BAU_DATA_CONFIG                             */
 /* ========================================================================= */
+#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
+#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
+/* 1011 timebase 7 (168millisec) * 3 ticks -> 500ms */
 #define UVH_BAU_DATA_CONFIG 0x61680UL
 #define UVH_BAU_DATA_CONFIG_32 0x0438
 
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index b833bc634d17..fced96e94e22 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -445,24 +445,58 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
 	set_irq_regs(old_regs);
 }
 
+/*
+ * uv_enable_timeouts
+ *
+ * Each target blade (i.e. blades that have cpu's) needs to have
+ * shootdown message timeouts enabled.  The timeout does not cause
+ * an interrupt, but causes an error message to be returned to
+ * the sender.
+ */
 static void uv_enable_timeouts(void)
 {
-	int i;
 	int blade;
-	int last_blade;
+	int nblades;
 	int pnode;
-	int cur_cpu = 0;
-	unsigned long apicid;
+	unsigned long mmr_image;
+
+	nblades = uv_num_possible_blades();
 
-	last_blade = -1;
-	for_each_online_node(i) {
-		blade = uv_node_to_blade_id(i);
-		if (blade == last_blade)
+	for (blade = 0; blade < nblades; blade++) {
+		if (!uv_blade_nr_possible_cpus(blade))
 			continue;
-		last_blade = blade;
-		apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
+
 		pnode = uv_blade_to_pnode(blade);
-		cur_cpu += uv_blade_nr_possible_cpus(i);
+		mmr_image =
+		    uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
+		/*
+		 * Set the timeout period and then lock it in, in three
+		 * steps; captures and locks in the period.
+		 *
+		 * To program the period, the SOFT_ACK_MODE must be off.
+		 */
+		mmr_image &= ~((unsigned long)1 <<
+			       UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+		uv_write_global_mmr64
+		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+		/*
+		 * Set the 4-bit period.
+		 */
+		mmr_image &= ~((unsigned long)0xf <<
+			UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+		mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
+			     UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+		uv_write_global_mmr64
+		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+		/*
+		 * Subsequent reversals of the timebase bit (3) cause an
+		 * immediate timeout of one or all INTD resources as
+		 * indicated in bits 2:0 (7 causes all of them to timeout).
+		 */
+		mmr_image |= ((unsigned long)1 <<
+			      UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+		uv_write_global_mmr64
+		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 1a544e659cbfce178395e9a090a47d1907d0cfa8 Mon Sep 17 00:00:00 2001
From: Russ Anderson <rja@sgi.com>
Date: Mon, 30 Mar 2009 17:52:40 -0500
Subject: x86, UV: system table in bios accessed after unmap

Use the copy of UV system table in kernel memory, not the one in
bios after unmapping.

Signed-off-by: Russ Anderson <rja@sgi.com>
LKML-Reference: <20090330225240.GA22776@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/bios_uv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index f63882728d91..63a88e1f987d 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -182,7 +182,8 @@ void uv_bios_init(void)
 	memcpy(&uv_systab, tab, sizeof(struct uv_systab));
 	iounmap(tab);
 
-	printk(KERN_INFO "EFI UV System Table Revision %d\n", tab->revision);
+	printk(KERN_INFO "EFI UV System Table Revision %d\n",
+					uv_systab.revision);
 }
 #else	/* !CONFIG_EFI */
 
-- 
cgit v1.2.3-59-g8ed1b


From 6a891a24e4d0056c365a90ff2d71c38fd366b0d0 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Mon, 30 Mar 2009 09:01:11 -0500
Subject: x86, UV: Fix for nodes with memory and no cpus

Fix initialization of UV blade information for systems that have
nodes with memory but no cpus.

Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20090330140111.GA18461@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 1248318436e8..de1a50af807b 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -549,7 +549,8 @@ void __init uv_system_init(void)
 	unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
 	int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
 	int max_pnode = 0;
-	unsigned long mmr_base, present;
+	unsigned long mmr_base, present, paddr;
+	unsigned short pnode_mask;
 
 	map_low_mmrs();
 
@@ -592,6 +593,7 @@ void __init uv_system_init(void)
 		}
 	}
 
+	pnode_mask = (1 << n_val) - 1;
 	node_id.v = uv_read_local_mmr(UVH_NODE_ID);
 	gnode_upper = (((unsigned long)node_id.s.node_id) &
 		       ~((1 << n_val) - 1)) << m_val;
@@ -615,7 +617,7 @@ void __init uv_system_init(void)
 		uv_cpu_hub_info(cpu)->numa_blade_id = blade;
 		uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
 		uv_cpu_hub_info(cpu)->pnode = pnode;
-		uv_cpu_hub_info(cpu)->pnode_mask = (1 << n_val) - 1;
+		uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
 		uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
 		uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
 		uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
@@ -631,6 +633,16 @@ void __init uv_system_init(void)
 			lcpu, blade);
 	}
 
+	/* Add blade/pnode info for nodes without cpus */
+	for_each_online_node(nid) {
+		if (uv_node_to_blade[nid] >= 0)
+			continue;
+		paddr = node_start_pfn(nid) << PAGE_SHIFT;
+		pnode = (paddr >> m_val) & pnode_mask;
+		blade = boot_pnode_to_blade(pnode);
+		uv_node_to_blade[nid] = blade;
+	}
+
 	map_gru_high(max_pnode);
 	map_mmr_high(max_pnode);
 	map_config_high(max_pnode);
-- 
cgit v1.2.3-59-g8ed1b


From 06aa05b307e8efbc278f201198e7cdf3877bc5c2 Mon Sep 17 00:00:00 2001
From: Russ Anderson <rja@sgi.com>
Date: Fri, 3 Apr 2009 17:24:23 -0500
Subject: x86: prevent /sys/firmware/sgi_uv from being created on non-uv
 systems

/sys/firmware/sgi_uv should only be created on uv systems.

Signed-off-by: Russ Anderson <rja@sgi.com>
LKML-Reference: <20090403222423.GA28546@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/uv_sysfs.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
index 67f9b9dbf800..36afb98675a4 100644
--- a/arch/x86/kernel/uv_sysfs.c
+++ b/arch/x86/kernel/uv_sysfs.c
@@ -21,6 +21,7 @@
 
 #include <linux/sysdev.h>
 #include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
 
 struct kobject *sgi_uv_kobj;
 
@@ -47,6 +48,9 @@ static int __init sgi_uv_sysfs_init(void)
 {
 	unsigned long ret;
 
+	if (!is_uv_system())
+		return -ENODEV;
+
 	if (!sgi_uv_kobj)
 		sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
 	if (!sgi_uv_kobj) {
-- 
cgit v1.2.3-59-g8ed1b


From 54c28d294c658abb6d6430a49fda230fdfd601c8 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Fri, 3 Apr 2009 15:39:42 -0500
Subject: x86, uv: add Kconfig dependency on NUMA for UV systems

Impact: build fix

Add Kconfig dependency on NUMA for enabling UV. Although it might
be possible to configure non-NUMA UV systems, they are unsupported
and not interesting. Much of the infrastructure for UV requires
NUMA support.

Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20090403203942.GA20137@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 748e50a1a152..2817ab5a1204 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -354,6 +354,7 @@ config X86_UV
 	bool "SGI Ultraviolet"
 	depends on X86_64
 	depends on X86_EXTENDED_PLATFORM
+	depends on NUMA
 	select X86_X2APIC
 	---help---
 	  This option is needed in order to support SGI Ultraviolet systems.
-- 
cgit v1.2.3-59-g8ed1b


From d6382bf77e30799b78f19c0a129d4a9e0e9f2e2a Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 20 Feb 2009 23:01:26 -0800
Subject: xen: disable preempt for leave_lazy_mmu

xen_mc_flush() requires preemption to be disabled for its own sanity,
so disable it while we're flushing.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/mmu.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index db3802fb7b84..b6e56594fe13 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1819,7 +1819,6 @@ __init void xen_post_allocator_init(void)
 	xen_mark_init_mm_pinned();
 }
 
-
 const struct pv_mmu_ops xen_mmu_ops __initdata = {
 	.pagetable_setup_start = xen_pagetable_setup_start,
 	.pagetable_setup_done = xen_pagetable_setup_done,
-- 
cgit v1.2.3-59-g8ed1b


From e791ca0fd79461ad72559a6e01362da4d7d16253 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Thu, 26 Feb 2009 15:48:33 -0800
Subject: xen: separate p2m allocation from setting

When doing very early p2m setting, we need to separate setting
from allocation, so split things up accordingly.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/mmu.c | 61 +++++++++++++++++++++++++++++++++++++++---------------
 arch/x86/xen/mmu.h |  3 +++
 2 files changed, 47 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index b6e56594fe13..0b2d554d1d58 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -233,47 +233,74 @@ unsigned long get_phys_to_machine(unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(get_phys_to_machine);
 
-static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
+/* install a  new p2m_top page */
+bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
 {
-	unsigned long *p;
+	unsigned topidx = p2m_top_index(pfn);
+	unsigned long **pfnp, *mfnp;
 	unsigned i;
 
-	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
-	BUG_ON(p == NULL);
+	pfnp = &p2m_top[topidx];
+	mfnp = &p2m_top_mfn[topidx];
 
 	for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
 		p[i] = INVALID_P2M_ENTRY;
 
-	if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
-		free_page((unsigned long)p);
-	else
+	if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
 		*mfnp = virt_to_mfn(p);
+		return true;
+	}
+
+	return false;
 }
 
-void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+static void alloc_p2m(unsigned long pfn)
 {
-	unsigned topidx, idx;
+	unsigned long *p;
 
-	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
-		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
-		return;
-	}
+	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
+	BUG_ON(p == NULL);
+
+	if (!install_p2mtop_page(pfn, p))
+		free_page((unsigned long)p);
+}
+
+/* Try to install p2m mapping; fail if intermediate bits missing */
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+	unsigned topidx, idx;
 
 	if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
 		BUG_ON(mfn != INVALID_P2M_ENTRY);
-		return;
+		return true;
 	}
 
 	topidx = p2m_top_index(pfn);
 	if (p2m_top[topidx] == p2m_missing) {
-		/* no need to allocate a page to store an invalid entry */
 		if (mfn == INVALID_P2M_ENTRY)
-			return;
-		alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
+			return true;
+		return false;
 	}
 
 	idx = p2m_index(pfn);
 	p2m_top[topidx][idx] = mfn;
+
+	return true;
+}
+
+void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
+		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+		return;
+	}
+
+	if (unlikely(!__set_phys_to_machine(pfn, mfn)))  {
+		alloc_p2m(pfn);
+
+		if (!__set_phys_to_machine(pfn, mfn))
+			BUG();
+	}
 }
 
 unsigned long arbitrary_virt_to_mfn(void *vaddr)
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 24d1b44a337d..da7302624897 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -11,6 +11,9 @@ enum pt_level {
 };
 
 
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+bool install_p2mtop_page(unsigned long pfn, unsigned long *p);
+
 void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
 
-- 
cgit v1.2.3-59-g8ed1b


From cdaead6b4e657f960d6d6f9f380e7dfeedc6a09b Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 27 Feb 2009 15:34:59 -0800
Subject: xen: split construction of p2m mfn tables from registration

Build the p2m_mfn_list_list early with the rest of the p2m table, but
register it later when the real shared_info structure is in place.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/mmu.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 0b2d554d1d58..6e58acd4d00d 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -184,7 +184,7 @@ static inline unsigned p2m_index(unsigned long pfn)
 }
 
 /* Build the parallel p2m_top_mfn structures */
-void xen_setup_mfn_list_list(void)
+static void __init xen_build_mfn_list_list(void)
 {
 	unsigned pfn, idx;
 
@@ -198,7 +198,10 @@ void xen_setup_mfn_list_list(void)
 		unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
 		p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
 	}
+}
 
+void xen_setup_mfn_list_list(void)
+{
 	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 
 	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
@@ -218,6 +221,8 @@ void __init xen_build_dynamic_phys_to_machine(void)
 
 		p2m_top[topidx] = &mfn_list[pfn];
 	}
+
+	xen_build_mfn_list_list();
 }
 
 unsigned long get_phys_to_machine(unsigned long pfn)
-- 
cgit v1.2.3-59-g8ed1b


From c7da8c829b3f919089ff021d6ddc376d38299729 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 4 Mar 2009 13:02:18 -0800
Subject: xen: clean up xen_load_gdt

Makes the logic a bit clearer.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/enlighten.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 82cd39a6cbd3..8e024554a3e1 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -301,10 +301,21 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
 	frames = mcs.args;
 
 	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
-		frames[f] = arbitrary_virt_to_mfn((void *)va);
+		int level;
+		pte_t *ptep = lookup_address(va, &level);
+		unsigned long pfn, mfn;
+		void *virt;
+
+		BUG_ON(ptep == NULL);
+
+		pfn = pte_pfn(*ptep);
+		mfn = pfn_to_mfn(pfn);
+		virt = __va(PFN_PHYS(pfn));
+
+		frames[f] = mfn;
 
 		make_lowmem_page_readonly((void *)va);
-		make_lowmem_page_readonly(mfn_to_virt(frames[f]));
+		make_lowmem_page_readonly(virt);
 	}
 
 	MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
-- 
cgit v1.2.3-59-g8ed1b


From a957fac50023eac09368ab19d3a7ec725c2657c3 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 4 Mar 2009 15:26:00 -0800
Subject: xen: make xen_load_gdt simpler

Remove use of multicall machinery which is unused (gdt loading
is never performance critical).  This removes the implicit use
of percpu variables, which simplifies understanding how
the percpu code's use of load_gdt interacts with this code.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/enlighten.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 8e024554a3e1..334d2ffc4e2a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -284,12 +284,11 @@ static void xen_set_ldt(const void *addr, unsigned entries)
 
 static void xen_load_gdt(const struct desc_ptr *dtr)
 {
-	unsigned long *frames;
 	unsigned long va = dtr->address;
 	unsigned int size = dtr->size + 1;
 	unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+	unsigned long frames[pages];
 	int f;
-	struct multicall_space mcs;
 
 	/* A GDT can be up to 64k in size, which corresponds to 8192
 	   8-byte entries, or 16 4k pages.. */
@@ -297,9 +296,6 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
 	BUG_ON(size > 65536);
 	BUG_ON(va & ~PAGE_MASK);
 
-	mcs = xen_mc_entry(sizeof(*frames) * pages);
-	frames = mcs.args;
-
 	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
 		int level;
 		pte_t *ptep = lookup_address(va, &level);
@@ -314,13 +310,15 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
 
 		frames[f] = mfn;
 
+		printk("xen_load_gdt: %d va=%p mfn=%lx pfn=%lx va'=%p\n",
+		       f, (void *)va, mfn, pfn, virt);
+
 		make_lowmem_page_readonly((void *)va);
 		make_lowmem_page_readonly(virt);
 	}
 
-	MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
-
-	xen_mc_issue(PARAVIRT_LAZY_CPU);
+	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
+		BUG();
 }
 
 static void load_TLS_descriptor(struct thread_struct *t,
-- 
cgit v1.2.3-59-g8ed1b


From c667d5d6a77a5f33f9181bcf92a04fdc69712a2b Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 4 Mar 2009 16:34:27 -0800
Subject: xen: remove xen_load_gdt debug

Don't need the noise.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/enlighten.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 334d2ffc4e2a..5d9a1c37c515 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -310,9 +310,6 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
 
 		frames[f] = mfn;
 
-		printk("xen_load_gdt: %d va=%p mfn=%lx pfn=%lx va'=%p\n",
-		       f, (void *)va, mfn, pfn, virt);
-
 		make_lowmem_page_readonly((void *)va);
 		make_lowmem_page_readonly(virt);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 1207cf8eb99d8c699919e352292bdf1f519fbba5 Mon Sep 17 00:00:00 2001
From: Hannes Eder <hannes@hanneseder.net>
Date: Thu, 5 Mar 2009 20:13:57 +0100
Subject: NULL noise: arch/x86/xen/smp.c

Fix this sparse warnings:
  arch/x86/xen/smp.c:316:52: warning: Using plain integer as NULL pointer
  arch/x86/xen/smp.c:421:60: warning: Using plain integer as NULL pointer

Signed-off-by: Hannes Eder <hannes@hanneseder.net>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/smp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 585a6e330837..429834ec1687 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -317,7 +317,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
 	BUG_ON(rc);
 
 	while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
-		HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+		HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
 		barrier();
 	}
 
@@ -422,7 +422,7 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
 	/* Make sure other vcpus get a chance to run if they need to. */
 	for_each_cpu(cpu, mask) {
 		if (xen_vcpu_stolen(cpu)) {
-			HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+			HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
 			break;
 		}
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 191216b9289ed02256086e6bab4f668112109399 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Sat, 7 Mar 2009 17:09:27 -0800
Subject: xen: mask XSAVE from cpuid

Xen leaves XSAVE set in cpuid, but doesn't allow cr4.OSXSAVE
to be set.  This confuses the kernel and it ends up crashing on
an xsetbv instruction.

At boot time, try to set cr4.OSXSAVE, and mask XSAVE out of
cpuid it we can't.  This will produce a spurious error from Xen,
but allows us to support XSAVE if/when Xen does.

This also factors out the cpuid mask decisions to boot time.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/enlighten.c | 50 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5d9a1c37c515..69de19168a7e 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -168,21 +168,23 @@ static void __init xen_banner(void)
 	       xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
 }
 
+static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
+static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
+
 static void xen_cpuid(unsigned int *ax, unsigned int *bx,
 		      unsigned int *cx, unsigned int *dx)
 {
+	unsigned maskecx = ~0;
 	unsigned maskedx = ~0;
 
 	/*
 	 * Mask out inconvenient features, to try and disable as many
 	 * unsupported kernel subsystems as possible.
 	 */
-	if (*ax == 1)
-		maskedx = ~((1 << X86_FEATURE_APIC) |  /* disable APIC */
-			    (1 << X86_FEATURE_ACPI) |  /* disable ACPI */
-			    (1 << X86_FEATURE_MCE)  |  /* disable MCE */
-			    (1 << X86_FEATURE_MCA)  |  /* disable MCA */
-			    (1 << X86_FEATURE_ACC));   /* thermal monitoring */
+	if (*ax == 1) {
+		maskecx = cpuid_leaf1_ecx_mask;
+		maskedx = cpuid_leaf1_edx_mask;
+	}
 
 	asm(XEN_EMULATE_PREFIX "cpuid"
 		: "=a" (*ax),
@@ -190,9 +192,43 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
 		  "=c" (*cx),
 		  "=d" (*dx)
 		: "0" (*ax), "2" (*cx));
+
+	*cx &= maskecx;
 	*dx &= maskedx;
 }
 
+static __init void xen_init_cpuid_mask(void)
+{
+	unsigned int ax, bx, cx, dx;
+
+	cpuid_leaf1_edx_mask =
+		~((1 << X86_FEATURE_MCE)  |  /* disable MCE */
+		  (1 << X86_FEATURE_MCA)  |  /* disable MCA */
+		  (1 << X86_FEATURE_ACC));   /* thermal monitoring */
+
+	if (!xen_initial_domain())
+		cpuid_leaf1_edx_mask &=
+			~((1 << X86_FEATURE_APIC) |  /* disable local APIC */
+			  (1 << X86_FEATURE_ACPI));  /* disable ACPI */
+
+	ax = 1;
+	xen_cpuid(&ax, &bx, &cx, &dx);
+
+	/* cpuid claims we support xsave; try enabling it to see what happens */
+	if (cx & (1 << (X86_FEATURE_XSAVE % 32))) {
+		unsigned long cr4;
+
+		set_in_cr4(X86_CR4_OSXSAVE);
+		
+		cr4 = read_cr4();
+
+		if ((cr4 & X86_CR4_OSXSAVE) == 0)
+			cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32));
+
+		clear_in_cr4(X86_CR4_OSXSAVE);
+	}
+}
+
 static void xen_set_debugreg(int reg, unsigned long val)
 {
 	HYPERVISOR_set_debugreg(reg, val);
@@ -903,6 +939,8 @@ asmlinkage void __init xen_start_kernel(void)
 
 	xen_init_irq_ops();
 
+	xen_init_cpuid_mask();
+
 #ifdef CONFIG_X86_LOCAL_APIC
 	/*
 	 * set up the basic apic ops.
-- 
cgit v1.2.3-59-g8ed1b


From 10eceebeaac3049cad018d4a77c941a602cbd7a5 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Sun, 8 Mar 2009 03:59:04 -0700
Subject: x86-64: remove PGE from must-have feature list

PGE may not be available when running paravirtualized, so test the cpuid
bit before using it.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/include/asm/required-features.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index d5cd6c586881..a4737dddfd58 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -50,7 +50,7 @@
 #ifdef CONFIG_X86_64
 #define NEED_PSE	0
 #define NEED_MSR	(1<<(X86_FEATURE_MSR & 31))
-#define NEED_PGE	(1<<(X86_FEATURE_PGE & 31))
+#define NEED_PGE	0
 #define NEED_FXSR	(1<<(X86_FEATURE_FXSR & 31))
 #define NEED_XMM	(1<<(X86_FEATURE_XMM & 31))
 #define NEED_XMM2	(1<<(X86_FEATURE_XMM2 & 31))
-- 
cgit v1.2.3-59-g8ed1b


From b40bf53effc0338ad7926aa1abce703af372cbd4 Mon Sep 17 00:00:00 2001
From: Alex Nixon <alex.nixon@citrix.com>
Date: Mon, 9 Feb 2009 12:05:46 -0800
Subject: Xen: Add virt_to_pfn helper function

Signed-off-by: Alex Nixon <alex.nixon@citrix.com>
---
 arch/x86/include/asm/xen/page.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 1a918dde46b5..018a0a400799 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -124,7 +124,8 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
 
 /* VIRT <-> MACHINE conversion */
 #define virt_to_machine(v)	(phys_to_machine(XPADDR(__pa(v))))
-#define virt_to_mfn(v)		(pfn_to_mfn(PFN_DOWN(__pa(v))))
+#define virt_to_pfn(v)          (PFN_DOWN(__pa(v)))
+#define virt_to_mfn(v)		(pfn_to_mfn(virt_to_pfn(v)))
 #define mfn_to_virt(m)		(__va(mfn_to_pfn(m) << PAGE_SHIFT))
 
 static inline unsigned long pte_mfn(pte_t pte)
-- 
cgit v1.2.3-59-g8ed1b


From b96229b50d71c548302053c244b89572a5264c0b Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 17 Mar 2009 13:30:55 -0700
Subject: xen/mmu: some early pagetable cleanups

1. make sure early-allocated ptes are pinned, so they can be later
   unpinned
2. don't pin pmd+pud, just make them RO
3. scatter some __inits around

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/mmu.c     | 40 ++++++++++++++++++++++++++++------------
 arch/x86/xen/xen-ops.h |  2 --
 2 files changed, 28 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 6e58acd4d00d..4db24e1393a2 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1019,7 +1019,7 @@ static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
 	return 0;
 }
 
-void __init xen_mark_init_mm_pinned(void)
+static void __init xen_mark_init_mm_pinned(void)
 {
 	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
 }
@@ -1470,10 +1470,29 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
 }
 #endif
 
+static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+	struct mmuext_op op;
+	op.cmd = cmd;
+	op.arg1.mfn = pfn_to_mfn(pfn);
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+		BUG();
+}
+
 /* Early in boot, while setting up the initial pagetable, assume
    everything is pinned. */
 static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
 {
+#ifdef CONFIG_FLATMEM
+	BUG_ON(mem_map);	/* should only be used early */
+#endif
+	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+	pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+}
+
+/* Used for pmd and pud */
+static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
+{
 #ifdef CONFIG_FLATMEM
 	BUG_ON(mem_map);	/* should only be used early */
 #endif
@@ -1482,18 +1501,15 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
 
 /* Early release_pte assumes that all pts are pinned, since there's
    only init_mm and anything attached to that is pinned. */
-static void xen_release_pte_init(unsigned long pfn)
+static __init void xen_release_pte_init(unsigned long pfn)
 {
+	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
 	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
 }
 
-static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+static __init void xen_release_pmd_init(unsigned long pfn)
 {
-	struct mmuext_op op;
-	op.cmd = cmd;
-	op.arg1.mfn = pfn_to_mfn(pfn);
-	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-		BUG();
+	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
 }
 
 /* This needs to make sure the new pte page is pinned iff its being
@@ -1874,9 +1890,9 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
 
 	.alloc_pte = xen_alloc_pte_init,
 	.release_pte = xen_release_pte_init,
-	.alloc_pmd = xen_alloc_pte_init,
+	.alloc_pmd = xen_alloc_pmd_init,
 	.alloc_pmd_clone = paravirt_nop,
-	.release_pmd = xen_release_pte_init,
+	.release_pmd = xen_release_pmd_init,
 
 #ifdef CONFIG_HIGHPTE
 	.kmap_atomic_pte = xen_kmap_atomic_pte,
@@ -1914,8 +1930,8 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
 	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
 	.set_pgd = xen_set_pgd_hyper,
 
-	.alloc_pud = xen_alloc_pte_init,
-	.release_pud = xen_release_pte_init,
+	.alloc_pud = xen_alloc_pmd_init,
+	.release_pud = xen_release_pmd_init,
 #endif	/* PAGETABLE_LEVELS == 4 */
 
 	.activate_mm = xen_activate_mm,
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 2f5ef2632ea2..20139464943c 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -57,8 +57,6 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
 
 bool xen_vcpu_stolen(int vcpu);
 
-void xen_mark_init_mm_pinned(void);
-
 void xen_setup_vcpu_info_placement(void);
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3-59-g8ed1b


From e3f8a74e3a884b91a4390c66ed8175ef74db7067 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 4 Mar 2009 17:36:57 -0800
Subject: xen/mmu: weaken flush_tlb_other test

Impact: fixes crashing bug

There's no particular problem with getting an empty cpu mask,
so just shortcut-return if we get one.

Avoids crash reported by Christophe Saout <christophe@saout.de>

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 4db24e1393a2..7ef880c51dca 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1302,8 +1302,8 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
 	} *args;
 	struct multicall_space mcs;
 
-	BUG_ON(cpumask_empty(cpus));
-	BUG_ON(!mm);
+	if (cpumask_empty(cpus))
+		return;		/* nothing to do */
 
 	mcs = xen_mc_entry(sizeof(*args));
 	args = mcs.args;
-- 
cgit v1.2.3-59-g8ed1b


From bc6081ff98eec627919e0c68415e46a78fe51dc4 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 27 Mar 2009 11:29:02 -0700
Subject: xen: set _PAGE_NX in __supported_pte_mask before pagetable
 construction

Some 64-bit machines don't support the NX flag in ptes.
Check for NX before constructing the kernel pagetables.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/enlighten.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 69de19168a7e..f7d0fd7ff8e1 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -42,6 +42,7 @@
 #include <asm/xen/hypervisor.h>
 #include <asm/fixmap.h>
 #include <asm/processor.h>
+#include <asm/proto.h>
 #include <asm/msr-index.h>
 #include <asm/setup.h>
 #include <asm/desc.h>
@@ -914,7 +915,6 @@ static const struct machine_ops __initdata xen_machine_ops = {
 	.emergency_restart = xen_emergency_restart,
 };
 
-
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
 {
@@ -982,6 +982,11 @@ asmlinkage void __init xen_start_kernel(void)
 	if (!xen_initial_domain())
 		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
 
+#ifdef CONFIG_X86_64
+	/* Work out if we support NX */
+	check_efer();
+#endif
+
 	/* Don't do the full vcpu_info placement stuff until we have a
 	   possible map and a non-dummy shared_info. */
 	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
-- 
cgit v1.2.3-59-g8ed1b


From 2b2a733447b2bce5fef053df38412e4c0634ec22 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Sun, 29 Mar 2009 22:57:15 -0700
Subject: xen: clean up gate trap/interrupt constants

Use GATE_INTERRUPT/TRAP rather than 0xe/f.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/enlighten.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f7d0fd7ff8e1..f09e8c36ee80 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -428,7 +428,7 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
 static int cvt_gate_to_trap(int vector, const gate_desc *val,
 			    struct trap_info *info)
 {
-	if (val->type != 0xf && val->type != 0xe)
+	if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
 		return 0;
 
 	info->vector = vector;
@@ -436,8 +436,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
 	info->cs = gate_segment(*val);
 	info->flags = val->dpl;
 	/* interrupt gates clear IF */
-	if (val->type == 0xe)
-		info->flags |= 4;
+	if (val->type == GATE_INTERRUPT)
+		info->flags |= 1 << 2;
 
 	return 1;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 3ecb1b7df92393647b13b21b1f7142b65c582511 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Sat, 7 Mar 2009 23:48:41 -0800
Subject: xen: add FIX_TEXT_POKE to fixmap

FIX_TEXT_POKE[01] are used to map kernel addresses, so they're mapping
pfns, not mfns.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/mmu.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 7ef880c51dca..c3061d318da8 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1821,6 +1821,9 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
 #ifdef CONFIG_X86_LOCAL_APIC
 	case FIX_APIC_BASE:	/* maps dummy local APIC */
 #endif
+	case FIX_TEXT_POKE0:
+	case FIX_TEXT_POKE1:
+		/* All local page mappings */
 		pte = pfn_pte(phys, prot);
 		break;
 
-- 
cgit v1.2.3-59-g8ed1b


From 43a432b1559798d33970261f710030f787770231 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Thu, 9 Apr 2009 14:26:47 -0700
Subject: x86, CPA: Change idmap attribute before ioremap attribute setup

Change the identity mapping with the requested attribute first, before
we setup the virtual memory mapping with the new requested attribute.

This makes sure that there is no window when identity map'ed attribute
may disagree with ioremap range on the attribute type.

This also avoids doing cpa on the ioremap'ed address twice (first in
ioremap_page_range and then in ioremap_change_attr using vaddr), and
should improve ioremap performance a bit.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
LKML-Reference: <20090409212708.373330000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/ioremap.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 0dfa09d69e80..329387eca12a 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -280,15 +280,16 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 		return NULL;
 	area->phys_addr = phys_addr;
 	vaddr = (unsigned long) area->addr;
-	if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
+
+	if (kernel_map_sync_memtype(phys_addr, size, prot_val)) {
 		free_memtype(phys_addr, phys_addr + size);
 		free_vm_area(area);
 		return NULL;
 	}
 
-	if (ioremap_change_attr(vaddr, size, prot_val) < 0) {
+	if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
 		free_memtype(phys_addr, phys_addr + size);
-		vunmap(area->addr);
+		free_vm_area(area);
 		return NULL;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From a5593e0b329a14dea41ea173380dbf1533de2bd2 Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Thu, 9 Apr 2009 14:26:48 -0700
Subject: x86, PAT: Change order of cpa and free in set_memory_wb

To be free of aliasing due to races, set_memory_* interfaces should
follow ordering of reserving, changing memtype to UC/WC, changing
memtype back to WB followed by free.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <20090409212708.512280000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pageattr.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index d71e1b636ce6..d487eaa17bff 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1021,15 +1021,19 @@ int _set_memory_wb(unsigned long addr, int numpages)
 
 int set_memory_wb(unsigned long addr, int numpages)
 {
+	int ret = _set_memory_wb(addr, numpages);
 	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
-
-	return _set_memory_wb(addr, numpages);
+	return ret;
 }
 EXPORT_SYMBOL(set_memory_wb);
 
 int set_memory_array_wb(unsigned long *addr, int addrinarray)
 {
 	int i;
+	int ret;
+
+	ret = change_page_attr_clear(addr, addrinarray,
+				      __pgprot(_PAGE_CACHE_MASK), 1);
 
 	for (i = 0; i < addrinarray; i++) {
 		unsigned long start = __pa(addr[i]);
@@ -1042,8 +1046,7 @@ int set_memory_array_wb(unsigned long *addr, int addrinarray)
 		}
 		free_memtype(start, end);
 	}
-	return change_page_attr_clear(addr, addrinarray,
-				      __pgprot(_PAGE_CACHE_MASK), 1);
+	return ret;
 }
 EXPORT_SYMBOL(set_memory_array_wb);
 
-- 
cgit v1.2.3-59-g8ed1b


From 9fa3ab390abfc8b49fc0dd7c845b0ad224ec429f Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Thu, 9 Apr 2009 14:26:49 -0700
Subject: x86, PAT: Handle faults cleanly in set_memory_ APIs

Handle faults and do proper cleanups in set_memory_*() functions. In
some cases, these functions were not doing proper free on failure paths.

With the changes to tracking memtype of RAM pages in struct page instead
of pat list, we do not need the changes in commits c5e147. This patch
reverts that change.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <20090409212708.653222000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pageattr.c | 113 ++++++++++++++++++++++++++++---------------------
 1 file changed, 65 insertions(+), 48 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index d487eaa17bff..985eef80c552 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -945,52 +945,56 @@ int _set_memory_uc(unsigned long addr, int numpages)
 
 int set_memory_uc(unsigned long addr, int numpages)
 {
+	int ret;
+
 	/*
 	 * for now UC MINUS. see comments in ioremap_nocache()
 	 */
-	if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
-			    _PAGE_CACHE_UC_MINUS, NULL))
-		return -EINVAL;
+	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+			    _PAGE_CACHE_UC_MINUS, NULL);
+	if (ret)
+		goto out_err;
+
+	ret = _set_memory_uc(addr, numpages);
+	if (ret)
+		goto out_free;
+
+	return 0;
 
-	return _set_memory_uc(addr, numpages);
+out_free:
+	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+out_err:
+	return ret;
 }
 EXPORT_SYMBOL(set_memory_uc);
 
 int set_memory_array_uc(unsigned long *addr, int addrinarray)
 {
-	unsigned long start;
-	unsigned long end;
-	int i;
+	int i, j;
+	int ret;
+
 	/*
 	 * for now UC MINUS. see comments in ioremap_nocache()
 	 */
 	for (i = 0; i < addrinarray; i++) {
-		start = __pa(addr[i]);
-		for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
-			if (end != __pa(addr[i + 1]))
-				break;
-			i++;
-		}
-		if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
-			goto out;
+		ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
+					_PAGE_CACHE_UC_MINUS, NULL);
+		if (ret)
+			goto out_free;
 	}
 
-	return change_page_attr_set(addr, addrinarray,
+	ret = change_page_attr_set(addr, addrinarray,
 				    __pgprot(_PAGE_CACHE_UC_MINUS), 1);
-out:
-	for (i = 0; i < addrinarray; i++) {
-		unsigned long tmp = __pa(addr[i]);
-
-		if (tmp == start)
-			break;
-		for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
-			if (end != __pa(addr[i + 1]))
-				break;
-			i++;
-		}
-		free_memtype(tmp, end);
-	}
-	return -EINVAL;
+	if (ret)
+		goto out_free;
+
+	return 0;
+
+out_free:
+	for (j = 0; j < i; j++)
+		free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE);
+
+	return ret;
 }
 EXPORT_SYMBOL(set_memory_array_uc);
 
@@ -1002,14 +1006,26 @@ int _set_memory_wc(unsigned long addr, int numpages)
 
 int set_memory_wc(unsigned long addr, int numpages)
 {
+	int ret;
+
 	if (!pat_enabled)
 		return set_memory_uc(addr, numpages);
 
-	if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
-		_PAGE_CACHE_WC, NULL))
-		return -EINVAL;
+	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+		_PAGE_CACHE_WC, NULL);
+	if (ret)
+		goto out_err;
 
-	return _set_memory_wc(addr, numpages);
+	ret = _set_memory_wc(addr, numpages);
+	if (ret)
+		goto out_free;
+
+	return 0;
+
+out_free:
+	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+out_err:
+	return ret;
 }
 EXPORT_SYMBOL(set_memory_wc);
 
@@ -1021,9 +1037,14 @@ int _set_memory_wb(unsigned long addr, int numpages)
 
 int set_memory_wb(unsigned long addr, int numpages)
 {
-	int ret = _set_memory_wb(addr, numpages);
+	int ret;
+
+	ret = _set_memory_wb(addr, numpages);
+	if (ret)
+		return ret;
+
 	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(set_memory_wb);
 
@@ -1034,19 +1055,13 @@ int set_memory_array_wb(unsigned long *addr, int addrinarray)
 
 	ret = change_page_attr_clear(addr, addrinarray,
 				      __pgprot(_PAGE_CACHE_MASK), 1);
+	if (ret)
+		return ret;
 
-	for (i = 0; i < addrinarray; i++) {
-		unsigned long start = __pa(addr[i]);
-		unsigned long end;
+	for (i = 0; i < addrinarray; i++)
+		free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);
 
-		for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
-			if (end != __pa(addr[i + 1]))
-				break;
-			i++;
-		}
-		free_memtype(start, end);
-	}
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(set_memory_array_wb);
 
@@ -1139,6 +1154,8 @@ int set_pages_array_wb(struct page **pages, int addrinarray)
 
 	retval = cpa_clear_pages_array(pages, addrinarray,
 			__pgprot(_PAGE_CACHE_MASK));
+	if (retval)
+		return retval;
 
 	for (i = 0; i < addrinarray; i++) {
 		start = (unsigned long)page_address(pages[i]);
@@ -1146,7 +1163,7 @@ int set_pages_array_wb(struct page **pages, int addrinarray)
 		free_memtype(start, end);
 	}
 
-	return retval;
+	return 0;
 }
 EXPORT_SYMBOL(set_pages_array_wb);
 
-- 
cgit v1.2.3-59-g8ed1b


From 3869c4aa18835c8c61b44bd0f3ace36e9d3b5bd0 Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Thu, 9 Apr 2009 14:26:50 -0700
Subject: x86, PAT: Changing memtype to WC ensuring no WB alias

As per SDM, there should not be any aliasing of a WC with any cacheable
type across CPUs. That is if one CPU is changing the identity map
memtype to _WC, no other CPU at the time of this change should not have a
TLB for this page that carries a WB attribute. SDM suggests to make the
page not present. But for that we will have to handle any page faults
that can potentially happen due to these pages being not present.

Other way to deal with this without having any WB mapping is to change
the page first to UC and then to WC. This ensures that we meet the SDM
requirement of no cacheable alais to WC page. This also has same or
lower overhead than marking the page not present and making it present
later.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <20090409212708.797481000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pageattr.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 985eef80c552..797f9f107cb6 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1000,8 +1000,15 @@ EXPORT_SYMBOL(set_memory_array_uc);
 
 int _set_memory_wc(unsigned long addr, int numpages)
 {
-	return change_page_attr_set(&addr, numpages,
+	int ret;
+	ret = change_page_attr_set(&addr, numpages,
+				    __pgprot(_PAGE_CACHE_UC_MINUS), 0);
+
+	if (!ret) {
+		ret = change_page_attr_set(&addr, numpages,
 				    __pgprot(_PAGE_CACHE_WC), 0);
+	}
+	return ret;
 }
 
 int set_memory_wc(unsigned long addr, int numpages)
-- 
cgit v1.2.3-59-g8ed1b


From b6ff32d9aaeeeecf98f9a852d715569183585312 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Thu, 9 Apr 2009 14:26:51 -0700
Subject: x86, PAT: Consolidate code in pat_x_mtrr_type() and reserve_memtype()

Fix pat_x_mtrr_type() to use UC_MINUS when the mtrr type return UC. This
is to be  consistent with ioremap() and ioremap_nocache() which uses
UC_MINUS.

Consolidate the code such that reserve_memtype() also uses
pat_x_mtrr_type() when the caller doesn't specify any special attribute
(non WB attribute).

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
LKML-Reference: <20090409212708.939936000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/ioremap.c |  3 ++-
 arch/x86/mm/pat.c     | 35 +++++++++++++----------------------
 2 files changed, 15 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 329387eca12a..d4c4b2c4dbbe 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -375,7 +375,8 @@ static void __iomem *ioremap_default(resource_size_t phys_addr,
 	 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
 	 * - Inherit from confliting mappings otherwise
 	 */
-	err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
+	err = reserve_memtype(phys_addr, phys_addr + size,
+				_PAGE_CACHE_WB, &flags);
 	if (err < 0)
 		return NULL;
 
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 640339ee4fb2..8d3de9580508 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -182,10 +182,10 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
 		u8 mtrr_type;
 
 		mtrr_type = mtrr_type_lookup(start, end);
-		if (mtrr_type == MTRR_TYPE_UNCACHABLE)
-			return _PAGE_CACHE_UC;
-		if (mtrr_type == MTRR_TYPE_WRCOMB)
-			return _PAGE_CACHE_WC;
+		if (mtrr_type != MTRR_TYPE_WRBACK)
+			return _PAGE_CACHE_UC_MINUS;
+
+		return _PAGE_CACHE_WB;
 	}
 
 	return req_type;
@@ -352,23 +352,13 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 		return 0;
 	}
 
-	if (req_type == -1) {
-		/*
-		 * Call mtrr_lookup to get the type hint. This is an
-		 * optimization for /dev/mem mmap'ers into WB memory (BIOS
-		 * tools and ACPI tools). Use WB request for WB memory and use
-		 * UC_MINUS otherwise.
-		 */
-		u8 mtrr_type = mtrr_type_lookup(start, end);
-
-		if (mtrr_type == MTRR_TYPE_WRBACK)
-			actual_type = _PAGE_CACHE_WB;
-		else
-			actual_type = _PAGE_CACHE_UC_MINUS;
-	} else {
-		actual_type = pat_x_mtrr_type(start, end,
-					      req_type & _PAGE_CACHE_MASK);
-	}
+	/*
+	 * Call mtrr_lookup to get the type hint. This is an
+	 * optimization for /dev/mem mmap'ers into WB memory (BIOS
+	 * tools and ACPI tools). Use WB request for WB memory and use
+	 * UC_MINUS otherwise.
+	 */
+	actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK);
 
 	if (new_type)
 		*new_type = actual_type;
@@ -587,7 +577,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 	if (flags != -1) {
 		retval = reserve_memtype(offset, offset + size, flags, NULL);
 	} else {
-		retval = reserve_memtype(offset, offset + size, -1, &flags);
+		retval = reserve_memtype(offset, offset + size,
+					_PAGE_CACHE_WB, &flags);
 	}
 
 	if (retval < 0)
-- 
cgit v1.2.3-59-g8ed1b


From 0c3c8a18361a636069f5a5d9d0d0f9c2124e6b94 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Thu, 9 Apr 2009 14:26:52 -0700
Subject: x86, PAT: Remove duplicate memtype reserve in devmem mmap

/dev/mem mmap code was doing memtype reserve/free for a while now.
Recently we added memtype tracking in remap_pfn_range, and /dev/mem mmap
uses it indirectly. So, we don't need seperate tracking in /dev/mem code
any more. That means another ~100 lines of code removed :-).

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
LKML-Reference: <20090409212709.085210000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/pat.h |  4 ----
 arch/x86/mm/pat.c          | 60 ++--------------------------------------------
 drivers/char/mem.c         | 27 ---------------------
 3 files changed, 2 insertions(+), 89 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index 2cd07b9422f4..7af14e512f97 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -18,9 +18,5 @@ extern int free_memtype(u64 start, u64 end);
 
 extern int kernel_map_sync_memtype(u64 base, unsigned long size,
 		unsigned long flag);
-extern void map_devmem(unsigned long pfn, unsigned long size,
-		       struct pgprot vma_prot);
-extern void unmap_devmem(unsigned long pfn, unsigned long size,
-			 struct pgprot vma_prot);
 
 #endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 8d3de9580508..cc5e0e24e443 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -536,9 +536,7 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 				unsigned long size, pgprot_t *vma_prot)
 {
-	u64 offset = ((u64) pfn) << PAGE_SHIFT;
-	unsigned long flags = -1;
-	int retval;
+	unsigned long flags = _PAGE_CACHE_WB;
 
 	if (!range_is_allowed(pfn, size))
 		return 0;
@@ -566,65 +564,11 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 	}
 #endif
 
-	/*
-	 * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
-	 *
-	 * Without O_SYNC, we want to get
-	 * - WB for WB-able memory and no other conflicting mappings
-	 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
-	 * - Inherit from confliting mappings otherwise
-	 */
-	if (flags != -1) {
-		retval = reserve_memtype(offset, offset + size, flags, NULL);
-	} else {
-		retval = reserve_memtype(offset, offset + size,
-					_PAGE_CACHE_WB, &flags);
-	}
-
-	if (retval < 0)
-		return 0;
-
-	if (((pfn < max_low_pfn_mapped) ||
-	     (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) &&
-	    ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
-		free_memtype(offset, offset + size);
-		printk(KERN_INFO
-		"%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
-			current->comm, current->pid,
-			cattr_name(flags),
-			offset, (unsigned long long)(offset + size));
-		return 0;
-	}
-
 	*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
 			     flags);
 	return 1;
 }
 
-void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
-{
-	unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
-	u64 addr = (u64)pfn << PAGE_SHIFT;
-	unsigned long flags;
-
-	reserve_memtype(addr, addr + size, want_flags, &flags);
-	if (flags != want_flags) {
-		printk(KERN_INFO
-		"%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n",
-			current->comm, current->pid,
-			cattr_name(want_flags),
-			addr, (unsigned long long)(addr + size),
-			cattr_name(flags));
-	}
-}
-
-void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
-{
-	u64 addr = (u64)pfn << PAGE_SHIFT;
-
-	free_memtype(addr, addr + size);
-}
-
 /*
  * Change the memory type for the physial address range in kernel identity
  * mapping space if that range is a part of identity map.
@@ -662,8 +606,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 {
 	int is_ram = 0;
 	int ret;
-	unsigned long flags;
 	unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK);
+	unsigned long flags = want_flags;
 
 	is_ram = pat_pagerange_is_ram(paddr, paddr + size);
 
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 3586b3b3df3f..8f05c38c2f06 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -301,33 +301,7 @@ static inline int private_mapping_ok(struct vm_area_struct *vma)
 }
 #endif
 
-void __attribute__((weak))
-map_devmem(unsigned long pfn, unsigned long len, pgprot_t prot)
-{
-	/* nothing. architectures can override. */
-}
-
-void __attribute__((weak))
-unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t prot)
-{
-	/* nothing. architectures can override. */
-}
-
-static void mmap_mem_open(struct vm_area_struct *vma)
-{
-	map_devmem(vma->vm_pgoff,  vma->vm_end - vma->vm_start,
-			vma->vm_page_prot);
-}
-
-static void mmap_mem_close(struct vm_area_struct *vma)
-{
-	unmap_devmem(vma->vm_pgoff,  vma->vm_end - vma->vm_start,
-			vma->vm_page_prot);
-}
-
 static struct vm_operations_struct mmap_mem_ops = {
-	.open  = mmap_mem_open,
-	.close = mmap_mem_close,
 #ifdef CONFIG_HAVE_IOREMAP_PROT
 	.access = generic_access_phys
 #endif
@@ -362,7 +336,6 @@ static int mmap_mem(struct file * file, struct vm_area_struct * vma)
 			    vma->vm_pgoff,
 			    size,
 			    vma->vm_page_prot)) {
-		unmap_devmem(vma->vm_pgoff, size, vma->vm_page_prot);
 		return -EAGAIN;
 	}
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From ff6c6fed3a8ab9b0a7b02574e095e905e89421d9 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sun, 12 Apr 2009 23:24:21 +0530
Subject: x86: pci-swiotlb.c swiotlb_dma_ops should be static

Impact: reduce kernel size a bit, address sparse warning

Addresses the problem pointed out by this sparse warning:

  arch/x86/kernel/pci-swiotlb.c:53:20: warning: symbol 'swiotlb_dma_ops' was not declared. Should it be static?

For x86: swiotlb_dma_ops can be static, because it's not used outside
of pci-swiotlb.c

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
LKML-Reference: <1239558861.3938.2.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/pci-swiotlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 34f12e9996ed..221a3853e268 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -50,7 +50,7 @@ static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 	return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags);
 }
 
-struct dma_map_ops swiotlb_dma_ops = {
+static struct dma_map_ops swiotlb_dma_ops = {
 	.mapping_error = swiotlb_dma_mapping_error,
 	.alloc_coherent = x86_swiotlb_alloc_coherent,
 	.free_coherent = swiotlb_free_coherent,
-- 
cgit v1.2.3-59-g8ed1b


From 1c98aa7424ff163637d8321674ec58dee28152d4 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 13 Apr 2009 18:09:20 -0700
Subject: Fix quilt merge error in acpi-cpufreq.c

We ended up incorrectly using '&cur' instead of '&readin' in the
work_on_cpu() -> smp_call_function_single() transformation in commit
01599fca6758d2cd133e78f87426fc851c9ea725 ("cpufreq: use
smp_call_function_[single|many]() in acpi-cpufreq.c").

Andrew explains:
 "OK, the acpi tree went and had conflicting changes merged into it after
  I'd written the patch and it appears that I incorrectly reverted part
  of 18b2646fe3babeb40b34a0c1751e0bf5adfdc64c while fixing the resulting
  rejects.

  Switching it to `readin' looks correct."

Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 3e3cd3db7a0c..837c2c4cc203 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -277,7 +277,7 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
 	unsigned int perf_percent;
 	unsigned int retval;
 
-	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &cur, 1))
+	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1))
 		return 0;
 
 	cur.aperf.whole = readin.aperf.whole -
-- 
cgit v1.2.3-59-g8ed1b


From 6ec3cfeca04622e3d80c9270191cd7f5f88214af Mon Sep 17 00:00:00 2001
From: "Pallipadi, Venkatesh" <venkatesh.pallipadi@intel.com>
Date: Mon, 13 Apr 2009 15:20:58 -0700
Subject: x86, irq: Remove IRQ_DISABLED check in process context IRQ move

As discussed in the thread here:

  http://marc.info/?l=linux-kernel&m=123964468521142&w=2

Eric W. Biederman observed:

> It looks like some additional bugs have slipped in since last I looked.
>
> set_irq_affinity does this:
> ifdef CONFIG_GENERIC_PENDING_IRQ
>        if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
>                cpumask_copy(desc->affinity, cpumask);
>                desc->chip->set_affinity(irq, cpumask);
>        } else {
>                desc->status |= IRQ_MOVE_PENDING;
>                cpumask_copy(desc->pending_mask, cpumask);
>        }
> #else
>
> That IRQ_DISABLED case is a software state and as such it has nothing to
> do with how safe it is to move an irq in process context.

[...]

>
> The only reason we migrate MSIs in interrupt context today is that there
> wasn't infrastructure for support migration both in interrupt context
> and outside of it.

Yes. The idea here was to force the MSI migration to happen in process
context. One of the patches in the series did

        disable_irq(dev->irq);
        irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
        enable_irq(dev->irq);

with the above patch adding irq/manage code check for interrupt disabled
and moving the interrupt in process context.

IIRC, there was no IRQ_MOVE_PCNTXT when we were developing this HPET
code and we ended up having this ugly hack. IRQ_MOVE_PCNTXT was there
when we eventually submitted the patch upstream. But, looks like I did a
blind rebasing instead of using IRQ_MOVE_PCNTXT in hpet MSI code.

Below patch fixes this. i.e., revert commit 932775a4ab622e3c99bd59f14cc
and add PCNTXT to HPET MSI setup. Also removes copying of desc->affinity
in generic code as set_affinity routines are doing it internally.

Reported-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "Li Shaohua" <shaohua.li@intel.com>
Cc: Gary Hade <garyhade@us.ibm.com>
Cc: "lcm@us.ibm.com" <lcm@us.ibm.com>
Cc: suresh.b.siddha@intel.com
LKML-Reference: <20090413222058.GB8211@linux-os.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/io_apic.c | 2 ++
 kernel/irq/manage.c            | 5 ++---
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index a2789e42e162..30da617d18e4 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3670,12 +3670,14 @@ int arch_setup_hpet_msi(unsigned int irq)
 {
 	int ret;
 	struct msi_msg msg;
+	struct irq_desc *desc = irq_to_desc(irq);
 
 	ret = msi_compose_msg(NULL, irq, &msg);
 	if (ret < 0)
 		return ret;
 
 	hpet_msi_write(irq, &msg);
+	desc->status |= IRQ_MOVE_PCNTXT;
 	set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq,
 		"edge");
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7e2e7dd4cd2f..2734eca59243 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -109,10 +109,9 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 	spin_lock_irqsave(&desc->lock, flags);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-	if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
-		cpumask_copy(desc->affinity, cpumask);
+	if (desc->status & IRQ_MOVE_PCNTXT)
 		desc->chip->set_affinity(irq, cpumask);
-	} else {
+	else {
 		desc->status |= IRQ_MOVE_PENDING;
 		cpumask_copy(desc->pending_mask, cpumask);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 94ca8e4852807fc42d2f64fcaf248aafc4f2e6a7 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Tue, 14 Apr 2009 10:56:48 -0500
Subject: x86: UV: BAU partition-relative distribution map

This patch enables each partition's BAU distribution bit map
to be partition-relative.

The distribution bitmap had been constructed assuming 0 as the base
node number.  That construct would not have allowed a total system of
greater than 256 nodes.
It also corrects an error that occurred when the first blade's nasid
was not zero.  That nasid was stored as the base node.
The base node number gets added by hardware to the node numbers implied
in the distribution bitmap, resulting in invalid target nasids.

Tested on the UV hardware simulator.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
LKML-Reference: <E1Ltl0C-0004Ob-37@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index fced96e94e22..98307f953492 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -25,6 +25,8 @@ static int			uv_bau_retry_limit __read_mostly;
 
 /* position of pnode (which is nasid>>1): */
 static int			uv_nshift __read_mostly;
+/* base pnode in this partition */
+static int			uv_partition_base_pnode __read_mostly;
 
 static unsigned long		uv_mmask __read_mostly;
 
@@ -43,7 +45,7 @@ static int __init blade_to_first_node(int blade)
 		if (blade == b)
 			return node;
 	}
-	BUG();
+	return -1; /* shouldn't happen */
 }
 
 /*
@@ -359,7 +361,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 			locals++;
 			continue;
 		}
-		bau_node_set(pnode, &bau_desc->distribution);
+		bau_node_set(pnode - uv_partition_base_pnode,
+				&bau_desc->distribution);
 		i++;
 	}
 	if (i == 0) {
@@ -728,7 +731,12 @@ uv_activation_descriptor_init(int node, int pnode)
 	for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
 		memset(ad2, 0, sizeof(struct bau_desc));
 		ad2->header.sw_ack_flag = 1;
-		ad2->header.base_dest_nodeid = uv_cpu_to_pnode(0);
+		/*
+		 * base_dest_nodeid is the first node in the partition, so
+		 * the bit map will indicate partition-relative node numbers.
+		 * note that base_dest_nodeid is actually a nasid.
+		 */
+		ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
 		ad2->header.command = UV_NET_ENDPOINT_INTD;
 		ad2->header.int_both = 1;
 		/*
@@ -825,6 +833,11 @@ static int __init uv_bau_init(void)
 	    kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
 	BUG_ON(!uv_bau_table_bases);
 
+	uv_partition_base_pnode = 0x7fffffff;
+	for (blade = 0; blade < nblades; blade++)
+		if (uv_blade_nr_possible_cpus(blade) &&
+			(uv_blade_to_pnode(blade) < uv_partition_base_pnode))
+			uv_partition_base_pnode = uv_blade_to_pnode(blade);
 	for (blade = 0; blade < nblades; blade++)
 		if (uv_blade_nr_possible_cpus(blade))
 			uv_init_blade(blade);
-- 
cgit v1.2.3-59-g8ed1b


From 6f66cbc63081fd70e3191b4dbb796746780e5ae1 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Tue, 14 Apr 2009 19:25:42 +0100
Subject: x86 microcode: revert some work_on_cpu

Revert part of af5c820a3169e81af869c113e18ec7588836cd50 ("x86: cpumask:
use work_on_cpu in arch/x86/kernel/microcode_core.c")

That change is causing only one Intel CPU's microcode to be updated e.g.
microcode: CPU3 updated from revision 0x9 to 0x17, date = 2005-04-22
where before it announced that also for CPU0 and CPU1 and CPU2.

We cannot use work_on_cpu() in the CONFIG_MICROCODE_OLD_INTERFACE code,
because Intel's request_microcode_user() involves a copy_from_user() from
/sbin/microcode_ctl, which therefore needs to be on that CPU at the time.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/microcode_core.c | 33 +++++++++++----------------------
 1 file changed, 11 insertions(+), 22 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index a0f3851ef310..2e0eb4140951 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -108,40 +108,29 @@ struct ucode_cpu_info		ucode_cpu_info[NR_CPUS];
 EXPORT_SYMBOL_GPL(ucode_cpu_info);
 
 #ifdef CONFIG_MICROCODE_OLD_INTERFACE
-struct update_for_cpu {
-	const void __user	*buf;
-	size_t			size;
-};
-
-static long update_for_cpu(void *_ufc)
-{
-	struct update_for_cpu *ufc = _ufc;
-	int error;
-
-	error = microcode_ops->request_microcode_user(smp_processor_id(),
-						      ufc->buf, ufc->size);
-	if (error < 0)
-		return error;
-	if (!error)
-		microcode_ops->apply_microcode(smp_processor_id());
-	return error;
-}
-
 static int do_microcode_update(const void __user *buf, size_t size)
 {
+	cpumask_t old;
 	int error = 0;
 	int cpu;
-	struct update_for_cpu ufc = { .buf = buf, .size = size };
+
+	old = current->cpus_allowed;
 
 	for_each_online_cpu(cpu) {
 		struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
 		if (!uci->valid)
 			continue;
-		error = work_on_cpu(cpu, update_for_cpu, &ufc);
+
+		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
+		error = microcode_ops->request_microcode_user(cpu, buf, size);
 		if (error < 0)
-			break;
+			goto out;
+		if (!error)
+			microcode_ops->apply_microcode(cpu);
 	}
+out:
+	set_cpus_allowed_ptr(current, &old);
 	return error;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From ea34f43a074af85823e49b9bf62f47d8d3f0e81a Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 15 Apr 2009 08:05:13 -0700
Subject: acpi-cpufreq: fix 'smp_call_function_many()' confusion

It turns out that 'smp_call_function_many()' doesn't work at all like
'smp_call_function_single()', and my change to Andrew's patch to use it
rather than a loop over all CPU's acpi-cpufreq doesn't work.

My bad.

'smp_call_function_many()' has two "features" (aka "documented bugs"):

 (a) it needs to be called with preemption disabled, because it uses
     smp_processor_id() without guarding the CPU lookup with 'get_cpu()'
     and 'put_cpu()' like the 'single' variant does.

 (b) even if the current CPU is part of the CPU mask, it won't do the
     call on that CPU.

Still, we're better off trying to use 'smp_call_function_many()' than
looping over CPU's, since it at least in theory allows us to use a
broadcast IPI and do it all in parallel.  So let's just work around the
silly semantic bugs in that function.

Reported-and-tested-by: Ali Gholami Rudi <ali@rudi.ir>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andrew Morton <akpm@linux-foundation.org>,
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Dave Jones <davej@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 837c2c4cc203..ecdb682ab516 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -204,7 +204,13 @@ static void drv_read(struct drv_cmd *cmd)
 
 static void drv_write(struct drv_cmd *cmd)
 {
+	int this_cpu;
+
+	this_cpu = get_cpu();
+	if (cpumask_test_cpu(this_cpu, cmd->mask))
+		do_drv_write(cmd);
 	smp_call_function_many(cmd->mask, do_drv_write, cmd, 1);
+	put_cpu();
 }
 
 static u32 get_cur_val(const struct cpumask *mask)
-- 
cgit v1.2.3-59-g8ed1b


From d45b41ae8da0f54aec0eebcc6f893ba5f22a1e8e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 15 Apr 2009 23:15:14 +0200
Subject: x86: disable X86_PTRACE_BTS for now

Oleg Nesterov found a couple of races in the ptrace-bts code
and fixes are queued up for it but they did not get ready in time
for the merge window. We'll merge them in v2.6.31 - until then
mark the feature as CONFIG_BROKEN. There's no user-space yet
making use of this so it's not a big issue.

Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.cpu | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 924e156a85ab..8130334329c0 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -506,6 +506,7 @@ config X86_PTRACE_BTS
 	bool "Branch Trace Store"
 	default y
 	depends on X86_DEBUGCTLMSR
+	depends on BROKEN
 	---help---
 	  This adds a ptrace interface to the hardware's branch trace store.
 
-- 
cgit v1.2.3-59-g8ed1b


From 4ea3c51d5bd3bb4eea7d7d3a1f80d1a48c2a6f92 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Thu, 16 Apr 2009 07:53:09 -0500
Subject: x86: UV BAU distribution and payload MMRs

This patch correctly sets BAU memory mapped registers to point
to the sending activation descriptor table and target payload table.

The "Broadcast Assist Unit" is used for TLB shootdown in UV.

The memory mapped registers that point to sending and receiving
memory structures contain node numbers.

In one case the __pa() function did not provide the node id of
memory on blade zero in configurations where that id is nonzero.
In another case, it was assumed that memory was allocated on
the local node.  That assumption is not true in a configuration
in which the node has no memory.

Tested on the UV hardware simulator.

[ Impact: fix possible runtime crash due to incorrect TLB logic ]

Signed-off-by: Cliff Wickman <cpw@sgi.com>
LKML-Reference: <E1LuR5Z-0007An-B8@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 98307f953492..78422336ddea 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -717,7 +717,7 @@ uv_activation_descriptor_init(int node, int pnode)
 	adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node);
 	BUG_ON(!adp);
 
-	pa = __pa((unsigned long)adp);
+	pa = uv_gpa(adp); /* need the real nasid*/
 	n = pa >> uv_nshift;
 	m = pa & uv_mmask;
 
@@ -754,6 +754,8 @@ static struct bau_payload_queue_entry * __init
 uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
 {
 	struct bau_payload_queue_entry *pqp;
+	unsigned long pa;
+	int pn;
 	char *cp;
 
 	pqp = (struct bau_payload_queue_entry *) kmalloc_node(
@@ -764,10 +766,14 @@ uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
 	cp = (char *)pqp + 31;
 	pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
 	bau_tablesp->va_queue_first = pqp;
+	/*
+	 * need the pnode of where the memory was really allocated
+	 */
+	pa = uv_gpa(pqp);
+	pn = pa >> uv_nshift;
 	uv_write_global_mmr64(pnode,
 			      UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
-			      ((unsigned long)pnode <<
-			       UV_PAYLOADQ_PNODE_SHIFT) |
+			      ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
 			      uv_physnodeaddr(pqp));
 	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
 			      uv_physnodeaddr(pqp));
-- 
cgit v1.2.3-59-g8ed1b


From 4b065046273afa01ec8e3de7da407e8d3599251d Mon Sep 17 00:00:00 2001
From: "Pallipadi, Venkatesh" <venkatesh.pallipadi@intel.com>
Date: Wed, 8 Apr 2009 15:37:16 -0700
Subject: x86, PAT: Remove page granularity tracking for vm_insert_pfn maps

This change resolves the problem of too many single page entries
in pat_memtype_list and "freeing invalid memtype" errors with i915,
reported here:

  http://marc.info/?l=linux-kernel&m=123845244713183&w=2

Remove page level granularity track and untrack of vm_insert_pfn.
memtype tracking at page granularity does not scale and cleaner
approach would be for the driver to request a type for a bigger
IO address range or PCI io memory range for that device, either at
mmap time or driver init time and just use that type during
vm_insert_pfn.

This patch just removes the track/untrack of vm_insert_pfn. That
means we will be in same state as 2.6.28, with respect to these APIs.

Newer APIs for the drivers to request a memtype for a bigger region
is coming soon.

[ Impact: fix Xorg startup warnings and hangs ]

Reported-by: Arkadiusz Miskiewicz <a.miskiewicz@gmail.com>
Tested-by: Arkadiusz Miskiewicz <a.miskiewicz@gmail.com>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
LKML-Reference: <20090408223716.GC3493@linux-os.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pat.c | 98 +++++++++++--------------------------------------------
 1 file changed, 19 insertions(+), 79 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index cc5e0e24e443..41c805718158 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -669,29 +669,28 @@ static void free_pfn_range(u64 paddr, unsigned long size)
  *
  * If the vma has a linear pfn mapping for the entire range, we get the prot
  * from pte and reserve the entire vma range with single reserve_pfn_range call.
- * Otherwise, we reserve the entire vma range, my ging through the PTEs page
- * by page to get physical address and protection.
  */
 int track_pfn_vma_copy(struct vm_area_struct *vma)
 {
-	int retval = 0;
-	unsigned long i, j;
 	resource_size_t paddr;
 	unsigned long prot;
-	unsigned long vma_start = vma->vm_start;
-	unsigned long vma_end = vma->vm_end;
-	unsigned long vma_size = vma_end - vma_start;
+	unsigned long vma_size = vma->vm_end - vma->vm_start;
 	pgprot_t pgprot;
 
 	if (!pat_enabled)
 		return 0;
 
+	/*
+	 * For now, only handle remap_pfn_range() vmas where
+	 * is_linear_pfn_mapping() == TRUE. Handling of
+	 * vm_insert_pfn() is TBD.
+	 */
 	if (is_linear_pfn_mapping(vma)) {
 		/*
 		 * reserve the whole chunk covered by vma. We need the
 		 * starting address and protection from pte.
 		 */
-		if (follow_phys(vma, vma_start, 0, &prot, &paddr)) {
+		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
 			WARN_ON_ONCE(1);
 			return -EINVAL;
 		}
@@ -699,28 +698,7 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
 		return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
 	}
 
-	/* reserve entire vma page by page, using pfn and prot from pte */
-	for (i = 0; i < vma_size; i += PAGE_SIZE) {
-		if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
-			continue;
-
-		pgprot = __pgprot(prot);
-		retval = reserve_pfn_range(paddr, PAGE_SIZE, &pgprot, 1);
-		if (retval)
-			goto cleanup_ret;
-	}
 	return 0;
-
-cleanup_ret:
-	/* Reserve error: Cleanup partial reservation and return error */
-	for (j = 0; j < i; j += PAGE_SIZE) {
-		if (follow_phys(vma, vma_start + j, 0, &prot, &paddr))
-			continue;
-
-		free_pfn_range(paddr, PAGE_SIZE);
-	}
-
-	return retval;
 }
 
 /*
@@ -730,50 +708,28 @@ cleanup_ret:
  * prot is passed in as a parameter for the new mapping. If the vma has a
  * linear pfn mapping for the entire range reserve the entire vma range with
  * single reserve_pfn_range call.
- * Otherwise, we look t the pfn and size and reserve only the specified range
- * page by page.
- *
- * Note that this function can be called with caller trying to map only a
- * subrange/page inside the vma.
  */
 int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
 			unsigned long pfn, unsigned long size)
 {
-	int retval = 0;
-	unsigned long i, j;
-	resource_size_t base_paddr;
 	resource_size_t paddr;
-	unsigned long vma_start = vma->vm_start;
-	unsigned long vma_end = vma->vm_end;
-	unsigned long vma_size = vma_end - vma_start;
+	unsigned long vma_size = vma->vm_end - vma->vm_start;
 
 	if (!pat_enabled)
 		return 0;
 
+	/*
+	 * For now, only handle remap_pfn_range() vmas where
+	 * is_linear_pfn_mapping() == TRUE. Handling of
+	 * vm_insert_pfn() is TBD.
+	 */
 	if (is_linear_pfn_mapping(vma)) {
 		/* reserve the whole chunk starting from vm_pgoff */
 		paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
 		return reserve_pfn_range(paddr, vma_size, prot, 0);
 	}
 
-	/* reserve page by page using pfn and size */
-	base_paddr = (resource_size_t)pfn << PAGE_SHIFT;
-	for (i = 0; i < size; i += PAGE_SIZE) {
-		paddr = base_paddr + i;
-		retval = reserve_pfn_range(paddr, PAGE_SIZE, prot, 0);
-		if (retval)
-			goto cleanup_ret;
-	}
 	return 0;
-
-cleanup_ret:
-	/* Reserve error: Cleanup partial reservation and return error */
-	for (j = 0; j < i; j += PAGE_SIZE) {
-		paddr = base_paddr + j;
-		free_pfn_range(paddr, PAGE_SIZE);
-	}
-
-	return retval;
 }
 
 /*
@@ -784,39 +740,23 @@ cleanup_ret:
 void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
 			unsigned long size)
 {
-	unsigned long i;
 	resource_size_t paddr;
-	unsigned long prot;
-	unsigned long vma_start = vma->vm_start;
-	unsigned long vma_end = vma->vm_end;
-	unsigned long vma_size = vma_end - vma_start;
+	unsigned long vma_size = vma->vm_end - vma->vm_start;
 
 	if (!pat_enabled)
 		return;
 
+	/*
+	 * For now, only handle remap_pfn_range() vmas where
+	 * is_linear_pfn_mapping() == TRUE. Handling of
+	 * vm_insert_pfn() is TBD.
+	 */
 	if (is_linear_pfn_mapping(vma)) {
 		/* free the whole chunk starting from vm_pgoff */
 		paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
 		free_pfn_range(paddr, vma_size);
 		return;
 	}
-
-	if (size != 0 && size != vma_size) {
-		/* free page by page, using pfn and size */
-		paddr = (resource_size_t)pfn << PAGE_SHIFT;
-		for (i = 0; i < size; i += PAGE_SIZE) {
-			paddr = paddr + i;
-			free_pfn_range(paddr, PAGE_SIZE);
-		}
-	} else {
-		/* free entire vma, page by page, using the pfn from pte */
-		for (i = 0; i < vma_size; i += PAGE_SIZE) {
-			if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
-				continue;
-
-			free_pfn_range(paddr, PAGE_SIZE);
-		}
-	}
 }
 
 pgprot_t pgprot_writecombine(pgprot_t prot)
-- 
cgit v1.2.3-59-g8ed1b


From 0917798d82212f884fff650e7e520de3b438f947 Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Date: Wed, 15 Apr 2009 16:51:48 +0200
Subject: x86: fix microcode driver newly spewing warnings

Jeff Garzik reported this WARN_ON() noise:

> Kernel: 2.6.30-rc1-00306-g8371f87
> Hardware: ICH10 x86-64
>
> This is a regression from 2.6.29.  Microcode spews the following WARNING
> multiple times during boot:
>
> ------------[ cut here ]------------
> WARNING: at fs/sysfs/group.c:138 sysfs_remove_group+0xeb/0xf0()
> Hardware name:         sysfs group ffffffffa0209700 not found for
>  kobject 'cpu0'

Keep sysfs files around for cpus even when we failed to locate
microcode for them at the moment of module loading. The appropriate
microcode firmware can become available later on.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_core.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index a0f3851ef310..4d420de9ac61 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -391,8 +391,6 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
 		return err;
 
 	err = microcode_init_cpu(cpu);
-	if (err)
-		sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
 
 	return err;
 }
-- 
cgit v1.2.3-59-g8ed1b


From ca713c2ab0eea3458962983e4a7e13430ea479b8 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 15 Apr 2009 18:39:13 -0700
Subject: x86/irq: mark NUMA_MIGRATE_IRQ_DESC broken

It causes crash on system with lots of cards with MSI-X
when irq_balancer enabled...

The patches fixing it were both complex and fragile, according
to Eric they were also doing quite dangerous things to the
hardware.

Instead we now have patches that solve this problem via static
NUMA node mappings - not dynamic allocation and balancing.

The patches are much simpler than this method but are still too
large outside of the merge window, so we mark the dynamic balancer
as broken for now, and queue up the new approach for v2.6.31.

[ Impact: deactivate broken kernel feature ]

Reported-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <49E68C41.4020801@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bc25b9f5e4cd..b1d2be7d91a0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -277,6 +277,7 @@ config SPARSE_IRQ
 config NUMA_MIGRATE_IRQ_DESC
 	bool "Move irq desc when changing irq smp_affinity"
 	depends on SPARSE_IRQ && NUMA
+	depends on BROKEN
 	default n
 	---help---
 	  This enables moving irq_desc to cpu/node that irq will use handled.
-- 
cgit v1.2.3-59-g8ed1b


From dc098551918093901d8ac8936e9d1a1b891b56ed Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Fri, 17 Apr 2009 09:22:42 -0500
Subject: x86/uv: fix init of memory-less nodes

Add support for nodes that have cpus but no memory.
The current code was failing to add these nodes
to the nodes_present_map.

v2: Fixes case caught by David Rientjes - missed support
    for the x2apic SRAT table.

[ Impact: fix potential boot crash on memory-less UV nodes. ]

Reported-by: David Rientjes <rientjes@google.com>
Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20090417142242.GA23743@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/srat_64.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index c7d272b8574c..33c5fa57e43d 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -28,6 +28,7 @@ int acpi_numa __initdata;
 static struct acpi_table_slit *acpi_slit;
 
 static nodemask_t nodes_parsed __initdata;
+static nodemask_t cpu_nodes_parsed __initdata;
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
 static struct bootnode nodes_add[MAX_NUMNODES];
 static int found_add_area __initdata;
@@ -141,6 +142,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
 
 	apic_id = pa->apic_id;
 	apicid_to_node[apic_id] = node;
+	node_set(node, cpu_nodes_parsed);
 	acpi_numa = 1;
 	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
 	       pxm, apic_id, node);
@@ -174,6 +176,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 	else
 		apic_id = pa->apic_id;
 	apicid_to_node[apic_id] = node;
+	node_set(node, cpu_nodes_parsed);
 	acpi_numa = 1;
 	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
 	       pxm, apic_id, node);
@@ -402,7 +405,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 		return -1;
 	}
 
-	node_possible_map = nodes_parsed;
+	/* Account for nodes with cpus and no memory */
+	nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
 
 	/* Finally register nodes */
 	for_each_node_mask(i, node_possible_map)
-- 
cgit v1.2.3-59-g8ed1b


From 27229ca63269c1be0a710f074fa9de4024f283f1 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Fri, 17 Apr 2009 09:24:47 -0500
Subject: x86/uv: fix init of cpu-less nodes

Fix an endcase in the UV initialization code for the "UV large system mode"
of apicids.  If node zero contains no cpus, cpus on another node will be the
boot cpu.  The percpu data that contains the extra apicid bits was not
being initialized early enough.

[ Impact: fix potential boot crash on cpu-less UV nodes ]

Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20090417142447.GA23759@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 1248318436e8..49d2f5c739b7 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -19,6 +19,7 @@
 #include <linux/timer.h>
 #include <linux/cpu.h>
 #include <linux/init.h>
+#include <linux/io.h>
 
 #include <asm/uv/uv_mmrs.h>
 #include <asm/uv/uv_hub.h>
@@ -34,6 +35,17 @@ DEFINE_PER_CPU(int, x2apic_extra_bits);
 
 static enum uv_system_type uv_system_type;
 
+static int early_get_nodeid(void)
+{
+	union uvh_node_id_u node_id;
+	unsigned long *mmr;
+
+	mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
+	node_id.v = *mmr;
+	early_iounmap(mmr, sizeof(*mmr));
+	return node_id.s.node_id;
+}
+
 static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
 	if (!strcmp(oem_id, "SGI")) {
@@ -42,6 +54,8 @@ static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 		else if (!strcmp(oem_table_id, "UVX"))
 			uv_system_type = UV_X2APIC;
 		else if (!strcmp(oem_table_id, "UVH")) {
+			__get_cpu_var(x2apic_extra_bits) =
+				early_get_nodeid() << (UV_APIC_PNODE_SHIFT - 1);
 			uv_system_type = UV_NON_UNIQUE_APIC;
 			return 1;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 1648e4f805063137b55b869666e936ffd4419cdf Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 17 Apr 2009 10:46:37 -0700
Subject: x86, kbuild: make "make install" not depend on vmlinux

It is common to use "make install" in restricted environments which
differ from the one which was actually used to build the kernel.  In
such environments it is highly undesirable to trigger a rebuild of any
part of the system.  Worse, the rebuild may be spurious, triggered by
differences in the environment.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
LKML-Reference: <20090415234642.GA28531@uranus.ravnborg.org>
---
 arch/x86/Makefile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index f05d8c91d9e5..8c86b72afdc2 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -153,7 +153,7 @@ endif
 
 boot := arch/x86/boot
 
-BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage install
+BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage
 
 PHONY += bzImage $(BOOT_TARGETS)
 
@@ -171,6 +171,10 @@ bzImage: vmlinux
 $(BOOT_TARGETS): vmlinux
 	$(Q)$(MAKE) $(build)=$(boot) $@
 
+PHONY += install
+install:
+	$(Q)$(MAKE) $(build)=$(boot) $@
+
 PHONY += vdso_install
 vdso_install:
 	$(Q)$(MAKE) $(build)=arch/x86/vdso $@
-- 
cgit v1.2.3-59-g8ed1b


From a81b6314e0aa480b8ac6dd02779d44cd0bee0a34 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Fri, 17 Apr 2009 23:31:20 +0530
Subject: x86: mm/numa_32.c calculate_numa_remap_pages should use __init

calculate_numa_remap_pages() is called only by __init initmem_init()
further calculate_numa_remap_pages is calling:
	__init find_e820_area() and __init reserve_early()

So calculate_numa_remap_pages() should be __init calculate_numa_remap_pages().

 WARNING: arch/x86/built-in.o(.text+0x82ea3): Section mismatch in reference from the function calculate_numa_remap_pages() to the function .init.text:find_e820_area()
 The function calculate_numa_remap_pages() references
 the function __init find_e820_area().
 This is often because calculate_numa_remap_pages lacks a __init
 annotation or the annotation of find_e820_area is wrong.

 WARNING: arch/x86/built-in.o(.text+0x82f5f): Section mismatch in reference from the function calculate_numa_remap_pages() to the function .init.text:reserve_early()
 The function calculate_numa_remap_pages() references
 the function __init reserve_early().
 This is often because calculate_numa_remap_pages lacks a __init
 annotation or the annotation of reserve_early is wrong.

[ Impact: save memory, address Section mismatch warning ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
LKML-Reference: <1239991281.3153.4.camel@ht.satnam>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/numa_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 3daefa04ace5..d2530062fe00 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -257,7 +257,7 @@ void resume_map_numa_kva(pgd_t *pgd_base)
 }
 #endif
 
-static unsigned long calculate_numa_remap_pages(void)
+static __init unsigned long calculate_numa_remap_pages(void)
 {
 	int nid;
 	unsigned long size, reserve_pages = 0;
-- 
cgit v1.2.3-59-g8ed1b


From 0300e7f1a525ae4e4ac05344624adf0e5f13ea52 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 17 Apr 2009 08:33:52 -0400
Subject: lockdep, x86: account for irqs enabled in paranoid_exit

I hit the check_flags error of lockdep:

 WARNING: at kernel/lockdep.c:2893 check_flags+0x1a7/0x1d0()
 [...]
 hardirqs last  enabled at (12567): [<ffffffff8026206a>] local_bh_enable+0xaa/0x110
 hardirqs last disabled at (12569): [<ffffffff80610c76>] int3+0x16/0x40
 softirqs last  enabled at (12566): [<ffffffff80514d2b>] lock_sock_nested+0xfb/0x110
 softirqs last disabled at (12568): [<ffffffff8058454e>] tcp_prequeue_process+0x2e/0xa0

The check_flags warning of lockdep tells me that lockdep thought interrupts
were disabled, but they were really enabled.

The numbers in the above parenthesis show the order of events:

 12566: softirqs last enabled:  lock_sock_nested
 12567: hardirqs last enabled:  local_bh_enable
 12568: softirqs last disabled: tcp_prequeue_process
 12566: hardirqs last disabled: int3

int3 is a breakpoint!

Examining this further, I have CONFIG_NET_TCPPROBE enabled which adds
break points into the kernel.

The paranoid_exit of the return of int3 does not account for enabling
interrupts on return to kernel. This code is a bit tricky since it
is also used by the nmi handler (when lockdep is off), and we must be
careful about the swapgs. We can not call kernel code after the swapgs
has been performed.

[ Impact: fix lockdep check_flags warning + self-turn-off ]

Acked-by: Peter Zijlsta <a.p.zijlstra@chello.nl>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_64.S | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a331ec38af9e..38946c6e8433 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1410,7 +1410,10 @@ ENTRY(paranoid_exit)
 paranoid_swapgs:
 	TRACE_IRQS_IRETQ 0
 	SWAPGS_UNSAFE_STACK
+	RESTORE_ALL 8
+	jmp irq_return
 paranoid_restore:
+	TRACE_IRQS_IRETQ 0
 	RESTORE_ALL 8
 	jmp irq_return
 paranoid_userspace:
-- 
cgit v1.2.3-59-g8ed1b


From a489f0b555b753f9df8ddc24c7e74f657ef7ee7b Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sun, 19 Apr 2009 23:14:00 -0600
Subject: lguest: fix guest crash on non-linear addresses in gdt pvops

Fixes guest crash 'lguest: bad read address 0x4800000 len 256'

The new per-cpu allocator ends up handing a non-linear address to
write_gdt_entry.  We do __pa() on it, and hand it to the host, which
kills us.

I've long wanted to make the hypercall "LOAD_GDT_ENTRY" to match the IDT
code, but had no pressing reason until now.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: lguest@ozlabs.org
---
 arch/x86/include/asm/lguest_hcall.h |  2 +-
 arch/x86/lguest/boot.c              | 16 +++++++++-------
 drivers/lguest/lg.h                 |  3 ++-
 drivers/lguest/segments.c           | 13 +++++++------
 drivers/lguest/x86/core.c           |  4 ++--
 5 files changed, 21 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index 0f4ee7148afe..faae1996487b 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -5,7 +5,6 @@
 #define LHCALL_FLUSH_ASYNC	0
 #define LHCALL_LGUEST_INIT	1
 #define LHCALL_SHUTDOWN		2
-#define LHCALL_LOAD_GDT		3
 #define LHCALL_NEW_PGTABLE	4
 #define LHCALL_FLUSH_TLB	5
 #define LHCALL_LOAD_IDT_ENTRY	6
@@ -17,6 +16,7 @@
 #define LHCALL_SET_PMD		15
 #define LHCALL_LOAD_TLS		16
 #define LHCALL_NOTIFY		17
+#define LHCALL_LOAD_GDT_ENTRY	18
 
 #define LGUEST_TRAP_ENTRY 0x1F
 
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index e94a11e42f98..a2085368a3dc 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -273,15 +273,15 @@ static void lguest_load_idt(const struct desc_ptr *desc)
  * controls the entire thing and the Guest asks it to make changes using the
  * LOAD_GDT hypercall.
  *
- * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY
- * hypercall and use that repeatedly to load a new IDT.  I don't think it
- * really matters, but wouldn't it be nice if they were the same?  Wouldn't
- * it be even better if you were the one to send the patch to fix it?
+ * This is the exactly like the IDT code.
  */
 static void lguest_load_gdt(const struct desc_ptr *desc)
 {
-	BUG_ON((desc->size + 1) / 8 != GDT_ENTRIES);
-	kvm_hypercall2(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES);
+	unsigned int i;
+	struct desc_struct *gdt = (void *)desc->address;
+
+	for (i = 0; i < (desc->size+1)/8; i++)
+		kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b);
 }
 
 /* For a single GDT entry which changes, we do the lazy thing: alter our GDT,
@@ -291,7 +291,9 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
 				   const void *desc, int type)
 {
 	native_write_gdt_entry(dt, entrynum, desc, type);
-	kvm_hypercall2(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES);
+	/* Tell Host about this new entry. */
+	kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, entrynum,
+		       dt[entrynum].a, dt[entrynum].b);
 }
 
 /* OK, I lied.  There are three "thread local storage" GDT entries which change
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index ac8a4a3741b8..af92a176697f 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -158,7 +158,8 @@ void free_interrupts(void);
 /* segments.c: */
 void setup_default_gdt_entries(struct lguest_ro_state *state);
 void setup_guest_gdt(struct lg_cpu *cpu);
-void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num);
+void load_guest_gdt_entry(struct lg_cpu *cpu, unsigned int i,
+			  u32 low, u32 hi);
 void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array);
 void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt);
 void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
index 4f15439b7f12..7ede64ffeef9 100644
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -144,18 +144,19 @@ void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt)
 			gdt[i] = cpu->arch.gdt[i];
 }
 
-/*H:620 This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT).
- * We copy it from the Guest and tweak the entries. */
-void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num)
+/*H:620 This is where the Guest asks us to load a new GDT entry
+ * (LHCALL_LOAD_GDT_ENTRY).  We tweak the entry and copy it in. */
+void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi)
 {
 	/* We assume the Guest has the same number of GDT entries as the
 	 * Host, otherwise we'd have to dynamically allocate the Guest GDT. */
 	if (num > ARRAY_SIZE(cpu->arch.gdt))
 		kill_guest(cpu, "too many gdt entries %i", num);
 
-	/* We read the whole thing in, then fix it up. */
-	__lgread(cpu, cpu->arch.gdt, table, num * sizeof(cpu->arch.gdt[0]));
-	fixup_gdt_table(cpu, 0, ARRAY_SIZE(cpu->arch.gdt));
+	/* Set it up, then fix it. */
+	cpu->arch.gdt[num].a = lo;
+	cpu->arch.gdt[num].b = hi;
+	fixup_gdt_table(cpu, num, num+1);
 	/* Mark that the GDT changed so the core knows it has to copy it again,
 	 * even if the Guest is run on the same CPU. */
 	cpu->changed |= CHANGED_GDT;
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index d6d7ac0982ab..1a83910f674f 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -568,8 +568,8 @@ void __exit lguest_arch_host_fini(void)
 int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 {
 	switch (args->arg0) {
-	case LHCALL_LOAD_GDT:
-		load_guest_gdt(cpu, args->arg1, args->arg2);
+	case LHCALL_LOAD_GDT_ENTRY:
+		load_guest_gdt_entry(cpu, args->arg1, args->arg2, args->arg3);
 		break;
 	case LHCALL_LOAD_IDT_ENTRY:
 		load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3);
-- 
cgit v1.2.3-59-g8ed1b


From 093f13e23137b9e5f7629dd5932ceea1419e2b61 Mon Sep 17 00:00:00 2001
From: "Pallipadi, Venkatesh" <venkatesh.pallipadi@intel.com>
Date: Wed, 15 Apr 2009 10:37:33 -0700
Subject: x86, acpi_cpufreq: Fix the NULL pointer dereference in
 get_measured_perf

Fix for a regression that was introduced by earlier commit
18b2646fe3babeb40b34a0c1751e0bf5adfdc64c on Mon Apr 6 11:26:08 2009

Regression resulted in the below error happened on systems with
software coordination where per_cpu acpi data will not be initiated for
secondary CPUs in a P-state domain.

On Tue, 2009-04-14 at 23:01 -0700, Zhang, Yanmin wrote:
 My machine hanged with kernel 2.6.30-rc2 when script read
> /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor.
>
> opps happens in get_measured_perf:
>
>         cur.aperf.whole = readin.aperf.whole -
>                                 per_cpu(drv_data, cpu)->saved_aperf;
>
> Because per_cpu(drv_data, cpu)=NULL.
>
> So function get_measured_perf should check if (per_cpu(drv_data,
> cpu)==NULL)
> and return 0 if it's NULL.

--------------sys log------------------

BUG: unable to handle kernel NULL pointer dereference at
0000000000000020
IP: [<ffffffff8021af75>] get_measured_perf+0x4a/0xf9
PGD a7dd88067 PUD a7ccf5067 PMD 0
Oops: 0000 [#1] SMP
last sysfs file: /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
CPU 0
Modules linked in: video output
Pid: 2091, comm: kondemand/0 Not tainted 2.6.30-rc2 #1 MP Server
RIP: 0010:[<ffffffff8021af75>]  [<ffffffff8021af75>]
get_measured_perf+0x4a/0xf9
RSP: 0018:ffff880a7d56de20  EFLAGS: 00010246
RAX: 0000000000000000 RBX: 00000046241a42b6 RCX: ffff88004d219000
RDX: 000000000000b660 RSI: 0000000000000020 RDI: 0000000000000001
RBP: ffff880a7f052000 R08: 00000046241a42b6 R09: ffffffff807639f0
R10: 00000000ffffffea R11: ffffffff802207f4 R12: ffff880a7f052000
R13: ffff88004d20e460 R14: 0000000000ddd5a6 R15: 0000000000000001
FS:  0000000000000000(0000) GS:ffff88004d200000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 0000000000000020 CR3: 0000000a7f1bf000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process kondemand/0 (pid: 2091, threadinfo ffff880a7d56c000, task
ffff880a7d4d18c0)
Stack:
 ffff880a7f052078 ffffffff803efd54 00000046241a42b6 000000462ffa9e95
 0000000000000001 0000000000000001 00000000ffffffea ffffffff8064f41a
 0000000000000012 0000000000000012 ffff880a7f052000 ffffffff80650547
Call Trace:
 [<ffffffff803efd54>] ? kobject_get+0x12/0x17
 [<ffffffff8064f41a>] ? __cpufreq_driver_getavg+0x42/0x57
 [<ffffffff80650547>] ? do_dbs_timer+0x147/0x272
 [<ffffffff80650400>] ? do_dbs_timer+0x0/0x272
 [<ffffffff802474ca>] ? worker_thread+0x15b/0x1f5
 [<ffffffff8024a02c>] ? autoremove_wake_function+0x0/0x2e
 [<ffffffff8024736f>] ? worker_thread+0x0/0x1f5
 [<ffffffff80249f0d>] ? kthread+0x54/0x83
 [<ffffffff8020c87a>] ? child_rip+0xa/0x20
 [<ffffffff80249eb9>] ? kthread+0x0/0x83
 [<ffffffff8020c870>] ? child_rip+0x0/0x20
Code: 99 a6 03 00 31 c9 85 c0 0f 85 c3 00 00 00 89 df 4c 8b 44 24 10 48
c7 c2 60 b6 00 00 48 8b 0c fd e0 30 a5 80 4c 89 c3 48 8b 04 0a <48> 2b
58 20 48 8b 44 24 18 48 89 1c 24 48 8b 34 0a 48 2b 46 28
RIP  [<ffffffff8021af75>] get_measured_perf+0x4a/0xf9
 RSP <ffff880a7d56de20>
CR2: 0000000000000020
---[ end trace 2b8fac9a49e19ad4 ]---

Tested-by: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index ecdb682ab516..dd0bd76d14c0 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -68,11 +68,16 @@ struct acpi_cpufreq_data {
 	unsigned int max_freq;
 	unsigned int resume;
 	unsigned int cpu_feature;
-	u64 saved_aperf, saved_mperf;
 };
 
 static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
 
+struct acpi_msr_data {
+	u64 saved_aperf, saved_mperf;
+};
+
+static DEFINE_PER_CPU(struct acpi_msr_data, msr_data);
+
 DEFINE_TRACE(power_mark);
 
 /* acpi_perf_data is a pointer to percpu data. */
@@ -287,11 +292,11 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
 		return 0;
 
 	cur.aperf.whole = readin.aperf.whole -
-				per_cpu(drv_data, cpu)->saved_aperf;
+				per_cpu(msr_data, cpu).saved_aperf;
 	cur.mperf.whole = readin.mperf.whole -
-				per_cpu(drv_data, cpu)->saved_mperf;
-	per_cpu(drv_data, cpu)->saved_aperf = readin.aperf.whole;
-	per_cpu(drv_data, cpu)->saved_mperf = readin.mperf.whole;
+				per_cpu(msr_data, cpu).saved_mperf;
+	per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole;
+	per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole;
 
 #ifdef __i386__
 	/*
-- 
cgit v1.2.3-59-g8ed1b


From e0e8c4e512e92bc25c19bd8d4926de17d2f8fbf2 Mon Sep 17 00:00:00 2001
From: Thomas Renninger <trenn@suse.de>
Date: Fri, 17 Apr 2009 16:22:06 +0200
Subject: acpi-cpufreq: Cleanup: Use printk_once

Signed-off-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index dd0bd76d14c0..4eab747a8966 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -693,13 +693,9 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	/* Check for high latency (>20uS) from buggy BIOSes, like on T42 */
 	if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
 	    policy->cpuinfo.transition_latency > 20 * 1000) {
-		static int print_once;
 		policy->cpuinfo.transition_latency = 20 * 1000;
-		if (!print_once) {
-			print_once = 1;
-			printk(KERN_INFO "Capping off P-state tranision latency"
-				" at 20 uS\n");
-		}
+			printk_once(KERN_INFO "Capping off P-state tranision"
+				    " latency at 20 uS\n");
 	}
 
 	data->max_freq = perf->states[0].core_frequency * 1000;
-- 
cgit v1.2.3-59-g8ed1b


From d91758f5ddb80e91176fa2cf80c88c1633950b3d Mon Sep 17 00:00:00 2001
From: Thomas Renninger <trenn@suse.de>
Date: Fri, 17 Apr 2009 16:22:07 +0200
Subject: acpi-cpufreq: style-only: add parens to math expression

Signed-off-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4eab747a8966..aec3161abede 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -340,7 +340,7 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
 
 #endif
 
-	retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100;
+	retval = (per_cpu(drv_data, policy->cpu)->max_freq * perf_percent) / 100;
 
 	return retval;
 }
-- 
cgit v1.2.3-59-g8ed1b


From d876dfbbf5c8728102fb4f683450fa9ae3259cda Mon Sep 17 00:00:00 2001
From: Thomas Renninger <trenn@suse.de>
Date: Fri, 17 Apr 2009 16:22:08 +0200
Subject: acpi-cpufreq: Do not let get_measured perf depend on internal
 variable

Take already available policy->cpuinfo.max_freq and get rid of acpi-cpufreq
specific max_freq variable.

This implies that P0 is always the highest frequency which should always
be true as ACPI spec says:
As a result, the zeroth entry describes the highest performance state

Signed-off-by: Thomas Renninger <trenn@suse.de>
Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index aec3161abede..208ecf6643df 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -65,7 +65,6 @@ enum {
 struct acpi_cpufreq_data {
 	struct acpi_processor_performance *acpi_data;
 	struct cpufreq_frequency_table *freq_table;
-	unsigned int max_freq;
 	unsigned int resume;
 	unsigned int cpu_feature;
 };
@@ -340,7 +339,7 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
 
 #endif
 
-	retval = (per_cpu(drv_data, policy->cpu)->max_freq * perf_percent) / 100;
+	retval = (policy->cpuinfo.max_freq * perf_percent) / 100;
 
 	return retval;
 }
@@ -698,7 +697,6 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 				    " latency at 20 uS\n");
 	}
 
-	data->max_freq = perf->states[0].core_frequency * 1000;
 	/* table init */
 	for (i = 0; i < perf->state_count; i++) {
 		if (i > 0 && perf->states[i].core_frequency >=
@@ -717,6 +715,9 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	if (result)
 		goto err_freqfree;
 
+	if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq)
+		printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n");
+
 	switch (perf->control_register.space_id) {
 	case ACPI_ADR_SPACE_SYSTEM_IO:
 		/* Current speed is unknown and not detectable by IO port */
-- 
cgit v1.2.3-59-g8ed1b


From fc61e6636d13eb3a23eb29b4327eeee9de0ef3bc Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Mon, 20 Apr 2009 08:25:31 -0500
Subject: x86/uv: fix for no memory at paddr 0

Fix endcase where the memory at physical address 0 does not really
exist AND one of the sockets on blade 0 has no active cpus.

The memory that _appears_ to be at physical address 0 is actually
memory that located at a different address but has been remapped by
the chipset so that it appears to be at physical address 0.

When determining the UV pnode, the algorithm for determining the pnode
incorrectly used the relocated physical address instead of the actual
(global) address.

[ Impact: boot failure on partitioned systems ]

Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20090420132530.GA23156@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index d6712334ce4b..2bda69352976 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -652,6 +652,7 @@ void __init uv_system_init(void)
 		if (uv_node_to_blade[nid] >= 0)
 			continue;
 		paddr = node_start_pfn(nid) << PAGE_SHIFT;
+		paddr = uv_soc_phys_ram_to_gpa(paddr);
 		pnode = (paddr >> m_val) & pnode_mask;
 		blade = boot_pnode_to_blade(pnode);
 		uv_node_to_blade[nid] = blade;
-- 
cgit v1.2.3-59-g8ed1b


From 06c38d5e36b12d040839ff224e805146c0368556 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Thu, 9 Apr 2009 15:24:34 -0700
Subject: x86-64: fix FPU corruption with signals and preemption

In 64bit signal delivery path, clear_used_math() was happening before saving
the current active FPU state on to the user stack for signal handling. Between
clear_used_math() and the state store on to the user stack, potentially we
can get a page fault for the user address and can block. Infact, while testing
we were hitting the might_fault() in __clear_user() which can do a schedule().

At a later point in time, we will schedule back into this process and
resume the save state (using "xsave/fxsave" instruction) which can lead
to DNA fault. And as used_math was cleared before, we will reinit the FP state
in the DNA fault and continue. This reinit will result in loosing the
FPU state of the process.

Move clear_used_math() to a point after the FPU state has been stored
onto the user stack.

This issue is present from a long time (even before the xsave changes
and the x86 merge). But it can easily be exposed in 2.6.28.x and 2.6.29.x
series because of the __clear_user() in this path, which has an explicit
__cond_resched() leading to a context switch with CONFIG_PREEMPT_VOLUNTARY.

[ Impact: fix FPU state corruption ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: <stable@kernel.org>			[2.6.28.x, 2.6.29.x]
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/xsave.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 0a5b04aa98f1..c5ee17e8c6d9 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -89,7 +89,7 @@ int save_i387_xstate(void __user *buf)
 
 	if (!used_math())
 		return 0;
-	clear_used_math(); /* trigger finit */
+
 	if (task_thread_info(tsk)->status & TS_USEDFPU) {
 		/*
 	 	 * Start with clearing the user buffer. This will present a
@@ -114,6 +114,8 @@ int save_i387_xstate(void __user *buf)
 			return -1;
 	}
 
+	clear_used_math(); /* trigger finit */
+
 	if (task_thread_info(tsk)->status & TS_XSAVE) {
 		struct _fpstate __user *fx = buf;
 		struct _xstate __user *x = buf;
-- 
cgit v1.2.3-59-g8ed1b


From 0112fc2229847feb6c4eb011e6833d8f1742a375 Mon Sep 17 00:00:00 2001
From: Oleg Drokin <green@linuxhacker.ru>
Date: Wed, 8 Apr 2009 20:05:42 +0400
Subject: Separate out common fstatat code into vfs_fstatat

This is a version incorporating Christoph's suggestion.

Separate out common *fstatat functionality into a single function
instead of duplicating it all over the code.

Signed-off-by: Oleg Drokin <green@linuxhacker.ru>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/arm/kernel/sys_oabi-compat.c | 19 ++++---------
 arch/s390/kernel/compat_linux.c   | 18 ++++---------
 arch/sparc/kernel/sys_sparc32.c   | 19 ++++---------
 arch/x86/ia32/sys_ia32.c          | 19 ++++---------
 fs/compat.c                       | 19 ++++---------
 fs/stat.c                         | 56 +++++++++++++++++++--------------------
 include/linux/fs.h                |  1 +
 7 files changed, 54 insertions(+), 97 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c
index e04173c7e621..d59a0cd537f0 100644
--- a/arch/arm/kernel/sys_oabi-compat.c
+++ b/arch/arm/kernel/sys_oabi-compat.c
@@ -177,21 +177,12 @@ asmlinkage long sys_oabi_fstatat64(int dfd,
 				   int flag)
 {
 	struct kstat stat;
-	int error = -EINVAL;
+	int error;
 
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
-		goto out;
-
-	if (flag & AT_SYMLINK_NOFOLLOW)
-		error = vfs_lstat_fd(dfd, filename, &stat);
-	else
-		error = vfs_stat_fd(dfd, filename, &stat);
-
-	if (!error)
-	error = cp_oldabi_stat64(&stat, statbuf);
-
-out:
-	return error;
+	error = vfs_fstatat(dfd, filename, &stat, flag);
+	if (error)
+		return error;
+	return cp_oldabi_stat64(&stat, statbuf);
 }
 
 struct oabi_flock64 {
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index 6cc87d8c8682..002c70d3cb75 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -702,20 +702,12 @@ asmlinkage long sys32_fstatat64(unsigned int dfd, char __user *filename,
 				struct stat64_emu31 __user* statbuf, int flag)
 {
 	struct kstat stat;
-	int error = -EINVAL;
-
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
-		goto out;
-
-	if (flag & AT_SYMLINK_NOFOLLOW)
-		error = vfs_lstat_fd(dfd, filename, &stat);
-	else
-		error = vfs_stat_fd(dfd, filename, &stat);
+	int error;
 
-	if (!error)
-		error = cp_stat64(statbuf, &stat);
-out:
-	return error;
+	error = vfs_fstatat(dfd, filename, &stat, flag);
+	if (error)
+		return error;
+	return cp_stat64(statbuf, &stat);
 }
 
 /*
diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c
index e800503879e4..f5000a460c05 100644
--- a/arch/sparc/kernel/sys_sparc32.c
+++ b/arch/sparc/kernel/sys_sparc32.c
@@ -206,21 +206,12 @@ asmlinkage long compat_sys_fstatat64(unsigned int dfd, char __user *filename,
 		struct compat_stat64 __user * statbuf, int flag)
 {
 	struct kstat stat;
-	int error = -EINVAL;
-
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
-		goto out;
-
-	if (flag & AT_SYMLINK_NOFOLLOW)
-		error = vfs_lstat_fd(dfd, filename, &stat);
-	else
-		error = vfs_stat_fd(dfd, filename, &stat);
-
-	if (!error)
-		error = cp_compat_stat64(&stat, statbuf);
+	int error;
 
-out:
-	return error;
+	error = vfs_fstatat(dfd, filename, &stat, flag);
+	if (error)
+		return error;
+	return cp_compat_stat64(&stat, statbuf);
 }
 
 asmlinkage long compat_sys_sysfs(int option, u32 arg1, u32 arg2)
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index efac92fd1efb..085a8c35f149 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -129,21 +129,12 @@ asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename,
 			      struct stat64 __user *statbuf, int flag)
 {
 	struct kstat stat;
-	int error = -EINVAL;
+	int error;
 
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
-		goto out;
-
-	if (flag & AT_SYMLINK_NOFOLLOW)
-		error = vfs_lstat_fd(dfd, filename, &stat);
-	else
-		error = vfs_stat_fd(dfd, filename, &stat);
-
-	if (!error)
-		error = cp_stat64(statbuf, &stat);
-
-out:
-	return error;
+	error = vfs_fstatat(dfd, filename, &stat, flag);
+	if (error)
+		return error;
+	return cp_stat64(statbuf, &stat);
 }
 
 /*
diff --git a/fs/compat.c b/fs/compat.c
index 3f84d5f15889..dda72e267092 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -204,21 +204,12 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename,
 		struct compat_stat __user *statbuf, int flag)
 {
 	struct kstat stat;
-	int error = -EINVAL;
-
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
-		goto out;
-
-	if (flag & AT_SYMLINK_NOFOLLOW)
-		error = vfs_lstat_fd(dfd, filename, &stat);
-	else
-		error = vfs_stat_fd(dfd, filename, &stat);
-
-	if (!error)
-		error = cp_compat_stat(&stat, statbuf);
+	int error;
 
-out:
-	return error;
+	error = vfs_fstatat(dfd, filename, &stat, flag);
+	if (error)
+		return error;
+	return cp_compat_stat(&stat, statbuf);
 }
 #endif
 
diff --git a/fs/stat.c b/fs/stat.c
index 2db740a0cfb5..54711662b855 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -109,6 +109,24 @@ int vfs_fstat(unsigned int fd, struct kstat *stat)
 
 EXPORT_SYMBOL(vfs_fstat);
 
+int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag)
+{
+	int error = -EINVAL;
+
+	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+		goto out;
+
+	if (flag & AT_SYMLINK_NOFOLLOW)
+		error = vfs_lstat_fd(dfd, filename, stat);
+	else
+		error = vfs_stat_fd(dfd, filename, stat);
+out:
+	return error;
+}
+
+EXPORT_SYMBOL(vfs_fstatat);
+
+
 #ifdef __ARCH_WANT_OLD_STAT
 
 /*
@@ -264,21 +282,12 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename,
 		struct stat __user *, statbuf, int, flag)
 {
 	struct kstat stat;
-	int error = -EINVAL;
-
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
-		goto out;
-
-	if (flag & AT_SYMLINK_NOFOLLOW)
-		error = vfs_lstat_fd(dfd, filename, &stat);
-	else
-		error = vfs_stat_fd(dfd, filename, &stat);
-
-	if (!error)
-		error = cp_new_stat(&stat, statbuf);
+	int error;
 
-out:
-	return error;
+	error = vfs_fstatat(dfd, filename, &stat, flag);
+	if (error)
+		return error;
+	return cp_new_stat(&stat, statbuf);
 }
 #endif
 
@@ -404,21 +413,12 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
 		struct stat64 __user *, statbuf, int, flag)
 {
 	struct kstat stat;
-	int error = -EINVAL;
-
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
-		goto out;
-
-	if (flag & AT_SYMLINK_NOFOLLOW)
-		error = vfs_lstat_fd(dfd, filename, &stat);
-	else
-		error = vfs_stat_fd(dfd, filename, &stat);
-
-	if (!error)
-		error = cp_new_stat64(&stat, statbuf);
+	int error;
 
-out:
-	return error;
+	error = vfs_fstatat(dfd, filename, &stat, flag);
+	if (error)
+		return error;
+	return cp_new_stat64(&stat, statbuf);
 }
 #endif /* __ARCH_WANT_STAT64 */
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e766be0d4329..257f4d37ad23 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2302,6 +2302,7 @@ extern int vfs_lstat(char __user *, struct kstat *);
 extern int vfs_stat_fd(int dfd, char __user *, struct kstat *);
 extern int vfs_lstat_fd(int dfd, char __user *, struct kstat *);
 extern int vfs_fstat(unsigned int, struct kstat *);
+extern int vfs_fstatat(int , char __user *, struct kstat *, int);
 
 extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		    unsigned long arg);
-- 
cgit v1.2.3-59-g8ed1b


From 2f537a9f8e82f55c241b002c8cfbf34303b45ada Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 21 Apr 2009 16:00:15 +0930
Subject: x86: fix boot crash in NMI watchdog with CONFIG_CPUMASK_OFFSTACK=y
 and flat APIC

fcef8576d8a64fc603e719c97d423f9f6d4e0e8b converted backtrace_mask to a
cpumask_var_t, and assumed check_nmi_watchdog was called before
nmi_watchdog_tick was ever called.  Steven's oops shows I was wrong.

This is something of a bandaid: I'm not sure we *should* be calling
nmi_watchdog_tick before check_nmi_watchdog.  Note that gcc eliminates
this test for the CONFIG_CPUMASK_OFFSTACK=n case.

[ Impact: fix boot crash in rare configs ]

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <alpine.DEB.2.00.0904202113520.10097@gandalf.stny.rr.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/nmi.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index d6bd62407152..2ba52f35a88c 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -414,7 +414,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 		touched = 1;
 	}
 
-	if (cpumask_test_cpu(cpu, backtrace_mask)) {
+	/* We can be called before check_nmi_watchdog, hence NULL check. */
+	if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) {
 		static DEFINE_SPINLOCK(lock);	/* Serialise the printks */
 
 		spin_lock(&lock);
-- 
cgit v1.2.3-59-g8ed1b


From fcc5c4a2feea3886dc058498b28508b2731720d5 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 21 Apr 2009 16:03:41 +0930
Subject: x86: avoid theoretical spurious NMI backtraces with
 CONFIG_CPUMASK_OFFSTACK=y

In theory (though not shown in practice) alloc_cpumask_var() doesn't zero
memory, so CPUs might print an "NMI backtrace for cpu %d" once on boot.

(Bug introduced in fcef8576d8a64fc603e719c97d423f9f6d4e0e8b).

[ Impact: avoid theoretical syslog noise in rare configs ]

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <alpine.DEB.2.00.0904202113520.10097@gandalf.stny.rr.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/nmi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 2ba52f35a88c..ce4fbfa315a1 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -138,7 +138,7 @@ int __init check_nmi_watchdog(void)
 	if (!prev_nmi_count)
 		goto error;
 
-	alloc_cpumask_var(&backtrace_mask, GFP_KERNEL);
+	alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO);
 	printk(KERN_INFO "Testing NMI watchdog ... ");
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3-59-g8ed1b


From 8e19608e8b5c001e4a66ce482edc474f05fb7355 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 21 Apr 2009 12:24:00 -0700
Subject: clocksource: pass clocksource to read() callback

Pass clocksource pointer to the read() callback for clocksources.  This
allows us to share the callback between multiple instances.

[hugh@veritas.com: fix powerpc build of clocksource pass clocksource mods]
[akpm@linux-foundation.org: cleanup]
Signed-off-by: Magnus Damm <damm@igel.co.jp>
Acked-by: John Stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/mach-at91/at91rm9200_time.c         |  2 +-
 arch/arm/mach-at91/at91sam926x_time.c        |  2 +-
 arch/arm/mach-davinci/time.c                 |  2 +-
 arch/arm/mach-imx/time.c                     |  2 +-
 arch/arm/mach-ixp4xx/common.c                |  2 +-
 arch/arm/mach-msm/timer.c                    |  4 ++--
 arch/arm/mach-netx/time.c                    |  2 +-
 arch/arm/mach-ns9xxx/time-ns9360.c           |  2 +-
 arch/arm/mach-omap1/time.c                   |  2 +-
 arch/arm/mach-omap2/timer-gp.c               |  2 +-
 arch/arm/mach-pxa/time.c                     |  2 +-
 arch/arm/mach-realview/core.c                |  2 +-
 arch/arm/mach-versatile/core.c               |  2 +-
 arch/arm/plat-mxc/time.c                     |  2 +-
 arch/arm/plat-omap/common.c                  |  4 ++--
 arch/arm/plat-orion/time.c                   |  2 +-
 arch/avr32/kernel/time.c                     |  2 +-
 arch/blackfin/kernel/time-ts.c               | 12 ++++++------
 arch/ia64/kernel/cyclone.c                   |  2 +-
 arch/ia64/kernel/time.c                      |  4 ++--
 arch/ia64/sn/kernel/sn2/timer.c              |  2 +-
 arch/m68knommu/platform/68328/timers.c       |  2 +-
 arch/m68knommu/platform/coldfire/dma_timer.c |  2 +-
 arch/m68knommu/platform/coldfire/pit.c       |  2 +-
 arch/m68knommu/platform/coldfire/timers.c    |  2 +-
 arch/mips/kernel/cevt-txx9.c                 |  2 +-
 arch/mips/kernel/csrc-bcm1480.c              |  2 +-
 arch/mips/kernel/csrc-ioasic.c               |  6 +++---
 arch/mips/kernel/csrc-r4k.c                  |  2 +-
 arch/mips/kernel/csrc-sb1250.c               |  2 +-
 arch/mips/kernel/i8253.c                     |  2 +-
 arch/mips/nxp/pnx8550/common/time.c          |  2 +-
 arch/mips/sgi-ip27/ip27-timer.c              |  2 +-
 arch/powerpc/kernel/time.c                   |  8 ++++----
 arch/s390/kernel/time.c                      |  2 +-
 arch/sh/kernel/time_32.c                     |  2 +-
 arch/sh/kernel/timers/timer-tmu.c            |  2 +-
 arch/sparc/kernel/time_64.c                  |  7 ++++++-
 arch/um/kernel/time.c                        |  2 +-
 arch/x86/kernel/hpet.c                       |  6 +++---
 arch/x86/kernel/i8253.c                      |  2 +-
 arch/x86/kernel/kvmclock.c                   |  7 ++++++-
 arch/x86/kernel/tsc.c                        |  2 +-
 arch/x86/kernel/vmiclock_32.c                |  2 +-
 arch/x86/lguest/boot.c                       |  2 +-
 arch/x86/xen/time.c                          |  7 ++++++-
 drivers/char/hpet.c                          |  2 +-
 drivers/clocksource/acpi_pm.c                | 12 ++++++------
 drivers/clocksource/cyclone.c                |  2 +-
 drivers/clocksource/scx200_hrt.c             |  2 +-
 drivers/clocksource/tcb_clksrc.c             |  2 +-
 include/linux/clocksource.h                  |  6 +++---
 kernel/time/clocksource.c                    |  8 ++++----
 kernel/time/jiffies.c                        |  2 +-
 54 files changed, 94 insertions(+), 79 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm/mach-at91/at91rm9200_time.c b/arch/arm/mach-at91/at91rm9200_time.c
index 1ff1bda0a894..309f3511aa20 100644
--- a/arch/arm/mach-at91/at91rm9200_time.c
+++ b/arch/arm/mach-at91/at91rm9200_time.c
@@ -85,7 +85,7 @@ static struct irqaction at91rm9200_timer_irq = {
 	.handler	= at91rm9200_timer_interrupt
 };
 
-static cycle_t read_clk32k(void)
+static cycle_t read_clk32k(struct clocksource *cs)
 {
 	return read_CRTR();
 }
diff --git a/arch/arm/mach-at91/at91sam926x_time.c b/arch/arm/mach-at91/at91sam926x_time.c
index b63e1d5f1bad..4bd56aee4370 100644
--- a/arch/arm/mach-at91/at91sam926x_time.c
+++ b/arch/arm/mach-at91/at91sam926x_time.c
@@ -31,7 +31,7 @@ static u32 pit_cnt;		/* access only w/system irq blocked */
  * Clocksource:  just a monotonic counter of MCK/16 cycles.
  * We don't care whether or not PIT irqs are enabled.
  */
-static cycle_t read_pit_clk(void)
+static cycle_t read_pit_clk(struct clocksource *cs)
 {
 	unsigned long flags;
 	u32 elapsed;
diff --git a/arch/arm/mach-davinci/time.c b/arch/arm/mach-davinci/time.c
index f8bcd29d17a6..6c227d4ba998 100644
--- a/arch/arm/mach-davinci/time.c
+++ b/arch/arm/mach-davinci/time.c
@@ -238,7 +238,7 @@ static void __init timer_init(void)
 /*
  * clocksource
  */
-static cycle_t read_cycles(void)
+static cycle_t read_cycles(struct clocksource *cs)
 {
 	struct timer_s *t = &timers[TID_CLOCKSOURCE];
 
diff --git a/arch/arm/mach-imx/time.c b/arch/arm/mach-imx/time.c
index aff0ebcfa847..5aef18b599e5 100644
--- a/arch/arm/mach-imx/time.c
+++ b/arch/arm/mach-imx/time.c
@@ -73,7 +73,7 @@ static void __init imx_timer_hardware_init(void)
 	IMX_TCTL(TIMER_BASE) = TCTL_FRR | TCTL_CLK_PCLK1 | TCTL_TEN;
 }
 
-cycle_t imx_get_cycles(void)
+cycle_t imx_get_cycles(struct clocksource *cs)
 {
 	return IMX_TCN(TIMER_BASE);
 }
diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c
index f4656d2ac8a8..1e93dfee7543 100644
--- a/arch/arm/mach-ixp4xx/common.c
+++ b/arch/arm/mach-ixp4xx/common.c
@@ -401,7 +401,7 @@ void __init ixp4xx_sys_init(void)
 /*
  * clocksource
  */
-cycle_t ixp4xx_get_cycles(void)
+cycle_t ixp4xx_get_cycles(struct clocksource *cs)
 {
 	return *IXP4XX_OSTS;
 }
diff --git a/arch/arm/mach-msm/timer.c b/arch/arm/mach-msm/timer.c
index 444d9c0f5ca6..4855b8ca5101 100644
--- a/arch/arm/mach-msm/timer.c
+++ b/arch/arm/mach-msm/timer.c
@@ -57,12 +57,12 @@ static irqreturn_t msm_timer_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static cycle_t msm_gpt_read(void)
+static cycle_t msm_gpt_read(struct clocksource *cs)
 {
 	return readl(MSM_GPT_BASE + TIMER_COUNT_VAL);
 }
 
-static cycle_t msm_dgt_read(void)
+static cycle_t msm_dgt_read(struct clocksource *cs)
 {
 	return readl(MSM_DGT_BASE + TIMER_COUNT_VAL) >> MSM_DGT_SHIFT;
 }
diff --git a/arch/arm/mach-netx/time.c b/arch/arm/mach-netx/time.c
index f201fddb594f..82801dbf0579 100644
--- a/arch/arm/mach-netx/time.c
+++ b/arch/arm/mach-netx/time.c
@@ -104,7 +104,7 @@ static struct irqaction netx_timer_irq = {
 	.handler	= netx_timer_interrupt,
 };
 
-cycle_t netx_get_cycles(void)
+cycle_t netx_get_cycles(struct clocksource *cs)
 {
 	return readl(NETX_GPIO_COUNTER_CURRENT(TIMER_CLOCKSOURCE));
 }
diff --git a/arch/arm/mach-ns9xxx/time-ns9360.c b/arch/arm/mach-ns9xxx/time-ns9360.c
index 41df69721769..77281260358a 100644
--- a/arch/arm/mach-ns9xxx/time-ns9360.c
+++ b/arch/arm/mach-ns9xxx/time-ns9360.c
@@ -25,7 +25,7 @@
 #define TIMER_CLOCKEVENT 1
 static u32 latch;
 
-static cycle_t ns9360_clocksource_read(void)
+static cycle_t ns9360_clocksource_read(struct clocksource *cs)
 {
 	return __raw_readl(SYS_TR(TIMER_CLOCKSOURCE));
 }
diff --git a/arch/arm/mach-omap1/time.c b/arch/arm/mach-omap1/time.c
index 495a32c287b4..4d56408d3cff 100644
--- a/arch/arm/mach-omap1/time.c
+++ b/arch/arm/mach-omap1/time.c
@@ -198,7 +198,7 @@ static struct irqaction omap_mpu_timer2_irq = {
 	.handler	= omap_mpu_timer2_interrupt,
 };
 
-static cycle_t mpu_read(void)
+static cycle_t mpu_read(struct clocksource *cs)
 {
 	return ~omap_mpu_timer_read(1);
 }
diff --git a/arch/arm/mach-omap2/timer-gp.c b/arch/arm/mach-omap2/timer-gp.c
index 9fc13a2cc3f4..1cb2c0909c2b 100644
--- a/arch/arm/mach-omap2/timer-gp.c
+++ b/arch/arm/mach-omap2/timer-gp.c
@@ -138,7 +138,7 @@ static inline void __init omap2_gp_clocksource_init(void) {}
  * clocksource
  */
 static struct omap_dm_timer *gpt_clocksource;
-static cycle_t clocksource_read_cycles(void)
+static cycle_t clocksource_read_cycles(struct clocksource *cs)
 {
 	return (cycle_t)omap_dm_timer_read_counter(gpt_clocksource);
 }
diff --git a/arch/arm/mach-pxa/time.c b/arch/arm/mach-pxa/time.c
index 8eb3830fbb0b..750c448db672 100644
--- a/arch/arm/mach-pxa/time.c
+++ b/arch/arm/mach-pxa/time.c
@@ -125,7 +125,7 @@ static struct clock_event_device ckevt_pxa_osmr0 = {
 	.set_mode	= pxa_osmr0_set_mode,
 };
 
-static cycle_t pxa_read_oscr(void)
+static cycle_t pxa_read_oscr(struct clocksource *cs)
 {
 	return OSCR;
 }
diff --git a/arch/arm/mach-realview/core.c b/arch/arm/mach-realview/core.c
index 9ab947c14f26..942e1a7eb9b2 100644
--- a/arch/arm/mach-realview/core.c
+++ b/arch/arm/mach-realview/core.c
@@ -715,7 +715,7 @@ static struct irqaction realview_timer_irq = {
 	.handler	= realview_timer_interrupt,
 };
 
-static cycle_t realview_get_cycles(void)
+static cycle_t realview_get_cycles(struct clocksource *cs)
 {
 	return ~readl(timer3_va_base + TIMER_VALUE);
 }
diff --git a/arch/arm/mach-versatile/core.c b/arch/arm/mach-versatile/core.c
index 565776680d8c..1f929c391af7 100644
--- a/arch/arm/mach-versatile/core.c
+++ b/arch/arm/mach-versatile/core.c
@@ -948,7 +948,7 @@ static struct irqaction versatile_timer_irq = {
 	.handler	= versatile_timer_interrupt,
 };
 
-static cycle_t versatile_get_cycles(void)
+static cycle_t versatile_get_cycles(struct clocksource *cs)
 {
 	return ~readl(TIMER3_VA_BASE + TIMER_VALUE);
 }
diff --git a/arch/arm/plat-mxc/time.c b/arch/arm/plat-mxc/time.c
index ef1b3cd85bd3..dab3357196fb 100644
--- a/arch/arm/plat-mxc/time.c
+++ b/arch/arm/plat-mxc/time.c
@@ -36,7 +36,7 @@ static enum clock_event_mode clockevent_mode = CLOCK_EVT_MODE_UNUSED;
 
 /* clock source */
 
-static cycle_t mxc_get_cycles(void)
+static cycle_t mxc_get_cycles(struct clocksource *cs)
 {
 	return __raw_readl(TIMER_BASE + MXC_TCN);
 }
diff --git a/arch/arm/plat-omap/common.c b/arch/arm/plat-omap/common.c
index d1797147732f..433021f3d7cc 100644
--- a/arch/arm/plat-omap/common.c
+++ b/arch/arm/plat-omap/common.c
@@ -185,7 +185,7 @@ console_initcall(omap_add_serial_console);
 
 #include <linux/clocksource.h>
 
-static cycle_t omap_32k_read(void)
+static cycle_t omap_32k_read(struct clocksource *cs)
 {
 	return omap_readl(TIMER_32K_SYNCHRONIZED);
 }
@@ -207,7 +207,7 @@ unsigned long long sched_clock(void)
 {
 	unsigned long long ret;
 
-	ret = (unsigned long long)omap_32k_read();
+	ret = (unsigned long long)omap_32k_read(&clocksource_32k);
 	ret = (ret * clocksource_32k.mult_orig) >> clocksource_32k.shift;
 	return ret;
 }
diff --git a/arch/arm/plat-orion/time.c b/arch/arm/plat-orion/time.c
index 6fa2923e6dca..2faf9dba4ef7 100644
--- a/arch/arm/plat-orion/time.c
+++ b/arch/arm/plat-orion/time.c
@@ -41,7 +41,7 @@ static u32 ticks_per_jiffy;
 /*
  * Clocksource handling.
  */
-static cycle_t orion_clksrc_read(void)
+static cycle_t orion_clksrc_read(struct clocksource *cs)
 {
 	return 0xffffffff - readl(TIMER0_VAL);
 }
diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c
index 0ff46bf873b0..f27aa3b259fa 100644
--- a/arch/avr32/kernel/time.c
+++ b/arch/avr32/kernel/time.c
@@ -18,7 +18,7 @@
 #include <mach/pm.h>
 
 
-static cycle_t read_cycle_count(void)
+static cycle_t read_cycle_count(struct clocksource *cs)
 {
 	return (cycle_t)sysreg_read(COUNT);
 }
diff --git a/arch/blackfin/kernel/time-ts.c b/arch/blackfin/kernel/time-ts.c
index 0ed2badfd746..27646121280a 100644
--- a/arch/blackfin/kernel/time-ts.c
+++ b/arch/blackfin/kernel/time-ts.c
@@ -58,16 +58,11 @@ static inline unsigned long long cycles_2_ns(cycle_t cyc)
 	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
 }
 
-static cycle_t read_cycles(void)
+static cycle_t read_cycles(struct clocksource *cs)
 {
 	return __bfin_cycles_off + (get_cycles() << __bfin_cycles_mod);
 }
 
-unsigned long long sched_clock(void)
-{
-	return cycles_2_ns(read_cycles());
-}
-
 static struct clocksource clocksource_bfin = {
 	.name		= "bfin_cycles",
 	.rating		= 350,
@@ -77,6 +72,11 @@ static struct clocksource clocksource_bfin = {
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
+unsigned long long sched_clock(void)
+{
+	return cycles_2_ns(read_cycles(&clocksource_bfin));
+}
+
 static int __init bfin_clocksource_init(void)
 {
 	set_cyc2ns_scale(get_cclk() / 1000);
diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c
index 790ef0d87e12..71e35864d2e2 100644
--- a/arch/ia64/kernel/cyclone.c
+++ b/arch/ia64/kernel/cyclone.c
@@ -21,7 +21,7 @@ void __init cyclone_setup(void)
 
 static void __iomem *cyclone_mc;
 
-static cycle_t read_cyclone(void)
+static cycle_t read_cyclone(struct clocksource *cs)
 {
 	return (cycle_t)readq((void __iomem *)cyclone_mc);
 }
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 641c8b61c4f1..604c1a35db33 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -33,7 +33,7 @@
 
 #include "fsyscall_gtod_data.h"
 
-static cycle_t itc_get_cycles(void);
+static cycle_t itc_get_cycles(struct clocksource *cs);
 
 struct fsyscall_gtod_data_t fsyscall_gtod_data = {
 	.lock = SEQLOCK_UNLOCKED,
@@ -383,7 +383,7 @@ ia64_init_itm (void)
 	}
 }
 
-static cycle_t itc_get_cycles(void)
+static cycle_t itc_get_cycles(struct clocksource *cs)
 {
 	u64 lcycle, now, ret;
 
diff --git a/arch/ia64/sn/kernel/sn2/timer.c b/arch/ia64/sn/kernel/sn2/timer.c
index cf67fc562054..21d6f09e3447 100644
--- a/arch/ia64/sn/kernel/sn2/timer.c
+++ b/arch/ia64/sn/kernel/sn2/timer.c
@@ -23,7 +23,7 @@
 
 extern unsigned long sn_rtc_cycles_per_second;
 
-static cycle_t read_sn2(void)
+static cycle_t read_sn2(struct clocksource *cs)
 {
 	return (cycle_t)readq(RTC_COUNTER_ADDR);
 }
diff --git a/arch/m68knommu/platform/68328/timers.c b/arch/m68knommu/platform/68328/timers.c
index 6bafefa546e5..309f725995bf 100644
--- a/arch/m68knommu/platform/68328/timers.c
+++ b/arch/m68knommu/platform/68328/timers.c
@@ -75,7 +75,7 @@ static struct irqaction m68328_timer_irq = {
 
 /***************************************************************************/
 
-static cycle_t m68328_read_clk(void)
+static cycle_t m68328_read_clk(struct clocksource *cs)
 {
 	unsigned long flags;
 	u32 cycles;
diff --git a/arch/m68knommu/platform/coldfire/dma_timer.c b/arch/m68knommu/platform/coldfire/dma_timer.c
index 772578b1084f..a5f562823d7a 100644
--- a/arch/m68knommu/platform/coldfire/dma_timer.c
+++ b/arch/m68knommu/platform/coldfire/dma_timer.c
@@ -34,7 +34,7 @@
 #define DMA_DTMR_CLK_DIV_16	(2 << 1)
 #define DMA_DTMR_ENABLE		(1 << 0)
 
-static cycle_t cf_dt_get_cycles(void)
+static cycle_t cf_dt_get_cycles(struct clocksource *cs)
 {
 	return __raw_readl(DTCN0);
 }
diff --git a/arch/m68knommu/platform/coldfire/pit.c b/arch/m68knommu/platform/coldfire/pit.c
index 2a12e7fa9748..61b96211f8ff 100644
--- a/arch/m68knommu/platform/coldfire/pit.c
+++ b/arch/m68knommu/platform/coldfire/pit.c
@@ -125,7 +125,7 @@ static struct irqaction pit_irq = {
 
 /***************************************************************************/
 
-static cycle_t pit_read_clk(void)
+static cycle_t pit_read_clk(struct clocksource *cs)
 {
 	unsigned long flags;
 	u32 cycles;
diff --git a/arch/m68knommu/platform/coldfire/timers.c b/arch/m68knommu/platform/coldfire/timers.c
index 454f25493491..1ba8a3731653 100644
--- a/arch/m68knommu/platform/coldfire/timers.c
+++ b/arch/m68knommu/platform/coldfire/timers.c
@@ -78,7 +78,7 @@ static struct irqaction mcftmr_timer_irq = {
 
 /***************************************************************************/
 
-static cycle_t mcftmr_read_clk(void)
+static cycle_t mcftmr_read_clk(struct clocksource *cs)
 {
 	unsigned long flags;
 	u32 cycles;
diff --git a/arch/mips/kernel/cevt-txx9.c b/arch/mips/kernel/cevt-txx9.c
index eccf7d6096bd..2e911e3da8d3 100644
--- a/arch/mips/kernel/cevt-txx9.c
+++ b/arch/mips/kernel/cevt-txx9.c
@@ -22,7 +22,7 @@
 
 static struct txx9_tmr_reg __iomem *txx9_cs_tmrptr;
 
-static cycle_t txx9_cs_read(void)
+static cycle_t txx9_cs_read(struct clocksource *cs)
 {
 	return __raw_readl(&txx9_cs_tmrptr->trr);
 }
diff --git a/arch/mips/kernel/csrc-bcm1480.c b/arch/mips/kernel/csrc-bcm1480.c
index 868745e7184b..51489f8a825e 100644
--- a/arch/mips/kernel/csrc-bcm1480.c
+++ b/arch/mips/kernel/csrc-bcm1480.c
@@ -28,7 +28,7 @@
 
 #include <asm/sibyte/sb1250.h>
 
-static cycle_t bcm1480_hpt_read(void)
+static cycle_t bcm1480_hpt_read(struct clocksource *cs)
 {
 	return (cycle_t) __raw_readq(IOADDR(A_SCD_ZBBUS_CYCLE_COUNT));
 }
diff --git a/arch/mips/kernel/csrc-ioasic.c b/arch/mips/kernel/csrc-ioasic.c
index 1d5f63cf8997..b551f48d3a07 100644
--- a/arch/mips/kernel/csrc-ioasic.c
+++ b/arch/mips/kernel/csrc-ioasic.c
@@ -25,7 +25,7 @@
 #include <asm/dec/ioasic.h>
 #include <asm/dec/ioasic_addrs.h>
 
-static cycle_t dec_ioasic_hpt_read(void)
+static cycle_t dec_ioasic_hpt_read(struct clocksource *cs)
 {
 	return ioasic_read(IO_REG_FCTR);
 }
@@ -47,13 +47,13 @@ void __init dec_ioasic_clocksource_init(void)
 	while (!ds1287_timer_state())
 		;
 
-	start = dec_ioasic_hpt_read();
+	start = dec_ioasic_hpt_read(&clocksource_dec);
 
 	while (i--)
 		while (!ds1287_timer_state())
 			;
 
-	end = dec_ioasic_hpt_read();
+	end = dec_ioasic_hpt_read(&clocksource_dec);
 
 	freq = (end - start) * 10;
 	printk(KERN_INFO "I/O ASIC clock frequency %dHz\n", freq);
diff --git a/arch/mips/kernel/csrc-r4k.c b/arch/mips/kernel/csrc-r4k.c
index f1a2893931ed..e95a3cd48eea 100644
--- a/arch/mips/kernel/csrc-r4k.c
+++ b/arch/mips/kernel/csrc-r4k.c
@@ -10,7 +10,7 @@
 
 #include <asm/time.h>
 
-static cycle_t c0_hpt_read(void)
+static cycle_t c0_hpt_read(struct clocksource *cs)
 {
 	return read_c0_count();
 }
diff --git a/arch/mips/kernel/csrc-sb1250.c b/arch/mips/kernel/csrc-sb1250.c
index 92212bbb8e45..d14d3d1907fa 100644
--- a/arch/mips/kernel/csrc-sb1250.c
+++ b/arch/mips/kernel/csrc-sb1250.c
@@ -33,7 +33,7 @@
  * The HPT is free running from SB1250_HPT_VALUE down to 0 then starts over
  * again.
  */
-static cycle_t sb1250_hpt_read(void)
+static cycle_t sb1250_hpt_read(struct clocksource *cs)
 {
 	unsigned int count;
 
diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c
index 689719e34f08..ed20e7fe65e3 100644
--- a/arch/mips/kernel/i8253.c
+++ b/arch/mips/kernel/i8253.c
@@ -128,7 +128,7 @@ void __init setup_pit_timer(void)
  * to just read by itself. So use jiffies to emulate a free
  * running counter:
  */
-static cycle_t pit_read(void)
+static cycle_t pit_read(struct clocksource *cs)
 {
 	unsigned long flags;
 	int count;
diff --git a/arch/mips/nxp/pnx8550/common/time.c b/arch/mips/nxp/pnx8550/common/time.c
index cf293b279098..8df43e9e4d90 100644
--- a/arch/mips/nxp/pnx8550/common/time.c
+++ b/arch/mips/nxp/pnx8550/common/time.c
@@ -35,7 +35,7 @@
 
 static unsigned long cpj;
 
-static cycle_t hpt_read(void)
+static cycle_t hpt_read(struct clocksource *cs)
 {
 	return read_c0_count2();
 }
diff --git a/arch/mips/sgi-ip27/ip27-timer.c b/arch/mips/sgi-ip27/ip27-timer.c
index f024057a35f8..f10a7cd64f7e 100644
--- a/arch/mips/sgi-ip27/ip27-timer.c
+++ b/arch/mips/sgi-ip27/ip27-timer.c
@@ -159,7 +159,7 @@ static void __init hub_rt_clock_event_global_init(void)
 	setup_irq(irq, &hub_rt_irqaction);
 }
 
-static cycle_t hub_rt_read(void)
+static cycle_t hub_rt_read(struct clocksource *cs)
 {
 	return REMOTE_HUB_L(cputonasid(0), PI_RT_COUNT);
 }
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 926ea864e34f..48571ac56fb7 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -77,7 +77,7 @@
 #include <linux/clockchips.h>
 #include <linux/clocksource.h>
 
-static cycle_t rtc_read(void);
+static cycle_t rtc_read(struct clocksource *);
 static struct clocksource clocksource_rtc = {
 	.name         = "rtc",
 	.rating       = 400,
@@ -88,7 +88,7 @@ static struct clocksource clocksource_rtc = {
 	.read         = rtc_read,
 };
 
-static cycle_t timebase_read(void);
+static cycle_t timebase_read(struct clocksource *);
 static struct clocksource clocksource_timebase = {
 	.name         = "timebase",
 	.rating       = 400,
@@ -766,12 +766,12 @@ unsigned long read_persistent_clock(void)
 }
 
 /* clocksource code */
-static cycle_t rtc_read(void)
+static cycle_t rtc_read(struct clocksource *cs)
 {
 	return (cycle_t)get_rtc();
 }
 
-static cycle_t timebase_read(void)
+static cycle_t timebase_read(struct clocksource *cs)
 {
 	return (cycle_t)get_tb();
 }
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 6ded50dfa75a..ef596d020573 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -201,7 +201,7 @@ unsigned long read_persistent_clock(void)
 	return ts.tv_sec;
 }
 
-static cycle_t read_tod_clock(void)
+static cycle_t read_tod_clock(struct clocksource *cs)
 {
 	return get_clock();
 }
diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c
index c34e1e0f9b02..1700d2465f6c 100644
--- a/arch/sh/kernel/time_32.c
+++ b/arch/sh/kernel/time_32.c
@@ -208,7 +208,7 @@ unsigned long long sched_clock(void)
 	if (!clocksource_sh.rating)
 		return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
 
-	cycles = clocksource_sh.read();
+	cycles = clocksource_sh.read(&clocksource_sh);
 	return cyc2ns(&clocksource_sh, cycles);
 }
 #endif
diff --git a/arch/sh/kernel/timers/timer-tmu.c b/arch/sh/kernel/timers/timer-tmu.c
index c5d3396f5960..fe8d8930ccb6 100644
--- a/arch/sh/kernel/timers/timer-tmu.c
+++ b/arch/sh/kernel/timers/timer-tmu.c
@@ -81,7 +81,7 @@ static int tmu_timer_stop(void)
  */
 static int tmus_are_scaled;
 
-static cycle_t tmu_timer_read(void)
+static cycle_t tmu_timer_read(struct clocksource *cs)
 {
 	return ((cycle_t)(~_tmu_read(TMU1)))<<tmus_are_scaled;
 }
diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c
index db310aa00183..5c12e79b4bdf 100644
--- a/arch/sparc/kernel/time_64.c
+++ b/arch/sparc/kernel/time_64.c
@@ -814,6 +814,11 @@ void udelay(unsigned long usecs)
 }
 EXPORT_SYMBOL(udelay);
 
+static cycle_t clocksource_tick_read(struct clocksource *cs)
+{
+	return tick_ops->get_tick();
+}
+
 void __init time_init(void)
 {
 	unsigned long freq = sparc64_init_timers();
@@ -827,7 +832,7 @@ void __init time_init(void)
 	clocksource_tick.mult =
 		clocksource_hz2mult(freq,
 				    clocksource_tick.shift);
-	clocksource_tick.read = tick_ops->get_tick;
+	clocksource_tick.read = clocksource_tick_read;
 
 	printk("clocksource: mult[%x] shift[%d]\n",
 	       clocksource_tick.mult, clocksource_tick.shift);
diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c
index b13a87a3ec95..c8b9c469fcd7 100644
--- a/arch/um/kernel/time.c
+++ b/arch/um/kernel/time.c
@@ -65,7 +65,7 @@ static irqreturn_t um_timer(int irq, void *dev)
 	return IRQ_HANDLED;
 }
 
-static cycle_t itimer_read(void)
+static cycle_t itimer_read(struct clocksource *cs)
 {
 	return os_nsecs() / 1000;
 }
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 648b3a2a3a44..3f0019e0a229 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -722,7 +722,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
 /*
  * Clock source related code
  */
-static cycle_t read_hpet(void)
+static cycle_t read_hpet(struct clocksource *cs)
 {
 	return (cycle_t)hpet_readl(HPET_COUNTER);
 }
@@ -756,7 +756,7 @@ static int hpet_clocksource_register(void)
 	hpet_restart_counter();
 
 	/* Verify whether hpet counter works */
-	t1 = read_hpet();
+	t1 = hpet_readl(HPET_COUNTER);
 	rdtscll(start);
 
 	/*
@@ -770,7 +770,7 @@ static int hpet_clocksource_register(void)
 		rdtscll(now);
 	} while ((now - start) < 200000UL);
 
-	if (t1 == read_hpet()) {
+	if (t1 == hpet_readl(HPET_COUNTER)) {
 		printk(KERN_WARNING
 		       "HPET counter not counting. HPET disabled\n");
 		return -ENODEV;
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 3475440baa54..c2e0bb0890d4 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -129,7 +129,7 @@ void __init setup_pit_timer(void)
  * to just read by itself. So use jiffies to emulate a free
  * running counter:
  */
-static cycle_t pit_read(void)
+static cycle_t pit_read(struct clocksource *cs)
 {
 	static int old_count;
 	static u32 old_jifs;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 137f2e8132df..223af43f1526 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -77,6 +77,11 @@ static cycle_t kvm_clock_read(void)
 	return ret;
 }
 
+static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
+{
+	return kvm_clock_read();
+}
+
 /*
  * If we don't do that, there is the possibility that the guest
  * will calibrate under heavy load - thus, getting a lower lpj -
@@ -107,7 +112,7 @@ static void kvm_get_preset_lpj(void)
 
 static struct clocksource kvm_clock = {
 	.name = "kvm-clock",
-	.read = kvm_clock_read,
+	.read = kvm_clock_get_cycles,
 	.rating = 400,
 	.mask = CLOCKSOURCE_MASK(64),
 	.mult = 1 << KVM_SCALE,
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7a567ebe6361..d57de05dc430 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -699,7 +699,7 @@ static struct clocksource clocksource_tsc;
  * code, which is necessary to support wrapping clocksources like pm
  * timer.
  */
-static cycle_t read_tsc(void)
+static cycle_t read_tsc(struct clocksource *cs)
 {
 	cycle_t ret = (cycle_t)get_cycles();
 
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index d303369a7bad..2b3eb82efeeb 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -283,7 +283,7 @@ void __devinit vmi_time_ap_init(void)
 /** vmi clocksource */
 static struct clocksource clocksource_vmi;
 
-static cycle_t read_real_cycles(void)
+static cycle_t read_real_cycles(struct clocksource *cs)
 {
 	cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
 	return max(ret, clocksource_vmi.cycle_last);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index a2085368a3dc..ca7ec44bafc3 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -663,7 +663,7 @@ static unsigned long lguest_tsc_khz(void)
 
 /* If we can't use the TSC, the kernel falls back to our lower-priority
  * "lguest_clock", where we read the time value given to us by the Host. */
-static cycle_t lguest_clock_read(void)
+static cycle_t lguest_clock_read(struct clocksource *cs)
 {
 	unsigned long sec, nsec;
 
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 14f240623497..0a5aa44299a5 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -213,6 +213,11 @@ cycle_t xen_clocksource_read(void)
 	return ret;
 }
 
+static cycle_t xen_clocksource_get_cycles(struct clocksource *cs)
+{
+	return xen_clocksource_read();
+}
+
 static void xen_read_wallclock(struct timespec *ts)
 {
 	struct shared_info *s = HYPERVISOR_shared_info;
@@ -241,7 +246,7 @@ int xen_set_wallclock(unsigned long now)
 static struct clocksource xen_clocksource __read_mostly = {
 	.name = "xen",
 	.rating = 400,
-	.read = xen_clocksource_read,
+	.read = xen_clocksource_get_cycles,
 	.mask = ~0,
 	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */
 	.shift = XEN_SHIFT,
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index 50dfa3bc71ce..340ba4f9dc54 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -72,7 +72,7 @@ static u32 hpet_nhpet, hpet_max_freq = HPET_USER_FREQ;
 #ifdef CONFIG_IA64
 static void __iomem *hpet_mctr;
 
-static cycle_t read_hpet(void)
+static cycle_t read_hpet(struct clocksource *cs)
 {
 	return (cycle_t)read_counter((void __iomem *)hpet_mctr);
 }
diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c
index ee19b6e8fcb4..40bd8c61c7d7 100644
--- a/drivers/clocksource/acpi_pm.c
+++ b/drivers/clocksource/acpi_pm.c
@@ -57,7 +57,7 @@ u32 acpi_pm_read_verified(void)
 	return v2;
 }
 
-static cycle_t acpi_pm_read(void)
+static cycle_t acpi_pm_read(struct clocksource *cs)
 {
 	return (cycle_t)read_pmtmr();
 }
@@ -83,7 +83,7 @@ static int __init acpi_pm_good_setup(char *__str)
 }
 __setup("acpi_pm_good", acpi_pm_good_setup);
 
-static cycle_t acpi_pm_read_slow(void)
+static cycle_t acpi_pm_read_slow(struct clocksource *cs)
 {
 	return (cycle_t)acpi_pm_read_verified();
 }
@@ -156,9 +156,9 @@ static int verify_pmtmr_rate(void)
 	unsigned long count, delta;
 
 	mach_prepare_counter();
-	value1 = clocksource_acpi_pm.read();
+	value1 = clocksource_acpi_pm.read(&clocksource_acpi_pm);
 	mach_countup(&count);
-	value2 = clocksource_acpi_pm.read();
+	value2 = clocksource_acpi_pm.read(&clocksource_acpi_pm);
 	delta = (value2 - value1) & ACPI_PM_MASK;
 
 	/* Check that the PMTMR delta is within 5% of what we expect */
@@ -195,9 +195,9 @@ static int __init init_acpi_pm_clocksource(void)
 	/* "verify" this timing source: */
 	for (j = 0; j < ACPI_PM_MONOTONICITY_CHECKS; j++) {
 		udelay(100 * j);
-		value1 = clocksource_acpi_pm.read();
+		value1 = clocksource_acpi_pm.read(&clocksource_acpi_pm);
 		for (i = 0; i < ACPI_PM_READ_CHECKS; i++) {
-			value2 = clocksource_acpi_pm.read();
+			value2 = clocksource_acpi_pm.read(&clocksource_acpi_pm);
 			if (value2 == value1)
 				continue;
 			if (value2 > value1)
diff --git a/drivers/clocksource/cyclone.c b/drivers/clocksource/cyclone.c
index 8615059a8729..64e528e8bfa6 100644
--- a/drivers/clocksource/cyclone.c
+++ b/drivers/clocksource/cyclone.c
@@ -19,7 +19,7 @@
 int use_cyclone = 0;
 static void __iomem *cyclone_ptr;
 
-static cycle_t read_cyclone(void)
+static cycle_t read_cyclone(struct clocksource *cs)
 {
 	return (cycle_t)readl(cyclone_ptr);
 }
diff --git a/drivers/clocksource/scx200_hrt.c b/drivers/clocksource/scx200_hrt.c
index b92da677aa5d..27f4d9637b62 100644
--- a/drivers/clocksource/scx200_hrt.c
+++ b/drivers/clocksource/scx200_hrt.c
@@ -43,7 +43,7 @@ MODULE_PARM_DESC(ppm, "+-adjust to actual XO freq (ppm)");
 /* The base timer frequency, * 27 if selected */
 #define HRT_FREQ   1000000
 
-static cycle_t read_hrt(void)
+static cycle_t read_hrt(struct clocksource *cs)
 {
 	/* Read the timer value */
 	return (cycle_t) inl(scx200_cb_base + SCx200_TIMER_OFFSET);
diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
index 254f1064d973..01b886e68822 100644
--- a/drivers/clocksource/tcb_clksrc.c
+++ b/drivers/clocksource/tcb_clksrc.c
@@ -39,7 +39,7 @@
 
 static void __iomem *tcaddr;
 
-static cycle_t tc_get_cycles(void)
+static cycle_t tc_get_cycles(struct clocksource *cs)
 {
 	unsigned long	flags;
 	u32		lower, upper;
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 573819ef4cc0..0d96cde9ee5d 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -143,7 +143,7 @@ extern u64 timecounter_cyc2time(struct timecounter *tc,
  *			400-499: Perfect
  *				The ideal clocksource. A must-use where
  *				available.
- * @read:		returns a cycle value
+ * @read:		returns a cycle value, passes clocksource as argument
  * @mask:		bitmask for two's complement
  *			subtraction of non 64 bit counters
  * @mult:		cycle to nanosecond multiplier (adjusted by NTP)
@@ -162,7 +162,7 @@ struct clocksource {
 	char *name;
 	struct list_head list;
 	int rating;
-	cycle_t (*read)(void);
+	cycle_t (*read)(struct clocksource *cs);
 	cycle_t mask;
 	u32 mult;
 	u32 mult_orig;
@@ -271,7 +271,7 @@ static inline u32 clocksource_hz2mult(u32 hz, u32 shift_constant)
  */
 static inline cycle_t clocksource_read(struct clocksource *cs)
 {
-	return cs->read();
+	return cs->read(cs);
 }
 
 /**
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c46c931a7fe7..ecfd7b5187e0 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -181,12 +181,12 @@ static void clocksource_watchdog(unsigned long data)
 
 	resumed = test_and_clear_bit(0, &watchdog_resumed);
 
-	wdnow = watchdog->read();
+	wdnow = watchdog->read(watchdog);
 	wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
 	watchdog_last = wdnow;
 
 	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
-		csnow = cs->read();
+		csnow = cs->read(cs);
 
 		if (unlikely(resumed)) {
 			cs->wd_last = csnow;
@@ -247,7 +247,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 
 		list_add(&cs->wd_list, &watchdog_list);
 		if (!started && watchdog) {
-			watchdog_last = watchdog->read();
+			watchdog_last = watchdog->read(watchdog);
 			watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
 			add_timer_on(&watchdog_timer,
 				     cpumask_first(cpu_online_mask));
@@ -268,7 +268,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 				cse->flags &= ~CLOCK_SOURCE_WATCHDOG;
 			/* Start if list is not empty */
 			if (!list_empty(&watchdog_list)) {
-				watchdog_last = watchdog->read();
+				watchdog_last = watchdog->read(watchdog);
 				watchdog_timer.expires =
 					jiffies + WATCHDOG_INTERVAL;
 				add_timer_on(&watchdog_timer,
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 06f197560f3b..c3f6c30816e3 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -50,7 +50,7 @@
  */
 #define JIFFIES_SHIFT	8
 
-static cycle_t jiffies_read(void)
+static cycle_t jiffies_read(struct clocksource *cs)
 {
 	return (cycle_t) jiffies;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 2a3313f494c2f3f74a27d66f0f14b38558b7dba2 Mon Sep 17 00:00:00 2001
From: "Michael K. Johnson" <johnsonm@rpath.com>
Date: Tue, 21 Apr 2009 21:44:48 -0400
Subject: x86: more than 8 32-bit CPUs requires X86_BIGSMP

$ cat x86-more-than-8-cpus-requires-bigsmp.patch

Enforce NR_CPUS <= 8 limitation if X86_BIGSMP not set

Configuring more than 8 logical CPUs on 32-bit x86 requires
X86_BIGSMP to be set in order to boot successfully, if more than 8
logical CPUs are actually found at boot time.  The X86_BIGSMP help
text describes that it is required to be set if more than 8 CPUs
are configured, but this was previously not enforced.

This configuration error has affected multiple distributions:
    https://bugzilla.redhat.com/show_bug.cgi?id=480844
    https://issues.rpath.com/browse/RPL-3022

Signed-off-by: Michael K Johnson <johnsonm@rpath.com>
LKML-Reference: <20090422014448.GB32541@logo.rdu.rpath.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c9086e6307a5..b5cda6c03d1d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -664,6 +664,7 @@ config MAXSMP
 
 config NR_CPUS
 	int "Maximum number of CPUs" if SMP && !MAXSMP
+	range 2 8 if SMP && X86_32 && !X86_BIGSMP
 	range 2 512 if SMP && !MAXSMP
 	default "1" if !SMP
 	default "4096" if MAXSMP
-- 
cgit v1.2.3-59-g8ed1b


From 9b8de7479d0dbab1ed98b5b015d44232c9d3d08e Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 21 Apr 2009 23:00:24 +0100
Subject: FRV: Fix the section attribute on UP DECLARE_PER_CPU()

In non-SMP mode, the variable section attribute specified by DECLARE_PER_CPU()
does not agree with that specified by DEFINE_PER_CPU().  This means that
architectures that have a small data section references relative to a base
register may throw up linkage errors due to too great a displacement between
where the base register points and the per-CPU variable.

On FRV, the .h declaration says that the variable is in the .sdata section, but
the .c definition says it's actually in the .data section.  The linker throws
up the following errors:

kernel/built-in.o: In function `release_task':
kernel/exit.c:78: relocation truncated to fit: R_FRV_GPREL12 against symbol `per_cpu__process_counts' defined in .data section in kernel/built-in.o
kernel/exit.c:78: relocation truncated to fit: R_FRV_GPREL12 against symbol `per_cpu__process_counts' defined in .data section in kernel/built-in.o

To fix this, DECLARE_PER_CPU() should simply apply the same section attribute
as does DEFINE_PER_CPU().  However, this is made slightly more complex by
virtue of the fact that there are several variants on DEFINE, so these need to
be matched by variants on DECLARE.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/percpu.h  |  2 +-
 arch/ia64/include/asm/smp.h      |  2 +-
 arch/x86/include/asm/desc.h      |  2 +-
 arch/x86/include/asm/hardirq.h   |  2 +-
 arch/x86/include/asm/processor.h |  6 +++---
 arch/x86/include/asm/tlbflush.h  |  2 +-
 include/asm-generic/percpu.h     | 43 ++++++++++++++++++++++++++++++++++++++--
 include/linux/percpu.h           | 24 ----------------------
 net/rds/rds.h                    |  2 +-
 9 files changed, 50 insertions(+), 35 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/include/asm/percpu.h b/arch/alpha/include/asm/percpu.h
index 3495e8e00d70..e9e0bb5a23bf 100644
--- a/arch/alpha/include/asm/percpu.h
+++ b/arch/alpha/include/asm/percpu.h
@@ -73,6 +73,6 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
 
 #endif /* SMP */
 
-#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu_var(name)
+#include <asm-generic/percpu.h>
 
 #endif /* __ALPHA_PERCPU_H */
diff --git a/arch/ia64/include/asm/smp.h b/arch/ia64/include/asm/smp.h
index 598408336251..d217d1d4e051 100644
--- a/arch/ia64/include/asm/smp.h
+++ b/arch/ia64/include/asm/smp.h
@@ -58,7 +58,7 @@ extern struct smp_boot_data {
 extern char no_int_routing __devinitdata;
 
 extern cpumask_t cpu_core_map[NR_CPUS];
-DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
+DECLARE_PER_CPU_SHARED_ALIGNED(cpumask_t, cpu_sibling_map);
 extern int smp_num_siblings;
 extern void __iomem *ipi_base_addr;
 extern unsigned char smp_int_redirect;
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 5623c50d67b2..c45f415ce315 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -37,7 +37,7 @@ extern gate_desc idt_table[];
 struct gdt_page {
 	struct desc_struct gdt[GDT_ENTRIES];
 } __attribute__((aligned(PAGE_SIZE)));
-DECLARE_PER_CPU(struct gdt_page, gdt_page);
+DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
 
 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
 {
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 039db6aa8e02..37555e52f980 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -26,7 +26,7 @@ typedef struct {
 #endif
 } ____cacheline_aligned irq_cpustat_t;
 
-DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
+DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
 
 /* We can have at most NR_VECTORS irqs routed to a cpu at a time */
 #define MAX_HARDIRQS_PER_CPU NR_VECTORS
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index fcf4d92e7e04..c2cceae709c8 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -138,7 +138,7 @@ extern struct tss_struct	doublefault_tss;
 extern __u32			cleared_cpu_caps[NCAPINTS];
 
 #ifdef CONFIG_SMP
-DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
 #define cpu_data(cpu)		per_cpu(cpu_info, cpu)
 #define current_cpu_data	__get_cpu_var(cpu_info)
 #else
@@ -270,7 +270,7 @@ struct tss_struct {
 
 } ____cacheline_aligned;
 
-DECLARE_PER_CPU(struct tss_struct, init_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
 
 /*
  * Save the original ist values for checking stack pointers during debugging
@@ -393,7 +393,7 @@ union irq_stack_union {
 	};
 };
 
-DECLARE_PER_CPU(union irq_stack_union, irq_stack_union);
+DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union);
 DECLARE_INIT_PER_CPU(irq_stack_union);
 
 DECLARE_PER_CPU(char *, irq_stack_ptr);
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index d3539f998f88..16a5c84b0329 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -152,7 +152,7 @@ struct tlb_state {
 	struct mm_struct *active_mm;
 	int state;
 };
-DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
 static inline void reset_lazy_tlbstate(void)
 {
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index b0e63c672ebd..af47b9e10064 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -73,11 +73,50 @@ extern void setup_per_cpu_areas(void);
 
 #endif	/* SMP */
 
+#ifndef PER_CPU_BASE_SECTION
+#ifdef CONFIG_SMP
+#define PER_CPU_BASE_SECTION ".data.percpu"
+#else
+#define PER_CPU_BASE_SECTION ".data"
+#endif
+#endif
+
+#ifdef CONFIG_SMP
+
+#ifdef MODULE
+#define PER_CPU_SHARED_ALIGNED_SECTION ""
+#else
+#define PER_CPU_SHARED_ALIGNED_SECTION ".shared_aligned"
+#endif
+#define PER_CPU_FIRST_SECTION ".first"
+
+#else
+
+#define PER_CPU_SHARED_ALIGNED_SECTION ""
+#define PER_CPU_FIRST_SECTION ""
+
+#endif
+
 #ifndef PER_CPU_ATTRIBUTES
 #define PER_CPU_ATTRIBUTES
 #endif
 
-#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
-					__typeof__(type) per_cpu_var(name)
+#define DECLARE_PER_CPU_SECTION(type, name, section)			\
+	extern \
+	__attribute__((__section__(PER_CPU_BASE_SECTION section)))	\
+	PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
+
+#define DECLARE_PER_CPU(type, name)					\
+	DECLARE_PER_CPU_SECTION(type, name, "")
+
+#define DECLARE_PER_CPU_SHARED_ALIGNED(type, name)			\
+	DECLARE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
+	____cacheline_aligned_in_smp
+
+#define DECLARE_PER_CPU_PAGE_ALIGNED(type, name)				\
+	DECLARE_PER_CPU_SECTION(type, name, ".page_aligned")
+
+#define DECLARE_PER_CPU_FIRST(type, name)				\
+	DECLARE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION)
 
 #endif /* _ASM_GENERIC_PERCPU_H_ */
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index cfda2d5ad319..f052d8184993 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -9,30 +9,6 @@
 
 #include <asm/percpu.h>
 
-#ifndef PER_CPU_BASE_SECTION
-#ifdef CONFIG_SMP
-#define PER_CPU_BASE_SECTION ".data.percpu"
-#else
-#define PER_CPU_BASE_SECTION ".data"
-#endif
-#endif
-
-#ifdef CONFIG_SMP
-
-#ifdef MODULE
-#define PER_CPU_SHARED_ALIGNED_SECTION ""
-#else
-#define PER_CPU_SHARED_ALIGNED_SECTION ".shared_aligned"
-#endif
-#define PER_CPU_FIRST_SECTION ".first"
-
-#else
-
-#define PER_CPU_SHARED_ALIGNED_SECTION ""
-#define PER_CPU_FIRST_SECTION ""
-
-#endif
-
 #define DEFINE_PER_CPU_SECTION(type, name, section)			\
 	__attribute__((__section__(PER_CPU_BASE_SECTION section)))	\
 	PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 619f0a30a4e5..71794449ca4e 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -638,7 +638,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *,
 void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
 
 /* stats.c */
-DECLARE_PER_CPU(struct rds_statistics, rds_stats);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
 #define rds_stats_inc_which(which, member) do {		\
 	per_cpu(which, get_cpu()).member++;		\
 	put_cpu();					\
-- 
cgit v1.2.3-59-g8ed1b


From bf47a760f66add7870fba33ab50f58b550d6bbd1 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Sun, 5 Apr 2009 14:54:46 -0300
Subject: KVM: MMU: disable global page optimization

Complexity to fix it not worthwhile the gains, as discussed
in http://article.gmane.org/gmane.comp.emulators.kvm.devel/28649.

Cc: stable@kernel.org
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2a36f7f7c4c7..b6caf1329b1b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1248,7 +1248,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
 	sp->gfn = gfn;
 	sp->role = role;
-	sp->global = role.cr4_pge;
+	sp->global = 0;
 	hlist_add_head(&sp->hash_link, bucket);
 	if (!direct) {
 		if (rmap_write_protect(vcpu->kvm, gfn))
-- 
cgit v1.2.3-59-g8ed1b


From 7f1ea208968f021943d4103ba59e06bb6d8239cb Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 25 Feb 2009 16:08:31 +0100
Subject: KVM: x86: release time_page on vcpu destruction

Not releasing the time_page causes a leak of that page or the compound
page it is situated in.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8ca100a9ecac..a1ecec5c03e9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4159,6 +4159,11 @@ EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
 
 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 {
+	if (vcpu->arch.time_page) {
+		kvm_release_page_dirty(vcpu->arch.time_page);
+		vcpu->arch.time_page = NULL;
+	}
+
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 888d256e9c565cb61505bd218eb37c81fe77a325 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 17 Apr 2009 19:24:58 +0200
Subject: KVM: Unregister cpufreq notifier on unload

Properly unregister cpufreq notifier on onload if it was registered
during init.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a1ecec5c03e9..7c1ce5ac6131 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2775,6 +2775,9 @@ out:
 
 void kvm_arch_exit(void)
 {
+	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+		cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
+					    CPUFREQ_TRANSITION_NOTIFIER);
 	kvm_x86_ops = NULL;
 	kvm_mmu_module_exit();
 }
-- 
cgit v1.2.3-59-g8ed1b


From 7a6f9cbb37120c745fc187083fb5c3de4dca4f97 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Tue, 21 Apr 2009 20:00:37 +0200
Subject: x86: hpet: fix periodic mode programming on AMD 81xx

(See http://bugzilla.kernel.org/show_bug.cgi?id=12961)

It partially reverts commit c23e253e67c9d8a91a0ffa33c1f571a17f0a2403
(x86: hpet: stop HPET_COUNTER when programming periodic mode)

HPET on AMD 81xx chipset needs a second write (with HPET_TN_SETVAL
cleared) to T0_CMP register to set the period in periodic mode.

With this patch HPET_COUNTER is still stopped but not reset when HPET
is programmed in periodic mode. This should help to avoid races when
HPET is programmed in periodic mode and fixes a boot time hang that
I've observed on a machine when using 1000HZ.

[ Impact: fix boot time hang on machines with AMD 81xx chipset ]

Reported-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Tested-by: Jeff Mahoney <jeffm@suse.com>
LKML-Reference: <20090421180037.GA2763@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/hpet.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 3f0019e0a229..81408b93f887 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -236,6 +236,10 @@ static void hpet_stop_counter(void)
 	unsigned long cfg = hpet_readl(HPET_CFG);
 	cfg &= ~HPET_CFG_ENABLE;
 	hpet_writel(cfg, HPET_CFG);
+}
+
+static void hpet_reset_counter(void)
+{
 	hpet_writel(0, HPET_COUNTER);
 	hpet_writel(0, HPET_COUNTER + 4);
 }
@@ -250,6 +254,7 @@ static void hpet_start_counter(void)
 static void hpet_restart_counter(void)
 {
 	hpet_stop_counter();
+	hpet_reset_counter();
 	hpet_start_counter();
 }
 
@@ -309,7 +314,7 @@ static int hpet_setup_msi_irq(unsigned int irq);
 static void hpet_set_mode(enum clock_event_mode mode,
 			  struct clock_event_device *evt, int timer)
 {
-	unsigned long cfg;
+	unsigned long cfg, cmp, now;
 	uint64_t delta;
 
 	switch (mode) {
@@ -317,12 +322,23 @@ static void hpet_set_mode(enum clock_event_mode mode,
 		hpet_stop_counter();
 		delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
 		delta >>= evt->shift;
+		now = hpet_readl(HPET_COUNTER);
+		cmp = now + (unsigned long) delta;
 		cfg = hpet_readl(HPET_Tn_CFG(timer));
 		/* Make sure we use edge triggered interrupts */
 		cfg &= ~HPET_TN_LEVEL;
 		cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
 		       HPET_TN_SETVAL | HPET_TN_32BIT;
 		hpet_writel(cfg, HPET_Tn_CFG(timer));
+		hpet_writel(cmp, HPET_Tn_CMP(timer));
+		udelay(1);
+		/*
+		 * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
+		 * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
+		 * bit is automatically cleared after the first write.
+		 * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
+		 * Publication # 24674)
+		 */
 		hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer));
 		hpet_start_counter();
 		hpet_print_config();
-- 
cgit v1.2.3-59-g8ed1b


From c5428e950ad42640f00092949fd17e356dfdeafa Mon Sep 17 00:00:00 2001
From: Coly Li <coly.li@suse.de>
Date: Wed, 22 Apr 2009 23:21:56 +0800
Subject: uv_time: add parameter to uv_read_rtc()

uv_read_rtc() is referenced by read member of struct clocksource clocksource_uv.
In include/linux/clocksource.h, read of struct clocksource is declared as:
cycle_t (*read)(struct clocksource *cs)

This got introduced recently in:

   8e19608: clocksource: pass clocksource to read() callback

But arch/x86/kernel/uv_time.c was not properly converted by that pach.

This patch adds a dummy parameter (struct clocksource type) to uv_read_rtc() to
fix the incompatible reference in clocksource_uv, and add a NULL parameter in
all places where uv_read_rtc() gets called.

[ Impact: cleanup, address compiler warning ]

Signed-off-by: Coly Li <coly.li@suse.de>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Magnus Damm <damm@igel.co.jp>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hugh@veritas.com>
LKML-Reference: <49EF3614.1050806@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Dimitri Sivanich <sivanich@sgi.com>
---
 arch/x86/kernel/uv_time.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 2ffb6c53326e..583f11d5c480 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -29,7 +29,7 @@
 
 #define RTC_NAME		"sgi_rtc"
 
-static cycle_t uv_read_rtc(void);
+static cycle_t uv_read_rtc(struct clocksource *cs);
 static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
 static void uv_rtc_timer_setup(enum clock_event_mode,
 				struct clock_event_device *);
@@ -123,7 +123,7 @@ static int uv_setup_intr(int cpu, u64 expires)
 	/* Initialize comparator value */
 	uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
 
-	return (expires < uv_read_rtc() && !uv_intr_pending(pnode));
+	return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode));
 }
 
 /*
@@ -256,7 +256,7 @@ static int uv_rtc_unset_timer(int cpu)
 
 	spin_lock_irqsave(&head->lock, flags);
 
-	if (head->next_cpu == bcpu && uv_read_rtc() >= *t)
+	if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t)
 		rc = 1;
 
 	*t = ULLONG_MAX;
@@ -278,7 +278,7 @@ static int uv_rtc_unset_timer(int cpu)
 /*
  * Read the RTC.
  */
-static cycle_t uv_read_rtc(void)
+static cycle_t uv_read_rtc(struct clocksource *cs)
 {
 	return (cycle_t)uv_read_local_mmr(UVH_RTC);
 }
@@ -291,7 +291,7 @@ static int uv_rtc_next_event(unsigned long delta,
 {
 	int ced_cpu = cpumask_first(ced->cpumask);
 
-	return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc());
+	return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc(NULL));
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 6298c512bc1007c3ff5c9ce20e6996781651cc45 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 9 Apr 2009 12:28:22 +0200
Subject: x86, mce: make polling timer interval per CPU

The polling timer while running per CPU still uses a global next_interval
variable, which lead to some CPUs either polling too fast or too slow.
This was not a serious problem because all errors get picked up eventually,
but it's still better to avoid it. Turn next_interval into a per cpu variable.

v2: Fix check_interval == 0 case (Hidetoshi Seto)

[ Impact: minor bug fix ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 863f89568b1a..82614f1b923a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -452,13 +452,14 @@ void mce_log_therm_throt_event(__u64 status)
  */
 
 static int check_interval = 5 * 60; /* 5 minutes */
-static int next_interval; /* in jiffies */
+static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
 static void mcheck_timer(unsigned long);
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
 static void mcheck_timer(unsigned long data)
 {
 	struct timer_list *t = &per_cpu(mce_timer, data);
+	int *n;
 
 	WARN_ON(smp_processor_id() != data);
 
@@ -470,14 +471,14 @@ static void mcheck_timer(unsigned long data)
 	 * Alert userspace if needed.  If we logged an MCE, reduce the
 	 * polling interval, otherwise increase the polling interval.
 	 */
+	n = &__get_cpu_var(next_interval);
 	if (mce_notify_user()) {
-		next_interval = max(next_interval/2, HZ/100);
+		*n = max(*n/2, HZ/100);
 	} else {
-		next_interval = min(next_interval * 2,
-				(int)round_jiffies_relative(check_interval*HZ));
+		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
 	}
 
-	t->expires = jiffies + next_interval;
+	t->expires = jiffies + *n;
 	add_timer(t);
 }
 
@@ -632,14 +633,13 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
 static void mce_init_timer(void)
 {
 	struct timer_list *t = &__get_cpu_var(mce_timer);
+	int *n = &__get_cpu_var(next_interval);
 
-	/* data race harmless because everyone sets to the same value */
-	if (!next_interval)
-		next_interval = check_interval * HZ;
-	if (!next_interval)
+	*n = check_interval * HZ;
+	if (!*n)
 		return;
 	setup_timer(t, mcheck_timer, smp_processor_id());
-	t->expires = round_jiffies(jiffies + next_interval);
+	t->expires = round_jiffies(jiffies + *n);
 	add_timer(t);
 }
 
@@ -907,7 +907,6 @@ static void mce_cpu_restart(void *data)
 /* Reinit MCEs after user configuration changes */
 static void mce_restart(void)
 {
-	next_interval = check_interval * HZ;
 	on_each_cpu(mce_cpu_restart, NULL, 1);
 }
 
@@ -1110,7 +1109,8 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 		break;
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
-		t->expires = round_jiffies(jiffies + next_interval);
+		t->expires = round_jiffies(jiffies +
+						__get_cpu_var(next_interval));
 		add_timer_on(t, cpu);
 		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 5679af4c1625a1534a4321e1ecc3c48a1cf65eb8 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Tue, 7 Apr 2009 17:06:55 +0200
Subject: x86, mce: fix boot logging logic

The earlier patch to change the poller to a separate function subtly
broke the boot logging logic. This could lead to machine checks
getting logged at boot even when disabled or defaulting to off
on some systems. Fix that.

[ Impact: bug fix - avoid spurious MCE in log ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/mce.h          | 1 +
 arch/x86/kernel/cpu/mcheck/mce_64.c | 9 +++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 563933e06a35..4f8c199584e7 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -137,6 +137,7 @@ DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
 enum mcp_flags {
 	MCP_TIMESTAMP = (1 << 0),	/* log time stamp */
 	MCP_UC = (1 << 1),		/* log uncorrected errors */
+	MCP_DONTLOG = (1 << 2),		/* only clear, don't log */
 };
 extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 82614f1b923a..6fb0b359d2a5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -239,9 +239,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		 * Don't get the IP here because it's unlikely to
 		 * have anything to do with the actual error location.
 		 */
-
-		mce_log(&m);
-		add_taint(TAINT_MACHINE_CHECK);
+		if (!(flags & MCP_DONTLOG)) {
+			mce_log(&m);
+			add_taint(TAINT_MACHINE_CHECK);
+		}
 
 		/*
 		 * Clear state for this bank.
@@ -585,7 +586,7 @@ static void mce_init(void *dummy)
 	 * Log the machine checks left over from the previous reset.
 	 */
 	bitmap_fill(all_banks, MAX_NR_BANKS);
-	machine_check_poll(MCP_UC, &all_banks);
+	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
 
 	set_in_cr4(X86_CR4_MCE);
 
-- 
cgit v1.2.3-59-g8ed1b


From 044cd80942e47b9de0915b627902adf05c52377f Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 18 Apr 2009 01:43:46 -0700
Subject: x86/PCI: don't call e820_all_mapped with -1 in the mmconfig case

e820_all_mapped need end is (addr + size) instead of (addr + size - 1)

Cc: stable@kernel.org
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/mmconfig-shared.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 905bb526b133..5fa10bb9604f 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -375,7 +375,7 @@ static acpi_status __init check_mcfg_resource(struct acpi_resource *res,
 		if (!fixmem32)
 			return AE_OK;
 		if ((mcfg_res->start >= fixmem32->address) &&
-		    (mcfg_res->end < (fixmem32->address +
+		    (mcfg_res->end <= (fixmem32->address +
 				      fixmem32->address_length))) {
 			mcfg_res->flags = 1;
 			return AE_CTRL_TERMINATE;
@@ -392,7 +392,7 @@ static acpi_status __init check_mcfg_resource(struct acpi_resource *res,
 		return AE_OK;
 
 	if ((mcfg_res->start >= address.minimum) &&
-	    (mcfg_res->end < (address.minimum + address.address_length))) {
+	    (mcfg_res->end <= (address.minimum + address.address_length))) {
 		mcfg_res->flags = 1;
 		return AE_CTRL_TERMINATE;
 	}
@@ -439,7 +439,7 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved,
 	u64 old_size = size;
 	int valid = 0;
 
-	while (!is_reserved(addr, addr + size - 1, E820_RESERVED)) {
+	while (!is_reserved(addr, addr + size, E820_RESERVED)) {
 		size >>= 1;
 		if (size < (16UL<<20))
 			break;
-- 
cgit v1.2.3-59-g8ed1b


From 0bb1be3e30bfc3e09fa0ff1e887ac7da4a16c3a2 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Thu, 16 Apr 2009 13:31:10 -0600
Subject: x86/PCI: Move set_pci_bus_resources_arch_default into arch/x86

Commit 30a18d6c3f1e774de656ebd8ff219d53e2ba4029 introduced a new
function to set the PCI bus resources.  Unfortunately, neither the
author, nor the committers seemed to know that we already have somewhere
to do that -- pcibios_fixup_bus().  This patch moves the hook (used only
by the K8 code) into x86-specific code where it should have been in the
first place.

Cc: Yinghai Lu <yinghai.lu@sun.com>
Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/common.c | 7 ++++++-
 drivers/pci/probe.c   | 6 ------
 2 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 8c362b96b644..f80ece51305d 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -142,15 +142,20 @@ static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev)
 	}
 }
 
+void __attribute__((weak)) set_pci_bus_resources_arch_default(struct pci_bus *b)
+{
+}
+
 /*
  *  Called after each bus is probed, but before its children
  *  are examined.
  */
 
-void __devinit  pcibios_fixup_bus(struct pci_bus *b)
+void __devinit pcibios_fixup_bus(struct pci_bus *b)
 {
 	struct pci_dev *dev;
 
+	set_pci_bus_resources_arch_default(b);
 	pci_read_bridge_bases(b);
 	list_for_each_entry(dev, &b->devices, bus_list)
 		pcibios_fixup_device_resources(dev);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 8eb50dffb78a..e3c3e081b834 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1118,10 +1118,6 @@ unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus)
 	return max;
 }
 
-void __attribute__((weak)) set_pci_bus_resources_arch_default(struct pci_bus *b)
-{
-}
-
 struct pci_bus * pci_create_bus(struct device *parent,
 		int bus, struct pci_ops *ops, void *sysdata)
 {
@@ -1180,8 +1176,6 @@ struct pci_bus * pci_create_bus(struct device *parent,
 	b->resource[0] = &ioport_resource;
 	b->resource[1] = &iomem_resource;
 
-	set_pci_bus_resources_arch_default(b);
-
 	return b;
 
 dev_create_file_err:
-- 
cgit v1.2.3-59-g8ed1b


From 0e94ecd098347874e776f7818728613a335880d1 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 18 Apr 2009 10:11:25 -0700
Subject: x86/PCI: set_pci_bus_resources_arch_default cleanups

Rename set_pci_bus_resources_arch_default to x86_pci_root_bus_res_quirks, move
the weak version from common.c to i386.c, and before calling, make sure it's a
root bus.

Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/include/asm/topology.h | 2 +-
 arch/x86/pci/amd_bus.c          | 2 +-
 arch/x86/pci/common.c           | 8 +++-----
 arch/x86/pci/i386.c             | 4 ++++
 4 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 892b119dba6f..f44b49abca49 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -200,7 +200,7 @@ static inline void arch_fix_phys_package_id(int num, u32 slot)
 }
 
 struct pci_bus;
-void set_pci_bus_resources_arch_default(struct pci_bus *b);
+void x86_pci_root_bus_res_quirks(struct pci_bus *b);
 
 #ifdef CONFIG_SMP
 #define mc_capable()	(cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids)
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 9bb09823b362..e121ee050f7c 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -94,7 +94,7 @@ struct pci_root_info {
 static int pci_root_num;
 static struct pci_root_info pci_root_info[PCI_ROOT_NR];
 
-void set_pci_bus_resources_arch_default(struct pci_bus *b)
+void x86_pci_root_bus_res_quirks(struct pci_bus *b)
 {
 	int i;
 	int j;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index f80ece51305d..2202b6257b82 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -142,10 +142,6 @@ static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev)
 	}
 }
 
-void __attribute__((weak)) set_pci_bus_resources_arch_default(struct pci_bus *b)
-{
-}
-
 /*
  *  Called after each bus is probed, but before its children
  *  are examined.
@@ -155,7 +151,9 @@ void __devinit pcibios_fixup_bus(struct pci_bus *b)
 {
 	struct pci_dev *dev;
 
-	set_pci_bus_resources_arch_default(b);
+	/* root bus? */
+	if (!b->parent)
+		x86_pci_root_bus_res_quirks(b);
 	pci_read_bridge_bases(b);
 	list_for_each_entry(dev, &b->devices, bus_list)
 		pcibios_fixup_device_resources(dev);
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index f1817f71e009..a85bef20a3b9 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -238,6 +238,10 @@ void __init pcibios_resource_survey(void)
  */
 fs_initcall(pcibios_assign_resources);
 
+void __weak x86_pci_root_bus_res_quirks(struct pci_bus *b)
+{
+}
+
 /*
  *  If we set up a device for bus mastering, we need to check the latency
  *  timer as certain crappy BIOSes forget to set it properly.
-- 
cgit v1.2.3-59-g8ed1b


From b10ceb5530df7ee6e81f92910589a34dd3e5690b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 20 Apr 2009 18:35:40 -0700
Subject: x86/PCI: don't bother with root quirks if _CRS is used

It will be overwriten later if _CRS is used, so don't bother to set it.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/amd_bus.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index e121ee050f7c..f893d6a6e803 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -100,6 +100,10 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
 	int j;
 	struct pci_root_info *info;
 
+	/* don't go for it if _CRS is used */
+	if (pci_probe & PCI_USE__CRS)
+		return;
+
 	/* if only one root bus, don't need to anything */
 	if (pci_root_num < 2)
 		return;
-- 
cgit v1.2.3-59-g8ed1b


From 4c31e92b97b6d7e7b19ee5e54a22571ffdebb305 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 22 Apr 2009 14:19:27 -0700
Subject: x86: check boundary in setup_node_bootmem()

Commit dc09855 ("x86/uv: fix init of memory-less nodes") causes a
two sockets system (where node-1 doesn't have RAM installed) to crash.

That commit makes node_possible include cpu nodes that do not have memory.
So check boundary in setup_node_bootmem().

[ Impact: fix boot crash on RAM-less NUMA node system ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Jack Steiner <steiner@sgi.com>
LKML-Reference: <49EF89DF.9090404@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/numa_64.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index d73aaa892371..2d05a12029dc 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -188,6 +188,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
 	const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
 	int nid;
 
+	if (!end)
+		return;
+
 	start = roundup(start, ZONE_ALIGN);
 
 	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
-- 
cgit v1.2.3-59-g8ed1b


From d2c8604121648b744ebb127991f1c5876931885e Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Thu, 23 Apr 2009 19:19:42 -0400
Subject: x86, hpet: Stop soliciting hpet=force users on ICH4M

The HPET in the ICH4M is not documented in the data sheet
because it was not officially validated.

While it is fine for hackers to continue to use "hpet=force"
to enable the hardware that they have, it is not prudent to
solicit additional "hpet=force" users on this hardware.

[ Impact: remove hpet=force syslog message on old-ICH systems ]

Signed-off-by: Len Brown <len.brown@intel.com>
Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
LKML-Reference: <alpine.LFD.2.00.0904231918510.15843@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/quirks.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index e95022e4f5d5..7563b31b4f03 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -261,8 +261,6 @@ static void old_ich_force_enable_hpet_user(struct pci_dev *dev)
 {
 	if (hpet_force_user)
 		old_ich_force_enable_hpet(dev);
-	else
-		hpet_print_force_info();
 }
 
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
-- 
cgit v1.2.3-59-g8ed1b


From 33015c85995716d03f6293346cf05a1908b0fb9a Mon Sep 17 00:00:00 2001
From: Stuart Bennett <stuart@freedesktop.org>
Date: Tue, 28 Apr 2009 20:17:48 +0100
Subject: tracing: x86, mmiotrace: fix range test

Matching on (addr == (p->addr + p->len)) causes problems when mappings
are adjacent.

[ Impact: fix mmiotrace confusion on adjacent iomaps ]

Signed-off-by: Stuart Bennett <stuart@freedesktop.org>
Acked-by: Pekka Paalanen <pq@iki.fi>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1240946271-7083-2-git-send-email-stuart@freedesktop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/kmmio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 4f115e00486b..50dc802a1c46 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -87,7 +87,7 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
 {
 	struct kmmio_probe *p;
 	list_for_each_entry_rcu(p, &kmmio_probes, list) {
-		if (addr >= p->addr && addr <= (p->addr + p->len))
+		if (addr >= p->addr && addr < (p->addr + p->len))
 			return p;
 	}
 	return NULL;
-- 
cgit v1.2.3-59-g8ed1b


From 2f65dd475c6a8a997145ea83cc3d2d5e6dc55af1 Mon Sep 17 00:00:00 2001
From: John Wright <john.wright@hp.com>
Date: Wed, 29 Apr 2009 14:32:01 -0600
Subject: x86: gettimeofday() vDSO: fix segfault when tv == NULL

According to the gettimeofday(2) manual:

       If either tv or tz is NULL, the corresponding structure is not
       set or returned.

Since it is legal to give NULL as the tv argument, the code should make
sure tv is not NULL before trying to dereference it.

This issue manifests itself on x86_64 when vdso=0 is not on the kernel
command-line and libc uses the vDSO for gettimeofday() (e.g. glibc >=
2.7).  A simple reproducer:

  #include <stdio.h>
  #include <sys/time.h>

  int main(void)
  {
      struct timezone tz;

      gettimeofday(NULL, &tz);

      return 0;
  }

See http://bugs.debian.org/466491 for more details.

[ Impact: fix gettimeofday(NULL, &tz) segfault ]

Signed-off-by: John Wright <john.wright@hp.com>
Cc: Andi Kleen <ak@suse.de>
Cc: John Wright <john.wright@hp.com>
LKML-Reference: <1241037121-14805-1-git-send-email-john.wright@hp.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/vdso/vclock_gettime.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index d9d35824c56f..6a40b78b46aa 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -104,11 +104,13 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
 {
 	long ret;
 	if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
-		BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
-			     offsetof(struct timespec, tv_nsec) ||
-			     sizeof(*tv) != sizeof(struct timespec));
-		do_realtime((struct timespec *)tv);
-		tv->tv_usec /= 1000;
+		if (likely(tv != NULL)) {
+			BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
+				     offsetof(struct timespec, tv_nsec) ||
+				     sizeof(*tv) != sizeof(struct timespec));
+			do_realtime((struct timespec *)tv);
+			tv->tv_usec /= 1000;
+		}
 		if (unlikely(tz != NULL)) {
 			/* Avoid memcpy. Some old compilers fail to inline it */
 			tz->tz_minuteswest = gtod->sys_tz.tz_minuteswest;
-- 
cgit v1.2.3-59-g8ed1b


From f9a196b8dceba3c1e5fe885b81e45043ad7c60fc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 1 May 2009 20:59:25 +0200
Subject: x86: initialize io_bitmap_base on 32bit

commit db949bba3c7cf2e664ac12e237c6d4c914f0c69d (x86-32: use non-lazy
io bitmap context switching) broke ioperm for 32bit because it removed
the lazy initialization of io_bitmap_base and did not set it to the
real bitmap offset.

[ Impact: fix non-working sys_ioperm() on 32-bit kernels ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/common.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c4f667896c28..c1caefc82e62 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1203,6 +1203,8 @@ void __cpuinit cpu_init(void)
 	load_TR_desc();
 	load_LDT(&init_mm.context);
 
+	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+
 #ifdef CONFIG_DOUBLEFAULT
 	/* Set up doublefault TSS pointer in the GDT */
 	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-- 
cgit v1.2.3-59-g8ed1b


From 6da7342ff1c5274c51ada084974668d10f769c16 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 4 May 2009 11:44:38 +0200
Subject: amd-iommu: fix iommu flag masks

The feature bits should be set via bitmasks, not via feature IDs.

[ Impact: fix feature enabling in newer IOMMU versions ]

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
LKML-Reference: <20090504102028.GA30307@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 42c33cebf00f..8c0be0902dac 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -49,10 +49,10 @@
 #define IVHD_DEV_EXT_SELECT             0x46
 #define IVHD_DEV_EXT_SELECT_RANGE       0x47
 
-#define IVHD_FLAG_HT_TUN_EN             0x00
-#define IVHD_FLAG_PASSPW_EN             0x01
-#define IVHD_FLAG_RESPASSPW_EN          0x02
-#define IVHD_FLAG_ISOC_EN               0x03
+#define IVHD_FLAG_HT_TUN_EN_MASK        0x01
+#define IVHD_FLAG_PASSPW_EN_MASK        0x02
+#define IVHD_FLAG_RESPASSPW_EN_MASK     0x04
+#define IVHD_FLAG_ISOC_EN_MASK          0x08
 
 #define IVMD_FLAG_EXCL_RANGE            0x08
 #define IVMD_FLAG_UNITY_MAP             0x01
@@ -569,19 +569,19 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 	 * First set the recommended feature enable bits from ACPI
 	 * into the IOMMU control registers
 	 */
-	h->flags & IVHD_FLAG_HT_TUN_EN ?
+	h->flags & IVHD_FLAG_HT_TUN_EN_MASK ?
 		iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
 		iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
 
-	h->flags & IVHD_FLAG_PASSPW_EN ?
+	h->flags & IVHD_FLAG_PASSPW_EN_MASK ?
 		iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
 		iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
 
-	h->flags & IVHD_FLAG_RESPASSPW_EN ?
+	h->flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
 		iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
 		iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
 
-	h->flags & IVHD_FLAG_ISOC_EN ?
+	h->flags & IVHD_FLAG_ISOC_EN_MASK ?
 		iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
 		iommu_feature_disable(iommu, CONTROL_ISOC_EN);
 
-- 
cgit v1.2.3-59-g8ed1b


From 35d11680a9d82c93eb92f08f9702b72877427b4a Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Mon, 4 May 2009 20:28:59 +0200
Subject: x86: show number of core_siblings instead of thread_siblings in
 /proc/cpuinfo

Commit 7ad728f98162cb1af06a85b2a5fc422dddd4fb78
(cpumask: x86: convert cpu_sibling_map/cpu_core_map to cpumask_var_t)
changed the output of /proc/cpuinfo for siblings:

Example on an AMD Phenom:

  physical id   : 0
  siblings : 1
  core id	   : 3
  cpu cores  : 4

Before that commit it was:

  physical id	: 0
  siblings : 4
  core id	   : 3
  cpu cores  : 4

Instead of cpu_core_mask it now uses cpu_sibling_mask to count siblings.
This is due to the following hunk of above commit:

|  --- a/arch/x86/kernel/cpu/proc.c
|  +++ b/arch/x86/kernel/cpu/proc.c
|  @@ -14,7 +14,7 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinf
|          if (c->x86_max_cores * smp_num_siblings > 1) {
|                  seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
|                  seq_printf(m, "siblings\t: %d\n",
|  -                          cpus_weight(per_cpu(cpu_core_map, cpu)));
|  +                          cpumask_weight(cpu_sibling_mask(cpu)));
|                  seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
|                  seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
|                  seq_printf(m, "apicid\t\t: %d\n", c->apicid);

This was a mistake, because the impact line shows that this side-effect
was not anticipated:

   Impact: reduce per-cpu size for CONFIG_CPUMASK_OFFSTACK=y

So revert the respective hunk to restore the old behavior.

[ Impact: fix sibling-info regression in /proc/cpuinfo ]

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <20090504182859.GA29045@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index f93047fed791..d5e30397246b 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -14,7 +14,7 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
 	if (c->x86_max_cores * smp_num_siblings > 1) {
 		seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
 		seq_printf(m, "siblings\t: %d\n",
-			   cpumask_weight(cpu_sibling_mask(cpu)));
+			   cpumask_weight(cpu_core_mask(cpu)));
 		seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
 		seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
 		seq_printf(m, "apicid\t\t: %d\n", c->apicid);
-- 
cgit v1.2.3-59-g8ed1b