From 551b81f26ffc2135b8490babad1a9ab12d617e8d Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Tue, 13 Oct 2009 19:44:44 +0000
Subject: powerpc: Make NR_IRQS a CONFIG option

The irq_desc array consumes quite a lot of space, and for systems
that don't need or can't have 512 irqs it's just wasted space.

The first 16 are reserved for ISA, so the minimum of 32 is really
16 - and no one has asked for more than 512 so leave that as the
maximum.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/irq.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h
index bbcd1aaf3dfd..b83fcc81faed 100644
--- a/arch/powerpc/include/asm/irq.h
+++ b/arch/powerpc/include/asm/irq.h
@@ -34,8 +34,8 @@ extern atomic_t ppc_n_lost_interrupts;
  */
 #define NO_IRQ_IGNORE		((unsigned int)-1)
 
-/* Total number of virq in the platform (make it a CONFIG_* option ? */
-#define NR_IRQS		512
+/* Total number of virq in the platform */
+#define NR_IRQS		CONFIG_NR_IRQS
 
 /* Number of irqs reserved for the legacy controller */
 #define NUM_ISA_INTERRUPTS	16
-- 
cgit v1.2.3-59-g8ed1b


From 6cff46f4bc6cc4a8a4154b0b6a2e669db08e8fd2 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Tue, 13 Oct 2009 19:44:51 +0000
Subject: powerpc: Remove get_irq_desc()

get_irq_desc() is a powerpc-specific version of irq_to_desc(). That
is reason enough to remove it, but it also doesn't know about sparse
irq_desc support which irq_to_desc() does (when we enable it).

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/irq.h                  |  2 --
 arch/powerpc/kernel/crash.c                     |  2 +-
 arch/powerpc/kernel/irq.c                       | 28 +++++++++++++------------
 arch/powerpc/platforms/512x/mpc5121_ads_cpld.c  |  2 +-
 arch/powerpc/platforms/52xx/media5200.c         |  2 +-
 arch/powerpc/platforms/82xx/pq2ads-pci-pic.c    |  2 +-
 arch/powerpc/platforms/85xx/socrates_fpga_pic.c |  2 +-
 arch/powerpc/platforms/86xx/gef_pic.c           |  2 +-
 arch/powerpc/platforms/cell/beat_interrupt.c    |  2 +-
 arch/powerpc/platforms/cell/spider-pic.c        |  4 ++--
 arch/powerpc/platforms/iseries/irq.c            |  2 +-
 arch/powerpc/platforms/powermac/pic.c           |  8 +++----
 arch/powerpc/platforms/pseries/xics.c           |  8 +++----
 arch/powerpc/sysdev/cpm1.c                      |  2 +-
 arch/powerpc/sysdev/cpm2_pic.c                  | 10 +++++----
 arch/powerpc/sysdev/fsl_msi.c                   |  2 +-
 arch/powerpc/sysdev/i8259.c                     |  4 ++--
 arch/powerpc/sysdev/ipic.c                      |  2 +-
 arch/powerpc/sysdev/mpc8xx_pic.c                |  2 +-
 arch/powerpc/sysdev/mpic.c                      | 18 ++++++++--------
 arch/powerpc/sysdev/mv64x60_pic.c               |  2 +-
 arch/powerpc/sysdev/qe_lib/qe_ic.c              |  4 ++--
 arch/powerpc/sysdev/tsi108_pci.c                |  2 +-
 arch/powerpc/sysdev/uic.c                       |  6 +++---
 arch/powerpc/sysdev/xilinx_intc.c               |  2 +-
 25 files changed, 62 insertions(+), 60 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h
index b83fcc81faed..03dc28cdb4da 100644
--- a/arch/powerpc/include/asm/irq.h
+++ b/arch/powerpc/include/asm/irq.h
@@ -17,8 +17,6 @@
 #include <asm/atomic.h>
 
 
-#define get_irq_desc(irq) (&irq_desc[(irq)])
-
 /* Define a way to iterate across irqs. */
 #define for_each_irq(i) \
 	for ((i) = 0; (i) < NR_IRQS; ++(i))
diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
index 0a8439aafdd1..6f4613dd05ef 100644
--- a/arch/powerpc/kernel/crash.c
+++ b/arch/powerpc/kernel/crash.c
@@ -373,7 +373,7 @@ void default_machine_crash_shutdown(struct pt_regs *regs)
 	hard_irq_disable();
 
 	for_each_irq(i) {
-		struct irq_desc *desc = irq_desc + i;
+		struct irq_desc *desc = irq_to_desc(i);
 
 		if (desc->status & IRQ_INPROGRESS)
 			desc->chip->eoi(i);
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index e5d121177984..65632215f020 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -190,7 +190,7 @@ int show_interrupts(struct seq_file *p, void *v)
 	}
 
 	if (i < NR_IRQS) {
-		desc = get_irq_desc(i);
+		desc = irq_to_desc(i);
 		spin_lock_irqsave(&desc->lock, flags);
 		action = desc->action;
 		if (!action || !action->handler)
@@ -230,23 +230,25 @@ skip:
 #ifdef CONFIG_HOTPLUG_CPU
 void fixup_irqs(cpumask_t map)
 {
+	struct irq_desc *desc;
 	unsigned int irq;
 	static int warned;
 
 	for_each_irq(irq) {
 		cpumask_t mask;
 
-		if (irq_desc[irq].status & IRQ_PER_CPU)
+		desc = irq_to_desc(irq);
+		if (desc && desc->status & IRQ_PER_CPU)
 			continue;
 
-		cpumask_and(&mask, irq_desc[irq].affinity, &map);
+		cpumask_and(&mask, desc->affinity, &map);
 		if (any_online_cpu(mask) == NR_CPUS) {
 			printk("Breaking affinity for irq %i\n", irq);
 			mask = map;
 		}
-		if (irq_desc[irq].chip->set_affinity)
-			irq_desc[irq].chip->set_affinity(irq, &mask);
-		else if (irq_desc[irq].action && !(warned++))
+		if (desc->chip->set_affinity)
+			desc->chip->set_affinity(irq, &mask);
+		else if (desc->action && !(warned++))
 			printk("Cannot set affinity for irq %i\n", irq);
 	}
 
@@ -273,7 +275,7 @@ static inline void handle_one_irq(unsigned int irq)
 		return;
 	}
 
-	desc = irq_desc + irq;
+	desc = irq_to_desc(irq);
 	saved_sp_limit = current->thread.ksp_limit;
 
 	irqtp->task = curtp->task;
@@ -535,7 +537,7 @@ struct irq_host *irq_alloc_host(struct device_node *of_node,
 			smp_wmb();
 
 			/* Clear norequest flags */
-			get_irq_desc(i)->status &= ~IRQ_NOREQUEST;
+			irq_to_desc(i)->status &= ~IRQ_NOREQUEST;
 
 			/* Legacy flags are left to default at this point,
 			 * one can then use irq_create_mapping() to
@@ -602,7 +604,7 @@ static int irq_setup_virq(struct irq_host *host, unsigned int virq,
 			    irq_hw_number_t hwirq)
 {
 	/* Clear IRQ_NOREQUEST flag */
-	get_irq_desc(virq)->status &= ~IRQ_NOREQUEST;
+	irq_to_desc(virq)->status &= ~IRQ_NOREQUEST;
 
 	/* map it */
 	smp_wmb();
@@ -732,7 +734,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
 
 	/* Set type if specified and different than the current one */
 	if (type != IRQ_TYPE_NONE &&
-	    type != (get_irq_desc(virq)->status & IRQF_TRIGGER_MASK))
+	    type != (irq_to_desc(virq)->status & IRQF_TRIGGER_MASK))
 		set_irq_type(virq, type);
 	return virq;
 }
@@ -804,7 +806,7 @@ void irq_dispose_mapping(unsigned int virq)
 	irq_map[virq].hwirq = host->inval_irq;
 
 	/* Set some flags */
-	get_irq_desc(virq)->status |= IRQ_NOREQUEST;
+	irq_to_desc(virq)->status |= IRQ_NOREQUEST;
 
 	/* Free it */
 	irq_free_virt(virq, 1);
@@ -1001,7 +1003,7 @@ void irq_early_init(void)
 	unsigned int i;
 
 	for (i = 0; i < NR_IRQS; i++)
-		get_irq_desc(i)->status |= IRQ_NOREQUEST;
+		irq_to_desc(i)->status |= IRQ_NOREQUEST;
 }
 
 /* We need to create the radix trees late */
@@ -1064,7 +1066,7 @@ static int virq_debug_show(struct seq_file *m, void *private)
 		      "chip name", "host name");
 
 	for (i = 1; i < NR_IRQS; i++) {
-		desc = get_irq_desc(i);
+		desc = irq_to_desc(i);
 		spin_lock_irqsave(&desc->lock, flags);
 
 		if (desc->action && desc->action->handler) {
diff --git a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c
index a6ce80566625..cd70ee1667fa 100644
--- a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c
+++ b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c
@@ -132,7 +132,7 @@ static int
 cpld_pic_host_map(struct irq_host *h, unsigned int virq,
 			     irq_hw_number_t hw)
 {
-	get_irq_desc(virq)->status |= IRQ_LEVEL;
+	irq_to_desc(virq)->status |= IRQ_LEVEL;
 	set_irq_chip_and_handler(virq, &cpld_pic, handle_level_irq);
 	return 0;
 }
diff --git a/arch/powerpc/platforms/52xx/media5200.c b/arch/powerpc/platforms/52xx/media5200.c
index 68e4f1696d14..478020358748 100644
--- a/arch/powerpc/platforms/52xx/media5200.c
+++ b/arch/powerpc/platforms/52xx/media5200.c
@@ -114,7 +114,7 @@ void media5200_irq_cascade(unsigned int virq, struct irq_desc *desc)
 static int media5200_irq_map(struct irq_host *h, unsigned int virq,
 			     irq_hw_number_t hw)
 {
-	struct irq_desc *desc = get_irq_desc(virq);
+	struct irq_desc *desc = irq_to_desc(virq);
 
 	pr_debug("%s: h=%p, virq=%i, hwirq=%i\n", __func__, h, virq, (int)hw);
 	set_irq_chip_data(virq, &media5200_irq);
diff --git a/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c b/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c
index 7ee979f323d1..a682331ba0ff 100644
--- a/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c
+++ b/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c
@@ -107,7 +107,7 @@ static void pq2ads_pci_irq_demux(unsigned int irq, struct irq_desc *desc)
 static int pci_pic_host_map(struct irq_host *h, unsigned int virq,
 			    irq_hw_number_t hw)
 {
-	get_irq_desc(virq)->status |= IRQ_LEVEL;
+	irq_to_desc(virq)->status |= IRQ_LEVEL;
 	set_irq_chip_data(virq, h->host_data);
 	set_irq_chip_and_handler(virq, &pq2ads_pci_ic, handle_level_irq);
 	return 0;
diff --git a/arch/powerpc/platforms/85xx/socrates_fpga_pic.c b/arch/powerpc/platforms/85xx/socrates_fpga_pic.c
index 60edf63d0157..e59920aa6668 100644
--- a/arch/powerpc/platforms/85xx/socrates_fpga_pic.c
+++ b/arch/powerpc/platforms/85xx/socrates_fpga_pic.c
@@ -245,7 +245,7 @@ static int socrates_fpga_pic_host_map(struct irq_host *h, unsigned int virq,
 		irq_hw_number_t hwirq)
 {
 	/* All interrupts are LEVEL sensitive */
-	get_irq_desc(virq)->status |= IRQ_LEVEL;
+	irq_to_desc(virq)->status |= IRQ_LEVEL;
 	set_irq_chip_and_handler(virq, &socrates_fpga_pic_chip,
 			handle_fasteoi_irq);
 
diff --git a/arch/powerpc/platforms/86xx/gef_pic.c b/arch/powerpc/platforms/86xx/gef_pic.c
index 50d0a2b63809..978d6cb37516 100644
--- a/arch/powerpc/platforms/86xx/gef_pic.c
+++ b/arch/powerpc/platforms/86xx/gef_pic.c
@@ -163,7 +163,7 @@ static int gef_pic_host_map(struct irq_host *h, unsigned int virq,
 			  irq_hw_number_t hwirq)
 {
 	/* All interrupts are LEVEL sensitive */
-	get_irq_desc(virq)->status |= IRQ_LEVEL;
+	irq_to_desc(virq)->status |= IRQ_LEVEL;
 	set_irq_chip_and_handler(virq, &gef_pic_chip, handle_level_irq);
 
 	return 0;
diff --git a/arch/powerpc/platforms/cell/beat_interrupt.c b/arch/powerpc/platforms/cell/beat_interrupt.c
index 72254848a228..4a2bbff57698 100644
--- a/arch/powerpc/platforms/cell/beat_interrupt.c
+++ b/arch/powerpc/platforms/cell/beat_interrupt.c
@@ -136,7 +136,7 @@ static void beatic_pic_host_unmap(struct irq_host *h, unsigned int virq)
 static int beatic_pic_host_map(struct irq_host *h, unsigned int virq,
 			       irq_hw_number_t hw)
 {
-	struct irq_desc *desc = get_irq_desc(virq);
+	struct irq_desc *desc = irq_to_desc(virq);
 	int64_t	err;
 
 	err = beat_construct_and_connect_irq_plug(virq, hw);
diff --git a/arch/powerpc/platforms/cell/spider-pic.c b/arch/powerpc/platforms/cell/spider-pic.c
index 4e5655624ae8..9dd63c5d11a8 100644
--- a/arch/powerpc/platforms/cell/spider-pic.c
+++ b/arch/powerpc/platforms/cell/spider-pic.c
@@ -102,7 +102,7 @@ static void spider_ack_irq(unsigned int virq)
 
 	/* Reset edge detection logic if necessary
 	 */
-	if (get_irq_desc(virq)->status & IRQ_LEVEL)
+	if (irq_to_desc(virq)->status & IRQ_LEVEL)
 		return;
 
 	/* Only interrupts 47 to 50 can be set to edge */
@@ -119,7 +119,7 @@ static int spider_set_irq_type(unsigned int virq, unsigned int type)
 	struct spider_pic *pic = spider_virq_to_pic(virq);
 	unsigned int hw = irq_map[virq].hwirq;
 	void __iomem *cfg = spider_get_irq_config(pic, hw);
-	struct irq_desc *desc = get_irq_desc(virq);
+	struct irq_desc *desc = irq_to_desc(virq);
 	u32 old_mask;
 	u32 ic;
 
diff --git a/arch/powerpc/platforms/iseries/irq.c b/arch/powerpc/platforms/iseries/irq.c
index 94f444758836..f8446ea31189 100644
--- a/arch/powerpc/platforms/iseries/irq.c
+++ b/arch/powerpc/platforms/iseries/irq.c
@@ -214,7 +214,7 @@ void __init iSeries_activate_IRQs()
 	unsigned long flags;
 
 	for_each_irq (irq) {
-		struct irq_desc *desc = get_irq_desc(irq);
+		struct irq_desc *desc = irq_to_desc(irq);
 
 		if (desc && desc->chip && desc->chip->startup) {
 			spin_lock_irqsave(&desc->lock, flags);
diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c
index d212006a5b3c..484d21e55c61 100644
--- a/arch/powerpc/platforms/powermac/pic.c
+++ b/arch/powerpc/platforms/powermac/pic.c
@@ -152,12 +152,12 @@ static unsigned int pmac_startup_irq(unsigned int virq)
         unsigned long bit = 1UL << (src & 0x1f);
         int i = src >> 5;
 
-  	spin_lock_irqsave(&pmac_pic_lock, flags);
-	if ((irq_desc[virq].status & IRQ_LEVEL) == 0)
+	spin_lock_irqsave(&pmac_pic_lock, flags);
+	if ((irq_to_desc(virq)->status & IRQ_LEVEL) == 0)
 		out_le32(&pmac_irq_hw[i]->ack, bit);
         __set_bit(src, ppc_cached_irq_mask);
         __pmac_set_irq_mask(src, 0);
-  	spin_unlock_irqrestore(&pmac_pic_lock, flags);
+	spin_unlock_irqrestore(&pmac_pic_lock, flags);
 
 	return 0;
 }
@@ -285,7 +285,7 @@ static int pmac_pic_host_match(struct irq_host *h, struct device_node *node)
 static int pmac_pic_host_map(struct irq_host *h, unsigned int virq,
 			     irq_hw_number_t hw)
 {
-	struct irq_desc *desc = get_irq_desc(virq);
+	struct irq_desc *desc = irq_to_desc(virq);
 	int level;
 
 	if (hw >= max_irqs)
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
index 419f8a637ffe..75935ae1a941 100644
--- a/arch/powerpc/platforms/pseries/xics.c
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -156,7 +156,7 @@ static int get_irq_server(unsigned int virq, unsigned int strict_check)
 	cpumask_t cpumask;
 	cpumask_t tmp = CPU_MASK_NONE;
 
-	cpumask_copy(&cpumask, irq_desc[virq].affinity);
+	cpumask_copy(&cpumask, irq_to_desc(virq)->affinity);
 	if (!distribute_irqs)
 		return default_server;
 
@@ -419,7 +419,7 @@ static int xics_host_map(struct irq_host *h, unsigned int virq,
 	/* Insert the interrupt mapping into the radix tree for fast lookup */
 	irq_radix_revmap_insert(xics_host, virq, hw);
 
-	get_irq_desc(virq)->status |= IRQ_LEVEL;
+	irq_to_desc(virq)->status |= IRQ_LEVEL;
 	set_irq_chip_and_handler(virq, xics_irq_chip, handle_fasteoi_irq);
 	return 0;
 }
@@ -843,7 +843,7 @@ void xics_migrate_irqs_away(void)
 		/* We need to get IPIs still. */
 		if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
 			continue;
-		desc = get_irq_desc(virq);
+		desc = irq_to_desc(virq);
 
 		/* We only need to migrate enabled IRQS */
 		if (desc == NULL || desc->chip == NULL
@@ -872,7 +872,7 @@ void xics_migrate_irqs_away(void)
 		       virq, cpu);
 
 		/* Reset affinity to all cpus */
-		cpumask_setall(irq_desc[virq].affinity);
+		cpumask_setall(irq_to_desc(virq)->affinity);
 		desc->chip->set_affinity(virq, cpu_all_mask);
 unlock:
 		spin_unlock_irqrestore(&desc->lock, flags);
diff --git a/arch/powerpc/sysdev/cpm1.c b/arch/powerpc/sysdev/cpm1.c
index 82424cd7e128..523537300ad5 100644
--- a/arch/powerpc/sysdev/cpm1.c
+++ b/arch/powerpc/sysdev/cpm1.c
@@ -102,7 +102,7 @@ static int cpm_pic_host_map(struct irq_host *h, unsigned int virq,
 {
 	pr_debug("cpm_pic_host_map(%d, 0x%lx)\n", virq, hw);
 
-	get_irq_desc(virq)->status |= IRQ_LEVEL;
+	irq_to_desc(virq)->status |= IRQ_LEVEL;
 	set_irq_chip_and_handler(virq, &cpm_pic, handle_fasteoi_irq);
 	return 0;
 }
diff --git a/arch/powerpc/sysdev/cpm2_pic.c b/arch/powerpc/sysdev/cpm2_pic.c
index 78f1f7cca0a0..722cf72e190d 100644
--- a/arch/powerpc/sysdev/cpm2_pic.c
+++ b/arch/powerpc/sysdev/cpm2_pic.c
@@ -115,11 +115,13 @@ static void cpm2_ack(unsigned int virq)
 
 static void cpm2_end_irq(unsigned int virq)
 {
+	struct irq_desc *desc;
 	int	bit, word;
 	unsigned int irq_nr = virq_to_hw(virq);
 
-	if (!(irq_desc[irq_nr].status & (IRQ_DISABLED|IRQ_INPROGRESS))
-			&& irq_desc[irq_nr].action) {
+	desc = irq_to_desc(irq_nr);
+	if (!(desc->status & (IRQ_DISABLED|IRQ_INPROGRESS))
+			&& desc->action) {
 
 		bit = irq_to_siubit[irq_nr];
 		word = irq_to_siureg[irq_nr];
@@ -138,7 +140,7 @@ static void cpm2_end_irq(unsigned int virq)
 static int cpm2_set_irq_type(unsigned int virq, unsigned int flow_type)
 {
 	unsigned int src = virq_to_hw(virq);
-	struct irq_desc *desc = get_irq_desc(virq);
+	struct irq_desc *desc = irq_to_desc(virq);
 	unsigned int vold, vnew, edibit;
 
 	if (flow_type == IRQ_TYPE_NONE)
@@ -210,7 +212,7 @@ static int cpm2_pic_host_map(struct irq_host *h, unsigned int virq,
 {
 	pr_debug("cpm2_pic_host_map(%d, 0x%lx)\n", virq, hw);
 
-	get_irq_desc(virq)->status |= IRQ_LEVEL;
+	irq_to_desc(virq)->status |= IRQ_LEVEL;
 	set_irq_chip_and_handler(virq, &cpm2_pic, handle_level_irq);
 	return 0;
 }
diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
index da38a1ff97bb..7174374f90ff 100644
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -55,7 +55,7 @@ static int fsl_msi_host_map(struct irq_host *h, unsigned int virq,
 {
 	struct irq_chip *chip = &fsl_msi_chip;
 
-	get_irq_desc(virq)->status |= IRQ_TYPE_EDGE_FALLING;
+	irq_to_desc(virq)->status |= IRQ_TYPE_EDGE_FALLING;
 
 	set_irq_chip_and_handler(virq, chip, handle_edge_irq);
 
diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c
index a96584ab33dd..78ed945453db 100644
--- a/arch/powerpc/sysdev/i8259.c
+++ b/arch/powerpc/sysdev/i8259.c
@@ -175,12 +175,12 @@ static int i8259_host_map(struct irq_host *h, unsigned int virq,
 
 	/* We block the internal cascade */
 	if (hw == 2)
-		get_irq_desc(virq)->status |= IRQ_NOREQUEST;
+		irq_to_desc(virq)->status |= IRQ_NOREQUEST;
 
 	/* We use the level handler only for now, we might want to
 	 * be more cautious here but that works for now
 	 */
-	get_irq_desc(virq)->status |= IRQ_LEVEL;
+	irq_to_desc(virq)->status |= IRQ_LEVEL;
 	set_irq_chip_and_handler(virq, &i8259_pic, handle_level_irq);
 	return 0;
 }
diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c
index cb7689c4bfbd..f042c1d69002 100644
--- a/arch/powerpc/sysdev/ipic.c
+++ b/arch/powerpc/sysdev/ipic.c
@@ -605,7 +605,7 @@ static int ipic_set_irq_type(unsigned int virq, unsigned int flow_type)
 {
 	struct ipic *ipic = ipic_from_irq(virq);
 	unsigned int src = ipic_irq_to_hw(virq);
-	struct irq_desc *desc = get_irq_desc(virq);
+	struct irq_desc *desc = irq_to_desc(virq);
 	unsigned int vold, vnew, edibit;
 
 	if (flow_type == IRQ_TYPE_NONE)
diff --git a/arch/powerpc/sysdev/mpc8xx_pic.c b/arch/powerpc/sysdev/mpc8xx_pic.c
index 5d2d5522ef41..01179587df2a 100644
--- a/arch/powerpc/sysdev/mpc8xx_pic.c
+++ b/arch/powerpc/sysdev/mpc8xx_pic.c
@@ -72,7 +72,7 @@ static void mpc8xx_end_irq(unsigned int virq)
 
 static int mpc8xx_set_irq_type(unsigned int virq, unsigned int flow_type)
 {
-	struct irq_desc *desc = get_irq_desc(virq);
+	struct irq_desc *desc = irq_to_desc(virq);
 
 	desc->status &= ~(IRQ_TYPE_SENSE_MASK | IRQ_LEVEL);
 	desc->status |= flow_type & IRQ_TYPE_SENSE_MASK;
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index 30c44e6b0413..4fd57ab956bf 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -572,7 +572,7 @@ static int irq_choose_cpu(unsigned int virt_irq)
 	cpumask_t mask;
 	int cpuid;
 
-	cpumask_copy(&mask, irq_desc[virt_irq].affinity);
+	cpumask_copy(&mask, irq_to_desc(virt_irq)->affinity);
 	if (cpus_equal(mask, CPU_MASK_ALL)) {
 		static int irq_rover;
 		static DEFINE_SPINLOCK(irq_rover_lock);
@@ -621,7 +621,7 @@ static struct mpic *mpic_find(unsigned int irq)
 	if (irq < NUM_ISA_INTERRUPTS)
 		return NULL;
 
-	return irq_desc[irq].chip_data;
+	return irq_to_desc(irq)->chip_data;
 }
 
 /* Determine if the linux irq is an IPI */
@@ -648,14 +648,14 @@ static inline u32 mpic_physmask(u32 cpumask)
 /* Get the mpic structure from the IPI number */
 static inline struct mpic * mpic_from_ipi(unsigned int ipi)
 {
-	return irq_desc[ipi].chip_data;
+	return irq_to_desc(ipi)->chip_data;
 }
 #endif
 
 /* Get the mpic structure from the irq number */
 static inline struct mpic * mpic_from_irq(unsigned int irq)
 {
-	return irq_desc[irq].chip_data;
+	return irq_to_desc(irq)->chip_data;
 }
 
 /* Send an EOI */
@@ -735,7 +735,7 @@ static void mpic_unmask_ht_irq(unsigned int irq)
 
 	mpic_unmask_irq(irq);
 
-	if (irq_desc[irq].status & IRQ_LEVEL)
+	if (irq_to_desc(irq)->status & IRQ_LEVEL)
 		mpic_ht_end_irq(mpic, src);
 }
 
@@ -745,7 +745,7 @@ static unsigned int mpic_startup_ht_irq(unsigned int irq)
 	unsigned int src = mpic_irq_to_hw(irq);
 
 	mpic_unmask_irq(irq);
-	mpic_startup_ht_interrupt(mpic, src, irq_desc[irq].status);
+	mpic_startup_ht_interrupt(mpic, src, irq_to_desc(irq)->status);
 
 	return 0;
 }
@@ -755,7 +755,7 @@ static void mpic_shutdown_ht_irq(unsigned int irq)
 	struct mpic *mpic = mpic_from_irq(irq);
 	unsigned int src = mpic_irq_to_hw(irq);
 
-	mpic_shutdown_ht_interrupt(mpic, src, irq_desc[irq].status);
+	mpic_shutdown_ht_interrupt(mpic, src, irq_to_desc(irq)->status);
 	mpic_mask_irq(irq);
 }
 
@@ -772,7 +772,7 @@ static void mpic_end_ht_irq(unsigned int irq)
 	 * latched another edge interrupt coming in anyway
 	 */
 
-	if (irq_desc[irq].status & IRQ_LEVEL)
+	if (irq_to_desc(irq)->status & IRQ_LEVEL)
 		mpic_ht_end_irq(mpic, src);
 	mpic_eoi(mpic);
 }
@@ -856,7 +856,7 @@ int mpic_set_irq_type(unsigned int virq, unsigned int flow_type)
 {
 	struct mpic *mpic = mpic_from_irq(virq);
 	unsigned int src = mpic_irq_to_hw(virq);
-	struct irq_desc *desc = get_irq_desc(virq);
+	struct irq_desc *desc = irq_to_desc(virq);
 	unsigned int vecpri, vold, vnew;
 
 	DBG("mpic: set_irq_type(mpic:@%p,virq:%d,src:0x%x,type:0x%x)\n",
diff --git a/arch/powerpc/sysdev/mv64x60_pic.c b/arch/powerpc/sysdev/mv64x60_pic.c
index 2aa4ed066db1..485b92477d7c 100644
--- a/arch/powerpc/sysdev/mv64x60_pic.c
+++ b/arch/powerpc/sysdev/mv64x60_pic.c
@@ -213,7 +213,7 @@ static int mv64x60_host_map(struct irq_host *h, unsigned int virq,
 {
 	int level1;
 
-	get_irq_desc(virq)->status |= IRQ_LEVEL;
+	irq_to_desc(virq)->status |= IRQ_LEVEL;
 
 	level1 = (hwirq & MV64x60_LEVEL1_MASK) >> MV64x60_LEVEL1_OFFSET;
 	BUG_ON(level1 > MV64x60_LEVEL1_GPP);
diff --git a/arch/powerpc/sysdev/qe_lib/qe_ic.c b/arch/powerpc/sysdev/qe_lib/qe_ic.c
index 3faa42e03a85..fc098744ad82 100644
--- a/arch/powerpc/sysdev/qe_lib/qe_ic.c
+++ b/arch/powerpc/sysdev/qe_lib/qe_ic.c
@@ -189,7 +189,7 @@ static inline void qe_ic_write(volatile __be32  __iomem * base, unsigned int reg
 
 static inline struct qe_ic *qe_ic_from_irq(unsigned int virq)
 {
-	return irq_desc[virq].chip_data;
+	return irq_to_desc(virq)->chip_data;
 }
 
 #define virq_to_hw(virq)	((unsigned int)irq_map[virq].hwirq)
@@ -263,7 +263,7 @@ static int qe_ic_host_map(struct irq_host *h, unsigned int virq,
 	chip = &qe_ic->hc_irq;
 
 	set_irq_chip_data(virq, qe_ic);
-	get_irq_desc(virq)->status |= IRQ_LEVEL;
+	irq_to_desc(virq)->status |= IRQ_LEVEL;
 
 	set_irq_chip_and_handler(virq, chip, handle_level_irq);
 
diff --git a/arch/powerpc/sysdev/tsi108_pci.c b/arch/powerpc/sysdev/tsi108_pci.c
index cf244a419e96..02f600991dce 100644
--- a/arch/powerpc/sysdev/tsi108_pci.c
+++ b/arch/powerpc/sysdev/tsi108_pci.c
@@ -398,7 +398,7 @@ static int pci_irq_host_map(struct irq_host *h, unsigned int virq,
 	DBG("%s(%d, 0x%lx)\n", __func__, virq, hw);
 	if ((virq >= 1) && (virq <= 4)){
 		irq = virq + IRQ_PCI_INTAD_BASE - 1;
-		get_irq_desc(irq)->status |= IRQ_LEVEL;
+		irq_to_desc(irq)->status |= IRQ_LEVEL;
 		set_irq_chip(irq, &tsi108_pci_irq);
 	}
 	return 0;
diff --git a/arch/powerpc/sysdev/uic.c b/arch/powerpc/sysdev/uic.c
index 466ce9ace127..cf97935863c8 100644
--- a/arch/powerpc/sysdev/uic.c
+++ b/arch/powerpc/sysdev/uic.c
@@ -57,7 +57,7 @@ struct uic {
 
 static void uic_unmask_irq(unsigned int virq)
 {
-	struct irq_desc *desc = get_irq_desc(virq);
+	struct irq_desc *desc = irq_to_desc(virq);
 	struct uic *uic = get_irq_chip_data(virq);
 	unsigned int src = uic_irq_to_hw(virq);
 	unsigned long flags;
@@ -101,7 +101,7 @@ static void uic_ack_irq(unsigned int virq)
 
 static void uic_mask_ack_irq(unsigned int virq)
 {
-	struct irq_desc *desc = get_irq_desc(virq);
+	struct irq_desc *desc = irq_to_desc(virq);
 	struct uic *uic = get_irq_chip_data(virq);
 	unsigned int src = uic_irq_to_hw(virq);
 	unsigned long flags;
@@ -129,7 +129,7 @@ static int uic_set_irq_type(unsigned int virq, unsigned int flow_type)
 {
 	struct uic *uic = get_irq_chip_data(virq);
 	unsigned int src = uic_irq_to_hw(virq);
-	struct irq_desc *desc = get_irq_desc(virq);
+	struct irq_desc *desc = irq_to_desc(virq);
 	unsigned long flags;
 	int trigger, polarity;
 	u32 tr, pr, mask;
diff --git a/arch/powerpc/sysdev/xilinx_intc.c b/arch/powerpc/sysdev/xilinx_intc.c
index 40edad520770..ab743718876b 100644
--- a/arch/powerpc/sysdev/xilinx_intc.c
+++ b/arch/powerpc/sysdev/xilinx_intc.c
@@ -79,7 +79,7 @@ static void xilinx_intc_mask(unsigned int virq)
 
 static int xilinx_intc_set_type(unsigned int virq, unsigned int flow_type)
 {
-	struct irq_desc *desc = get_irq_desc(virq);
+	struct irq_desc *desc = irq_to_desc(virq);
 
 	desc->status &= ~(IRQ_TYPE_SENSE_MASK | IRQ_LEVEL);
 	desc->status |= flow_type & IRQ_TYPE_SENSE_MASK;
-- 
cgit v1.2.3-59-g8ed1b


From a0668cdc154e54bf0c85182e0535eea237d53146 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 28 Oct 2009 16:27:18 +0000
Subject: powerpc/mm: Cleanup management of kmem_caches for pagetables

Currently we have a fair bit of rather fiddly code to manage the
various kmem_caches used to store page tables of various levels.  We
generally have two caches holding some combination of PGD, PUD and PMD
tables, plus several more for the special hugepage pagetables.

This patch cleans this all up by taking a different approach.  Rather
than the caches being designated as for PUDs or for hugeptes for 16M
pages, the caches are simply allocated to be a specific size.  Thus
sharing of caches between different types/levels of pagetables happens
naturally.  The pagetable size, where needed, is passed around encoded
in the same way as {PGD,PUD,PMD}_INDEX_SIZE; that is n where the
pagetable contains 2^n pointers.

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/pgalloc-32.h    | 10 ++---
 arch/powerpc/include/asm/pgalloc-64.h    | 60 +++++++++++++++------------
 arch/powerpc/include/asm/pgalloc.h       | 30 ++------------
 arch/powerpc/include/asm/pgtable-ppc64.h |  1 +
 arch/powerpc/mm/hugetlbpage.c            | 51 +++++++----------------
 arch/powerpc/mm/init_64.c                | 70 ++++++++++++++++++++++----------
 arch/powerpc/mm/pgtable.c                | 25 ++++++++----
 7 files changed, 125 insertions(+), 122 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/pgalloc-32.h b/arch/powerpc/include/asm/pgalloc-32.h
index c9500d666a1d..580cf73b96e8 100644
--- a/arch/powerpc/include/asm/pgalloc-32.h
+++ b/arch/powerpc/include/asm/pgalloc-32.h
@@ -3,7 +3,8 @@
 
 #include <linux/threads.h>
 
-#define PTE_NONCACHE_NUM	0  /* dummy for now to share code w/ppc64 */
+/* For 32-bit, all levels of page tables are just drawn from get_free_page() */
+#define MAX_PGTABLE_INDEX_SIZE	0
 
 extern void __bad_pte(pmd_t *pmd);
 
@@ -36,11 +37,10 @@ extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
 extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr);
 extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
 
-static inline void pgtable_free(pgtable_free_t pgf)
+static inline void pgtable_free(void *table, unsigned index_size)
 {
-	void *p = (void *)(pgf.val & ~PGF_CACHENUM_MASK);
-
-	free_page((unsigned long)p);
+	BUG_ON(index_size); /* 32-bit doesn't use this */
+	free_page((unsigned long)table);
 }
 
 #define check_pgt_cache()	do { } while (0)
diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h
index e6f069c4f713..5c1cd73dafa8 100644
--- a/arch/powerpc/include/asm/pgalloc-64.h
+++ b/arch/powerpc/include/asm/pgalloc-64.h
@@ -11,27 +11,39 @@
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
 
+/*
+ * Functions that deal with pagetables that could be at any level of
+ * the table need to be passed an "index_size" so they know how to
+ * handle allocation.  For PTE pages (which are linked to a struct
+ * page for now, and drawn from the main get_free_pages() pool), the
+ * allocation size will be (2^index_size * sizeof(pointer)) and
+ * allocations are drawn from the kmem_cache in PGT_CACHE(index_size).
+ *
+ * The maximum index size needs to be big enough to allow any
+ * pagetable sizes we need, but small enough to fit in the low bits of
+ * any page table pointer.  In other words all pagetables, even tiny
+ * ones, must be aligned to allow at least enough low 0 bits to
+ * contain this value.  This value is also used as a mask, so it must
+ * be one less than a power of two.
+ */
+#define MAX_PGTABLE_INDEX_SIZE	0xf
+
 #ifndef CONFIG_PPC_SUBPAGE_PROT
 static inline void subpage_prot_free(pgd_t *pgd) {}
 #endif
 
 extern struct kmem_cache *pgtable_cache[];
-
-#define PGD_CACHE_NUM		0
-#define PUD_CACHE_NUM		1
-#define PMD_CACHE_NUM		1
-#define HUGEPTE_CACHE_NUM	2
-#define PTE_NONCACHE_NUM	7  /* from GFP rather than kmem_cache */
+#define PGT_CACHE(shift) (pgtable_cache[(shift)-1])
 
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL);
+	return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL);
 }
 
 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	subpage_prot_free(pgd);
-	kmem_cache_free(pgtable_cache[PGD_CACHE_NUM], pgd);
+	kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
 }
 
 #ifndef CONFIG_PPC_64K_PAGES
@@ -40,13 +52,13 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return kmem_cache_alloc(pgtable_cache[PUD_CACHE_NUM],
+	return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE),
 				GFP_KERNEL|__GFP_REPEAT);
 }
 
 static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 {
-	kmem_cache_free(pgtable_cache[PUD_CACHE_NUM], pud);
+	kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud);
 }
 
 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
@@ -78,13 +90,13 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
 
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return kmem_cache_alloc(pgtable_cache[PMD_CACHE_NUM],
+	return kmem_cache_alloc(PGT_CACHE(PMD_INDEX_SIZE),
 				GFP_KERNEL|__GFP_REPEAT);
 }
 
 static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
-	kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd);
+	kmem_cache_free(PGT_CACHE(PMD_INDEX_SIZE), pmd);
 }
 
 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
@@ -107,24 +119,22 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
 	return page;
 }
 
-static inline void pgtable_free(pgtable_free_t pgf)
+static inline void pgtable_free(void *table, unsigned index_size)
 {
-	void *p = (void *)(pgf.val & ~PGF_CACHENUM_MASK);
-	int cachenum = pgf.val & PGF_CACHENUM_MASK;
-
-	if (cachenum == PTE_NONCACHE_NUM)
-		free_page((unsigned long)p);
-	else
-		kmem_cache_free(pgtable_cache[cachenum], p);
+	if (!index_size)
+		free_page((unsigned long)table);
+	else {
+		BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE);
+		kmem_cache_free(PGT_CACHE(index_size), table);
+	}
 }
 
-#define __pmd_free_tlb(tlb, pmd,addr)		      \
-	pgtable_free_tlb(tlb, pgtable_free_cache(pmd, \
-		PMD_CACHE_NUM, PMD_TABLE_SIZE-1))
+#define __pmd_free_tlb(tlb, pmd, addr)		      \
+	pgtable_free_tlb(tlb, pmd, PMD_INDEX_SIZE)
 #ifndef CONFIG_PPC_64K_PAGES
 #define __pud_free_tlb(tlb, pud, addr)		      \
-	pgtable_free_tlb(tlb, pgtable_free_cache(pud, \
-		PUD_CACHE_NUM, PUD_TABLE_SIZE-1))
+	pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE)
+
 #endif /* CONFIG_PPC_64K_PAGES */
 
 #define check_pgt_cache()	do { } while (0)
diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h
index f2e812de7c3c..abe8532bd14e 100644
--- a/arch/powerpc/include/asm/pgalloc.h
+++ b/arch/powerpc/include/asm/pgalloc.h
@@ -24,25 +24,6 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
 	__free_page(ptepage);
 }
 
-typedef struct pgtable_free {
-	unsigned long val;
-} pgtable_free_t;
-
-/* This needs to be big enough to allow for MMU_PAGE_COUNT + 2 to be stored
- * and small enough to fit in the low bits of any naturally aligned page
- * table cache entry. Arbitrarily set to 0x1f, that should give us some
- * room to grow
- */
-#define PGF_CACHENUM_MASK	0x1f
-
-static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum,
-						unsigned long mask)
-{
-	BUG_ON(cachenum > PGF_CACHENUM_MASK);
-
-	return (pgtable_free_t){.val = ((unsigned long) p & ~mask) | cachenum};
-}
-
 #ifdef CONFIG_PPC64
 #include <asm/pgalloc-64.h>
 #else
@@ -50,12 +31,12 @@ static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum,
 #endif
 
 #ifdef CONFIG_SMP
-extern void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf);
+extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift);
 extern void pte_free_finish(void);
 #else /* CONFIG_SMP */
-static inline void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
+static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift)
 {
-	pgtable_free(pgf);
+	pgtable_free(table, shift);
 }
 static inline void pte_free_finish(void) { }
 #endif /* !CONFIG_SMP */
@@ -63,12 +44,9 @@ static inline void pte_free_finish(void) { }
 static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage,
 				  unsigned long address)
 {
-	pgtable_free_t pgf = pgtable_free_cache(page_address(ptepage),
-						PTE_NONCACHE_NUM,
-						PTE_TABLE_SIZE-1);
 	tlb_flush_pgtable(tlb, address);
 	pgtable_page_dtor(ptepage);
-	pgtable_free_tlb(tlb, pgf);
+	pgtable_free_tlb(tlb, page_address(ptepage), 0);
 }
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 806abe7a3fa5..8697d6555090 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -354,6 +354,7 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 #define pgoff_to_pte(off)	((pte_t) {((off) << PTE_RPN_SHIFT)|_PAGE_FILE})
 #define PTE_FILE_MAX_BITS	(BITS_PER_LONG - PTE_RPN_SHIFT)
 
+void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
 void pgtable_cache_init(void);
 
 /*
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 3d542a9732ae..7230d7a4fbd9 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -43,26 +43,14 @@ static unsigned nr_gpages;
 unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
 
 #define hugepte_shift			mmu_huge_psizes
-#define PTRS_PER_HUGEPTE(psize)		(1 << hugepte_shift[psize])
-#define HUGEPTE_TABLE_SIZE(psize)	(sizeof(pte_t) << hugepte_shift[psize])
+#define HUGEPTE_INDEX_SIZE(psize)	(mmu_huge_psizes[(psize)])
+#define PTRS_PER_HUGEPTE(psize)		(1 << mmu_huge_psizes[psize])
 
 #define HUGEPD_SHIFT(psize)		(mmu_psize_to_shift(psize) \
-						+ hugepte_shift[psize])
+					 + HUGEPTE_INDEX_SIZE(psize))
 #define HUGEPD_SIZE(psize)		(1UL << HUGEPD_SHIFT(psize))
 #define HUGEPD_MASK(psize)		(~(HUGEPD_SIZE(psize)-1))
 
-/* Subtract one from array size because we don't need a cache for 4K since
- * is not a huge page size */
-#define HUGE_PGTABLE_INDEX(psize)	(HUGEPTE_CACHE_NUM + psize - 1)
-#define HUGEPTE_CACHE_NAME(psize)	(huge_pgtable_cache_name[psize])
-
-static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
-	[MMU_PAGE_64K]	= "hugepte_cache_64K",
-	[MMU_PAGE_1M]	= "hugepte_cache_1M",
-	[MMU_PAGE_16M]	= "hugepte_cache_16M",
-	[MMU_PAGE_16G]	= "hugepte_cache_16G",
-};
-
 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
  * will choke on pointers to hugepte tables, which is handy for
  * catching screwups early. */
@@ -114,15 +102,15 @@ static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 			   unsigned long address, unsigned int psize)
 {
-	pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)],
-				      GFP_KERNEL|__GFP_REPEAT);
+	pte_t *new = kmem_cache_zalloc(PGT_CACHE(hugepte_shift[psize]),
+				       GFP_KERNEL|__GFP_REPEAT);
 
 	if (! new)
 		return -ENOMEM;
 
 	spin_lock(&mm->page_table_lock);
 	if (!hugepd_none(*hpdp))
-		kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new);
+		kmem_cache_free(PGT_CACHE(hugepte_shift[psize]), new);
 	else
 		hpdp->pd = (unsigned long)new | HUGEPD_OK;
 	spin_unlock(&mm->page_table_lock);
@@ -271,9 +259,7 @@ static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
 
 	hpdp->pd = 0;
 	tlb->need_flush = 1;
-	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
-						 HUGEPTE_CACHE_NUM+psize-1,
-						 PGF_CACHENUM_MASK));
+	pgtable_free_tlb(tlb, hugepte, hugepte_shift[psize]);
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -698,8 +684,6 @@ static void __init set_huge_psize(int psize)
 		if (mmu_huge_psizes[psize] ||
 		   mmu_psize_defs[psize].shift == PAGE_SHIFT)
 			return;
-		if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL))
-			return;
 		hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
 
 		switch (mmu_psize_defs[psize].shift) {
@@ -753,9 +737,9 @@ static int __init hugetlbpage_init(void)
 	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
 		return -ENODEV;
 
-	/* Add supported huge page sizes.  Need to change HUGE_MAX_HSTATE
-	 * and adjust PTE_NONCACHE_NUM if the number of supported huge page
-	 * sizes changes.
+	/* Add supported huge page sizes.  Need to change
+	 *  HUGE_MAX_HSTATE if the number of supported huge page sizes
+	 *  changes.
 	 */
 	set_huge_psize(MMU_PAGE_16M);
 	set_huge_psize(MMU_PAGE_16G);
@@ -769,16 +753,11 @@ static int __init hugetlbpage_init(void)
 
 	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 		if (mmu_huge_psizes[psize]) {
-			pgtable_cache[HUGE_PGTABLE_INDEX(psize)] =
-				kmem_cache_create(
-					HUGEPTE_CACHE_NAME(psize),
-					HUGEPTE_TABLE_SIZE(psize),
-					HUGEPTE_TABLE_SIZE(psize),
-					0,
-					NULL);
-			if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)])
-				panic("hugetlbpage_init(): could not create %s"\
-				      "\n", HUGEPTE_CACHE_NAME(psize));
+			pgtable_cache_add(hugepte_shift[psize], NULL);
+			if (!PGT_CACHE(hugepte_shift[psize]))
+				panic("hugetlbpage_init(): could not create "
+				      "pgtable cache for %d bit pagesize\n",
+				      mmu_psize_to_shift(psize));
 		}
 	}
 
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 335c578b9cc3..82ac61dcd3af 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -119,30 +119,58 @@ static void pmd_ctor(void *addr)
 	memset(addr, 0, PMD_TABLE_SIZE);
 }
 
-static const unsigned int pgtable_cache_size[2] = {
-	PGD_TABLE_SIZE, PMD_TABLE_SIZE
-};
-static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
-#ifdef CONFIG_PPC_64K_PAGES
-	"pgd_cache", "pmd_cache",
-#else
-	"pgd_cache", "pud_pmd_cache",
-#endif /* CONFIG_PPC_64K_PAGES */
-};
-
-#ifdef CONFIG_HUGETLB_PAGE
-/* Hugepages need an extra cache per hugepagesize, initialized in
- * hugetlbpage.c.  We can't put into the tables above, because HPAGE_SHIFT
- * is not compile time constant. */
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT];
-#else
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
-#endif
+struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
+
+/*
+ * Create a kmem_cache() for pagetables.  This is not used for PTE
+ * pages - they're linked to struct page, come from the normal free
+ * pages pool and have a different entry size (see real_pte_t) to
+ * everything else.  Caches created by this function are used for all
+ * the higher level pagetables, and for hugepage pagetables.
+ */
+void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
+{
+	char *name;
+	unsigned long table_size = sizeof(void *) << shift;
+	unsigned long align = table_size;
+
+	/* When batching pgtable pointers for RCU freeing, we store
+	 * the index size in the low bits.  Table alignment must be
+	 * big enough to fit it */
+	unsigned long minalign = MAX_PGTABLE_INDEX_SIZE + 1;
+	struct kmem_cache *new;
+
+	/* It would be nice if this was a BUILD_BUG_ON(), but at the
+	 * moment, gcc doesn't seem to recognize is_power_of_2 as a
+	 * constant expression, so so much for that. */
+	BUG_ON(!is_power_of_2(minalign));
+	BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
+
+	if (PGT_CACHE(shift))
+		return; /* Already have a cache of this size */
+
+	align = max_t(unsigned long, align, minalign);
+	name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
+	new = kmem_cache_create(name, table_size, align, 0, ctor);
+	PGT_CACHE(shift) = new;
+
+	pr_debug("Allocated pgtable cache for order %d\n", shift);
+}
+
 
 void pgtable_cache_init(void)
 {
-	pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor);
-	pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor);
+	pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
+	pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
+	if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
+		panic("Couldn't allocate pgtable caches");
+
+	/* In all current configs, when the PUD index exists it's the
+	 * same size as either the pgd or pmd index.  Verify that the
+	 * initialization above has also created a PUD cache.  This
+	 * will need re-examiniation if we add new possibilities for
+	 * the pagetable layout. */
+	BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
 }
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 53040931de32..99df697c601a 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -49,12 +49,12 @@ struct pte_freelist_batch
 {
 	struct rcu_head	rcu;
 	unsigned int	index;
-	pgtable_free_t	tables[0];
+	unsigned long	tables[0];
 };
 
 #define PTE_FREELIST_SIZE \
 	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
-	  / sizeof(pgtable_free_t))
+	  / sizeof(unsigned long))
 
 static void pte_free_smp_sync(void *arg)
 {
@@ -64,13 +64,13 @@ static void pte_free_smp_sync(void *arg)
 /* This is only called when we are critically out of memory
  * (and fail to get a page in pte_free_tlb).
  */
-static void pgtable_free_now(pgtable_free_t pgf)
+static void pgtable_free_now(void *table, unsigned shift)
 {
 	pte_freelist_forced_free++;
 
 	smp_call_function(pte_free_smp_sync, NULL, 1);
 
-	pgtable_free(pgf);
+	pgtable_free(table, shift);
 }
 
 static void pte_free_rcu_callback(struct rcu_head *head)
@@ -79,8 +79,12 @@ static void pte_free_rcu_callback(struct rcu_head *head)
 		container_of(head, struct pte_freelist_batch, rcu);
 	unsigned int i;
 
-	for (i = 0; i < batch->index; i++)
-		pgtable_free(batch->tables[i]);
+	for (i = 0; i < batch->index; i++) {
+		void *table = (void *)(batch->tables[i] & ~MAX_PGTABLE_INDEX_SIZE);
+		unsigned shift = batch->tables[i] & MAX_PGTABLE_INDEX_SIZE;
+
+		pgtable_free(table, shift);
+	}
 
 	free_page((unsigned long)batch);
 }
@@ -91,25 +95,28 @@ static void pte_free_submit(struct pte_freelist_batch *batch)
 	call_rcu(&batch->rcu, pte_free_rcu_callback);
 }
 
-void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift)
 {
 	/* This is safe since tlb_gather_mmu has disabled preemption */
 	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+	unsigned long pgf;
 
 	if (atomic_read(&tlb->mm->mm_users) < 2 ||
 	    cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){
-		pgtable_free(pgf);
+		pgtable_free(table, shift);
 		return;
 	}
 
 	if (*batchp == NULL) {
 		*batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
 		if (*batchp == NULL) {
-			pgtable_free_now(pgf);
+			pgtable_free_now(table, shift);
 			return;
 		}
 		(*batchp)->index = 0;
 	}
+	BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+	pgf = (unsigned long)table | shift;
 	(*batchp)->tables[(*batchp)->index++] = pgf;
 	if ((*batchp)->index == PTE_FREELIST_SIZE) {
 		pte_free_submit(*batchp);
-- 
cgit v1.2.3-59-g8ed1b


From a4fe3ce7699bfe1bd88f816b55d42d8fe1dac655 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 26 Oct 2009 19:24:31 +0000
Subject: powerpc/mm: Allow more flexible layouts for hugepage pagetables

Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level.  Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly.  Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.

This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity.  In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask.  This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.

This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out.  All other (hugepage)
pagetable walking paths can just interpret the structure as they go.

There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug.  We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping).  This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.

While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/hugetlb.h       |   1 -
 arch/powerpc/include/asm/mmu-hash64.h    |  14 +-
 arch/powerpc/include/asm/page.h          |  14 +
 arch/powerpc/include/asm/pgtable-ppc64.h |  13 +-
 arch/powerpc/include/asm/pgtable.h       |   3 +
 arch/powerpc/kernel/perf_callchain.c     |  20 +-
 arch/powerpc/mm/gup.c                    | 149 ++--------
 arch/powerpc/mm/hash_utils_64.c          |  26 +-
 arch/powerpc/mm/hugetlbpage.c            | 473 +++++++++++++++----------------
 arch/powerpc/mm/init_64.c                |  10 +-
 10 files changed, 313 insertions(+), 410 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index b1dafb6a9743..a4f08f10fe1f 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -3,7 +3,6 @@
 
 #include <asm/page.h>
 
-
 int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 			   unsigned long len);
 
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index bebe31c2e907..dd50ea15e648 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -173,14 +173,6 @@ extern unsigned long tce_alloc_start, tce_alloc_end;
  */
 extern int mmu_ci_restrictions;
 
-#ifdef CONFIG_HUGETLB_PAGE
-/*
- * The page size indexes of the huge pages for use by hugetlbfs
- */
-extern unsigned int mmu_huge_psizes[MMU_PAGE_COUNT];
-
-#endif /* CONFIG_HUGETLB_PAGE */
-
 /*
  * This function sets the AVPN and L fields of the HPTE  appropriately
  * for the page size
@@ -254,9 +246,9 @@ extern int __hash_page_64K(unsigned long ea, unsigned long access,
 			   unsigned int local, int ssize);
 struct mm_struct;
 extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap);
-extern int hash_huge_page(struct mm_struct *mm, unsigned long access,
-			  unsigned long ea, unsigned long vsid, int local,
-			  unsigned long trap);
+int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
+		     pte_t *ptep, unsigned long trap, int local, int ssize,
+		     unsigned int shift, unsigned int mmu_psize);
 
 extern int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
 			     unsigned long pstart, unsigned long prot,
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index ff24254990e1..e96d52a516ba 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -229,6 +229,20 @@ typedef unsigned long pgprot_t;
 
 #endif
 
+typedef struct { signed long pd; } hugepd_t;
+#define HUGEPD_SHIFT_MASK     0x3f
+
+#ifdef CONFIG_HUGETLB_PAGE
+static inline int hugepd_ok(hugepd_t hpd)
+{
+	return (hpd.pd > 0);
+}
+
+#define is_hugepd(pdep)               (hugepd_ok(*((hugepd_t *)(pdep))))
+#else /* CONFIG_HUGETLB_PAGE */
+#define is_hugepd(pdep)			0
+#endif /* CONFIG_HUGETLB_PAGE */
+
 struct page;
 extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg);
 extern void copy_user_page(void *to, void *from, unsigned long vaddr,
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 8697d6555090..49865045d56f 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -379,7 +379,18 @@ void pgtable_cache_init(void);
 	return pt;
 }
 
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long address);
+#ifdef CONFIG_HUGETLB_PAGE
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+				 unsigned *shift);
+#else
+static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+					       unsigned *shift)
+{
+	if (shift)
+		*shift = 0;
+	return find_linux_pte(pgdir, ea);
+}
+#endif /* !CONFIG_HUGETLB_PAGE */
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 2a5da069714e..21207e54825b 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -211,6 +211,9 @@ extern void paging_init(void);
  */
 extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t);
 
+extern int gup_hugepd(hugepd_t *hugepd, unsigned pdshift, unsigned long addr,
+		      unsigned long end, int write, struct page **pages, int *nr);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c
index 0a03cf70d247..936f04dbfc6f 100644
--- a/arch/powerpc/kernel/perf_callchain.c
+++ b/arch/powerpc/kernel/perf_callchain.c
@@ -119,13 +119,6 @@ static void perf_callchain_kernel(struct pt_regs *regs,
 }
 
 #ifdef CONFIG_PPC64
-
-#ifdef CONFIG_HUGETLB_PAGE
-#define is_huge_psize(pagesize)	(HPAGE_SHIFT && mmu_huge_psizes[pagesize])
-#else
-#define is_huge_psize(pagesize)	0
-#endif
-
 /*
  * On 64-bit we don't want to invoke hash_page on user addresses from
  * interrupt context, so if the access faults, we read the page tables
@@ -135,7 +128,7 @@ static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
 {
 	pgd_t *pgdir;
 	pte_t *ptep, pte;
-	int pagesize;
+	unsigned shift;
 	unsigned long addr = (unsigned long) ptr;
 	unsigned long offset;
 	unsigned long pfn;
@@ -145,17 +138,14 @@ static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
 	if (!pgdir)
 		return -EFAULT;
 
-	pagesize = get_slice_psize(current->mm, addr);
+	ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift);
+	if (!shift)
+		shift = PAGE_SHIFT;
 
 	/* align address to page boundary */
-	offset = addr & ((1ul << mmu_psize_defs[pagesize].shift) - 1);
+	offset = addr & ((1UL << shift) - 1);
 	addr -= offset;
 
-	if (is_huge_psize(pagesize))
-		ptep = huge_pte_offset(current->mm, addr);
-	else
-		ptep = find_linux_pte(pgdir, addr);
-
 	if (ptep == NULL)
 		return -EFAULT;
 	pte = *ptep;
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index bc122a120bf0..d7efdbf640c7 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -55,57 +55,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 	return 1;
 }
 
-#ifdef CONFIG_HUGETLB_PAGE
-static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
-				 unsigned long *addr, unsigned long end,
-				 int write, struct page **pages, int *nr)
-{
-	unsigned long mask;
-	unsigned long pte_end;
-	struct page *head, *page;
-	pte_t pte;
-	int refs;
-
-	pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
-	if (pte_end < end)
-		end = pte_end;
-
-	pte = *ptep;
-	mask = _PAGE_PRESENT|_PAGE_USER;
-	if (write)
-		mask |= _PAGE_RW;
-	if ((pte_val(pte) & mask) != mask)
-		return 0;
-	/* hugepages are never "special" */
-	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
-	refs = 0;
-	head = pte_page(pte);
-	page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
-	do {
-		VM_BUG_ON(compound_head(page) != head);
-		pages[*nr] = page;
-		(*nr)++;
-		page++;
-		refs++;
-	} while (*addr += PAGE_SIZE, *addr != end);
-
-	if (!page_cache_add_speculative(head, refs)) {
-		*nr -= refs;
-		return 0;
-	}
-	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
-		/* Could be optimized better */
-		while (*nr) {
-			put_page(page);
-			(*nr)--;
-		}
-	}
-
-	return 1;
-}
-#endif /* CONFIG_HUGETLB_PAGE */
-
 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 		int write, struct page **pages, int *nr)
 {
@@ -119,7 +68,11 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none(pmd))
 			return 0;
-		if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+		if (is_hugepd(pmdp)) {
+			if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
+					addr, next, write, pages, nr))
+				return 0;
+		} else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
 			return 0;
 	} while (pmdp++, addr = next, addr != end);
 
@@ -139,7 +92,11 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
 		next = pud_addr_end(addr, end);
 		if (pud_none(pud))
 			return 0;
-		if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+		if (is_hugepd(pudp)) {
+			if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
+					addr, next, write, pages, nr))
+				return 0;
+		} else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
 			return 0;
 	} while (pudp++, addr = next, addr != end);
 
@@ -154,10 +111,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	unsigned long next;
 	pgd_t *pgdp;
 	int nr = 0;
-#ifdef CONFIG_PPC64
-	unsigned int shift;
-	int psize;
-#endif
 
 	pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
 
@@ -172,25 +125,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 
 	pr_devel("  aligned: %lx .. %lx\n", start, end);
 
-#ifdef CONFIG_HUGETLB_PAGE
-	/* We bail out on slice boundary crossing when hugetlb is
-	 * enabled in order to not have to deal with two different
-	 * page table formats
-	 */
-	if (addr < SLICE_LOW_TOP) {
-		if (end > SLICE_LOW_TOP)
-			goto slow_irqon;
-
-		if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
-			     GET_LOW_SLICE_INDEX(end - 1)))
-			goto slow_irqon;
-	} else {
-		if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
-			     GET_HIGH_SLICE_INDEX(end - 1)))
-			goto slow_irqon;
-	}
-#endif /* CONFIG_HUGETLB_PAGE */
-
 	/*
 	 * XXX: batch / limit 'nr', to avoid large irq off latency
 	 * needs some instrumenting to determine the common sizes used by
@@ -210,54 +144,23 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	 */
 	local_irq_disable();
 
-#ifdef CONFIG_PPC64
-	/* Those bits are related to hugetlbfs implementation and only exist
-	 * on 64-bit for now
-	 */
-	psize = get_slice_psize(mm, addr);
-	shift = mmu_psize_defs[psize].shift;
-#endif /* CONFIG_PPC64 */
-
-#ifdef CONFIG_HUGETLB_PAGE
-	if (unlikely(mmu_huge_psizes[psize])) {
-		pte_t *ptep;
-		unsigned long a = addr;
-		unsigned long sz = ((1UL) << shift);
-		struct hstate *hstate = size_to_hstate(sz);
-
-		BUG_ON(!hstate);
-		/*
-		 * XXX: could be optimized to avoid hstate
-		 * lookup entirely (just use shift)
-		 */
-
-		do {
-			VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
-			ptep = huge_pte_offset(mm, a);
-			pr_devel(" %016lx: huge ptep %p\n", a, ptep);
-			if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages,
-						   &nr))
-				goto slow;
-		} while (a != end);
-	} else
-#endif /* CONFIG_HUGETLB_PAGE */
-	{
-		pgdp = pgd_offset(mm, addr);
-		do {
-			pgd_t pgd = *pgdp;
-
-#ifdef CONFIG_PPC64
-			VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
-#endif
-			pr_devel("  %016lx: normal pgd %p\n", addr,
-				 (void *)pgd_val(pgd));
-			next = pgd_addr_end(addr, end);
-			if (pgd_none(pgd))
-				goto slow;
-			if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		pr_devel("  %016lx: normal pgd %p\n", addr,
+			 (void *)pgd_val(pgd));
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			goto slow;
+		if (is_hugepd(pgdp)) {
+			if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
+					addr, next, write, pages, &nr))
 				goto slow;
-		} while (pgdp++, addr = next, addr != end);
-	}
+		} else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			goto slow;
+	} while (pgdp++, addr = next, addr != end);
+
 	local_irq_enable();
 
 	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 1ade7eb6ae00..485dcd197a61 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -891,6 +891,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 	unsigned long vsid;
 	struct mm_struct *mm;
 	pte_t *ptep;
+	unsigned hugeshift;
 	const struct cpumask *tmp;
 	int rc, user_region = 0, local = 0;
 	int psize, ssize;
@@ -943,30 +944,31 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 	if (user_region && cpumask_equal(mm_cpumask(mm), tmp))
 		local = 1;
 
-#ifdef CONFIG_HUGETLB_PAGE
-	/* Handle hugepage regions */
-	if (HPAGE_SHIFT && mmu_huge_psizes[psize]) {
-		DBG_LOW(" -> huge page !\n");
-		return hash_huge_page(mm, access, ea, vsid, local, trap);
-	}
-#endif /* CONFIG_HUGETLB_PAGE */
-
 #ifndef CONFIG_PPC_64K_PAGES
-	/* If we use 4K pages and our psize is not 4K, then we are hitting
-	 * a special driver mapping, we need to align the address before
-	 * we fetch the PTE
+	/* If we use 4K pages and our psize is not 4K, then we might
+	 * be hitting a special driver mapping, and need to align the
+	 * address before we fetch the PTE.
+	 *
+	 * It could also be a hugepage mapping, in which case this is
+	 * not necessary, but it's not harmful, either.
 	 */
 	if (psize != MMU_PAGE_4K)
 		ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
 #endif /* CONFIG_PPC_64K_PAGES */
 
 	/* Get PTE and page size from page tables */
-	ptep = find_linux_pte(pgdir, ea);
+	ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
 	if (ptep == NULL || !pte_present(*ptep)) {
 		DBG_LOW(" no PTE !\n");
 		return 1;
 	}
 
+#ifdef CONFIG_HUGETLB_PAGE
+	if (hugeshift)
+		return __hash_page_huge(ea, access, vsid, ptep, trap, local,
+					ssize, hugeshift, psize);
+#endif /* CONFIG_HUGETLB_PAGE */
+
 #ifndef CONFIG_PPC_64K_PAGES
 	DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
 #else
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 7230d7a4fbd9..95220a5dee58 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -40,25 +40,11 @@ static unsigned nr_gpages;
 /* Array of valid huge page sizes - non-zero value(hugepte_shift) is
  * stored for the huge page sizes that are valid.
  */
-unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
-
-#define hugepte_shift			mmu_huge_psizes
-#define HUGEPTE_INDEX_SIZE(psize)	(mmu_huge_psizes[(psize)])
-#define PTRS_PER_HUGEPTE(psize)		(1 << mmu_huge_psizes[psize])
-
-#define HUGEPD_SHIFT(psize)		(mmu_psize_to_shift(psize) \
-					 + HUGEPTE_INDEX_SIZE(psize))
-#define HUGEPD_SIZE(psize)		(1UL << HUGEPD_SHIFT(psize))
-#define HUGEPD_MASK(psize)		(~(HUGEPD_SIZE(psize)-1))
+static unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
 
 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
  * will choke on pointers to hugepte tables, which is handy for
  * catching screwups early. */
-#define HUGEPD_OK	0x1
-
-typedef struct { unsigned long pd; } hugepd_t;
-
-#define hugepd_none(hpd)	((hpd).pd == 0)
 
 static inline int shift_to_mmu_psize(unsigned int shift)
 {
@@ -82,71 +68,126 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
 	BUG();
 }
 
+#define hugepd_none(hpd)	((hpd).pd == 0)
+
 static inline pte_t *hugepd_page(hugepd_t hpd)
 {
-	BUG_ON(!(hpd.pd & HUGEPD_OK));
-	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
+	BUG_ON(!hugepd_ok(hpd));
+	return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
+}
+
+static inline unsigned int hugepd_shift(hugepd_t hpd)
+{
+	return hpd.pd & HUGEPD_SHIFT_MASK;
 }
 
-static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
-				    struct hstate *hstate)
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift)
 {
-	unsigned int shift = huge_page_shift(hstate);
-	int psize = shift_to_mmu_psize(shift);
-	unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
+	unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
 	pte_t *dir = hugepd_page(*hpdp);
 
 	return dir + idx;
 }
 
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
+{
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	hugepd_t *hpdp = NULL;
+	unsigned pdshift = PGDIR_SHIFT;
+
+	if (shift)
+		*shift = 0;
+
+	pg = pgdir + pgd_index(ea);
+	if (is_hugepd(pg)) {
+		hpdp = (hugepd_t *)pg;
+	} else if (!pgd_none(*pg)) {
+		pdshift = PUD_SHIFT;
+		pu = pud_offset(pg, ea);
+		if (is_hugepd(pu))
+			hpdp = (hugepd_t *)pu;
+		else if (!pud_none(*pu)) {
+			pdshift = PMD_SHIFT;
+			pm = pmd_offset(pu, ea);
+			if (is_hugepd(pm))
+				hpdp = (hugepd_t *)pm;
+			else if (!pmd_none(*pm)) {
+				return pte_offset_map(pm, ea);
+			}
+		}
+	}
+
+	if (!hpdp)
+		return NULL;
+
+	if (shift)
+		*shift = hugepd_shift(*hpdp);
+	return hugepte_offset(hpdp, ea, pdshift);
+}
+
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+	return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
+}
+
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
-			   unsigned long address, unsigned int psize)
+			   unsigned long address, unsigned pdshift, unsigned pshift)
 {
-	pte_t *new = kmem_cache_zalloc(PGT_CACHE(hugepte_shift[psize]),
+	pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
 				       GFP_KERNEL|__GFP_REPEAT);
 
+	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
+	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
+
 	if (! new)
 		return -ENOMEM;
 
 	spin_lock(&mm->page_table_lock);
 	if (!hugepd_none(*hpdp))
-		kmem_cache_free(PGT_CACHE(hugepte_shift[psize]), new);
+		kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
 	else
-		hpdp->pd = (unsigned long)new | HUGEPD_OK;
+		hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift;
 	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
 
-
-static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate)
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 {
-	if (huge_page_shift(hstate) < PUD_SHIFT)
-		return pud_offset(pgd, addr);
-	else
-		return (pud_t *) pgd;
-}
-static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr,
-			 struct hstate *hstate)
-{
-	if (huge_page_shift(hstate) < PUD_SHIFT)
-		return pud_alloc(mm, pgd, addr);
-	else
-		return (pud_t *) pgd;
-}
-static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
-{
-	if (huge_page_shift(hstate) < PMD_SHIFT)
-		return pmd_offset(pud, addr);
-	else
-		return (pmd_t *) pud;
-}
-static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
-			 struct hstate *hstate)
-{
-	if (huge_page_shift(hstate) < PMD_SHIFT)
-		return pmd_alloc(mm, pud, addr);
-	else
-		return (pmd_t *) pud;
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	hugepd_t *hpdp = NULL;
+	unsigned pshift = __ffs(sz);
+	unsigned pdshift = PGDIR_SHIFT;
+
+	addr &= ~(sz-1);
+
+	pg = pgd_offset(mm, addr);
+	if (pshift >= PUD_SHIFT) {
+		hpdp = (hugepd_t *)pg;
+	} else {
+		pdshift = PUD_SHIFT;
+		pu = pud_alloc(mm, pg, addr);
+		if (pshift >= PMD_SHIFT) {
+			hpdp = (hugepd_t *)pu;
+		} else {
+			pdshift = PMD_SHIFT;
+			pm = pmd_alloc(mm, pu, addr);
+			hpdp = (hugepd_t *)pm;
+		}
+	}
+
+	if (!hpdp)
+		return NULL;
+
+	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
+
+	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
+		return NULL;
+
+	return hugepte_offset(hpdp, addr, pdshift);
 }
 
 /* Build list of addresses of gigantic pages.  This function is used in early
@@ -180,92 +221,38 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
 	return 1;
 }
 
-
-/* Modelled after find_linux_pte() */
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-{
-	pgd_t *pg;
-	pud_t *pu;
-	pmd_t *pm;
-
-	unsigned int psize;
-	unsigned int shift;
-	unsigned long sz;
-	struct hstate *hstate;
-	psize = get_slice_psize(mm, addr);
-	shift = mmu_psize_to_shift(psize);
-	sz = ((1UL) << shift);
-	hstate = size_to_hstate(sz);
-
-	addr &= hstate->mask;
-
-	pg = pgd_offset(mm, addr);
-	if (!pgd_none(*pg)) {
-		pu = hpud_offset(pg, addr, hstate);
-		if (!pud_none(*pu)) {
-			pm = hpmd_offset(pu, addr, hstate);
-			if (!pmd_none(*pm))
-				return hugepte_offset((hugepd_t *)pm, addr,
-						      hstate);
-		}
-	}
-
-	return NULL;
-}
-
-pte_t *huge_pte_alloc(struct mm_struct *mm,
-			unsigned long addr, unsigned long sz)
-{
-	pgd_t *pg;
-	pud_t *pu;
-	pmd_t *pm;
-	hugepd_t *hpdp = NULL;
-	struct hstate *hstate;
-	unsigned int psize;
-	hstate = size_to_hstate(sz);
-
-	psize = get_slice_psize(mm, addr);
-	BUG_ON(!mmu_huge_psizes[psize]);
-
-	addr &= hstate->mask;
-
-	pg = pgd_offset(mm, addr);
-	pu = hpud_alloc(mm, pg, addr, hstate);
-
-	if (pu) {
-		pm = hpmd_alloc(mm, pu, addr, hstate);
-		if (pm)
-			hpdp = (hugepd_t *)pm;
-	}
-
-	if (! hpdp)
-		return NULL;
-
-	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
-		return NULL;
-
-	return hugepte_offset(hpdp, addr, hstate);
-}
-
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 {
 	return 0;
 }
 
-static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
-			       unsigned int psize)
+static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
+			      unsigned long start, unsigned long end,
+			      unsigned long floor, unsigned long ceiling)
 {
 	pte_t *hugepte = hugepd_page(*hpdp);
+	unsigned shift = hugepd_shift(*hpdp);
+	unsigned long pdmask = ~((1UL << pdshift) - 1);
+
+	start &= pdmask;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= pdmask;
+		if (! ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		return;
 
 	hpdp->pd = 0;
 	tlb->need_flush = 1;
-	pgtable_free_tlb(tlb, hugepte, hugepte_shift[psize]);
+	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 				   unsigned long addr, unsigned long end,
-				   unsigned long floor, unsigned long ceiling,
-				   unsigned int psize)
+				   unsigned long floor, unsigned long ceiling)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -277,7 +264,8 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none(*pmd))
 			continue;
-		free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
+		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
+				  addr, next, floor, ceiling);
 	} while (pmd++, addr = next, addr != end);
 
 	start &= PUD_MASK;
@@ -303,23 +291,19 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 	pud_t *pud;
 	unsigned long next;
 	unsigned long start;
-	unsigned int shift;
-	unsigned int psize = get_slice_psize(tlb->mm, addr);
-	shift = mmu_psize_to_shift(psize);
 
 	start = addr;
 	pud = pud_offset(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
-		if (shift < PMD_SHIFT) {
+		if (!is_hugepd(pud)) {
 			if (pud_none_or_clear_bad(pud))
 				continue;
 			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
-					       ceiling, psize);
+					       ceiling);
 		} else {
-			if (pud_none(*pud))
-				continue;
-			free_hugepte_range(tlb, (hugepd_t *)pud, psize);
+			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
+					  addr, next, floor, ceiling);
 		}
 	} while (pud++, addr = next, addr != end);
 
@@ -350,74 +334,34 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 {
 	pgd_t *pgd;
 	unsigned long next;
-	unsigned long start;
 
 	/*
-	 * Comments below take from the normal free_pgd_range().  They
-	 * apply here too.  The tests against HUGEPD_MASK below are
-	 * essential, because we *don't* test for this at the bottom
-	 * level.  Without them we'll attempt to free a hugepte table
-	 * when we unmap just part of it, even if there are other
-	 * active mappings using it.
-	 *
-	 * The next few lines have given us lots of grief...
-	 *
-	 * Why are we testing HUGEPD* at this top level?  Because
-	 * often there will be no work to do at all, and we'd prefer
-	 * not to go all the way down to the bottom just to discover
-	 * that.
-	 *
-	 * Why all these "- 1"s?  Because 0 represents both the bottom
-	 * of the address space and the top of it (using -1 for the
-	 * top wouldn't help much: the masks would do the wrong thing).
-	 * The rule is that addr 0 and floor 0 refer to the bottom of
-	 * the address space, but end 0 and ceiling 0 refer to the top
-	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
-	 * that end 0 case should be mythical).
-	 *
-	 * Wherever addr is brought up or ceiling brought down, we
-	 * must be careful to reject "the opposite 0" before it
-	 * confuses the subsequent tests.  But what about where end is
-	 * brought down by HUGEPD_SIZE below? no, end can't go down to
-	 * 0 there.
+	 * Because there are a number of different possible pagetable
+	 * layouts for hugepage ranges, we limit knowledge of how
+	 * things should be laid out to the allocation path
+	 * (huge_pte_alloc(), above).  Everything else works out the
+	 * structure as it goes from information in the hugepd
+	 * pointers.  That means that we can't here use the
+	 * optimization used in the normal page free_pgd_range(), of
+	 * checking whether we're actually covering a large enough
+	 * range to have to do anything at the top level of the walk
+	 * instead of at the bottom.
 	 *
-	 * Whereas we round start (addr) and ceiling down, by different
-	 * masks at different levels, in order to test whether a table
-	 * now has no other vmas using it, so can be freed, we don't
-	 * bother to round floor or end up - the tests don't need that.
+	 * To make sense of this, you should probably go read the big
+	 * block comment at the top of the normal free_pgd_range(),
+	 * too.
 	 */
-	unsigned int psize = get_slice_psize(tlb->mm, addr);
-
-	addr &= HUGEPD_MASK(psize);
-	if (addr < floor) {
-		addr += HUGEPD_SIZE(psize);
-		if (!addr)
-			return;
-	}
-	if (ceiling) {
-		ceiling &= HUGEPD_MASK(psize);
-		if (!ceiling)
-			return;
-	}
-	if (end - 1 > ceiling - 1)
-		end -= HUGEPD_SIZE(psize);
-	if (addr > end - 1)
-		return;
 
-	start = addr;
 	pgd = pgd_offset(tlb->mm, addr);
 	do {
-		psize = get_slice_psize(tlb->mm, addr);
-		BUG_ON(!mmu_huge_psizes[psize]);
 		next = pgd_addr_end(addr, end);
-		if (mmu_psize_to_shift(psize) < PUD_SHIFT) {
+		if (!is_hugepd(pgd)) {
 			if (pgd_none_or_clear_bad(pgd))
 				continue;
 			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 		} else {
-			if (pgd_none(*pgd))
-				continue;
-			free_hugepte_range(tlb, (hugepd_t *)pgd, psize);
+			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
+					  addr, next, floor, ceiling);
 		}
 	} while (pgd++, addr = next, addr != end);
 }
@@ -448,19 +392,19 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 {
 	pte_t *ptep;
 	struct page *page;
-	unsigned int mmu_psize = get_slice_psize(mm, address);
+	unsigned shift;
+	unsigned long mask;
+
+	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
 
 	/* Verify it is a huge page else bail. */
-	if (!mmu_huge_psizes[mmu_psize])
+	if (!ptep || !shift)
 		return ERR_PTR(-EINVAL);
 
-	ptep = huge_pte_offset(mm, address);
+	mask = (1UL << shift) - 1;
 	page = pte_page(*ptep);
-	if (page) {
-		unsigned int shift = mmu_psize_to_shift(mmu_psize);
-		unsigned long sz = ((1UL) << shift);
-		page += (address % sz) / PAGE_SIZE;
-	}
+	if (page)
+		page += (address & mask) / PAGE_SIZE;
 
 	return page;
 }
@@ -483,6 +427,73 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
+static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+		       unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	unsigned long pte_end;
+	struct page *head, *page;
+	pte_t pte;
+	int refs;
+
+	pte_end = (addr + sz) & ~(sz-1);
+	if (pte_end < end)
+		end = pte_end;
+
+	pte = *ptep;
+	mask = _PAGE_PRESENT | _PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+
+	if ((pte_val(pte) & mask) != mask)
+		return 0;
+
+	/* hugepages are never "special" */
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	refs = 0;
+	head = pte_page(pte);
+
+	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+
+	if (!page_cache_add_speculative(head, refs)) {
+		*nr -= refs;
+		return 0;
+	}
+
+	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+		/* Could be optimized better */
+		while (*nr) {
+			put_page(page);
+			(*nr)--;
+		}
+	}
+
+	return 1;
+}
+
+int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
+	       unsigned long addr, unsigned long end,
+	       int write, struct page **pages, int *nr)
+{
+	pte_t *ptep;
+	unsigned long sz = 1UL << hugepd_shift(*hugepd);
+
+	ptep = hugepte_offset(hugepd, addr, pdshift);
+	do {
+		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
+			return 0;
+	} while (ptep++, addr += sz, addr != end);
+
+	return 1;
+}
 
 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 					unsigned long len, unsigned long pgoff,
@@ -530,34 +541,20 @@ static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
 	return rflags;
 }
 
-int hash_huge_page(struct mm_struct *mm, unsigned long access,
-		   unsigned long ea, unsigned long vsid, int local,
-		   unsigned long trap)
+int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
+		     pte_t *ptep, unsigned long trap, int local, int ssize,
+		     unsigned int shift, unsigned int mmu_psize)
 {
-	pte_t *ptep;
 	unsigned long old_pte, new_pte;
 	unsigned long va, rflags, pa, sz;
 	long slot;
 	int err = 1;
-	int ssize = user_segment_size(ea);
-	unsigned int mmu_psize;
-	int shift;
-	mmu_psize = get_slice_psize(mm, ea);
 
-	if (!mmu_huge_psizes[mmu_psize])
-		goto out;
-	ptep = huge_pte_offset(mm, ea);
+	BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
 
 	/* Search the Linux page table for a match with va */
 	va = hpt_va(ea, vsid, ssize);
 
-	/*
-	 * If no pte found or not present, send the problem up to
-	 * do_page_fault
-	 */
-	if (unlikely(!ptep || pte_none(*ptep)))
-		goto out;
-
 	/* 
 	 * Check the user's access rights to the page.  If access should be
 	 * prevented then send the problem up to do_page_fault.
@@ -588,7 +585,6 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
 	rflags = 0x2 | (!(new_pte & _PAGE_RW));
  	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
 	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
-	shift = mmu_psize_to_shift(mmu_psize);
 	sz = ((1UL) << shift);
 	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
 		/* No CPU has hugepages but lacks no execute, so we
@@ -672,6 +668,8 @@ repeat:
 
 static void __init set_huge_psize(int psize)
 {
+	unsigned pdshift;
+
 	/* Check that it is a page size supported by the hardware and
 	 * that it fits within pagetable limits. */
 	if (mmu_psize_defs[psize].shift &&
@@ -686,29 +684,14 @@ static void __init set_huge_psize(int psize)
 			return;
 		hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
 
-		switch (mmu_psize_defs[psize].shift) {
-		case PAGE_SHIFT_64K:
-		    /* We only allow 64k hpages with 4k base page,
-		     * which was checked above, and always put them
-		     * at the PMD */
-		    hugepte_shift[psize] = PMD_SHIFT;
-		    break;
-		case PAGE_SHIFT_16M:
-		    /* 16M pages can be at two different levels
-		     * of pagestables based on base page size */
-		    if (PAGE_SHIFT == PAGE_SHIFT_64K)
-			    hugepte_shift[psize] = PMD_SHIFT;
-		    else /* 4k base page */
-			    hugepte_shift[psize] = PUD_SHIFT;
-		    break;
-		case PAGE_SHIFT_16G:
-		    /* 16G pages are always at PGD level */
-		    hugepte_shift[psize] = PGDIR_SHIFT;
-		    break;
-		}
-		hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
-	} else
-		hugepte_shift[psize] = 0;
+		if (mmu_psize_defs[psize].shift < PMD_SHIFT)
+			pdshift = PMD_SHIFT;
+		else if (mmu_psize_defs[psize].shift < PUD_SHIFT)
+			pdshift = PUD_SHIFT;
+		else
+			pdshift = PGDIR_SHIFT;
+		mmu_huge_psizes[psize] = pdshift - mmu_psize_defs[psize].shift;
+	}
 }
 
 static int __init hugepage_setup_sz(char *str)
@@ -732,7 +715,7 @@ __setup("hugepagesz=", hugepage_setup_sz);
 
 static int __init hugetlbpage_init(void)
 {
-	unsigned int psize;
+	int psize;
 
 	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
 		return -ENODEV;
@@ -753,8 +736,8 @@ static int __init hugetlbpage_init(void)
 
 	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 		if (mmu_huge_psizes[psize]) {
-			pgtable_cache_add(hugepte_shift[psize], NULL);
-			if (!PGT_CACHE(hugepte_shift[psize]))
+			pgtable_cache_add(mmu_huge_psizes[psize], NULL);
+			if (!PGT_CACHE(mmu_huge_psizes[psize]))
 				panic("hugetlbpage_init(): could not create "
 				      "pgtable cache for %d bit pagesize\n",
 				      mmu_psize_to_shift(psize));
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 82ac61dcd3af..776f28d02b6b 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -41,6 +41,7 @@
 #include <linux/module.h>
 #include <linux/poison.h>
 #include <linux/lmb.h>
+#include <linux/hugetlb.h>
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
@@ -136,8 +137,13 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
 
 	/* When batching pgtable pointers for RCU freeing, we store
 	 * the index size in the low bits.  Table alignment must be
-	 * big enough to fit it */
-	unsigned long minalign = MAX_PGTABLE_INDEX_SIZE + 1;
+	 * big enough to fit it.
+	 *
+	 * Likewise, hugeapge pagetable pointers contain a (different)
+	 * shift value in the low bits.  All tables must be aligned so
+	 * as to leave enough 0 bits in the address to contain it. */
+	unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1,
+				     HUGEPD_SHIFT_MASK + 1);
 	struct kmem_cache *new;
 
 	/* It would be nice if this was a BUILD_BUG_ON(), but at the
-- 
cgit v1.2.3-59-g8ed1b


From d1837cba5d5d5458c09f0a2849db2d3c203cb8e9 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 26 Oct 2009 19:24:31 +0000
Subject: powerpc/mm: Cleanup initialization of hugepages on powerpc

This patch simplifies the logic used to initialize hugepages on
powerpc.  The somewhat oddly named set_huge_psize() is renamed to
add_huge_page_size() and now does all necessary verification of
whether it's given a valid hugepage sizes (instead of just some) and
instantiates the generic hstate structure (but no more).

hugetlbpage_init() now steps through the available pagesizes, checks
if they're valid for hugepages by calling add_huge_page_size() and
initializes the kmem_caches for the hugepage pagetables.  This means
we can now eliminate the mmu_huge_psizes array, since we no longer
need to pass the sizing information for the pagetable caches from
set_huge_psize() into hugetlbpage_init()

Determination of the default huge page size is also moved from the
hash code into the general hugepage code.

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/page_64.h |   2 +-
 arch/powerpc/mm/hash_utils_64.c    |  10 ---
 arch/powerpc/mm/hugetlbpage.c      | 128 ++++++++++++++++++-------------------
 3 files changed, 63 insertions(+), 77 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h
index 3f17b83f55a1..bfc4e027e2ad 100644
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -90,7 +90,7 @@ extern unsigned int HPAGE_SHIFT;
 #define HPAGE_SIZE		((1UL) << HPAGE_SHIFT)
 #define HPAGE_MASK		(~(HPAGE_SIZE - 1))
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
-#define HUGE_MAX_HSTATE		3
+#define HUGE_MAX_HSTATE		(MMU_PAGE_COUNT-1)
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 485dcd197a61..ef1f047f5431 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -481,16 +481,6 @@ static void __init htab_init_page_sizes(void)
 #ifdef CONFIG_HUGETLB_PAGE
 	/* Reserve 16G huge page memory sections for huge pages */
 	of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
-
-/* Set default large page size. Currently, we pick 16M or 1M depending
-	 * on what is available
-	 */
-	if (mmu_psize_defs[MMU_PAGE_16M].shift)
-		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
-	/* With 4k/4level pagetables, we can't (for now) cope with a
-	 * huge page size < PMD_SIZE */
-	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
-		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
 #endif /* CONFIG_HUGETLB_PAGE */
 }
 
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 95220a5dee58..a7161c07886d 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -37,27 +37,17 @@
 static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
 static unsigned nr_gpages;
 
-/* Array of valid huge page sizes - non-zero value(hugepte_shift) is
- * stored for the huge page sizes that are valid.
- */
-static unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
-
 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
  * will choke on pointers to hugepte tables, which is handy for
  * catching screwups early. */
 
 static inline int shift_to_mmu_psize(unsigned int shift)
 {
-	switch (shift) {
-#ifndef CONFIG_PPC_64K_PAGES
-	case PAGE_SHIFT_64K:
-	    return MMU_PAGE_64K;
-#endif
-	case PAGE_SHIFT_16M:
-	    return MMU_PAGE_16M;
-	case PAGE_SHIFT_16G:
-	    return MMU_PAGE_16G;
-	}
+	int psize;
+
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
+		if (mmu_psize_defs[psize].shift == shift)
+			return psize;
 	return -1;
 }
 
@@ -502,8 +492,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	struct hstate *hstate = hstate_file(file);
 	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
 
-	if (!mmu_huge_psizes[mmu_psize])
-		return -EINVAL;
 	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
 }
 
@@ -666,47 +654,46 @@ repeat:
 	return err;
 }
 
-static void __init set_huge_psize(int psize)
+static int __init add_huge_page_size(unsigned long long size)
 {
-	unsigned pdshift;
+	int shift = __ffs(size);
+	int mmu_psize;
 
 	/* Check that it is a page size supported by the hardware and
-	 * that it fits within pagetable limits. */
-	if (mmu_psize_defs[psize].shift &&
-		mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
-		(mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
-		 mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
-		 mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
-		/* Return if huge page size has already been setup or is the
-		 * same as the base page size. */
-		if (mmu_huge_psizes[psize] ||
-		   mmu_psize_defs[psize].shift == PAGE_SHIFT)
-			return;
-		hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
+	 * that it fits within pagetable and slice limits. */
+	if (!is_power_of_2(size)
+	    || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
+		return -EINVAL;
 
-		if (mmu_psize_defs[psize].shift < PMD_SHIFT)
-			pdshift = PMD_SHIFT;
-		else if (mmu_psize_defs[psize].shift < PUD_SHIFT)
-			pdshift = PUD_SHIFT;
-		else
-			pdshift = PGDIR_SHIFT;
-		mmu_huge_psizes[psize] = pdshift - mmu_psize_defs[psize].shift;
-	}
+	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
+		return -EINVAL;
+
+#ifdef CONFIG_SPU_FS_64K_LS
+	/* Disable support for 64K huge pages when 64K SPU local store
+	 * support is enabled as the current implementation conflicts.
+	 */
+	if (shift == PAGE_SHIFT_64K)
+		return -EINVAL;
+#endif /* CONFIG_SPU_FS_64K_LS */
+
+	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
+
+	/* Return if huge page size has already been setup */
+	if (size_to_hstate(size))
+		return 0;
+
+	hugetlb_add_hstate(shift - PAGE_SHIFT);
+
+	return 0;
 }
 
 static int __init hugepage_setup_sz(char *str)
 {
 	unsigned long long size;
-	int mmu_psize;
-	int shift;
 
 	size = memparse(str, &str);
 
-	shift = __ffs(size);
-	mmu_psize = shift_to_mmu_psize(shift);
-	if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
-		set_huge_psize(mmu_psize);
-	else
+	if (add_huge_page_size(size) != 0)
 		printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
 
 	return 1;
@@ -720,30 +707,39 @@ static int __init hugetlbpage_init(void)
 	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
 		return -ENODEV;
 
-	/* Add supported huge page sizes.  Need to change
-	 *  HUGE_MAX_HSTATE if the number of supported huge page sizes
-	 *  changes.
-	 */
-	set_huge_psize(MMU_PAGE_16M);
-	set_huge_psize(MMU_PAGE_16G);
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+		unsigned shift;
+		unsigned pdshift;
 
-	/* Temporarily disable support for 64K huge pages when 64K SPU local
-	 * store support is enabled as the current implementation conflicts.
-	 */
-#ifndef CONFIG_SPU_FS_64K_LS
-	set_huge_psize(MMU_PAGE_64K);
-#endif
+		if (!mmu_psize_defs[psize].shift)
+			continue;
 
-	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
-		if (mmu_huge_psizes[psize]) {
-			pgtable_cache_add(mmu_huge_psizes[psize], NULL);
-			if (!PGT_CACHE(mmu_huge_psizes[psize]))
-				panic("hugetlbpage_init(): could not create "
-				      "pgtable cache for %d bit pagesize\n",
-				      mmu_psize_to_shift(psize));
-		}
+		shift = mmu_psize_to_shift(psize);
+
+		if (add_huge_page_size(1ULL << shift) < 0)
+			continue;
+
+		if (shift < PMD_SHIFT)
+			pdshift = PMD_SHIFT;
+		else if (shift < PUD_SHIFT)
+			pdshift = PUD_SHIFT;
+		else
+			pdshift = PGDIR_SHIFT;
+
+		pgtable_cache_add(pdshift - shift, NULL);
+		if (!PGT_CACHE(pdshift - shift))
+			panic("hugetlbpage_init(): could not create "
+			      "pgtable cache for %d bit pagesize\n", shift);
 	}
 
+	/* Set default large page size. Currently, we pick 16M or 1M
+	 * depending on what is available
+	 */
+	if (mmu_psize_defs[MMU_PAGE_16M].shift)
+		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
+	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
+		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
+
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 883a3e523672ebba2ec3969837ba02af4f70fae2 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 26 Oct 2009 19:24:31 +0000
Subject: powerpc/mm: Split hash MMU specific hugepage code into a new file

This patch separates the parts of hugetlbpage.c which are inherently
specific to the hash MMU into a new hugelbpage-hash64.c file.

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/hugetlb.h   |   3 +
 arch/powerpc/mm/Makefile             |   5 +-
 arch/powerpc/mm/hugetlbpage-hash64.c | 167 ++++++++++++++++++++++++++++++++++
 arch/powerpc/mm/hugetlbpage.c        | 168 +----------------------------------
 4 files changed, 176 insertions(+), 167 deletions(-)
 create mode 100644 arch/powerpc/mm/hugetlbpage-hash64.c

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index a4f08f10fe1f..038886834da5 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -3,6 +3,9 @@
 
 #include <asm/page.h>
 
+pte_t *huge_pte_offset_and_shift(struct mm_struct *mm,
+				 unsigned long addr, unsigned *shift);
+
 int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 			   unsigned long len);
 
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 6fb8fc8d2fea..ce68708bbad5 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -28,7 +28,10 @@ obj-$(CONFIG_44x)		+= 44x_mmu.o
 obj-$(CONFIG_FSL_BOOKE)		+= fsl_booke_mmu.o
 obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
 obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
-obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
+ifeq ($(CONFIG_HUGETLB_PAGE),y)
+obj-y				+= hugetlbpage.o
+obj-$(CONFIG_PPC_STD_MMU_64)	+= hugetlbpage-hash64.o
+endif
 obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_HIGHMEM)		+= highmem.o
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
new file mode 100644
index 000000000000..1508ffc1e1e1
--- /dev/null
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -0,0 +1,167 @@
+/*
+ * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later)
+ *
+ * Copyright (C) 2003 David Gibson, IBM Corporation.
+ *
+ * Based on the IA-32 version:
+ * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/machdep.h>
+
+/*
+ * Called by asm hashtable.S for doing lazy icache flush
+ */
+static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
+					pte_t pte, int trap, unsigned long sz)
+{
+	struct page *page;
+	int i;
+
+	if (!pfn_valid(pte_pfn(pte)))
+		return rflags;
+
+	page = pte_page(pte);
+
+	/* page is dirty */
+	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
+		if (trap == 0x400) {
+			for (i = 0; i < (sz / PAGE_SIZE); i++)
+				__flush_dcache_icache(page_address(page+i));
+			set_bit(PG_arch_1, &page->flags);
+		} else {
+			rflags |= HPTE_R_N;
+		}
+	}
+	return rflags;
+}
+
+int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
+		     pte_t *ptep, unsigned long trap, int local, int ssize,
+		     unsigned int shift, unsigned int mmu_psize)
+{
+	unsigned long old_pte, new_pte;
+	unsigned long va, rflags, pa, sz;
+	long slot;
+	int err = 1;
+
+	BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
+
+	/* Search the Linux page table for a match with va */
+	va = hpt_va(ea, vsid, ssize);
+
+	/*
+	 * Check the user's access rights to the page.  If access should be
+	 * prevented then send the problem up to do_page_fault.
+	 */
+	if (unlikely(access & ~pte_val(*ptep)))
+		goto out;
+	/*
+	 * At this point, we have a pte (old_pte) which can be used to build
+	 * or update an HPTE. There are 2 cases:
+	 *
+	 * 1. There is a valid (present) pte with no associated HPTE (this is
+	 *	the most common case)
+	 * 2. There is a valid (present) pte with an associated HPTE. The
+	 *	current values of the pp bits in the HPTE prevent access
+	 *	because we are doing software DIRTY bit management and the
+	 *	page is currently not DIRTY.
+	 */
+
+
+	do {
+		old_pte = pte_val(*ptep);
+		if (old_pte & _PAGE_BUSY)
+			goto out;
+		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
+	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
+					 old_pte, new_pte));
+
+	rflags = 0x2 | (!(new_pte & _PAGE_RW));
+	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
+	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
+	sz = ((1UL) << shift);
+	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+		/* No CPU has hugepages but lacks no execute, so we
+		 * don't need to worry about that case */
+		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
+						       trap, sz);
+
+	/* Check if pte already has an hpte (case 2) */
+	if (unlikely(old_pte & _PAGE_HASHPTE)) {
+		/* There MIGHT be an HPTE for this pte */
+		unsigned long hash, slot;
+
+		hash = hpt_hash(va, shift, ssize);
+		if (old_pte & _PAGE_F_SECOND)
+			hash = ~hash;
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += (old_pte & _PAGE_F_GIX) >> 12;
+
+		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
+					 ssize, local) == -1)
+			old_pte &= ~_PAGE_HPTEFLAGS;
+	}
+
+	if (likely(!(old_pte & _PAGE_HASHPTE))) {
+		unsigned long hash = hpt_hash(va, shift, ssize);
+		unsigned long hpte_group;
+
+		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+
+repeat:
+		hpte_group = ((hash & htab_hash_mask) *
+			      HPTES_PER_GROUP) & ~0x7UL;
+
+		/* clear HPTE slot informations in new PTE */
+#ifdef CONFIG_PPC_64K_PAGES
+		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
+#else
+		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+#endif
+		/* Add in WIMG bits */
+		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
+				      _PAGE_COHERENT | _PAGE_GUARDED));
+
+		/* Insert into the hash table, primary slot */
+		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
+					  mmu_psize, ssize);
+
+		/* Primary is full, try the secondary */
+		if (unlikely(slot == -1)) {
+			hpte_group = ((~hash & htab_hash_mask) *
+				      HPTES_PER_GROUP) & ~0x7UL;
+			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
+						  HPTE_V_SECONDARY,
+						  mmu_psize, ssize);
+			if (slot == -1) {
+				if (mftb() & 0x1)
+					hpte_group = ((hash & htab_hash_mask) *
+						      HPTES_PER_GROUP)&~0x7UL;
+
+				ppc_md.hpte_remove(hpte_group);
+				goto repeat;
+                        }
+		}
+
+		if (unlikely(slot == -2))
+			panic("hash_huge_page: pte_insert failed\n");
+
+		new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
+	}
+
+	/*
+	 * No need to use ldarx/stdcx here
+	 */
+	*ptep = __pte(new_pte & ~_PAGE_BUSY);
+
+	err = 0;
+
+ out:
+	return err;
+}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index a7161c07886d..1bf065546fa1 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -7,29 +7,17 @@
  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
  */
 
-#include <linux/init.h>
-#include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/io.h>
 #include <linux/hugetlb.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/err.h>
-#include <linux/sysctl.h>
-#include <asm/mman.h>
+#include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#include <asm/machdep.h>
-#include <asm/cputable.h>
-#include <asm/spu.h>
 
 #define PAGE_SHIFT_64K	16
 #define PAGE_SHIFT_16M	24
 #define PAGE_SHIFT_16G	34
 
-#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
-#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
 #define MAX_NUMBER_GPAGES	1024
 
 /* Tracks the 16G pages after the device tree is scanned and before the
@@ -502,158 +490,6 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
 	return 1UL << mmu_psize_to_shift(psize);
 }
 
-/*
- * Called by asm hashtable.S for doing lazy icache flush
- */
-static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
-					pte_t pte, int trap, unsigned long sz)
-{
-	struct page *page;
-	int i;
-
-	if (!pfn_valid(pte_pfn(pte)))
-		return rflags;
-
-	page = pte_page(pte);
-
-	/* page is dirty */
-	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
-		if (trap == 0x400) {
-			for (i = 0; i < (sz / PAGE_SIZE); i++)
-				__flush_dcache_icache(page_address(page+i));
-			set_bit(PG_arch_1, &page->flags);
-		} else {
-			rflags |= HPTE_R_N;
-		}
-	}
-	return rflags;
-}
-
-int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
-		     pte_t *ptep, unsigned long trap, int local, int ssize,
-		     unsigned int shift, unsigned int mmu_psize)
-{
-	unsigned long old_pte, new_pte;
-	unsigned long va, rflags, pa, sz;
-	long slot;
-	int err = 1;
-
-	BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
-
-	/* Search the Linux page table for a match with va */
-	va = hpt_va(ea, vsid, ssize);
-
-	/* 
-	 * Check the user's access rights to the page.  If access should be
-	 * prevented then send the problem up to do_page_fault.
-	 */
-	if (unlikely(access & ~pte_val(*ptep)))
-		goto out;
-	/*
-	 * At this point, we have a pte (old_pte) which can be used to build
-	 * or update an HPTE. There are 2 cases:
-	 *
-	 * 1. There is a valid (present) pte with no associated HPTE (this is 
-	 *	the most common case)
-	 * 2. There is a valid (present) pte with an associated HPTE. The
-	 *	current values of the pp bits in the HPTE prevent access
-	 *	because we are doing software DIRTY bit management and the
-	 *	page is currently not DIRTY. 
-	 */
-
-
-	do {
-		old_pte = pte_val(*ptep);
-		if (old_pte & _PAGE_BUSY)
-			goto out;
-		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
-	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
-					 old_pte, new_pte));
-
-	rflags = 0x2 | (!(new_pte & _PAGE_RW));
- 	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
-	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
-	sz = ((1UL) << shift);
-	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
-		/* No CPU has hugepages but lacks no execute, so we
-		 * don't need to worry about that case */
-		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
-						       trap, sz);
-
-	/* Check if pte already has an hpte (case 2) */
-	if (unlikely(old_pte & _PAGE_HASHPTE)) {
-		/* There MIGHT be an HPTE for this pte */
-		unsigned long hash, slot;
-
-		hash = hpt_hash(va, shift, ssize);
-		if (old_pte & _PAGE_F_SECOND)
-			hash = ~hash;
-		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-		slot += (old_pte & _PAGE_F_GIX) >> 12;
-
-		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
-					 ssize, local) == -1)
-			old_pte &= ~_PAGE_HPTEFLAGS;
-	}
-
-	if (likely(!(old_pte & _PAGE_HASHPTE))) {
-		unsigned long hash = hpt_hash(va, shift, ssize);
-		unsigned long hpte_group;
-
-		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
-
-repeat:
-		hpte_group = ((hash & htab_hash_mask) *
-			      HPTES_PER_GROUP) & ~0x7UL;
-
-		/* clear HPTE slot informations in new PTE */
-#ifdef CONFIG_PPC_64K_PAGES
-		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
-#else
-		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
-#endif
-		/* Add in WIMG bits */
-		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
-				      _PAGE_COHERENT | _PAGE_GUARDED));
-
-		/* Insert into the hash table, primary slot */
-		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
-					  mmu_psize, ssize);
-
-		/* Primary is full, try the secondary */
-		if (unlikely(slot == -1)) {
-			hpte_group = ((~hash & htab_hash_mask) *
-				      HPTES_PER_GROUP) & ~0x7UL; 
-			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
-						  HPTE_V_SECONDARY,
-						  mmu_psize, ssize);
-			if (slot == -1) {
-				if (mftb() & 0x1)
-					hpte_group = ((hash & htab_hash_mask) *
-						      HPTES_PER_GROUP)&~0x7UL;
-
-				ppc_md.hpte_remove(hpte_group);
-				goto repeat;
-                        }
-		}
-
-		if (unlikely(slot == -2))
-			panic("hash_huge_page: pte_insert failed\n");
-
-		new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
-	}
-
-	/*
-	 * No need to use ldarx/stdcx here
-	 */
-	*ptep = __pte(new_pte & ~_PAGE_BUSY);
-
-	err = 0;
-
- out:
-	return err;
-}
-
 static int __init add_huge_page_size(unsigned long long size)
 {
 	int shift = __ffs(size);
-- 
cgit v1.2.3-59-g8ed1b


From 0895ecda79428df48501e48dd0a868e0c8e1aae2 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 26 Oct 2009 19:24:31 +0000
Subject: powerpc/mm: Bring hugepage PTE accessor functions back into sync with
 normal accessors

The hugepage arch code provides a number of hook functions/macros
which mirror the functionality of various normal page pte access
functions.  Various changes in the normal page accessors (in
particular BenH's recent changes to the handling of lazy icache
flushing and PAGE_EXEC) have caused the hugepage versions to get out
of sync with the originals.  In some cases, this is a bug, at least on
some MMU types.

One of the reasons that some hooks were not identical to the normal
page versions, is that the fact we're dealing with a hugepage needed
to be passed down do use the correct dcache-icache flush function.
This patch makes the main flush_dcache_icache_page() function hugepage
aware (by checking for the PageCompound flag).  That in turn means we
can make set_huge_pte_at() just a call to set_pte_at() bringing it
back into sync.  As a bonus, this lets us remove the
hash_huge_page_do_lazy_icache() function, replacing it with a call to
the hash_page_do_lazy_icache() function it was based on.

Some other hugepage pte access hooks - huge_ptep_get_and_clear() and
huge_ptep_clear_flush() - are not so easily unified, but this patch at
least brings them back into sync with the current versions of the
corresponding normal page functions.

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/hugetlb.h    | 25 +++++++++++++++++++------
 arch/powerpc/include/asm/mmu-hash64.h |  1 +
 arch/powerpc/mm/hash_utils_64.c       |  2 +-
 arch/powerpc/mm/hugetlbpage-hash64.c  | 30 +-----------------------------
 arch/powerpc/mm/hugetlbpage.c         | 31 ++++++++++---------------------
 arch/powerpc/mm/mem.c                 | 17 +++++++++++++----
 6 files changed, 45 insertions(+), 61 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index 038886834da5..5856a66ab404 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -6,6 +6,8 @@
 pte_t *huge_pte_offset_and_shift(struct mm_struct *mm,
 				 unsigned long addr, unsigned *shift);
 
+void flush_dcache_icache_hugepage(struct page *page);
+
 int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 			   unsigned long len);
 
@@ -13,12 +15,6 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
 			    unsigned long end, unsigned long floor,
 			    unsigned long ceiling);
 
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
-		     pte_t *ptep, pte_t pte);
-
-pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
-			      pte_t *ptep);
-
 /*
  * The version of vma_mmu_pagesize() in arch/powerpc/mm/hugetlbpage.c needs
  * to override the version in mm/hugetlb.c
@@ -44,9 +40,26 @@ static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
 {
 }
 
+
+static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+				   pte_t *ptep, pte_t pte)
+{
+	set_pte_at(mm, addr, ptep, pte);
+}
+
+static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+					    unsigned long addr, pte_t *ptep)
+{
+	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
+	return __pte(old);
+}
+
 static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
 					 unsigned long addr, pte_t *ptep)
 {
+	pte_t pte;
+	pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
+	flush_tlb_page(vma, addr);
 }
 
 static inline int huge_pte_none(pte_t pte)
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index dd50ea15e648..7514ec2f8540 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -245,6 +245,7 @@ extern int __hash_page_64K(unsigned long ea, unsigned long access,
 			   unsigned long vsid, pte_t *ptep, unsigned long trap,
 			   unsigned int local, int ssize);
 struct mm_struct;
+unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap);
 extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap);
 int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 		     pte_t *ptep, unsigned long trap, int local, int ssize,
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index ef1f047f5431..fa251f8c2f82 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -775,7 +775,7 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
 	/* page is dirty */
 	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
 		if (trap == 0x400) {
-			__flush_dcache_icache(page_address(page));
+			flush_dcache_icache_page(page);
 			set_bit(PG_arch_1, &page->flags);
 		} else
 			pp |= HPTE_R_N;
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 1508ffc1e1e1..199539882f92 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -14,33 +14,6 @@
 #include <asm/cacheflush.h>
 #include <asm/machdep.h>
 
-/*
- * Called by asm hashtable.S for doing lazy icache flush
- */
-static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
-					pte_t pte, int trap, unsigned long sz)
-{
-	struct page *page;
-	int i;
-
-	if (!pfn_valid(pte_pfn(pte)))
-		return rflags;
-
-	page = pte_page(pte);
-
-	/* page is dirty */
-	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
-		if (trap == 0x400) {
-			for (i = 0; i < (sz / PAGE_SIZE); i++)
-				__flush_dcache_icache(page_address(page+i));
-			set_bit(PG_arch_1, &page->flags);
-		} else {
-			rflags |= HPTE_R_N;
-		}
-	}
-	return rflags;
-}
-
 int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 		     pte_t *ptep, unsigned long trap, int local, int ssize,
 		     unsigned int shift, unsigned int mmu_psize)
@@ -89,8 +62,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
 		/* No CPU has hugepages but lacks no execute, so we
 		 * don't need to worry about that case */
-		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
-						       trap, sz);
+		rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
 
 	/* Check if pte already has an hpte (case 2) */
 	if (unlikely(old_pte & _PAGE_HASHPTE)) {
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 1bf065546fa1..53b200abb025 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -344,27 +344,6 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 	} while (pgd++, addr = next, addr != end);
 }
 
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
-		     pte_t *ptep, pte_t pte)
-{
-	if (pte_present(*ptep)) {
-		/* We open-code pte_clear because we need to pass the right
-		 * argument to hpte_need_flush (huge / !huge). Might not be
-		 * necessary anymore if we make hpte_need_flush() get the
-		 * page size from the slices
-		 */
-		pte_update(mm, addr, ptep, ~0UL, 1);
-	}
-	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
-}
-
-pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
-			      pte_t *ptep)
-{
-	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
-	return __pte(old);
-}
-
 struct page *
 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 {
@@ -580,3 +559,13 @@ static int __init hugetlbpage_init(void)
 }
 
 module_init(hugetlbpage_init);
+
+void flush_dcache_icache_hugepage(struct page *page)
+{
+	int i;
+
+	BUG_ON(!PageCompound(page));
+
+	for (i = 0; i < (1UL << compound_order(page)); i++)
+		__flush_dcache_icache(page_address(page+i));
+}
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 59736317bf0e..b9b152558f9c 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -32,6 +32,7 @@
 #include <linux/pagemap.h>
 #include <linux/suspend.h>
 #include <linux/lmb.h>
+#include <linux/hugetlb.h>
 
 #include <asm/pgalloc.h>
 #include <asm/prom.h>
@@ -417,18 +418,26 @@ EXPORT_SYMBOL(flush_dcache_page);
 
 void flush_dcache_icache_page(struct page *page)
 {
+#ifdef CONFIG_HUGETLB_PAGE
+	if (PageCompound(page)) {
+		flush_dcache_icache_hugepage(page);
+		return;
+	}
+#endif
 #ifdef CONFIG_BOOKE
-	void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE);
-	__flush_dcache_icache(start);
-	kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
+	{
+		void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE);
+		__flush_dcache_icache(start);
+		kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
+	}
 #elif defined(CONFIG_8xx) || defined(CONFIG_PPC64)
 	/* On 8xx there is no need to kmap since highmem is not supported */
 	__flush_dcache_icache(page_address(page)); 
 #else
 	__flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT);
 #endif
-
 }
+
 void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
 {
 	clear_page(page);
-- 
cgit v1.2.3-59-g8ed1b


From 0682d6c1044e8a54aafdc6282d44c0c436da208f Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 21 Oct 2009 20:15:43 +0000
Subject: powerpc: Fix potential compile error irqs_disabled_flags

irqs_disabled_flags is #defined in linux/irqflags.h when
CONFIG_TRACE_IRQFLAGS_SUPPORT is enabled.  64 and 32 bit always have
CONFIG_TRACE_IRQFLAGS_SUPPORT enabled so just remove
irqs_disabled_flags.

This fixes the case when someone needs to include both linux/irqflags.h
and asm/hw_irq.h.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/hw_irq.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index abbc2aaaced5..9f4c9d4f5803 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -64,11 +64,6 @@ extern void iseries_handle_interrupts(void);
 		get_paca()->hard_enabled = 0;	\
 	} while(0)
 
-static inline int irqs_disabled_flags(unsigned long flags)
-{
-	return flags == 0;
-}
-
 #else
 
 #if defined(CONFIG_BOOKE)
-- 
cgit v1.2.3-59-g8ed1b


From ae7dd0208f62f1d6db4c49b85e54fa7bbed0ea4e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 14 Oct 2009 22:54:36 +0000
Subject: powerpc/nvram_64: Remove unused code

nvram_find_partition() has no user. The call site was removed in the
arch/powerpc move, but the function stayed. Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: linuxppc-dev@ozlabs.org
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/nvram.h |  1 -
 arch/powerpc/kernel/nvram_64.c   | 25 -------------------------
 2 files changed, 26 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/nvram.h b/arch/powerpc/include/asm/nvram.h
index 6c587eddee59..850b72f27445 100644
--- a/arch/powerpc/include/asm/nvram.h
+++ b/arch/powerpc/include/asm/nvram.h
@@ -73,7 +73,6 @@ extern int nvram_write_error_log(char * buff, int length,
 extern int nvram_read_error_log(char * buff, int length,
 					 unsigned int * err_type, unsigned int *err_seq);
 extern int nvram_clear_error_log(void);
-extern struct nvram_partition *nvram_find_partition(int sig, const char *name);
 
 extern int pSeries_nvram_init(void);
 
diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 0ed31f220482..c67e0102df96 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -228,31 +228,6 @@ static unsigned char nvram_checksum(struct nvram_header *p)
 	return c_sum;
 }
 
-
-/*
- * Find an nvram partition, sig can be 0 for any
- * partition or name can be NULL for any name, else
- * tries to match both
- */
-struct nvram_partition *nvram_find_partition(int sig, const char *name)
-{
-	struct nvram_partition * part;
-	struct list_head * p;
-
-	list_for_each(p, &nvram_part->partition) {
-		part = list_entry(p, struct nvram_partition, partition);
-
-		if (sig && part->header.signature != sig)
-			continue;
-		if (name && 0 != strncmp(name, part->header.name, 12))
-			continue;
-		return part; 
-	}
-	return NULL;
-}
-EXPORT_SYMBOL(nvram_find_partition);
-
-
 static int nvram_remove_os_partition(void)
 {
 	struct list_head *i;
-- 
cgit v1.2.3-59-g8ed1b


From cd015707176820b86d07b5dffdecfefdd539a497 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Tue, 13 Oct 2009 19:45:03 +0000
Subject: powerpc: Enable sparse irq_descs on powerpc

Defining CONFIG_SPARSE_IRQ enables generic code that gets rid of the
static irq_desc array, and replaces it with an array of pointers to
irq_descs.

It also allows node local allocation of irq_descs, however we
currently don't have the information available to do that, so we just
allocate them on all on node 0.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/Kconfig            | 13 +++++++++++++
 arch/powerpc/include/asm/irq.h  |  3 +++
 arch/powerpc/kernel/irq.c       | 40 +++++++++++++++++++++++++++++++++-------
 arch/powerpc/kernel/ppc_ksyms.c |  1 -
 arch/powerpc/kernel/setup_64.c  |  5 -----
 5 files changed, 49 insertions(+), 13 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 3aa79f8e39e4..61abde11a45e 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -377,6 +377,19 @@ config IRQ_ALL_CPUS
 	  CPU.  Generally saying Y is safe, although some problems have been
 	  reported with SMP Power Macintoshes with this option enabled.
 
+config SPARSE_IRQ
+	bool "Support sparse irq numbering"
+	default y
+	help
+	  This enables support for sparse irqs. This is useful for distro
+	  kernels that want to define a high CONFIG_NR_CPUS value but still
+	  want to have low kernel memory footprint on smaller machines.
+
+	  ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
+	    out the irq_desc[] array in a more NUMA-friendly way. )
+
+	  If you don't know what to do here, say Y.
+
 config NUMA
 	bool "NUMA support"
 	depends on PPC64
diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h
index 03dc28cdb4da..c85a32f1a17f 100644
--- a/arch/powerpc/include/asm/irq.h
+++ b/arch/powerpc/include/asm/irq.h
@@ -38,6 +38,9 @@ extern atomic_t ppc_n_lost_interrupts;
 /* Number of irqs reserved for the legacy controller */
 #define NUM_ISA_INTERRUPTS	16
 
+/* Same thing, used by the generic IRQ code */
+#define NR_IRQS_LEGACY		NUM_ISA_INTERRUPTS
+
 /* This type is the placeholder for a hardware interrupt number. It has to
  * be big enough to enclose whatever representation is used by a given
  * platform.
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 63e27d5c52de..eba53923630f 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -85,7 +85,10 @@ extern int tau_interrupts(int);
 #endif /* CONFIG_PPC32 */
 
 #ifdef CONFIG_PPC64
+
+#ifndef CONFIG_SPARSE_IRQ
 EXPORT_SYMBOL(irq_desc);
+#endif
 
 int distribute_irqs = 1;
 
@@ -613,8 +616,16 @@ void irq_set_virq_count(unsigned int count)
 static int irq_setup_virq(struct irq_host *host, unsigned int virq,
 			    irq_hw_number_t hwirq)
 {
+	struct irq_desc *desc;
+
+	desc = irq_to_desc_alloc_node(virq, 0);
+	if (!desc) {
+		pr_debug("irq: -> allocating desc failed\n");
+		goto error;
+	}
+
 	/* Clear IRQ_NOREQUEST flag */
-	irq_to_desc(virq)->status &= ~IRQ_NOREQUEST;
+	desc->status &= ~IRQ_NOREQUEST;
 
 	/* map it */
 	smp_wmb();
@@ -623,11 +634,14 @@ static int irq_setup_virq(struct irq_host *host, unsigned int virq,
 
 	if (host->ops->map(host, virq, hwirq)) {
 		pr_debug("irq: -> mapping failed, freeing\n");
-		irq_free_virt(virq, 1);
-		return -1;
+		goto error;
 	}
 
 	return 0;
+
+error:
+	irq_free_virt(virq, 1);
+	return -1;
 }
 
 unsigned int irq_create_direct_mapping(struct irq_host *host)
@@ -1008,12 +1022,24 @@ void irq_free_virt(unsigned int virq, unsigned int count)
 	spin_unlock_irqrestore(&irq_big_lock, flags);
 }
 
-void irq_early_init(void)
+int arch_early_irq_init(void)
 {
-	unsigned int i;
+	struct irq_desc *desc;
+	int i;
 
-	for (i = 0; i < NR_IRQS; i++)
-		irq_to_desc(i)->status |= IRQ_NOREQUEST;
+	for (i = 0; i < NR_IRQS; i++) {
+		desc = irq_to_desc(i);
+		if (desc)
+			desc->status |= IRQ_NOREQUEST;
+	}
+
+	return 0;
+}
+
+int arch_init_chip_data(struct irq_desc *desc, int node)
+{
+	desc->status |= IRQ_NOREQUEST;
+	return 0;
 }
 
 /* We need to create the radix trees late */
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index c8b27bb4dbde..07115d6cd4ba 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -162,7 +162,6 @@ EXPORT_SYMBOL(screen_info);
 
 #ifdef CONFIG_PPC32
 EXPORT_SYMBOL(timer_interrupt);
-EXPORT_SYMBOL(irq_desc);
 EXPORT_SYMBOL(tb_ticks_per_jiffy);
 EXPORT_SYMBOL(cacheable_memcpy);
 EXPORT_SYMBOL(cacheable_memzero);
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 04f638d82fb3..fd785f7a279b 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -356,11 +356,6 @@ void __init setup_system(void)
 	 */
 	initialize_cache_info();
 
-	/*
-	 * Initialize irq remapping subsystem
-	 */
-	irq_early_init();
-
 #ifdef CONFIG_PPC_RTAS
 	/*
 	 * Initialize RTAS if available
-- 
cgit v1.2.3-59-g8ed1b


From 4f59ecfa9b87da09bdc346f2c443e25fa2c0674c Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Wed, 4 Nov 2009 16:42:33 -0700
Subject: powerpc/5200: add general purpose timer API for the MPC5200

This patch adds an interface for controlling the timer function of the
MPC5200 GPT devices.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 arch/powerpc/include/asm/mpc52xx.h        |   7 ++
 arch/powerpc/platforms/52xx/mpc52xx_gpt.c | 133 +++++++++++++++++++++++++++---
 2 files changed, 130 insertions(+), 10 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/mpc52xx.h b/arch/powerpc/include/asm/mpc52xx.h
index 1b4f697abbdd..671685011a23 100644
--- a/arch/powerpc/include/asm/mpc52xx.h
+++ b/arch/powerpc/include/asm/mpc52xx.h
@@ -276,6 +276,13 @@ extern int mpc52xx_set_psc_clkdiv(int psc_id, int clkdiv);
 extern unsigned int mpc52xx_get_xtal_freq(struct device_node *node);
 extern void mpc52xx_restart(char *cmd);
 
+/* mpc52xx_gpt.c */
+struct mpc52xx_gpt_priv;
+extern struct mpc52xx_gpt_priv *mpc52xx_gpt_from_irq(int irq);
+extern int mpc52xx_gpt_start_timer(struct mpc52xx_gpt_priv *gpt, int period,
+                            int continuous);
+extern void mpc52xx_gpt_stop_timer(struct mpc52xx_gpt_priv *gpt);
+
 /* mpc52xx_pic.c */
 extern void mpc52xx_init_irq(void);
 extern unsigned int mpc52xx_get_irq(void);
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
index bfbcd418e690..2c3fa13571ce 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
@@ -46,13 +46,17 @@
  * the output mode.  This driver does not change the output mode setting.
  */
 
+#include <linux/device.h>
 #include <linux/irq.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
 #include <linux/of.h>
 #include <linux/of_platform.h>
 #include <linux/of_gpio.h>
 #include <linux/kernel.h>
+#include <asm/div64.h>
 #include <asm/mpc52xx.h>
 
 MODULE_DESCRIPTION("Freescale MPC52xx gpt driver");
@@ -68,16 +72,21 @@ MODULE_LICENSE("GPL");
  * @irqhost: Pointer to irq_host instance; used when IRQ mode is supported
  */
 struct mpc52xx_gpt_priv {
+	struct list_head list;		/* List of all GPT devices */
 	struct device *dev;
 	struct mpc52xx_gpt __iomem *regs;
 	spinlock_t lock;
 	struct irq_host *irqhost;
+	u32 ipb_freq;
 
 #if defined(CONFIG_GPIOLIB)
 	struct of_gpio_chip of_gc;
 #endif
 };
 
+LIST_HEAD(mpc52xx_gpt_list);
+DEFINE_MUTEX(mpc52xx_gpt_list_mutex);
+
 #define MPC52xx_GPT_MODE_MS_MASK	(0x07)
 #define MPC52xx_GPT_MODE_MS_IC		(0x01)
 #define MPC52xx_GPT_MODE_MS_OC		(0x02)
@@ -88,6 +97,9 @@ struct mpc52xx_gpt_priv {
 #define MPC52xx_GPT_MODE_GPIO_OUT_LOW	(0x20)
 #define MPC52xx_GPT_MODE_GPIO_OUT_HIGH	(0x30)
 
+#define MPC52xx_GPT_MODE_COUNTER_ENABLE	(0x1000)
+#define MPC52xx_GPT_MODE_CONTINUOUS	(0x0400)
+#define MPC52xx_GPT_MODE_OPEN_DRAIN	(0x0200)
 #define MPC52xx_GPT_MODE_IRQ_EN		(0x0100)
 
 #define MPC52xx_GPT_MODE_ICT_MASK	(0x030000)
@@ -190,7 +202,7 @@ static int mpc52xx_gpt_irq_xlate(struct irq_host *h, struct device_node *ct,
 
 	dev_dbg(gpt->dev, "%s: flags=%i\n", __func__, intspec[0]);
 
-	if ((intsize < 1) || (intspec[0] < 1) || (intspec[0] > 3)) {
+	if ((intsize < 1) || (intspec[0] > 3)) {
 		dev_err(gpt->dev, "bad irq specifier in %s\n", ct->full_name);
 		return -EINVAL;
 	}
@@ -211,13 +223,11 @@ mpc52xx_gpt_irq_setup(struct mpc52xx_gpt_priv *gpt, struct device_node *node)
 {
 	int cascade_virq;
 	unsigned long flags;
-
-	/* Only setup cascaded IRQ if device tree claims the GPT is
-	 * an interrupt controller */
-	if (!of_find_property(node, "interrupt-controller", NULL))
-		return;
+	u32 mode;
 
 	cascade_virq = irq_of_parse_and_map(node, 0);
+	if (!cascade_virq)
+		return;
 
 	gpt->irqhost = irq_alloc_host(node, IRQ_HOST_MAP_LINEAR, 1,
 				      &mpc52xx_gpt_irq_ops, -1);
@@ -227,14 +237,16 @@ mpc52xx_gpt_irq_setup(struct mpc52xx_gpt_priv *gpt, struct device_node *node)
 	}
 
 	gpt->irqhost->host_data = gpt;
-
 	set_irq_data(cascade_virq, gpt);
 	set_irq_chained_handler(cascade_virq, mpc52xx_gpt_irq_cascade);
 
-	/* Set to Input Capture mode */
+	/* If the GPT is currently disabled, then change it to be in Input
+	 * Capture mode.  If the mode is non-zero, then the pin could be
+	 * already in use for something. */
 	spin_lock_irqsave(&gpt->lock, flags);
-	clrsetbits_be32(&gpt->regs->mode, MPC52xx_GPT_MODE_MS_MASK,
-			MPC52xx_GPT_MODE_MS_IC);
+	mode = in_be32(&gpt->regs->mode);
+	if ((mode & MPC52xx_GPT_MODE_MS_MASK) == 0)
+		out_be32(&gpt->regs->mode, mode | MPC52xx_GPT_MODE_MS_IC);
 	spin_unlock_irqrestore(&gpt->lock, flags);
 
 	dev_dbg(gpt->dev, "%s() complete. virq=%i\n", __func__, cascade_virq);
@@ -335,6 +347,102 @@ static void
 mpc52xx_gpt_gpio_setup(struct mpc52xx_gpt_priv *p, struct device_node *np) { }
 #endif /* defined(CONFIG_GPIOLIB) */
 
+/***********************************************************************
+ * Timer API
+ */
+
+/**
+ * mpc52xx_gpt_from_irq - Return the GPT device associated with an IRQ number
+ * @irq: irq of timer.
+ */
+struct mpc52xx_gpt_priv *mpc52xx_gpt_from_irq(int irq)
+{
+	struct mpc52xx_gpt_priv *gpt;
+	struct list_head *pos;
+
+	/* Iterate over the list of timers looking for a matching device */
+	mutex_lock(&mpc52xx_gpt_list_mutex);
+	list_for_each(pos, &mpc52xx_gpt_list) {
+		gpt = container_of(pos, struct mpc52xx_gpt_priv, list);
+		if (gpt->irqhost && irq == irq_linear_revmap(gpt->irqhost, 0)) {
+			mutex_unlock(&mpc52xx_gpt_list_mutex);
+			return gpt;
+		}
+	}
+	mutex_unlock(&mpc52xx_gpt_list_mutex);
+
+	return NULL;
+}
+EXPORT_SYMBOL(mpc52xx_gpt_from_irq);
+
+/**
+ * mpc52xx_gpt_start_timer - Set and enable the GPT timer
+ * @gpt: Pointer to gpt private data structure
+ * @period: period of timer
+ * @continuous: set to 1 to make timer continuous free running
+ *
+ * An interrupt will be generated every time the timer fires
+ */
+int mpc52xx_gpt_start_timer(struct mpc52xx_gpt_priv *gpt, int period,
+                            int continuous)
+{
+	u32 clear, set;
+	u64 clocks;
+	u32 prescale;
+	unsigned long flags;
+
+	clear = MPC52xx_GPT_MODE_MS_MASK | MPC52xx_GPT_MODE_CONTINUOUS;
+	set = MPC52xx_GPT_MODE_MS_GPIO | MPC52xx_GPT_MODE_COUNTER_ENABLE;
+	if (continuous)
+		set |= MPC52xx_GPT_MODE_CONTINUOUS;
+
+	/* Determine the number of clocks in the requested period.  64 bit
+	 * arithmatic is done here to preserve the precision until the value
+	 * is scaled back down into the u32 range.  Period is in 'ns', bus
+	 * frequency is in Hz. */
+	clocks = (u64)period * (u64)gpt->ipb_freq;
+	do_div(clocks, 1000000000); /* Scale it down to ns range */
+
+	/* This device cannot handle a clock count greater than 32 bits */
+	if (clocks > 0xffffffff)
+		return -EINVAL;
+
+	/* Calculate the prescaler and count values from the clocks value.
+	 * 'clocks' is the number of clock ticks in the period.  The timer
+	 * has 16 bit precision and a 16 bit prescaler.  Prescaler is
+	 * calculated by integer dividing the clocks by 0x10000 (shifting
+	 * down 16 bits) to obtain the smallest possible divisor for clocks
+	 * to get a 16 bit count value.
+	 *
+	 * Note: the prescale register is '1' based, not '0' based.  ie. a
+	 * value of '1' means divide the clock by one.  0xffff divides the
+	 * clock by 0xffff.  '0x0000' does not divide by zero, but wraps
+	 * around and divides by 0x10000.  That is why prescale must be
+	 * a u32 variable, not a u16, for this calculation. */
+	prescale = (clocks >> 16) + 1;
+	do_div(clocks, prescale);
+	if (clocks > 0xffff) {
+		pr_err("calculation error; prescale:%x clocks:%llx\n",
+		       prescale, clocks);
+		return -EINVAL;
+	}
+
+	/* Set and enable the timer */
+	spin_lock_irqsave(&gpt->lock, flags);
+	out_be32(&gpt->regs->count, prescale << 16 | clocks);
+	clrsetbits_be32(&gpt->regs->mode, clear, set);
+	spin_unlock_irqrestore(&gpt->lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(mpc52xx_gpt_start_timer);
+
+void mpc52xx_gpt_stop_timer(struct mpc52xx_gpt_priv *gpt)
+{
+	clrbits32(&gpt->regs->mode, MPC52xx_GPT_MODE_COUNTER_ENABLE);
+}
+EXPORT_SYMBOL(mpc52xx_gpt_stop_timer);
+
 /* ---------------------------------------------------------------------
  * of_platform bus binding code
  */
@@ -349,6 +457,7 @@ static int __devinit mpc52xx_gpt_probe(struct of_device *ofdev,
 
 	spin_lock_init(&gpt->lock);
 	gpt->dev = &ofdev->dev;
+	gpt->ipb_freq = mpc5xxx_get_bus_frequency(ofdev->node);
 	gpt->regs = of_iomap(ofdev->node, 0);
 	if (!gpt->regs) {
 		kfree(gpt);
@@ -360,6 +469,10 @@ static int __devinit mpc52xx_gpt_probe(struct of_device *ofdev,
 	mpc52xx_gpt_gpio_setup(gpt, ofdev->node);
 	mpc52xx_gpt_irq_setup(gpt, ofdev->node);
 
+	mutex_lock(&mpc52xx_gpt_list_mutex);
+	list_add(&gpt->list, &mpc52xx_gpt_list);
+	mutex_unlock(&mpc52xx_gpt_list_mutex);
+
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 3c9059d79f5eea6b8b75ddac97693127c3c41db4 Mon Sep 17 00:00:00 2001
From: John Bonesio <bones@secretlab.ca>
Date: Tue, 29 Sep 2009 10:43:42 +0000
Subject: powerpc/5200: add LocalPlus bus FIFO device driver

This is a driver for the FIFO device on the LocalPlus bus on an mpc5200 system.
The driver supports programmed I/O through the FIFO as well as setting up DMA
via the BestComm engine through the FIFO.

Signed-off-by: John Bonesio <bones@secretlab.ca>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 arch/powerpc/include/asm/mpc52xx.h            |  39 ++
 arch/powerpc/platforms/52xx/Kconfig           |   5 +
 arch/powerpc/platforms/52xx/Makefile          |   1 +
 arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c | 560 ++++++++++++++++++++++++++
 4 files changed, 605 insertions(+)
 create mode 100644 arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/mpc52xx.h b/arch/powerpc/include/asm/mpc52xx.h
index 671685011a23..707ab7590cfb 100644
--- a/arch/powerpc/include/asm/mpc52xx.h
+++ b/arch/powerpc/include/asm/mpc52xx.h
@@ -283,6 +283,45 @@ extern int mpc52xx_gpt_start_timer(struct mpc52xx_gpt_priv *gpt, int period,
                             int continuous);
 extern void mpc52xx_gpt_stop_timer(struct mpc52xx_gpt_priv *gpt);
 
+/* mpc52xx_lpbfifo.c */
+#define MPC52XX_LPBFIFO_FLAG_READ		(0)
+#define MPC52XX_LPBFIFO_FLAG_WRITE		(1<<0)
+#define MPC52XX_LPBFIFO_FLAG_NO_INCREMENT	(1<<1)
+#define MPC52XX_LPBFIFO_FLAG_NO_DMA		(1<<2)
+#define MPC52XX_LPBFIFO_FLAG_POLL_DMA		(1<<3)
+
+struct mpc52xx_lpbfifo_request {
+	struct list_head list;
+
+	/* localplus bus address */
+	unsigned int cs;
+	size_t offset;
+
+	/* Memory address */
+	void *data;
+	phys_addr_t data_phys;
+
+	/* Details of transfer */
+	size_t size;
+	size_t pos;	/* current position of transfer */
+	int flags;
+
+	/* What to do when finished */
+	void (*callback)(struct mpc52xx_lpbfifo_request *);
+
+	void *priv;		/* Driver private data */
+
+	/* statistics */
+	int irq_count;
+	int irq_ticks;
+	u8 last_byte;
+	int buffer_not_done_cnt;
+};
+
+extern int mpc52xx_lpbfifo_submit(struct mpc52xx_lpbfifo_request *req);
+extern void mpc52xx_lpbfifo_abort(struct mpc52xx_lpbfifo_request *req);
+extern void mpc52xx_lpbfifo_poll(void);
+
 /* mpc52xx_pic.c */
 extern void mpc52xx_init_irq(void);
 extern unsigned int mpc52xx_get_irq(void);
diff --git a/arch/powerpc/platforms/52xx/Kconfig b/arch/powerpc/platforms/52xx/Kconfig
index 8b8e9560a315..47ea1be1481b 100644
--- a/arch/powerpc/platforms/52xx/Kconfig
+++ b/arch/powerpc/platforms/52xx/Kconfig
@@ -62,3 +62,8 @@ config PPC_MPC5200_GPIO
 	select GENERIC_GPIO
 	help
 	  Enable gpiolib support for mpc5200 based boards
+
+config PPC_MPC5200_LPBFIFO
+	tristate "MPC5200 LocalPlus bus FIFO driver"
+	depends on PPC_MPC52xx
+	select PPC_BESTCOMM_GEN_BD
diff --git a/arch/powerpc/platforms/52xx/Makefile b/arch/powerpc/platforms/52xx/Makefile
index bfd4f52cf3dd..2bc8cd0c5cfc 100644
--- a/arch/powerpc/platforms/52xx/Makefile
+++ b/arch/powerpc/platforms/52xx/Makefile
@@ -15,3 +15,4 @@ ifeq ($(CONFIG_PPC_LITE5200),y)
 endif
 
 obj-$(CONFIG_PPC_MPC5200_GPIO)	+= mpc52xx_gpio.o
+obj-$(CONFIG_PPC_MPC5200_LPBFIFO)	+= mpc52xx_lpbfifo.o
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c b/arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c
new file mode 100644
index 000000000000..929d017535a3
--- /dev/null
+++ b/arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c
@@ -0,0 +1,560 @@
+/*
+ * LocalPlus Bus FIFO driver for the Freescale MPC52xx.
+ *
+ * Copyright (C) 2009 Secret Lab Technologies Ltd.
+ *
+ * This file is released under the GPLv2
+ *
+ * Todo:
+ * - Add support for multiple requests to be queued.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/spinlock.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/mpc52xx.h>
+#include <asm/time.h>
+
+#include <sysdev/bestcomm/bestcomm.h>
+#include <sysdev/bestcomm/bestcomm_priv.h>
+#include <sysdev/bestcomm/gen_bd.h>
+
+MODULE_AUTHOR("Grant Likely <grant.likely@secretlab.ca>");
+MODULE_DESCRIPTION("MPC5200 LocalPlus FIFO device driver");
+MODULE_LICENSE("GPL");
+
+#define LPBFIFO_REG_PACKET_SIZE		(0x00)
+#define LPBFIFO_REG_START_ADDRESS	(0x04)
+#define LPBFIFO_REG_CONTROL		(0x08)
+#define LPBFIFO_REG_ENABLE		(0x0C)
+#define LPBFIFO_REG_BYTES_DONE_STATUS	(0x14)
+#define LPBFIFO_REG_FIFO_DATA		(0x40)
+#define LPBFIFO_REG_FIFO_STATUS		(0x44)
+#define LPBFIFO_REG_FIFO_CONTROL	(0x48)
+#define LPBFIFO_REG_FIFO_ALARM		(0x4C)
+
+struct mpc52xx_lpbfifo {
+	struct device *dev;
+	phys_addr_t regs_phys;
+	void __iomem *regs;
+	int irq;
+	spinlock_t lock;
+
+	struct bcom_task *bcom_tx_task;
+	struct bcom_task *bcom_rx_task;
+	struct bcom_task *bcom_cur_task;
+
+	/* Current state data */
+	struct mpc52xx_lpbfifo_request *req;
+	int dma_irqs_enabled;
+};
+
+/* The MPC5200 has only one fifo, so only need one instance structure */
+static struct mpc52xx_lpbfifo lpbfifo;
+
+/**
+ * mpc52xx_lpbfifo_kick - Trigger the next block of data to be transfered
+ */
+static void mpc52xx_lpbfifo_kick(struct mpc52xx_lpbfifo_request *req)
+{
+	size_t transfer_size = req->size - req->pos;
+	struct bcom_bd *bd;
+	void __iomem *reg;
+	u32 *data;
+	int i;
+	int bit_fields;
+	int dma = !(req->flags & MPC52XX_LPBFIFO_FLAG_NO_DMA);
+	int write = req->flags & MPC52XX_LPBFIFO_FLAG_WRITE;
+	int poll_dma = req->flags & MPC52XX_LPBFIFO_FLAG_POLL_DMA;
+
+	/* Set and clear the reset bits; is good practice in User Manual */
+	out_be32(lpbfifo.regs + LPBFIFO_REG_ENABLE, 0x01010000);
+
+	/* set master enable bit */
+	out_be32(lpbfifo.regs + LPBFIFO_REG_ENABLE, 0x00000001);
+	if (!dma) {
+		/* While the FIFO can be setup for transfer sizes as large as
+		 * 16M-1, the FIFO itself is only 512 bytes deep and it does
+		 * not generate interrupts for FIFO full events (only transfer
+		 * complete will raise an IRQ).  Therefore when not using
+		 * Bestcomm to drive the FIFO it needs to either be polled, or
+		 * transfers need to constrained to the size of the fifo.
+		 *
+		 * This driver restricts the size of the transfer
+		 */
+		if (transfer_size > 512)
+			transfer_size = 512;
+
+		/* Load the FIFO with data */
+		if (write) {
+			reg = lpbfifo.regs + LPBFIFO_REG_FIFO_DATA;
+			data = req->data + req->pos;
+			for (i = 0; i < transfer_size; i += 4)
+				out_be32(reg, *data++);
+		}
+
+		/* Unmask both error and completion irqs */
+		out_be32(lpbfifo.regs + LPBFIFO_REG_ENABLE, 0x00000301);
+	} else {
+		/* Choose the correct direction
+		 *
+		 * Configure the watermarks so DMA will always complete correctly.
+		 * It may be worth experimenting with the ALARM value to see if
+		 * there is a performance impacit.  However, if it is wrong there
+		 * is a risk of DMA not transferring the last chunk of data
+		 */
+		if (write) {
+			out_be32(lpbfifo.regs + LPBFIFO_REG_FIFO_ALARM, 0x1e4);
+			out_8(lpbfifo.regs + LPBFIFO_REG_FIFO_CONTROL, 7);
+			lpbfifo.bcom_cur_task = lpbfifo.bcom_tx_task;
+		} else {
+			out_be32(lpbfifo.regs + LPBFIFO_REG_FIFO_ALARM, 0x1ff);
+			out_8(lpbfifo.regs + LPBFIFO_REG_FIFO_CONTROL, 0);
+			lpbfifo.bcom_cur_task = lpbfifo.bcom_rx_task;
+
+			if (poll_dma) {
+				if (lpbfifo.dma_irqs_enabled) {
+					disable_irq(bcom_get_task_irq(lpbfifo.bcom_rx_task));
+					lpbfifo.dma_irqs_enabled = 0;
+				}
+			} else {
+				if (!lpbfifo.dma_irqs_enabled) {
+					enable_irq(bcom_get_task_irq(lpbfifo.bcom_rx_task));
+					lpbfifo.dma_irqs_enabled = 1;
+				}
+			}
+		}
+
+		bd = bcom_prepare_next_buffer(lpbfifo.bcom_cur_task);
+		bd->status = transfer_size;
+		if (!write) {
+			/*
+			 * In the DMA read case, the DMA doesn't complete,
+			 * possibly due to incorrect watermarks in the ALARM
+			 * and CONTROL regs. For now instead of trying to
+			 * determine the right watermarks that will make this
+			 * work, just increase the number of bytes the FIFO is
+			 * expecting.
+			 *
+			 * When submitting another operation, the FIFO will get
+			 * reset, so the condition of the FIFO waiting for a
+			 * non-existent 4 bytes will get cleared.
+			 */
+			transfer_size += 4; /* BLECH! */
+		}
+		bd->data[0] = req->data_phys + req->pos;
+		bcom_submit_next_buffer(lpbfifo.bcom_cur_task, NULL);
+
+		/* error irq & master enabled bit */
+		bit_fields = 0x00000201;
+
+		/* Unmask irqs */
+		if (write && (!poll_dma))
+			bit_fields |= 0x00000100; /* completion irq too */
+		out_be32(lpbfifo.regs + LPBFIFO_REG_ENABLE, bit_fields);
+	}
+
+	/* Set transfer size, width, chip select and READ mode */
+	out_be32(lpbfifo.regs + LPBFIFO_REG_START_ADDRESS,
+		 req->offset + req->pos);
+	out_be32(lpbfifo.regs + LPBFIFO_REG_PACKET_SIZE, transfer_size);
+
+	bit_fields = req->cs << 24 | 0x000008;
+	if (!write)
+		bit_fields |= 0x010000; /* read mode */
+	out_be32(lpbfifo.regs + LPBFIFO_REG_CONTROL, bit_fields);
+
+	/* Kick it off */
+	out_8(lpbfifo.regs + LPBFIFO_REG_PACKET_SIZE, 0x01);
+	if (dma)
+		bcom_enable(lpbfifo.bcom_cur_task);
+}
+
+/**
+ * mpc52xx_lpbfifo_irq - IRQ handler for LPB FIFO
+ *
+ * On transmit, the dma completion irq triggers before the fifo completion
+ * triggers.  Handle the dma completion here instead of the LPB FIFO Bestcomm
+ * task completion irq becuase everyting is not really done until the LPB FIFO
+ * completion irq triggers.
+ *
+ * In other words:
+ * For DMA, on receive, the "Fat Lady" is the bestcom completion irq. on
+ * transmit, the fifo completion irq is the "Fat Lady". The opera (or in this
+ * case the DMA/FIFO operation) is not finished until the "Fat Lady" sings.
+ *
+ * Reasons for entering this routine:
+ * 1) PIO mode rx and tx completion irq
+ * 2) DMA interrupt mode tx completion irq
+ * 3) DMA polled mode tx
+ *
+ * Exit conditions:
+ * 1) Transfer aborted
+ * 2) FIFO complete without DMA; more data to do
+ * 3) FIFO complete without DMA; all data transfered
+ * 4) FIFO complete using DMA
+ *
+ * Condition 1 can occur regardless of whether or not DMA is used.
+ * It requires executing the callback to report the error and exiting
+ * immediately.
+ *
+ * Condition 2 requires programming the FIFO with the next block of data
+ *
+ * Condition 3 requires executing the callback to report completion
+ *
+ * Condition 4 means the same as 3, except that we also retrieve the bcom
+ * buffer so DMA doesn't get clogged up.
+ *
+ * To make things trickier, the spinlock must be dropped before
+ * executing the callback, otherwise we could end up with a deadlock
+ * or nested spinlock condition.  The out path is non-trivial, so
+ * extra fiddling is done to make sure all paths lead to the same
+ * outbound code.
+ */
+static irqreturn_t mpc52xx_lpbfifo_irq(int irq, void *dev_id)
+{
+	struct mpc52xx_lpbfifo_request *req;
+	u32 status = in_8(lpbfifo.regs + LPBFIFO_REG_BYTES_DONE_STATUS);
+	void __iomem *reg;
+	u32 *data;
+	int count, i;
+	int do_callback = 0;
+	u32 ts;
+	unsigned long flags;
+	int dma, write, poll_dma;
+
+	spin_lock_irqsave(&lpbfifo.lock, flags);
+	ts = get_tbl();
+
+	req = lpbfifo.req;
+	if (!req) {
+		spin_unlock_irqrestore(&lpbfifo.lock, flags);
+		pr_err("bogus LPBFIFO IRQ\n");
+		return IRQ_HANDLED;
+	}
+
+	dma = !(req->flags & MPC52XX_LPBFIFO_FLAG_NO_DMA);
+	write = req->flags & MPC52XX_LPBFIFO_FLAG_WRITE;
+	poll_dma = req->flags & MPC52XX_LPBFIFO_FLAG_POLL_DMA;
+
+	if (dma && !write) {
+		spin_unlock_irqrestore(&lpbfifo.lock, flags);
+		pr_err("bogus LPBFIFO IRQ (dma and not writting)\n");
+		return IRQ_HANDLED;
+	}
+
+	if ((status & 0x01) == 0) {
+		goto out;
+	}
+
+	/* check abort bit */
+	if (status & 0x10) {
+		out_be32(lpbfifo.regs + LPBFIFO_REG_ENABLE, 0x01010000);
+		do_callback = 1;
+		goto out;
+	}
+
+	/* Read result from hardware */
+	count = in_be32(lpbfifo.regs + LPBFIFO_REG_BYTES_DONE_STATUS);
+	count &= 0x00ffffff;
+
+	if (!dma && !write) {
+		/* copy the data out of the FIFO */
+		reg = lpbfifo.regs + LPBFIFO_REG_FIFO_DATA;
+		data = req->data + req->pos;
+		for (i = 0; i < count; i += 4)
+			*data++ = in_be32(reg);
+	}
+
+	/* Update transfer position and count */
+	req->pos += count;
+
+	/* Decide what to do next */
+	if (req->size - req->pos)
+		mpc52xx_lpbfifo_kick(req); /* more work to do */
+	else
+		do_callback = 1;
+
+ out:
+	/* Clear the IRQ */
+	out_8(lpbfifo.regs + LPBFIFO_REG_BYTES_DONE_STATUS, 0x01);
+
+	if (dma && (status & 0x11)) {
+		/*
+		 * Count the DMA as complete only when the FIFO completion
+		 * status or abort bits are set.
+		 *
+		 * (status & 0x01) should always be the case except sometimes
+		 * when using polled DMA.
+		 *
+		 * (status & 0x10) {transfer aborted}: This case needs more
+		 * testing.
+		 */
+		bcom_retrieve_buffer(lpbfifo.bcom_cur_task, &status, NULL);
+	}
+	req->last_byte = ((u8 *)req->data)[req->size - 1];
+
+	/* When the do_callback flag is set; it means the transfer is finished
+	 * so set the FIFO as idle */
+	if (do_callback)
+		lpbfifo.req = NULL;
+
+	if (irq != 0) /* don't increment on polled case */
+		req->irq_count++;
+
+	req->irq_ticks += get_tbl() - ts;
+	spin_unlock_irqrestore(&lpbfifo.lock, flags);
+
+	/* Spinlock is released; it is now safe to call the callback */
+	if (do_callback && req->callback)
+		req->callback(req);
+
+	return IRQ_HANDLED;
+}
+
+/**
+ * mpc52xx_lpbfifo_bcom_irq - IRQ handler for LPB FIFO Bestcomm task
+ *
+ * Only used when receiving data.
+ */
+static irqreturn_t mpc52xx_lpbfifo_bcom_irq(int irq, void *dev_id)
+{
+	struct mpc52xx_lpbfifo_request *req;
+	unsigned long flags;
+	u32 status;
+	u32 ts;
+
+	spin_lock_irqsave(&lpbfifo.lock, flags);
+	ts = get_tbl();
+
+	req = lpbfifo.req;
+	if (!req || (req->flags & MPC52XX_LPBFIFO_FLAG_NO_DMA)) {
+		spin_unlock_irqrestore(&lpbfifo.lock, flags);
+		return IRQ_HANDLED;
+	}
+
+	if (irq != 0) /* don't increment on polled case */
+		req->irq_count++;
+
+	if (!bcom_buffer_done(lpbfifo.bcom_cur_task)) {
+		spin_unlock_irqrestore(&lpbfifo.lock, flags);
+
+		req->buffer_not_done_cnt++;
+		if ((req->buffer_not_done_cnt % 1000) == 0)
+			pr_err("transfer stalled\n");
+
+		return IRQ_HANDLED;
+	}
+
+	bcom_retrieve_buffer(lpbfifo.bcom_cur_task, &status, NULL);
+
+	req->last_byte = ((u8 *)req->data)[req->size - 1];
+
+	req->pos = status & 0x00ffffff;
+
+	/* Mark the FIFO as idle */
+	lpbfifo.req = NULL;
+
+	/* Release the lock before calling out to the callback. */
+	req->irq_ticks += get_tbl() - ts;
+	spin_unlock_irqrestore(&lpbfifo.lock, flags);
+
+	if (req->callback)
+		req->callback(req);
+
+	return IRQ_HANDLED;
+}
+
+/**
+ * mpc52xx_lpbfifo_bcom_poll - Poll for DMA completion
+ */
+void mpc52xx_lpbfifo_poll(void)
+{
+	struct mpc52xx_lpbfifo_request *req = lpbfifo.req;
+	int dma = !(req->flags & MPC52XX_LPBFIFO_FLAG_NO_DMA);
+	int write = req->flags & MPC52XX_LPBFIFO_FLAG_WRITE;
+
+	/*
+	 * For more information, see comments on the "Fat Lady" 
+	 */
+	if (dma && write)
+		mpc52xx_lpbfifo_irq(0, NULL);
+	else 
+		mpc52xx_lpbfifo_bcom_irq(0, NULL);
+}
+EXPORT_SYMBOL(mpc52xx_lpbfifo_poll);
+
+/**
+ * mpc52xx_lpbfifo_submit - Submit an LPB FIFO transfer request.
+ * @req: Pointer to request structure
+ */
+int mpc52xx_lpbfifo_submit(struct mpc52xx_lpbfifo_request *req)
+{
+	unsigned long flags;
+
+	if (!lpbfifo.regs)
+		return -ENODEV;
+
+	spin_lock_irqsave(&lpbfifo.lock, flags);
+
+	/* If the req pointer is already set, then a transfer is in progress */
+	if (lpbfifo.req) {
+		spin_unlock_irqrestore(&lpbfifo.lock, flags);
+		return -EBUSY;
+	}
+
+	/* Setup the transfer */
+	lpbfifo.req = req;
+	req->irq_count = 0;
+	req->irq_ticks = 0;
+	req->buffer_not_done_cnt = 0;
+	req->pos = 0;
+
+	mpc52xx_lpbfifo_kick(req);
+	spin_unlock_irqrestore(&lpbfifo.lock, flags);
+	return 0;
+}
+EXPORT_SYMBOL(mpc52xx_lpbfifo_submit);
+
+void mpc52xx_lpbfifo_abort(struct mpc52xx_lpbfifo_request *req)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&lpbfifo.lock, flags);
+	if (lpbfifo.req == req) {
+		/* Put it into reset and clear the state */
+		bcom_gen_bd_rx_reset(lpbfifo.bcom_rx_task);
+		bcom_gen_bd_tx_reset(lpbfifo.bcom_tx_task);
+		out_be32(lpbfifo.regs + LPBFIFO_REG_ENABLE, 0x01010000);
+		lpbfifo.req = NULL;
+	}
+	spin_unlock_irqrestore(&lpbfifo.lock, flags);
+}
+EXPORT_SYMBOL(mpc52xx_lpbfifo_abort);
+
+static int __devinit
+mpc52xx_lpbfifo_probe(struct of_device *op, const struct of_device_id *match)
+{
+	struct resource res;
+	int rc = -ENOMEM;
+
+	if (lpbfifo.dev != NULL)
+		return -ENOSPC;
+
+	lpbfifo.irq = irq_of_parse_and_map(op->node, 0);
+	if (!lpbfifo.irq)
+		return -ENODEV;
+
+	if (of_address_to_resource(op->node, 0, &res))
+		return -ENODEV;
+	lpbfifo.regs_phys = res.start;
+	lpbfifo.regs = of_iomap(op->node, 0);
+	if (!lpbfifo.regs)
+		return -ENOMEM;
+
+	spin_lock_init(&lpbfifo.lock);
+
+	/* Put FIFO into reset */
+	out_be32(lpbfifo.regs + LPBFIFO_REG_ENABLE, 0x01010000);
+
+	/* Register the interrupt handler */
+	rc = request_irq(lpbfifo.irq, mpc52xx_lpbfifo_irq, 0,
+			 "mpc52xx-lpbfifo", &lpbfifo);
+	if (rc)
+		goto err_irq;
+
+	/* Request the Bestcomm receive (fifo --> memory) task and IRQ */
+	lpbfifo.bcom_rx_task =
+		bcom_gen_bd_rx_init(2, res.start + LPBFIFO_REG_FIFO_DATA,
+				    BCOM_INITIATOR_SCLPC, BCOM_IPR_SCLPC,
+				    16*1024*1024);
+	if (!lpbfifo.bcom_rx_task)
+		goto err_bcom_rx;
+
+	rc = request_irq(bcom_get_task_irq(lpbfifo.bcom_rx_task),
+			 mpc52xx_lpbfifo_bcom_irq, 0,
+			 "mpc52xx-lpbfifo-rx", &lpbfifo);
+	if (rc)
+		goto err_bcom_rx_irq;
+
+	/* Request the Bestcomm transmit (memory --> fifo) task and IRQ */
+	lpbfifo.bcom_tx_task =
+		bcom_gen_bd_tx_init(2, res.start + LPBFIFO_REG_FIFO_DATA,
+				    BCOM_INITIATOR_SCLPC, BCOM_IPR_SCLPC);
+	if (!lpbfifo.bcom_tx_task)
+		goto err_bcom_tx;
+
+	lpbfifo.dev = &op->dev;
+	return 0;
+
+ err_bcom_tx:
+	free_irq(bcom_get_task_irq(lpbfifo.bcom_rx_task), &lpbfifo);
+ err_bcom_rx_irq:
+	bcom_gen_bd_rx_release(lpbfifo.bcom_rx_task);
+ err_bcom_rx:
+ err_irq:
+	iounmap(lpbfifo.regs);
+	lpbfifo.regs = NULL;
+
+	dev_err(&op->dev, "mpc52xx_lpbfifo_probe() failed\n");
+	return -ENODEV;
+}
+
+
+static int __devexit mpc52xx_lpbfifo_remove(struct of_device *op)
+{
+	if (lpbfifo.dev != &op->dev)
+		return 0;
+
+	/* Put FIFO in reset */
+	out_be32(lpbfifo.regs + LPBFIFO_REG_ENABLE, 0x01010000);
+
+	/* Release the bestcomm transmit task */
+	free_irq(bcom_get_task_irq(lpbfifo.bcom_tx_task), &lpbfifo);
+	bcom_gen_bd_tx_release(lpbfifo.bcom_tx_task);
+	
+	/* Release the bestcomm receive task */
+	free_irq(bcom_get_task_irq(lpbfifo.bcom_rx_task), &lpbfifo);
+	bcom_gen_bd_rx_release(lpbfifo.bcom_rx_task);
+
+	free_irq(lpbfifo.irq, &lpbfifo);
+	iounmap(lpbfifo.regs);
+	lpbfifo.regs = NULL;
+	lpbfifo.dev = NULL;
+
+	return 0;
+}
+
+static struct of_device_id mpc52xx_lpbfifo_match[] __devinitconst = {
+	{ .compatible = "fsl,mpc5200-lpbfifo", },
+	{},
+};
+
+static struct of_platform_driver mpc52xx_lpbfifo_driver = {
+	.owner = THIS_MODULE,
+	.name = "mpc52xx-lpbfifo",
+	.match_table = mpc52xx_lpbfifo_match,
+	.probe = mpc52xx_lpbfifo_probe,
+	.remove = __devexit_p(mpc52xx_lpbfifo_remove),
+};
+
+/***********************************************************************
+ * Module init/exit
+ */
+static int __init mpc52xx_lpbfifo_init(void)
+{
+	pr_debug("Registering LocalPlus bus FIFO driver\n");
+	return of_register_platform_driver(&mpc52xx_lpbfifo_driver);
+}
+module_init(mpc52xx_lpbfifo_init);
+
+static void __exit mpc52xx_lpbfifo_exit(void)
+{
+	pr_debug("Unregistering LocalPlus bus FIFO driver\n");
+	of_unregister_platform_driver(&mpc52xx_lpbfifo_driver);
+}
+module_exit(mpc52xx_lpbfifo_exit);
-- 
cgit v1.2.3-59-g8ed1b


From ec3c11aa5f9d0a7f48f46d6790c33ccc654fd6ec Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Fri, 30 Oct 2009 05:47:02 +0000
Subject: Pass PVR in sregs

Right now sregs is unused on PPC, so we can use it for initialization
of the CPU.

KVM on BookE always virtualizes the host CPU. On Book3s we go a step further
and take the PVR from userspace that tells us what kind of CPU we are supposed
to virtualize, because we support Book3s_32 and Book3s_64 guests.

In order to get that information, we use the sregs ioctl, because we don't
want to reset the guest CPU on every normal register set.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/kvm.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index bb2de6aa5ce0..c9ca97f43bc1 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -46,6 +46,8 @@ struct kvm_regs {
 };
 
 struct kvm_sregs {
+	__u32 pvr;
+	char pad[1020];
 };
 
 struct kvm_fpu {
-- 
cgit v1.2.3-59-g8ed1b


From 83cd259d8e5b9878be0535f7ddd326676172279a Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Fri, 30 Oct 2009 05:47:03 +0000
Subject: Add Book3s definitions

We need quite a bunch of new constants for KVM on Book3s,
so let's define them now.

These constants will be used in later patches.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/kvm_asm.h | 39 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 56bfae59837f..19ddb352fd0f 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -49,6 +49,45 @@
 #define BOOKE_INTERRUPT_SPE_FP_ROUND 34
 #define BOOKE_INTERRUPT_PERFORMANCE_MONITOR 35
 
+/* book3s */
+
+#define BOOK3S_INTERRUPT_SYSTEM_RESET	0x100
+#define BOOK3S_INTERRUPT_MACHINE_CHECK	0x200
+#define BOOK3S_INTERRUPT_DATA_STORAGE	0x300
+#define BOOK3S_INTERRUPT_DATA_SEGMENT	0x380
+#define BOOK3S_INTERRUPT_INST_STORAGE	0x400
+#define BOOK3S_INTERRUPT_INST_SEGMENT	0x480
+#define BOOK3S_INTERRUPT_EXTERNAL	0x500
+#define BOOK3S_INTERRUPT_ALIGNMENT	0x600
+#define BOOK3S_INTERRUPT_PROGRAM	0x700
+#define BOOK3S_INTERRUPT_FP_UNAVAIL	0x800
+#define BOOK3S_INTERRUPT_DECREMENTER	0x900
+#define BOOK3S_INTERRUPT_SYSCALL	0xc00
+#define BOOK3S_INTERRUPT_TRACE		0xd00
+#define BOOK3S_INTERRUPT_PERFMON	0xf00
+#define BOOK3S_INTERRUPT_ALTIVEC	0xf20
+#define BOOK3S_INTERRUPT_VSX		0xf40
+
+#define BOOK3S_IRQPRIO_SYSTEM_RESET		0
+#define BOOK3S_IRQPRIO_DATA_SEGMENT		1
+#define BOOK3S_IRQPRIO_INST_SEGMENT		2
+#define BOOK3S_IRQPRIO_DATA_STORAGE		3
+#define BOOK3S_IRQPRIO_INST_STORAGE		4
+#define BOOK3S_IRQPRIO_ALIGNMENT		5
+#define BOOK3S_IRQPRIO_PROGRAM			6
+#define BOOK3S_IRQPRIO_FP_UNAVAIL		7
+#define BOOK3S_IRQPRIO_ALTIVEC			8
+#define BOOK3S_IRQPRIO_VSX			9
+#define BOOK3S_IRQPRIO_SYSCALL			10
+#define BOOK3S_IRQPRIO_MACHINE_CHECK		11
+#define BOOK3S_IRQPRIO_DEBUG			12
+#define BOOK3S_IRQPRIO_EXTERNAL			13
+#define BOOK3S_IRQPRIO_DECREMENTER		14
+#define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR	15
+#define BOOK3S_IRQPRIO_MAX			16
+
+#define BOOK3S_HFLAG_DCBZ32			0x1
+
 #define RESUME_FLAG_NV          (1<<0)  /* Reload guest nonvolatile state? */
 #define RESUME_FLAG_HOST        (1<<1)  /* Resume host? */
 
-- 
cgit v1.2.3-59-g8ed1b


From ca95150b3a9f3f3146a686296f2156a7ec6e98e9 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Fri, 30 Oct 2009 05:47:04 +0000
Subject: Add Book3s fields to vcpu structs

We need to store more information than we currently have for vcpus
when running on Book3s.

So let's extend the internal struct definitions.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/kvm_host.h | 73 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 72 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index c9c930ed11d7..2cff5fe0cbe6 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -37,6 +37,8 @@
 #define KVM_NR_PAGE_SIZES	1
 #define KVM_PAGES_PER_HPAGE(x)	(1UL<<31)
 
+#define HPTEG_CACHE_NUM 1024
+
 struct kvm;
 struct kvm_run;
 struct kvm_vcpu;
@@ -63,6 +65,17 @@ struct kvm_vcpu_stat {
 	u32 dec_exits;
 	u32 ext_intr_exits;
 	u32 halt_wakeup;
+#ifdef CONFIG_PPC64
+	u32 pf_storage;
+	u32 pf_instruc;
+	u32 sp_storage;
+	u32 sp_instruc;
+	u32 queue_intr;
+	u32 ld;
+	u32 ld_slow;
+	u32 st;
+	u32 st_slow;
+#endif
 };
 
 enum kvm_exit_types {
@@ -109,9 +122,53 @@ struct kvmppc_exit_timing {
 struct kvm_arch {
 };
 
+struct kvmppc_pte {
+	u64 eaddr;
+	u64 vpage;
+	u64 raddr;
+	bool may_read;
+	bool may_write;
+	bool may_execute;
+};
+
+struct kvmppc_mmu {
+	/* book3s_64 only */
+	void (*slbmte)(struct kvm_vcpu *vcpu, u64 rb, u64 rs);
+	u64  (*slbmfee)(struct kvm_vcpu *vcpu, u64 slb_nr);
+	u64  (*slbmfev)(struct kvm_vcpu *vcpu, u64 slb_nr);
+	void (*slbie)(struct kvm_vcpu *vcpu, u64 slb_nr);
+	void (*slbia)(struct kvm_vcpu *vcpu);
+	/* book3s */
+	void (*mtsrin)(struct kvm_vcpu *vcpu, u32 srnum, ulong value);
+	u32  (*mfsrin)(struct kvm_vcpu *vcpu, u32 srnum);
+	int  (*xlate)(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *pte, bool data);
+	void (*reset_msr)(struct kvm_vcpu *vcpu);
+	void (*tlbie)(struct kvm_vcpu *vcpu, ulong addr, bool large);
+	int  (*esid_to_vsid)(struct kvm_vcpu *vcpu, u64 esid, u64 *vsid);
+	u64  (*ea_to_vp)(struct kvm_vcpu *vcpu, gva_t eaddr, bool data);
+	bool (*is_dcbz32)(struct kvm_vcpu *vcpu);
+};
+
+struct hpte_cache {
+	u64 host_va;
+	u64 pfn;
+	ulong slot;
+	struct kvmppc_pte pte;
+};
+
 struct kvm_vcpu_arch {
-	u32 host_stack;
+	ulong host_stack;
 	u32 host_pid;
+#ifdef CONFIG_PPC64
+	ulong host_msr;
+	ulong host_r2;
+	void *host_retip;
+	ulong trampoline_lowmem;
+	ulong trampoline_enter;
+	ulong highmem_handler;
+	ulong host_paca_phys;
+	struct kvmppc_mmu mmu;
+#endif
 
 	u64 fpr[32];
 	ulong gpr[32];
@@ -123,6 +180,10 @@ struct kvm_vcpu_arch {
 	ulong xer;
 
 	ulong msr;
+#ifdef CONFIG_PPC64
+	ulong shadow_msr;
+	ulong hflags;
+#endif
 	u32 mmucr;
 	ulong sprg0;
 	ulong sprg1;
@@ -149,6 +210,7 @@ struct kvm_vcpu_arch {
 	u32 ivor[64];
 	ulong ivpr;
 	u32 pir;
+	u32 pvr;
 
 	u32 shadow_pid;
 	u32 pid;
@@ -174,6 +236,9 @@ struct kvm_vcpu_arch {
 #endif
 
 	u32 last_inst;
+#ifdef CONFIG_PPC64
+	ulong fault_dsisr;
+#endif
 	ulong fault_dear;
 	ulong fault_esr;
 	gpa_t paddr_accessed;
@@ -186,7 +251,13 @@ struct kvm_vcpu_arch {
 	u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
 
 	struct timer_list dec_timer;
+	u64 dec_jiffies;
 	unsigned long pending_exceptions;
+
+#ifdef CONFIG_PPC64
+	struct hpte_cache hpte_cache[HPTEG_CACHE_NUM];
+	int hpte_cache_offset;
+#endif
 };
 
 #endif /* __POWERPC_KVM_HOST_H__ */
-- 
cgit v1.2.3-59-g8ed1b


From 4e342025e625a7271be0a9e2d20b7caf1ab70e8a Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Fri, 30 Oct 2009 05:47:05 +0000
Subject: Add asm/kvm_book3s.h

This adds the book3s specific header file that contains structs that
are only valid on book3s specific code.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/kvm_book3s.h | 136 ++++++++++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)
 create mode 100644 arch/powerpc/include/asm/kvm_book3s.h

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
new file mode 100644
index 000000000000..c6011336371e
--- /dev/null
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -0,0 +1,136 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#ifndef __ASM_KVM_BOOK3S_H__
+#define __ASM_KVM_BOOK3S_H__
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <asm/kvm_ppc.h>
+
+struct kvmppc_slb {
+	u64 esid;
+	u64 vsid;
+	u64 orige;
+	u64 origv;
+	bool valid;
+	bool Ks;
+	bool Kp;
+	bool nx;
+	bool large;
+	bool class;
+};
+
+struct kvmppc_sr {
+	u32 raw;
+	u32 vsid;
+	bool Ks;
+	bool Kp;
+	bool nx;
+};
+
+struct kvmppc_bat {
+	u32 bepi;
+	u32 bepi_mask;
+	bool vs;
+	bool vp;
+	u32 brpn;
+	u8 wimg;
+	u8 pp;
+};
+
+struct kvmppc_sid_map {
+	u64 guest_vsid;
+	u64 guest_esid;
+	u64 host_vsid;
+	bool valid;
+};
+
+#define SID_MAP_BITS    9
+#define SID_MAP_NUM     (1 << SID_MAP_BITS)
+#define SID_MAP_MASK    (SID_MAP_NUM - 1)
+
+struct kvmppc_vcpu_book3s {
+	struct kvm_vcpu vcpu;
+	struct kvmppc_sid_map sid_map[SID_MAP_NUM];
+	struct kvmppc_slb slb[64];
+	struct {
+		u64 esid;
+		u64 vsid;
+	} slb_shadow[64];
+	u8 slb_shadow_max;
+	struct kvmppc_sr sr[16];
+	struct kvmppc_bat ibat[8];
+	struct kvmppc_bat dbat[8];
+	u64 hid[6];
+	int slb_nr;
+	u64 sdr1;
+	u64 dsisr;
+	u64 hior;
+	u64 msr_mask;
+	u64 vsid_first;
+	u64 vsid_next;
+	u64 vsid_max;
+	int context_id;
+};
+
+#define CONTEXT_HOST		0
+#define CONTEXT_GUEST		1
+#define CONTEXT_GUEST_END	2
+
+#define VSID_REAL	0xfffffffffff00000
+#define VSID_REAL_DR	0xffffffffffe00000
+#define VSID_REAL_IR	0xffffffffffd00000
+#define VSID_BAT	0xffffffffffc00000
+#define VSID_PR		0x8000000000000000
+
+extern void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, u64 ea, u64 ea_mask);
+extern void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 vp, u64 vp_mask);
+extern void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, u64 pa_start, u64 pa_end);
+extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr);
+extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
+extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
+extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
+extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
+extern struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data);
+extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong eaddr, int size, void *ptr, bool data);
+extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong eaddr, int size, void *ptr);
+extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
+
+extern u32 kvmppc_trampoline_lowmem;
+extern u32 kvmppc_trampoline_enter;
+
+static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
+{
+	return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu);
+}
+
+static inline ulong dsisr(void)
+{
+	ulong r;
+	asm ( "mfdsisr %0 " : "=r" (r) );
+	return r;
+}
+
+extern void kvm_return_point(void);
+
+#define INS_DCBZ			0x7c0007ec
+
+#endif /* __ASM_KVM_BOOK3S_H__ */
-- 
cgit v1.2.3-59-g8ed1b


From 3cea8c435d0b142eb2b3dd2c411a24aa1b32bfe4 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Fri, 30 Oct 2009 05:47:06 +0000
Subject: Add Book3s_64 intercept helpers

We need to intercept interrupt vectors. To do that, let's add a file
we can always include which only activates the intercepts when we have
then configured.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/kvm_book3s_64_asm.h | 58 ++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 arch/powerpc/include/asm/kvm_book3s_64_asm.h

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/kvm_book3s_64_asm.h b/arch/powerpc/include/asm/kvm_book3s_64_asm.h
new file mode 100644
index 000000000000..2e06ee8184ef
--- /dev/null
+++ b/arch/powerpc/include/asm/kvm_book3s_64_asm.h
@@ -0,0 +1,58 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#ifndef __ASM_KVM_BOOK3S_ASM_H__
+#define __ASM_KVM_BOOK3S_ASM_H__
+
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+
+#include <asm/kvm_asm.h>
+
+.macro DO_KVM intno
+	.if (\intno == BOOK3S_INTERRUPT_SYSTEM_RESET) || \
+	    (\intno == BOOK3S_INTERRUPT_MACHINE_CHECK) || \
+	    (\intno == BOOK3S_INTERRUPT_DATA_STORAGE) || \
+	    (\intno == BOOK3S_INTERRUPT_INST_STORAGE) || \
+	    (\intno == BOOK3S_INTERRUPT_DATA_SEGMENT) || \
+	    (\intno == BOOK3S_INTERRUPT_INST_SEGMENT) || \
+	    (\intno == BOOK3S_INTERRUPT_EXTERNAL) || \
+	    (\intno == BOOK3S_INTERRUPT_ALIGNMENT) || \
+	    (\intno == BOOK3S_INTERRUPT_PROGRAM) || \
+	    (\intno == BOOK3S_INTERRUPT_FP_UNAVAIL) || \
+	    (\intno == BOOK3S_INTERRUPT_DECREMENTER) || \
+	    (\intno == BOOK3S_INTERRUPT_SYSCALL) || \
+	    (\intno == BOOK3S_INTERRUPT_TRACE) || \
+	    (\intno == BOOK3S_INTERRUPT_PERFMON) || \
+	    (\intno == BOOK3S_INTERRUPT_ALTIVEC) || \
+	    (\intno == BOOK3S_INTERRUPT_VSX)
+
+	b	kvmppc_trampoline_\intno
+kvmppc_resume_\intno:
+
+	.endif
+.endm
+
+#else
+
+.macro DO_KVM intno
+.endm
+
+#endif /* CONFIG_KVM_BOOK3S_64_HANDLER */
+
+#endif /* __ASM_KVM_BOOK3S_ASM_H__ */
-- 
cgit v1.2.3-59-g8ed1b


From 29eb61bca1e82dc59e4d9c575e6f21ce7a36d61d Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Fri, 30 Oct 2009 05:47:07 +0000
Subject: Add book3s_64 highmem asm code

This is the of entry / exit code. In order to switch between host and guest
context, we need to switch register state and call the exit code handler on
exit.

This assembly file does exactly that. To finally enter the guest it calls
into book3s_64_slb.S. On exit it gets jumped at from book3s_64_slb.S too.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/kvm_ppc.h      |   1 +
 arch/powerpc/kvm/book3s_64_interrupts.S | 392 ++++++++++++++++++++++++++++++++
 2 files changed, 393 insertions(+)
 create mode 100644 arch/powerpc/kvm/book3s_64_interrupts.S

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 2c6ee349df5e..269ee46ab028 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -39,6 +39,7 @@ enum emulation_result {
 extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
 extern char kvmppc_handlers_start[];
 extern unsigned long kvmppc_handler_len;
+extern void kvmppc_handler_highmem(void);
 
 extern void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu);
 extern int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
diff --git a/arch/powerpc/kvm/book3s_64_interrupts.S b/arch/powerpc/kvm/book3s_64_interrupts.S
new file mode 100644
index 000000000000..7b55d8094c8b
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_interrupts.S
@@ -0,0 +1,392 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+
+#define KVMPPC_HANDLE_EXIT .kvmppc_handle_exit
+#define ULONG_SIZE 8
+#define VCPU_GPR(n)     (VCPU_GPRS + (n * ULONG_SIZE))
+
+.macro mfpaca tmp_reg, src_reg, offset, vcpu_reg
+	ld	\tmp_reg, (PACA_EXMC+\offset)(r13)
+	std	\tmp_reg, VCPU_GPR(\src_reg)(\vcpu_reg)
+.endm
+
+.macro DISABLE_INTERRUPTS
+       mfmsr   r0
+       rldicl  r0,r0,48,1
+       rotldi  r0,r0,16
+       mtmsrd  r0,1
+.endm
+
+/*****************************************************************************
+ *                                                                           *
+ *     Guest entry / exit code that is in kernel module memory (highmem)     *
+ *                                                                           *
+ ****************************************************************************/
+
+/* Registers:
+ *  r3: kvm_run pointer
+ *  r4: vcpu pointer
+ */
+_GLOBAL(__kvmppc_vcpu_entry)
+
+kvm_start_entry:
+	/* Write correct stack frame */
+	mflr    r0
+	std     r0,16(r1)
+
+	/* Save host state to the stack */
+	stdu	r1, -SWITCH_FRAME_SIZE(r1)
+
+	/* Save r3 (kvm_run) and r4 (vcpu) */
+	SAVE_2GPRS(3, r1)
+
+	/* Save non-volatile registers (r14 - r31) */
+	SAVE_NVGPRS(r1)
+
+	/* Save LR */
+	mflr	r14
+	std	r14, _LINK(r1)
+
+/* XXX optimize non-volatile loading away */
+kvm_start_lightweight:
+
+	DISABLE_INTERRUPTS
+
+	/* Save R1/R2 in the PACA */
+	std	r1, PACAR1(r13)
+	std	r2, (PACA_EXMC+EX_SRR0)(r13)
+	ld	r3, VCPU_HIGHMEM_HANDLER(r4)
+	std	r3, PACASAVEDMSR(r13)
+
+	/* Load non-volatile guest state from the vcpu */
+	ld	r14, VCPU_GPR(r14)(r4)
+	ld	r15, VCPU_GPR(r15)(r4)
+	ld	r16, VCPU_GPR(r16)(r4)
+	ld	r17, VCPU_GPR(r17)(r4)
+	ld	r18, VCPU_GPR(r18)(r4)
+	ld	r19, VCPU_GPR(r19)(r4)
+	ld	r20, VCPU_GPR(r20)(r4)
+	ld	r21, VCPU_GPR(r21)(r4)
+	ld	r22, VCPU_GPR(r22)(r4)
+	ld	r23, VCPU_GPR(r23)(r4)
+	ld	r24, VCPU_GPR(r24)(r4)
+	ld	r25, VCPU_GPR(r25)(r4)
+	ld	r26, VCPU_GPR(r26)(r4)
+	ld	r27, VCPU_GPR(r27)(r4)
+	ld	r28, VCPU_GPR(r28)(r4)
+	ld	r29, VCPU_GPR(r29)(r4)
+	ld	r30, VCPU_GPR(r30)(r4)
+	ld	r31, VCPU_GPR(r31)(r4)
+
+	ld	r9, VCPU_PC(r4)			/* r9 = vcpu->arch.pc */
+	ld	r10, VCPU_SHADOW_MSR(r4)	/* r10 = vcpu->arch.shadow_msr */
+
+	ld	r3, VCPU_TRAMPOLINE_ENTER(r4)
+	mtsrr0	r3
+
+	LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR))
+	mtsrr1	r3
+
+	/* Load guest state in the respective registers */
+	lwz	r3, VCPU_CR(r4)		/* r3 = vcpu->arch.cr */
+	stw	r3, (PACA_EXMC + EX_CCR)(r13)
+
+	ld	r3, VCPU_CTR(r4)	/* r3 = vcpu->arch.ctr */
+	mtctr	r3			/* CTR = r3 */
+
+	ld	r3, VCPU_LR(r4)		/* r3 = vcpu->arch.lr */
+	mtlr	r3			/* LR = r3 */
+
+	ld	r3, VCPU_XER(r4)	/* r3 = vcpu->arch.xer */
+	std	r3, (PACA_EXMC + EX_R3)(r13)
+
+	/* Some guests may need to have dcbz set to 32 byte length.
+	 *
+	 * Usually we ensure that by patching the guest's instructions
+	 * to trap on dcbz and emulate it in the hypervisor.
+	 *
+	 * If we can, we should tell the CPU to use 32 byte dcbz though,
+	 * because that's a lot faster.
+	 */
+
+	ld	r3, VCPU_HFLAGS(r4)
+	rldicl.	r3, r3, 0, 63		/* CR = ((r3 & 1) == 0) */
+	beq	no_dcbz32_on
+
+	mfspr   r3,SPRN_HID5
+	ori     r3, r3, 0x80		/* XXX HID5_dcbz32 = 0x80 */
+	mtspr   SPRN_HID5,r3
+
+no_dcbz32_on:
+	/*	Load guest GPRs */
+
+	ld	r3, VCPU_GPR(r9)(r4)
+	std	r3, (PACA_EXMC + EX_R9)(r13)
+	ld	r3, VCPU_GPR(r10)(r4)
+	std	r3, (PACA_EXMC + EX_R10)(r13)
+	ld	r3, VCPU_GPR(r11)(r4)
+	std	r3, (PACA_EXMC + EX_R11)(r13)
+	ld	r3, VCPU_GPR(r12)(r4)
+	std	r3, (PACA_EXMC + EX_R12)(r13)
+	ld	r3, VCPU_GPR(r13)(r4)
+	std	r3, (PACA_EXMC + EX_R13)(r13)
+
+	ld	r0, VCPU_GPR(r0)(r4)
+	ld	r1, VCPU_GPR(r1)(r4)
+	ld	r2, VCPU_GPR(r2)(r4)
+	ld	r3, VCPU_GPR(r3)(r4)
+	ld	r5, VCPU_GPR(r5)(r4)
+	ld	r6, VCPU_GPR(r6)(r4)
+	ld	r7, VCPU_GPR(r7)(r4)
+	ld	r8, VCPU_GPR(r8)(r4)
+	ld	r4, VCPU_GPR(r4)(r4)
+
+	/* This sets the Magic value for the trampoline */
+
+	li	r11, 1
+	stb	r11, PACA_KVM_IN_GUEST(r13)
+
+	/* Jump to SLB patching handlder and into our guest */
+	RFI
+
+/*
+ * This is the handler in module memory. It gets jumped at from the
+ * lowmem trampoline code, so it's basically the guest exit code.
+ *
+ */
+
+.global kvmppc_handler_highmem
+kvmppc_handler_highmem:
+
+	/*
+	 * Register usage at this point:
+	 *
+	 * R00   = guest R13
+	 * R01   = host R1
+	 * R02   = host R2
+	 * R10   = guest PC
+	 * R11   = guest MSR
+	 * R12   = exit handler id
+	 * R13   = PACA
+	 * PACA.exmc.R9    = guest R1
+	 * PACA.exmc.R10   = guest R10
+	 * PACA.exmc.R11   = guest R11
+	 * PACA.exmc.R12   = guest R12
+	 * PACA.exmc.R13   = guest R2
+	 * PACA.exmc.DAR   = guest DAR
+	 * PACA.exmc.DSISR = guest DSISR
+	 * PACA.exmc.LR    = guest instruction
+	 * PACA.exmc.CCR   = guest CR
+	 * PACA.exmc.SRR0  = guest R0
+	 *
+	 */
+
+	std	r3, (PACA_EXMC+EX_R3)(r13)
+
+	/* save the exit id in R3 */
+	mr	r3, r12
+
+	/* R12 = vcpu */
+	ld	r12, GPR4(r1)
+
+	/* Now save the guest state */
+
+	std	r0, VCPU_GPR(r13)(r12)
+	std	r4, VCPU_GPR(r4)(r12)
+	std	r5, VCPU_GPR(r5)(r12)
+	std	r6, VCPU_GPR(r6)(r12)
+	std	r7, VCPU_GPR(r7)(r12)
+	std	r8, VCPU_GPR(r8)(r12)
+	std	r9, VCPU_GPR(r9)(r12)
+
+	/* get registers from PACA */
+	mfpaca	r5, r0, EX_SRR0, r12
+	mfpaca	r5, r3, EX_R3, r12
+	mfpaca	r5, r1, EX_R9, r12
+	mfpaca	r5, r10, EX_R10, r12
+	mfpaca	r5, r11, EX_R11, r12
+	mfpaca	r5, r12, EX_R12, r12
+	mfpaca	r5, r2, EX_R13, r12
+
+	lwz	r5, (PACA_EXMC+EX_LR)(r13)
+	stw	r5, VCPU_LAST_INST(r12)
+
+	lwz	r5, (PACA_EXMC+EX_CCR)(r13)
+	stw	r5, VCPU_CR(r12)
+
+	ld	r5, VCPU_HFLAGS(r12)
+	rldicl.	r5, r5, 0, 63		/* CR = ((r5 & 1) == 0) */
+	beq	no_dcbz32_off
+
+	mfspr   r5,SPRN_HID5
+	rldimi  r5,r5,6,56
+	mtspr   SPRN_HID5,r5
+
+no_dcbz32_off:
+
+	/* XXX maybe skip on lightweight? */
+	std	r14, VCPU_GPR(r14)(r12)
+	std	r15, VCPU_GPR(r15)(r12)
+	std	r16, VCPU_GPR(r16)(r12)
+	std	r17, VCPU_GPR(r17)(r12)
+	std	r18, VCPU_GPR(r18)(r12)
+	std	r19, VCPU_GPR(r19)(r12)
+	std	r20, VCPU_GPR(r20)(r12)
+	std	r21, VCPU_GPR(r21)(r12)
+	std	r22, VCPU_GPR(r22)(r12)
+	std	r23, VCPU_GPR(r23)(r12)
+	std	r24, VCPU_GPR(r24)(r12)
+	std	r25, VCPU_GPR(r25)(r12)
+	std	r26, VCPU_GPR(r26)(r12)
+	std	r27, VCPU_GPR(r27)(r12)
+	std	r28, VCPU_GPR(r28)(r12)
+	std	r29, VCPU_GPR(r29)(r12)
+	std	r30, VCPU_GPR(r30)(r12)
+	std	r31, VCPU_GPR(r31)(r12)
+
+	/* Restore non-volatile host registers (r14 - r31) */
+	REST_NVGPRS(r1)
+
+	/* Save guest PC (R10) */
+	std	r10, VCPU_PC(r12)
+
+	/* Save guest msr (R11) */
+	std	r11, VCPU_SHADOW_MSR(r12)
+
+	/* Save guest CTR (in R12) */
+	mfctr	r5
+	std	r5, VCPU_CTR(r12)
+
+	/* Save guest LR */
+	mflr	r5
+	std	r5, VCPU_LR(r12)
+
+	/* Save guest XER */
+	mfxer	r5
+	std	r5, VCPU_XER(r12)
+
+	/* Save guest DAR */
+	ld	r5, (PACA_EXMC+EX_DAR)(r13)
+	std	r5, VCPU_FAULT_DEAR(r12)
+
+	/* Save guest DSISR */
+	lwz	r5, (PACA_EXMC+EX_DSISR)(r13)
+	std	r5, VCPU_FAULT_DSISR(r12)
+
+	/* Restore host msr -> SRR1 */
+	ld	r7, VCPU_HOST_MSR(r12)
+	mtsrr1	r7
+
+	/* Restore host IP -> SRR0 */
+	ld	r6, VCPU_HOST_RETIP(r12)
+	mtsrr0	r6
+
+	/*
+	 * For some interrupts, we need to call the real Linux
+	 * handler, so it can do work for us. This has to happen
+	 * as if the interrupt arrived from the kernel though,
+	 * so let's fake it here where most state is restored.
+	 *
+	 * Call Linux for hardware interrupts/decrementer
+	 * r3 = address of interrupt handler (exit reason)
+	 */
+
+	cmpwi	r3, BOOK3S_INTERRUPT_EXTERNAL
+	beq	call_linux_handler
+	cmpwi	r3, BOOK3S_INTERRUPT_DECREMENTER
+	beq	call_linux_handler
+
+	/* Back to Interruptable Mode! (goto kvm_return_point) */
+	RFI
+
+call_linux_handler:
+
+	/*
+	 * If we land here we need to jump back to the handler we
+	 * came from.
+	 *
+	 * We have a page that we can access from real mode, so let's
+	 * jump back to that and use it as a trampoline to get back into the
+	 * interrupt handler!
+	 *
+	 * R3 still contains the exit code,
+	 * R6 VCPU_HOST_RETIP and
+	 * R7 VCPU_HOST_MSR
+	 */
+
+	mtlr	r3
+
+	ld	r5, VCPU_TRAMPOLINE_LOWMEM(r12)
+	mtsrr0	r5
+	LOAD_REG_IMMEDIATE(r5, MSR_KERNEL & ~(MSR_IR | MSR_DR))
+	mtsrr1	r5
+
+	RFI
+
+.global kvm_return_point
+kvm_return_point:
+
+	/* Jump back to lightweight entry if we're supposed to */
+	/* go back into the guest */
+	mr	r5, r3
+	/* Restore r3 (kvm_run) and r4 (vcpu) */
+	REST_2GPRS(3, r1)
+	bl	KVMPPC_HANDLE_EXIT
+
+#if 0 /* XXX get lightweight exits back */
+	cmpwi	r3, RESUME_GUEST
+	bne	kvm_exit_heavyweight
+
+	/* put VCPU and KVM_RUN back into place and roll again! */
+	REST_2GPRS(3, r1)
+	b	kvm_start_lightweight
+
+kvm_exit_heavyweight:
+	/* Restore non-volatile host registers */
+	ld	r14, _LINK(r1)
+	mtlr	r14
+	REST_NVGPRS(r1)
+
+	addi    r1, r1, SWITCH_FRAME_SIZE
+#else
+	ld	r4, _LINK(r1)
+	mtlr	r4
+
+	cmpwi	r3, RESUME_GUEST
+	bne	kvm_exit_heavyweight
+
+	REST_2GPRS(3, r1)
+
+	addi    r1, r1, SWITCH_FRAME_SIZE
+
+	b	kvm_start_entry
+
+kvm_exit_heavyweight:
+
+	addi    r1, r1, SWITCH_FRAME_SIZE
+#endif
+
+	blr
-- 
cgit v1.2.3-59-g8ed1b


From 842f2fedcdc4f9ea8e6ac5b2222971c31666dd3e Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Fri, 30 Oct 2009 05:47:17 +0000
Subject: Make head_64.S aware of KVM real mode code

We need to run some KVM trampoline code in real mode. Unfortunately, real mode
only covers 8MB on Cell so we need to squeeze ourselves as low as possible.

Also, we need to trap interrupts to get us back from guest state to host state
without telling Linux about it.

This patch adds interrupt traps and includes the KVM code that requires real
mode in the real mode parts of Linux.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/exception-64s.h | 2 ++
 arch/powerpc/kernel/exceptions-64s.S     | 8 ++++++++
 arch/powerpc/kernel/head_64.S            | 7 +++++++
 3 files changed, 17 insertions(+)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index a98653b26231..57c400071995 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -147,6 +147,7 @@
 	.globl label##_pSeries;				\
 label##_pSeries:					\
 	HMT_MEDIUM;					\
+	DO_KVM	n;					\
 	mtspr	SPRN_SPRG_SCRATCH0,r13;		/* save r13 */	\
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common)
 
@@ -170,6 +171,7 @@ label##_pSeries:					\
 	.globl label##_pSeries;						\
 label##_pSeries:							\
 	HMT_MEDIUM;							\
+	DO_KVM	n;							\
 	mtspr	SPRN_SPRG_SCRATCH0,r13;	/* save r13 */			\
 	mfspr	r13,SPRN_SPRG_PACA;	/* get paca address into r13 */	\
 	std	r9,PACA_EXGEN+EX_R9(r13);	/* save r9, r10 */	\
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 1808876edcc9..fc3ead066cec 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -41,6 +41,7 @@ __start_interrupts:
 	. = 0x200
 _machine_check_pSeries:
 	HMT_MEDIUM
+	DO_KVM	0x200
 	mtspr	SPRN_SPRG_SCRATCH0,r13		/* save r13 */
 	EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common)
 
@@ -48,6 +49,7 @@ _machine_check_pSeries:
 	.globl data_access_pSeries
 data_access_pSeries:
 	HMT_MEDIUM
+	DO_KVM	0x300
 	mtspr	SPRN_SPRG_SCRATCH0,r13
 BEGIN_FTR_SECTION
 	mfspr	r13,SPRN_SPRG_PACA
@@ -77,6 +79,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_SLB)
 	.globl data_access_slb_pSeries
 data_access_slb_pSeries:
 	HMT_MEDIUM
+	DO_KVM	0x380
 	mtspr	SPRN_SPRG_SCRATCH0,r13
 	mfspr	r13,SPRN_SPRG_PACA		/* get paca address into r13 */
 	std	r3,PACA_EXSLB+EX_R3(r13)
@@ -115,6 +118,7 @@ data_access_slb_pSeries:
 	.globl instruction_access_slb_pSeries
 instruction_access_slb_pSeries:
 	HMT_MEDIUM
+	DO_KVM	0x480
 	mtspr	SPRN_SPRG_SCRATCH0,r13
 	mfspr	r13,SPRN_SPRG_PACA		/* get paca address into r13 */
 	std	r3,PACA_EXSLB+EX_R3(r13)
@@ -154,6 +158,7 @@ instruction_access_slb_pSeries:
 	.globl	system_call_pSeries
 system_call_pSeries:
 	HMT_MEDIUM
+	DO_KVM	0xc00
 BEGIN_FTR_SECTION
 	cmpdi	r0,0x1ebe
 	beq-	1f
@@ -186,12 +191,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
 	 * trickery is thus necessary
 	 */
 	. = 0xf00
+	DO_KVM	0xf00
 	b	performance_monitor_pSeries
 
 	. = 0xf20
+	DO_KVM	0xf20
 	b	altivec_unavailable_pSeries
 
 	. = 0xf40
+	DO_KVM	0xf40
 	b	vsx_unavailable_pSeries
 
 #ifdef CONFIG_CBE_RAS
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index c38afdb45d7b..925807488022 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -37,6 +37,7 @@
 #include <asm/firmware.h>
 #include <asm/page_64.h>
 #include <asm/irqflags.h>
+#include <asm/kvm_book3s_64_asm.h>
 
 /* The physical memory is layed out such that the secondary processor
  * spin code sits at 0x0000...0x00ff. On server, the vectors follow
@@ -165,6 +166,12 @@ exception_marker:
 #include "exceptions-64s.S"
 #endif
 
+/* KVM trampoline code needs to be close to the interrupt handlers */
+
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#include "../kvm/book3s_64_rmhandlers.S"
+#endif
+
 _GLOBAL(generic_secondary_thread_init)
 	mr	r24,r3
 
-- 
cgit v1.2.3-59-g8ed1b


From e85a47106abb928e048d89d7fa48f982fcb018aa Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 2 Nov 2009 12:02:30 +0000
Subject: Split init_new_context and destroy_context

For KVM we need to allocate a new context id, but don't really care about
all the mm context around it.

So let's split the alloc and destroy functions for the context id, so we can
grab one without allocating an mm context.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/mmu_context.h |  2 ++
 arch/powerpc/mm/mmu_context_hash64.c   | 24 +++++++++++++++++++++---
 2 files changed, 23 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index b34e94d94435..26383e0778aa 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -23,6 +23,8 @@ extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
 extern void set_context(unsigned long id, pgd_t *pgd);
 
 #ifdef CONFIG_PPC_BOOK3S_64
+extern int __init_new_context(void);
+extern void __destroy_context(int context_id);
 static inline void mmu_context_init(void) { }
 #else
 extern void mmu_context_init(void);
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
index dbeb86ac90cd..b9e4cc2c2057 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -18,6 +18,7 @@
 #include <linux/mm.h>
 #include <linux/spinlock.h>
 #include <linux/idr.h>
+#include <linux/module.h>
 
 #include <asm/mmu_context.h>
 
@@ -32,7 +33,7 @@ static DEFINE_IDR(mmu_context_idr);
 #define NO_CONTEXT	0
 #define MAX_CONTEXT	((1UL << 19) - 1)
 
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+int __init_new_context(void)
 {
 	int index;
 	int err;
@@ -57,6 +58,18 @@ again:
 		return -ENOMEM;
 	}
 
+	return index;
+}
+EXPORT_SYMBOL_GPL(__init_new_context);
+
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+	int index;
+
+	index = __init_new_context();
+	if (index < 0)
+		return index;
+
 	/* The old code would re-promote on fork, we don't do that
 	 * when using slices as it could cause problem promoting slices
 	 * that have been forced down to 4K
@@ -68,11 +81,16 @@ again:
 	return 0;
 }
 
-void destroy_context(struct mm_struct *mm)
+void __destroy_context(int context_id)
 {
 	spin_lock(&mmu_context_lock);
-	idr_remove(&mmu_context_idr, mm->context.id);
+	idr_remove(&mmu_context_idr, context_id);
 	spin_unlock(&mmu_context_lock);
+}
+EXPORT_SYMBOL_GPL(__destroy_context);
 
+void destroy_context(struct mm_struct *mm)
+{
+	__destroy_context(mm->context.id);
 	mm->context.id = NO_CONTEXT;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 4b7ae55df3621dd9eef56a6fde953ec9c73ac596 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Fri, 30 Oct 2009 05:47:22 +0000
Subject: Add fields to PACA

For KVM we need to store some information in the PACA, so we
need to extend it.

This patch adds KVM SLB shadow related entries to the PACA and
a field that indicates if we're inside a guest.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/paca.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 7d8514ceceae..5e9b4ef71415 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -129,6 +129,15 @@ struct paca_struct {
 	u64 system_time;		/* accumulated system TB ticks */
 	u64 startpurr;			/* PURR/TB value snapshot */
 	u64 startspurr;			/* SPURR value snapshot */
+
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+	struct  {
+		u64     esid;
+		u64     vsid;
+	} kvm_slb[64];			/* guest SLB */
+	u8 kvm_slb_max;			/* highest used guest slb entry */
+	u8 kvm_in_guest;		/* are we inside the guest? */
+#endif
 };
 
 extern struct paca_struct paca[];
-- 
cgit v1.2.3-59-g8ed1b


From 544c6761bb05a1dd19a39cb9bed096273f9bdb36 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 2 Nov 2009 12:02:31 +0000
Subject: Use hrtimers for the decrementer

Following S390's good example we should use hrtimers for the decrementer too!
This patch converts the timer from the old mechanism to hrtimers.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/kvm_host.h |  6 ++++--
 arch/powerpc/kvm/emulate.c          | 18 +++++++++++-------
 arch/powerpc/kvm/powerpc.c          | 20 ++++++++++++++++++--
 3 files changed, 33 insertions(+), 11 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 2cff5fe0cbe6..1201f62d0d73 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -21,7 +21,8 @@
 #define __POWERPC_KVM_HOST_H__
 
 #include <linux/mutex.h>
-#include <linux/timer.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
 #include <linux/types.h>
 #include <linux/kvm_types.h>
 #include <asm/kvm_asm.h>
@@ -250,7 +251,8 @@ struct kvm_vcpu_arch {
 
 	u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
 
-	struct timer_list dec_timer;
+	struct hrtimer dec_timer;
+	struct tasklet_struct tasklet;
 	u64 dec_jiffies;
 	unsigned long pending_exceptions;
 
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 1ec5e07b81eb..4a9ac6640fad 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -18,7 +18,7 @@
  */
 
 #include <linux/jiffies.h>
-#include <linux/timer.h>
+#include <linux/hrtimer.h>
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/kvm_host.h>
@@ -79,12 +79,13 @@ static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu)
 
 void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
 {
-	unsigned long nr_jiffies;
+	unsigned long dec_nsec;
 
+	pr_debug("mtDEC: %x\n", vcpu->arch.dec);
 #ifdef CONFIG_PPC64
 	/* POWER4+ triggers a dec interrupt if the value is < 0 */
 	if (vcpu->arch.dec & 0x80000000) {
-		del_timer(&vcpu->arch.dec_timer);
+		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
 		kvmppc_core_queue_dec(vcpu);
 		return;
 	}
@@ -94,12 +95,15 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
 		 * that's how we convert the guest DEC value to the number of
 		 * host ticks. */
 
+		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+		dec_nsec = vcpu->arch.dec;
+		dec_nsec *= 1000;
+		dec_nsec /= tb_ticks_per_usec;
+		hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
+			      HRTIMER_MODE_REL);
 		vcpu->arch.dec_jiffies = get_tb();
-		nr_jiffies = vcpu->arch.dec / tb_ticks_per_jiffy;
-		mod_timer(&vcpu->arch.dec_timer,
-		          get_jiffies_64() + nr_jiffies);
 	} else {
-		del_timer(&vcpu->arch.dec_timer);
+		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
 	}
 }
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index a06ecc3401fd..692c3709011e 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -23,6 +23,7 @@
 #include <linux/kvm_host.h>
 #include <linux/module.h>
 #include <linux/vmalloc.h>
+#include <linux/hrtimer.h>
 #include <linux/fs.h>
 #include <asm/cputable.h>
 #include <asm/uaccess.h>
@@ -208,10 +209,25 @@ static void kvmppc_decrementer_func(unsigned long data)
 	}
 }
 
+/*
+ * low level hrtimer wake routine. Because this runs in hardirq context
+ * we schedule a tasklet to do the real work.
+ */
+enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer)
+{
+	struct kvm_vcpu *vcpu;
+
+	vcpu = container_of(timer, struct kvm_vcpu, arch.dec_timer);
+	tasklet_schedule(&vcpu->arch.tasklet);
+
+	return HRTIMER_NORESTART;
+}
+
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
-	setup_timer(&vcpu->arch.dec_timer, kvmppc_decrementer_func,
-	            (unsigned long)vcpu);
+	hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+	tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu);
+	vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
 
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From b79ddf2c2dba44114d150c7758002ef29e693086 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Thu, 27 Aug 2009 21:30:19 +0400
Subject: powerpc/qe: Add qe_upload_firmware() stub for non-QE builds

This is needed to avoid #ifdefs in MPC85xx suspend/resume code.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
---
 arch/powerpc/include/asm/qe.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/qe.h b/arch/powerpc/include/asm/qe.h
index f388f0ab193f..28fee3b7887a 100644
--- a/arch/powerpc/include/asm/qe.h
+++ b/arch/powerpc/include/asm/qe.h
@@ -210,8 +210,15 @@ struct qe_firmware_info {
 	u64 extended_modes;	/* Extended modes */
 };
 
+#ifdef CONFIG_QUICC_ENGINE
 /* Upload a firmware to the QE */
 int qe_upload_firmware(const struct qe_firmware *firmware);
+#else
+static inline int qe_upload_firmware(const struct qe_firmware *firmware)
+{
+	return -ENOSYS;
+}
+#endif /* CONFIG_QUICC_ENGINE */
 
 /* Obtain information on the uploaded firmware */
 struct qe_firmware_info *qe_get_firmware_info(void);
-- 
cgit v1.2.3-59-g8ed1b


From 0c7b87b0857f0e17be982fd840046444a83c3996 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Wed, 16 Sep 2009 01:43:52 +0400
Subject: powerpc/qe: Make qe_reset() code path safe for repeated invocation

For MPC8569 CPUs we'll need to reset QE after each suspend, so make
qe_reset() code path suitable for repeated invocation, that is:

- Don't initialize rheap structures if already initialized;
- Don't allocate muram for SDMA if already allocated, just reinitialize
  registers with previously allocated muram offset;
- Remove __init attributes from qe_reset() and cpm_muram_init();

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
---
 arch/powerpc/include/asm/qe.h    |  2 +-
 arch/powerpc/sysdev/cpm_common.c |  5 ++++-
 arch/powerpc/sysdev/qe_lib/qe.c  | 12 +++++++-----
 3 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/qe.h b/arch/powerpc/include/asm/qe.h
index 28fee3b7887a..908f0b75745b 100644
--- a/arch/powerpc/include/asm/qe.h
+++ b/arch/powerpc/include/asm/qe.h
@@ -87,7 +87,7 @@ extern spinlock_t cmxgcr_lock;
 
 /* Export QE common operations */
 #ifdef CONFIG_QUICC_ENGINE
-extern void __init qe_reset(void);
+extern void qe_reset(void);
 #else
 static inline void qe_reset(void) {}
 #endif
diff --git a/arch/powerpc/sysdev/cpm_common.c b/arch/powerpc/sysdev/cpm_common.c
index e4b6d66d93de..9de72c96e6d1 100644
--- a/arch/powerpc/sysdev/cpm_common.c
+++ b/arch/powerpc/sysdev/cpm_common.c
@@ -72,7 +72,7 @@ static phys_addr_t muram_pbase;
 /* Max address size we deal with */
 #define OF_MAX_ADDR_CELLS	4
 
-int __init cpm_muram_init(void)
+int cpm_muram_init(void)
 {
 	struct device_node *np;
 	struct resource r;
@@ -81,6 +81,9 @@ int __init cpm_muram_init(void)
 	int i = 0;
 	int ret = 0;
 
+	if (muram_pbase)
+		return 0;
+
 	spin_lock_init(&cpm_muram_lock);
 	/* initialize the info header */
 	rh_init(&cpm_muram_info, 1,
diff --git a/arch/powerpc/sysdev/qe_lib/qe.c b/arch/powerpc/sysdev/qe_lib/qe.c
index fff2701b90df..1ed1a9fd9bcf 100644
--- a/arch/powerpc/sysdev/qe_lib/qe.c
+++ b/arch/powerpc/sysdev/qe_lib/qe.c
@@ -104,7 +104,7 @@ phys_addr_t get_qe_base(void)
 
 EXPORT_SYMBOL(get_qe_base);
 
-void __init qe_reset(void)
+void qe_reset(void)
 {
 	if (qe_immr == NULL)
 		qe_immr = ioremap(get_qe_base(), QE_IMMAP_SIZE);
@@ -330,16 +330,18 @@ EXPORT_SYMBOL(qe_put_snum);
 static int qe_sdma_init(void)
 {
 	struct sdma __iomem *sdma = &qe_immr->sdma;
-	unsigned long sdma_buf_offset;
+	static unsigned long sdma_buf_offset = (unsigned long)-ENOMEM;
 
 	if (!sdma)
 		return -ENODEV;
 
 	/* allocate 2 internal temporary buffers (512 bytes size each) for
 	 * the SDMA */
- 	sdma_buf_offset = qe_muram_alloc(512 * 2, 4096);
-	if (IS_ERR_VALUE(sdma_buf_offset))
-		return -ENOMEM;
+	if (IS_ERR_VALUE(sdma_buf_offset)) {
+		sdma_buf_offset = qe_muram_alloc(512 * 2, 4096);
+		if (IS_ERR_VALUE(sdma_buf_offset))
+			return -ENOMEM;
+	}
 
 	out_be32(&sdma->sdebcr, (u32) sdma_buf_offset & QE_SDEBCR_BA_MASK);
  	out_be32(&sdma->sdmr, (QE_SDMR_GLB_1_MSK |
-- 
cgit v1.2.3-59-g8ed1b


From 46d2293470c18c1bb632083bf0b589deff30ccae Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Wed, 16 Sep 2009 01:43:54 +0400
Subject: powerpc/qe: QE also shuts down on MPC8568

It appears that QE shuts down on all MPC85xx CPUs (i.e. MPC8568 and
MPC8569) and thus needs reset upon resume.

So modify qe_alive_during_sleep() to account that.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
---
 arch/powerpc/include/asm/qe.h   | 23 ++++++++++++++++++++++-
 arch/powerpc/sysdev/qe_lib/qe.c | 13 -------------
 2 files changed, 22 insertions(+), 14 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/qe.h b/arch/powerpc/include/asm/qe.h
index 908f0b75745b..5e0e8b2b0aa1 100644
--- a/arch/powerpc/include/asm/qe.h
+++ b/arch/powerpc/include/asm/qe.h
@@ -154,7 +154,28 @@ int qe_get_snum(void);
 void qe_put_snum(u8 snum);
 unsigned int qe_get_num_of_risc(void);
 unsigned int qe_get_num_of_snums(void);
-int qe_alive_during_sleep(void);
+
+static inline int qe_alive_during_sleep(void)
+{
+	/*
+	 * MPC8568E reference manual says:
+	 *
+	 * "...power down sequence waits for all I/O interfaces to become idle.
+	 *  In some applications this may happen eventually without actively
+	 *  shutting down interfaces, but most likely, software will have to
+	 *  take steps to shut down the eTSEC, QUICC Engine Block, and PCI
+	 *  interfaces before issuing the command (either the write to the core
+	 *  MSR[WE] as described above or writing to POWMGTCSR) to put the
+	 *  device into sleep state."
+	 *
+	 * MPC8569E reference manual has a similar paragraph.
+	 */
+#ifdef CONFIG_PPC_85xx
+	return 0;
+#else
+	return 1;
+#endif
+}
 
 /* we actually use cpm_muram implementation, define this for convenience */
 #define qe_muram_init cpm_muram_init
diff --git a/arch/powerpc/sysdev/qe_lib/qe.c b/arch/powerpc/sysdev/qe_lib/qe.c
index 1ed1a9fd9bcf..4eaf2a962914 100644
--- a/arch/powerpc/sysdev/qe_lib/qe.c
+++ b/arch/powerpc/sysdev/qe_lib/qe.c
@@ -65,19 +65,6 @@ static unsigned int qe_num_of_snum;
 
 static phys_addr_t qebase = -1;
 
-int qe_alive_during_sleep(void)
-{
-	static int ret = -1;
-
-	if (ret != -1)
-		return ret;
-
-	ret = !of_find_compatible_node(NULL, NULL, "fsl,mpc8569-pmc");
-
-	return ret;
-}
-EXPORT_SYMBOL(qe_alive_during_sleep);
-
 phys_addr_t get_qe_base(void)
 {
 	struct device_node *qe;
-- 
cgit v1.2.3-59-g8ed1b


From 644b2a680ccc51a9ec4d6beb12e9d47d2dee98e2 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Mon, 12 Oct 2009 20:49:13 +0400
Subject: powerpc/cpm: Remove SPI defines and spi structs

When cpm2.h included into spi_mpc8xxx driver, the SPI defines
in the header conflict with defines in the driver.

We don't need them in the header file, so remove them. Plus
remove "struct spi", we'll use a better version in the driver.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
---
 arch/powerpc/include/asm/cpm1.h | 45 -----------------------------------------
 arch/powerpc/include/asm/cpm2.h | 39 -----------------------------------
 2 files changed, 84 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/cpm1.h b/arch/powerpc/include/asm/cpm1.h
index 7685ffde8821..81b01192f440 100644
--- a/arch/powerpc/include/asm/cpm1.h
+++ b/arch/powerpc/include/asm/cpm1.h
@@ -478,51 +478,6 @@ typedef struct iic {
 	char	res2[2];	/* Reserved */
 } iic_t;
 
-/* SPI parameter RAM.
-*/
-typedef struct spi {
-	ushort	spi_rbase;	/* Rx Buffer descriptor base address */
-	ushort	spi_tbase;	/* Tx Buffer descriptor base address */
-	u_char	spi_rfcr;	/* Rx function code */
-	u_char	spi_tfcr;	/* Tx function code */
-	ushort	spi_mrblr;	/* Max receive buffer length */
-	uint	spi_rstate;	/* Internal */
-	uint	spi_rdp;	/* Internal */
-	ushort	spi_rbptr;	/* Internal */
-	ushort	spi_rbc;	/* Internal */
-	uint	spi_rxtmp;	/* Internal */
-	uint	spi_tstate;	/* Internal */
-	uint	spi_tdp;	/* Internal */
-	ushort	spi_tbptr;	/* Internal */
-	ushort	spi_tbc;	/* Internal */
-	uint	spi_txtmp;	/* Internal */
-	uint	spi_res;
-	ushort	spi_rpbase;	/* Relocation pointer */
-	ushort	spi_res2;
-} spi_t;
-
-/* SPI Mode register.
-*/
-#define SPMODE_LOOP	((ushort)0x4000)	/* Loopback */
-#define SPMODE_CI	((ushort)0x2000)	/* Clock Invert */
-#define SPMODE_CP	((ushort)0x1000)	/* Clock Phase */
-#define SPMODE_DIV16	((ushort)0x0800)	/* BRG/16 mode */
-#define SPMODE_REV	((ushort)0x0400)	/* Reversed Data */
-#define SPMODE_MSTR	((ushort)0x0200)	/* SPI Master */
-#define SPMODE_EN	((ushort)0x0100)	/* Enable */
-#define SPMODE_LENMSK	((ushort)0x00f0)	/* character length */
-#define SPMODE_LEN4	((ushort)0x0030)	/*  4 bits per char */
-#define SPMODE_LEN8	((ushort)0x0070)	/*  8 bits per char */
-#define SPMODE_LEN16	((ushort)0x00f0)	/* 16 bits per char */
-#define SPMODE_PMMSK	((ushort)0x000f)	/* prescale modulus */
-
-/* SPIE fields */
-#define SPIE_MME	0x20
-#define SPIE_TXE	0x10
-#define SPIE_BSY	0x04
-#define SPIE_TXB	0x02
-#define SPIE_RXB	0x01
-
 /*
  * RISC Controller Configuration Register definitons
  */
diff --git a/arch/powerpc/include/asm/cpm2.h b/arch/powerpc/include/asm/cpm2.h
index 990ff191da8b..236cfa344a7c 100644
--- a/arch/powerpc/include/asm/cpm2.h
+++ b/arch/powerpc/include/asm/cpm2.h
@@ -654,45 +654,6 @@ typedef struct iic {
 	uint	iic_txtmp;	/* Internal */
 } iic_t;
 
-/* SPI parameter RAM.
-*/
-typedef struct spi {
-	ushort	spi_rbase;	/* Rx Buffer descriptor base address */
-	ushort	spi_tbase;	/* Tx Buffer descriptor base address */
-	u_char	spi_rfcr;	/* Rx function code */
-	u_char	spi_tfcr;	/* Tx function code */
-	ushort	spi_mrblr;	/* Max receive buffer length */
-	uint	spi_rstate;	/* Internal */
-	uint	spi_rdp;	/* Internal */
-	ushort	spi_rbptr;	/* Internal */
-	ushort	spi_rbc;	/* Internal */
-	uint	spi_rxtmp;	/* Internal */
-	uint	spi_tstate;	/* Internal */
-	uint	spi_tdp;	/* Internal */
-	ushort	spi_tbptr;	/* Internal */
-	ushort	spi_tbc;	/* Internal */
-	uint	spi_txtmp;	/* Internal */
-	uint	spi_res;	/* Tx temp. */
-	uint	spi_res1[4];	/* SDMA temp. */
-} spi_t;
-
-/* SPI Mode register.
-*/
-#define SPMODE_LOOP	((ushort)0x4000)	/* Loopback */
-#define SPMODE_CI	((ushort)0x2000)	/* Clock Invert */
-#define SPMODE_CP	((ushort)0x1000)	/* Clock Phase */
-#define SPMODE_DIV16	((ushort)0x0800)	/* BRG/16 mode */
-#define SPMODE_REV	((ushort)0x0400)	/* Reversed Data */
-#define SPMODE_MSTR	((ushort)0x0200)	/* SPI Master */
-#define SPMODE_EN	((ushort)0x0100)	/* Enable */
-#define SPMODE_LENMSK	((ushort)0x00f0)	/* character length */
-#define SPMODE_PMMSK	((ushort)0x000f)	/* prescale modulus */
-
-#define SPMODE_LEN(x)	((((x)-1)&0xF)<<4)
-#define SPMODE_PM(x)	((x) &0xF)
-
-#define SPI_EB		((u_char)0x10)		/* big endian byte order */
-
 /* IDMA parameter RAM
 */
 typedef struct idma {
-- 
cgit v1.2.3-59-g8ed1b


From 80952988a2c6f0aa73e9ec6554084446da8d8791 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Mon, 12 Oct 2009 20:49:16 +0400
Subject: powerpc/qe&cpm2: Avoid redefinitions in CPM2 and QE headers

struct mcc defined in both immap_qe.h and immap_cpm2.h, so they will
conflic when included in a single file. The mcc struct is easy to deal
with, since it isn't used in any driver (yet), so let's just rename QE
version to qe_mcc.

The ucb_ctlr is a bit trickier, since it is used by fsl_qe_udc driver,
and the driver supports both CPM and QE UDCs, plus the QE version is
used to form a bigger immap struct.

I don't want to touch too much of USB code in this series, so for now
let's just copy most generic version into the common cpm.h header,
later we'll create cpm_usb.h where we'll place common USB structs that
are used by QE/CPM UDC and QE Host drivers (FHCI).

And as for the structs in qe.h and cpm2.h, just prefix them with qe_
and cpm_.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
---
 arch/powerpc/include/asm/cpm.h        | 22 ++++++++++++++++++++++
 arch/powerpc/include/asm/immap_cpm2.h |  2 +-
 arch/powerpc/include/asm/immap_qe.h   |  8 ++++----
 3 files changed, 27 insertions(+), 5 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/cpm.h b/arch/powerpc/include/asm/cpm.h
index 24d79e3abd8e..b5f15340dc22 100644
--- a/arch/powerpc/include/asm/cpm.h
+++ b/arch/powerpc/include/asm/cpm.h
@@ -5,6 +5,28 @@
 #include <linux/types.h>
 #include <linux/of.h>
 
+/*
+ * USB Controller pram common to QE and CPM.
+ */
+struct usb_ctlr {
+	u8	usb_usmod;
+	u8	usb_usadr;
+	u8	usb_uscom;
+	u8	res1[1];
+	__be16	usb_usep[4];
+	u8	res2[4];
+	__be16	usb_usber;
+	u8	res3[2];
+	__be16	usb_usbmr;
+	u8	res4[1];
+	u8	usb_usbs;
+	/* Fields down below are QE-only */
+	__be16	usb_ussft;
+	u8	res5[2];
+	__be16	usb_usfrn;
+	u8	res6[0x22];
+} __attribute__ ((packed));
+
 /* Opcodes common to CPM1 and CPM2
 */
 #define CPM_CR_INIT_TRX		((ushort)0x0000)
diff --git a/arch/powerpc/include/asm/immap_cpm2.h b/arch/powerpc/include/asm/immap_cpm2.h
index d4f069bf0e57..7c64fda5357b 100644
--- a/arch/powerpc/include/asm/immap_cpm2.h
+++ b/arch/powerpc/include/asm/immap_cpm2.h
@@ -549,7 +549,7 @@ typedef struct comm_proc {
 
 /* USB Controller.
 */
-typedef struct usb_ctlr {
+typedef struct cpm_usb_ctlr {
 	u8	usb_usmod;
 	u8	usb_usadr;
 	u8	usb_uscom;
diff --git a/arch/powerpc/include/asm/immap_qe.h b/arch/powerpc/include/asm/immap_qe.h
index c346d0bcd230..4e10f508570a 100644
--- a/arch/powerpc/include/asm/immap_qe.h
+++ b/arch/powerpc/include/asm/immap_qe.h
@@ -210,7 +210,7 @@ struct sir {
 } __attribute__ ((packed));
 
 /* USB Controller */
-struct usb_ctlr {
+struct qe_usb_ctlr {
 	u8	usb_usmod;
 	u8	usb_usadr;
 	u8	usb_uscom;
@@ -229,7 +229,7 @@ struct usb_ctlr {
 } __attribute__ ((packed));
 
 /* MCC */
-struct mcc {
+struct qe_mcc {
 	__be32	mcce;		/* MCC event register */
 	__be32	mccm;		/* MCC mask register */
 	__be32	mccf;		/* MCC configuration register */
@@ -431,9 +431,9 @@ struct qe_immap {
 	struct qe_mux		qmx;		/* QE Multiplexer */
 	struct qe_timers	qet;		/* QE Timers */
 	struct spi		spi[0x2];	/* spi */
-	struct mcc		mcc;		/* mcc */
+	struct qe_mcc		mcc;		/* mcc */
 	struct qe_brg		brg;		/* brg */
-	struct usb_ctlr		usb;		/* USB */
+	struct qe_usb_ctlr	usb;		/* USB */
 	struct si1		si1;		/* SI */
 	u8			res11[0x800];
 	struct sir		sir;		/* SI Routing Tables */
-- 
cgit v1.2.3-59-g8ed1b


From 71d94fe842c34fb93eb32ae20207bea757292b79 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Mon, 12 Oct 2009 20:49:18 +0400
Subject: powerpc/cpm: Move CPMFCR_* defines into cpm.h

The bits are generic to CPM devices, so let's move them to the
common header file, so drivers won't need to privately reintroduce
another bunch of the same bits (as we can't include cpm2.h header
together with cpm1.h).

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
---
 arch/powerpc/include/asm/cpm.h  | 16 ++++++++++++++++
 arch/powerpc/include/asm/cpm2.h |  8 --------
 2 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/cpm.h b/arch/powerpc/include/asm/cpm.h
index b5f15340dc22..ea3fdb9fe257 100644
--- a/arch/powerpc/include/asm/cpm.h
+++ b/arch/powerpc/include/asm/cpm.h
@@ -27,6 +27,22 @@ struct usb_ctlr {
 	u8	res6[0x22];
 } __attribute__ ((packed));
 
+/*
+ * Function code bits, usually generic to devices.
+ */
+#ifdef CONFIG_CPM1
+#define CPMFCR_GBL	((u_char)0x00)	/* Flag doesn't exist in CPM1 */
+#define CPMFCR_TC2	((u_char)0x00)	/* Flag doesn't exist in CPM1 */
+#define CPMFCR_DTB	((u_char)0x00)	/* Flag doesn't exist in CPM1 */
+#define CPMFCR_BDB	((u_char)0x00)	/* Flag doesn't exist in CPM1 */
+#else
+#define CPMFCR_GBL	((u_char)0x20)	/* Set memory snooping */
+#define CPMFCR_TC2	((u_char)0x04)	/* Transfer code 2 value */
+#define CPMFCR_DTB	((u_char)0x02)	/* Use local bus for data when set */
+#define CPMFCR_BDB	((u_char)0x01)	/* Use local bus for BD when set */
+#endif
+#define CPMFCR_EB	((u_char)0x10)	/* Set big endian byte order */
+
 /* Opcodes common to CPM1 and CPM2
 */
 #define CPM_CR_INIT_TRX		((ushort)0x0000)
diff --git a/arch/powerpc/include/asm/cpm2.h b/arch/powerpc/include/asm/cpm2.h
index 236cfa344a7c..f42e9baf3a4e 100644
--- a/arch/powerpc/include/asm/cpm2.h
+++ b/arch/powerpc/include/asm/cpm2.h
@@ -124,14 +124,6 @@ static inline void cpm2_fastbrg(uint brg, uint rate, int div16)
 	__cpm2_setbrg(brg, rate, CPM2_BRG_INT_CLK, div16, CPM_BRG_EXTC_INT);
 }
 
-/* Function code bits, usually generic to devices.
-*/
-#define CPMFCR_GBL	((u_char)0x20)	/* Set memory snooping */
-#define CPMFCR_EB	((u_char)0x10)	/* Set big endian byte order */
-#define CPMFCR_TC2	((u_char)0x04)	/* Transfer code 2 value */
-#define CPMFCR_DTB	((u_char)0x02)	/* Use local bus for data when set */
-#define CPMFCR_BDB	((u_char)0x01)	/* Use local bus for BD when set */
-
 /* Parameter RAM offsets from the base.
 */
 #define PROFF_SCC1		((uint)0x8000)
-- 
cgit v1.2.3-59-g8ed1b


From 58c12bdc5d924e4bca60c2660df2a71be4953ac9 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Mon, 12 Oct 2009 20:49:20 +0400
Subject: powerpc/qe&cpm: Implement static inline stubs for non-QE/CPM builds

This is needed to avoid ugly #ifdefs in drivers. Also update fsl_qe_udc
driver so that now it doesn't define its own versions that cause build
breakage when the generic stubs are used.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
---
 arch/powerpc/include/asm/cpm.h  | 44 +++++++++++++++++++++++++++++++++++++++++
 arch/powerpc/include/asm/qe.h   | 11 ++++++++++-
 drivers/usb/gadget/fsl_qe_udc.h | 15 --------------
 3 files changed, 54 insertions(+), 16 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/cpm.h b/arch/powerpc/include/asm/cpm.h
index ea3fdb9fe257..0835eb977ba9 100644
--- a/arch/powerpc/include/asm/cpm.h
+++ b/arch/powerpc/include/asm/cpm.h
@@ -3,6 +3,7 @@
 
 #include <linux/compiler.h>
 #include <linux/types.h>
+#include <linux/errno.h>
 #include <linux/of.h>
 
 /*
@@ -131,13 +132,56 @@ typedef struct cpm_buf_desc {
 #define BD_I2C_START		(0x0400)
 
 int cpm_muram_init(void);
+
+#if defined(CONFIG_CPM) || defined(CONFIG_QUICC_ENGINE)
 unsigned long cpm_muram_alloc(unsigned long size, unsigned long align);
 int cpm_muram_free(unsigned long offset);
 unsigned long cpm_muram_alloc_fixed(unsigned long offset, unsigned long size);
 void __iomem *cpm_muram_addr(unsigned long offset);
 unsigned long cpm_muram_offset(void __iomem *addr);
 dma_addr_t cpm_muram_dma(void __iomem *addr);
+#else
+static inline unsigned long cpm_muram_alloc(unsigned long size,
+					    unsigned long align)
+{
+	return -ENOSYS;
+}
+
+static inline int cpm_muram_free(unsigned long offset)
+{
+	return -ENOSYS;
+}
+
+static inline unsigned long cpm_muram_alloc_fixed(unsigned long offset,
+						  unsigned long size)
+{
+	return -ENOSYS;
+}
+
+static inline void __iomem *cpm_muram_addr(unsigned long offset)
+{
+	return NULL;
+}
+
+static inline unsigned long cpm_muram_offset(void __iomem *addr)
+{
+	return -ENOSYS;
+}
+
+static inline dma_addr_t cpm_muram_dma(void __iomem *addr)
+{
+	return 0;
+}
+#endif /* defined(CONFIG_CPM) || defined(CONFIG_QUICC_ENGINE) */
+
+#ifdef CONFIG_CPM
 int cpm_command(u32 command, u8 opcode);
+#else
+static inline int cpm_command(u32 command, u8 opcode)
+{
+	return -ENOSYS;
+}
+#endif /* CONFIG_CPM */
 
 int cpm2_gpiochip_add32(struct device_node *np);
 
diff --git a/arch/powerpc/include/asm/qe.h b/arch/powerpc/include/asm/qe.h
index 5e0e8b2b0aa1..0947b36e534c 100644
--- a/arch/powerpc/include/asm/qe.h
+++ b/arch/powerpc/include/asm/qe.h
@@ -145,8 +145,17 @@ static inline void qe_pin_set_gpio(struct qe_pin *qe_pin) {}
 static inline void qe_pin_set_dedicated(struct qe_pin *pin) {}
 #endif /* CONFIG_QE_GPIO */
 
-/* QE internal API */
+#ifdef CONFIG_QUICC_ENGINE
 int qe_issue_cmd(u32 cmd, u32 device, u8 mcn_protocol, u32 cmd_input);
+#else
+static inline int qe_issue_cmd(u32 cmd, u32 device, u8 mcn_protocol,
+			       u32 cmd_input)
+{
+	return -ENOSYS;
+}
+#endif /* CONFIG_QUICC_ENGINE */
+
+/* QE internal API */
 enum qe_clock qe_clock_source(const char *source);
 unsigned int qe_get_brg_clk(void);
 int qe_setbrg(enum qe_clock brg, unsigned int rate, unsigned int multiplier);
diff --git a/drivers/usb/gadget/fsl_qe_udc.h b/drivers/usb/gadget/fsl_qe_udc.h
index 31b2710882e4..bea5b827bebe 100644
--- a/drivers/usb/gadget/fsl_qe_udc.h
+++ b/drivers/usb/gadget/fsl_qe_udc.h
@@ -419,19 +419,4 @@ struct qe_udc {
 #define CPM_USB_RESTART_TX_OPCODE 0x0b
 #define CPM_USB_EP_SHIFT 5
 
-#ifndef CONFIG_CPM
-inline int cpm_command(u32 command, u8 opcode)
-{
-	return -EOPNOTSUPP;
-}
-#endif
-
-#ifndef CONFIG_QUICC_ENGINE
-inline int qe_issue_cmd(u32 cmd, u32 device, u8 mcn_protocol,
-	u32 cmd_input)
-{
-	return -EOPNOTSUPP;
-}
-#endif
-
 #endif  /* __FSL_QE_UDC_H */
-- 
cgit v1.2.3-59-g8ed1b


From 690b846aa1b42b4f35bfac0022b75a288d97fd13 Mon Sep 17 00:00:00 2001
From: Albrecht Dreß <albrecht.dress@arcor.de>
Date: Thu, 12 Nov 2009 13:31:35 -0700
Subject: mpc5200/gpt: tiny fix for gpt period limitation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch changes the period parameter of mpc52xx_gpt_start_timer to
a u64 to support larger timeout periods.

Signed-off-by: Albrecht Dreß <albrecht.dress@arcor.de>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 arch/powerpc/include/asm/mpc52xx.h        | 2 +-
 arch/powerpc/platforms/52xx/mpc52xx_gpt.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/mpc52xx.h b/arch/powerpc/include/asm/mpc52xx.h
index 707ab7590cfb..933fb8f6e797 100644
--- a/arch/powerpc/include/asm/mpc52xx.h
+++ b/arch/powerpc/include/asm/mpc52xx.h
@@ -279,7 +279,7 @@ extern void mpc52xx_restart(char *cmd);
 /* mpc52xx_gpt.c */
 struct mpc52xx_gpt_priv;
 extern struct mpc52xx_gpt_priv *mpc52xx_gpt_from_irq(int irq);
-extern int mpc52xx_gpt_start_timer(struct mpc52xx_gpt_priv *gpt, int period,
+extern int mpc52xx_gpt_start_timer(struct mpc52xx_gpt_priv *gpt, u64 period,
                             int continuous);
 extern void mpc52xx_gpt_stop_timer(struct mpc52xx_gpt_priv *gpt);
 
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
index 2c3fa13571ce..77572abca6c3 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
@@ -378,12 +378,12 @@ EXPORT_SYMBOL(mpc52xx_gpt_from_irq);
 /**
  * mpc52xx_gpt_start_timer - Set and enable the GPT timer
  * @gpt: Pointer to gpt private data structure
- * @period: period of timer
+ * @period: period of timer in ns; max. ~130s @ 33MHz IPB clock
  * @continuous: set to 1 to make timer continuous free running
  *
  * An interrupt will be generated every time the timer fires
  */
-int mpc52xx_gpt_start_timer(struct mpc52xx_gpt_priv *gpt, int period,
+int mpc52xx_gpt_start_timer(struct mpc52xx_gpt_priv *gpt, u64 period,
                             int continuous)
 {
 	u32 clear, set;
@@ -400,7 +400,7 @@ int mpc52xx_gpt_start_timer(struct mpc52xx_gpt_priv *gpt, int period,
 	 * arithmatic is done here to preserve the precision until the value
 	 * is scaled back down into the u32 range.  Period is in 'ns', bus
 	 * frequency is in Hz. */
-	clocks = (u64)period * (u64)gpt->ipb_freq;
+	clocks = period * (u64)gpt->ipb_freq;
 	do_div(clocks, 1000000000); /* Scale it down to ns range */
 
 	/* This device cannot handle a clock count greater than 32 bits */
-- 
cgit v1.2.3-59-g8ed1b


From eda43d16ef3d0bd59e3b762de3ffc73bab02efe9 Mon Sep 17 00:00:00 2001
From: Albrecht Dreß <albrecht.dress@arcor.de>
Date: Fri, 13 Nov 2009 11:09:31 -0700
Subject: mpc52xx/wdt: merge WDT code into the GPT driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Merge the WDT code into the GPT interface.

Signed-off-by: Albrecht Dreß <albrecht.dress@arcor.de>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 arch/powerpc/include/asm/mpc52xx.h        |   3 +-
 arch/powerpc/platforms/52xx/mpc52xx_gpt.c | 321 ++++++++++++++++++++++++++++--
 2 files changed, 307 insertions(+), 17 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/mpc52xx.h b/arch/powerpc/include/asm/mpc52xx.h
index 933fb8f6e797..b664ce79a172 100644
--- a/arch/powerpc/include/asm/mpc52xx.h
+++ b/arch/powerpc/include/asm/mpc52xx.h
@@ -281,7 +281,8 @@ struct mpc52xx_gpt_priv;
 extern struct mpc52xx_gpt_priv *mpc52xx_gpt_from_irq(int irq);
 extern int mpc52xx_gpt_start_timer(struct mpc52xx_gpt_priv *gpt, u64 period,
                             int continuous);
-extern void mpc52xx_gpt_stop_timer(struct mpc52xx_gpt_priv *gpt);
+extern u64 mpc52xx_gpt_timer_period(struct mpc52xx_gpt_priv *gpt);
+extern int mpc52xx_gpt_stop_timer(struct mpc52xx_gpt_priv *gpt);
 
 /* mpc52xx_lpbfifo.c */
 #define MPC52XX_LPBFIFO_FLAG_READ		(0)
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
index 77572abca6c3..7085e4c60ba1 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
@@ -16,8 +16,14 @@
  * output signals or measure input signals.
  *
  * This driver supports the GPIO and IRQ controller functions of the GPT
- * device.  Timer functions are not yet supported, nor is the watchdog
- * timer.
+ * device.  Timer functions are not yet supported.
+ *
+ * The timer gpt0 can be used as watchdog (wdt).  If the wdt mode is used,
+ * this prevents the use of any gpt0 gpt function (i.e. they will fail with
+ * -EBUSY).  Thus, the safety wdt function always has precedence over the gpt
+ * function.  If the kernel has been compiled with CONFIG_WATCHDOG_NOWAYOUT,
+ * this means that gpt0 is locked in wdt mode until the next reboot - this
+ * may be a requirement in safety applications.
  *
  * To use the GPIO function, the following two properties must be added
  * to the device tree node for the gpt device (typically in the .dts file
@@ -56,11 +62,14 @@
 #include <linux/of_platform.h>
 #include <linux/of_gpio.h>
 #include <linux/kernel.h>
+#include <linux/watchdog.h>
+#include <linux/miscdevice.h>
+#include <linux/uaccess.h>
 #include <asm/div64.h>
 #include <asm/mpc52xx.h>
 
 MODULE_DESCRIPTION("Freescale MPC52xx gpt driver");
-MODULE_AUTHOR("Sascha Hauer, Grant Likely");
+MODULE_AUTHOR("Sascha Hauer, Grant Likely, Albrecht Dreß");
 MODULE_LICENSE("GPL");
 
 /**
@@ -70,6 +79,9 @@ MODULE_LICENSE("GPL");
  * @lock: spinlock to coordinate between different functions.
  * @of_gc: of_gpio_chip instance structure; used when GPIO is enabled
  * @irqhost: Pointer to irq_host instance; used when IRQ mode is supported
+ * @wdt_mode: only relevant for gpt0: bit 0 (MPC52xx_GPT_CAN_WDT) indicates
+ *   if the gpt may be used as wdt, bit 1 (MPC52xx_GPT_IS_WDT) indicates
+ *   if the timer is actively used as wdt which blocks gpt functions
  */
 struct mpc52xx_gpt_priv {
 	struct list_head list;		/* List of all GPT devices */
@@ -78,6 +90,7 @@ struct mpc52xx_gpt_priv {
 	spinlock_t lock;
 	struct irq_host *irqhost;
 	u32 ipb_freq;
+	u8 wdt_mode;
 
 #if defined(CONFIG_GPIOLIB)
 	struct of_gpio_chip of_gc;
@@ -101,14 +114,21 @@ DEFINE_MUTEX(mpc52xx_gpt_list_mutex);
 #define MPC52xx_GPT_MODE_CONTINUOUS	(0x0400)
 #define MPC52xx_GPT_MODE_OPEN_DRAIN	(0x0200)
 #define MPC52xx_GPT_MODE_IRQ_EN		(0x0100)
+#define MPC52xx_GPT_MODE_WDT_EN		(0x8000)
 
 #define MPC52xx_GPT_MODE_ICT_MASK	(0x030000)
 #define MPC52xx_GPT_MODE_ICT_RISING	(0x010000)
 #define MPC52xx_GPT_MODE_ICT_FALLING	(0x020000)
 #define MPC52xx_GPT_MODE_ICT_TOGGLE	(0x030000)
 
+#define MPC52xx_GPT_MODE_WDT_PING	(0xa5)
+
 #define MPC52xx_GPT_STATUS_IRQMASK	(0x000f)
 
+#define MPC52xx_GPT_CAN_WDT		(1 << 0)
+#define MPC52xx_GPT_IS_WDT		(1 << 1)
+
+
 /* ---------------------------------------------------------------------
  * Cascaded interrupt controller hooks
  */
@@ -375,16 +395,8 @@ struct mpc52xx_gpt_priv *mpc52xx_gpt_from_irq(int irq)
 }
 EXPORT_SYMBOL(mpc52xx_gpt_from_irq);
 
-/**
- * mpc52xx_gpt_start_timer - Set and enable the GPT timer
- * @gpt: Pointer to gpt private data structure
- * @period: period of timer in ns; max. ~130s @ 33MHz IPB clock
- * @continuous: set to 1 to make timer continuous free running
- *
- * An interrupt will be generated every time the timer fires
- */
-int mpc52xx_gpt_start_timer(struct mpc52xx_gpt_priv *gpt, u64 period,
-                            int continuous)
+static int mpc52xx_gpt_do_start(struct mpc52xx_gpt_priv *gpt, u64 period,
+				int continuous, int as_wdt)
 {
 	u32 clear, set;
 	u64 clocks;
@@ -393,7 +405,10 @@ int mpc52xx_gpt_start_timer(struct mpc52xx_gpt_priv *gpt, u64 period,
 
 	clear = MPC52xx_GPT_MODE_MS_MASK | MPC52xx_GPT_MODE_CONTINUOUS;
 	set = MPC52xx_GPT_MODE_MS_GPIO | MPC52xx_GPT_MODE_COUNTER_ENABLE;
-	if (continuous)
+	if (as_wdt) {
+		clear |= MPC52xx_GPT_MODE_IRQ_EN;
+		set |= MPC52xx_GPT_MODE_WDT_EN;
+	} else if (continuous)
 		set |= MPC52xx_GPT_MODE_CONTINUOUS;
 
 	/* Determine the number of clocks in the requested period.  64 bit
@@ -427,22 +442,279 @@ int mpc52xx_gpt_start_timer(struct mpc52xx_gpt_priv *gpt, u64 period,
 		return -EINVAL;
 	}
 
-	/* Set and enable the timer */
+	/* Set and enable the timer, reject an attempt to use a wdt as gpt */
 	spin_lock_irqsave(&gpt->lock, flags);
+	if (as_wdt)
+		gpt->wdt_mode |= MPC52xx_GPT_IS_WDT;
+	else if ((gpt->wdt_mode & MPC52xx_GPT_IS_WDT) != 0) {
+		spin_unlock_irqrestore(&gpt->lock, flags);
+		return -EBUSY;
+	}
 	out_be32(&gpt->regs->count, prescale << 16 | clocks);
 	clrsetbits_be32(&gpt->regs->mode, clear, set);
 	spin_unlock_irqrestore(&gpt->lock, flags);
 
 	return 0;
 }
+
+/**
+ * mpc52xx_gpt_start_timer - Set and enable the GPT timer
+ * @gpt: Pointer to gpt private data structure
+ * @period: period of timer in ns; max. ~130s @ 33MHz IPB clock
+ * @continuous: set to 1 to make timer continuous free running
+ *
+ * An interrupt will be generated every time the timer fires
+ */
+int mpc52xx_gpt_start_timer(struct mpc52xx_gpt_priv *gpt, u64 period,
+                            int continuous)
+{
+	return mpc52xx_gpt_do_start(gpt, period, continuous, 0);
+}
 EXPORT_SYMBOL(mpc52xx_gpt_start_timer);
 
-void mpc52xx_gpt_stop_timer(struct mpc52xx_gpt_priv *gpt)
+/**
+ * mpc52xx_gpt_stop_timer - Stop a gpt
+ * @gpt: Pointer to gpt private data structure
+ *
+ * Returns an error if attempting to stop a wdt
+ */
+int mpc52xx_gpt_stop_timer(struct mpc52xx_gpt_priv *gpt)
 {
+	unsigned long flags;
+
+	/* reject the operation if the timer is used as watchdog (gpt 0 only) */
+	spin_lock_irqsave(&gpt->lock, flags);
+	if ((gpt->wdt_mode & MPC52xx_GPT_IS_WDT) != 0) {
+		spin_unlock_irqrestore(&gpt->lock, flags);
+		return -EBUSY;
+	}
+
 	clrbits32(&gpt->regs->mode, MPC52xx_GPT_MODE_COUNTER_ENABLE);
+	spin_unlock_irqrestore(&gpt->lock, flags);
+	return 0;
 }
 EXPORT_SYMBOL(mpc52xx_gpt_stop_timer);
 
+/**
+ * mpc52xx_gpt_timer_period - Read the timer period
+ * @gpt: Pointer to gpt private data structure
+ *
+ * Returns the timer period in ns
+ */
+u64 mpc52xx_gpt_timer_period(struct mpc52xx_gpt_priv *gpt)
+{
+	u64 period;
+	u64 prescale;
+	unsigned long flags;
+
+	spin_lock_irqsave(&gpt->lock, flags);
+	period = in_be32(&gpt->regs->count);
+	spin_unlock_irqrestore(&gpt->lock, flags);
+
+	prescale = period >> 16;
+	period &= 0xffff;
+	if (prescale == 0)
+		prescale = 0x10000;
+	period = period * prescale * 1000000000ULL;
+	do_div(period, (u64)gpt->ipb_freq);
+	return period;
+}
+EXPORT_SYMBOL(mpc52xx_gpt_timer_period);
+
+#if defined(CONFIG_MPC5200_WDT)
+/***********************************************************************
+ * Watchdog API for gpt0
+ */
+
+#define WDT_IDENTITY	    "mpc52xx watchdog on GPT0"
+
+/* wdt_is_active stores wether or not the /dev/watchdog device is opened */
+static unsigned long wdt_is_active;
+
+/* wdt-capable gpt */
+static struct mpc52xx_gpt_priv *mpc52xx_gpt_wdt;
+
+/* low-level wdt functions */
+static inline void mpc52xx_gpt_wdt_ping(struct mpc52xx_gpt_priv *gpt_wdt)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&gpt_wdt->lock, flags);
+	out_8((u8 *) &gpt_wdt->regs->mode, MPC52xx_GPT_MODE_WDT_PING);
+	spin_unlock_irqrestore(&gpt_wdt->lock, flags);
+}
+
+/* wdt misc device api */
+static ssize_t mpc52xx_wdt_write(struct file *file, const char __user *data,
+				 size_t len, loff_t *ppos)
+{
+	struct mpc52xx_gpt_priv *gpt_wdt = file->private_data;
+	mpc52xx_gpt_wdt_ping(gpt_wdt);
+	return 0;
+}
+
+static struct watchdog_info mpc5200_wdt_info = {
+	.options	= WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING,
+	.identity	= WDT_IDENTITY,
+};
+
+static long mpc52xx_wdt_ioctl(struct file *file, unsigned int cmd,
+			      unsigned long arg)
+{
+	struct mpc52xx_gpt_priv *gpt_wdt = file->private_data;
+	int __user *data = (int __user *)arg;
+	int timeout;
+	u64 real_timeout;
+	int ret = 0;
+
+	switch (cmd) {
+	case WDIOC_GETSUPPORT:
+		ret = copy_to_user(data, &mpc5200_wdt_info,
+				   sizeof(mpc5200_wdt_info));
+		if (ret)
+			ret = -EFAULT;
+		break;
+
+	case WDIOC_GETSTATUS:
+	case WDIOC_GETBOOTSTATUS:
+		ret = put_user(0, data);
+		break;
+
+	case WDIOC_KEEPALIVE:
+		mpc52xx_gpt_wdt_ping(gpt_wdt);
+		break;
+
+	case WDIOC_SETTIMEOUT:
+		ret = get_user(timeout, data);
+		if (ret)
+			break;
+		real_timeout = (u64) timeout * 1000000000ULL;
+		ret = mpc52xx_gpt_do_start(gpt_wdt, real_timeout, 0, 1);
+		if (ret)
+			break;
+		/* fall through and return the timeout */
+
+	case WDIOC_GETTIMEOUT:
+		/* we need to round here as to avoid e.g. the following
+		 * situation:
+		 * - timeout requested is 1 second;
+		 * - real timeout @33MHz is 999997090ns
+		 * - the int divide by 10^9 will return 0.
+		 */
+		real_timeout =
+			mpc52xx_gpt_timer_period(gpt_wdt) + 500000000ULL;
+		do_div(real_timeout, 1000000000ULL);
+		timeout = (int) real_timeout;
+		ret = put_user(timeout, data);
+		break;
+
+	default:
+		ret = -ENOTTY;
+	}
+	return ret;
+}
+
+static int mpc52xx_wdt_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	/* sanity check */
+	if (!mpc52xx_gpt_wdt)
+		return -ENODEV;
+
+	/* /dev/watchdog can only be opened once */
+	if (test_and_set_bit(0, &wdt_is_active))
+		return -EBUSY;
+
+	/* Set and activate the watchdog with 30 seconds timeout */
+	ret = mpc52xx_gpt_do_start(mpc52xx_gpt_wdt, 30ULL * 1000000000ULL,
+				   0, 1);
+	if (ret) {
+		clear_bit(0, &wdt_is_active);
+		return ret;
+	}
+
+	file->private_data = mpc52xx_gpt_wdt;
+	return nonseekable_open(inode, file);
+}
+
+static int mpc52xx_wdt_release(struct inode *inode, struct file *file)
+{
+	/* note: releasing the wdt in NOWAYOUT-mode does not stop it */
+#if !defined(CONFIG_WATCHDOG_NOWAYOUT)
+	struct mpc52xx_gpt_priv *gpt_wdt = file->private_data;
+	unsigned long flags;
+
+	spin_lock_irqsave(&gpt_wdt->lock, flags);
+	clrbits32(&gpt_wdt->regs->mode,
+		  MPC52xx_GPT_MODE_COUNTER_ENABLE | MPC52xx_GPT_MODE_WDT_EN);
+	gpt_wdt->wdt_mode &= ~MPC52xx_GPT_IS_WDT;
+	spin_unlock_irqrestore(&gpt_wdt->lock, flags);
+#endif
+	clear_bit(0, &wdt_is_active);
+	return 0;
+}
+
+
+static const struct file_operations mpc52xx_wdt_fops = {
+	.owner		= THIS_MODULE,
+	.llseek		= no_llseek,
+	.write		= mpc52xx_wdt_write,
+	.unlocked_ioctl = mpc52xx_wdt_ioctl,
+	.open		= mpc52xx_wdt_open,
+	.release	= mpc52xx_wdt_release,
+};
+
+static struct miscdevice mpc52xx_wdt_miscdev = {
+	.minor		= WATCHDOG_MINOR,
+	.name		= "watchdog",
+	.fops		= &mpc52xx_wdt_fops,
+};
+
+static int __devinit mpc52xx_gpt_wdt_init(void)
+{
+	int err;
+
+	/* try to register the watchdog misc device */
+	err = misc_register(&mpc52xx_wdt_miscdev);
+	if (err)
+		pr_err("%s: cannot register watchdog device\n", WDT_IDENTITY);
+	else
+		pr_info("%s: watchdog device registered\n", WDT_IDENTITY);
+	return err;
+}
+
+static int mpc52xx_gpt_wdt_setup(struct mpc52xx_gpt_priv *gpt,
+				 const u32 *period)
+{
+	u64 real_timeout;
+
+	/* remember the gpt for the wdt operation */
+	mpc52xx_gpt_wdt = gpt;
+
+	/* configure the wdt if the device tree contained a timeout */
+	if (!period || *period == 0)
+		return 0;
+
+	real_timeout = (u64) *period * 1000000000ULL;
+	if (mpc52xx_gpt_do_start(gpt, real_timeout, 0, 1))
+		dev_warn(gpt->dev, "starting as wdt failed\n");
+	else
+		dev_info(gpt->dev, "watchdog set to %us timeout\n", *period);
+	return 0;
+}
+
+#else
+
+static int __devinit mpc52xx_gpt_wdt_init(void)
+{
+	return 0;
+}
+
+#define mpc52xx_gpt_wdt_setup(x, y)		(0)
+
+#endif	/*  CONFIG_MPC5200_WDT	*/
+
 /* ---------------------------------------------------------------------
  * of_platform bus binding code
  */
@@ -473,6 +745,22 @@ static int __devinit mpc52xx_gpt_probe(struct of_device *ofdev,
 	list_add(&gpt->list, &mpc52xx_gpt_list);
 	mutex_unlock(&mpc52xx_gpt_list_mutex);
 
+	/* check if this device could be a watchdog */
+	if (of_get_property(ofdev->node, "fsl,has-wdt", NULL) ||
+	    of_get_property(ofdev->node, "has-wdt", NULL)) {
+		const u32 *on_boot_wdt;
+
+		gpt->wdt_mode = MPC52xx_GPT_CAN_WDT;
+		on_boot_wdt = of_get_property(ofdev->node, "fsl,wdt-on-boot",
+					      NULL);
+		if (on_boot_wdt) {
+			dev_info(gpt->dev, "used as watchdog\n");
+			gpt->wdt_mode |= MPC52xx_GPT_IS_WDT;
+		} else
+			dev_info(gpt->dev, "can function as watchdog\n");
+		mpc52xx_gpt_wdt_setup(gpt, on_boot_wdt);
+	}
+
 	return 0;
 }
 
@@ -507,3 +795,4 @@ static int __init mpc52xx_gpt_init(void)
 
 /* Make sure GPIOs and IRQs get set up before anyone tries to use them */
 subsys_initcall(mpc52xx_gpt_init);
+device_initcall(mpc52xx_gpt_wdt_init);
-- 
cgit v1.2.3-59-g8ed1b


From d95cacc599a79d87052b3975f60cae62c04dc01f Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Tue, 20 Oct 2009 20:06:21 +0000
Subject: powerpc: Move ehea hcall definitions into hvcall.h

Move ehea hcall definitions into hvcall.h.

Signed-off-by: Anton Blanchard <anton@samba.org>
Acked-by: Thomas Klein <tklein@de.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/hvcall.h | 13 ++++++++++
 drivers/net/ehea/ehea_hcall.h     | 51 ---------------------------------------
 drivers/net/ehea/ehea_phyp.h      |  1 -
 3 files changed, 13 insertions(+), 52 deletions(-)
 delete mode 100644 drivers/net/ehea/ehea_hcall.h

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 6251a4b10be7..3bf38af7c834 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -212,6 +212,19 @@
 #define H_QUERY_INT_STATE       0x1E4
 #define H_POLL_PENDING		0x1D8
 #define H_ILLAN_ATTRIBUTES	0x244
+#define H_MODIFY_HEA_QP		0x250
+#define H_QUERY_HEA_QP		0x254
+#define H_QUERY_HEA		0x258
+#define H_QUERY_HEA_PORT	0x25C
+#define H_MODIFY_HEA_PORT	0x260
+#define H_REG_BCMC		0x264
+#define H_DEREG_BCMC		0x268
+#define H_REGISTER_HEA_RPAGES	0x26C
+#define H_DISABLE_AND_GET_HEA	0x270
+#define H_GET_HEA_INFO		0x274
+#define H_ALLOC_HEA_RESOURCE	0x278
+#define H_ADD_CONN		0x284
+#define H_DEL_CONN		0x288
 #define H_JOIN			0x298
 #define H_VASI_STATE            0x2A4
 #define H_ENABLE_CRQ		0x2B0
diff --git a/drivers/net/ehea/ehea_hcall.h b/drivers/net/ehea/ehea_hcall.h
deleted file mode 100644
index 8e7d1c3edc60..000000000000
--- a/drivers/net/ehea/ehea_hcall.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- *  linux/drivers/net/ehea/ehea_hcall.h
- *
- *  eHEA ethernet device driver for IBM eServer System p
- *
- *  (C) Copyright IBM Corp. 2006
- *
- *  Authors:
- *       Christoph Raisch <raisch@de.ibm.com>
- *       Jan-Bernd Themann <themann@de.ibm.com>
- *       Thomas Klein <tklein@de.ibm.com>
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#ifndef __EHEA_HCALL_H__
-#define __EHEA_HCALL_H__
-
-/**
- * This file contains HCALL defines that are to be included in the appropriate
- * kernel files later
- */
-
-#define H_ALLOC_HEA_RESOURCE   0x278
-#define H_MODIFY_HEA_QP        0x250
-#define H_QUERY_HEA_QP         0x254
-#define H_QUERY_HEA            0x258
-#define H_QUERY_HEA_PORT       0x25C
-#define H_MODIFY_HEA_PORT      0x260
-#define H_REG_BCMC             0x264
-#define H_DEREG_BCMC           0x268
-#define H_REGISTER_HEA_RPAGES  0x26C
-#define H_DISABLE_AND_GET_HEA  0x270
-#define H_GET_HEA_INFO         0x274
-#define H_ADD_CONN             0x284
-#define H_DEL_CONN             0x288
-
-#endif	/* __EHEA_HCALL_H__ */
diff --git a/drivers/net/ehea/ehea_phyp.h b/drivers/net/ehea/ehea_phyp.h
index f3628c803567..2f8174c248bc 100644
--- a/drivers/net/ehea/ehea_phyp.h
+++ b/drivers/net/ehea/ehea_phyp.h
@@ -33,7 +33,6 @@
 #include <asm/hvcall.h>
 #include "ehea.h"
 #include "ehea_hw.h"
-#include "ehea_hcall.h"
 
 /* Some abbreviations used here:
  *
-- 
cgit v1.2.3-59-g8ed1b


From 69ddb57cbea0b3dd851ea5f1edd1e609ad4da04e Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Thu, 29 Oct 2009 19:22:48 +0000
Subject: powerpc/pseries: Add extended_cede_processor() helper function.

This patch provides an extended_cede_processor() helper function
which takes the cede latency hint as an argument. This hint is to be passed
on to the hypervisor to cede to the corresponding state on platforms
which support it.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/lppaca.h               |  9 ++++++++-
 arch/powerpc/platforms/pseries/plpar_wrappers.h | 22 ++++++++++++++++++++++
 arch/powerpc/xmon/xmon.c                        |  3 ++-
 3 files changed, 32 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h
index f78f65c38f05..14b592dfb4e8 100644
--- a/arch/powerpc/include/asm/lppaca.h
+++ b/arch/powerpc/include/asm/lppaca.h
@@ -100,7 +100,14 @@ struct lppaca {
 	// Used to pass parms from the OS to PLIC for SetAsrAndRfid
 	u64	saved_gpr3;		// Saved GPR3                   x20-x27
 	u64	saved_gpr4;		// Saved GPR4                   x28-x2F
-	u64	saved_gpr5;		// Saved GPR5                   x30-x37
+	union {
+		u64	saved_gpr5;	/* Saved GPR5               x30-x37 */
+		struct {
+			u8	cede_latency_hint;  /*			x30 */
+			u8	reserved[7];        /*		    x31-x36 */
+		} fields;
+	} gpr5_dword;
+
 
 	u8	dtl_enable_mask;	// Dispatch Trace Log mask	x38-x38
 	u8	donate_dedicated_cpu;	// Donate dedicated CPU cycles  x39-x39
diff --git a/arch/powerpc/platforms/pseries/plpar_wrappers.h b/arch/powerpc/platforms/pseries/plpar_wrappers.h
index a24a6b2333b2..0603c91538ae 100644
--- a/arch/powerpc/platforms/pseries/plpar_wrappers.h
+++ b/arch/powerpc/platforms/pseries/plpar_wrappers.h
@@ -9,11 +9,33 @@ static inline long poll_pending(void)
 	return plpar_hcall_norets(H_POLL_PENDING);
 }
 
+static inline u8 get_cede_latency_hint(void)
+{
+	return get_lppaca()->gpr5_dword.fields.cede_latency_hint;
+}
+
+static inline void set_cede_latency_hint(u8 latency_hint)
+{
+	get_lppaca()->gpr5_dword.fields.cede_latency_hint = latency_hint;
+}
+
 static inline long cede_processor(void)
 {
 	return plpar_hcall_norets(H_CEDE);
 }
 
+static inline long extended_cede_processor(unsigned long latency_hint)
+{
+	long rc;
+	u8 old_latency_hint = get_cede_latency_hint();
+
+	set_cede_latency_hint(latency_hint);
+	rc = cede_processor();
+	set_cede_latency_hint(old_latency_hint);
+
+	return rc;
+}
+
 static inline long vpa_call(unsigned long flags, unsigned long cpu,
 		unsigned long vpa)
 {
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index bdbe96c8a7e4..4e6152c13764 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -1641,7 +1641,8 @@ static void super_regs(void)
 			       ptrLpPaca->saved_srr0, ptrLpPaca->saved_srr1);
 			printf("    Saved Gpr3=%.16lx  Saved Gpr4=%.16lx \n",
 			       ptrLpPaca->saved_gpr3, ptrLpPaca->saved_gpr4);
-			printf("    Saved Gpr5=%.16lx \n", ptrLpPaca->saved_gpr5);
+			printf("    Saved Gpr5=%.16lx \n",
+				ptrLpPaca->gpr5_dword.saved_gpr5);
 		}
 #endif
 
-- 
cgit v1.2.3-59-g8ed1b


From 2d7cf3ef879b22bdfd271aa3b66733c53279e813 Mon Sep 17 00:00:00 2001
From: Becky Bruce <beckyb@kernel.crashing.org>
Date: Mon, 23 Nov 2009 12:28:53 +0000
Subject: powerpc: Fix DEBUG_HIGHMEM build break from d4515646699

Code was added to mm/higmem.c that depends on several
kmap types that powerpc does not support.  We add dummy
invalid definitions for KM_NMI, KM_NM_PTE, and KM_IRQ_PTE.

According to list discussion, this fix should not be needed
anymore starting with 2.6.33.  The code is commented to this
effect so hopefully we will remember to remove this.

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/kmap_types.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/kmap_types.h b/arch/powerpc/include/asm/kmap_types.h
index b6bac6f61c16..916369575c97 100644
--- a/arch/powerpc/include/asm/kmap_types.h
+++ b/arch/powerpc/include/asm/kmap_types.h
@@ -29,5 +29,16 @@ enum km_type {
 	KM_TYPE_NR
 };
 
+/*
+ * This is a temporary build fix that (so they say on lkml....) should no longer
+ * be required after 2.6.33, because of changes planned to the kmap code.
+ * Let's try to remove this cruft then.
+ */
+#ifdef CONFIG_DEBUG_HIGHMEM
+#define KM_NMI		(-1)
+#define KM_NMI_PTE	(-1)
+#define KM_IRQ_PTE	(-1)
+#endif
+
 #endif	/* __KERNEL__ */
 #endif	/* _ASM_POWERPC_KMAP_TYPES_H */
-- 
cgit v1.2.3-59-g8ed1b


From dad2f2fb0fc74afb634beba8c57bb34bb862d4c6 Mon Sep 17 00:00:00 2001
From: "arnd@arndb.de" <arnd@arndb.de>
Date: Mon, 23 Nov 2009 03:25:06 +0000
Subject: powerpc: Fix wrong error code from ppc32 select syscall

This patch was submitted, discussed, and eventually Acked by everyone, yet
still isn't in the tree.  See:

http://patchwork.ozlabs.org/patch/1240/

Signed-off-by: Josh Boyer <jwboyer@linux.vnet.ibm.com>
Cc: Arnd Bergmann <arnd@anrdb.de>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/systbl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index c7d671a7d9a1..07d2d19ab5e9 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -145,7 +145,7 @@ SYSCALL_SPU(setfsuid)
 SYSCALL_SPU(setfsgid)
 SYSCALL_SPU(llseek)
 COMPAT_SYS_SPU(getdents)
-SYSX_SPU(sys_select,ppc32_select,ppc_select)
+SYSX_SPU(sys_select,ppc32_select,sys_select)
 SYSCALL_SPU(flock)
 SYSCALL_SPU(msync)
 COMPAT_SYS_SPU(readv)
-- 
cgit v1.2.3-59-g8ed1b


From c045256d146800ea1d741a8e9e377dada6b7e195 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 23 Nov 2009 19:31:14 +0000
Subject: powerpc/mm: Fix bug in pagetable cache cleanup with
 CONFIG_PPC_SUBPAGE_PROT

Commit a0668cdc154e54bf0c85182e0535eea237d53146 cleans up the handling
of kmem_caches for allocating various levels of pagetables.
Unfortunately, it conflicts badly with CONFIG_PPC_SUBPAGE_PROT, due to
the latter's cleverly hidden technique of adding some extra allocation
space to the top level page directory to store the extra information
it needs.

Since that extra allocation really doesn't fit into the cleaned up
page directory allocating scheme, this patch alters
CONFIG_PPC_SUBPAGE_PROT to instead allocate its struct
subpage_prot_table as part of the mm_context_t.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/mmu-hash64.h     | 35 +++++++++++++++++++++++++++++
 arch/powerpc/include/asm/pgalloc-64.h     |  5 -----
 arch/powerpc/include/asm/pte-hash64-64k.h | 37 -------------------------------
 arch/powerpc/mm/hash_utils_64.c           |  6 ++---
 arch/powerpc/mm/mmu_context_hash64.c      |  2 ++
 arch/powerpc/mm/subpage-prot.c            | 15 +++++++++----
 6 files changed, 51 insertions(+), 49 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 7514ec2f8540..9d9551840f4a 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -373,6 +373,38 @@ extern void slb_set_size(u16 size);
 
 #ifndef __ASSEMBLY__
 
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+/*
+ * For the sub-page protection option, we extend the PGD with one of
+ * these.  Basically we have a 3-level tree, with the top level being
+ * the protptrs array.  To optimize speed and memory consumption when
+ * only addresses < 4GB are being protected, pointers to the first
+ * four pages of sub-page protection words are stored in the low_prot
+ * array.
+ * Each page of sub-page protection words protects 1GB (4 bytes
+ * protects 64k).  For the 3-level tree, each page of pointers then
+ * protects 8TB.
+ */
+struct subpage_prot_table {
+	unsigned long maxaddr;	/* only addresses < this are protected */
+	unsigned int **protptrs[2];
+	unsigned int *low_prot[4];
+};
+
+#define SBP_L1_BITS		(PAGE_SHIFT - 2)
+#define SBP_L2_BITS		(PAGE_SHIFT - 3)
+#define SBP_L1_COUNT		(1 << SBP_L1_BITS)
+#define SBP_L2_COUNT		(1 << SBP_L2_BITS)
+#define SBP_L2_SHIFT		(PAGE_SHIFT + SBP_L1_BITS)
+#define SBP_L3_SHIFT		(SBP_L2_SHIFT + SBP_L2_BITS)
+
+extern void subpage_prot_free(struct mm_struct *mm);
+extern void subpage_prot_init_new_context(struct mm_struct *mm);
+#else
+static inline void subpage_prot_free(pgd_t *pgd) {}
+static inline void subpage_prot_init_new_context(struct mm_struct *mm) { }
+#endif /* CONFIG_PPC_SUBPAGE_PROT */
+
 typedef unsigned long mm_context_id_t;
 
 typedef struct {
@@ -386,6 +418,9 @@ typedef struct {
 	u16 sllp;		/* SLB page size encoding */
 #endif
 	unsigned long vdso_base;
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+	struct subpage_prot_table spt;
+#endif /* CONFIG_PPC_SUBPAGE_PROT */
 } mm_context_t;
 
 
diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h
index 5c1cd73dafa8..605f5c5398d1 100644
--- a/arch/powerpc/include/asm/pgalloc-64.h
+++ b/arch/powerpc/include/asm/pgalloc-64.h
@@ -28,10 +28,6 @@
  */
 #define MAX_PGTABLE_INDEX_SIZE	0xf
 
-#ifndef CONFIG_PPC_SUBPAGE_PROT
-static inline void subpage_prot_free(pgd_t *pgd) {}
-#endif
-
 extern struct kmem_cache *pgtable_cache[];
 #define PGT_CACHE(shift) (pgtable_cache[(shift)-1])
 
@@ -42,7 +38,6 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 
 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-	subpage_prot_free(pgd);
 	kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
 }
 
diff --git a/arch/powerpc/include/asm/pte-hash64-64k.h b/arch/powerpc/include/asm/pte-hash64-64k.h
index 82b72207c51c..c4490f9c67c4 100644
--- a/arch/powerpc/include/asm/pte-hash64-64k.h
+++ b/arch/powerpc/include/asm/pte-hash64-64k.h
@@ -76,41 +76,4 @@
 	remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE,		\
 			__pgprot(pgprot_val((prot)) | _PAGE_4K_PFN))
 
-
-#ifdef CONFIG_PPC_SUBPAGE_PROT
-/*
- * For the sub-page protection option, we extend the PGD with one of
- * these.  Basically we have a 3-level tree, with the top level being
- * the protptrs array.  To optimize speed and memory consumption when
- * only addresses < 4GB are being protected, pointers to the first
- * four pages of sub-page protection words are stored in the low_prot
- * array.
- * Each page of sub-page protection words protects 1GB (4 bytes
- * protects 64k).  For the 3-level tree, each page of pointers then
- * protects 8TB.
- */
-struct subpage_prot_table {
-	unsigned long maxaddr;	/* only addresses < this are protected */
-	unsigned int **protptrs[2];
-	unsigned int *low_prot[4];
-};
-
-#undef PGD_TABLE_SIZE
-#define PGD_TABLE_SIZE		((sizeof(pgd_t) << PGD_INDEX_SIZE) + \
-				 sizeof(struct subpage_prot_table))
-
-#define SBP_L1_BITS		(PAGE_SHIFT - 2)
-#define SBP_L2_BITS		(PAGE_SHIFT - 3)
-#define SBP_L1_COUNT		(1 << SBP_L1_BITS)
-#define SBP_L2_COUNT		(1 << SBP_L2_BITS)
-#define SBP_L2_SHIFT		(PAGE_SHIFT + SBP_L1_BITS)
-#define SBP_L3_SHIFT		(SBP_L2_SHIFT + SBP_L2_BITS)
-
-extern void subpage_prot_free(pgd_t *pgd);
-
-static inline struct subpage_prot_table *pgd_subpage_prot(pgd_t *pgd)
-{
-	return (struct subpage_prot_table *)(pgd + PTRS_PER_PGD);
-}
-#endif /* CONFIG_PPC_SUBPAGE_PROT */
 #endif	/* __ASSEMBLY__ */
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 6810128aba30..50f867d657df 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -835,9 +835,9 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
  * Result is 0: full permissions, _PAGE_RW: read-only,
  * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access.
  */
-static int subpage_protection(pgd_t *pgdir, unsigned long ea)
+static int subpage_protection(struct mm_struct *mm, unsigned long ea)
 {
-	struct subpage_prot_table *spt = pgd_subpage_prot(pgdir);
+	struct subpage_prot_table *spt = &mm->context.spt;
 	u32 spp = 0;
 	u32 **sbpm, *sbpp;
 
@@ -865,7 +865,7 @@ static int subpage_protection(pgd_t *pgdir, unsigned long ea)
 }
 
 #else /* CONFIG_PPC_SUBPAGE_PROT */
-static inline int subpage_protection(pgd_t *pgdir, unsigned long ea)
+static inline int subpage_protection(struct mm_struct *mm, unsigned long ea)
 {
 	return 0;
 }
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
index b9e4cc2c2057..b910d37aea1a 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -76,6 +76,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 	 */
 	if (slice_mm_new_context(mm))
 		slice_set_user_psize(mm, mmu_virtual_psize);
+	subpage_prot_init_new_context(mm);
 	mm->context.id = index;
 
 	return 0;
@@ -92,5 +93,6 @@ EXPORT_SYMBOL_GPL(__destroy_context);
 void destroy_context(struct mm_struct *mm)
 {
 	__destroy_context(mm->context.id);
+	subpage_prot_free(mm);
 	mm->context.id = NO_CONTEXT;
 }
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 4cafc0c33d0a..a040b81e93bd 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -24,9 +24,9 @@
  * Also makes sure that the subpage_prot_table structure is
  * reinitialized for the next user.
  */
-void subpage_prot_free(pgd_t *pgd)
+void subpage_prot_free(struct mm_struct *mm)
 {
-	struct subpage_prot_table *spt = pgd_subpage_prot(pgd);
+	struct subpage_prot_table *spt = &mm->context.spt;
 	unsigned long i, j, addr;
 	u32 **p;
 
@@ -51,6 +51,13 @@ void subpage_prot_free(pgd_t *pgd)
 	spt->maxaddr = 0;
 }
 
+void subpage_prot_init_new_context(struct mm_struct *mm)
+{
+	struct subpage_prot_table *spt = &mm->context.spt;
+
+	memset(spt, 0, sizeof(*spt));
+}
+
 static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
 			     int npages)
 {
@@ -87,7 +94,7 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
 static void subpage_prot_clear(unsigned long addr, unsigned long len)
 {
 	struct mm_struct *mm = current->mm;
-	struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd);
+	struct subpage_prot_table *spt = &mm->context.spt;
 	u32 **spm, *spp;
 	int i, nw;
 	unsigned long next, limit;
@@ -136,7 +143,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
 long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
 {
 	struct mm_struct *mm = current->mm;
-	struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd);
+	struct subpage_prot_table *spt = &mm->context.spt;
 	u32 **spm, *spp;
 	int i, nw;
 	unsigned long next, limit;
-- 
cgit v1.2.3-59-g8ed1b


From 5a7b4193e564d1611ecf1cd859aed60d5612d78f Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 2 Dec 2009 09:28:35 +1100
Subject: Revert "powerpc/mm: Fix bug in pagetable cache cleanup with
 CONFIG_PPC_SUBPAGE_PROT"

This reverts commit c045256d146800ea1d741a8e9e377dada6b7e195.

It breaks build when CONFIG_PPC_SUBPAGE_PROT is not set. I will
commit a fixed version separately

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/mmu-hash64.h     | 35 -----------------------------
 arch/powerpc/include/asm/pgalloc-64.h     |  5 +++++
 arch/powerpc/include/asm/pte-hash64-64k.h | 37 +++++++++++++++++++++++++++++++
 arch/powerpc/mm/hash_utils_64.c           |  6 ++---
 arch/powerpc/mm/mmu_context_hash64.c      |  2 --
 arch/powerpc/mm/subpage-prot.c            | 15 ++++---------
 6 files changed, 49 insertions(+), 51 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 9d9551840f4a..7514ec2f8540 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -373,38 +373,6 @@ extern void slb_set_size(u16 size);
 
 #ifndef __ASSEMBLY__
 
-#ifdef CONFIG_PPC_SUBPAGE_PROT
-/*
- * For the sub-page protection option, we extend the PGD with one of
- * these.  Basically we have a 3-level tree, with the top level being
- * the protptrs array.  To optimize speed and memory consumption when
- * only addresses < 4GB are being protected, pointers to the first
- * four pages of sub-page protection words are stored in the low_prot
- * array.
- * Each page of sub-page protection words protects 1GB (4 bytes
- * protects 64k).  For the 3-level tree, each page of pointers then
- * protects 8TB.
- */
-struct subpage_prot_table {
-	unsigned long maxaddr;	/* only addresses < this are protected */
-	unsigned int **protptrs[2];
-	unsigned int *low_prot[4];
-};
-
-#define SBP_L1_BITS		(PAGE_SHIFT - 2)
-#define SBP_L2_BITS		(PAGE_SHIFT - 3)
-#define SBP_L1_COUNT		(1 << SBP_L1_BITS)
-#define SBP_L2_COUNT		(1 << SBP_L2_BITS)
-#define SBP_L2_SHIFT		(PAGE_SHIFT + SBP_L1_BITS)
-#define SBP_L3_SHIFT		(SBP_L2_SHIFT + SBP_L2_BITS)
-
-extern void subpage_prot_free(struct mm_struct *mm);
-extern void subpage_prot_init_new_context(struct mm_struct *mm);
-#else
-static inline void subpage_prot_free(pgd_t *pgd) {}
-static inline void subpage_prot_init_new_context(struct mm_struct *mm) { }
-#endif /* CONFIG_PPC_SUBPAGE_PROT */
-
 typedef unsigned long mm_context_id_t;
 
 typedef struct {
@@ -418,9 +386,6 @@ typedef struct {
 	u16 sllp;		/* SLB page size encoding */
 #endif
 	unsigned long vdso_base;
-#ifdef CONFIG_PPC_SUBPAGE_PROT
-	struct subpage_prot_table spt;
-#endif /* CONFIG_PPC_SUBPAGE_PROT */
 } mm_context_t;
 
 
diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h
index 605f5c5398d1..5c1cd73dafa8 100644
--- a/arch/powerpc/include/asm/pgalloc-64.h
+++ b/arch/powerpc/include/asm/pgalloc-64.h
@@ -28,6 +28,10 @@
  */
 #define MAX_PGTABLE_INDEX_SIZE	0xf
 
+#ifndef CONFIG_PPC_SUBPAGE_PROT
+static inline void subpage_prot_free(pgd_t *pgd) {}
+#endif
+
 extern struct kmem_cache *pgtable_cache[];
 #define PGT_CACHE(shift) (pgtable_cache[(shift)-1])
 
@@ -38,6 +42,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 
 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
+	subpage_prot_free(pgd);
 	kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
 }
 
diff --git a/arch/powerpc/include/asm/pte-hash64-64k.h b/arch/powerpc/include/asm/pte-hash64-64k.h
index c4490f9c67c4..82b72207c51c 100644
--- a/arch/powerpc/include/asm/pte-hash64-64k.h
+++ b/arch/powerpc/include/asm/pte-hash64-64k.h
@@ -76,4 +76,41 @@
 	remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE,		\
 			__pgprot(pgprot_val((prot)) | _PAGE_4K_PFN))
 
+
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+/*
+ * For the sub-page protection option, we extend the PGD with one of
+ * these.  Basically we have a 3-level tree, with the top level being
+ * the protptrs array.  To optimize speed and memory consumption when
+ * only addresses < 4GB are being protected, pointers to the first
+ * four pages of sub-page protection words are stored in the low_prot
+ * array.
+ * Each page of sub-page protection words protects 1GB (4 bytes
+ * protects 64k).  For the 3-level tree, each page of pointers then
+ * protects 8TB.
+ */
+struct subpage_prot_table {
+	unsigned long maxaddr;	/* only addresses < this are protected */
+	unsigned int **protptrs[2];
+	unsigned int *low_prot[4];
+};
+
+#undef PGD_TABLE_SIZE
+#define PGD_TABLE_SIZE		((sizeof(pgd_t) << PGD_INDEX_SIZE) + \
+				 sizeof(struct subpage_prot_table))
+
+#define SBP_L1_BITS		(PAGE_SHIFT - 2)
+#define SBP_L2_BITS		(PAGE_SHIFT - 3)
+#define SBP_L1_COUNT		(1 << SBP_L1_BITS)
+#define SBP_L2_COUNT		(1 << SBP_L2_BITS)
+#define SBP_L2_SHIFT		(PAGE_SHIFT + SBP_L1_BITS)
+#define SBP_L3_SHIFT		(SBP_L2_SHIFT + SBP_L2_BITS)
+
+extern void subpage_prot_free(pgd_t *pgd);
+
+static inline struct subpage_prot_table *pgd_subpage_prot(pgd_t *pgd)
+{
+	return (struct subpage_prot_table *)(pgd + PTRS_PER_PGD);
+}
+#endif /* CONFIG_PPC_SUBPAGE_PROT */
 #endif	/* __ASSEMBLY__ */
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 50f867d657df..6810128aba30 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -835,9 +835,9 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
  * Result is 0: full permissions, _PAGE_RW: read-only,
  * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access.
  */
-static int subpage_protection(struct mm_struct *mm, unsigned long ea)
+static int subpage_protection(pgd_t *pgdir, unsigned long ea)
 {
-	struct subpage_prot_table *spt = &mm->context.spt;
+	struct subpage_prot_table *spt = pgd_subpage_prot(pgdir);
 	u32 spp = 0;
 	u32 **sbpm, *sbpp;
 
@@ -865,7 +865,7 @@ static int subpage_protection(struct mm_struct *mm, unsigned long ea)
 }
 
 #else /* CONFIG_PPC_SUBPAGE_PROT */
-static inline int subpage_protection(struct mm_struct *mm, unsigned long ea)
+static inline int subpage_protection(pgd_t *pgdir, unsigned long ea)
 {
 	return 0;
 }
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
index b910d37aea1a..b9e4cc2c2057 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -76,7 +76,6 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 	 */
 	if (slice_mm_new_context(mm))
 		slice_set_user_psize(mm, mmu_virtual_psize);
-	subpage_prot_init_new_context(mm);
 	mm->context.id = index;
 
 	return 0;
@@ -93,6 +92,5 @@ EXPORT_SYMBOL_GPL(__destroy_context);
 void destroy_context(struct mm_struct *mm)
 {
 	__destroy_context(mm->context.id);
-	subpage_prot_free(mm);
 	mm->context.id = NO_CONTEXT;
 }
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index a040b81e93bd..4cafc0c33d0a 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -24,9 +24,9 @@
  * Also makes sure that the subpage_prot_table structure is
  * reinitialized for the next user.
  */
-void subpage_prot_free(struct mm_struct *mm)
+void subpage_prot_free(pgd_t *pgd)
 {
-	struct subpage_prot_table *spt = &mm->context.spt;
+	struct subpage_prot_table *spt = pgd_subpage_prot(pgd);
 	unsigned long i, j, addr;
 	u32 **p;
 
@@ -51,13 +51,6 @@ void subpage_prot_free(struct mm_struct *mm)
 	spt->maxaddr = 0;
 }
 
-void subpage_prot_init_new_context(struct mm_struct *mm)
-{
-	struct subpage_prot_table *spt = &mm->context.spt;
-
-	memset(spt, 0, sizeof(*spt));
-}
-
 static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
 			     int npages)
 {
@@ -94,7 +87,7 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
 static void subpage_prot_clear(unsigned long addr, unsigned long len)
 {
 	struct mm_struct *mm = current->mm;
-	struct subpage_prot_table *spt = &mm->context.spt;
+	struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd);
 	u32 **spm, *spp;
 	int i, nw;
 	unsigned long next, limit;
@@ -143,7 +136,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
 long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
 {
 	struct mm_struct *mm = current->mm;
-	struct subpage_prot_table *spt = &mm->context.spt;
+	struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd);
 	u32 **spm, *spp;
 	int i, nw;
 	unsigned long next, limit;
-- 
cgit v1.2.3-59-g8ed1b


From d28513bc7f675d28b479db666d572e078ecf182d Mon Sep 17 00:00:00 2001
From: David Gibson <dwg@au1.ibm.com>
Date: Thu, 26 Nov 2009 18:56:04 +0000
Subject: powerpc/mm: Fix pgtable cache cleanup with CONFIG_PPC_SUBPAGE_PROT

Commit a0668cdc154e54bf0c85182e0535eea237d53146 cleans up the handling
of kmem_caches for allocating various levels of pagetables.
Unfortunately, it conflicts badly with CONFIG_PPC_SUBPAGE_PROT, due to
the latter's cleverly hidden technique of adding some extra allocation
space to the top level page directory to store the extra information
it needs.

Since that extra allocation really doesn't fit into the cleaned up
page directory allocating scheme, this patch alters
CONFIG_PPC_SUBPAGE_PROT to instead allocate its struct
subpage_prot_table as part of the mm_context_t.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/mmu-hash64.h     | 35 +++++++++++++++++++++++++++++
 arch/powerpc/include/asm/pgalloc-64.h     |  5 -----
 arch/powerpc/include/asm/pte-hash64-64k.h | 37 -------------------------------
 arch/powerpc/mm/hash_utils_64.c           |  6 ++---
 arch/powerpc/mm/mmu_context_hash64.c      |  2 ++
 arch/powerpc/mm/subpage-prot.c            | 15 +++++++++----
 6 files changed, 51 insertions(+), 49 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 7514ec2f8540..2102b214a87c 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -373,6 +373,38 @@ extern void slb_set_size(u16 size);
 
 #ifndef __ASSEMBLY__
 
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+/*
+ * For the sub-page protection option, we extend the PGD with one of
+ * these.  Basically we have a 3-level tree, with the top level being
+ * the protptrs array.  To optimize speed and memory consumption when
+ * only addresses < 4GB are being protected, pointers to the first
+ * four pages of sub-page protection words are stored in the low_prot
+ * array.
+ * Each page of sub-page protection words protects 1GB (4 bytes
+ * protects 64k).  For the 3-level tree, each page of pointers then
+ * protects 8TB.
+ */
+struct subpage_prot_table {
+	unsigned long maxaddr;	/* only addresses < this are protected */
+	unsigned int **protptrs[2];
+	unsigned int *low_prot[4];
+};
+
+#define SBP_L1_BITS		(PAGE_SHIFT - 2)
+#define SBP_L2_BITS		(PAGE_SHIFT - 3)
+#define SBP_L1_COUNT		(1 << SBP_L1_BITS)
+#define SBP_L2_COUNT		(1 << SBP_L2_BITS)
+#define SBP_L2_SHIFT		(PAGE_SHIFT + SBP_L1_BITS)
+#define SBP_L3_SHIFT		(SBP_L2_SHIFT + SBP_L2_BITS)
+
+extern void subpage_prot_free(struct mm_struct *mm);
+extern void subpage_prot_init_new_context(struct mm_struct *mm);
+#else
+static inline void subpage_prot_free(struct mm_struct *mm) {}
+static inline void subpage_prot_init_new_context(struct mm_struct *mm) { }
+#endif /* CONFIG_PPC_SUBPAGE_PROT */
+
 typedef unsigned long mm_context_id_t;
 
 typedef struct {
@@ -386,6 +418,9 @@ typedef struct {
 	u16 sllp;		/* SLB page size encoding */
 #endif
 	unsigned long vdso_base;
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+	struct subpage_prot_table spt;
+#endif /* CONFIG_PPC_SUBPAGE_PROT */
 } mm_context_t;
 
 
diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h
index 5c1cd73dafa8..605f5c5398d1 100644
--- a/arch/powerpc/include/asm/pgalloc-64.h
+++ b/arch/powerpc/include/asm/pgalloc-64.h
@@ -28,10 +28,6 @@
  */
 #define MAX_PGTABLE_INDEX_SIZE	0xf
 
-#ifndef CONFIG_PPC_SUBPAGE_PROT
-static inline void subpage_prot_free(pgd_t *pgd) {}
-#endif
-
 extern struct kmem_cache *pgtable_cache[];
 #define PGT_CACHE(shift) (pgtable_cache[(shift)-1])
 
@@ -42,7 +38,6 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 
 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-	subpage_prot_free(pgd);
 	kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
 }
 
diff --git a/arch/powerpc/include/asm/pte-hash64-64k.h b/arch/powerpc/include/asm/pte-hash64-64k.h
index 82b72207c51c..c4490f9c67c4 100644
--- a/arch/powerpc/include/asm/pte-hash64-64k.h
+++ b/arch/powerpc/include/asm/pte-hash64-64k.h
@@ -76,41 +76,4 @@
 	remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE,		\
 			__pgprot(pgprot_val((prot)) | _PAGE_4K_PFN))
 
-
-#ifdef CONFIG_PPC_SUBPAGE_PROT
-/*
- * For the sub-page protection option, we extend the PGD with one of
- * these.  Basically we have a 3-level tree, with the top level being
- * the protptrs array.  To optimize speed and memory consumption when
- * only addresses < 4GB are being protected, pointers to the first
- * four pages of sub-page protection words are stored in the low_prot
- * array.
- * Each page of sub-page protection words protects 1GB (4 bytes
- * protects 64k).  For the 3-level tree, each page of pointers then
- * protects 8TB.
- */
-struct subpage_prot_table {
-	unsigned long maxaddr;	/* only addresses < this are protected */
-	unsigned int **protptrs[2];
-	unsigned int *low_prot[4];
-};
-
-#undef PGD_TABLE_SIZE
-#define PGD_TABLE_SIZE		((sizeof(pgd_t) << PGD_INDEX_SIZE) + \
-				 sizeof(struct subpage_prot_table))
-
-#define SBP_L1_BITS		(PAGE_SHIFT - 2)
-#define SBP_L2_BITS		(PAGE_SHIFT - 3)
-#define SBP_L1_COUNT		(1 << SBP_L1_BITS)
-#define SBP_L2_COUNT		(1 << SBP_L2_BITS)
-#define SBP_L2_SHIFT		(PAGE_SHIFT + SBP_L1_BITS)
-#define SBP_L3_SHIFT		(SBP_L2_SHIFT + SBP_L2_BITS)
-
-extern void subpage_prot_free(pgd_t *pgd);
-
-static inline struct subpage_prot_table *pgd_subpage_prot(pgd_t *pgd)
-{
-	return (struct subpage_prot_table *)(pgd + PTRS_PER_PGD);
-}
-#endif /* CONFIG_PPC_SUBPAGE_PROT */
 #endif	/* __ASSEMBLY__ */
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 6810128aba30..50f867d657df 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -835,9 +835,9 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
  * Result is 0: full permissions, _PAGE_RW: read-only,
  * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access.
  */
-static int subpage_protection(pgd_t *pgdir, unsigned long ea)
+static int subpage_protection(struct mm_struct *mm, unsigned long ea)
 {
-	struct subpage_prot_table *spt = pgd_subpage_prot(pgdir);
+	struct subpage_prot_table *spt = &mm->context.spt;
 	u32 spp = 0;
 	u32 **sbpm, *sbpp;
 
@@ -865,7 +865,7 @@ static int subpage_protection(pgd_t *pgdir, unsigned long ea)
 }
 
 #else /* CONFIG_PPC_SUBPAGE_PROT */
-static inline int subpage_protection(pgd_t *pgdir, unsigned long ea)
+static inline int subpage_protection(struct mm_struct *mm, unsigned long ea)
 {
 	return 0;
 }
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
index b9e4cc2c2057..b910d37aea1a 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -76,6 +76,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 	 */
 	if (slice_mm_new_context(mm))
 		slice_set_user_psize(mm, mmu_virtual_psize);
+	subpage_prot_init_new_context(mm);
 	mm->context.id = index;
 
 	return 0;
@@ -92,5 +93,6 @@ EXPORT_SYMBOL_GPL(__destroy_context);
 void destroy_context(struct mm_struct *mm)
 {
 	__destroy_context(mm->context.id);
+	subpage_prot_free(mm);
 	mm->context.id = NO_CONTEXT;
 }
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 4cafc0c33d0a..a040b81e93bd 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -24,9 +24,9 @@
  * Also makes sure that the subpage_prot_table structure is
  * reinitialized for the next user.
  */
-void subpage_prot_free(pgd_t *pgd)
+void subpage_prot_free(struct mm_struct *mm)
 {
-	struct subpage_prot_table *spt = pgd_subpage_prot(pgd);
+	struct subpage_prot_table *spt = &mm->context.spt;
 	unsigned long i, j, addr;
 	u32 **p;
 
@@ -51,6 +51,13 @@ void subpage_prot_free(pgd_t *pgd)
 	spt->maxaddr = 0;
 }
 
+void subpage_prot_init_new_context(struct mm_struct *mm)
+{
+	struct subpage_prot_table *spt = &mm->context.spt;
+
+	memset(spt, 0, sizeof(*spt));
+}
+
 static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
 			     int npages)
 {
@@ -87,7 +94,7 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
 static void subpage_prot_clear(unsigned long addr, unsigned long len)
 {
 	struct mm_struct *mm = current->mm;
-	struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd);
+	struct subpage_prot_table *spt = &mm->context.spt;
 	u32 **spm, *spp;
 	int i, nw;
 	unsigned long next, limit;
@@ -136,7 +143,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
 long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
 {
 	struct mm_struct *mm = current->mm;
-	struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd);
+	struct subpage_prot_table *spt = &mm->context.spt;
 	u32 **spm, *spp;
 	int i, nw;
 	unsigned long next, limit;
-- 
cgit v1.2.3-59-g8ed1b


From e15a113700324f7fdcee95589875daed2b98a2fe Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 30 Nov 2009 03:02:02 +0000
Subject: powerpc/kvm: Sync guest visible MMU state

Currently userspace has no chance to find out which virtual address space we're
in and resolve addresses. While that is a big problem for migration, it's also
unpleasent when debugging, as gdb and the monitor don't work on virtual
addresses.

This patch exports enough of the MMU segment state to userspace to make
debugging work and thus also includes the groundwork for migration.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/kvm.h        | 18 ++++++++++++-
 arch/powerpc/include/asm/kvm_asm.h    |  1 +
 arch/powerpc/include/asm/kvm_book3s.h |  3 +++
 arch/powerpc/kvm/book3s.c             | 49 +++++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_64_emulate.c  | 38 ++++++++++++++++-----------
 arch/powerpc/kvm/book3s_64_mmu.c      |  2 ++
 arch/powerpc/kvm/powerpc.c            |  3 +++
 include/linux/kvm.h                   |  3 +++
 8 files changed, 101 insertions(+), 16 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index c9ca97f43bc1..81f3b0b5601e 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -47,7 +47,23 @@ struct kvm_regs {
 
 struct kvm_sregs {
 	__u32 pvr;
-	char pad[1020];
+	union {
+		struct {
+			__u64 sdr1;
+			struct {
+				struct {
+					__u64 slbe;
+					__u64 slbv;
+				} slb[64];
+			} ppc64;
+			struct {
+				__u32 sr[16];
+				__u64 ibat[8]; 
+				__u64 dbat[8]; 
+			} ppc32;
+		} s;
+		__u8 pad[1020];
+	} u;
 };
 
 struct kvm_fpu {
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 19ddb352fd0f..af2abe74f544 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -87,6 +87,7 @@
 #define BOOK3S_IRQPRIO_MAX			16
 
 #define BOOK3S_HFLAG_DCBZ32			0x1
+#define BOOK3S_HFLAG_SLB			0x2
 
 #define RESUME_FLAG_NV          (1<<0)  /* Reload guest nonvolatile state? */
 #define RESUME_FLAG_HOST        (1<<1)  /* Resume host? */
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index c6011336371e..74b7369770d0 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -46,6 +46,7 @@ struct kvmppc_sr {
 };
 
 struct kvmppc_bat {
+	u64 raw;
 	u32 bepi;
 	u32 bepi_mask;
 	bool vs;
@@ -113,6 +114,8 @@ extern struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, boo
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong eaddr, int size, void *ptr);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
+extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
+			   bool upper, u32 val);
 
 extern u32 kvmppc_trampoline_lowmem;
 extern u32 kvmppc_trampoline_enter;
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 42037d46a416..3e294bd9b8c6 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -281,6 +281,7 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 
 void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
 {
+	vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB;
 	vcpu->arch.pvr = pvr;
 	if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
 		kvmppc_mmu_book3s_64_init(vcpu);
@@ -762,14 +763,62 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                                   struct kvm_sregs *sregs)
 {
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	int i;
+
 	sregs->pvr = vcpu->arch.pvr;
+
+	sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
+		for (i = 0; i < 64; i++) {
+			sregs->u.s.ppc64.slb[i].slbe = vcpu3s->slb[i].orige | i;
+			sregs->u.s.ppc64.slb[i].slbv = vcpu3s->slb[i].origv;
+		}
+	} else {
+		for (i = 0; i < 16; i++) {
+			sregs->u.s.ppc32.sr[i] = vcpu3s->sr[i].raw;
+			sregs->u.s.ppc32.sr[i] = vcpu3s->sr[i].raw;
+		}
+		for (i = 0; i < 8; i++) {
+			sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw;
+			sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw;
+		}
+	}
 	return 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
                                   struct kvm_sregs *sregs)
 {
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	int i;
+
 	kvmppc_set_pvr(vcpu, sregs->pvr);
+
+	vcpu3s->sdr1 = sregs->u.s.sdr1;
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
+		for (i = 0; i < 64; i++) {
+			vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv,
+						    sregs->u.s.ppc64.slb[i].slbe);
+		}
+	} else {
+		for (i = 0; i < 16; i++) {
+			vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]);
+		}
+		for (i = 0; i < 8; i++) {
+			kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), false,
+				       (u32)sregs->u.s.ppc32.ibat[i]);
+			kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), true,
+				       (u32)(sregs->u.s.ppc32.ibat[i] >> 32));
+			kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), false,
+				       (u32)sregs->u.s.ppc32.dbat[i]);
+			kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), true,
+				       (u32)(sregs->u.s.ppc32.dbat[i] >> 32));
+		}
+	}
+
+	/* Flush the MMU after messing with the segments */
+	kvmppc_mmu_pte_flush(vcpu, 0, 0);
 	return 0;
 }
 
diff --git a/arch/powerpc/kvm/book3s_64_emulate.c b/arch/powerpc/kvm/book3s_64_emulate.c
index c343e67306e0..1027eac6d474 100644
--- a/arch/powerpc/kvm/book3s_64_emulate.c
+++ b/arch/powerpc/kvm/book3s_64_emulate.c
@@ -185,7 +185,27 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	return emulated;
 }
 
-static void kvmppc_write_bat(struct kvm_vcpu *vcpu, int sprn, u64 val)
+void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, bool upper,
+                    u32 val)
+{
+	if (upper) {
+		/* Upper BAT */
+		u32 bl = (val >> 2) & 0x7ff;
+		bat->bepi_mask = (~bl << 17);
+		bat->bepi = val & 0xfffe0000;
+		bat->vs = (val & 2) ? 1 : 0;
+		bat->vp = (val & 1) ? 1 : 0;
+		bat->raw = (bat->raw & 0xffffffff00000000ULL) | val;
+	} else {
+		/* Lower BAT */
+		bat->brpn = val & 0xfffe0000;
+		bat->wimg = (val >> 3) & 0xf;
+		bat->pp = val & 3;
+		bat->raw = (bat->raw & 0x00000000ffffffffULL) | ((u64)val << 32);
+	}
+}
+
+static void kvmppc_write_bat(struct kvm_vcpu *vcpu, int sprn, u32 val)
 {
 	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 	struct kvmppc_bat *bat;
@@ -207,19 +227,7 @@ static void kvmppc_write_bat(struct kvm_vcpu *vcpu, int sprn, u64 val)
 		BUG();
 	}
 
-	if (!(sprn % 2)) {
-		/* Upper BAT */
-		u32 bl = (val >> 2) & 0x7ff;
-		bat->bepi_mask = (~bl << 17);
-		bat->bepi = val & 0xfffe0000;
-		bat->vs = (val & 2) ? 1 : 0;
-		bat->vp = (val & 1) ? 1 : 0;
-	} else {
-		/* Lower BAT */
-		bat->brpn = val & 0xfffe0000;
-		bat->wimg = (val >> 3) & 0xf;
-		bat->pp = val & 3;
-	}
+	kvmppc_set_bat(vcpu, bat, !(sprn % 2), val);
 }
 
 int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
@@ -243,7 +251,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 	case SPRN_IBAT4U ... SPRN_IBAT7L:
 	case SPRN_DBAT0U ... SPRN_DBAT3L:
 	case SPRN_DBAT4U ... SPRN_DBAT7L:
-		kvmppc_write_bat(vcpu, sprn, vcpu->arch.gpr[rs]);
+		kvmppc_write_bat(vcpu, sprn, (u32)vcpu->arch.gpr[rs]);
 		/* BAT writes happen so rarely that we're ok to flush
 		 * everything here */
 		kvmppc_mmu_pte_flush(vcpu, 0, 0);
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index a31f9c677d23..5598f88f142e 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -473,4 +473,6 @@ void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu)
 	mmu->esid_to_vsid = kvmppc_mmu_book3s_64_esid_to_vsid;
 	mmu->ea_to_vp = kvmppc_mmu_book3s_64_ea_to_vp;
 	mmu->is_dcbz32 = kvmppc_mmu_book3s_64_is_dcbz32;
+
+	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
 }
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 692c3709011e..d82551efbfbf 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -144,6 +144,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	int r;
 
 	switch (ext) {
+	case KVM_CAP_PPC_SEGSTATE:
+		r = 1;
+		break;
 	case KVM_CAP_COALESCED_MMIO:
 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 		break;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index f8f8900fc5ec..caf6173bd2e8 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -436,6 +436,9 @@ struct kvm_ioeventfd {
 #endif
 #define KVM_CAP_IOEVENTFD 36
 #define KVM_CAP_SET_IDENTITY_MAP_ADDR 37
+/* KVM upstream has more features, but we synched this number.
+   Linux, please remove this comment on rebase. */
+#define KVM_CAP_PPC_SEGSTATE 43
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3-59-g8ed1b


From 7fb19ea054a0cdf1a4d935e68d51bde4d3725414 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Tue, 1 Dec 2009 14:36:26 +0000
Subject: powerpc/macio: Add devres support to macio_device

This adds some basic devres support. When enabled via macio_enable_devres()
resources requested by drivers will be automatically released.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/macio.h |  2 ++
 drivers/macintosh/macio_asic.c   | 47 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/macio.h b/arch/powerpc/include/asm/macio.h
index 079c06eae446..2b7b39294a6a 100644
--- a/arch/powerpc/include/asm/macio.h
+++ b/arch/powerpc/include/asm/macio.h
@@ -78,6 +78,8 @@ static inline unsigned long macio_resource_len(struct macio_dev *dev, int resour
 	return res->end - res->start + 1;
 }
 
+extern int macio_enable_devres(struct macio_dev *dev);
+
 extern int macio_request_resource(struct macio_dev *dev, int resource_no, const char *name);
 extern void macio_release_resource(struct macio_dev *dev, int resource_no);
 extern int macio_request_resources(struct macio_dev *dev, const char *name);
diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c
index 588a5b0bc4b5..5200acfc9d38 100644
--- a/drivers/macintosh/macio_asic.c
+++ b/drivers/macintosh/macio_asic.c
@@ -538,6 +538,42 @@ void macio_unregister_driver(struct macio_driver *drv)
 	driver_unregister(&drv->driver);
 }
 
+/* Managed MacIO resources */
+struct macio_devres {
+	u32	res_mask;
+};
+
+static void maciom_release(struct device *gendev, void *res)
+{
+	struct macio_dev *dev = to_macio_device(gendev);
+	struct macio_devres *dr = res;
+	int i, max;
+
+	max = min(dev->n_resources, 32);
+	for (i = 0; i < max; i++) {
+		if (dr->res_mask & (1 << i))
+			macio_release_resource(dev, i);
+	}
+}
+
+int macio_enable_devres(struct macio_dev *dev)
+{
+	struct macio_devres *dr;
+
+	dr = devres_find(&dev->ofdev.dev, maciom_release, NULL, NULL);
+	if (!dr) {
+		dr = devres_alloc(maciom_release, sizeof(*dr), GFP_KERNEL);
+		if (!dr)
+			return -ENOMEM;
+	}
+	return devres_get(&dev->ofdev.dev, dr, NULL, NULL) != NULL;
+}
+
+static struct macio_devres * find_macio_dr(struct macio_dev *dev)
+{
+	return devres_find(&dev->ofdev.dev, maciom_release, NULL, NULL);
+}
+
 /**
  *	macio_request_resource - Request an MMIO resource
  * 	@dev: pointer to the device holding the resource
@@ -555,6 +591,8 @@ void macio_unregister_driver(struct macio_driver *drv)
 int macio_request_resource(struct macio_dev *dev, int resource_no,
 			   const char *name)
 {
+	struct macio_devres *dr = find_macio_dr(dev);
+
 	if (macio_resource_len(dev, resource_no) == 0)
 		return 0;
 		
@@ -562,6 +600,9 @@ int macio_request_resource(struct macio_dev *dev, int resource_no,
 				macio_resource_len(dev, resource_no),
 				name))
 		goto err_out;
+
+	if (dr && resource_no < 32)
+		dr->res_mask |= 1 << resource_no;
 	
 	return 0;
 
@@ -582,10 +623,14 @@ err_out:
  */
 void macio_release_resource(struct macio_dev *dev, int resource_no)
 {
+	struct macio_devres *dr = find_macio_dr(dev);
+
 	if (macio_resource_len(dev, resource_no) == 0)
 		return;
 	release_mem_region(macio_resource_start(dev, resource_no),
 			   macio_resource_len(dev, resource_no));
+	if (dr && resource_no < 32)
+		dr->res_mask &= ~(1 << resource_no);
 }
 
 /**
@@ -744,3 +789,5 @@ EXPORT_SYMBOL(macio_request_resource);
 EXPORT_SYMBOL(macio_release_resource);
 EXPORT_SYMBOL(macio_request_resources);
 EXPORT_SYMBOL(macio_release_resources);
+EXPORT_SYMBOL(macio_enable_devres);
+
-- 
cgit v1.2.3-59-g8ed1b


From 128b4a0ef74e8d48033513e41a413087ba30e36b Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Tue, 1 Dec 2009 14:36:27 +0000
Subject: powerpc/macio: Add dma_parms support to macio

This adds dma_parms to macio devices and initializes them with
default values. This will allow pata_macio to setup the appropriate
max segment size for the block layer.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/macio.h | 1 +
 drivers/macintosh/macio_asic.c   | 5 +++++
 2 files changed, 6 insertions(+)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/macio.h b/arch/powerpc/include/asm/macio.h
index 2b7b39294a6a..86d5fed1c49f 100644
--- a/arch/powerpc/include/asm/macio.h
+++ b/arch/powerpc/include/asm/macio.h
@@ -39,6 +39,7 @@ struct macio_dev
 	struct macio_bus	*bus;		/* macio bus this device is on */
 	struct macio_dev	*media_bay;	/* Device is part of a media bay */
 	struct of_device	ofdev;
+	struct device_dma_parameters dma_parms; /* ide needs that */
 	int			n_resources;
 	struct resource		resource[MACIO_DEV_COUNT_RESOURCES];
 	int			n_interrupts;
diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c
index 5200acfc9d38..26a303a1d1ab 100644
--- a/drivers/macintosh/macio_asic.c
+++ b/drivers/macintosh/macio_asic.c
@@ -379,6 +379,11 @@ static struct macio_dev * macio_add_one_device(struct macio_chip *chip,
 	dev->ofdev.dev.parent = parent;
 	dev->ofdev.dev.bus = &macio_bus_type;
 	dev->ofdev.dev.release = macio_release_dev;
+	dev->ofdev.dev.dma_parms = &dev->dma_parms;
+
+	/* Standard DMA paremeters */
+	dma_set_max_seg_size(&dev->ofdev.dev, 65536);
+	dma_set_seg_boundary(&dev->ofdev.dev, 0xffffffff);
 
 #ifdef CONFIG_PCI
 	/* Set the DMA ops to the ones from the PCI device, this could be
-- 
cgit v1.2.3-59-g8ed1b


From d58b0c39e32f1b410af4d070f9d1a1416057c166 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Tue, 1 Dec 2009 14:36:28 +0000
Subject: powerpc/macio: Rework hotplug media bay support

The hotplug mediabay has tendrils deep into drivers/ide code
which makes a libata port reather difficult. In addition it's
ugly and could be done better.

This reworks the interface between the mediabay and the rest
of the world so that:

   - Any macio_driver can now have a mediabay_event callback
which will be called when that driver sits on a mediabay and
it's been either plugged or unplugged. The device type is
passed as an argument. We can now move all the IDE cruft
into the IDE driver itself

   - A check_media_bay() function can be used to take a peek
at the type of device currently in the bay if any, a cleaner
variant of the previous function with the same name.

   - A pair of lock/unlock functions are exposed to allow the
IDE driver to block the hotplug callbacks during the initial
setup and probing of the bay in order to avoid nasty race
conditions.

   - The mediabay code no longer needs to spin on the status
register of the IDE interface when it detects an IDE device,
this is done just fine by the IDE code itself

Overall, less code, simpler, and allows for another driver
than our old drivers/ide based one.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/macio.h    |   3 +
 arch/powerpc/include/asm/mediabay.h |  27 +--
 drivers/block/swim3.c               |  39 ++---
 drivers/ide/pmac.c                  |  92 ++++++----
 drivers/macintosh/mediabay.c        | 328 ++++++++++++++----------------------
 5 files changed, 214 insertions(+), 275 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/macio.h b/arch/powerpc/include/asm/macio.h
index 86d5fed1c49f..a062c57696d0 100644
--- a/arch/powerpc/include/asm/macio.h
+++ b/arch/powerpc/include/asm/macio.h
@@ -134,6 +134,9 @@ struct macio_driver
 	int	(*resume)(struct macio_dev* dev);
 	int	(*shutdown)(struct macio_dev* dev);
 
+#ifdef CONFIG_PMAC_MEDIABAY
+	void	(*mediabay_event)(struct macio_dev* dev, int mb_state);
+#endif
 	struct device_driver	driver;
 };
 #define	to_macio_driver(drv) container_of(drv,struct macio_driver, driver)
diff --git a/arch/powerpc/include/asm/mediabay.h b/arch/powerpc/include/asm/mediabay.h
index b2efb3325808..11037a4133ee 100644
--- a/arch/powerpc/include/asm/mediabay.h
+++ b/arch/powerpc/include/asm/mediabay.h
@@ -17,26 +17,31 @@
 #define MB_POWER	6	/* media bay contains a Power device (???) */
 #define MB_NO		7	/* media bay contains nothing */
 
-/* Number of bays in the machine or 0 */
-extern int media_bay_count;
+struct macio_dev;
 
-#ifdef CONFIG_BLK_DEV_IDE_PMAC
-#include <linux/ide.h>
+#ifdef CONFIG_PMAC_MEDIABAY
 
-int check_media_bay_by_base(unsigned long base, int what);
-/* called by IDE PMAC host driver to register IDE controller for media bay */
-int media_bay_set_ide_infos(struct device_node *which_bay, unsigned long base,
-			    int irq, ide_hwif_t *hwif);
+/* Check the content type of the bay, returns MB_NO if the bay is still
+ * transitionning
+ */
+extern int check_media_bay(struct macio_dev *bay);
 
-int check_media_bay(struct device_node *which_bay, int what);
+/* The ATA driver uses the calls below to temporarily hold on the
+ * media bay callbacks while initializing the interface
+ */
+extern void lock_media_bay(struct macio_dev *bay);
+extern void unlock_media_bay(struct macio_dev *bay);
 
 #else
 
-static inline int check_media_bay(struct device_node *which_bay, int what)
+static inline int check_media_bay(struct macio_dev *bay)
 {
-	return -ENODEV;
+	return MB_NO;
 }
 
+static inline void lock_media_bay(struct macio_dev *bay) { }
+static inline void unlock_media_bay(struct macio_dev *bay) { }
+
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 6380ad8d91bd..59ca2b77b574 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -200,7 +200,7 @@ struct floppy_state {
 	int	ejected;
 	wait_queue_head_t wait;
 	int	wanted;
-	struct device_node*	media_bay; /* NULL when not in bay */
+	struct macio_dev *mdev;
 	char	dbdma_cmd_space[5 * sizeof(struct dbdma_cmd)];
 };
 
@@ -303,14 +303,13 @@ static int swim3_readbit(struct floppy_state *fs, int bit)
 static void do_fd_request(struct request_queue * q)
 {
 	int i;
-	for(i=0;i<floppy_count;i++)
-	{
-#ifdef CONFIG_PMAC_MEDIABAY
-		if (floppy_states[i].media_bay &&
-			check_media_bay(floppy_states[i].media_bay, MB_FD))
+
+	for(i=0; i<floppy_count; i++) {
+		struct floppy_state *fs = &floppy_states[i];
+		if (fs->mdev->media_bay &&
+		    check_media_bay(fs->mdev->media_bay) != MB_FD)
 			continue;
-#endif /* CONFIG_PMAC_MEDIABAY */
-		start_request(&floppy_states[i]);
+		start_request(fs);
 	}
 }
 
@@ -849,10 +848,9 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
 	if ((cmd & 0x80) && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-#ifdef CONFIG_PMAC_MEDIABAY
-	if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD))
+	if (fs->mdev->media_bay &&
+	    check_media_bay(fs->mdev->media_bay) != MB_FD)
 		return -ENXIO;
-#endif
 
 	switch (cmd) {
 	case FDEJECT:
@@ -876,10 +874,9 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 	int n, err = 0;
 
 	if (fs->ref_count == 0) {
-#ifdef CONFIG_PMAC_MEDIABAY
-		if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD))
+		if (fs->mdev->media_bay &&
+		    check_media_bay(fs->mdev->media_bay) != MB_FD)
 			return -ENXIO;
-#endif
 		out_8(&sw->setup, S_IBM_DRIVE | S_FCLK_DIV2);
 		out_8(&sw->control_bic, 0xff);
 		out_8(&sw->mode, 0x95);
@@ -963,10 +960,9 @@ static int floppy_revalidate(struct gendisk *disk)
 	struct swim3 __iomem *sw;
 	int ret, n;
 
-#ifdef CONFIG_PMAC_MEDIABAY
-	if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD))
+	if (fs->mdev->media_bay &&
+	    check_media_bay(fs->mdev->media_bay) != MB_FD)
 		return -ENXIO;
-#endif
 
 	sw = fs->swim3;
 	grab_drive(fs, revalidating, 0);
@@ -1009,7 +1005,6 @@ static const struct block_device_operations floppy_fops = {
 static int swim3_add_device(struct macio_dev *mdev, int index)
 {
 	struct device_node *swim = mdev->ofdev.node;
-	struct device_node *mediabay;
 	struct floppy_state *fs = &floppy_states[index];
 	int rc = -EBUSY;
 
@@ -1036,9 +1031,7 @@ static int swim3_add_device(struct macio_dev *mdev, int index)
 	}
 	dev_set_drvdata(&mdev->ofdev.dev, fs);
 
-	mediabay = (strcasecmp(swim->parent->type, "media-bay") == 0) ?
-		swim->parent : NULL;
-	if (mediabay == NULL)
+	if (mdev->media_bay == NULL)
 		pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 1);
 	
 	memset(fs, 0, sizeof(*fs));
@@ -1068,7 +1061,7 @@ static int swim3_add_device(struct macio_dev *mdev, int index)
 	fs->secpercyl = 36;
 	fs->secpertrack = 18;
 	fs->total_secs = 2880;
-	fs->media_bay = mediabay;
+	fs->mdev = mdev;
 	init_waitqueue_head(&fs->wait);
 
 	fs->dma_cmd = (struct dbdma_cmd *) DBDMA_ALIGN(fs->dbdma_cmd_space);
@@ -1093,7 +1086,7 @@ static int swim3_add_device(struct macio_dev *mdev, int index)
 	init_timer(&fs->timeout);
 
 	printk(KERN_INFO "fd%d: SWIM3 floppy controller %s\n", floppy_count,
-		mediabay ? "in media bay" : "");
+		mdev->media_bay ? "in media bay" : "");
 
 	return 0;
 
diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c
index 97642a7a79c4..7a4e788cab2f 100644
--- a/drivers/ide/pmac.c
+++ b/drivers/ide/pmac.c
@@ -43,10 +43,7 @@
 #include <asm/pmac_feature.h>
 #include <asm/sections.h>
 #include <asm/irq.h>
-
-#ifndef CONFIG_PPC64
 #include <asm/mediabay.h>
-#endif
 
 #define DRV_NAME "ide-pmac"
 
@@ -59,13 +56,14 @@ typedef struct pmac_ide_hwif {
 	int				irq;
 	int				kind;
 	int				aapl_bus_id;
-	unsigned			mediabay : 1;
 	unsigned			broken_dma : 1;
 	unsigned			broken_dma_warn : 1;
 	struct device_node*		node;
 	struct macio_dev		*mdev;
 	u32				timings[4];
 	volatile u32 __iomem *		*kauai_fcr;
+	ide_hwif_t			*hwif;
+
 	/* Those fields are duplicating what is in hwif. We currently
 	 * can't use the hwif ones because of some assumptions that are
 	 * beeing done by the generic code about the kind of dma controller
@@ -854,6 +852,11 @@ sanitize_timings(pmac_ide_hwif_t *pmif)
 	pmif->timings[2] = pmif->timings[3] = value2;
 }
 
+static int on_media_bay(pmac_ide_hwif_t *pmif)
+{
+	return pmif->mdev && pmif->mdev->media_bay != NULL;
+}
+
 /* Suspend call back, should be called after the child devices
  * have actually been suspended
  */
@@ -866,7 +869,7 @@ static int pmac_ide_do_suspend(pmac_ide_hwif_t *pmif)
 	disable_irq(pmif->irq);
 
 	/* The media bay will handle itself just fine */
-	if (pmif->mediabay)
+	if (on_media_bay(pmif))
 		return 0;
 	
 	/* Kauai has bus control FCRs directly here */
@@ -889,7 +892,7 @@ static int pmac_ide_do_suspend(pmac_ide_hwif_t *pmif)
 static int pmac_ide_do_resume(pmac_ide_hwif_t *pmif)
 {
 	/* Hard reset & re-enable controller (do we really need to reset ? -BenH) */
-	if (!pmif->mediabay) {
+	if (!on_media_bay(pmif)) {
 		ppc_md.feature_call(PMAC_FTR_IDE_RESET, pmif->node, pmif->aapl_bus_id, 1);
 		ppc_md.feature_call(PMAC_FTR_IDE_ENABLE, pmif->node, pmif->aapl_bus_id, 1);
 		msleep(10);
@@ -950,13 +953,11 @@ static void pmac_ide_init_dev(ide_drive_t *drive)
 	pmac_ide_hwif_t *pmif =
 		(pmac_ide_hwif_t *)dev_get_drvdata(hwif->gendev.parent);
 
-	if (pmif->mediabay) {
-#ifdef CONFIG_PMAC_MEDIABAY
-		if (check_media_bay_by_base(pmif->regbase, MB_CD) == 0) {
+	if (on_media_bay(pmif)) {
+		if (check_media_bay(pmif->mdev->media_bay) == MB_CD) {
 			drive->dev_flags &= ~IDE_DFLAG_NOPROBE;
 			return;
 		}
-#endif
 		drive->dev_flags |= IDE_DFLAG_NOPROBE;
 	}
 }
@@ -1072,26 +1073,23 @@ static int __devinit pmac_ide_setup_device(pmac_ide_hwif_t *pmif,
 		writel(KAUAI_FCR_UATA_MAGIC |
 		       KAUAI_FCR_UATA_RESET_N |
 		       KAUAI_FCR_UATA_ENABLE, pmif->kauai_fcr);
-
-	pmif->mediabay = 0;
 	
 	/* Make sure we have sane timings */
 	sanitize_timings(pmif);
 
+	/* If we are on a media bay, wait for it to settle and lock it */
+	if (pmif->mdev)
+		lock_media_bay(pmif->mdev->media_bay);
+
 	host = ide_host_alloc(&d, hws, 1);
-	if (host == NULL)
-		return -ENOMEM;
-	hwif = host->ports[0];
+	if (host == NULL) {
+		rc = -ENOMEM;
+		goto bail;
+	}
+	hwif = pmif->hwif = host->ports[0];
 
-#ifndef CONFIG_PPC64
-	/* XXX FIXME: Media bay stuff need re-organizing */
-	if (np->parent && np->parent->name
-	    && strcasecmp(np->parent->name, "media-bay") == 0) {
-#ifdef CONFIG_PMAC_MEDIABAY
-		media_bay_set_ide_infos(np->parent, pmif->regbase, pmif->irq,
-					hwif);
-#endif /* CONFIG_PMAC_MEDIABAY */
-		pmif->mediabay = 1;
+	if (on_media_bay(pmif)) {
+		/* Fixup bus ID for media bay */
 		if (!bidp)
 			pmif->aapl_bus_id = 1;
 	} else if (pmif->kind == controller_ohare) {
@@ -1100,9 +1098,7 @@ static int __devinit pmac_ide_setup_device(pmac_ide_hwif_t *pmif,
 		 * units, I keep the old way
 		 */
 		ppc_md.feature_call(PMAC_FTR_IDE_ENABLE, np, 0, 1);
-	} else
-#endif
-	{
+	} else {
  		/* This is necessary to enable IDE when net-booting */
 		ppc_md.feature_call(PMAC_FTR_IDE_RESET, np, pmif->aapl_bus_id, 1);
 		ppc_md.feature_call(PMAC_FTR_IDE_ENABLE, np, pmif->aapl_bus_id, 1);
@@ -1112,17 +1108,21 @@ static int __devinit pmac_ide_setup_device(pmac_ide_hwif_t *pmif,
 	}
 
 	printk(KERN_INFO DRV_NAME ": Found Apple %s controller (%s), "
-			 "bus ID %d%s, irq %d\n", model_name[pmif->kind],
-			 pmif->mdev ? "macio" : "PCI", pmif->aapl_bus_id,
-			 pmif->mediabay ? " (mediabay)" : "", hw->irq);
+	       "bus ID %d%s, irq %d\n", model_name[pmif->kind],
+	       pmif->mdev ? "macio" : "PCI", pmif->aapl_bus_id,
+	       on_media_bay(pmif) ? " (mediabay)" : "", hw->irq);
 
 	rc = ide_host_register(host, &d, hws);
-	if (rc) {
-		ide_host_free(host);
-		return rc;
-	}
+	if (rc)
+		pmif->hwif = NULL;
 
-	return 0;
+	if (pmif->mdev)
+		unlock_media_bay(pmif->mdev->media_bay);
+
+ bail:
+	if (rc && host)
+		ide_host_free(host);
+	return rc;
 }
 
 static void __devinit pmac_ide_init_ports(struct ide_hw *hw, unsigned long base)
@@ -1362,6 +1362,25 @@ pmac_ide_pci_resume(struct pci_dev *pdev)
 	return rc;
 }
 
+#ifdef CONFIG_PMAC_MEDIABAY
+static void pmac_ide_macio_mb_event(struct macio_dev* mdev, int mb_state)
+{
+	pmac_ide_hwif_t *pmif =
+		(pmac_ide_hwif_t *)dev_get_drvdata(&mdev->ofdev.dev);
+
+	switch(mb_state) {
+	case MB_CD:
+		if (!pmif->hwif->present)
+			ide_port_scan(pmif->hwif);
+		break;
+	default:
+		if (pmif->hwif->present)
+			ide_port_unregister_devices(pmif->hwif);
+	}
+}
+#endif /* CONFIG_PMAC_MEDIABAY */
+
+
 static struct of_device_id pmac_ide_macio_match[] = 
 {
 	{
@@ -1386,6 +1405,9 @@ static struct macio_driver pmac_ide_macio_driver =
 	.probe		= pmac_ide_macio_attach,
 	.suspend	= pmac_ide_macio_suspend,
 	.resume		= pmac_ide_macio_resume,
+#ifdef CONFIG_PMAC_MEDIABAY
+	.mediabay_event	= pmac_ide_macio_mb_event,
+#endif
 };
 
 static const struct pci_device_id pmac_ide_pci_match[] = {
diff --git a/drivers/macintosh/mediabay.c b/drivers/macintosh/mediabay.c
index 029ad8ce8a7e..08002b88f342 100644
--- a/drivers/macintosh/mediabay.c
+++ b/drivers/macintosh/mediabay.c
@@ -33,15 +33,6 @@
 #include <linux/adb.h>
 #include <linux/pmu.h>
 
-
-#define MB_DEBUG
-
-#ifdef MB_DEBUG
-#define MBDBG(fmt, arg...)	printk(KERN_INFO fmt , ## arg)
-#else
-#define MBDBG(fmt, arg...)	do { } while (0)
-#endif
-
 #define MB_FCR32(bay, r)	((bay)->base + ((r) >> 2))
 #define MB_FCR8(bay, r)		(((volatile u8 __iomem *)((bay)->base)) + (r))
 
@@ -76,28 +67,14 @@ struct media_bay_info {
 	int				index;
 	int				cached_gpio;
 	int				sleeping;
+	int				user_lock;
 	struct mutex			lock;
-#ifdef CONFIG_BLK_DEV_IDE_PMAC
-	ide_hwif_t			*cd_port;
-	void __iomem			*cd_base;
-	int				cd_irq;
-	int				cd_retry;
-#endif
-#if defined(CONFIG_BLK_DEV_IDE_PMAC)
-	int 				cd_index;
-#endif
 };
 
 #define MAX_BAYS	2
 
 static struct media_bay_info media_bays[MAX_BAYS];
-int media_bay_count = 0;
-
-#ifdef CONFIG_BLK_DEV_IDE_PMAC
-/* check the busy bit in the media-bay ide interface
-   (assumes the media-bay contains an ide device) */
-#define MB_IDE_READY(i)	((readb(media_bays[i].cd_base + 0x70) & 0x80) == 0)
-#endif
+static int media_bay_count = 0;
 
 /*
  * Wait that number of ms between each step in normal polling mode
@@ -130,20 +107,10 @@ int media_bay_count = 0;
 
 /*
  * Wait this many ticks after an IDE device (e.g. CD-ROM) is inserted
- * (or until the device is ready) before waiting for busy bit to disappear
+ * (or until the device is ready) before calling into the driver
  */
 #define MB_IDE_WAIT	1000
 
-/*
- * Timeout waiting for busy bit of an IDE device to go down
- */
-#define MB_IDE_TIMEOUT	5000
-
-/*
- * Max retries of the full power up/down sequence for an IDE device
- */
-#define MAX_CD_RETRIES	3
-
 /*
  * States of a media bay
  */
@@ -153,7 +120,6 @@ enum {
 	mb_enabling_bay,	/* enable bits set, waiting MB_RESET_DELAY */
 	mb_resetting,		/* reset bit unset, waiting MB_SETUP_DELAY */
 	mb_ide_resetting,	/* IDE reset bit unser, waiting MB_IDE_WAIT */
-	mb_ide_waiting,		/* Waiting for BUSY bit to go away until MB_IDE_TIMEOUT */
 	mb_up,			/* Media bay full */
 	mb_powering_down	/* Powering down (avoid too fast down/up) */
 };
@@ -373,12 +339,12 @@ static inline void set_mb_power(struct media_bay_info* bay, int onoff)
 	if (onoff) {
 		bay->ops->power(bay, 1);
 		bay->state = mb_powering_up;
-		MBDBG("mediabay%d: powering up\n", bay->index);
+		pr_debug("mediabay%d: powering up\n", bay->index);
 	} else { 
 		/* Make sure everything is powered down & disabled */
 		bay->ops->power(bay, 0);
 		bay->state = mb_powering_down;
-		MBDBG("mediabay%d: powering down\n", bay->index);
+		pr_debug("mediabay%d: powering down\n", bay->index);
 	}
 	bay->timer = msecs_to_jiffies(MB_POWER_DELAY);
 }
@@ -387,107 +353,118 @@ static void poll_media_bay(struct media_bay_info* bay)
 {
 	int id = bay->ops->content(bay);
 
-	if (id == bay->last_value) {
-		if (id != bay->content_id) {
-			bay->value_count += msecs_to_jiffies(MB_POLL_DELAY);
-			if (bay->value_count >= msecs_to_jiffies(MB_STABLE_DELAY)) {
-				/* If the device type changes without going thru
-				 * "MB_NO", we force a pass by "MB_NO" to make sure
-				 * things are properly reset
-				 */
-				if ((id != MB_NO) && (bay->content_id != MB_NO)) {
-					id = MB_NO;
-					MBDBG("mediabay%d: forcing MB_NO\n", bay->index);
-				}
-				MBDBG("mediabay%d: switching to %d\n", bay->index, id);
-				set_mb_power(bay, id != MB_NO);
-				bay->content_id = id;
-				if (id == MB_NO) {
-#ifdef CONFIG_BLK_DEV_IDE_PMAC
-					bay->cd_retry = 0;
-#endif
-					printk(KERN_INFO "media bay %d is empty\n", bay->index);
-				}
-			}
-		}
-	} else {
+	static char *mb_content_types[] = {
+		"a floppy drive",
+		"a floppy drive",
+		"an unsuported audio device",
+		"an ATA device",
+		"an unsupported PCI device",
+		"an unknown device",
+	};
+
+	if (id != bay->last_value) {
 		bay->last_value = id;
 		bay->value_count = 0;
+		return;
+	}
+	if (id == bay->content_id)
+		return;
+
+	bay->value_count += msecs_to_jiffies(MB_POLL_DELAY);
+	if (bay->value_count >= msecs_to_jiffies(MB_STABLE_DELAY)) {
+		/* If the device type changes without going thru
+		 * "MB_NO", we force a pass by "MB_NO" to make sure
+		 * things are properly reset
+		 */
+		if ((id != MB_NO) && (bay->content_id != MB_NO)) {
+			id = MB_NO;
+			pr_debug("mediabay%d: forcing MB_NO\n", bay->index);
+		}
+		pr_debug("mediabay%d: switching to %d\n", bay->index, id);
+		set_mb_power(bay, id != MB_NO);
+		bay->content_id = id;
+		if (id >= MB_NO || id < 0)
+			printk(KERN_INFO "mediabay%d: Bay is now empty\n", bay->index);
+		else
+			printk(KERN_INFO "mediabay%d: Bay contains %s\n",
+			       bay->index, mb_content_types[id]);
 	}
 }
 
-#ifdef CONFIG_BLK_DEV_IDE_PMAC
-int check_media_bay(struct device_node *which_bay, int what)
+int check_media_bay(struct macio_dev *baydev)
 {
-	int	i;
+	struct media_bay_info* bay;
+	int id;
 
-	for (i=0; i<media_bay_count; i++)
-		if (media_bays[i].mdev && which_bay == media_bays[i].mdev->ofdev.node) {
-			if ((what == media_bays[i].content_id) && media_bays[i].state == mb_up)
-				return 0;
-			media_bays[i].cd_index = -1;
-			return -EINVAL;
-		}
-	return -ENODEV;
+	if (baydev == NULL)
+		return MB_NO;
+
+	/* This returns an instant snapshot, not locking, sine
+	 * we may be called with the bay lock held. The resulting
+	 * fuzzyness of the result if called at the wrong time is
+	 * not actually a huge deal
+	 */
+	bay = macio_get_drvdata(baydev);
+	if (bay == NULL)
+		return MB_NO;
+	id = bay->content_id;
+	if (bay->state != mb_up)
+		return MB_NO;
+	if (id == MB_FD1)
+		return MB_FD;
+	return id;
 }
-EXPORT_SYMBOL(check_media_bay);
+EXPORT_SYMBOL_GPL(check_media_bay);
 
-int check_media_bay_by_base(unsigned long base, int what)
+void lock_media_bay(struct macio_dev *baydev)
 {
-	int	i;
-
-	for (i=0; i<media_bay_count; i++)
-		if (media_bays[i].mdev && base == (unsigned long) media_bays[i].cd_base) {
-			if ((what == media_bays[i].content_id) && media_bays[i].state == mb_up)
-				return 0;
-			media_bays[i].cd_index = -1;
-			return -EINVAL;
-		} 
+	struct media_bay_info* bay;
 
-	return -ENODEV;
+	if (baydev == NULL)
+		return;
+	bay = macio_get_drvdata(baydev);
+	if (bay == NULL)
+		return;
+	mutex_lock(&bay->lock);
+	bay->user_lock = 1;
 }
-EXPORT_SYMBOL_GPL(check_media_bay_by_base);
+EXPORT_SYMBOL_GPL(lock_media_bay);
 
-int media_bay_set_ide_infos(struct device_node* which_bay, unsigned long base,
-			    int irq, ide_hwif_t *hwif)
+void unlock_media_bay(struct macio_dev *baydev)
 {
-	int	i;
+	struct media_bay_info* bay;
 
-	for (i=0; i<media_bay_count; i++) {
-		struct media_bay_info* bay = &media_bays[i];
-
-		if (bay->mdev && which_bay == bay->mdev->ofdev.node) {
-			int timeout = 5000, index = hwif->index;
-			
-			mutex_lock(&bay->lock);
-
-			bay->cd_port	= hwif;
- 			bay->cd_base	= (void __iomem *) base;
-			bay->cd_irq	= irq;
-
-			if ((MB_CD != bay->content_id) || bay->state != mb_up) {
-				mutex_unlock(&bay->lock);
-				return 0;
-			}
-			printk(KERN_DEBUG "Registered ide%d for media bay %d\n", index, i);
-			do {
-				if (MB_IDE_READY(i)) {
-					bay->cd_index	= index;
-					mutex_unlock(&bay->lock);
-					return 0;
-				}
-				mdelay(1);
-			} while(--timeout);
-			printk(KERN_DEBUG "Timeount waiting IDE in bay %d\n", i);
-			mutex_unlock(&bay->lock);
-			return -ENODEV;
-		}
+	if (baydev == NULL)
+		return;
+	bay = macio_get_drvdata(baydev);
+	if (bay == NULL)
+		return;
+	if (bay->user_lock) {
+		bay->user_lock = 0;
+		mutex_unlock(&bay->lock);
 	}
+}
+EXPORT_SYMBOL_GPL(unlock_media_bay);
 
-	return -ENODEV;
+static int mb_broadcast_hotplug(struct device *dev, void *data)
+{
+	struct media_bay_info* bay = data;
+	struct macio_dev *mdev;
+	struct macio_driver *drv;
+	int state;
+
+	if (dev->bus != &macio_bus_type)
+		return 0;
+
+	state = bay->state == mb_up ? bay->content_id : MB_NO;
+	if (state == MB_FD1)
+		state = MB_FD;
+	mdev = to_macio_device(dev);
+	drv = to_macio_driver(dev->driver);
+	if (dev->driver && drv->mediabay_event)
+		drv->mediabay_event(mdev, state);
+	return 0;
 }
-EXPORT_SYMBOL_GPL(media_bay_set_ide_infos);
-#endif /* CONFIG_BLK_DEV_IDE_PMAC */
 
 static void media_bay_step(int i)
 {
@@ -497,8 +474,8 @@ static void media_bay_step(int i)
 	if (bay->state != mb_powering_down)
 	    poll_media_bay(bay);
 
-	/* If timer expired or polling IDE busy, run state machine */
-	if ((bay->state != mb_ide_waiting) && (bay->timer != 0)) {
+	/* If timer expired run state machine */
+	if (bay->timer != 0) {
 		bay->timer -= msecs_to_jiffies(MB_POLL_DELAY);
 		if (bay->timer > 0)
 			return;
@@ -508,100 +485,50 @@ static void media_bay_step(int i)
 	switch(bay->state) {
 	case mb_powering_up:
 	    	if (bay->ops->setup_bus(bay, bay->last_value) < 0) {
-			MBDBG("mediabay%d: device not supported (kind:%d)\n", i, bay->content_id);
+			pr_debug("mediabay%d: device not supported (kind:%d)\n",
+				 i, bay->content_id);
 	    		set_mb_power(bay, 0);
 	    		break;
 	    	}
 	    	bay->timer = msecs_to_jiffies(MB_RESET_DELAY);
 	    	bay->state = mb_enabling_bay;
-		MBDBG("mediabay%d: enabling (kind:%d)\n", i, bay->content_id);
+		pr_debug("mediabay%d: enabling (kind:%d)\n", i, bay->content_id);
 		break;
 	case mb_enabling_bay:
 		bay->ops->un_reset(bay);
 	    	bay->timer = msecs_to_jiffies(MB_SETUP_DELAY);
 	    	bay->state = mb_resetting;
-		MBDBG("mediabay%d: waiting reset (kind:%d)\n", i, bay->content_id);
+		pr_debug("mediabay%d: releasing bay reset (kind:%d)\n",
+			 i, bay->content_id);
 	    	break;
 	case mb_resetting:
 		if (bay->content_id != MB_CD) {
-			MBDBG("mediabay%d: bay is up (kind:%d)\n", i, bay->content_id);
+			pr_debug("mediabay%d: bay is up (kind:%d)\n", i,
+				 bay->content_id);
 			bay->state = mb_up;
+			device_for_each_child(&bay->mdev->ofdev.dev,
+					      bay, mb_broadcast_hotplug);
 			break;
 	    	}
-#ifdef CONFIG_BLK_DEV_IDE_PMAC
-		MBDBG("mediabay%d: waiting IDE reset (kind:%d)\n", i, bay->content_id);
+		pr_debug("mediabay%d: releasing ATA reset (kind:%d)\n",
+			 i, bay->content_id);
 		bay->ops->un_reset_ide(bay);
 	    	bay->timer = msecs_to_jiffies(MB_IDE_WAIT);
 	    	bay->state = mb_ide_resetting;
-#else
-		printk(KERN_DEBUG "media-bay %d is ide (not compiled in kernel)\n", i);
-		set_mb_power(bay, 0);
-#endif /* CONFIG_BLK_DEV_IDE_PMAC */
 	    	break;
-#ifdef CONFIG_BLK_DEV_IDE_PMAC
+
 	case mb_ide_resetting:
-	    	bay->timer = msecs_to_jiffies(MB_IDE_TIMEOUT);
-	    	bay->state = mb_ide_waiting;
-		MBDBG("mediabay%d: waiting IDE ready (kind:%d)\n", i, bay->content_id);
+		pr_debug("mediabay%d: bay is up (kind:%d)\n", i, bay->content_id);
+		bay->state = mb_up;
+		device_for_each_child(&bay->mdev->ofdev.dev,
+				      bay, mb_broadcast_hotplug);
 	    	break;
-	case mb_ide_waiting:
-		if (bay->cd_base == NULL) {
-			bay->timer = 0;
-			bay->state = mb_up;
-			MBDBG("mediabay%d: up before IDE init\n", i);
-			break;
-		} else if (MB_IDE_READY(i)) {
-			bay->timer = 0;
-			bay->state = mb_up;
-			if (bay->cd_index < 0) {
-				printk("mediabay %d, registering IDE...\n", i);
-				pmu_suspend();
-				ide_port_scan(bay->cd_port);
-				if (bay->cd_port->present)
-					bay->cd_index = bay->cd_port->index;
-				pmu_resume();
-			}
-			if (bay->cd_index == -1) {
-				/* We eventually do a retry */
-				bay->cd_retry++;
-				printk("IDE register error\n");
-				set_mb_power(bay, 0);
-			} else {
-				printk(KERN_DEBUG "media-bay %d is ide%d\n", i, bay->cd_index);
-				MBDBG("mediabay %d IDE ready\n", i);
-			}
-			break;
-	    	} else if (bay->timer > 0)
-			bay->timer -= msecs_to_jiffies(MB_POLL_DELAY);
-	    	if (bay->timer <= 0) {
-			printk("\nIDE Timeout in bay %d !, IDE state is: 0x%02x\n",
-			       i, readb(bay->cd_base + 0x70));
-			MBDBG("mediabay%d: nIDE Timeout !\n", i);
-			set_mb_power(bay, 0);
-			bay->timer = 0;
-	    	}
-		break;
-#endif /* CONFIG_BLK_DEV_IDE_PMAC */
+
 	case mb_powering_down:
 	    	bay->state = mb_empty;
-#ifdef CONFIG_BLK_DEV_IDE_PMAC
-    	        if (bay->cd_index >= 0) {
-			printk(KERN_DEBUG "Unregistering mb %d ide, index:%d\n", i,
-			       bay->cd_index);
-			ide_port_unregister_devices(bay->cd_port);
-			bay->cd_index = -1;
-		}
-	    	if (bay->cd_retry) {
-			if (bay->cd_retry > MAX_CD_RETRIES) {
-				/* Should add an error sound (sort of beep in dmasound) */
-				printk("\nmedia-bay %d, IDE device badly inserted or unrecognised\n", i);
-			} else {
-				/* Force a new power down/up sequence */
-				bay->content_id = MB_NO;
-			}
-	    	}
-#endif /* CONFIG_BLK_DEV_IDE_PMAC */
-		MBDBG("mediabay%d: end of power down\n", i);
+		device_for_each_child(&bay->mdev->ofdev.dev,
+				      bay, mb_broadcast_hotplug);
+		pr_debug("mediabay%d: end of power down\n", i);
 	    	break;
 	}
 }
@@ -676,11 +603,6 @@ static int __devinit media_bay_attach(struct macio_dev *mdev, const struct of_de
 	bay->last_value = bay->ops->content(bay);
 	bay->value_count = msecs_to_jiffies(MB_STABLE_DELAY);
 	bay->state = mb_empty;
-	do {
-		msleep(MB_POLL_DELAY);
-		media_bay_step(i);
-	} while((bay->state != mb_empty) &&
-		(bay->state != mb_up));
 
 	/* Mark us ready by filling our mdev data */
 	macio_set_drvdata(mdev, bay);
@@ -725,7 +647,7 @@ static int media_bay_resume(struct macio_dev *mdev)
 	       	set_mb_power(bay, 0);
 		msleep(MB_POWER_DELAY);
 	       	if (bay->ops->content(bay) != bay->content_id) {
-			printk("mediabay%d: content changed during sleep...\n", bay->index);
+			printk("mediabay%d: Content changed during sleep...\n", bay->index);
 			mutex_unlock(&bay->lock);
 	       		return 0;
 		}
@@ -733,9 +655,6 @@ static int media_bay_resume(struct macio_dev *mdev)
 	       	bay->last_value = bay->content_id;
 	       	bay->value_count = msecs_to_jiffies(MB_STABLE_DELAY);
 	       	bay->timer = msecs_to_jiffies(MB_POWER_DELAY);
-#ifdef CONFIG_BLK_DEV_IDE_PMAC
-	       	bay->cd_retry = 0;
-#endif
 	       	do {
 			msleep(MB_POLL_DELAY);
 	       		media_bay_step(bay->index);
@@ -823,9 +742,6 @@ static int __init media_bay_init(void)
 	for (i=0; i<MAX_BAYS; i++) {
 		memset((char *)&media_bays[i], 0, sizeof(struct media_bay_info));
 		media_bays[i].content_id	= -1;
-#ifdef CONFIG_BLK_DEV_IDE_PMAC
-		media_bays[i].cd_index		= -1;
-#endif
 	}
 	if (!machine_is(powermac))
 		return 0;
-- 
cgit v1.2.3-59-g8ed1b


From ab519a011caa5ec47d992cb8a4fc8e7af9b9e3f8 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@austin.ibm.com>
Date: Tue, 24 Nov 2009 21:10:49 +0000
Subject: powerpc/pseries: Kernel DLPAR Infrastructure

The Dynamic Logical Partitioning capabilities of the powerpc pseries platform
allows for the addition and removal of resources (i.e. CPU's, memory, and PCI
devices) from a partition. The removal of a resource involves
removing the resource's node from the device tree and then returning the
resource to firmware via the rtas set-indicator call.  To add a resource, it
is first obtained from firmware via the rtas set-indicator call and then a
new device tree node is created using the ibm,configure-coinnector rtas call
and added to the device tree.

This patch provides the kernel DLPAR infrastructure in a new filed named
dlpar.c.  The functionality provided is for acquiring and releasing a resource
from firmware and the parsing of information returned from the
ibm,configure-connector rtas call.  Additionally this exports the pSeries
reconfiguration notifier chain so that it can be invoked when device tree
updates are made.

Signed-off-by: Nathan Fontenot <nfont@austin.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/pSeries_reconfig.h |   1 +
 arch/powerpc/platforms/pseries/Makefile     |   2 +-
 arch/powerpc/platforms/pseries/dlpar.c      | 344 ++++++++++++++++++++++++++++
 arch/powerpc/platforms/pseries/reconfig.c   |   2 +-
 4 files changed, 347 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/platforms/pseries/dlpar.c

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/pSeries_reconfig.h b/arch/powerpc/include/asm/pSeries_reconfig.h
index e482e5352e69..d4b4bfa26fb3 100644
--- a/arch/powerpc/include/asm/pSeries_reconfig.h
+++ b/arch/powerpc/include/asm/pSeries_reconfig.h
@@ -17,6 +17,7 @@
 #ifdef CONFIG_PPC_PSERIES
 extern int pSeries_reconfig_notifier_register(struct notifier_block *);
 extern void pSeries_reconfig_notifier_unregister(struct notifier_block *);
+extern struct blocking_notifier_head pSeries_reconfig_chain;
 #else /* !CONFIG_PPC_PSERIES */
 static inline int pSeries_reconfig_notifier_register(struct notifier_block *nb)
 {
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index 4b1c422b8145..0ff5174ae4f5 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -8,7 +8,7 @@ endif
 
 obj-y			:= lpar.o hvCall.o nvram.o reconfig.o \
 			   setup.o iommu.o ras.o \
-			   firmware.o power.o
+			   firmware.o power.o dlpar.o
 obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_XICS)	+= xics.o
 obj-$(CONFIG_SCANLOG)	+= scanlog.o
diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
new file mode 100644
index 000000000000..c80e8ef0eb58
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -0,0 +1,344 @@
+/*
+ * Support for dynamic reconfiguration for PCI, Memory, and CPU
+ * Hotplug and Dynamic Logical Partitioning on RPA platforms.
+ *
+ * Copyright (C) 2009 Nathan Fontenot
+ * Copyright (C) 2009 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kref.h>
+#include <linux/notifier.h>
+#include <linux/proc_fs.h>
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/uaccess.h>
+#include <asm/rtas.h>
+#include <asm/pSeries_reconfig.h>
+
+struct cc_workarea {
+	u32	drc_index;
+	u32	zero;
+	u32	name_offset;
+	u32	prop_length;
+	u32	prop_offset;
+};
+
+static void dlpar_free_cc_property(struct property *prop)
+{
+	kfree(prop->name);
+	kfree(prop->value);
+	kfree(prop);
+}
+
+static struct property *dlpar_parse_cc_property(struct cc_workarea *ccwa)
+{
+	struct property *prop;
+	char *name;
+	char *value;
+
+	prop = kzalloc(sizeof(*prop), GFP_KERNEL);
+	if (!prop)
+		return NULL;
+
+	name = (char *)ccwa + ccwa->name_offset;
+	prop->name = kstrdup(name, GFP_KERNEL);
+
+	prop->length = ccwa->prop_length;
+	value = (char *)ccwa + ccwa->prop_offset;
+	prop->value = kzalloc(prop->length, GFP_KERNEL);
+	if (!prop->value) {
+		dlpar_free_cc_property(prop);
+		return NULL;
+	}
+
+	memcpy(prop->value, value, prop->length);
+	return prop;
+}
+
+static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa)
+{
+	struct device_node *dn;
+	char *name;
+
+	dn = kzalloc(sizeof(*dn), GFP_KERNEL);
+	if (!dn)
+		return NULL;
+
+	/* The configure connector reported name does not contain a
+	 * preceeding '/', so we allocate a buffer large enough to
+	 * prepend this to the full_name.
+	 */
+	name = (char *)ccwa + ccwa->name_offset;
+	dn->full_name = kmalloc(strlen(name) + 2, GFP_KERNEL);
+	if (!dn->full_name) {
+		kfree(dn);
+		return NULL;
+	}
+
+	sprintf(dn->full_name, "/%s", name);
+	return dn;
+}
+
+static void dlpar_free_one_cc_node(struct device_node *dn)
+{
+	struct property *prop;
+
+	while (dn->properties) {
+		prop = dn->properties;
+		dn->properties = prop->next;
+		dlpar_free_cc_property(prop);
+	}
+
+	kfree(dn->full_name);
+	kfree(dn);
+}
+
+static void dlpar_free_cc_nodes(struct device_node *dn)
+{
+	if (dn->child)
+		dlpar_free_cc_nodes(dn->child);
+
+	if (dn->sibling)
+		dlpar_free_cc_nodes(dn->sibling);
+
+	dlpar_free_one_cc_node(dn);
+}
+
+#define NEXT_SIBLING    1
+#define NEXT_CHILD      2
+#define NEXT_PROPERTY   3
+#define PREV_PARENT     4
+#define MORE_MEMORY     5
+#define CALL_AGAIN	-2
+#define ERR_CFG_USE     -9003
+
+struct device_node *dlpar_configure_connector(u32 drc_index)
+{
+	struct device_node *dn;
+	struct device_node *first_dn = NULL;
+	struct device_node *last_dn = NULL;
+	struct property *property;
+	struct property *last_property = NULL;
+	struct cc_workarea *ccwa;
+	int cc_token;
+	int rc;
+
+	cc_token = rtas_token("ibm,configure-connector");
+	if (cc_token == RTAS_UNKNOWN_SERVICE)
+		return NULL;
+
+	spin_lock(&rtas_data_buf_lock);
+	ccwa = (struct cc_workarea *)&rtas_data_buf[0];
+	ccwa->drc_index = drc_index;
+	ccwa->zero = 0;
+
+	rc = rtas_call(cc_token, 2, 1, NULL, rtas_data_buf, NULL);
+	while (rc) {
+		switch (rc) {
+		case NEXT_SIBLING:
+			dn = dlpar_parse_cc_node(ccwa);
+			if (!dn)
+				goto cc_error;
+
+			dn->parent = last_dn->parent;
+			last_dn->sibling = dn;
+			last_dn = dn;
+			break;
+
+		case NEXT_CHILD:
+			dn = dlpar_parse_cc_node(ccwa);
+			if (!dn)
+				goto cc_error;
+
+			if (!first_dn)
+				first_dn = dn;
+			else {
+				dn->parent = last_dn;
+				if (last_dn)
+					last_dn->child = dn;
+			}
+
+			last_dn = dn;
+			break;
+
+		case NEXT_PROPERTY:
+			property = dlpar_parse_cc_property(ccwa);
+			if (!property)
+				goto cc_error;
+
+			if (!last_dn->properties)
+				last_dn->properties = property;
+			else
+				last_property->next = property;
+
+			last_property = property;
+			break;
+
+		case PREV_PARENT:
+			last_dn = last_dn->parent;
+			break;
+
+		case CALL_AGAIN:
+			break;
+
+		case MORE_MEMORY:
+		case ERR_CFG_USE:
+		default:
+			printk(KERN_ERR "Unexpected Error (%d) "
+			       "returned from configure-connector\n", rc);
+			goto cc_error;
+		}
+
+		rc = rtas_call(cc_token, 2, 1, NULL, rtas_data_buf, NULL);
+	}
+
+	spin_unlock(&rtas_data_buf_lock);
+	return first_dn;
+
+cc_error:
+	if (first_dn)
+		dlpar_free_cc_nodes(first_dn);
+	spin_unlock(&rtas_data_buf_lock);
+	return NULL;
+}
+
+static struct device_node *derive_parent(const char *path)
+{
+	struct device_node *parent;
+	char *last_slash;
+
+	last_slash = strrchr(path, '/');
+	if (last_slash == path) {
+		parent = of_find_node_by_path("/");
+	} else {
+		char *parent_path;
+		int parent_path_len = last_slash - path + 1;
+		parent_path = kmalloc(parent_path_len, GFP_KERNEL);
+		if (!parent_path)
+			return NULL;
+
+		strlcpy(parent_path, path, parent_path_len);
+		parent = of_find_node_by_path(parent_path);
+		kfree(parent_path);
+	}
+
+	return parent;
+}
+
+int dlpar_attach_node(struct device_node *dn)
+{
+	struct proc_dir_entry *ent;
+	int rc;
+
+	of_node_set_flag(dn, OF_DYNAMIC);
+	kref_init(&dn->kref);
+	dn->parent = derive_parent(dn->full_name);
+	if (!dn->parent)
+		return -ENOMEM;
+
+	rc = blocking_notifier_call_chain(&pSeries_reconfig_chain,
+					  PSERIES_RECONFIG_ADD, dn);
+	if (rc == NOTIFY_BAD) {
+		printk(KERN_ERR "Failed to add device node %s\n",
+		       dn->full_name);
+		return -ENOMEM; /* For now, safe to assume kmalloc failure */
+	}
+
+	of_attach_node(dn);
+
+#ifdef CONFIG_PROC_DEVICETREE
+	ent = proc_mkdir(strrchr(dn->full_name, '/') + 1, dn->parent->pde);
+	if (ent)
+		proc_device_tree_add_node(dn, ent);
+#endif
+
+	of_node_put(dn->parent);
+	return 0;
+}
+
+int dlpar_detach_node(struct device_node *dn)
+{
+	struct device_node *parent = dn->parent;
+	struct property *prop = dn->properties;
+
+#ifdef CONFIG_PROC_DEVICETREE
+	while (prop) {
+		remove_proc_entry(prop->name, dn->pde);
+		prop = prop->next;
+	}
+
+	if (dn->pde)
+		remove_proc_entry(dn->pde->name, parent->pde);
+#endif
+
+	blocking_notifier_call_chain(&pSeries_reconfig_chain,
+			    PSERIES_RECONFIG_REMOVE, dn);
+	of_detach_node(dn);
+	of_node_put(dn); /* Must decrement the refcount */
+
+	return 0;
+}
+
+#define DR_ENTITY_SENSE		9003
+#define DR_ENTITY_PRESENT	1
+#define DR_ENTITY_UNUSABLE	2
+#define ALLOCATION_STATE	9003
+#define ALLOC_UNUSABLE		0
+#define ALLOC_USABLE		1
+#define ISOLATION_STATE		9001
+#define ISOLATE			0
+#define UNISOLATE		1
+
+int dlpar_acquire_drc(u32 drc_index)
+{
+	int dr_status, rc;
+
+	rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
+		       DR_ENTITY_SENSE, drc_index);
+	if (rc || dr_status != DR_ENTITY_UNUSABLE)
+		return -1;
+
+	rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_USABLE);
+	if (rc)
+		return rc;
+
+	rc = rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
+	if (rc) {
+		rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE);
+		return rc;
+	}
+
+	return 0;
+}
+
+int dlpar_release_drc(u32 drc_index)
+{
+	int dr_status, rc;
+
+	rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
+		       DR_ENTITY_SENSE, drc_index);
+	if (rc || dr_status != DR_ENTITY_PRESENT)
+		return -1;
+
+	rc = rtas_set_indicator(ISOLATION_STATE, drc_index, ISOLATE);
+	if (rc)
+		return rc;
+
+	rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE);
+	if (rc) {
+		rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
+		return rc;
+	}
+
+	return 0;
+}
+
+
diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c
index 5182d2b992c6..a2305d29bbbd 100644
--- a/arch/powerpc/platforms/pseries/reconfig.c
+++ b/arch/powerpc/platforms/pseries/reconfig.c
@@ -96,7 +96,7 @@ static struct device_node *derive_parent(const char *path)
 	return parent;
 }
 
-static BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain);
+BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain);
 
 int pSeries_reconfig_notifier_register(struct notifier_block *nb)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 12633e803a2a556f6469e0933d08233d0844a2d9 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@austin.ibm.com>
Date: Wed, 25 Nov 2009 17:23:25 +0000
Subject: sysfs/cpu: Add probe/release files

Version 3 of this patch is updated with documentation added to
Documentation/ABI.  There are no changes to any of the C code from v2
of the patch.

In order to support kernel DLPAR of CPU resources we need to provide an
interface to add (probe) and remove (release) the resource from the system.
This patch Creates new generic probe and release sysfs files to facilitate
cpu probe/release.  The probe/release interface provides for allowing each
arch to supply their own routines for implementing the backend of adding
and removing cpus to/from the system.

This also creates the powerpc specific stubs to handle the arch callouts
from writes to the sysfs files.

The creation and use of these files is regulated by the
CONFIG_ARCH_CPU_PROBE_RELEASE option so that only architectures that need the
capability will have the files created.

Signed-off-by: Nathan Fontenot <nfont@austin.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 Documentation/ABI/testing/sysfs-devices-system-cpu | 15 ++++++++++
 arch/powerpc/Kconfig                               |  4 +++
 arch/powerpc/include/asm/machdep.h                 |  5 ++++
 arch/powerpc/kernel/sysfs.c                        | 19 +++++++++++++
 drivers/base/cpu.c                                 | 32 ++++++++++++++++++++++
 include/linux/cpu.h                                |  2 ++
 6 files changed, 77 insertions(+)

(limited to 'arch/powerpc/include')

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index a703b9e9aeb9..d868a11c94a5 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -62,6 +62,21 @@ Description:	CPU topology files that describe kernel limits related to
 		See Documentation/cputopology.txt for more information.
 
 
+What:		/sys/devices/system/cpu/probe
+		/sys/devices/system/cpu/release
+Date:		November 2009
+Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description:	Dynamic addition and removal of CPU's.  This is not hotplug
+		removal, this is meant complete removal/addition of the CPU
+		from the system.
+
+		probe: writes to this file will dynamically add a CPU to the
+		system.  Information written to the file to add CPU's is
+		architecture specific.
+
+		release: writes to this file dynamically remove a CPU from
+		the system.  Information writtento the file to remove CPU's
+		is architecture specific.
 
 What:		/sys/devices/system/cpu/cpu#/node
 Date:		October 2009
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5dbd375a3f2a..0df57466e783 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -320,6 +320,10 @@ config HOTPLUG_CPU
 
 	  Say N if you are unsure.
 
+config ARCH_CPU_PROBE_RELEASE
+	def_bool y
+	depends on HOTPLUG_CPU
+
 config ARCH_ENABLE_MEMORY_HOTPLUG
 	def_bool y
 
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 9efa2be78331..9f0fc9e6ce0d 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -266,6 +266,11 @@ struct machdep_calls {
 	void (*suspend_disable_irqs)(void);
 	void (*suspend_enable_irqs)(void);
 #endif
+
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+	ssize_t (*cpu_probe)(const char *, size_t);
+	ssize_t (*cpu_release)(const char *, size_t);
+#endif
 };
 
 extern void e500_idle(void);
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 956ab33fd73f..e235e52dc4fe 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -461,6 +461,25 @@ static void unregister_cpu_online(unsigned int cpu)
 
 	cacheinfo_cpu_offline(cpu);
 }
+
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+ssize_t arch_cpu_probe(const char *buf, size_t count)
+{
+	if (ppc_md.cpu_probe)
+		return ppc_md.cpu_probe(buf, count);
+
+	return -EINVAL;
+}
+
+ssize_t arch_cpu_release(const char *buf, size_t count)
+{
+	if (ppc_md.cpu_release)
+		return ppc_md.cpu_release(buf, count);
+
+	return -EINVAL;
+}
+#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 static int __cpuinit sysfs_cpu_notify(struct notifier_block *self,
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index e62a4ccea54d..7c03af7b84a9 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -72,6 +72,38 @@ void unregister_cpu(struct cpu *cpu)
 	per_cpu(cpu_sys_devices, logical_cpu) = NULL;
 	return;
 }
+
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+static ssize_t cpu_probe_store(struct class *class, const char *buf,
+			       size_t count)
+{
+	return arch_cpu_probe(buf, count);
+}
+
+static ssize_t cpu_release_store(struct class *class, const char *buf,
+				 size_t count)
+{
+	return arch_cpu_release(buf, count);
+}
+
+static CLASS_ATTR(probe, S_IWUSR, NULL, cpu_probe_store);
+static CLASS_ATTR(release, S_IWUSR, NULL, cpu_release_store);
+
+int __init cpu_probe_release_init(void)
+{
+	int rc;
+
+	rc = sysfs_create_file(&cpu_sysdev_class.kset.kobj,
+			       &class_attr_probe.attr);
+	if (!rc)
+		rc = sysfs_create_file(&cpu_sysdev_class.kset.kobj,
+				       &class_attr_release.attr);
+
+	return rc;
+}
+device_initcall(cpu_probe_release_init);
+#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
+
 #else /* ... !CONFIG_HOTPLUG_CPU */
 static inline void register_cpu_control(struct cpu *cpu)
 {
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 47536197ffdd..c972f7ccb7d3 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -43,6 +43,8 @@ extern int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls);
 
 #ifdef CONFIG_HOTPLUG_CPU
 extern void unregister_cpu(struct cpu *cpu);
+extern ssize_t arch_cpu_probe(const char *, size_t);
+extern ssize_t arch_cpu_release(const char *, size_t);
 #endif
 struct notifier_block;
 
-- 
cgit v1.2.3-59-g8ed1b


From fe11dc3f9628e5393e932567b7e29d35cbbad136 Mon Sep 17 00:00:00 2001
From: Joakim Tjernlund <joakim.tjernlund@transmode.se>
Date: Fri, 20 Nov 2009 00:21:03 +0000
Subject: powerpc/8xx: Update TLB asm so it behaves as linux mm expects.

Update the TLB asm to make proper use of _PAGE_DIRY and _PAGE_ACCESSED.
Get rid of _PAGE_HWWRITE too.
Pros:
 - I/D TLB Miss never needs to write to the linux pte.
 - _PAGE_ACCESSED is only set on TLB Error fixing accounting
 - _PAGE_DIRTY is mapped to 0x100, the changed bit, and is set directly
    when a page has been made dirty.
 - Proper RO/RW mapping of user space.
 - Free up 2 SW TLB bits in the linux pte(add back _PAGE_WRITETHRU ?)
 - kernel RO/user NA support.
Cons:
 - A few more instructions in the TLB Miss routines.

Signed-off-by: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/pte-8xx.h | 13 +++--
 arch/powerpc/kernel/head_8xx.S     | 99 ++++++++++++++++++++------------------
 2 files changed, 57 insertions(+), 55 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/pte-8xx.h b/arch/powerpc/include/asm/pte-8xx.h
index dd5ea95fe61e..68ba861331ee 100644
--- a/arch/powerpc/include/asm/pte-8xx.h
+++ b/arch/powerpc/include/asm/pte-8xx.h
@@ -33,21 +33,20 @@
 #define _PAGE_NO_CACHE	0x0002	/* I: cache inhibit */
 #define _PAGE_SHARED	0x0004	/* No ASID (context) compare */
 #define _PAGE_SPECIAL	0x0008	/* SW entry, forced to 0 by the TLB miss */
+#define _PAGE_DIRTY	0x0100	/* C: page changed */
 
-/* These five software bits must be masked out when the entry is loaded
- * into the TLB.
+/* These 3 software bits must be masked out when the entry is loaded
+ * into the TLB, 2 SW bits left.
  */
 #define _PAGE_GUARDED	0x0010	/* software: guarded access */
-#define _PAGE_DIRTY	0x0020	/* software: page changed */
-#define _PAGE_RW	0x0040	/* software: user write access allowed */
-#define _PAGE_ACCESSED	0x0080	/* software: page referenced */
+#define _PAGE_ACCESSED	0x0020	/* software: page referenced */
 
 /* Setting any bits in the nibble with the follow two controls will
  * require a TLB exception handler change.  It is assumed unused bits
  * are always zero.
  */
-#define _PAGE_HWWRITE	0x0100	/* h/w write enable: never set in Linux PTE */
-#define _PAGE_USER	0x0800	/* One of the PP bits, the other is USER&~RW */
+#define _PAGE_RW	0x0400	/* lsb PP bits, inverted in HW */
+#define _PAGE_USER	0x0800	/* msb PP bits */
 
 #define _PMD_PRESENT	0x0001
 #define _PMD_BAD	0x0ff0
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 6ded19d01891..97bd523a0278 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -333,26 +333,20 @@ InstructionTLBMiss:
 	mfspr	r11, SPRN_MD_TWC	/* ....and get the pte address */
 	lwz	r10, 0(r11)	/* Get the pte */
 
-#ifdef CONFIG_SWAP
-	/* do not set the _PAGE_ACCESSED bit of a non-present page */
-	andi.	r11, r10, _PAGE_PRESENT
-	beq	4f
-	ori	r10, r10, _PAGE_ACCESSED
-	mfspr	r11, SPRN_MD_TWC	/* get the pte address again */
-	stw	r10, 0(r11)
-4:
-#else
-	ori	r10, r10, _PAGE_ACCESSED
-	stw	r10, 0(r11)
-#endif
+	andi.	r11, r10, _PAGE_ACCESSED | _PAGE_PRESENT
+	cmpwi	cr0, r11, _PAGE_ACCESSED | _PAGE_PRESENT
+	bne-	cr0, 2f
+
+	/* Clear PP lsb, 0x400 */
+	rlwinm 	r10, r10, 0, 22, 20
 
 	/* The Linux PTE won't go exactly into the MMU TLB.
-	 * Software indicator bits 21, 22 and 28 must be clear.
+	 * Software indicator bits 22 and 28 must be clear.
 	 * Software indicator bits 24, 25, 26, and 27 must be
 	 * set.  All other Linux PTE bits control the behavior
 	 * of the MMU.
 	 */
-2:	li	r11, 0x00f0
+	li	r11, 0x00f0
 	rlwimi	r10, r11, 0, 24, 28	/* Set 24-27, clear 28 */
 	DO_8xx_CPU6(0x2d80, r3)
 	mtspr	SPRN_MI_RPN, r10	/* Update TLB entry */
@@ -365,6 +359,22 @@ InstructionTLBMiss:
 	lwz	r3, 8(r0)
 #endif
 	rfi
+2:
+	mfspr	r11, SPRN_SRR1
+	/* clear all error bits as TLB Miss
+	 * sets a few unconditionally
+	*/
+	rlwinm	r11, r11, 0, 0xffff
+	mtspr	SPRN_SRR1, r11
+
+	mfspr	r10, SPRN_M_TW	/* Restore registers */
+	lwz	r11, 0(r0)
+	mtcr	r11
+	lwz	r11, 4(r0)
+#ifdef CONFIG_8xx_CPU6
+	lwz	r3, 8(r0)
+#endif
+	b	InstructionAccess
 
 	. = 0x1200
 DataStoreTLBMiss:
@@ -409,21 +419,27 @@ DataStoreTLBMiss:
 	DO_8xx_CPU6(0x3b80, r3)
 	mtspr	SPRN_MD_TWC, r11
 
-#ifdef CONFIG_SWAP
-	/* do not set the _PAGE_ACCESSED bit of a non-present page */
-	andi.	r11, r10, _PAGE_PRESENT
-	beq	4f
-	ori	r10, r10, _PAGE_ACCESSED
-4:
-	/* and update pte in table */
-#else
-	ori	r10, r10, _PAGE_ACCESSED
-#endif
-	mfspr	r11, SPRN_MD_TWC	/* get the pte address again */
-	stw	r10, 0(r11)
+	/* Both _PAGE_ACCESSED and _PAGE_PRESENT has to be set.
+	 * We also need to know if the insn is a load/store, so:
+	 * Clear _PAGE_PRESENT and load that which will
+	 * trap into DTLB Error with store bit set accordinly.
+	 */
+	/* PRESENT=0x1, ACCESSED=0x20
+	 * r11 = ((r10 & PRESENT) & ((r10 & ACCESSED) >> 5));
+	 * r10 = (r10 & ~PRESENT) | r11;
+	 */
+	rlwinm	r11, r10, 32-5, 31, 31
+	and	r11, r11, r10
+	rlwimi	r10, r11, 0, 31, 31
+
+	/* Honour kernel RO, User NA */
+	andi.	r11, r10, _PAGE_USER | _PAGE_RW
+	bne-	cr0, 5f
+	ori	r10,r10, 0x200 /* Extended encoding, bit 22 */
+5:	xori	r10, r10, _PAGE_RW  /* invert RW bit */
 
 	/* The Linux PTE won't go exactly into the MMU TLB.
-	 * Software indicator bits 21, 22 and 28 must be clear.
+	 * Software indicator bits 22 and 28 must be clear.
 	 * Software indicator bits 24, 25, 26, and 27 must be
 	 * set.  All other Linux PTE bits control the behavior
 	 * of the MMU.
@@ -469,11 +485,12 @@ DataTLBError:
 	stw	r10, 0(r0)
 	stw	r11, 4(r0)
 
-	/* First, make sure this was a store operation.
+	mfspr	r11, SPRN_DSISR
+	andis.	r11, r11, 0x4800	/* !translation or protection */
+	bne	2f	/* branch if either is set */
+	/* Only Change bit left now, do it here as it is faster
+	 * than trapping to the C fault handler.
 	*/
-	mfspr	r10, SPRN_DSISR
-	andis.	r11, r10, 0x0200	/* If set, indicates store op */
-	beq	2f
 
 	/* The EA of a data TLB miss is automatically stored in the MD_EPN
 	 * register.  The EA of a data TLB error is automatically stored in
@@ -522,26 +539,12 @@ DataTLBError:
 	mfspr	r11, SPRN_MD_TWC		/* ....and get the pte address */
 	lwz	r10, 0(r11)		/* Get the pte */
 
-	andi.	r11, r10, _PAGE_RW	/* Is it writeable? */
-	beq	2f			/* Bail out if not */
-
-	/* Update 'changed', among others.
-	*/
-#ifdef CONFIG_SWAP
-	ori	r10, r10, _PAGE_DIRTY|_PAGE_HWWRITE
-	/* do not set the _PAGE_ACCESSED bit of a non-present page */
-	andi.	r11, r10, _PAGE_PRESENT
-	beq	4f
-	ori	r10, r10, _PAGE_ACCESSED
-4:
-#else
-	ori	r10, r10, _PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_HWWRITE
-#endif
-	mfspr	r11, SPRN_MD_TWC		/* Get pte address again */
+	ori	r10, r10, _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_HWWRITE
 	stw	r10, 0(r11)		/* and update pte in table */
+	xori	r10, r10, _PAGE_RW	/* RW bit is inverted */
 
 	/* The Linux PTE won't go exactly into the MMU TLB.
-	 * Software indicator bits 21, 22 and 28 must be clear.
+	 * Software indicator bits 22 and 28 must be clear.
 	 * Software indicator bits 24, 25, 26, and 27 must be
 	 * set.  All other Linux PTE bits control the behavior
 	 * of the MMU.
-- 
cgit v1.2.3-59-g8ed1b


From 0c4661698c58db2a9efc44f403b893bd4d98f348 Mon Sep 17 00:00:00 2001
From: Joakim Tjernlund <joakim.tjernlund@transmode.se>
Date: Fri, 20 Nov 2009 00:21:08 +0000
Subject: powerpc/8xx: Restore _PAGE_WRITETHRU

8xx has not had WRITETHRU due to lack of bits in the pte.
After the recent rewrite of the 8xx TLB code, there are
two bits left. Use one of them to WRITETHRU.

Perhaps use the last SW bit to PAGE_SPECIAL or PAGE_FILE?

Signed-off-by: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/pte-8xx.h | 5 +++--
 arch/powerpc/kernel/head_8xx.S     | 8 ++++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/pte-8xx.h b/arch/powerpc/include/asm/pte-8xx.h
index 68ba861331ee..d44826e4ff97 100644
--- a/arch/powerpc/include/asm/pte-8xx.h
+++ b/arch/powerpc/include/asm/pte-8xx.h
@@ -35,11 +35,12 @@
 #define _PAGE_SPECIAL	0x0008	/* SW entry, forced to 0 by the TLB miss */
 #define _PAGE_DIRTY	0x0100	/* C: page changed */
 
-/* These 3 software bits must be masked out when the entry is loaded
- * into the TLB, 2 SW bits left.
+/* These 4 software bits must be masked out when the entry is loaded
+ * into the TLB, 1 SW bit left(0x0080).
  */
 #define _PAGE_GUARDED	0x0010	/* software: guarded access */
 #define _PAGE_ACCESSED	0x0020	/* software: page referenced */
+#define _PAGE_WRITETHRU	0x0040	/* software: caching is write through */
 
 /* Setting any bits in the nibble with the follow two controls will
  * require a TLB exception handler change.  It is assumed unused bits
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 1a28ee8ca318..c4ae85b8f8c0 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -422,6 +422,10 @@ DataStoreTLBMiss:
 	 * above.
 	 */
 	rlwimi	r11, r10, 0, 27, 27
+	/* Insert the WriteThru flag into the TWC from the Linux PTE.
+	 * It is bit 25 in the Linux PTE and bit 30 in the TWC
+	 */
+	rlwimi	r11, r10, 32-5, 30, 30
 	DO_8xx_CPU6(0x3b80, r3)
 	mtspr	SPRN_MD_TWC, r11
 
@@ -559,6 +563,10 @@ DARFixed:/* Return from dcbx instruction bug workaround, r10 holds value of DAR
 	 * It is bit 27 of both the Linux PTE and the TWC
 	 */
 	rlwimi	r11, r10, 0, 27, 27
+	/* Insert the WriteThru flag into the TWC from the Linux PTE.
+	 * It is bit 25 in the Linux PTE and bit 30 in the TWC
+	 */
+	rlwimi	r11, r10, 32-5, 30, 30
 	DO_8xx_CPU6(0x3b80, r3)
 	mtspr	SPRN_MD_TWC, r11
 	mfspr	r11, SPRN_MD_TWC	/* get the pte address again */
-- 
cgit v1.2.3-59-g8ed1b


From 40d50cf7ca956183f3a573bc21082e1c7d04fa7b Mon Sep 17 00:00:00 2001
From: Roman Fietze <roman.fietze@telemotive.de>
Date: Tue, 8 Dec 2009 02:39:50 +0000
Subject: powerpc: Make "intspec" pointers in irq_host->xlate() const

Writing a driver using SCLPC on the MPC5200B I detected, that the
intspec arrays to map irqs to Linux virq cannot be const, because the
mapping and xlate functions only take non const pointers. All those
functions do not modify the intspec, so a const pointer could be used.

Signed-off-by: Roman Fietze <roman.fietze@telemotive.de>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/irq.h                  | 4 ++--
 arch/powerpc/kernel/irq.c                       | 2 +-
 arch/powerpc/platforms/52xx/media5200.c         | 2 +-
 arch/powerpc/platforms/52xx/mpc52xx_gpt.c       | 2 +-
 arch/powerpc/platforms/52xx/mpc52xx_pic.c       | 2 +-
 arch/powerpc/platforms/85xx/socrates_fpga_pic.c | 2 +-
 arch/powerpc/platforms/86xx/gef_pic.c           | 2 +-
 arch/powerpc/platforms/cell/beat_interrupt.c    | 4 ++--
 arch/powerpc/platforms/cell/interrupt.c         | 2 +-
 arch/powerpc/platforms/cell/spider-pic.c        | 2 +-
 arch/powerpc/platforms/powermac/pic.c           | 2 +-
 arch/powerpc/platforms/pseries/xics.c           | 2 +-
 arch/powerpc/sysdev/cpm2_pic.c                  | 2 +-
 arch/powerpc/sysdev/i8259.c                     | 2 +-
 arch/powerpc/sysdev/ipic.c                      | 2 +-
 arch/powerpc/sysdev/mpc8xx_pic.c                | 2 +-
 arch/powerpc/sysdev/mpic.c                      | 2 +-
 arch/powerpc/sysdev/qe_lib/qe_ic.c              | 2 +-
 arch/powerpc/sysdev/tsi108_pci.c                | 2 +-
 arch/powerpc/sysdev/uic.c                       | 2 +-
 arch/powerpc/sysdev/xilinx_intc.c               | 2 +-
 21 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h
index c85a32f1a17f..e054baef1845 100644
--- a/arch/powerpc/include/asm/irq.h
+++ b/arch/powerpc/include/asm/irq.h
@@ -100,7 +100,7 @@ struct irq_host_ops {
 	 * interrupt controller has for that line)
 	 */
 	int (*xlate)(struct irq_host *h, struct device_node *ctrler,
-		     u32 *intspec, unsigned int intsize,
+		     const u32 *intspec, unsigned int intsize,
 		     irq_hw_number_t *out_hwirq, unsigned int *out_type);
 };
 
@@ -314,7 +314,7 @@ extern void irq_free_virt(unsigned int virq, unsigned int count);
  * of the of_irq_map_*() functions.
  */
 extern unsigned int irq_create_of_mapping(struct device_node *controller,
-					  u32 *intspec, unsigned int intsize);
+					  const u32 *intspec, unsigned int intsize);
 
 /**
  * irq_of_parse_and_map - Parse and Map an interrupt into linux virq space
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index a31176ace02b..042a53009701 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -725,7 +725,7 @@ unsigned int irq_create_mapping(struct irq_host *host,
 EXPORT_SYMBOL_GPL(irq_create_mapping);
 
 unsigned int irq_create_of_mapping(struct device_node *controller,
-				   u32 *intspec, unsigned int intsize)
+				   const u32 *intspec, unsigned int intsize)
 {
 	struct irq_host *host;
 	irq_hw_number_t hwirq;
diff --git a/arch/powerpc/platforms/52xx/media5200.c b/arch/powerpc/platforms/52xx/media5200.c
index 85001a4cbdff..cc0c854291d7 100644
--- a/arch/powerpc/platforms/52xx/media5200.c
+++ b/arch/powerpc/platforms/52xx/media5200.c
@@ -127,7 +127,7 @@ static int media5200_irq_map(struct irq_host *h, unsigned int virq,
 }
 
 static int media5200_irq_xlate(struct irq_host *h, struct device_node *ct,
-				 u32 *intspec, unsigned int intsize,
+				 const u32 *intspec, unsigned int intsize,
 				 irq_hw_number_t *out_hwirq,
 				 unsigned int *out_flags)
 {
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
index 17ecdf4c87ae..6f8ebe1085b3 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
@@ -214,7 +214,7 @@ static int mpc52xx_gpt_irq_map(struct irq_host *h, unsigned int virq,
 }
 
 static int mpc52xx_gpt_irq_xlate(struct irq_host *h, struct device_node *ct,
-				 u32 *intspec, unsigned int intsize,
+				 const u32 *intspec, unsigned int intsize,
 				 irq_hw_number_t *out_hwirq,
 				 unsigned int *out_flags)
 {
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pic.c b/arch/powerpc/platforms/52xx/mpc52xx_pic.c
index a3122d163b6a..4bf4bf7b063e 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_pic.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_pic.c
@@ -355,7 +355,7 @@ static int mpc52xx_is_extirq(int l1, int l2)
  * mpc52xx_irqhost_xlate - translate virq# from device tree interrupts property
  */
 static int mpc52xx_irqhost_xlate(struct irq_host *h, struct device_node *ct,
-				 u32 *intspec, unsigned int intsize,
+				 const u32 *intspec, unsigned int intsize,
 				 irq_hw_number_t *out_hwirq,
 				 unsigned int *out_flags)
 {
diff --git a/arch/powerpc/platforms/85xx/socrates_fpga_pic.c b/arch/powerpc/platforms/85xx/socrates_fpga_pic.c
index 37a2e5f60af9..e5da5f62b24a 100644
--- a/arch/powerpc/platforms/85xx/socrates_fpga_pic.c
+++ b/arch/powerpc/platforms/85xx/socrates_fpga_pic.c
@@ -253,7 +253,7 @@ static int socrates_fpga_pic_host_map(struct irq_host *h, unsigned int virq,
 }
 
 static int socrates_fpga_pic_host_xlate(struct irq_host *h,
-		struct device_node *ct,	u32 *intspec, unsigned int intsize,
+		struct device_node *ct,	const u32 *intspec, unsigned int intsize,
 		irq_hw_number_t *out_hwirq, unsigned int *out_flags)
 {
 	struct socrates_fpga_irq_info *fpga_irq = &fpga_irqs[intspec[0]];
diff --git a/arch/powerpc/platforms/86xx/gef_pic.c b/arch/powerpc/platforms/86xx/gef_pic.c
index e1d5d36995df..0110a8736d33 100644
--- a/arch/powerpc/platforms/86xx/gef_pic.c
+++ b/arch/powerpc/platforms/86xx/gef_pic.c
@@ -170,7 +170,7 @@ static int gef_pic_host_map(struct irq_host *h, unsigned int virq,
 }
 
 static int gef_pic_host_xlate(struct irq_host *h, struct device_node *ct,
-			    u32 *intspec, unsigned int intsize,
+			    const u32 *intspec, unsigned int intsize,
 			    irq_hw_number_t *out_hwirq, unsigned int *out_flags)
 {
 
diff --git a/arch/powerpc/platforms/cell/beat_interrupt.c b/arch/powerpc/platforms/cell/beat_interrupt.c
index c3479a47d45a..36052a9ebcda 100644
--- a/arch/powerpc/platforms/cell/beat_interrupt.c
+++ b/arch/powerpc/platforms/cell/beat_interrupt.c
@@ -166,11 +166,11 @@ static void beatic_pic_host_remap(struct irq_host *h, unsigned int virq,
  * Note: We have only 1 entry to translate.
  */
 static int beatic_pic_host_xlate(struct irq_host *h, struct device_node *ct,
-				 u32 *intspec, unsigned int intsize,
+				 const u32 *intspec, unsigned int intsize,
 				 irq_hw_number_t *out_hwirq,
 				 unsigned int *out_flags)
 {
-	u64 *intspec2 = (u64 *)intspec;
+	const u64 *intspec2 = (const u64 *)intspec;
 
 	*out_hwirq = *intspec2;
 	*out_flags |= IRQ_TYPE_LEVEL_LOW;
diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c
index 3b67afba3f9b..f9dbf76a763f 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -297,7 +297,7 @@ static int iic_host_map(struct irq_host *h, unsigned int virq,
 }
 
 static int iic_host_xlate(struct irq_host *h, struct device_node *ct,
-			   u32 *intspec, unsigned int intsize,
+			   const u32 *intspec, unsigned int intsize,
 			   irq_hw_number_t *out_hwirq, unsigned int *out_flags)
 
 {
diff --git a/arch/powerpc/platforms/cell/spider-pic.c b/arch/powerpc/platforms/cell/spider-pic.c
index 167dedaada76..01244f254a11 100644
--- a/arch/powerpc/platforms/cell/spider-pic.c
+++ b/arch/powerpc/platforms/cell/spider-pic.c
@@ -187,7 +187,7 @@ static int spider_host_map(struct irq_host *h, unsigned int virq,
 }
 
 static int spider_host_xlate(struct irq_host *h, struct device_node *ct,
-			   u32 *intspec, unsigned int intsize,
+			   const u32 *intspec, unsigned int intsize,
 			   irq_hw_number_t *out_hwirq, unsigned int *out_flags)
 
 {
diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c
index 99d0b313e9a5..09e827296276 100644
--- a/arch/powerpc/platforms/powermac/pic.c
+++ b/arch/powerpc/platforms/powermac/pic.c
@@ -303,7 +303,7 @@ static int pmac_pic_host_map(struct irq_host *h, unsigned int virq,
 }
 
 static int pmac_pic_host_xlate(struct irq_host *h, struct device_node *ct,
-			       u32 *intspec, unsigned int intsize,
+			       const u32 *intspec, unsigned int intsize,
 			       irq_hw_number_t *out_hwirq,
 			       unsigned int *out_flags)
 
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
index 6592becd4410..690f87584f6b 100644
--- a/arch/powerpc/platforms/pseries/xics.c
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -434,7 +434,7 @@ static int xics_host_map(struct irq_host *h, unsigned int virq,
 }
 
 static int xics_host_xlate(struct irq_host *h, struct device_node *ct,
-			   u32 *intspec, unsigned int intsize,
+			   const u32 *intspec, unsigned int intsize,
 			   irq_hw_number_t *out_hwirq, unsigned int *out_flags)
 
 {
diff --git a/arch/powerpc/sysdev/cpm2_pic.c b/arch/powerpc/sysdev/cpm2_pic.c
index 059ea4e5e25f..971483f0dfac 100644
--- a/arch/powerpc/sysdev/cpm2_pic.c
+++ b/arch/powerpc/sysdev/cpm2_pic.c
@@ -218,7 +218,7 @@ static int cpm2_pic_host_map(struct irq_host *h, unsigned int virq,
 }
 
 static int cpm2_pic_host_xlate(struct irq_host *h, struct device_node *ct,
-			    u32 *intspec, unsigned int intsize,
+			    const u32 *intspec, unsigned int intsize,
 			    irq_hw_number_t *out_hwirq, unsigned int *out_flags)
 {
 	*out_hwirq = intspec[0];
diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c
index ba8f1f708992..0a55db8a5a29 100644
--- a/arch/powerpc/sysdev/i8259.c
+++ b/arch/powerpc/sysdev/i8259.c
@@ -198,7 +198,7 @@ static void i8259_host_unmap(struct irq_host *h, unsigned int virq)
 }
 
 static int i8259_host_xlate(struct irq_host *h, struct device_node *ct,
-			    u32 *intspec, unsigned int intsize,
+			    const u32 *intspec, unsigned int intsize,
 			    irq_hw_number_t *out_hwirq, unsigned int *out_flags)
 {
 	static unsigned char map_isa_senses[4] = {
diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c
index c89d78075ba0..28cdddd2f89e 100644
--- a/arch/powerpc/sysdev/ipic.c
+++ b/arch/powerpc/sysdev/ipic.c
@@ -697,7 +697,7 @@ static int ipic_host_map(struct irq_host *h, unsigned int virq,
 }
 
 static int ipic_host_xlate(struct irq_host *h, struct device_node *ct,
-			   u32 *intspec, unsigned int intsize,
+			   const u32 *intspec, unsigned int intsize,
 			   irq_hw_number_t *out_hwirq, unsigned int *out_flags)
 
 {
diff --git a/arch/powerpc/sysdev/mpc8xx_pic.c b/arch/powerpc/sysdev/mpc8xx_pic.c
index db0a712f6075..69bd6f4dff83 100644
--- a/arch/powerpc/sysdev/mpc8xx_pic.c
+++ b/arch/powerpc/sysdev/mpc8xx_pic.c
@@ -130,7 +130,7 @@ static int mpc8xx_pic_host_map(struct irq_host *h, unsigned int virq,
 
 
 static int mpc8xx_pic_host_xlate(struct irq_host *h, struct device_node *ct,
-			    u32 *intspec, unsigned int intsize,
+			    const u32 *intspec, unsigned int intsize,
 			    irq_hw_number_t *out_hwirq, unsigned int *out_flags)
 {
 	static unsigned char map_pic_senses[4] = {
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index 7a64bc5808da..aa9d06e5925b 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -994,7 +994,7 @@ static int mpic_host_map(struct irq_host *h, unsigned int virq,
 }
 
 static int mpic_host_xlate(struct irq_host *h, struct device_node *ct,
-			   u32 *intspec, unsigned int intsize,
+			   const u32 *intspec, unsigned int intsize,
 			   irq_hw_number_t *out_hwirq, unsigned int *out_flags)
 
 {
diff --git a/arch/powerpc/sysdev/qe_lib/qe_ic.c b/arch/powerpc/sysdev/qe_lib/qe_ic.c
index c1e17b3d3982..2acc928d1920 100644
--- a/arch/powerpc/sysdev/qe_lib/qe_ic.c
+++ b/arch/powerpc/sysdev/qe_lib/qe_ic.c
@@ -271,7 +271,7 @@ static int qe_ic_host_map(struct irq_host *h, unsigned int virq,
 }
 
 static int qe_ic_host_xlate(struct irq_host *h, struct device_node *ct,
-			    u32 * intspec, unsigned int intsize,
+			    const u32 * intspec, unsigned int intsize,
 			    irq_hw_number_t * out_hwirq,
 			    unsigned int *out_flags)
 {
diff --git a/arch/powerpc/sysdev/tsi108_pci.c b/arch/powerpc/sysdev/tsi108_pci.c
index 47769d2359d6..595034cfb85a 100644
--- a/arch/powerpc/sysdev/tsi108_pci.c
+++ b/arch/powerpc/sysdev/tsi108_pci.c
@@ -384,7 +384,7 @@ static struct irq_chip tsi108_pci_irq = {
 };
 
 static int pci_irq_host_xlate(struct irq_host *h, struct device_node *ct,
-			    u32 *intspec, unsigned int intsize,
+			    const u32 *intspec, unsigned int intsize,
 			    irq_hw_number_t *out_hwirq, unsigned int *out_flags)
 {
 	*out_hwirq = intspec[0];
diff --git a/arch/powerpc/sysdev/uic.c b/arch/powerpc/sysdev/uic.c
index c907601e44db..7d10074b3304 100644
--- a/arch/powerpc/sysdev/uic.c
+++ b/arch/powerpc/sysdev/uic.c
@@ -202,7 +202,7 @@ static int uic_host_map(struct irq_host *h, unsigned int virq,
 }
 
 static int uic_host_xlate(struct irq_host *h, struct device_node *ct,
-			  u32 *intspec, unsigned int intsize,
+			  const u32 *intspec, unsigned int intsize,
 			  irq_hw_number_t *out_hwirq, unsigned int *out_type)
 
 {
diff --git a/arch/powerpc/sysdev/xilinx_intc.c b/arch/powerpc/sysdev/xilinx_intc.c
index 45eb225ec25e..1e0ccfaf403e 100644
--- a/arch/powerpc/sysdev/xilinx_intc.c
+++ b/arch/powerpc/sysdev/xilinx_intc.c
@@ -148,7 +148,7 @@ static struct irq_chip xilinx_intc_edge_irqchip = {
  * xilinx_intc_xlate - translate virq# from device tree interrupts property
  */
 static int xilinx_intc_xlate(struct irq_host *h, struct device_node *ct,
-				u32 *intspec, unsigned int intsize,
+				const u32 *intspec, unsigned int intsize,
 				irq_hw_number_t *out_hwirq,
 				unsigned int *out_flags)
 {
-- 
cgit v1.2.3-59-g8ed1b