11 files changed, 3925 insertions, 5 deletions
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 046b4bf1f21e..4970e3721a84 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -49,12 +49,23 @@ extra-y				+= vmlinux.lds
 obj-y				+= process.o init_task.o time.o \
 				   prom.o traps.o setup-common.o
 obj-$(CONFIG_PPC32)		+= entry_32.o setup_32.o misc_32.o systbl.o
-obj-$(CONFIG_PPC64)		+= misc_64.o
+obj-$(CONFIG_PPC64)		+= misc_64.o dma_64.o iommu.o
 obj-$(CONFIG_PPC_OF)		+= prom_init.o
 obj-$(CONFIG_MODULES)		+= ppc_ksyms.o
 obj-$(CONFIG_BOOTX_TEXT)	+= btext.o
 obj-$(CONFIG_6xx)		+= idle_6xx.o
 obj-$(CONFIG_SMP)		+= smp.o
+obj-$(CONFIG_KPROBES)		+= kprobes.o
+
+module-$(CONFIG_PPC64)		+= module_64.o
+obj-$(CONFIG_MODULES)		+= $(module-y)
+
+pci64-$(CONFIG_PPC64)		+= pci_64.o pci_dn.o pci_iommu.o \
+				   pci_direct_iommu.o iomap.o
+obj-$(CONFIG_PCI)		+= $(pci64-y)
+
+kexec64-$(CONFIG_PPC64)		+= machine_kexec_64.o
+obj-$(CONFIG_KEXEC)		+= $(kexec64-y)
 
 ifeq ($(CONFIG_PPC_ISERIES),y)
 $(obj)/head_64.o: $(obj)/lparmap.s
@@ -62,11 +73,8 @@ AFLAGS_head_64.o += -I$(obj)
 endif
 
 else
-# stuff used from here for ARCH=ppc or ARCH=ppc64
+# stuff used from here for ARCH=ppc
 smpobj-$(CONFIG_SMP)		+= smp.o
-obj-$(CONFIG_PPC64)		+= traps.o process.o init_task.o time.o \
-				   setup-common.o $(smpobj-y)
-
 
 endif
 
diff --git a/arch/powerpc/kernel/dma_64.c b/arch/powerpc/kernel/dma_64.c
new file mode 100644
index 000000000000..7c3419656ccc
--- /dev/null
+++ b/arch/powerpc/kernel/dma_64.c
@@ -0,0 +1,151 @@
+/*
+ * Copyright (C) 2004 IBM Corporation
+ *
+ * Implements the generic device dma API for ppc64. Handles
+ * the pci and vio busses
+ */
+
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+/* Include the busses we support */
+#include <linux/pci.h>
+#include <asm/vio.h>
+#include <asm/scatterlist.h>
+#include <asm/bug.h>
+
+static struct dma_mapping_ops *get_dma_ops(struct device *dev)
+{
+#ifdef CONFIG_PCI
+	if (dev->bus == &pci_bus_type)
+		return &pci_dma_ops;
+#endif
+#ifdef CONFIG_IBMVIO
+	if (dev->bus == &vio_bus_type)
+		return &vio_dma_ops;
+#endif
+	return NULL;
+}
+
+int dma_supported(struct device *dev, u64 mask)
+{
+	struct dma_mapping_ops *dma_ops = get_dma_ops(dev);
+
+	if (dma_ops)
+		return dma_ops->dma_supported(dev, mask);
+	BUG();
+	return 0;
+}
+EXPORT_SYMBOL(dma_supported);
+
+int dma_set_mask(struct device *dev, u64 dma_mask)
+{
+#ifdef CONFIG_PCI
+	if (dev->bus == &pci_bus_type)
+		return pci_set_dma_mask(to_pci_dev(dev), dma_mask);
+#endif
+#ifdef CONFIG_IBMVIO
+	if (dev->bus == &vio_bus_type)
+		return -EIO;
+#endif /* CONFIG_IBMVIO */
+	BUG();
+	return 0;
+}
+EXPORT_SYMBOL(dma_set_mask);
+
+void *dma_alloc_coherent(struct device *dev, size_t size,
+		dma_addr_t *dma_handle, gfp_t flag)
+{
+	struct dma_mapping_ops *dma_ops = get_dma_ops(dev);
+
+	if (dma_ops)
+		return dma_ops->alloc_coherent(dev, size, dma_handle, flag);
+	BUG();
+	return NULL;
+}
+EXPORT_SYMBOL(dma_alloc_coherent);
+
+void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
+		dma_addr_t dma_handle)
+{
+	struct dma_mapping_ops *dma_ops = get_dma_ops(dev);
+
+	if (dma_ops)
+		dma_ops->free_coherent(dev, size, cpu_addr, dma_handle);
+	else
+		BUG();
+}
+EXPORT_SYMBOL(dma_free_coherent);
+
+dma_addr_t dma_map_single(struct device *dev, void *cpu_addr, size_t size,
+		enum dma_data_direction direction)
+{
+	struct dma_mapping_ops *dma_ops = get_dma_ops(dev);
+
+	if (dma_ops)
+		return dma_ops->map_single(dev, cpu_addr, size, direction);
+	BUG();
+	return (dma_addr_t)0;
+}
+EXPORT_SYMBOL(dma_map_single);
+
+void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
+		enum dma_data_direction direction)
+{
+	struct dma_mapping_ops *dma_ops = get_dma_ops(dev);
+
+	if (dma_ops)
+		dma_ops->unmap_single(dev, dma_addr, size, direction);
+	else
+		BUG();
+}
+EXPORT_SYMBOL(dma_unmap_single);
+
+dma_addr_t dma_map_page(struct device *dev, struct page *page,
+		unsigned long offset, size_t size,
+		enum dma_data_direction direction)
+{
+	struct dma_mapping_ops *dma_ops = get_dma_ops(dev);
+
+	if (dma_ops)
+		return dma_ops->map_single(dev,
+				(page_address(page) + offset), size, direction);
+	BUG();
+	return (dma_addr_t)0;
+}
+EXPORT_SYMBOL(dma_map_page);
+
+void dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
+		enum dma_data_direction direction)
+{
+	struct dma_mapping_ops *dma_ops = get_dma_ops(dev);
+
+	if (dma_ops)
+		dma_ops->unmap_single(dev, dma_address, size, direction);
+	else
+		BUG();
+}
+EXPORT_SYMBOL(dma_unmap_page);
+
+int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+		enum dma_data_direction direction)
+{
+	struct dma_mapping_ops *dma_ops = get_dma_ops(dev);
+
+	if (dma_ops)
+		return dma_ops->map_sg(dev, sg, nents, direction);
+	BUG();
+	return 0;
+}
+EXPORT_SYMBOL(dma_map_sg);
+
+void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries,
+		enum dma_data_direction direction)
+{
+	struct dma_mapping_ops *dma_ops = get_dma_ops(dev);
+
+	if (dma_ops)
+		dma_ops->unmap_sg(dev, sg, nhwentries, direction);
+	else
+		BUG();
+}
+EXPORT_SYMBOL(dma_unmap_sg);
diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c
new file mode 100644
index 000000000000..6160c8dbb7c5
--- /dev/null
+++ b/arch/powerpc/kernel/iomap.c
@@ -0,0 +1,146 @@
+/*
+ * arch/ppc64/kernel/iomap.c
+ *
+ * ppc64 "iomap" interface implementation.
+ *
+ * (C) Copyright 2004 Linus Torvalds
+ */
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/mm.h>
+#include <asm/io.h>
+
+/*
+ * Here comes the ppc64 implementation of the IOMAP 
+ * interfaces.
+ */
+unsigned int fastcall ioread8(void __iomem *addr)
+{
+	return readb(addr);
+}
+unsigned int fastcall ioread16(void __iomem *addr)
+{
+	return readw(addr);
+}
+unsigned int fastcall ioread16be(void __iomem *addr)
+{
+	return in_be16(addr);
+}
+unsigned int fastcall ioread32(void __iomem *addr)
+{
+	return readl(addr);
+}
+unsigned int fastcall ioread32be(void __iomem *addr)
+{
+	return in_be32(addr);
+}
+EXPORT_SYMBOL(ioread8);
+EXPORT_SYMBOL(ioread16);
+EXPORT_SYMBOL(ioread16be);
+EXPORT_SYMBOL(ioread32);
+EXPORT_SYMBOL(ioread32be);
+
+void fastcall iowrite8(u8 val, void __iomem *addr)
+{
+	writeb(val, addr);
+}
+void fastcall iowrite16(u16 val, void __iomem *addr)
+{
+	writew(val, addr);
+}
+void fastcall iowrite16be(u16 val, void __iomem *addr)
+{
+	out_be16(addr, val);
+}
+void fastcall iowrite32(u32 val, void __iomem *addr)
+{
+	writel(val, addr);
+}
+void fastcall iowrite32be(u32 val, void __iomem *addr)
+{
+	out_be32(addr, val);
+}
+EXPORT_SYMBOL(iowrite8);
+EXPORT_SYMBOL(iowrite16);
+EXPORT_SYMBOL(iowrite16be);
+EXPORT_SYMBOL(iowrite32);
+EXPORT_SYMBOL(iowrite32be);
+
+/*
+ * These are the "repeat read/write" functions. Note the
+ * non-CPU byte order. We do things in "IO byteorder"
+ * here.
+ *
+ * FIXME! We could make these do EEH handling if we really
+ * wanted. Not clear if we do.
+ */
+void ioread8_rep(void __iomem *addr, void *dst, unsigned long count)
+{
+	_insb((u8 __iomem *) addr, dst, count);
+}
+void ioread16_rep(void __iomem *addr, void *dst, unsigned long count)
+{
+	_insw_ns((u16 __iomem *) addr, dst, count);
+}
+void ioread32_rep(void __iomem *addr, void *dst, unsigned long count)
+{
+	_insl_ns((u32 __iomem *) addr, dst, count);
+}
+EXPORT_SYMBOL(ioread8_rep);
+EXPORT_SYMBOL(ioread16_rep);
+EXPORT_SYMBOL(ioread32_rep);
+
+void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count)
+{
+	_outsb((u8 __iomem *) addr, src, count);
+}
+void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count)
+{
+	_outsw_ns((u16 __iomem *) addr, src, count);
+}
+void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count)
+{
+	_outsl_ns((u32 __iomem *) addr, src, count);
+}
+EXPORT_SYMBOL(iowrite8_rep);
+EXPORT_SYMBOL(iowrite16_rep);
+EXPORT_SYMBOL(iowrite32_rep);
+
+void __iomem *ioport_map(unsigned long port, unsigned int len)
+{
+	if (!_IO_IS_VALID(port))
+		return NULL;
+	return (void __iomem *) (port+pci_io_base);
+}
+
+void ioport_unmap(void __iomem *addr)
+{
+	/* Nothing to do */
+}
+EXPORT_SYMBOL(ioport_map);
+EXPORT_SYMBOL(ioport_unmap);
+
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max)
+{
+	unsigned long start = pci_resource_start(dev, bar);
+	unsigned long len = pci_resource_len(dev, bar);
+	unsigned long flags = pci_resource_flags(dev, bar);
+
+	if (!len)
+		return NULL;
+	if (max && len > max)
+		len = max;
+	if (flags & IORESOURCE_IO)
+		return ioport_map(start, len);
+	if (flags & IORESOURCE_MEM)
+		return ioremap(start, len);
+	/* What? */
+	return NULL;
+}
+
+void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
+{
+	/* Nothing to do */
+}
+EXPORT_SYMBOL(pci_iomap);
+EXPORT_SYMBOL(pci_iounmap);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
new file mode 100644
index 000000000000..4d9b4388918b
--- /dev/null
+++ b/arch/powerpc/kernel/iommu.c
@@ -0,0 +1,572 @@
+/*
+ * arch/ppc64/kernel/iommu.c
+ * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
+ * 
+ * Rewrite, cleanup, new allocation schemes, virtual merging: 
+ * Copyright (C) 2004 Olof Johansson, IBM Corporation
+ *               and  Ben. Herrenschmidt, IBM Corporation
+ *
+ * Dynamic DMA mapping support, bus-independent parts.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/dma-mapping.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/iommu.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+
+#define DBG(...)
+
+#ifdef CONFIG_IOMMU_VMERGE
+static int novmerge = 0;
+#else
+static int novmerge = 1;
+#endif
+
+static int __init setup_iommu(char *str)
+{
+	if (!strcmp(str, "novmerge"))
+		novmerge = 1;
+	else if (!strcmp(str, "vmerge"))
+		novmerge = 0;
+	return 1;
+}
+
+__setup("iommu=", setup_iommu);
+
+static unsigned long iommu_range_alloc(struct iommu_table *tbl,
+                                       unsigned long npages,
+                                       unsigned long *handle,
+                                       unsigned int align_order)
+{ 
+	unsigned long n, end, i, start;
+	unsigned long limit;
+	int largealloc = npages > 15;
+	int pass = 0;
+	unsigned long align_mask;
+
+	align_mask = 0xffffffffffffffffl >> (64 - align_order);
+
+	/* This allocator was derived from x86_64's bit string search */
+
+	/* Sanity check */
+	if (unlikely(npages) == 0) {
+		if (printk_ratelimit())
+			WARN_ON(1);
+		return DMA_ERROR_CODE;
+	}
+
+	if (handle && *handle)
+		start = *handle;
+	else
+		start = largealloc ? tbl->it_largehint : tbl->it_hint;
+
+	/* Use only half of the table for small allocs (15 pages or less) */
+	limit = largealloc ? tbl->it_size : tbl->it_halfpoint;
+
+	if (largealloc && start < tbl->it_halfpoint)
+		start = tbl->it_halfpoint;
+
+	/* The case below can happen if we have a small segment appended
+	 * to a large, or when the previous alloc was at the very end of
+	 * the available space. If so, go back to the initial start.
+	 */
+	if (start >= limit)
+		start = largealloc ? tbl->it_largehint : tbl->it_hint;
+	
+ again:
+
+	n = find_next_zero_bit(tbl->it_map, limit, start);
+
+	/* Align allocation */
+	n = (n + align_mask) & ~align_mask;
+
+	end = n + npages;
+
+	if (unlikely(end >= limit)) {
+		if (likely(pass < 2)) {
+			/* First failure, just rescan the half of the table.
+			 * Second failure, rescan the other half of the table.
+			 */
+			start = (largealloc ^ pass) ? tbl->it_halfpoint : 0;
+			limit = pass ? tbl->it_size : limit;
+			pass++;
+			goto again;
+		} else {
+			/* Third failure, give up */
+			return DMA_ERROR_CODE;
+		}
+	}
+
+	for (i = n; i < end; i++)
+		if (test_bit(i, tbl->it_map)) {
+			start = i+1;
+			goto again;
+		}
+
+	for (i = n; i < end; i++)
+		__set_bit(i, tbl->it_map);
+
+	/* Bump the hint to a new block for small allocs. */
+	if (largealloc) {
+		/* Don't bump to new block to avoid fragmentation */
+		tbl->it_largehint = end;
+	} else {
+		/* Overflow will be taken care of at the next allocation */
+		tbl->it_hint = (end + tbl->it_blocksize - 1) &
+		                ~(tbl->it_blocksize - 1);
+	}
+
+	/* Update handle for SG allocations */
+	if (handle)
+		*handle = end;
+
+	return n;
+}
+
+static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page,
+		       unsigned int npages, enum dma_data_direction direction,
+		       unsigned int align_order)
+{
+	unsigned long entry, flags;
+	dma_addr_t ret = DMA_ERROR_CODE;
+	
+	spin_lock_irqsave(&(tbl->it_lock), flags);
+
+	entry = iommu_range_alloc(tbl, npages, NULL, align_order);
+
+	if (unlikely(entry == DMA_ERROR_CODE)) {
+		spin_unlock_irqrestore(&(tbl->it_lock), flags);
+		return DMA_ERROR_CODE;
+	}
+
+	entry += tbl->it_offset;	/* Offset into real TCE table */
+	ret = entry << PAGE_SHIFT;	/* Set the return dma address */
+
+	/* Put the TCEs in the HW table */
+	ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & PAGE_MASK,
+			 direction);
+
+
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+
+	return ret;
+}
+
+static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 
+			 unsigned int npages)
+{
+	unsigned long entry, free_entry;
+	unsigned long i;
+
+	entry = dma_addr >> PAGE_SHIFT;
+	free_entry = entry - tbl->it_offset;
+
+	if (((free_entry + npages) > tbl->it_size) ||
+	    (entry < tbl->it_offset)) {
+		if (printk_ratelimit()) {
+			printk(KERN_INFO "iommu_free: invalid entry\n");
+			printk(KERN_INFO "\tentry     = 0x%lx\n", entry); 
+			printk(KERN_INFO "\tdma_addr  = 0x%lx\n", (u64)dma_addr);
+			printk(KERN_INFO "\tTable     = 0x%lx\n", (u64)tbl);
+			printk(KERN_INFO "\tbus#      = 0x%lx\n", (u64)tbl->it_busno);
+			printk(KERN_INFO "\tsize      = 0x%lx\n", (u64)tbl->it_size);
+			printk(KERN_INFO "\tstartOff  = 0x%lx\n", (u64)tbl->it_offset);
+			printk(KERN_INFO "\tindex     = 0x%lx\n", (u64)tbl->it_index);
+			WARN_ON(1);
+		}
+		return;
+	}
+
+	ppc_md.tce_free(tbl, entry, npages);
+	
+	for (i = 0; i < npages; i++)
+		__clear_bit(free_entry+i, tbl->it_map);
+}
+
+static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+		unsigned int npages)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&(tbl->it_lock), flags);
+
+	__iommu_free(tbl, dma_addr, npages);
+
+	/* Make sure TLB cache is flushed if the HW needs it. We do
+	 * not do an mb() here on purpose, it is not needed on any of
+	 * the current platforms.
+	 */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+}
+
+int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
+		struct scatterlist *sglist, int nelems,
+		enum dma_data_direction direction)
+{
+	dma_addr_t dma_next = 0, dma_addr;
+	unsigned long flags;
+	struct scatterlist *s, *outs, *segstart;
+	int outcount, incount;
+	unsigned long handle;
+
+	BUG_ON(direction == DMA_NONE);
+
+	if ((nelems == 0) || !tbl)
+		return 0;
+
+	outs = s = segstart = &sglist[0];
+	outcount = 1;
+	incount = nelems;
+	handle = 0;
+
+	/* Init first segment length for backout at failure */
+	outs->dma_length = 0;
+
+	DBG("mapping %d elements:\n", nelems);
+
+	spin_lock_irqsave(&(tbl->it_lock), flags);
+
+	for (s = outs; nelems; nelems--, s++) {
+		unsigned long vaddr, npages, entry, slen;
+
+		slen = s->length;
+		/* Sanity check */
+		if (slen == 0) {
+			dma_next = 0;
+			continue;
+		}
+		/* Allocate iommu entries for that segment */
+		vaddr = (unsigned long)page_address(s->page) + s->offset;
+		npages = PAGE_ALIGN(vaddr + slen) - (vaddr & PAGE_MASK);
+		npages >>= PAGE_SHIFT;
+		entry = iommu_range_alloc(tbl, npages, &handle, 0);
+
+		DBG("  - vaddr: %lx, size: %lx\n", vaddr, slen);
+
+		/* Handle failure */
+		if (unlikely(entry == DMA_ERROR_CODE)) {
+			if (printk_ratelimit())
+				printk(KERN_INFO "iommu_alloc failed, tbl %p vaddr %lx"
+				       " npages %lx\n", tbl, vaddr, npages);
+			goto failure;
+		}
+
+		/* Convert entry to a dma_addr_t */
+		entry += tbl->it_offset;
+		dma_addr = entry << PAGE_SHIFT;
+		dma_addr |= s->offset;
+
+		DBG("  - %lx pages, entry: %lx, dma_addr: %lx\n",
+			    npages, entry, dma_addr);
+
+		/* Insert into HW table */
+		ppc_md.tce_build(tbl, entry, npages, vaddr & PAGE_MASK, direction);
+
+		/* If we are in an open segment, try merging */
+		if (segstart != s) {
+			DBG("  - trying merge...\n");
+			/* We cannot merge if:
+			 * - allocated dma_addr isn't contiguous to previous allocation
+			 */
+			if (novmerge || (dma_addr != dma_next)) {
+				/* Can't merge: create a new segment */
+				segstart = s;
+				outcount++; outs++;
+				DBG("    can't merge, new segment.\n");
+			} else {
+				outs->dma_length += s->length;
+				DBG("    merged, new len: %lx\n", outs->dma_length);
+			}
+		}
+
+		if (segstart == s) {
+			/* This is a new segment, fill entries */
+			DBG("  - filling new segment.\n");
+			outs->dma_address = dma_addr;
+			outs->dma_length = slen;
+		}
+
+		/* Calculate next page pointer for contiguous check */
+		dma_next = dma_addr + slen;
+
+		DBG("  - dma next is: %lx\n", dma_next);
+	}
+
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+
+	DBG("mapped %d elements:\n", outcount);
+
+	/* For the sake of iommu_unmap_sg, we clear out the length in the
+	 * next entry of the sglist if we didn't fill the list completely
+	 */
+	if (outcount < incount) {
+		outs++;
+		outs->dma_address = DMA_ERROR_CODE;
+		outs->dma_length = 0;
+	}
+	return outcount;
+
+ failure:
+	for (s = &sglist[0]; s <= outs; s++) {
+		if (s->dma_length != 0) {
+			unsigned long vaddr, npages;
+
+			vaddr = s->dma_address & PAGE_MASK;
+			npages = (PAGE_ALIGN(s->dma_address + s->dma_length) - vaddr)
+				>> PAGE_SHIFT;
+			__iommu_free(tbl, vaddr, npages);
+		}
+	}
+	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+	return 0;
+}
+
+
+void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
+		int nelems, enum dma_data_direction direction)
+{
+	unsigned long flags;
+
+	BUG_ON(direction == DMA_NONE);
+
+	if (!tbl)
+		return;
+
+	spin_lock_irqsave(&(tbl->it_lock), flags);
+
+	while (nelems--) {
+		unsigned int npages;
+		dma_addr_t dma_handle = sglist->dma_address;
+
+		if (sglist->dma_length == 0)
+			break;
+		npages = (PAGE_ALIGN(dma_handle + sglist->dma_length)
+			  - (dma_handle & PAGE_MASK)) >> PAGE_SHIFT;
+		__iommu_free(tbl, dma_handle, npages);
+		sglist++;
+	}
+
+	/* Flush/invalidate TLBs if necessary. As for iommu_free(), we
+	 * do not do an mb() here, the affected platforms do not need it
+	 * when freeing.
+	 */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+}
+
+/*
+ * Build a iommu_table structure.  This contains a bit map which
+ * is used to manage allocation of the tce space.
+ */
+struct iommu_table *iommu_init_table(struct iommu_table *tbl)
+{
+	unsigned long sz;
+	static int welcomed = 0;
+
+	/* Set aside 1/4 of the table for large allocations. */
+	tbl->it_halfpoint = tbl->it_size * 3 / 4;
+
+	/* number of bytes needed for the bitmap */
+	sz = (tbl->it_size + 7) >> 3;
+
+	tbl->it_map = (unsigned long *)__get_free_pages(GFP_ATOMIC, get_order(sz));
+	if (!tbl->it_map)
+		panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
+
+	memset(tbl->it_map, 0, sz);
+
+	tbl->it_hint = 0;
+	tbl->it_largehint = tbl->it_halfpoint;
+	spin_lock_init(&tbl->it_lock);
+
+	/* Clear the hardware table in case firmware left allocations in it */
+	ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size);
+
+	if (!welcomed) {
+		printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n",
+		       novmerge ? "disabled" : "enabled");
+		welcomed = 1;
+	}
+
+	return tbl;
+}
+
+void iommu_free_table(struct device_node *dn)
+{
+	struct pci_dn *pdn = dn->data;
+	struct iommu_table *tbl = pdn->iommu_table;
+	unsigned long bitmap_sz, i;
+	unsigned int order;
+
+	if (!tbl || !tbl->it_map) {
+		printk(KERN_ERR "%s: expected TCE map for %s\n", __FUNCTION__,
+				dn->full_name);
+		return;
+	}
+
+	/* verify that table contains no entries */
+	/* it_size is in entries, and we're examining 64 at a time */
+	for (i = 0; i < (tbl->it_size/64); i++) {
+		if (tbl->it_map[i] != 0) {
+			printk(KERN_WARNING "%s: Unexpected TCEs for %s\n",
+				__FUNCTION__, dn->full_name);
+			break;
+		}
+	}
+
+	/* calculate bitmap size in bytes */
+	bitmap_sz = (tbl->it_size + 7) / 8;
+
+	/* free bitmap */
+	order = get_order(bitmap_sz);
+	free_pages((unsigned long) tbl->it_map, order);
+
+	/* free table */
+	kfree(tbl);
+}
+
+/* Creates TCEs for a user provided buffer.  The user buffer must be
+ * contiguous real kernel storage (not vmalloc).  The address of the buffer
+ * passed here is the kernel (virtual) address of the buffer.  The buffer
+ * need not be page aligned, the dma_addr_t returned will point to the same
+ * byte within the page as vaddr.
+ */
+dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr,
+		size_t size, enum dma_data_direction direction)
+{
+	dma_addr_t dma_handle = DMA_ERROR_CODE;
+	unsigned long uaddr;
+	unsigned int npages;
+
+	BUG_ON(direction == DMA_NONE);
+
+	uaddr = (unsigned long)vaddr;
+	npages = PAGE_ALIGN(uaddr + size) - (uaddr & PAGE_MASK);
+	npages >>= PAGE_SHIFT;
+
+	if (tbl) {
+		dma_handle = iommu_alloc(tbl, vaddr, npages, direction, 0);
+		if (dma_handle == DMA_ERROR_CODE) {
+			if (printk_ratelimit())  {
+				printk(KERN_INFO "iommu_alloc failed, "
+						"tbl %p vaddr %p npages %d\n",
+						tbl, vaddr, npages);
+			}
+		} else
+			dma_handle |= (uaddr & ~PAGE_MASK);
+	}
+
+	return dma_handle;
+}
+
+void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle,
+		size_t size, enum dma_data_direction direction)
+{
+	BUG_ON(direction == DMA_NONE);
+
+	if (tbl)
+		iommu_free(tbl, dma_handle, (PAGE_ALIGN(dma_handle + size) -
+					(dma_handle & PAGE_MASK)) >> PAGE_SHIFT);
+}
+
+/* Allocates a contiguous real buffer and creates mappings over it.
+ * Returns the virtual address of the buffer and sets dma_handle
+ * to the dma address (mapping) of the first page.
+ */
+void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size,
+		dma_addr_t *dma_handle, gfp_t flag)
+{
+	void *ret = NULL;
+	dma_addr_t mapping;
+	unsigned int npages, order;
+
+	size = PAGE_ALIGN(size);
+	npages = size >> PAGE_SHIFT;
+	order = get_order(size);
+
+ 	/*
+	 * Client asked for way too much space.  This is checked later
+	 * anyway.  It is easier to debug here for the drivers than in
+	 * the tce tables.
+	 */
+	if (order >= IOMAP_MAX_ORDER) {
+		printk("iommu_alloc_consistent size too large: 0x%lx\n", size);
+		return NULL;
+	}
+
+	if (!tbl)
+		return NULL;
+
+	/* Alloc enough pages (and possibly more) */
+	ret = (void *)__get_free_pages(flag, order);
+	if (!ret)
+		return NULL;
+	memset(ret, 0, size);
+
+	/* Set up tces to cover the allocated range */
+	mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL, order);
+	if (mapping == DMA_ERROR_CODE) {
+		free_pages((unsigned long)ret, order);
+		ret = NULL;
+	} else
+		*dma_handle = mapping;
+	return ret;
+}
+
+void iommu_free_coherent(struct iommu_table *tbl, size_t size,
+			 void *vaddr, dma_addr_t dma_handle)
+{
+	unsigned int npages;
+
+	if (tbl) {
+		size = PAGE_ALIGN(size);
+		npages = size >> PAGE_SHIFT;
+		iommu_free(tbl, dma_handle, npages);
+		free_pages((unsigned long)vaddr, get_order(size));
+	}
+}
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
new file mode 100644
index 000000000000..511af54e6230
--- /dev/null
+++ b/arch/powerpc/kernel/kprobes.c
@@ -0,0 +1,459 @@
+/*
+ *  Kernel Probes (KProbes)
+ *  arch/ppc64/kernel/kprobes.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
+ *		Probes initial implementation ( includes contributions from
+ *		Rusty Russell).
+ * 2004-July	Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
+ *		interface to access function arguments.
+ * 2004-Nov	Ananth N Mavinakayanahalli <ananth@in.ibm.com> kprobes port
+ *		for PPC64
+ */
+
+#include <linux/config.h>
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/preempt.h>
+#include <asm/cacheflush.h>
+#include <asm/kdebug.h>
+#include <asm/sstep.h>
+
+static DECLARE_MUTEX(kprobe_mutex);
+DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
+
+int __kprobes arch_prepare_kprobe(struct kprobe *p)
+{
+	int ret = 0;
+	kprobe_opcode_t insn = *p->addr;
+
+	if ((unsigned long)p->addr & 0x03) {
+		printk("Attempt to register kprobe at an unaligned address\n");
+		ret = -EINVAL;
+	} else if (IS_MTMSRD(insn) || IS_RFID(insn)) {
+		printk("Cannot register a kprobe on rfid or mtmsrd\n");
+		ret = -EINVAL;
+	}
+
+	/* insn must be on a special executable page on ppc64 */
+	if (!ret) {
+		down(&kprobe_mutex);
+		p->ainsn.insn = get_insn_slot();
+		up(&kprobe_mutex);
+		if (!p->ainsn.insn)
+			ret = -ENOMEM;
+	}
+	return ret;
+}
+
+void __kprobes arch_copy_kprobe(struct kprobe *p)
+{
+	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+	p->opcode = *p->addr;
+}
+
+void __kprobes arch_arm_kprobe(struct kprobe *p)
+{
+	*p->addr = BREAKPOINT_INSTRUCTION;
+	flush_icache_range((unsigned long) p->addr,
+			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+}
+
+void __kprobes arch_disarm_kprobe(struct kprobe *p)
+{
+	*p->addr = p->opcode;
+	flush_icache_range((unsigned long) p->addr,
+			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+}
+
+void __kprobes arch_remove_kprobe(struct kprobe *p)
+{
+	down(&kprobe_mutex);
+	free_insn_slot(p->ainsn.insn);
+	up(&kprobe_mutex);
+}
+
+static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
+{
+	kprobe_opcode_t insn = *p->ainsn.insn;
+
+	regs->msr |= MSR_SE;
+
+	/* single step inline if it is a trap variant */
+	if (is_trap(insn))
+		regs->nip = (unsigned long)p->addr;
+	else
+		regs->nip = (unsigned long)p->ainsn.insn;
+}
+
+static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+	kcb->prev_kprobe.kp = kprobe_running();
+	kcb->prev_kprobe.status = kcb->kprobe_status;
+	kcb->prev_kprobe.saved_msr = kcb->kprobe_saved_msr;
+}
+
+static inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+	__get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
+	kcb->kprobe_status = kcb->prev_kprobe.status;
+	kcb->kprobe_saved_msr = kcb->prev_kprobe.saved_msr;
+}
+
+static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+				struct kprobe_ctlblk *kcb)
+{
+	__get_cpu_var(current_kprobe) = p;
+	kcb->kprobe_saved_msr = regs->msr;
+}
+
+/* Called with kretprobe_lock held */
+void __kprobes arch_prepare_kretprobe(struct kretprobe *rp,
+				      struct pt_regs *regs)
+{
+	struct kretprobe_instance *ri;
+
+	if ((ri = get_free_rp_inst(rp)) != NULL) {
+		ri->rp = rp;
+		ri->task = current;
+		ri->ret_addr = (kprobe_opcode_t *)regs->link;
+
+		/* Replace the return addr with trampoline addr */
+		regs->link = (unsigned long)kretprobe_trampoline;
+		add_rp_inst(ri);
+	} else {
+		rp->nmissed++;
+	}
+}
+
+static inline int kprobe_handler(struct pt_regs *regs)
+{
+	struct kprobe *p;
+	int ret = 0;
+	unsigned int *addr = (unsigned int *)regs->nip;
+	struct kprobe_ctlblk *kcb;
+
+	/*
+	 * We don't want to be preempted for the entire
+	 * duration of kprobe processing
+	 */
+	preempt_disable();
+	kcb = get_kprobe_ctlblk();
+
+	/* Check we're not actually recursing */
+	if (kprobe_running()) {
+		p = get_kprobe(addr);
+		if (p) {
+			kprobe_opcode_t insn = *p->ainsn.insn;
+			if (kcb->kprobe_status == KPROBE_HIT_SS &&
+					is_trap(insn)) {
+				regs->msr &= ~MSR_SE;
+				regs->msr |= kcb->kprobe_saved_msr;
+				goto no_kprobe;
+			}
+			/* We have reentered the kprobe_handler(), since
+			 * another probe was hit while within the handler.
+			 * We here save the original kprobes variables and
+			 * just single step on the instruction of the new probe
+			 * without calling any user handlers.
+			 */
+			save_previous_kprobe(kcb);
+			set_current_kprobe(p, regs, kcb);
+			kcb->kprobe_saved_msr = regs->msr;
+			p->nmissed++;
+			prepare_singlestep(p, regs);
+			kcb->kprobe_status = KPROBE_REENTER;
+			return 1;
+		} else {
+			p = __get_cpu_var(current_kprobe);
+			if (p->break_handler && p->break_handler(p, regs)) {
+				goto ss_probe;
+			}
+		}
+		goto no_kprobe;
+	}
+
+	p = get_kprobe(addr);
+	if (!p) {
+		if (*addr != BREAKPOINT_INSTRUCTION) {
+			/*
+			 * PowerPC has multiple variants of the "trap"
+			 * instruction. If the current instruction is a
+			 * trap variant, it could belong to someone else
+			 */
+			kprobe_opcode_t cur_insn = *addr;
+			if (is_trap(cur_insn))
+		       		goto no_kprobe;
+			/*
+			 * The breakpoint instruction was removed right
+			 * after we hit it.  Another cpu has removed
+			 * either a probepoint or a debugger breakpoint
+			 * at this address.  In either case, no further
+			 * handling of this interrupt is appropriate.
+			 */
+			ret = 1;
+		}
+		/* Not one of ours: let kernel handle it */
+		goto no_kprobe;
+	}
+
+	kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+	set_current_kprobe(p, regs, kcb);
+	if (p->pre_handler && p->pre_handler(p, regs))
+		/* handler has already set things up, so skip ss setup */
+		return 1;
+
+ss_probe:
+	prepare_singlestep(p, regs);
+	kcb->kprobe_status = KPROBE_HIT_SS;
+	return 1;
+
+no_kprobe:
+	preempt_enable_no_resched();
+	return ret;
+}
+
+/*
+ * Function return probe trampoline:
+ * 	- init_kprobes() establishes a probepoint here
+ * 	- When the probed function returns, this probe
+ * 		causes the handlers to fire
+ */
+void kretprobe_trampoline_holder(void)
+{
+	asm volatile(".global kretprobe_trampoline\n"
+			"kretprobe_trampoline:\n"
+			"nop\n");
+}
+
+/*
+ * Called when the probe at kretprobe trampoline is hit
+ */
+int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
+{
+        struct kretprobe_instance *ri = NULL;
+        struct hlist_head *head;
+        struct hlist_node *node, *tmp;
+	unsigned long flags, orig_ret_address = 0;
+	unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
+
+	spin_lock_irqsave(&kretprobe_lock, flags);
+        head = kretprobe_inst_table_head(current);
+
+	/*
+	 * It is possible to have multiple instances associated with a given
+	 * task either because an multiple functions in the call path
+	 * have a return probe installed on them, and/or more then one return
+	 * return probe was registered for a target function.
+	 *
+	 * We can handle this because:
+	 *     - instances are always inserted at the head of the list
+	 *     - when multiple return probes are registered for the same
+         *       function, the first instance's ret_addr will point to the
+	 *       real return address, and all the rest will point to
+	 *       kretprobe_trampoline
+	 */
+	hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+                if (ri->task != current)
+			/* another task is sharing our hash bucket */
+                        continue;
+
+		if (ri->rp && ri->rp->handler)
+			ri->rp->handler(ri, regs);
+
+		orig_ret_address = (unsigned long)ri->ret_addr;
+		recycle_rp_inst(ri);
+
+		if (orig_ret_address != trampoline_address)
+			/*
+			 * This is the real return address. Any other
+			 * instances associated with this task are for
+			 * other calls deeper on the call stack
+			 */
+			break;
+	}
+
+	BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address));
+	regs->nip = orig_ret_address;
+
+	reset_current_kprobe();
+	spin_unlock_irqrestore(&kretprobe_lock, flags);
+	preempt_enable_no_resched();
+
+        /*
+         * By returning a non-zero value, we are telling
+         * kprobe_handler() that we don't want the post_handler
+         * to run (and have re-enabled preemption)
+         */
+        return 1;
+}
+
+/*
+ * Called after single-stepping.  p->addr is the address of the
+ * instruction whose first byte has been replaced by the "breakpoint"
+ * instruction.  To avoid the SMP problems that can occur when we
+ * temporarily put back the original opcode to single-step, we
+ * single-stepped a copy of the instruction.  The address of this
+ * copy is p->ainsn.insn.
+ */
+static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs)
+{
+	int ret;
+	unsigned int insn = *p->ainsn.insn;
+
+	regs->nip = (unsigned long)p->addr;
+	ret = emulate_step(regs, insn);
+	if (ret == 0)
+		regs->nip = (unsigned long)p->addr + 4;
+}
+
+static inline int post_kprobe_handler(struct pt_regs *regs)
+{
+	struct kprobe *cur = kprobe_running();
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	if (!cur)
+		return 0;
+
+	if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
+		kcb->kprobe_status = KPROBE_HIT_SSDONE;
+		cur->post_handler(cur, regs, 0);
+	}
+
+	resume_execution(cur, regs);
+	regs->msr |= kcb->kprobe_saved_msr;
+
+	/*Restore back the original saved kprobes variables and continue. */
+	if (kcb->kprobe_status == KPROBE_REENTER) {
+		restore_previous_kprobe(kcb);
+		goto out;
+	}
+	reset_current_kprobe();
+out:
+	preempt_enable_no_resched();
+
+	/*
+	 * if somebody else is singlestepping across a probe point, msr
+	 * will have SE set, in which case, continue the remaining processing
+	 * of do_debug, as if this is not a probe hit.
+	 */
+	if (regs->msr & MSR_SE)
+		return 0;
+
+	return 1;
+}
+
+static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+	struct kprobe *cur = kprobe_running();
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
+		return 1;
+
+	if (kcb->kprobe_status & KPROBE_HIT_SS) {
+		resume_execution(cur, regs);
+		regs->msr &= ~MSR_SE;
+		regs->msr |= kcb->kprobe_saved_msr;
+
+		reset_current_kprobe();
+		preempt_enable_no_resched();
+	}
+	return 0;
+}
+
+/*
+ * Wrapper routine to for handling exceptions.
+ */
+int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
+				       unsigned long val, void *data)
+{
+	struct die_args *args = (struct die_args *)data;
+	int ret = NOTIFY_DONE;
+
+	switch (val) {
+	case DIE_BPT:
+		if (kprobe_handler(args->regs))
+			ret = NOTIFY_STOP;
+		break;
+	case DIE_SSTEP:
+		if (post_kprobe_handler(args->regs))
+			ret = NOTIFY_STOP;
+		break;
+	case DIE_PAGE_FAULT:
+		/* kprobe_running() needs smp_processor_id() */
+		preempt_disable();
+		if (kprobe_running() &&
+		    kprobe_fault_handler(args->regs, args->trapnr))
+			ret = NOTIFY_STOP;
+		preempt_enable();
+		break;
+	default:
+		break;
+	}
+	return ret;
+}
+
+int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct jprobe *jp = container_of(p, struct jprobe, kp);
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	memcpy(&kcb->jprobe_saved_regs, regs, sizeof(struct pt_regs));
+
+	/* setup return addr to the jprobe handler routine */
+	regs->nip = (unsigned long)(((func_descr_t *)jp->entry)->entry);
+	regs->gpr[2] = (unsigned long)(((func_descr_t *)jp->entry)->toc);
+
+	return 1;
+}
+
+void __kprobes jprobe_return(void)
+{
+	asm volatile("trap" ::: "memory");
+}
+
+void __kprobes jprobe_return_end(void)
+{
+};
+
+int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	/*
+	 * FIXME - we should ideally be validating that we got here 'cos
+	 * of the "trap" in jprobe_return() above, before restoring the
+	 * saved regs...
+	 */
+	memcpy(regs, &kcb->jprobe_saved_regs, sizeof(struct pt_regs));
+	preempt_enable_no_resched();
+	return 1;
+}
+
+static struct kprobe trampoline_p = {
+	.addr = (kprobe_opcode_t *) &kretprobe_trampoline,
+	.pre_handler = trampoline_probe_handler
+};
+
+int __init arch_init_kprobes(void)
+{
+	return register_kprobe(&trampoline_p);
+}
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
new file mode 100644
index 000000000000..97c51e452be7
--- /dev/null
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -0,0 +1,358 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ *
+ * Copyright (C) 2004-2005, IBM Corp.
+ *
+ * Created by: Milton D Miller II
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+
+#include <linux/cpumask.h>
+#include <linux/kexec.h>
+#include <linux/smp.h>
+#include <linux/thread_info.h>
+#include <linux/errno.h>
+
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/machdep.h>
+#include <asm/cacheflush.h>
+#include <asm/paca.h>
+#include <asm/mmu.h>
+#include <asm/sections.h>	/* _end */
+#include <asm/prom.h>
+#include <asm/smp.h>
+
+#define HASH_GROUP_SIZE 0x80	/* size of each hash group, asm/mmu.h */
+
+/* Have this around till we move it into crash specific file */
+note_buf_t crash_notes[NR_CPUS];
+
+/* Dummy for now. Not sure if we need to have a crash shutdown in here
+ * and if what it will achieve. Letting it be now to compile the code
+ * in generic kexec environment
+ */
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	/* do nothing right now */
+	/* smp_relase_cpus() if we want smp on panic kernel */
+	/* cpu_irq_down to isolate us until we are ready */
+}
+
+int machine_kexec_prepare(struct kimage *image)
+{
+	int i;
+	unsigned long begin, end;	/* limits of segment */
+	unsigned long low, high;	/* limits of blocked memory range */
+	struct device_node *node;
+	unsigned long *basep;
+	unsigned int *sizep;
+
+	if (!ppc_md.hpte_clear_all)
+		return -ENOENT;
+
+	/*
+	 * Since we use the kernel fault handlers and paging code to
+	 * handle the virtual mode, we must make sure no destination
+	 * overlaps kernel static data or bss.
+	 */
+	for (i = 0; i < image->nr_segments; i++)
+		if (image->segment[i].mem < __pa(_end))
+			return -ETXTBSY;
+
+	/*
+	 * For non-LPAR, we absolutely can not overwrite the mmu hash
+	 * table, since we are still using the bolted entries in it to
+	 * do the copy.  Check that here.
+	 *
+	 * It is safe if the end is below the start of the blocked
+	 * region (end <= low), or if the beginning is after the
+	 * end of the blocked region (begin >= high).  Use the
+	 * boolean identity !(a || b)  === (!a && !b).
+	 */
+	if (htab_address) {
+		low = __pa(htab_address);
+		high = low + (htab_hash_mask + 1) * HASH_GROUP_SIZE;
+
+		for (i = 0; i < image->nr_segments; i++) {
+			begin = image->segment[i].mem;
+			end = begin + image->segment[i].memsz;
+
+			if ((begin < high) && (end > low))
+				return -ETXTBSY;
+		}
+	}
+
+	/* We also should not overwrite the tce tables */
+	for (node = of_find_node_by_type(NULL, "pci"); node != NULL;
+			node = of_find_node_by_type(node, "pci")) {
+		basep = (unsigned long *)get_property(node, "linux,tce-base",
+							NULL);
+		sizep = (unsigned int *)get_property(node, "linux,tce-size",
+							NULL);
+		if (basep == NULL || sizep == NULL)
+			continue;
+
+		low = *basep;
+		high = low + (*sizep);
+
+		for (i = 0; i < image->nr_segments; i++) {
+			begin = image->segment[i].mem;
+			end = begin + image->segment[i].memsz;
+
+			if ((begin < high) && (end > low))
+				return -ETXTBSY;
+		}
+	}
+
+	return 0;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+	/* we do nothing in prepare that needs to be undone */
+}
+
+#define IND_FLAGS (IND_DESTINATION | IND_INDIRECTION | IND_DONE | IND_SOURCE)
+
+static void copy_segments(unsigned long ind)
+{
+	unsigned long entry;
+	unsigned long *ptr;
+	void *dest;
+	void *addr;
+
+	/*
+	 * We rely on kexec_load to create a lists that properly
+	 * initializes these pointers before they are used.
+	 * We will still crash if the list is wrong, but at least
+	 * the compiler will be quiet.
+	 */
+	ptr = NULL;
+	dest = NULL;
+
+	for (entry = ind; !(entry & IND_DONE); entry = *ptr++) {
+		addr = __va(entry & PAGE_MASK);
+
+		switch (entry & IND_FLAGS) {
+		case IND_DESTINATION:
+			dest = addr;
+			break;
+		case IND_INDIRECTION:
+			ptr = addr;
+			break;
+		case IND_SOURCE:
+			copy_page(dest, addr);
+			dest += PAGE_SIZE;
+		}
+	}
+}
+
+void kexec_copy_flush(struct kimage *image)
+{
+	long i, nr_segments = image->nr_segments;
+	struct  kexec_segment ranges[KEXEC_SEGMENT_MAX];
+
+	/* save the ranges on the stack to efficiently flush the icache */
+	memcpy(ranges, image->segment, sizeof(ranges));
+
+	/*
+	 * After this call we may not use anything allocated in dynamic
+	 * memory, including *image.
+	 *
+	 * Only globals and the stack are allowed.
+	 */
+	copy_segments(image->head);
+
+	/*
+	 * we need to clear the icache for all dest pages sometime,
+	 * including ones that were in place on the original copy
+	 */
+	for (i = 0; i < nr_segments; i++)
+		flush_icache_range(ranges[i].mem + KERNELBASE,
+				ranges[i].mem + KERNELBASE +
+				ranges[i].memsz);
+}
+
+#ifdef CONFIG_SMP
+
+/* FIXME: we should schedule this function to be called on all cpus based
+ * on calling the interrupts, but we would like to call it off irq level
+ * so that the interrupt controller is clean.
+ */
+void kexec_smp_down(void *arg)
+{
+	if (ppc_md.kexec_cpu_down)
+		ppc_md.kexec_cpu_down(0, 1);
+
+	local_irq_disable();
+	kexec_smp_wait();
+	/* NOTREACHED */
+}
+
+static void kexec_prepare_cpus(void)
+{
+	int my_cpu, i, notified=-1;
+
+	smp_call_function(kexec_smp_down, NULL, 0, /* wait */0);
+	my_cpu = get_cpu();
+
+	/* check the others cpus are now down (via paca hw cpu id == -1) */
+	for (i=0; i < NR_CPUS; i++) {
+		if (i == my_cpu)
+			continue;
+
+		while (paca[i].hw_cpu_id != -1) {
+			barrier();
+			if (!cpu_possible(i)) {
+				printk("kexec: cpu %d hw_cpu_id %d is not"
+						" possible, ignoring\n",
+						i, paca[i].hw_cpu_id);
+				break;
+			}
+			if (!cpu_online(i)) {
+				/* Fixme: this can be spinning in
+				 * pSeries_secondary_wait with a paca
+				 * waiting for it to go online.
+				 */
+				printk("kexec: cpu %d hw_cpu_id %d is not"
+						" online, ignoring\n",
+						i, paca[i].hw_cpu_id);
+				break;
+			}
+			if (i != notified) {
+				printk( "kexec: waiting for cpu %d (physical"
+						" %d) to go down\n",
+						i, paca[i].hw_cpu_id);
+				notified = i;
+			}
+		}
+	}
+
+	/* after we tell the others to go down */
+	if (ppc_md.kexec_cpu_down)
+		ppc_md.kexec_cpu_down(0, 0);
+
+	put_cpu();
+
+	local_irq_disable();
+}
+
+#else /* ! SMP */
+
+static void kexec_prepare_cpus(void)
+{
+	/*
+	 * move the secondarys to us so that we can copy
+	 * the new kernel 0-0x100 safely
+	 *
+	 * do this if kexec in setup.c ?
+	 *
+	 * We need to release the cpus if we are ever going from an
+	 * UP to an SMP kernel.
+	 */
+	smp_release_cpus();
+	if (ppc_md.kexec_cpu_down)
+		ppc_md.kexec_cpu_down(0, 0);
+	local_irq_disable();
+}
+
+#endif /* SMP */
+
+/*
+ * kexec thread structure and stack.
+ *
+ * We need to make sure that this is 16384-byte aligned due to the
+ * way process stacks are handled.  It also must be statically allocated
+ * or allocated as part of the kimage, because everything else may be
+ * overwritten when we copy the kexec image.  We piggyback on the
+ * "init_task" linker section here to statically allocate a stack.
+ *
+ * We could use a smaller stack if we don't care about anything using
+ * current, but that audit has not been performed.
+ */
+union thread_union kexec_stack
+	__attribute__((__section__(".data.init_task"))) = { };
+
+/* Our assembly helper, in kexec_stub.S */
+extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start,
+					void *image, void *control,
+					void (*clear_all)(void)) ATTRIB_NORET;
+
+/* too late to fail here */
+void machine_kexec(struct kimage *image)
+{
+
+	/* prepare control code if any */
+
+	/* shutdown other cpus into our wait loop and quiesce interrupts */
+	kexec_prepare_cpus();
+
+	/* switch to a staticly allocated stack.  Based on irq stack code.
+	 * XXX: the task struct will likely be invalid once we do the copy!
+	 */
+	kexec_stack.thread_info.task = current_thread_info()->task;
+	kexec_stack.thread_info.flags = 0;
+
+	/* Some things are best done in assembly.  Finding globals with
+	 * a toc is easier in C, so pass in what we can.
+	 */
+	kexec_sequence(&kexec_stack, image->start, image,
+			page_address(image->control_code_page),
+			ppc_md.hpte_clear_all);
+	/* NOTREACHED */
+}
+
+/* Values we need to export to the second kernel via the device tree. */
+static unsigned long htab_base, htab_size, kernel_end;
+
+static struct property htab_base_prop = {
+	.name = "linux,htab-base",
+	.length = sizeof(unsigned long),
+	.value = (unsigned char *)&htab_base,
+};
+
+static struct property htab_size_prop = {
+	.name = "linux,htab-size",
+	.length = sizeof(unsigned long),
+	.value = (unsigned char *)&htab_size,
+};
+
+static struct property kernel_end_prop = {
+	.name = "linux,kernel-end",
+	.length = sizeof(unsigned long),
+	.value = (unsigned char *)&kernel_end,
+};
+
+static void __init export_htab_values(void)
+{
+	struct device_node *node;
+
+	node = of_find_node_by_path("/chosen");
+	if (!node)
+		return;
+
+	kernel_end = __pa(_end);
+	prom_add_property(node, &kernel_end_prop);
+
+	/* On machines with no htab htab_address is NULL */
+	if (NULL == htab_address)
+		goto out;
+
+	htab_base = __pa(htab_address);
+	prom_add_property(node, &htab_base_prop);
+
+	htab_size = 1UL << ppc64_pft_size;
+	prom_add_property(node, &htab_size_prop);
+
+ out:
+	of_node_put(node);
+}
+
+void __init kexec_setup(void)
+{
+	export_htab_values();
+}
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
new file mode 100644
index 000000000000..928b8581fcb0
--- /dev/null
+++ b/arch/powerpc/kernel/module_64.c
@@ -0,0 +1,455 @@
+/*  Kernel module help for PPC64.
+    Copyright (C) 2001, 2003 Rusty Russell IBM Corporation.
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+#include <linux/module.h>
+#include <linux/elf.h>
+#include <linux/moduleloader.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <asm/module.h>
+#include <asm/uaccess.h>
+
+/* FIXME: We don't do .init separately.  To do this, we'd need to have
+   a separate r2 value in the init and core section, and stub between
+   them, too.
+
+   Using a magic allocator which places modules within 32MB solves
+   this, and makes other things simpler.  Anton?
+   --RR.  */
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(fmt , ...)
+#endif
+
+/* There's actually a third entry here, but it's unused */
+struct ppc64_opd_entry
+{
+	unsigned long funcaddr;
+	unsigned long r2;
+};
+
+/* Like PPC32, we need little trampolines to do > 24-bit jumps (into
+   the kernel itself).  But on PPC64, these need to be used for every
+   jump, actually, to reset r2 (TOC+0x8000). */
+struct ppc64_stub_entry
+{
+	/* 28 byte jump instruction sequence (7 instructions) */
+	unsigned char jump[28];
+	unsigned char unused[4];
+	/* Data for the above code */
+	struct ppc64_opd_entry opd;
+};
+
+/* We use a stub to fix up r2 (TOC ptr) and to jump to the (external)
+   function which may be more than 24-bits away.  We could simply
+   patch the new r2 value and function pointer into the stub, but it's
+   significantly shorter to put these values at the end of the stub
+   code, and patch the stub address (32-bits relative to the TOC ptr,
+   r2) into the stub. */
+static struct ppc64_stub_entry ppc64_stub =
+{ .jump = {
+	0x3d, 0x82, 0x00, 0x00, /* addis   r12,r2, <high> */
+	0x39, 0x8c, 0x00, 0x00, /* addi    r12,r12, <low> */
+	/* Save current r2 value in magic place on the stack. */
+	0xf8, 0x41, 0x00, 0x28, /* std     r2,40(r1) */
+	0xe9, 0x6c, 0x00, 0x20, /* ld      r11,32(r12) */
+	0xe8, 0x4c, 0x00, 0x28, /* ld      r2,40(r12) */
+	0x7d, 0x69, 0x03, 0xa6, /* mtctr   r11 */
+	0x4e, 0x80, 0x04, 0x20  /* bctr */
+} };
+
+/* Count how many different 24-bit relocations (different symbol,
+   different addend) */
+static unsigned int count_relocs(const Elf64_Rela *rela, unsigned int num)
+{
+	unsigned int i, j, ret = 0;
+
+	/* FIXME: Only count external ones --RR */
+	/* Sure, this is order(n^2), but it's usually short, and not
+           time critical */
+	for (i = 0; i < num; i++) {
+		/* Only count 24-bit relocs, others don't need stubs */
+		if (ELF64_R_TYPE(rela[i].r_info) != R_PPC_REL24)
+			continue;
+		for (j = 0; j < i; j++) {
+			/* If this addend appeared before, it's
+                           already been counted */
+			if (rela[i].r_info == rela[j].r_info
+			    && rela[i].r_addend == rela[j].r_addend)
+				break;
+		}
+		if (j == i) ret++;
+	}
+	return ret;
+}
+
+void *module_alloc(unsigned long size)
+{
+	if (size == 0)
+		return NULL;
+
+	return vmalloc_exec(size);
+}
+
+/* Free memory returned from module_alloc */
+void module_free(struct module *mod, void *module_region)
+{
+	vfree(module_region);
+	/* FIXME: If module_region == mod->init_region, trim exception
+           table entries. */
+}
+
+/* Get size of potential trampolines required. */
+static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
+				    const Elf64_Shdr *sechdrs)
+{
+	/* One extra reloc so it's always 0-funcaddr terminated */
+	unsigned long relocs = 1;
+	unsigned i;
+
+	/* Every relocated section... */
+	for (i = 1; i < hdr->e_shnum; i++) {
+		if (sechdrs[i].sh_type == SHT_RELA) {
+			DEBUGP("Found relocations in section %u\n", i);
+			DEBUGP("Ptr: %p.  Number: %lu\n",
+			       (void *)sechdrs[i].sh_addr,
+			       sechdrs[i].sh_size / sizeof(Elf64_Rela));
+			relocs += count_relocs((void *)sechdrs[i].sh_addr,
+					       sechdrs[i].sh_size
+					       / sizeof(Elf64_Rela));
+		}
+	}
+
+	DEBUGP("Looks like a total of %lu stubs, max\n", relocs);
+	return relocs * sizeof(struct ppc64_stub_entry);
+}
+
+static void dedotify_versions(struct modversion_info *vers,
+			      unsigned long size)
+{
+	struct modversion_info *end;
+
+	for (end = (void *)vers + size; vers < end; vers++)
+		if (vers->name[0] == '.')
+			memmove(vers->name, vers->name+1, strlen(vers->name));
+}
+
+/* Undefined symbols which refer to .funcname, hack to funcname */
+static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab)
+{
+	unsigned int i;
+
+	for (i = 1; i < numsyms; i++) {
+		if (syms[i].st_shndx == SHN_UNDEF) {
+			char *name = strtab + syms[i].st_name;
+			if (name[0] == '.')
+				memmove(name, name+1, strlen(name));
+		}
+	}
+}
+
+int module_frob_arch_sections(Elf64_Ehdr *hdr,
+			      Elf64_Shdr *sechdrs,
+			      char *secstrings,
+			      struct module *me)
+{
+	unsigned int i;
+
+	/* Find .toc and .stubs sections, symtab and strtab */
+	for (i = 1; i < hdr->e_shnum; i++) {
+		char *p;
+		if (strcmp(secstrings + sechdrs[i].sh_name, ".stubs") == 0)
+			me->arch.stubs_section = i;
+		else if (strcmp(secstrings + sechdrs[i].sh_name, ".toc") == 0)
+			me->arch.toc_section = i;
+		else if (strcmp(secstrings+sechdrs[i].sh_name,"__versions")==0)
+			dedotify_versions((void *)hdr + sechdrs[i].sh_offset,
+					  sechdrs[i].sh_size);
+
+		/* We don't handle .init for the moment: rename to _init */
+		while ((p = strstr(secstrings + sechdrs[i].sh_name, ".init")))
+			p[0] = '_';
+
+		if (sechdrs[i].sh_type == SHT_SYMTAB)
+			dedotify((void *)hdr + sechdrs[i].sh_offset,
+				 sechdrs[i].sh_size / sizeof(Elf64_Sym),
+				 (void *)hdr
+				 + sechdrs[sechdrs[i].sh_link].sh_offset);
+	}
+	if (!me->arch.stubs_section || !me->arch.toc_section) {
+		printk("%s: doesn't contain .toc or .stubs.\n", me->name);
+		return -ENOEXEC;
+	}
+
+	/* Override the stubs size */
+	sechdrs[me->arch.stubs_section].sh_size = get_stubs_size(hdr, sechdrs);
+	return 0;
+}
+
+int apply_relocate(Elf64_Shdr *sechdrs,
+		   const char *strtab,
+		   unsigned int symindex,
+		   unsigned int relsec,
+		   struct module *me)
+{
+	printk(KERN_ERR "%s: Non-ADD RELOCATION unsupported\n", me->name);
+	return -ENOEXEC;
+}
+
+/* r2 is the TOC pointer: it actually points 0x8000 into the TOC (this
+   gives the value maximum span in an instruction which uses a signed
+   offset) */
+static inline unsigned long my_r2(Elf64_Shdr *sechdrs, struct module *me)
+{
+	return sechdrs[me->arch.toc_section].sh_addr + 0x8000;
+}
+
+/* Both low and high 16 bits are added as SIGNED additions, so if low
+   16 bits has high bit set, high 16 bits must be adjusted.  These
+   macros do that (stolen from binutils). */
+#define PPC_LO(v) ((v) & 0xffff)
+#define PPC_HI(v) (((v) >> 16) & 0xffff)
+#define PPC_HA(v) PPC_HI ((v) + 0x8000)
+
+/* Patch stub to reference function and correct r2 value. */
+static inline int create_stub(Elf64_Shdr *sechdrs,
+			      struct ppc64_stub_entry *entry,
+			      struct ppc64_opd_entry *opd,
+			      struct module *me)
+{
+	Elf64_Half *loc1, *loc2;
+	long reladdr;
+
+	*entry = ppc64_stub;
+
+	loc1 = (Elf64_Half *)&entry->jump[2];
+	loc2 = (Elf64_Half *)&entry->jump[6];
+
+	/* Stub uses address relative to r2. */
+	reladdr = (unsigned long)entry - my_r2(sechdrs, me);
+	if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) {
+		printk("%s: Address %p of stub out of range of %p.\n",
+		       me->name, (void *)reladdr, (void *)my_r2);
+		return 0;
+	}
+	DEBUGP("Stub %p get data from reladdr %li\n", entry, reladdr);
+
+	*loc1 = PPC_HA(reladdr);
+	*loc2 = PPC_LO(reladdr);
+	entry->opd.funcaddr = opd->funcaddr;
+	entry->opd.r2 = opd->r2;
+	return 1;
+}
+
+/* Create stub to jump to function described in this OPD: we need the
+   stub to set up the TOC ptr (r2) for the function. */
+static unsigned long stub_for_addr(Elf64_Shdr *sechdrs,
+				   unsigned long opdaddr,
+				   struct module *me)
+{
+	struct ppc64_stub_entry *stubs;
+	struct ppc64_opd_entry *opd = (void *)opdaddr;
+	unsigned int i, num_stubs;
+
+	num_stubs = sechdrs[me->arch.stubs_section].sh_size / sizeof(*stubs);
+
+	/* Find this stub, or if that fails, the next avail. entry */
+	stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr;
+	for (i = 0; stubs[i].opd.funcaddr; i++) {
+		BUG_ON(i >= num_stubs);
+
+		if (stubs[i].opd.funcaddr == opd->funcaddr)
+			return (unsigned long)&stubs[i];
+	}
+
+	if (!create_stub(sechdrs, &stubs[i], opd, me))
+		return 0;
+
+	return (unsigned long)&stubs[i];
+}
+
+/* We expect a noop next: if it is, replace it with instruction to
+   restore r2. */
+static int restore_r2(u32 *instruction, struct module *me)
+{
+	if (*instruction != 0x60000000) {
+		printk("%s: Expect noop after relocate, got %08x\n",
+		       me->name, *instruction);
+		return 0;
+	}
+	*instruction = 0xe8410028;	/* ld r2,40(r1) */
+	return 1;
+}
+
+int apply_relocate_add(Elf64_Shdr *sechdrs,
+		       const char *strtab,
+		       unsigned int symindex,
+		       unsigned int relsec,
+		       struct module *me)
+{
+	unsigned int i;
+	Elf64_Rela *rela = (void *)sechdrs[relsec].sh_addr;
+	Elf64_Sym *sym;
+	unsigned long *location;
+	unsigned long value;
+
+	DEBUGP("Applying ADD relocate section %u to %u\n", relsec,
+	       sechdrs[relsec].sh_info);
+	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rela); i++) {
+		/* This is where to make the change */
+		location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+			+ rela[i].r_offset;
+		/* This is the symbol it is referring to */
+		sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
+			+ ELF64_R_SYM(rela[i].r_info);
+
+		DEBUGP("RELOC at %p: %li-type as %s (%lu) + %li\n",
+		       location, (long)ELF64_R_TYPE(rela[i].r_info),
+		       strtab + sym->st_name, (unsigned long)sym->st_value,
+		       (long)rela[i].r_addend);
+
+		/* `Everything is relative'. */
+		value = sym->st_value + rela[i].r_addend;
+
+		switch (ELF64_R_TYPE(rela[i].r_info)) {
+		case R_PPC64_ADDR32:
+			/* Simply set it */
+			*(u32 *)location = value;
+			break;
+			
+		case R_PPC64_ADDR64:
+			/* Simply set it */
+			*(unsigned long *)location = value;
+			break;
+
+		case R_PPC64_TOC:
+			*(unsigned long *)location = my_r2(sechdrs, me);
+			break;
+
+		case R_PPC64_TOC16:
+			/* Subtact TOC pointer */
+			value -= my_r2(sechdrs, me);
+			if (value + 0x8000 > 0xffff) {
+				printk("%s: bad TOC16 relocation (%lu)\n",
+				       me->name, value);
+				return -ENOEXEC;
+			}
+			*((uint16_t *) location)
+				= (*((uint16_t *) location) & ~0xffff)
+				| (value & 0xffff);
+			break;
+
+		case R_PPC64_TOC16_DS:
+			/* Subtact TOC pointer */
+			value -= my_r2(sechdrs, me);
+			if ((value & 3) != 0 || value + 0x8000 > 0xffff) {
+				printk("%s: bad TOC16_DS relocation (%lu)\n",
+				       me->name, value);
+				return -ENOEXEC;
+			}
+			*((uint16_t *) location)
+				= (*((uint16_t *) location) & ~0xfffc)
+				| (value & 0xfffc);
+			break;
+
+		case R_PPC_REL24:
+			/* FIXME: Handle weak symbols here --RR */
+			if (sym->st_shndx == SHN_UNDEF) {
+				/* External: go via stub */
+				value = stub_for_addr(sechdrs, value, me);
+				if (!value)
+					return -ENOENT;
+				if (!restore_r2((u32 *)location + 1, me))
+					return -ENOEXEC;
+			}
+
+			/* Convert value to relative */
+			value -= (unsigned long)location;
+			if (value + 0x2000000 > 0x3ffffff || (value & 3) != 0){
+				printk("%s: REL24 %li out of range!\n",
+				       me->name, (long int)value);
+				return -ENOEXEC;
+			}
+
+			/* Only replace bits 2 through 26 */
+			*(uint32_t *)location 
+				= (*(uint32_t *)location & ~0x03fffffc)
+				| (value & 0x03fffffc);
+			break;
+
+		default:
+			printk("%s: Unknown ADD relocation: %lu\n",
+			       me->name,
+			       (unsigned long)ELF64_R_TYPE(rela[i].r_info));
+			return -ENOEXEC;
+		}
+	}
+
+	return 0;
+}
+
+LIST_HEAD(module_bug_list);
+
+int module_finalize(const Elf_Ehdr *hdr,
+		const Elf_Shdr *sechdrs, struct module *me)
+{
+	char *secstrings;
+	unsigned int i;
+
+	me->arch.bug_table = NULL;
+	me->arch.num_bugs = 0;
+
+	/* Find the __bug_table section, if present */
+	secstrings = (char *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+	for (i = 1; i < hdr->e_shnum; i++) {
+		if (strcmp(secstrings+sechdrs[i].sh_name, "__bug_table"))
+			continue;
+		me->arch.bug_table = (void *) sechdrs[i].sh_addr;
+		me->arch.num_bugs = sechdrs[i].sh_size / sizeof(struct bug_entry);
+		break;
+	}
+
+	/*
+	 * Strictly speaking this should have a spinlock to protect against
+	 * traversals, but since we only traverse on BUG()s, a spinlock
+	 * could potentially lead to deadlock and thus be counter-productive.
+	 */
+	list_add(&me->arch.bug_list, &module_bug_list);
+
+	return 0;
+}
+
+void module_arch_cleanup(struct module *mod)
+{
+	list_del(&mod->arch.bug_list);
+}
+
+struct bug_entry *module_find_bug(unsigned long bugaddr)
+{
+	struct mod_arch_specific *mod;
+	unsigned int i;
+	struct bug_entry *bug;
+
+	list_for_each_entry(mod, &module_bug_list, bug_list) {
+		bug = mod->bug_table;
+		for (i = 0; i < mod->num_bugs; ++i, ++bug)
+			if (bugaddr == bug->bug_addr)
+				return bug;
+	}
+	return NULL;
+}
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
new file mode 100644
index 000000000000..3cef1b8f57f0
--- /dev/null
+++ b/arch/powerpc/kernel/pci_64.c
@@ -0,0 +1,1319 @@
+/*
+ * Port for PPC64 David Engebretsen, IBM Corp.
+ * Contains common pci routines for ppc64 platform, pSeries and iSeries brands.
+ * 
+ * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM
+ *   Rework, based on alpha PCI code.
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
+#include <linux/list.h>
+#include <linux/syscalls.h>
+
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/byteorder.h>
+#include <asm/irq.h>
+#include <asm/machdep.h>
+#include <asm/udbg.h>
+#include <asm/ppc-pci.h>
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+unsigned long pci_probe_only = 1;
+unsigned long pci_assign_all_buses = 0;
+
+/*
+ * legal IO pages under MAX_ISA_PORT.  This is to ensure we don't touch
+ * devices we don't have access to.
+ */
+unsigned long io_page_mask;
+
+EXPORT_SYMBOL(io_page_mask);
+
+#ifdef CONFIG_PPC_MULTIPLATFORM
+static void fixup_resource(struct resource *res, struct pci_dev *dev);
+static void do_bus_setup(struct pci_bus *bus);
+#endif
+
+unsigned int pcibios_assign_all_busses(void)
+{
+	return pci_assign_all_buses;
+}
+
+/* pci_io_base -- the base address from which io bars are offsets.
+ * This is the lowest I/O base address (so bar values are always positive),
+ * and it *must* be the start of ISA space if an ISA bus exists because
+ * ISA drivers use hard coded offsets.  If no ISA bus exists a dummy
+ * page is mapped and isa_io_limit prevents access to it.
+ */
+unsigned long isa_io_base;	/* NULL if no ISA bus */
+EXPORT_SYMBOL(isa_io_base);
+unsigned long pci_io_base;
+EXPORT_SYMBOL(pci_io_base);
+
+void iSeries_pcibios_init(void);
+
+LIST_HEAD(hose_list);
+
+struct dma_mapping_ops pci_dma_ops;
+EXPORT_SYMBOL(pci_dma_ops);
+
+int global_phb_number;		/* Global phb counter */
+
+/* Cached ISA bridge dev. */
+struct pci_dev *ppc64_isabridge_dev = NULL;
+
+static void fixup_broken_pcnet32(struct pci_dev* dev)
+{
+	if ((dev->class>>8 == PCI_CLASS_NETWORK_ETHERNET)) {
+		dev->vendor = PCI_VENDOR_ID_AMD;
+		pci_write_config_word(dev, PCI_VENDOR_ID, PCI_VENDOR_ID_AMD);
+	}
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_TRIDENT, PCI_ANY_ID, fixup_broken_pcnet32);
+
+void  pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
+			      struct resource *res)
+{
+	unsigned long offset = 0;
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+
+	if (!hose)
+		return;
+
+	if (res->flags & IORESOURCE_IO)
+	        offset = (unsigned long)hose->io_base_virt - pci_io_base;
+
+	if (res->flags & IORESOURCE_MEM)
+		offset = hose->pci_mem_offset;
+
+	region->start = res->start - offset;
+	region->end = res->end - offset;
+}
+
+void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
+			      struct pci_bus_region *region)
+{
+	unsigned long offset = 0;
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+
+	if (!hose)
+		return;
+
+	if (res->flags & IORESOURCE_IO)
+	        offset = (unsigned long)hose->io_base_virt - pci_io_base;
+
+	if (res->flags & IORESOURCE_MEM)
+		offset = hose->pci_mem_offset;
+
+	res->start = region->start + offset;
+	res->end = region->end + offset;
+}
+
+#ifdef CONFIG_HOTPLUG
+EXPORT_SYMBOL(pcibios_resource_to_bus);
+EXPORT_SYMBOL(pcibios_bus_to_resource);
+#endif
+
+/*
+ * We need to avoid collisions with `mirrored' VGA ports
+ * and other strange ISA hardware, so we always want the
+ * addresses to be allocated in the 0x000-0x0ff region
+ * modulo 0x400.
+ *
+ * Why? Because some silly external IO cards only decode
+ * the low 10 bits of the IO address. The 0x00-0xff region
+ * is reserved for motherboard devices that decode all 16
+ * bits, so it's ok to allocate at, say, 0x2800-0x28ff,
+ * but we want to try to avoid allocating at 0x2900-0x2bff
+ * which might have be mirrored at 0x0100-0x03ff..
+ */
+void pcibios_align_resource(void *data, struct resource *res,
+			    unsigned long size, unsigned long align)
+{
+	struct pci_dev *dev = data;
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	unsigned long start = res->start;
+	unsigned long alignto;
+
+	if (res->flags & IORESOURCE_IO) {
+	        unsigned long offset = (unsigned long)hose->io_base_virt -
+					pci_io_base;
+		/* Make sure we start at our min on all hoses */
+		if (start - offset < PCIBIOS_MIN_IO)
+			start = PCIBIOS_MIN_IO + offset;
+
+		/*
+		 * Put everything into 0x00-0xff region modulo 0x400
+		 */
+		if (start & 0x300)
+			start = (start + 0x3ff) & ~0x3ff;
+
+	} else if (res->flags & IORESOURCE_MEM) {
+		/* Make sure we start at our min on all hoses */
+		if (start - hose->pci_mem_offset < PCIBIOS_MIN_MEM)
+			start = PCIBIOS_MIN_MEM + hose->pci_mem_offset;
+
+		/* Align to multiple of size of minimum base.  */
+		alignto = max(0x1000UL, align);
+		start = ALIGN(start, alignto);
+	}
+
+	res->start = start;
+}
+
+static DEFINE_SPINLOCK(hose_spinlock);
+
+/*
+ * pci_controller(phb) initialized common variables.
+ */
+void __devinit pci_setup_pci_controller(struct pci_controller *hose)
+{
+	memset(hose, 0, sizeof(struct pci_controller));
+
+	spin_lock(&hose_spinlock);
+	hose->global_number = global_phb_number++;
+	list_add_tail(&hose->list_node, &hose_list);
+	spin_unlock(&hose_spinlock);
+}
+
+static void __init pcibios_claim_one_bus(struct pci_bus *b)
+{
+	struct pci_dev *dev;
+	struct pci_bus *child_bus;
+
+	list_for_each_entry(dev, &b->devices, bus_list) {
+		int i;
+
+		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+			struct resource *r = &dev->resource[i];
+
+			if (r->parent || !r->start || !r->flags)
+				continue;
+			pci_claim_resource(dev, i);
+		}
+	}
+
+	list_for_each_entry(child_bus, &b->children, node)
+		pcibios_claim_one_bus(child_bus);
+}
+
+#ifndef CONFIG_PPC_ISERIES
+static void __init pcibios_claim_of_setup(void)
+{
+	struct pci_bus *b;
+
+	list_for_each_entry(b, &pci_root_buses, node)
+		pcibios_claim_one_bus(b);
+}
+#endif
+
+#ifdef CONFIG_PPC_MULTIPLATFORM
+static u32 get_int_prop(struct device_node *np, const char *name, u32 def)
+{
+	u32 *prop;
+	int len;
+
+	prop = (u32 *) get_property(np, name, &len);
+	if (prop && len >= 4)
+		return *prop;
+	return def;
+}
+
+static unsigned int pci_parse_of_flags(u32 addr0)
+{
+	unsigned int flags = 0;
+
+	if (addr0 & 0x02000000) {
+		flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY;
+		flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64;
+		flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M;
+		if (addr0 & 0x40000000)
+			flags |= IORESOURCE_PREFETCH
+				 | PCI_BASE_ADDRESS_MEM_PREFETCH;
+	} else if (addr0 & 0x01000000)
+		flags = IORESOURCE_IO | PCI_BASE_ADDRESS_SPACE_IO;
+	return flags;
+}
+
+#define GET_64BIT(prop, i)	((((u64) (prop)[(i)]) << 32) | (prop)[(i)+1])
+
+static void pci_parse_of_addrs(struct device_node *node, struct pci_dev *dev)
+{
+	u64 base, size;
+	unsigned int flags;
+	struct resource *res;
+	u32 *addrs, i;
+	int proplen;
+
+	addrs = (u32 *) get_property(node, "assigned-addresses", &proplen);
+	if (!addrs)
+		return;
+	for (; proplen >= 20; proplen -= 20, addrs += 5) {
+		flags = pci_parse_of_flags(addrs[0]);
+		if (!flags)
+			continue;
+		base = GET_64BIT(addrs, 1);
+		size = GET_64BIT(addrs, 3);
+		if (!size)
+			continue;
+		i = addrs[0] & 0xff;
+		if (PCI_BASE_ADDRESS_0 <= i && i <= PCI_BASE_ADDRESS_5) {
+			res = &dev->resource[(i - PCI_BASE_ADDRESS_0) >> 2];
+		} else if (i == dev->rom_base_reg) {
+			res = &dev->resource[PCI_ROM_RESOURCE];
+			flags |= IORESOURCE_READONLY | IORESOURCE_CACHEABLE;
+		} else {
+			printk(KERN_ERR "PCI: bad cfg reg num 0x%x\n", i);
+			continue;
+		}
+		res->start = base;
+		res->end = base + size - 1;
+		res->flags = flags;
+		res->name = pci_name(dev);
+		fixup_resource(res, dev);
+	}
+}
+
+struct pci_dev *of_create_pci_dev(struct device_node *node,
+				 struct pci_bus *bus, int devfn)
+{
+	struct pci_dev *dev;
+	const char *type;
+
+	dev = kmalloc(sizeof(struct pci_dev), GFP_KERNEL);
+	if (!dev)
+		return NULL;
+	type = get_property(node, "device_type", NULL);
+	if (type == NULL)
+		type = "";
+
+	memset(dev, 0, sizeof(struct pci_dev));
+	dev->bus = bus;
+	dev->sysdata = node;
+	dev->dev.parent = bus->bridge;
+	dev->dev.bus = &pci_bus_type;
+	dev->devfn = devfn;
+	dev->multifunction = 0;		/* maybe a lie? */
+
+	dev->vendor = get_int_prop(node, "vendor-id", 0xffff);
+	dev->device = get_int_prop(node, "device-id", 0xffff);
+	dev->subsystem_vendor = get_int_prop(node, "subsystem-vendor-id", 0);
+	dev->subsystem_device = get_int_prop(node, "subsystem-id", 0);
+
+	dev->cfg_size = 256; /*pci_cfg_space_size(dev);*/
+
+	sprintf(pci_name(dev), "%04x:%02x:%02x.%d", pci_domain_nr(bus),
+		dev->bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn));
+	dev->class = get_int_prop(node, "class-code", 0);
+
+	dev->current_state = 4;		/* unknown power state */
+
+	if (!strcmp(type, "pci")) {
+		/* a PCI-PCI bridge */
+		dev->hdr_type = PCI_HEADER_TYPE_BRIDGE;
+		dev->rom_base_reg = PCI_ROM_ADDRESS1;
+	} else if (!strcmp(type, "cardbus")) {
+		dev->hdr_type = PCI_HEADER_TYPE_CARDBUS;
+	} else {
+		dev->hdr_type = PCI_HEADER_TYPE_NORMAL;
+		dev->rom_base_reg = PCI_ROM_ADDRESS;
+		dev->irq = NO_IRQ;
+		if (node->n_intrs > 0) {
+			dev->irq = node->intrs[0].line;
+			pci_write_config_byte(dev, PCI_INTERRUPT_LINE,
+					      dev->irq);
+		}
+	}
+
+	pci_parse_of_addrs(node, dev);
+
+	pci_device_add(dev, bus);
+
+	/* XXX pci_scan_msi_device(dev); */
+
+	return dev;
+}
+EXPORT_SYMBOL(of_create_pci_dev);
+
+void __devinit of_scan_bus(struct device_node *node,
+				  struct pci_bus *bus)
+{
+	struct device_node *child = NULL;
+	u32 *reg;
+	int reglen, devfn;
+	struct pci_dev *dev;
+
+	while ((child = of_get_next_child(node, child)) != NULL) {
+		reg = (u32 *) get_property(child, "reg", &reglen);
+		if (reg == NULL || reglen < 20)
+			continue;
+		devfn = (reg[0] >> 8) & 0xff;
+		/* create a new pci_dev for this device */
+		dev = of_create_pci_dev(child, bus, devfn);
+		if (!dev)
+			continue;
+		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
+		    dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
+			of_scan_pci_bridge(child, dev);
+	}
+
+	do_bus_setup(bus);
+}
+EXPORT_SYMBOL(of_scan_bus);
+
+void __devinit of_scan_pci_bridge(struct device_node *node,
+			 	struct pci_dev *dev)
+{
+	struct pci_bus *bus;
+	u32 *busrange, *ranges;
+	int len, i, mode;
+	struct resource *res;
+	unsigned int flags;
+	u64 size;
+
+	/* parse bus-range property */
+	busrange = (u32 *) get_property(node, "bus-range", &len);
+	if (busrange == NULL || len != 8) {
+		printk(KERN_ERR "Can't get bus-range for PCI-PCI bridge %s\n",
+		       node->full_name);
+		return;
+	}
+	ranges = (u32 *) get_property(node, "ranges", &len);
+	if (ranges == NULL) {
+		printk(KERN_ERR "Can't get ranges for PCI-PCI bridge %s\n",
+		       node->full_name);
+		return;
+	}
+
+	bus = pci_add_new_bus(dev->bus, dev, busrange[0]);
+	if (!bus) {
+		printk(KERN_ERR "Failed to create pci bus for %s\n",
+		       node->full_name);
+		return;
+	}
+
+	bus->primary = dev->bus->number;
+	bus->subordinate = busrange[1];
+	bus->bridge_ctl = 0;
+	bus->sysdata = node;
+
+	/* parse ranges property */
+	/* PCI #address-cells == 3 and #size-cells == 2 always */
+	res = &dev->resource[PCI_BRIDGE_RESOURCES];
+	for (i = 0; i < PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES; ++i) {
+		res->flags = 0;
+		bus->resource[i] = res;
+		++res;
+	}
+	i = 1;
+	for (; len >= 32; len -= 32, ranges += 8) {
+		flags = pci_parse_of_flags(ranges[0]);
+		size = GET_64BIT(ranges, 6);
+		if (flags == 0 || size == 0)
+			continue;
+		if (flags & IORESOURCE_IO) {
+			res = bus->resource[0];
+			if (res->flags) {
+				printk(KERN_ERR "PCI: ignoring extra I/O range"
+				       " for bridge %s\n", node->full_name);
+				continue;
+			}
+		} else {
+			if (i >= PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES) {
+				printk(KERN_ERR "PCI: too many memory ranges"
+				       " for bridge %s\n", node->full_name);
+				continue;
+			}
+			res = bus->resource[i];
+			++i;
+		}
+		res->start = GET_64BIT(ranges, 1);
+		res->end = res->start + size - 1;
+		res->flags = flags;
+		fixup_resource(res, dev);
+	}
+	sprintf(bus->name, "PCI Bus %04x:%02x", pci_domain_nr(bus),
+		bus->number);
+
+	mode = PCI_PROBE_NORMAL;
+	if (ppc_md.pci_probe_mode)
+		mode = ppc_md.pci_probe_mode(bus);
+	if (mode == PCI_PROBE_DEVTREE)
+		of_scan_bus(node, bus);
+	else if (mode == PCI_PROBE_NORMAL)
+		pci_scan_child_bus(bus);
+}
+EXPORT_SYMBOL(of_scan_pci_bridge);
+#endif /* CONFIG_PPC_MULTIPLATFORM */
+
+void __devinit scan_phb(struct pci_controller *hose)
+{
+	struct pci_bus *bus;
+	struct device_node *node = hose->arch_data;
+	int i, mode;
+	struct resource *res;
+
+	bus = pci_create_bus(NULL, hose->first_busno, hose->ops, node);
+	if (bus == NULL) {
+		printk(KERN_ERR "Failed to create bus for PCI domain %04x\n",
+		       hose->global_number);
+		return;
+	}
+	bus->secondary = hose->first_busno;
+	hose->bus = bus;
+
+	bus->resource[0] = res = &hose->io_resource;
+	if (res->flags && request_resource(&ioport_resource, res))
+		printk(KERN_ERR "Failed to request PCI IO region "
+		       "on PCI domain %04x\n", hose->global_number);
+
+	for (i = 0; i < 3; ++i) {
+		res = &hose->mem_resources[i];
+		bus->resource[i+1] = res;
+		if (res->flags && request_resource(&iomem_resource, res))
+			printk(KERN_ERR "Failed to request PCI memory region "
+			       "on PCI domain %04x\n", hose->global_number);
+	}
+
+	mode = PCI_PROBE_NORMAL;
+#ifdef CONFIG_PPC_MULTIPLATFORM
+	if (ppc_md.pci_probe_mode)
+		mode = ppc_md.pci_probe_mode(bus);
+	if (mode == PCI_PROBE_DEVTREE) {
+		bus->subordinate = hose->last_busno;
+		of_scan_bus(node, bus);
+	}
+#endif /* CONFIG_PPC_MULTIPLATFORM */
+	if (mode == PCI_PROBE_NORMAL)
+		hose->last_busno = bus->subordinate = pci_scan_child_bus(bus);
+	pci_bus_add_devices(bus);
+}
+
+static int __init pcibios_init(void)
+{
+	struct pci_controller *hose, *tmp;
+
+	/* For now, override phys_mem_access_prot. If we need it,
+	 * later, we may move that initialization to each ppc_md
+	 */
+	ppc_md.phys_mem_access_prot = pci_phys_mem_access_prot;
+
+#ifdef CONFIG_PPC_ISERIES
+	iSeries_pcibios_init(); 
+#endif
+
+	printk("PCI: Probing PCI hardware\n");
+
+	/* Scan all of the recorded PCI controllers.  */
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
+		scan_phb(hose);
+
+#ifndef CONFIG_PPC_ISERIES
+	if (pci_probe_only)
+		pcibios_claim_of_setup();
+	else
+		/* FIXME: `else' will be removed when
+		   pci_assign_unassigned_resources() is able to work
+		   correctly with [partially] allocated PCI tree. */
+		pci_assign_unassigned_resources();
+#endif /* !CONFIG_PPC_ISERIES */
+
+	/* Call machine dependent final fixup */
+	if (ppc_md.pcibios_fixup)
+		ppc_md.pcibios_fixup();
+
+	/* Cache the location of the ISA bridge (if we have one) */
+	ppc64_isabridge_dev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
+	if (ppc64_isabridge_dev != NULL)
+		printk("ISA bridge at %s\n", pci_name(ppc64_isabridge_dev));
+
+#ifdef CONFIG_PPC_MULTIPLATFORM
+	/* map in PCI I/O space */
+	phbs_remap_io();
+#endif
+
+	printk("PCI: Probing PCI hardware done\n");
+
+	return 0;
+}
+
+subsys_initcall(pcibios_init);
+
+char __init *pcibios_setup(char *str)
+{
+	return str;
+}
+
+int pcibios_enable_device(struct pci_dev *dev, int mask)
+{
+	u16 cmd, oldcmd;
+	int i;
+
+	pci_read_config_word(dev, PCI_COMMAND, &cmd);
+	oldcmd = cmd;
+
+	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+		struct resource *res = &dev->resource[i];
+
+		/* Only set up the requested stuff */
+		if (!(mask & (1<<i)))
+			continue;
+
+		if (res->flags & IORESOURCE_IO)
+			cmd |= PCI_COMMAND_IO;
+		if (res->flags & IORESOURCE_MEM)
+			cmd |= PCI_COMMAND_MEMORY;
+	}
+
+	if (cmd != oldcmd) {
+		printk(KERN_DEBUG "PCI: Enabling device: (%s), cmd %x\n",
+		       pci_name(dev), cmd);
+                /* Enable the appropriate bits in the PCI command register.  */
+		pci_write_config_word(dev, PCI_COMMAND, cmd);
+	}
+	return 0;
+}
+
+/*
+ * Return the domain number for this bus.
+ */
+int pci_domain_nr(struct pci_bus *bus)
+{
+#ifdef CONFIG_PPC_ISERIES
+	return 0;
+#else
+	struct pci_controller *hose = pci_bus_to_host(bus);
+
+	return hose->global_number;
+#endif
+}
+
+EXPORT_SYMBOL(pci_domain_nr);
+
+/* Decide whether to display the domain number in /proc */
+int pci_proc_domain(struct pci_bus *bus)
+{
+#ifdef CONFIG_PPC_ISERIES
+	return 0;
+#else
+	struct pci_controller *hose = pci_bus_to_host(bus);
+	return hose->buid;
+#endif
+}
+
+/*
+ * Platform support for /proc/bus/pci/X/Y mmap()s,
+ * modelled on the sparc64 implementation by Dave Miller.
+ *  -- paulus.
+ */
+
+/*
+ * Adjust vm_pgoff of VMA such that it is the physical page offset
+ * corresponding to the 32-bit pci bus offset for DEV requested by the user.
+ *
+ * Basically, the user finds the base address for his device which he wishes
+ * to mmap.  They read the 32-bit value from the config space base register,
+ * add whatever PAGE_SIZE multiple offset they wish, and feed this into the
+ * offset parameter of mmap on /proc/bus/pci/XXX for that device.
+ *
+ * Returns negative error code on failure, zero on success.
+ */
+static struct resource *__pci_mmap_make_offset(struct pci_dev *dev,
+					       unsigned long *offset,
+					       enum pci_mmap_state mmap_state)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	unsigned long io_offset = 0;
+	int i, res_bit;
+
+	if (hose == 0)
+		return NULL;		/* should never happen */
+
+	/* If memory, add on the PCI bridge address offset */
+	if (mmap_state == pci_mmap_mem) {
+		*offset += hose->pci_mem_offset;
+		res_bit = IORESOURCE_MEM;
+	} else {
+		io_offset = (unsigned long)hose->io_base_virt - pci_io_base;
+		*offset += io_offset;
+		res_bit = IORESOURCE_IO;
+	}
+
+	/*
+	 * Check that the offset requested corresponds to one of the
+	 * resources of the device.
+	 */
+	for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
+		struct resource *rp = &dev->resource[i];
+		int flags = rp->flags;
+
+		/* treat ROM as memory (should be already) */
+		if (i == PCI_ROM_RESOURCE)
+			flags |= IORESOURCE_MEM;
+
+		/* Active and same type? */
+		if ((flags & res_bit) == 0)
+			continue;
+
+		/* In the range of this resource? */
+		if (*offset < (rp->start & PAGE_MASK) || *offset > rp->end)
+			continue;
+
+		/* found it! construct the final physical address */
+		if (mmap_state == pci_mmap_io)
+		       	*offset += hose->io_base_phys - io_offset;
+		return rp;
+	}
+
+	return NULL;
+}
+
+/*
+ * Set vm_page_prot of VMA, as appropriate for this architecture, for a pci
+ * device mapping.
+ */
+static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp,
+				      pgprot_t protection,
+				      enum pci_mmap_state mmap_state,
+				      int write_combine)
+{
+	unsigned long prot = pgprot_val(protection);
+
+	/* Write combine is always 0 on non-memory space mappings. On
+	 * memory space, if the user didn't pass 1, we check for a
+	 * "prefetchable" resource. This is a bit hackish, but we use
+	 * this to workaround the inability of /sysfs to provide a write
+	 * combine bit
+	 */
+	if (mmap_state != pci_mmap_mem)
+		write_combine = 0;
+	else if (write_combine == 0) {
+		if (rp->flags & IORESOURCE_PREFETCH)
+			write_combine = 1;
+	}
+
+	/* XXX would be nice to have a way to ask for write-through */
+	prot |= _PAGE_NO_CACHE;
+	if (write_combine)
+		prot &= ~_PAGE_GUARDED;
+	else
+		prot |= _PAGE_GUARDED;
+
+	printk("PCI map for %s:%lx, prot: %lx\n", pci_name(dev), rp->start,
+	       prot);
+
+	return __pgprot(prot);
+}
+
+/*
+ * This one is used by /dev/mem and fbdev who have no clue about the
+ * PCI device, it tries to find the PCI device first and calls the
+ * above routine
+ */
+pgprot_t pci_phys_mem_access_prot(struct file *file,
+				  unsigned long pfn,
+				  unsigned long size,
+				  pgprot_t protection)
+{
+	struct pci_dev *pdev = NULL;
+	struct resource *found = NULL;
+	unsigned long prot = pgprot_val(protection);
+	unsigned long offset = pfn << PAGE_SHIFT;
+	int i;
+
+	if (page_is_ram(pfn))
+		return __pgprot(prot);
+
+	prot |= _PAGE_NO_CACHE | _PAGE_GUARDED;
+
+	for_each_pci_dev(pdev) {
+		for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
+			struct resource *rp = &pdev->resource[i];
+			int flags = rp->flags;
+
+			/* Active and same type? */
+			if ((flags & IORESOURCE_MEM) == 0)
+				continue;
+			/* In the range of this resource? */
+			if (offset < (rp->start & PAGE_MASK) ||
+			    offset > rp->end)
+				continue;
+			found = rp;
+			break;
+		}
+		if (found)
+			break;
+	}
+	if (found) {
+		if (found->flags & IORESOURCE_PREFETCH)
+			prot &= ~_PAGE_GUARDED;
+		pci_dev_put(pdev);
+	}
+
+	DBG("non-PCI map for %lx, prot: %lx\n", offset, prot);
+
+	return __pgprot(prot);
+}
+
+
+/*
+ * Perform the actual remap of the pages for a PCI device mapping, as
+ * appropriate for this architecture.  The region in the process to map
+ * is described by vm_start and vm_end members of VMA, the base physical
+ * address is found in vm_pgoff.
+ * The pci device structure is provided so that architectures may make mapping
+ * decisions on a per-device or per-bus basis.
+ *
+ * Returns a negative error code on failure, zero on success.
+ */
+int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+			enum pci_mmap_state mmap_state,
+			int write_combine)
+{
+	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+	struct resource *rp;
+	int ret;
+
+	rp = __pci_mmap_make_offset(dev, &offset, mmap_state);
+	if (rp == NULL)
+		return -EINVAL;
+
+	vma->vm_pgoff = offset >> PAGE_SHIFT;
+	vma->vm_flags |= VM_SHM | VM_LOCKED | VM_IO;
+	vma->vm_page_prot = __pci_mmap_set_pgprot(dev, rp,
+						  vma->vm_page_prot,
+						  mmap_state, write_combine);
+
+	ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+			       vma->vm_end - vma->vm_start, vma->vm_page_prot);
+
+	return ret;
+}
+
+#ifdef CONFIG_PPC_MULTIPLATFORM
+static ssize_t pci_show_devspec(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct pci_dev *pdev;
+	struct device_node *np;
+
+	pdev = to_pci_dev (dev);
+	np = pci_device_to_OF_node(pdev);
+	if (np == NULL || np->full_name == NULL)
+		return 0;
+	return sprintf(buf, "%s", np->full_name);
+}
+static DEVICE_ATTR(devspec, S_IRUGO, pci_show_devspec, NULL);
+#endif /* CONFIG_PPC_MULTIPLATFORM */
+
+void pcibios_add_platform_entries(struct pci_dev *pdev)
+{
+#ifdef CONFIG_PPC_MULTIPLATFORM
+	device_create_file(&pdev->dev, &dev_attr_devspec);
+#endif /* CONFIG_PPC_MULTIPLATFORM */
+}
+
+#ifdef CONFIG_PPC_MULTIPLATFORM
+
+#define ISA_SPACE_MASK 0x1
+#define ISA_SPACE_IO 0x1
+
+static void __devinit pci_process_ISA_OF_ranges(struct device_node *isa_node,
+				      unsigned long phb_io_base_phys,
+				      void __iomem * phb_io_base_virt)
+{
+	struct isa_range *range;
+	unsigned long pci_addr;
+	unsigned int isa_addr;
+	unsigned int size;
+	int rlen = 0;
+
+	range = (struct isa_range *) get_property(isa_node, "ranges", &rlen);
+	if (range == NULL || (rlen < sizeof(struct isa_range))) {
+		printk(KERN_ERR "no ISA ranges or unexpected isa range size,"
+		       "mapping 64k\n");
+		__ioremap_explicit(phb_io_base_phys,
+				   (unsigned long)phb_io_base_virt,
+				   0x10000, _PAGE_NO_CACHE | _PAGE_GUARDED);
+		return;	
+	}
+	
+	/* From "ISA Binding to 1275"
+	 * The ranges property is laid out as an array of elements,
+	 * each of which comprises:
+	 *   cells 0 - 1:	an ISA address
+	 *   cells 2 - 4:	a PCI address 
+	 *			(size depending on dev->n_addr_cells)
+	 *   cell 5:		the size of the range
+	 */
+	if ((range->isa_addr.a_hi && ISA_SPACE_MASK) == ISA_SPACE_IO) {
+		isa_addr = range->isa_addr.a_lo;
+		pci_addr = (unsigned long) range->pci_addr.a_mid << 32 | 
+			range->pci_addr.a_lo;
+
+		/* Assume these are both zero */
+		if ((pci_addr != 0) || (isa_addr != 0)) {
+			printk(KERN_ERR "unexpected isa to pci mapping: %s\n",
+					__FUNCTION__);
+			return;
+		}
+		
+		size = PAGE_ALIGN(range->size);
+
+		__ioremap_explicit(phb_io_base_phys, 
+				   (unsigned long) phb_io_base_virt, 
+				   size, _PAGE_NO_CACHE | _PAGE_GUARDED);
+	}
+}
+
+void __devinit pci_process_bridge_OF_ranges(struct pci_controller *hose,
+					    struct device_node *dev, int prim)
+{
+	unsigned int *ranges, pci_space;
+	unsigned long size;
+	int rlen = 0;
+	int memno = 0;
+	struct resource *res;
+	int np, na = prom_n_addr_cells(dev);
+	unsigned long pci_addr, cpu_phys_addr;
+
+	np = na + 5;
+
+	/* From "PCI Binding to 1275"
+	 * The ranges property is laid out as an array of elements,
+	 * each of which comprises:
+	 *   cells 0 - 2:	a PCI address
+	 *   cells 3 or 3+4:	a CPU physical address
+	 *			(size depending on dev->n_addr_cells)
+	 *   cells 4+5 or 5+6:	the size of the range
+	 */
+	rlen = 0;
+	hose->io_base_phys = 0;
+	ranges = (unsigned int *) get_property(dev, "ranges", &rlen);
+	while ((rlen -= np * sizeof(unsigned int)) >= 0) {
+		res = NULL;
+		pci_space = ranges[0];
+		pci_addr = ((unsigned long)ranges[1] << 32) | ranges[2];
+
+		cpu_phys_addr = ranges[3];
+		if (na >= 2)
+			cpu_phys_addr = (cpu_phys_addr << 32) | ranges[4];
+
+		size = ((unsigned long)ranges[na+3] << 32) | ranges[na+4];
+		ranges += np;
+		if (size == 0)
+			continue;
+
+		/* Now consume following elements while they are contiguous */
+		while (rlen >= np * sizeof(unsigned int)) {
+			unsigned long addr, phys;
+
+			if (ranges[0] != pci_space)
+				break;
+			addr = ((unsigned long)ranges[1] << 32) | ranges[2];
+			phys = ranges[3];
+			if (na >= 2)
+				phys = (phys << 32) | ranges[4];
+			if (addr != pci_addr + size ||
+			    phys != cpu_phys_addr + size)
+				break;
+
+			size += ((unsigned long)ranges[na+3] << 32)
+				| ranges[na+4];
+			ranges += np;
+			rlen -= np * sizeof(unsigned int);
+		}
+
+		switch ((pci_space >> 24) & 0x3) {
+		case 1:		/* I/O space */
+			hose->io_base_phys = cpu_phys_addr;
+			hose->pci_io_size = size;
+
+			res = &hose->io_resource;
+			res->flags = IORESOURCE_IO;
+			res->start = pci_addr;
+			DBG("phb%d: IO 0x%lx -> 0x%lx\n", hose->global_number,
+				    res->start, res->start + size - 1);
+			break;
+		case 2:		/* memory space */
+			memno = 0;
+			while (memno < 3 && hose->mem_resources[memno].flags)
+				++memno;
+
+			if (memno == 0)
+				hose->pci_mem_offset = cpu_phys_addr - pci_addr;
+			if (memno < 3) {
+				res = &hose->mem_resources[memno];
+				res->flags = IORESOURCE_MEM;
+				res->start = cpu_phys_addr;
+				DBG("phb%d: MEM 0x%lx -> 0x%lx\n", hose->global_number,
+					    res->start, res->start + size - 1);
+			}
+			break;
+		}
+		if (res != NULL) {
+			res->name = dev->full_name;
+			res->end = res->start + size - 1;
+			res->parent = NULL;
+			res->sibling = NULL;
+			res->child = NULL;
+		}
+	}
+}
+
+void __init pci_setup_phb_io(struct pci_controller *hose, int primary)
+{
+	unsigned long size = hose->pci_io_size;
+	unsigned long io_virt_offset;
+	struct resource *res;
+	struct device_node *isa_dn;
+
+	hose->io_base_virt = reserve_phb_iospace(size);
+	DBG("phb%d io_base_phys 0x%lx io_base_virt 0x%lx\n",
+		hose->global_number, hose->io_base_phys,
+		(unsigned long) hose->io_base_virt);
+
+	if (primary) {
+		pci_io_base = (unsigned long)hose->io_base_virt;
+		isa_dn = of_find_node_by_type(NULL, "isa");
+		if (isa_dn) {
+			isa_io_base = pci_io_base;
+			pci_process_ISA_OF_ranges(isa_dn, hose->io_base_phys,
+						hose->io_base_virt);
+			of_node_put(isa_dn);
+			/* Allow all IO */
+			io_page_mask = -1;
+		}
+	}
+
+	io_virt_offset = (unsigned long)hose->io_base_virt - pci_io_base;
+	res = &hose->io_resource;
+	res->start += io_virt_offset;
+	res->end += io_virt_offset;
+}
+
+void __devinit pci_setup_phb_io_dynamic(struct pci_controller *hose,
+					int primary)
+{
+	unsigned long size = hose->pci_io_size;
+	unsigned long io_virt_offset;
+	struct resource *res;
+
+	hose->io_base_virt = __ioremap(hose->io_base_phys, size,
+					_PAGE_NO_CACHE | _PAGE_GUARDED);
+	DBG("phb%d io_base_phys 0x%lx io_base_virt 0x%lx\n",
+		hose->global_number, hose->io_base_phys,
+		(unsigned long) hose->io_base_virt);
+
+	if (primary)
+		pci_io_base = (unsigned long)hose->io_base_virt;
+
+	io_virt_offset = (unsigned long)hose->io_base_virt - pci_io_base;
+	res = &hose->io_resource;
+	res->start += io_virt_offset;
+	res->end += io_virt_offset;
+}
+
+
+static int get_bus_io_range(struct pci_bus *bus, unsigned long *start_phys,
+				unsigned long *start_virt, unsigned long *size)
+{
+	struct pci_controller *hose = pci_bus_to_host(bus);
+	struct pci_bus_region region;
+	struct resource *res;
+
+	if (bus->self) {
+		res = bus->resource[0];
+		pcibios_resource_to_bus(bus->self, &region, res);
+		*start_phys = hose->io_base_phys + region.start;
+		*start_virt = (unsigned long) hose->io_base_virt + 
+				region.start;
+		if (region.end > region.start) 
+			*size = region.end - region.start + 1;
+		else {
+			printk("%s(): unexpected region 0x%lx->0x%lx\n", 
+					__FUNCTION__, region.start, region.end);
+			return 1;
+		}
+		
+	} else {
+		/* Root Bus */
+		res = &hose->io_resource;
+		*start_phys = hose->io_base_phys;
+		*start_virt = (unsigned long) hose->io_base_virt;
+		if (res->end > res->start)
+			*size = res->end - res->start + 1;
+		else {
+			printk("%s(): unexpected region 0x%lx->0x%lx\n", 
+					__FUNCTION__, res->start, res->end);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+int unmap_bus_range(struct pci_bus *bus)
+{
+	unsigned long start_phys;
+	unsigned long start_virt;
+	unsigned long size;
+
+	if (!bus) {
+		printk(KERN_ERR "%s() expected bus\n", __FUNCTION__);
+		return 1;
+	}
+	
+	if (get_bus_io_range(bus, &start_phys, &start_virt, &size))
+		return 1;
+	if (iounmap_explicit((void __iomem *) start_virt, size))
+		return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL(unmap_bus_range);
+
+int remap_bus_range(struct pci_bus *bus)
+{
+	unsigned long start_phys;
+	unsigned long start_virt;
+	unsigned long size;
+
+	if (!bus) {
+		printk(KERN_ERR "%s() expected bus\n", __FUNCTION__);
+		return 1;
+	}
+	
+	
+	if (get_bus_io_range(bus, &start_phys, &start_virt, &size))
+		return 1;
+	printk("mapping IO %lx -> %lx, size: %lx\n", start_phys, start_virt, size);
+	if (__ioremap_explicit(start_phys, start_virt, size,
+			       _PAGE_NO_CACHE | _PAGE_GUARDED))
+		return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL(remap_bus_range);
+
+void phbs_remap_io(void)
+{
+	struct pci_controller *hose, *tmp;
+
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
+		remap_bus_range(hose->bus);
+}
+
+/*
+ * ppc64 can have multifunction devices that do not respond to function 0.
+ * In this case we must scan all functions.
+ * XXX this can go now, we use the OF device tree in all the
+ * cases that caused problems. -- paulus
+ */
+int pcibios_scan_all_fns(struct pci_bus *bus, int devfn)
+{
+       return 0;
+}
+
+static void __devinit fixup_resource(struct resource *res, struct pci_dev *dev)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	unsigned long start, end, mask, offset;
+
+	if (res->flags & IORESOURCE_IO) {
+		offset = (unsigned long)hose->io_base_virt - pci_io_base;
+
+		start = res->start += offset;
+		end = res->end += offset;
+
+		/* Need to allow IO access to pages that are in the
+		   ISA range */
+		if (start < MAX_ISA_PORT) {
+			if (end > MAX_ISA_PORT)
+				end = MAX_ISA_PORT;
+
+			start >>= PAGE_SHIFT;
+			end >>= PAGE_SHIFT;
+
+			/* get the range of pages for the map */
+			mask = ((1 << (end+1)) - 1) ^ ((1 << start) - 1);
+			io_page_mask |= mask;
+		}
+	} else if (res->flags & IORESOURCE_MEM) {
+		res->start += hose->pci_mem_offset;
+		res->end += hose->pci_mem_offset;
+	}
+}
+
+void __devinit pcibios_fixup_device_resources(struct pci_dev *dev,
+					      struct pci_bus *bus)
+{
+	/* Update device resources.  */
+	int i;
+
+	for (i = 0; i < PCI_NUM_RESOURCES; i++)
+		if (dev->resource[i].flags)
+			fixup_resource(&dev->resource[i], dev);
+}
+EXPORT_SYMBOL(pcibios_fixup_device_resources);
+
+static void __devinit do_bus_setup(struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+
+	ppc_md.iommu_bus_setup(bus);
+
+	list_for_each_entry(dev, &bus->devices, bus_list)
+		ppc_md.iommu_dev_setup(dev);
+
+	if (ppc_md.irq_bus_setup)
+		ppc_md.irq_bus_setup(bus);
+}
+
+void __devinit pcibios_fixup_bus(struct pci_bus *bus)
+{
+	struct pci_dev *dev = bus->self;
+
+	if (dev && pci_probe_only &&
+	    (dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) {
+		/* This is a subordinate bridge */
+
+		pci_read_bridge_bases(bus);
+		pcibios_fixup_device_resources(dev, bus);
+	}
+
+	do_bus_setup(bus);
+
+	if (!pci_probe_only)
+		return;
+
+	list_for_each_entry(dev, &bus->devices, bus_list)
+		if ((dev->class >> 8) != PCI_CLASS_BRIDGE_PCI)
+			pcibios_fixup_device_resources(dev, bus);
+}
+EXPORT_SYMBOL(pcibios_fixup_bus);
+
+/*
+ * Reads the interrupt pin to determine if interrupt is use by card.
+ * If the interrupt is used, then gets the interrupt line from the 
+ * openfirmware and sets it in the pci_dev and pci_config line.
+ */
+int pci_read_irq_line(struct pci_dev *pci_dev)
+{
+	u8 intpin;
+	struct device_node *node;
+
+    	pci_read_config_byte(pci_dev, PCI_INTERRUPT_PIN, &intpin);
+	if (intpin == 0)
+		return 0;
+
+	node = pci_device_to_OF_node(pci_dev);
+	if (node == NULL)
+		return -1;
+
+	if (node->n_intrs == 0)
+		return -1;
+
+	pci_dev->irq = node->intrs[0].line;
+
+	pci_write_config_byte(pci_dev, PCI_INTERRUPT_LINE, pci_dev->irq);
+
+	return 0;
+}
+EXPORT_SYMBOL(pci_read_irq_line);
+
+void pci_resource_to_user(const struct pci_dev *dev, int bar,
+			  const struct resource *rsrc,
+			  u64 *start, u64 *end)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	unsigned long offset = 0;
+
+	if (hose == NULL)
+		return;
+
+	if (rsrc->flags & IORESOURCE_IO)
+		offset = pci_io_base - (unsigned long)hose->io_base_virt +
+			hose->io_base_phys;
+
+	*start = rsrc->start + offset;
+	*end = rsrc->end + offset;
+}
+
+#endif /* CONFIG_PPC_MULTIPLATFORM */
+
+
+#define IOBASE_BRIDGE_NUMBER	0
+#define IOBASE_MEMORY		1
+#define IOBASE_IO		2
+#define IOBASE_ISA_IO		3
+#define IOBASE_ISA_MEM		4
+
+long sys_pciconfig_iobase(long which, unsigned long in_bus,
+			  unsigned long in_devfn)
+{
+	struct pci_controller* hose;
+	struct list_head *ln;
+	struct pci_bus *bus = NULL;
+	struct device_node *hose_node;
+
+	/* Argh ! Please forgive me for that hack, but that's the
+	 * simplest way to get existing XFree to not lockup on some
+	 * G5 machines... So when something asks for bus 0 io base
+	 * (bus 0 is HT root), we return the AGP one instead.
+	 */
+	if (machine_is_compatible("MacRISC4"))
+		if (in_bus == 0)
+			in_bus = 0xf0;
+
+	/* That syscall isn't quite compatible with PCI domains, but it's
+	 * used on pre-domains setup. We return the first match
+	 */
+
+	for (ln = pci_root_buses.next; ln != &pci_root_buses; ln = ln->next) {
+		bus = pci_bus_b(ln);
+		if (in_bus >= bus->number && in_bus < (bus->number + bus->subordinate))
+			break;
+		bus = NULL;
+	}
+	if (bus == NULL || bus->sysdata == NULL)
+		return -ENODEV;
+
+	hose_node = (struct device_node *)bus->sysdata;
+	hose = PCI_DN(hose_node)->phb;
+
+	switch (which) {
+	case IOBASE_BRIDGE_NUMBER:
+		return (long)hose->first_busno;
+	case IOBASE_MEMORY:
+		return (long)hose->pci_mem_offset;
+	case IOBASE_IO:
+		return (long)hose->io_base_phys;
+	case IOBASE_ISA_IO:
+		return (long)isa_io_base;
+	case IOBASE_ISA_MEM:
+		return -EINVAL;
+	}
+
+	return -EOPNOTSUPP;
+}
diff --git a/arch/powerpc/kernel/pci_direct_iommu.c b/arch/powerpc/kernel/pci_direct_iommu.c
new file mode 100644
index 000000000000..e1a32f802c0b
--- /dev/null
+++ b/arch/powerpc/kernel/pci_direct_iommu.c
@@ -0,0 +1,94 @@
+/*
+ * Support for DMA from PCI devices to main memory on
+ * machines without an iommu or with directly addressable
+ * RAM (typically a pmac with 2Gb of RAM or less)
+ *
+ * Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/pmac_feature.h>
+#include <asm/abs_addr.h>
+#include <asm/ppc-pci.h>
+
+static void *pci_direct_alloc_coherent(struct device *hwdev, size_t size,
+				   dma_addr_t *dma_handle, gfp_t flag)
+{
+	void *ret;
+
+	ret = (void *)__get_free_pages(flag, get_order(size));
+	if (ret != NULL) {
+		memset(ret, 0, size);
+		*dma_handle = virt_to_abs(ret);
+	}
+	return ret;
+}
+
+static void pci_direct_free_coherent(struct device *hwdev, size_t size,
+				 void *vaddr, dma_addr_t dma_handle)
+{
+	free_pages((unsigned long)vaddr, get_order(size));
+}
+
+static dma_addr_t pci_direct_map_single(struct device *hwdev, void *ptr,
+		size_t size, enum dma_data_direction direction)
+{
+	return virt_to_abs(ptr);
+}
+
+static void pci_direct_unmap_single(struct device *hwdev, dma_addr_t dma_addr,
+		size_t size, enum dma_data_direction direction)
+{
+}
+
+static int pci_direct_map_sg(struct device *hwdev, struct scatterlist *sg,
+		int nents, enum dma_data_direction direction)
+{
+	int i;
+
+	for (i = 0; i < nents; i++, sg++) {
+		sg->dma_address = page_to_phys(sg->page) + sg->offset;
+		sg->dma_length = sg->length;
+	}
+
+	return nents;
+}
+
+static void pci_direct_unmap_sg(struct device *hwdev, struct scatterlist *sg,
+		int nents, enum dma_data_direction direction)
+{
+}
+
+static int pci_direct_dma_supported(struct device *dev, u64 mask)
+{
+	return mask < 0x100000000ull;
+}
+
+void __init pci_direct_iommu_init(void)
+{
+	pci_dma_ops.alloc_coherent = pci_direct_alloc_coherent;
+	pci_dma_ops.free_coherent = pci_direct_free_coherent;
+	pci_dma_ops.map_single = pci_direct_map_single;
+	pci_dma_ops.unmap_single = pci_direct_unmap_single;
+	pci_dma_ops.map_sg = pci_direct_map_sg;
+	pci_dma_ops.unmap_sg = pci_direct_unmap_sg;
+	pci_dma_ops.dma_supported = pci_direct_dma_supported;
+}
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
new file mode 100644
index 000000000000..12c4c9e9bbc7
--- /dev/null
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -0,0 +1,230 @@
+/*
+ * pci_dn.c
+ *
+ * Copyright (C) 2001 Todd Inglett, IBM Corporation
+ *
+ * PCI manipulation via device_nodes.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *    
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/bootmem.h>
+
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/pSeries_reconfig.h>
+#include <asm/ppc-pci.h>
+
+/*
+ * Traverse_func that inits the PCI fields of the device node.
+ * NOTE: this *must* be done before read/write config to the device.
+ */
+static void * __devinit update_dn_pci_info(struct device_node *dn, void *data)
+{
+	struct pci_controller *phb = data;
+	int *type = (int *)get_property(dn, "ibm,pci-config-space-type", NULL);
+	u32 *regs;
+	struct pci_dn *pdn;
+
+	if (mem_init_done)
+		pdn = kmalloc(sizeof(*pdn), GFP_KERNEL);
+	else
+		pdn = alloc_bootmem(sizeof(*pdn));
+	if (pdn == NULL)
+		return NULL;
+	memset(pdn, 0, sizeof(*pdn));
+	dn->data = pdn;
+	pdn->node = dn;
+	pdn->phb = phb;
+	regs = (u32 *)get_property(dn, "reg", NULL);
+	if (regs) {
+		/* First register entry is addr (00BBSS00)  */
+		pdn->busno = (regs[0] >> 16) & 0xff;
+		pdn->devfn = (regs[0] >> 8) & 0xff;
+	}
+
+	pdn->pci_ext_config_space = (type && *type == 1);
+	return NULL;
+}
+
+/*
+ * Traverse a device tree stopping each PCI device in the tree.
+ * This is done depth first.  As each node is processed, a "pre"
+ * function is called and the children are processed recursively.
+ *
+ * The "pre" func returns a value.  If non-zero is returned from
+ * the "pre" func, the traversal stops and this value is returned.
+ * This return value is useful when using traverse as a method of
+ * finding a device.
+ *
+ * NOTE: we do not run the func for devices that do not appear to
+ * be PCI except for the start node which we assume (this is good
+ * because the start node is often a phb which may be missing PCI
+ * properties).
+ * We use the class-code as an indicator. If we run into
+ * one of these nodes we also assume its siblings are non-pci for
+ * performance.
+ */
+void *traverse_pci_devices(struct device_node *start, traverse_func pre,
+		void *data)
+{
+	struct device_node *dn, *nextdn;
+	void *ret;
+
+	/* We started with a phb, iterate all childs */
+	for (dn = start->child; dn; dn = nextdn) {
+		u32 *classp, class;
+
+		nextdn = NULL;
+		classp = (u32 *)get_property(dn, "class-code", NULL);
+		class = classp ? *classp : 0;
+
+		if (pre && ((ret = pre(dn, data)) != NULL))
+			return ret;
+
+		/* If we are a PCI bridge, go down */
+		if (dn->child && ((class >> 8) == PCI_CLASS_BRIDGE_PCI ||
+				  (class >> 8) == PCI_CLASS_BRIDGE_CARDBUS))
+			/* Depth first...do children */
+			nextdn = dn->child;
+		else if (dn->sibling)
+			/* ok, try next sibling instead. */
+			nextdn = dn->sibling;
+		if (!nextdn) {
+			/* Walk up to next valid sibling. */
+			do {
+				dn = dn->parent;
+				if (dn == start)
+					return NULL;
+			} while (dn->sibling == NULL);
+			nextdn = dn->sibling;
+		}
+	}
+	return NULL;
+}
+
+/** 
+ * pci_devs_phb_init_dynamic - setup pci devices under this PHB
+ * phb: pci-to-host bridge (top-level bridge connecting to cpu)
+ *
+ * This routine is called both during boot, (before the memory
+ * subsystem is set up, before kmalloc is valid) and during the 
+ * dynamic lpar operation of adding a PHB to a running system.
+ */
+void __devinit pci_devs_phb_init_dynamic(struct pci_controller *phb)
+{
+	struct device_node * dn = (struct device_node *) phb->arch_data;
+	struct pci_dn *pdn;
+
+	/* PHB nodes themselves must not match */
+	update_dn_pci_info(dn, phb);
+	pdn = dn->data;
+	if (pdn) {
+		pdn->devfn = pdn->busno = -1;
+		pdn->phb = phb;
+	}
+
+	/* Update dn->phb ptrs for new phb and children devices */
+	traverse_pci_devices(dn, update_dn_pci_info, phb);
+}
+
+/*
+ * Traversal func that looks for a <busno,devfcn> value.
+ * If found, the pci_dn is returned (thus terminating the traversal).
+ */
+static void *is_devfn_node(struct device_node *dn, void *data)
+{
+	int busno = ((unsigned long)data >> 8) & 0xff;
+	int devfn = ((unsigned long)data) & 0xff;
+	struct pci_dn *pci = dn->data;
+
+	if (pci && (devfn == pci->devfn) && (busno == pci->busno))
+		return dn;
+	return NULL;
+}
+
+/*
+ * This is the "slow" path for looking up a device_node from a
+ * pci_dev.  It will hunt for the device under its parent's
+ * phb and then update sysdata for a future fastpath.
+ *
+ * It may also do fixups on the actual device since this happens
+ * on the first read/write.
+ *
+ * Note that it also must deal with devices that don't exist.
+ * In this case it may probe for real hardware ("just in case")
+ * and add a device_node to the device tree if necessary.
+ *
+ */
+struct device_node *fetch_dev_dn(struct pci_dev *dev)
+{
+	struct device_node *orig_dn = dev->sysdata;
+	struct device_node *dn;
+	unsigned long searchval = (dev->bus->number << 8) | dev->devfn;
+
+	dn = traverse_pci_devices(orig_dn, is_devfn_node, (void *)searchval);
+	if (dn)
+		dev->sysdata = dn;
+	return dn;
+}
+EXPORT_SYMBOL(fetch_dev_dn);
+
+static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node)
+{
+	struct device_node *np = node;
+	struct pci_dn *pci = NULL;
+	int err = NOTIFY_OK;
+
+	switch (action) {
+	case PSERIES_RECONFIG_ADD:
+		pci = np->parent->data;
+		if (pci)
+			update_dn_pci_info(np, pci->phb);
+		break;
+	default:
+		err = NOTIFY_DONE;
+		break;
+	}
+	return err;
+}
+
+static struct notifier_block pci_dn_reconfig_nb = {
+	.notifier_call = pci_dn_reconfig_notifier,
+};
+
+/** 
+ * pci_devs_phb_init - Initialize phbs and pci devs under them.
+ * 
+ * This routine walks over all phb's (pci-host bridges) on the
+ * system, and sets up assorted pci-related structures 
+ * (including pci info in the device node structs) for each
+ * pci device found underneath.  This routine runs once,
+ * early in the boot sequence.
+ */
+void __init pci_devs_phb_init(void)
+{
+	struct pci_controller *phb, *tmp;
+
+	/* This must be done first so the device nodes have valid pci info! */
+	list_for_each_entry_safe(phb, tmp, &hose_list, list_node)
+		pci_devs_phb_init_dynamic(phb);
+
+	pSeries_reconfig_notifier_register(&pci_dn_reconfig_nb);
+}
diff --git a/arch/powerpc/kernel/pci_iommu.c b/arch/powerpc/kernel/pci_iommu.c
new file mode 100644
index 000000000000..bdf15dbbf4f0
--- /dev/null
+++ b/arch/powerpc/kernel/pci_iommu.c
@@ -0,0 +1,128 @@
+/*
+ * arch/ppc64/kernel/pci_iommu.c
+ * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
+ *
+ * Rewrite, cleanup, new allocation schemes:
+ * Copyright (C) 2004 Olof Johansson, IBM Corporation
+ *
+ * Dynamic DMA mapping support, platform-independent parts.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/iommu.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/ppc-pci.h>
+
+/*
+ * We can use ->sysdata directly and avoid the extra work in
+ * pci_device_to_OF_node since ->sysdata will have been initialised
+ * in the iommu init code for all devices.
+ */
+#define PCI_GET_DN(dev) ((struct device_node *)((dev)->sysdata))
+
+static inline struct iommu_table *devnode_table(struct device *dev)
+{
+	struct pci_dev *pdev;
+
+	if (!dev) {
+		pdev = ppc64_isabridge_dev;
+		if (!pdev)
+			return NULL;
+	} else
+		pdev = to_pci_dev(dev);
+
+	return PCI_DN(PCI_GET_DN(pdev))->iommu_table;
+}
+
+
+/* Allocates a contiguous real buffer and creates mappings over it.
+ * Returns the virtual address of the buffer and sets dma_handle
+ * to the dma address (mapping) of the first page.
+ */
+static void *pci_iommu_alloc_coherent(struct device *hwdev, size_t size,
+			   dma_addr_t *dma_handle, gfp_t flag)
+{
+	return iommu_alloc_coherent(devnode_table(hwdev), size, dma_handle,
+			flag);
+}
+
+static void pci_iommu_free_coherent(struct device *hwdev, size_t size,
+			 void *vaddr, dma_addr_t dma_handle)
+{
+	iommu_free_coherent(devnode_table(hwdev), size, vaddr, dma_handle);
+}
+
+/* Creates TCEs for a user provided buffer.  The user buffer must be 
+ * contiguous real kernel storage (not vmalloc).  The address of the buffer
+ * passed here is the kernel (virtual) address of the buffer.  The buffer
+ * need not be page aligned, the dma_addr_t returned will point to the same
+ * byte within the page as vaddr.
+ */
+static dma_addr_t pci_iommu_map_single(struct device *hwdev, void *vaddr,
+		size_t size, enum dma_data_direction direction)
+{
+	return iommu_map_single(devnode_table(hwdev), vaddr, size, direction);
+}
+
+
+static void pci_iommu_unmap_single(struct device *hwdev, dma_addr_t dma_handle,
+		size_t size, enum dma_data_direction direction)
+{
+	iommu_unmap_single(devnode_table(hwdev), dma_handle, size, direction);
+}
+
+
+static int pci_iommu_map_sg(struct device *pdev, struct scatterlist *sglist,
+		int nelems, enum dma_data_direction direction)
+{
+	return iommu_map_sg(pdev, devnode_table(pdev), sglist,
+			nelems, direction);
+}
+
+static void pci_iommu_unmap_sg(struct device *pdev, struct scatterlist *sglist,
+		int nelems, enum dma_data_direction direction)
+{
+	iommu_unmap_sg(devnode_table(pdev), sglist, nelems, direction);
+}
+
+/* We support DMA to/from any memory page via the iommu */
+static int pci_iommu_dma_supported(struct device *dev, u64 mask)
+{
+	return 1;
+}
+
+void pci_iommu_init(void)
+{
+	pci_dma_ops.alloc_coherent = pci_iommu_alloc_coherent;
+	pci_dma_ops.free_coherent = pci_iommu_free_coherent;
+	pci_dma_ops.map_single = pci_iommu_map_single;
+	pci_dma_ops.unmap_single = pci_iommu_unmap_single;
+	pci_dma_ops.map_sg = pci_iommu_map_sg;
+	pci_dma_ops.unmap_sg = pci_iommu_unmap_sg;
+	pci_dma_ops.dma_supported = pci_iommu_dma_supported;
+}