From 3eefae994d9224fb7771a3ddb683868363c23510 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:21:04 +0200
Subject: ftrace: limit trace entries

Currently there is no protection from the root user to use up all of
memory for trace buffers. If the root user allocates too many entries,
the OOM killer might start killing off all tasks.

This patch adds an algorithm to check the following condition:

 pages_requested > (freeable_memory + current_trace_buffer_pages) / 4

If the above is met then the allocation fails. The above prevents more
than 1/4th of freeable memory from being used by trace buffers.

To determine the freeable_memory, I made determine_dirtyable_memory in
mm/page-writeback.c global.

Special thanks goes to Peter Zijlstra for suggesting the above calculation.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 mm/page-writeback.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 789b6adbef37..b38f700825fc 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -126,8 +126,6 @@ static void background_writeout(unsigned long _min_pages);
 static struct prop_descriptor vm_completions;
 static struct prop_descriptor vm_dirties;
 
-static unsigned long determine_dirtyable_memory(void);
-
 /*
  * couple the period to the dirty_ratio:
  *
@@ -347,7 +345,13 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
 #endif
 }
 
-static unsigned long determine_dirtyable_memory(void)
+/**
+ * determine_dirtyable_memory - amount of memory that may be used
+ *
+ * Returns the number of pages that can currently be freed and used
+ * by the kernel for direct mappings.
+ */
+unsigned long determine_dirtyable_memory(void)
 {
 	unsigned long x;
 
-- 
cgit v1.2.3-59-g8ed1b


From e8c27ac9191ab9e6506ae5cbe70d87ac50f8e960 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 1 Jun 2008 13:15:22 -0700
Subject: x86, numa, 32-bit: print out debug info on all kvas

also fix the print out of node_remap_end_vaddr

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/discontig_32.c | 9 +++++++--
 mm/page_alloc.c            | 5 +++++
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index ebbbba338150..3150ad385672 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -168,6 +168,8 @@ static void __init allocate_pgdat(int nid)
 		reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t),
 			      "NODE_DATA");
 	}
+	printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
+		nid, (unsigned long)NODE_DATA(nid));
 }
 
 #ifdef CONFIG_DISCONTIGMEM
@@ -208,8 +210,12 @@ void __init remap_numa_kva(void)
 	int node;
 
 	for_each_online_node(node) {
+		printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
 		for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
 			vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
+			printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
+				(unsigned long)vaddr,
+				node_remap_start_pfn[node] + pfn);
 			set_pmd_pfn((ulong) vaddr, 
 				node_remap_start_pfn[node] + pfn, 
 				PAGE_KERNEL_LARGE);
@@ -293,8 +299,7 @@ static void init_remap_allocator(int nid)
 
 	printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
 		(ulong) node_remap_start_vaddr[nid],
-		(ulong) pfn_to_kaddr(highstart_pfn
-		   + node_remap_offset[nid] + node_remap_size[nid]));
+		(ulong) node_remap_end_vaddr[nid]);
 }
 #else
 void *alloc_remap(int nid, unsigned long size)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 63835579323a..502223c3c2c6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3486,6 +3486,11 @@ void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat,
 	calculate_node_totalpages(pgdat, zones_size, zholes_size);
 
 	alloc_node_mem_map(pgdat);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+	printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
+		nid, (unsigned long)pgdat,
+		(unsigned long)pgdat->node_mem_map);
+#endif
 
 	free_area_init_core(pgdat, zones_size, zholes_size);
 }
-- 
cgit v1.2.3-59-g8ed1b


From cc1a9d86ce989083703c4bdc11b75a87e1cc404a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 8 Jun 2008 19:39:16 -0700
Subject: mm, x86: shrink_active_range() should check all

Now we are using register_e820_active_regions() instead of
add_active_range() directly. So end_pfn could differ between the
value in early_node_map and node_end_pfn.

So we need to make shrink_active_range() smarter.

shrink_active_range() is a generic MM function in mm/page_alloc.c but
it is only used on 32-bit x86. Should we move it back to some file in
arch/x86?

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/discontig_32.c |  2 +-
 include/linux/mm.h         |  3 +--
 mm/page_alloc.c            | 44 ++++++++++++++++++++++++++++++++++----------
 3 files changed, 36 insertions(+), 13 deletions(-)

(limited to 'mm')

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index a89ccf3d4c14..489605bab85a 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -282,7 +282,7 @@ static unsigned long calculate_numa_remap_pages(void)
 
 		node_end_pfn[nid] -= size;
 		node_remap_start_pfn[nid] = node_end_pfn[nid];
-		shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]);
+		shrink_active_range(nid, node_end_pfn[nid]);
 	}
 	printk("Reserving total of %ld pages for numa KVA remap\n",
 			reserve_pages);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c31a9cd2a30e..7cbd949f2516 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -997,8 +997,7 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat,
 extern void free_area_init_nodes(unsigned long *max_zone_pfn);
 extern void add_active_range(unsigned int nid, unsigned long start_pfn,
 					unsigned long end_pfn);
-extern void shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
-						unsigned long new_end_pfn);
+extern void shrink_active_range(unsigned int nid, unsigned long new_end_pfn);
 extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn,
 					unsigned long end_pfn);
 extern void remove_all_active_ranges(void);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 502223c3c2c6..215408684076 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3579,25 +3579,49 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
 /**
  * shrink_active_range - Shrink an existing registered range of PFNs
  * @nid: The node id the range is on that should be shrunk
- * @old_end_pfn: The old end PFN of the range
  * @new_end_pfn: The new PFN of the range
  *
  * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
- * The map is kept at the end physical page range that has already been
- * registered with add_active_range(). This function allows an arch to shrink
- * an existing registered range.
+ * The map is kept near the end physical page range that has already been
+ * registered. This function allows an arch to shrink an existing registered
+ * range.
  */
-void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
-						unsigned long new_end_pfn)
+void __init shrink_active_range(unsigned int nid, unsigned long new_end_pfn)
 {
-	int i;
+	int i, j;
+	int removed = 0;
 
 	/* Find the old active region end and shrink */
-	for_each_active_range_index_in_nid(i, nid)
-		if (early_node_map[i].end_pfn == old_end_pfn) {
+	for_each_active_range_index_in_nid(i, nid) {
+		if (early_node_map[i].start_pfn >= new_end_pfn) {
+			/* clear it */
+			early_node_map[i].end_pfn = 0;
+			removed = 1;
+			continue;
+		}
+		if (early_node_map[i].end_pfn > new_end_pfn) {
 			early_node_map[i].end_pfn = new_end_pfn;
-			break;
+			continue;
 		}
+	}
+
+	if (!removed)
+		return;
+
+	/* remove the blank ones */
+	for (i = nr_nodemap_entries - 1; i > 0; i--) {
+		if (early_node_map[i].nid != nid)
+			continue;
+		if (early_node_map[i].end_pfn)
+			continue;
+		/* we found it, get rid of it */
+		for (j = i; j < nr_nodemap_entries - 1; j++)
+			memcpy(&early_node_map[j], &early_node_map[j+1],
+				sizeof(early_node_map[j]));
+		j = nr_nodemap_entries - 1;
+		memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
+		nr_nodemap_entries--;
+	}
 }
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 1ea0704e0da65b2b46f9142ff1391163aac24060 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Mon, 16 Jun 2008 04:30:00 -0700
Subject: mm: add a ptep_modify_prot transaction abstraction

This patch adds an API for doing read-modify-write updates to a pte's
protection bits which may race against hardware updates to the pte.
After reading the pte, the hardware may asynchronously set the accessed
or dirty bits on a pte, which would be lost when writing back the
modified pte value.

The existing technique to handle this race is to use
ptep_get_and_clear() to atomically fetch the old pte value and clear it
in memory.  This has the effect of marking the pte as non-present,
which will prevent the hardware from updating its state.  When the new
value is written back, the pte will be present again, and the hardware
can resume updating the access/dirty flags.

When running in a virtualized environment, pagetable updates are
relatively expensive, since they generally involve some trap into the
hypervisor.  To mitigate the cost of these updates, we tend to batch
them.

However, because of the atomic nature of ptep_get_and_clear(), it is
inherently non-batchable.  This new interface allows batching by
giving the underlying implementation enough information to open a
transaction between the read and write phases:

ptep_modify_prot_start() returns the current pte value, and puts the
  pte entry into a state where either the hardware will not update the
  pte, or if it does, the updates will be preserved on commit.

ptep_modify_prot_commit() writes back the updated pte, makes sure that
  any hardware updates made since ptep_modify_prot_start() are
  preserved.

ptep_modify_prot_start() and _commit() must be exactly paired, and
used while holding the appropriate pte lock.  They do not protect
against other software updates of the pte in any way.

The current implementations of ptep_modify_prot_start and _commit are
functionally unchanged from before: _start() uses ptep_get_and_clear()
to fetch the pte and zero the entry, preventing any hardware updates.
_commit() simply writes the new pte value back knowing that the
hardware has not updated the pte in the meantime.

The only current user of this interface is mprotect.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-generic/pgtable.h | 57 +++++++++++++++++++++++++++++++++++++++++++
 mm/mprotect.c                 | 10 +++-----
 2 files changed, 61 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 44ef329531c3..4fce3db2cecc 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -197,6 +197,63 @@ static inline int pmd_none_or_clear_bad(pmd_t *pmd)
 }
 #endif /* CONFIG_MMU */
 
+static inline pte_t __ptep_modify_prot_start(struct mm_struct *mm,
+					     unsigned long addr,
+					     pte_t *ptep)
+{
+	/*
+	 * Get the current pte state, but zero it out to make it
+	 * non-present, preventing the hardware from asynchronously
+	 * updating it.
+	 */
+	return ptep_get_and_clear(mm, addr, ptep);
+}
+
+static inline void __ptep_modify_prot_commit(struct mm_struct *mm,
+					     unsigned long addr,
+					     pte_t *ptep, pte_t pte)
+{
+	/*
+	 * The pte is non-present, so there's no hardware state to
+	 * preserve.
+	 */
+	set_pte_at(mm, addr, ptep, pte);
+}
+
+#ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
+/*
+ * Start a pte protection read-modify-write transaction, which
+ * protects against asynchronous hardware modifications to the pte.
+ * The intention is not to prevent the hardware from making pte
+ * updates, but to prevent any updates it may make from being lost.
+ *
+ * This does not protect against other software modifications of the
+ * pte; the appropriate pte lock must be held over the transaction.
+ *
+ * Note that this interface is intended to be batchable, meaning that
+ * ptep_modify_prot_commit may not actually update the pte, but merely
+ * queue the update to be done at some later time.  The update must be
+ * actually committed before the pte lock is released, however.
+ */
+static inline pte_t ptep_modify_prot_start(struct mm_struct *mm,
+					   unsigned long addr,
+					   pte_t *ptep)
+{
+	return __ptep_modify_prot_start(mm, addr, ptep);
+}
+
+/*
+ * Commit an update to a pte, leaving any hardware-controlled bits in
+ * the PTE unmodified.
+ */
+static inline void ptep_modify_prot_commit(struct mm_struct *mm,
+					   unsigned long addr,
+					   pte_t *ptep, pte_t pte)
+{
+	__ptep_modify_prot_commit(mm, addr, ptep, pte);
+}
+#endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
+
 /*
  * A facility to provide lazy MMU batching.  This allows PTE updates and
  * page invalidations to be delayed until a call to leave lazy MMU mode
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a5bf31c27375..acfe7c8d72fc 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -47,19 +47,17 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		if (pte_present(oldpte)) {
 			pte_t ptent;
 
-			/* Avoid an SMP race with hardware updated dirty/clean
-			 * bits by wiping the pte and then setting the new pte
-			 * into place.
-			 */
-			ptent = ptep_get_and_clear(mm, addr, pte);
+			ptent = ptep_modify_prot_start(mm, addr, pte);
 			ptent = pte_modify(ptent, newprot);
+
 			/*
 			 * Avoid taking write faults for pages we know to be
 			 * dirty.
 			 */
 			if (dirty_accountable && pte_dirty(ptent))
 				ptent = pte_mkwrite(ptent);
-			set_pte_at(mm, addr, pte, ptent);
+
+			ptep_modify_prot_commit(mm, addr, pte, ptent);
 #ifdef CONFIG_MIGRATION
 		} else if (!pte_file(oldpte)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
-- 
cgit v1.2.3-59-g8ed1b


From 38510754a50192a072210e24fdc4ae65592182f0 Mon Sep 17 00:00:00 2001
From: Haavard Skinnemoen <haavard.skinnemoen@atmel.com>
Date: Mon, 14 Jan 2008 23:35:32 +0100
Subject: avr32: Use a quicklist for PTE allocation as well

Using a quicklist to allocate PTEs might be slightly faster than using
the page allocator directly since we might avoid zeroing the page
after each allocation.

Signed-off-by: Haavard Skinnemoen <haavard.skinnemoen@atmel.com>
---
 include/asm-avr32/pgalloc.h | 28 +++++++++++++++-------------
 mm/Kconfig                  |  2 +-
 2 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/include/asm-avr32/pgalloc.h b/include/asm-avr32/pgalloc.h
index a291f59659cf..640821323943 100644
--- a/include/asm-avr32/pgalloc.h
+++ b/include/asm-avr32/pgalloc.h
@@ -13,6 +13,7 @@
 #include <asm/pgtable.h>
 
 #define QUICK_PGD	0	/* Preserve kernel mappings over free */
+#define QUICK_PT	1	/* Zero on free */
 
 static inline void pmd_populate_kernel(struct mm_struct *mm,
 				       pmd_t *pmd, pte_t *pte)
@@ -52,34 +53,34 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
 					  unsigned long address)
 {
-	pte_t *pte;
-
-	pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_REPEAT);
-
-	return pte;
+	return quicklist_alloc(QUICK_PT, GFP_KERNEL | __GFP_REPEAT, NULL);
 }
 
-static inline struct page *pte_alloc_one(struct mm_struct *mm,
+static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
 					 unsigned long address)
 {
-	struct page *pte;
+	struct page *page;
+	void *pg;
 
-	pte = alloc_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO);
-	if (!pte)
+	pg = quicklist_alloc(QUICK_PT, GFP_KERNEL | __GFP_REPEAT, NULL);
+	if (!pg)
 		return NULL;
-	pgtable_page_ctor(pte);
-	return pte;
+
+	page = virt_to_page(pg);
+	pgtable_page_ctor(page);
+
+	return page;
 }
 
 static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
-	free_page((unsigned long)pte);
+	quicklist_free(QUICK_PT, NULL, pte);
 }
 
 static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
 {
 	pgtable_page_dtor(pte);
-	__free_page(pte);
+	quicklist_free_page(QUICK_PT, NULL, pte);
 }
 
 #define __pte_free_tlb(tlb,pte)				\
@@ -91,6 +92,7 @@ do {							\
 static inline void check_pgt_cache(void)
 {
 	quicklist_trim(QUICK_PGD, NULL, 25, 16);
+	quicklist_trim(QUICK_PT, NULL, 25, 16);
 }
 
 #endif /* __ASM_AVR32_PGALLOC_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 3aa819d628c1..60b261952783 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -199,7 +199,7 @@ config BOUNCE
 config NR_QUICK
 	int
 	depends on QUICKLIST
-	default "2" if SUPERH
+	default "2" if SUPERH || AVR32
 	default "1"
 
 config VIRT_TO_BUS
-- 
cgit v1.2.3-59-g8ed1b


From cc1050bafebfb1d7935331282e948b5016318192 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Fri, 13 Jun 2008 19:08:52 -0700
Subject: x86: replace shrink_active_range() with remove_active_range()

in case we have kva before ramdisk on a node, we still need to use
those ranges.

v2: reserve_early the kva ram area, in case there are holes in highmem, so that
    the area is not treated as free high pages.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/discontig_32.c | 45 ++++++++++++++++++++++++---------------------
 include/linux/mm.h         |  3 ++-
 mm/page_alloc.c            | 29 +++++++++++++++++++++++------
 3 files changed, 49 insertions(+), 28 deletions(-)

(limited to 'mm')

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index accc7c6c57fc..c3f119e99e0d 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -230,8 +230,8 @@ static unsigned long calculate_numa_remap_pages(void)
 	unsigned long size, reserve_pages = 0;
 
 	for_each_online_node(nid) {
-		u64 node_end_target;
-		u64 node_end_final;
+		u64 node_kva_target;
+		u64 node_kva_final;
 
 		/*
 		 * The acpi/srat node info can show hot-add memroy zones
@@ -254,42 +254,45 @@ static unsigned long calculate_numa_remap_pages(void)
 		/* now the roundup is correct, convert to PAGE_SIZE pages */
 		size = size * PTRS_PER_PTE;
 
-		node_end_target = round_down(node_end_pfn[nid] - size,
+		node_kva_target = round_down(node_end_pfn[nid] - size,
 						 PTRS_PER_PTE);
-		node_end_target <<= PAGE_SHIFT;
+		node_kva_target <<= PAGE_SHIFT;
 		do {
-			node_end_final = find_e820_area(node_end_target,
+			node_kva_final = find_e820_area(node_kva_target,
 					((u64)node_end_pfn[nid])<<PAGE_SHIFT,
 						((u64)size)<<PAGE_SHIFT,
 						LARGE_PAGE_BYTES);
-			node_end_target -= LARGE_PAGE_BYTES;
-		} while (node_end_final == -1ULL &&
-			 (node_end_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
+			node_kva_target -= LARGE_PAGE_BYTES;
+		} while (node_kva_final == -1ULL &&
+			 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
 
-		if (node_end_final == -1ULL)
+		if (node_kva_final == -1ULL)
 			panic("Can not get kva ram\n");
 
-		printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
-				size, nid);
 		node_remap_size[nid] = size;
 		node_remap_offset[nid] = reserve_pages;
 		reserve_pages += size;
-		printk("Shrinking node %d from %ld pages to %lld pages\n",
-			nid, node_end_pfn[nid], node_end_final>>PAGE_SHIFT);
+		printk("Reserving %ld pages of KVA for lmem_map of node %d at %llx\n",
+				size, nid, node_kva_final>>PAGE_SHIFT);
 
 		/*
 		 *  prevent kva address below max_low_pfn want it on system
 		 *  with less memory later.
 		 *  layout will be: KVA address , KVA RAM
+		 *
+		 *  we are supposed to only record the one less than max_low_pfn
+		 *  but we could have some hole in high memory, and it will only
+		 *  check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
+		 *  to use it as free.
+		 *  So reserve_early here, hope we don't run out of that array
 		 */
-		if ((node_end_final>>PAGE_SHIFT) < max_low_pfn)
-			reserve_early(node_end_final,
-				      node_end_final+(((u64)size)<<PAGE_SHIFT),
-				      "KVA RAM");
-
-		node_end_pfn[nid] = node_end_final>>PAGE_SHIFT;
-		node_remap_start_pfn[nid] = node_end_pfn[nid];
-		shrink_active_range(nid, node_end_pfn[nid]);
+		reserve_early(node_kva_final,
+			      node_kva_final+(((u64)size)<<PAGE_SHIFT),
+			      "KVA RAM");
+
+		node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
+		remove_active_range(nid, node_remap_start_pfn[nid],
+					 node_remap_start_pfn[nid] + size);
 	}
 	printk("Reserving total of %ld pages for numa KVA remap\n",
 			reserve_pages);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ce8e397a61f6..034a3156d2f0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -998,7 +998,8 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat,
 extern void free_area_init_nodes(unsigned long *max_zone_pfn);
 extern void add_active_range(unsigned int nid, unsigned long start_pfn,
 					unsigned long end_pfn);
-extern void shrink_active_range(unsigned int nid, unsigned long new_end_pfn);
+extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
+					unsigned long end_pfn);
 extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn,
 					unsigned long end_pfn);
 extern void remove_all_active_ranges(void);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index eee5ba7509c1..d80e1868e570 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3552,30 +3552,47 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
 }
 
 /**
- * shrink_active_range - Shrink an existing registered range of PFNs
+ * remove_active_range - Shrink an existing registered range of PFNs
  * @nid: The node id the range is on that should be shrunk
- * @new_end_pfn: The new PFN of the range
+ * @start_pfn: The start PFN of the range to remove
+ * @end_pfn: The end PFN of the range to remove
  *
  * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
  * The map is kept near the end physical page range that has already been
  * registered. This function allows an arch to shrink an existing registered
  * range.
  */
-void __init shrink_active_range(unsigned int nid, unsigned long new_end_pfn)
+void __init remove_active_range(unsigned int nid, unsigned long start_pfn,
+				unsigned long end_pfn)
 {
 	int i, j;
 	int removed = 0;
 
+	printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n",
+			  nid, start_pfn, end_pfn);
+
 	/* Find the old active region end and shrink */
 	for_each_active_range_index_in_nid(i, nid) {
-		if (early_node_map[i].start_pfn >= new_end_pfn) {
+		if (early_node_map[i].start_pfn >= start_pfn &&
+		    early_node_map[i].end_pfn <= end_pfn) {
 			/* clear it */
+			early_node_map[i].start_pfn = 0;
 			early_node_map[i].end_pfn = 0;
 			removed = 1;
 			continue;
 		}
-		if (early_node_map[i].end_pfn > new_end_pfn) {
-			early_node_map[i].end_pfn = new_end_pfn;
+		if (early_node_map[i].start_pfn < start_pfn &&
+		    early_node_map[i].end_pfn > start_pfn) {
+			unsigned long temp_end_pfn = early_node_map[i].end_pfn;
+			early_node_map[i].end_pfn = start_pfn;
+			if (temp_end_pfn > end_pfn)
+				add_active_range(nid, end_pfn, temp_end_pfn);
+			continue;
+		}
+		if (early_node_map[i].start_pfn >= start_pfn &&
+		    early_node_map[i].end_pfn > end_pfn &&
+		    early_node_map[i].start_pfn < end_pfn) {
+			early_node_map[i].start_pfn = end_pfn;
 			continue;
 		}
 	}
-- 
cgit v1.2.3-59-g8ed1b


From b5bc6c0e55000dab86b73f838f5ad02908b23755 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 14 Jun 2008 18:32:52 -0700
Subject: x86, mm: use add_highpages_with_active_regions() for high pages init
 v2

use early_node_map to init high pages, so we can remove page_is_ram() and
page_is_reserved_early() in the big loop with add_one_highpage

also remove page_is_reserved_early(), it is not needed anymore.

v2: fix the build of other platforms

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c     | 11 --------
 arch/x86/mm/discontig_32.c | 19 ++++++--------
 arch/x86/mm/init_32.c      | 62 ++++++++++++++++++++++++++++++++++++++--------
 include/asm-x86/e820.h     |  1 -
 include/asm-x86/highmem.h  |  3 +++
 include/linux/mm.h         |  2 ++
 mm/page_alloc.c            |  8 ++++++
 7 files changed, 71 insertions(+), 35 deletions(-)

(limited to 'mm')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 5051ce744b4e..ed46b7a6bc13 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -612,17 +612,6 @@ void __init free_early(u64 start, u64 end)
 	early_res[j - 1].end = 0;
 }
 
-int __init page_is_reserved_early(unsigned long pagenr)
-{
-	u64 start = (u64)pagenr << PAGE_SHIFT;
-	int i;
-	struct early_res *r;
-
-	i = find_overlapped_early(start, start + PAGE_SIZE);
-	r = &early_res[i];
-	return (i < MAX_EARLY_RES && r->end);
-}
-
 void __init early_res_to_bootmem(u64 start, u64 end)
 {
 	int i;
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index c3f119e99e0d..7c4d0255f8d8 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -100,7 +100,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
 #endif
 
 extern unsigned long find_max_low_pfn(void);
-extern void add_one_highpage_init(struct page *, int, int);
 extern unsigned long highend_pfn, highstart_pfn;
 
 #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
@@ -432,10 +431,10 @@ void __init set_highmem_pages_init(int bad_ppro)
 {
 #ifdef CONFIG_HIGHMEM
 	struct zone *zone;
-	struct page *page;
+	int nid;
 
 	for_each_zone(zone) {
-		unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
+		unsigned long zone_start_pfn, zone_end_pfn;
 
 		if (!is_highmem(zone))
 			continue;
@@ -443,16 +442,12 @@ void __init set_highmem_pages_init(int bad_ppro)
 		zone_start_pfn = zone->zone_start_pfn;
 		zone_end_pfn = zone_start_pfn + zone->spanned_pages;
 
+		nid = zone_to_nid(zone);
 		printk("Initializing %s for node %d (%08lx:%08lx)\n",
-				zone->name, zone_to_nid(zone),
-				zone_start_pfn, zone_end_pfn);
-
-		for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
-			if (!pfn_valid(node_pfn))
-				continue;
-			page = pfn_to_page(node_pfn);
-			add_one_highpage_init(page, node_pfn, bad_ppro);
-		}
+				zone->name, nid, zone_start_pfn, zone_end_pfn);
+
+		add_highpages_with_active_regions(nid, zone_start_pfn,
+				 zone_end_pfn, bad_ppro);
 	}
 	totalram_pages += totalhigh_pages;
 #endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index abadb1da70df..ba07a489230e 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -287,10 +287,10 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 	pkmap_page_table = pte;
 }
 
-void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
+static void __init
+add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
 {
-	if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn)) &&
-	    !page_is_reserved_early(pfn)) {
+	if (!(bad_ppro && page_kills_ppro(pfn))) {
 		ClearPageReserved(page);
 		init_page_count(page);
 		__free_page(page);
@@ -299,18 +299,58 @@ void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
 		SetPageReserved(page);
 }
 
+struct add_highpages_data {
+	unsigned long start_pfn;
+	unsigned long end_pfn;
+	int bad_ppro;
+};
+
+static void __init add_highpages_work_fn(unsigned long start_pfn,
+					 unsigned long end_pfn, void *datax)
+{
+	int node_pfn;
+	struct page *page;
+	unsigned long final_start_pfn, final_end_pfn;
+	struct add_highpages_data *data;
+	int bad_ppro;
+
+	data = (struct add_highpages_data *)datax;
+	bad_ppro = data->bad_ppro;
+
+	final_start_pfn = max(start_pfn, data->start_pfn);
+	final_end_pfn = min(end_pfn, data->end_pfn);
+	if (final_start_pfn >= final_end_pfn)
+		return;
+
+	for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
+	     node_pfn++) {
+		if (!pfn_valid(node_pfn))
+			continue;
+		page = pfn_to_page(node_pfn);
+		add_one_highpage_init(page, node_pfn, bad_ppro);
+	}
+
+}
+
+void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
+					      unsigned long end_pfn,
+					      int bad_ppro)
+{
+	struct add_highpages_data data;
+
+	data.start_pfn = start_pfn;
+	data.end_pfn = end_pfn;
+	data.bad_ppro = bad_ppro;
+
+	work_with_active_regions(nid, add_highpages_work_fn, &data);
+}
+
 #ifndef CONFIG_NUMA
 static void __init set_highmem_pages_init(int bad_ppro)
 {
-	int pfn;
+	add_highpages_with_active_regions(0, highstart_pfn, highend_pfn,
+						bad_ppro);
 
-	for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
-		/*
-		 * Holes under sparsemem might not have no mem_map[]:
-		 */
-		if (pfn_valid(pfn))
-			add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
-	}
 	totalram_pages += totalhigh_pages;
 }
 #endif /* !CONFIG_NUMA */
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 6b0ce745a60c..55d310596907 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -86,7 +86,6 @@ extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
 extern void reserve_early(u64 start, u64 end, char *name);
 extern void free_early(u64 start, u64 end);
 extern void early_res_to_bootmem(u64 start, u64 end);
-extern int page_is_reserved_early(unsigned long pagenr);
 extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
 
 extern unsigned long e820_end_of_ram(void);
diff --git a/include/asm-x86/highmem.h b/include/asm-x86/highmem.h
index e153f3b44774..85c4fea41ff6 100644
--- a/include/asm-x86/highmem.h
+++ b/include/asm-x86/highmem.h
@@ -74,6 +74,9 @@ struct page *kmap_atomic_to_page(void *ptr);
 
 #define flush_cache_kmaps()	do { } while (0)
 
+extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
+					unsigned long end_pfn, int bad_ppro);
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_HIGHMEM_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 034a3156d2f0..e4de460907c1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1011,6 +1011,8 @@ extern unsigned long find_min_pfn_with_active_regions(void);
 extern unsigned long find_max_pfn_with_active_regions(void);
 extern void free_bootmem_with_active_regions(int nid,
 						unsigned long max_low_pfn);
+typedef void (*work_fn_t)(unsigned long, unsigned long, void *);
+extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data);
 extern void sparse_memory_present_with_active_regions(int nid);
 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
 extern int early_pfn_to_nid(unsigned long pfn);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d80e1868e570..41c6e3aa059f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2929,6 +2929,14 @@ void __init free_bootmem_with_active_regions(int nid,
 	}
 }
 
+void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
+{
+	int i;
+
+	for_each_active_range_index_in_nid(i, nid)
+		work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn,
+			data);
+}
 /**
  * sparse_memory_present_with_active_regions - Call memory_present for each active range
  * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
-- 
cgit v1.2.3-59-g8ed1b


From d52d53b8a5b258bfaab9223a5e7284fcfdd48577 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Mon, 16 Jun 2008 20:10:55 -0700
Subject: RFC x86: try to remove arch_get_ram_range

want to remove arch_get_ram_range, and use early_node_map instead.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_32.c     |  6 ++++--
 drivers/pci/intel-iommu.c | 51 ++++++++++++++++++++++++++++++++++-------------
 include/linux/mm.h        |  2 +-
 mm/page_alloc.c           | 10 +++++++---
 4 files changed, 49 insertions(+), 20 deletions(-)

(limited to 'mm')

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 65d55056b6e7..a0484adbf59d 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -298,7 +298,7 @@ struct add_highpages_data {
 	unsigned long end_pfn;
 };
 
-static void __init add_highpages_work_fn(unsigned long start_pfn,
+static int __init add_highpages_work_fn(unsigned long start_pfn,
 					 unsigned long end_pfn, void *datax)
 {
 	int node_pfn;
@@ -311,7 +311,7 @@ static void __init add_highpages_work_fn(unsigned long start_pfn,
 	final_start_pfn = max(start_pfn, data->start_pfn);
 	final_end_pfn = min(end_pfn, data->end_pfn);
 	if (final_start_pfn >= final_end_pfn)
-		return;
+		return 0;
 
 	for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
 	     node_pfn++) {
@@ -321,6 +321,8 @@ static void __init add_highpages_work_fn(unsigned long start_pfn,
 		add_one_highpage_init(page, node_pfn);
 	}
 
+	return 0;
+
 }
 
 void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 66c0fd21894b..bb0642318a95 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -1637,12 +1637,43 @@ static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
 }
 
 #ifdef CONFIG_DMAR_GFX_WA
-extern int arch_get_ram_range(int slot, u64 *addr, u64 *size);
+struct iommu_prepare_data {
+	struct pci_dev *pdev;
+	int ret;
+};
+
+static int __init iommu_prepare_work_fn(unsigned long start_pfn,
+					 unsigned long end_pfn, void *datax)
+{
+	struct iommu_prepare_data *data;
+
+	data = (struct iommu_prepare_data *)datax;
+
+	data->ret = iommu_prepare_identity_map(data->pdev,
+				start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+	return data->ret;
+
+}
+
+static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
+{
+	int nid;
+	struct iommu_prepare_data data;
+
+	data.pdev = pdev;
+	data.ret = 0;
+
+	for_each_online_node(nid) {
+		work_with_active_regions(nid, iommu_prepare_work_fn, &data);
+		if (data.ret)
+			return data.ret;
+	}
+	return data.ret;
+}
+
 static void __init iommu_prepare_gfx_mapping(void)
 {
 	struct pci_dev *pdev = NULL;
-	u64 base, size;
-	int slot;
 	int ret;
 
 	for_each_pci_dev(pdev) {
@@ -1651,17 +1682,9 @@ static void __init iommu_prepare_gfx_mapping(void)
 			continue;
 		printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
 			pci_name(pdev));
-		slot = arch_get_ram_range(0, &base, &size);
-		while (slot >= 0) {
-			ret = iommu_prepare_identity_map(pdev,
-					base, base + size);
-			if (ret)
-				goto error;
-			slot = arch_get_ram_range(slot, &base, &size);
-		}
-		continue;
-error:
-		printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
+		ret = iommu_prepare_with_active_regions(pdev);
+		if (ret)
+			printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
 	}
 }
 #endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3d647b24041f..cf1cd3a2ed78 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1011,7 +1011,7 @@ extern unsigned long find_min_pfn_with_active_regions(void);
 extern unsigned long find_max_pfn_with_active_regions(void);
 extern void free_bootmem_with_active_regions(int nid,
 						unsigned long max_low_pfn);
-typedef void (*work_fn_t)(unsigned long, unsigned long, void *);
+typedef int (*work_fn_t)(unsigned long, unsigned long, void *);
 extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data);
 extern void sparse_memory_present_with_active_regions(int nid);
 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 41c6e3aa059f..e25b6b24f844 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2932,10 +2932,14 @@ void __init free_bootmem_with_active_regions(int nid,
 void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
 {
 	int i;
+	int ret;
 
-	for_each_active_range_index_in_nid(i, nid)
-		work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn,
-			data);
+	for_each_active_range_index_in_nid(i, nid) {
+		ret = work_fn(early_node_map[i].start_pfn,
+			      early_node_map[i].end_pfn, data);
+		if (ret)
+			break;
+	}
 }
 /**
  * sparse_memory_present_with_active_regions - Call memory_present for each active range
-- 
cgit v1.2.3-59-g8ed1b


From e2fc252e0ce695b4c4abe27bb073c35bd0d73252 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 22 Jun 2008 07:22:12 -0700
Subject: x86 boot: show pfn addresses in hex not decimal in some kernel info
 printks

Page frame numbers (the portion of physical addresses above the low
order page offsets) are displayed in several kernel debug and info
prints in decimal, not hex.  Decimal addresses are unreadable.  Use hex.

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: "Yinghai Lu" <yhlu.kernel@gmail.com>
Cc: "Jack Steiner" <steiner@sgi.com>
Cc: "Mike Travis" <travis@sgi.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: "Andi Kleen" <andi@firstfloor.org>
Cc: "Andrew Morton" <akpm@linux-foundation.org>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 2 +-
 mm/page_alloc.c        | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 600c9de237a0..512f779fc6af 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -990,7 +990,7 @@ unsigned long __init e820_end_of_ram(void)
 	if (last_pfn > end_user_pfn)
 		last_pfn = end_user_pfn;
 
-	printk(KERN_INFO "last_pfn = %lu max_arch_pfn = %lu\n",
+	printk(KERN_INFO "last_pfn = 0x%lx max_arch_pfn = 0x%lx\n",
 			 last_pfn, max_arch_pfn);
 	return last_pfn;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e25b6b24f844..c09c2c8d2c6a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3520,7 +3520,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
 {
 	int i;
 
-	printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
+	printk(KERN_DEBUG "Entering add_active_range(%d, 0x%lx, 0x%lx) "
 			  "%d entries of %d used\n",
 			  nid, start_pfn, end_pfn,
 			  nr_nodemap_entries, MAX_ACTIVE_REGIONS);
@@ -3936,7 +3936,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 	for (i = 0; i < MAX_NR_ZONES; i++) {
 		if (i == ZONE_MOVABLE)
 			continue;
-		printk("  %-8s %8lu -> %8lu\n",
+		printk("  %-8s 0x%8lx -> 0x%8lx\n",
 				zone_names[i],
 				arch_zone_lowest_possible_pfn[i],
 				arch_zone_highest_possible_pfn[i]);
@@ -3952,7 +3952,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 	/* Print out the early_node_map[] */
 	printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
 	for (i = 0; i < nr_nodemap_entries; i++)
-		printk("  %3d: %8lu -> %8lu\n", early_node_map[i].nid,
+		printk("  %3d: 0x%8lx -> 0x%8lx\n", early_node_map[i].nid,
 						early_node_map[i].start_pfn,
 						early_node_map[i].end_pfn);
 
-- 
cgit v1.2.3-59-g8ed1b


From 2bc0d2615a15a93d344abbe8cb1b9056122bce9d Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 22 Jun 2008 07:22:17 -0700
Subject: x86 boot: more consistently use type int for node ids

Everywhere I look, node id's are of type 'int', except in this one
case, which has 'unsigned long'.  Change this one to 'int' as well.
There is nothing special about the way this variable 'nid' is used in
this routine to justify using an unusual type here.

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: "Yinghai Lu" <yhlu.kernel@gmail.com>
Cc: "Jack Steiner" <steiner@sgi.com>
Cc: "Mike Travis" <travis@sgi.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: "Andi Kleen" <andi@firstfloor.org>
Cc: "Andrew Morton" <akpm@linux-foundation.org>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 mm/page_alloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c09c2c8d2c6a..b604c64a0337 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3669,7 +3669,7 @@ static void __init sort_node_map(void)
 }
 
 /* Find the lowest pfn for a node */
-unsigned long __init find_min_pfn_for_node(unsigned long nid)
+unsigned long __init find_min_pfn_for_node(int nid)
 {
 	int i;
 	unsigned long min_pfn = ULONG_MAX;
@@ -3680,7 +3680,7 @@ unsigned long __init find_min_pfn_for_node(unsigned long nid)
 
 	if (min_pfn == ULONG_MAX) {
 		printk(KERN_WARNING
-			"Could not find start_pfn for node %lu\n", nid);
+			"Could not find start_pfn for node %d\n", nid);
 		return 0;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 5dab8ec139be215fbaba216fb4aea914d0f4dac5 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 25 Jun 2008 05:44:40 -0700
Subject: mm, generic, x86 boot: more tweaks to hex prints of some pfn
 addresses

Fix some problems with a previous patch (this patch applies on top of it):
  x86 boot: show pfn addresses in hex not decimal in some kernel info printks

Primarily change "0x%8lx" format, which displays with a right aligned
space filled hex number (spaces between the "0x" prefix and the number),
into "%0#10lx" format, which zero fills instead of space fills, and
which uses the printf flag '#' to request the "0x" prefix instead of
hard coding it.

Also replace some other "0x%lx" formats with "%#lx", making use of the
'#' printf flag again.

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: "Yinghai Lu" <yhlu.kernel@gmail.com>
Cc: "Jack Steiner" <steiner@sgi.com>
Cc: "Mike Travis" <travis@sgi.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: "Andi Kleen" <andi@firstfloor.org>
Cc: "Andrew Morton" <akpm@linux-foundation.org>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 2 +-
 mm/page_alloc.c        | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 22cfd665224c..1dcb66533dfc 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1014,7 +1014,7 @@ unsigned long __init e820_end_of_ram(void)
 	if (last_pfn > end_user_pfn)
 		last_pfn = end_user_pfn;
 
-	printk(KERN_INFO "last_pfn = 0x%lx max_arch_pfn = 0x%lx\n",
+	printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
 			 last_pfn, max_arch_pfn);
 	return last_pfn;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b604c64a0337..f024b9b3a2a6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3520,7 +3520,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
 {
 	int i;
 
-	printk(KERN_DEBUG "Entering add_active_range(%d, 0x%lx, 0x%lx) "
+	printk(KERN_DEBUG "Entering add_active_range(%d, %#lx, %#lx) "
 			  "%d entries of %d used\n",
 			  nid, start_pfn, end_pfn,
 			  nr_nodemap_entries, MAX_ACTIVE_REGIONS);
@@ -3936,7 +3936,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 	for (i = 0; i < MAX_NR_ZONES; i++) {
 		if (i == ZONE_MOVABLE)
 			continue;
-		printk("  %-8s 0x%8lx -> 0x%8lx\n",
+		printk("  %-8s %0#10lx -> %0#10lx\n",
 				zone_names[i],
 				arch_zone_lowest_possible_pfn[i],
 				arch_zone_highest_possible_pfn[i]);
@@ -3952,7 +3952,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 	/* Print out the early_node_map[] */
 	printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
 	for (i = 0; i < nr_nodemap_entries; i++)
-		printk("  %3d: 0x%8lx -> 0x%8lx\n", early_node_map[i].nid,
+		printk("  %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
 						early_node_map[i].start_pfn,
 						early_node_map[i].end_pfn);
 
-- 
cgit v1.2.3-59-g8ed1b


From 421c175c4d609864350df495b34d3e99f9fb1bdd Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 14 Jul 2008 09:59:18 +0200
Subject: [S390] Add support for memory hot-add.

Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/Kconfig            |  4 ++++
 arch/s390/mm/init.c          | 19 +++++++++++++++++++
 include/asm-s390/sparsemem.h |  4 ++--
 mm/Kconfig                   |  2 +-
 4 files changed, 26 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 6d0d31651f05..5dc8f8028d52 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -313,6 +313,10 @@ config ARCH_SPARSEMEM_DEFAULT
 config ARCH_SELECT_MEMORY_MODEL
        def_bool y
 
+config ARCH_ENABLE_MEMORY_HOTPLUG
+	def_bool y
+	depends on SPARSEMEM
+
 source "mm/Kconfig"
 
 comment "I/O subsystem configuration"
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 05598649b326..388cc7420055 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -202,3 +202,22 @@ void free_initrd_mem(unsigned long start, unsigned long end)
         }
 }
 #endif
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int arch_add_memory(int nid, u64 start, u64 size)
+{
+	struct pglist_data *pgdat;
+	struct zone *zone;
+	int rc;
+
+	pgdat = NODE_DATA(nid);
+	zone = pgdat->node_zones + ZONE_NORMAL;
+	rc = vmem_add_mapping(start, size);
+	if (rc)
+		return rc;
+	rc = __add_pages(zone, PFN_DOWN(start), PFN_DOWN(size));
+	if (rc)
+		vmem_remove_mapping(start, size);
+	return rc;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/include/asm-s390/sparsemem.h b/include/asm-s390/sparsemem.h
index 06dfdab6c0e8..545d219e6a2d 100644
--- a/include/asm-s390/sparsemem.h
+++ b/include/asm-s390/sparsemem.h
@@ -1,15 +1,15 @@
 #ifndef _ASM_S390_SPARSEMEM_H
 #define _ASM_S390_SPARSEMEM_H
 
-#define SECTION_SIZE_BITS	25
-
 #ifdef CONFIG_64BIT
 
+#define SECTION_SIZE_BITS	28
 #define MAX_PHYSADDR_BITS	42
 #define MAX_PHYSMEM_BITS	42
 
 #else
 
+#define SECTION_SIZE_BITS	25
 #define MAX_PHYSADDR_BITS	31
 #define MAX_PHYSMEM_BITS	31
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 3aa819d628c1..4242743b981b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -129,7 +129,7 @@ config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
 	depends on SPARSEMEM || X86_64_ACPI_NUMA
 	depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG
-	depends on (IA64 || X86 || PPC64 || SUPERH)
+	depends on (IA64 || X86 || PPC64 || SUPERH || S390)
 
 comment "Memory hotplug is currently incompatible with Software Suspend"
 	depends on SPARSEMEM && HOTPLUG && HIBERNATION
-- 
cgit v1.2.3-59-g8ed1b


From 7daf705f362e349983e92037a198b8821db198af Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 14 Jul 2008 12:12:53 -0700
Subject: Start using the new '%pS' infrastructure to print symbols

This simplifies the code significantly, and was the whole point of the
exercise.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/traps_64.c | 25 +------------------------
 mm/slub.c                  |  5 ++---
 2 files changed, 3 insertions(+), 27 deletions(-)

(limited to 'mm')

diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index adff76ea97c4..f1a95d105953 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -104,30 +104,7 @@ int kstack_depth_to_print = 12;
 
 void printk_address(unsigned long address, int reliable)
 {
-#ifdef CONFIG_KALLSYMS
-	unsigned long offset = 0, symsize;
-	const char *symname;
-	char *modname;
-	char *delim = ":";
-	char namebuf[KSYM_NAME_LEN];
-	char reliab[4] = "";
-
-	symname = kallsyms_lookup(address, &symsize, &offset,
-					&modname, namebuf);
-	if (!symname) {
-		printk(" [<%016lx>]\n", address);
-		return;
-	}
-	if (!reliable)
-		strcpy(reliab, "? ");
-
-	if (!modname)
-		modname = delim = "";
-	printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
-		address, reliab, delim, modname, delim, symname, offset, symsize);
-#else
-	printk(" [<%016lx>]\n", address);
-#endif
+	printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
 }
 
 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
diff --git a/mm/slub.c b/mm/slub.c
index 315c392253c7..5f6e2c4a2ba7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -431,9 +431,8 @@ static void print_track(const char *s, struct track *t)
 	if (!t->addr)
 		return;
 
-	printk(KERN_ERR "INFO: %s in ", s);
-	__print_symbol("%s", (unsigned long)t->addr);
-	printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
+	printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
+		s, t->addr, jiffies - t->when, t->cpu, t->pid);
 }
 
 static void print_tracking(struct kmem_cache *s, void *object)
-- 
cgit v1.2.3-59-g8ed1b