aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/init.c469
-rw-r--r--arch/x86/mm/init_32.c118
-rw-r--r--arch/x86/mm/init_64.c652
-rw-r--r--arch/x86/mm/mm_internal.h19
-rw-r--r--arch/x86/mm/numa.c43
-rw-r--r--arch/x86/mm/numa_32.c161
-rw-r--r--arch/x86/mm/numa_64.c13
-rw-r--r--arch/x86/mm/numa_internal.h6
-rw-r--r--arch/x86/mm/pageattr.c113
-rw-r--r--arch/x86/mm/pat.c4
-rw-r--r--arch/x86/mm/pgtable.c7
-rw-r--r--arch/x86/mm/physaddr.c60
-rw-r--r--arch/x86/mm/srat.c125
13 files changed, 1206 insertions, 584 deletions
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index d7aea41563b3..4903a03ae876 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -16,87 +16,134 @@
#include <asm/tlb.h>
#include <asm/proto.h>
#include <asm/dma.h> /* for MAX_DMA_PFN */
+#include <asm/microcode.h>
-unsigned long __initdata pgt_buf_start;
-unsigned long __meminitdata pgt_buf_end;
-unsigned long __meminitdata pgt_buf_top;
+#include "mm_internal.h"
-int after_bootmem;
+static unsigned long __initdata pgt_buf_start;
+static unsigned long __initdata pgt_buf_end;
+static unsigned long __initdata pgt_buf_top;
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
- = 1
-#endif
-;
+static unsigned long min_pfn_mapped;
-struct map_range {
- unsigned long start;
- unsigned long end;
- unsigned page_size_mask;
-};
+static bool __initdata can_use_brk_pgt = true;
/*
- * First calculate space needed for kernel direct mapping page tables to cover
- * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB
- * pages. Then find enough contiguous space for those page tables.
+ * Pages returned are already directly mapped.
+ *
+ * Changing that is likely to break Xen, see commit:
+ *
+ * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
+ *
+ * for detailed information.
*/
-static void __init find_early_table_space(struct map_range *mr, int nr_range)
+__ref void *alloc_low_pages(unsigned int num)
{
+ unsigned long pfn;
int i;
- unsigned long puds = 0, pmds = 0, ptes = 0, tables;
- unsigned long start = 0, good_end;
- phys_addr_t base;
- for (i = 0; i < nr_range; i++) {
- unsigned long range, extra;
+ if (after_bootmem) {
+ unsigned int order;
- range = mr[i].end - mr[i].start;
- puds += (range + PUD_SIZE - 1) >> PUD_SHIFT;
+ order = get_order((unsigned long)num << PAGE_SHIFT);
+ return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
+ __GFP_ZERO, order);
+ }
- if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) {
- extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT);
- pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT;
- } else {
- pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT;
- }
+ if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
+ unsigned long ret;
+ if (min_pfn_mapped >= max_pfn_mapped)
+ panic("alloc_low_page: ran out of memory");
+ ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+ max_pfn_mapped << PAGE_SHIFT,
+ PAGE_SIZE * num , PAGE_SIZE);
+ if (!ret)
+ panic("alloc_low_page: can not alloc memory");
+ memblock_reserve(ret, PAGE_SIZE * num);
+ pfn = ret >> PAGE_SHIFT;
+ } else {
+ pfn = pgt_buf_end;
+ pgt_buf_end += num;
+ printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
+ pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
+ }
- if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) {
- extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT);
-#ifdef CONFIG_X86_32
- extra += PMD_SIZE;
-#endif
- ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
- } else {
- ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT;
- }
+ for (i = 0; i < num; i++) {
+ void *adr;
+
+ adr = __va((pfn + i) << PAGE_SHIFT);
+ clear_page(adr);
}
- tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
- tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
- tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+ return __va(pfn << PAGE_SHIFT);
+}
-#ifdef CONFIG_X86_32
- /* for fixmap */
- tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-#endif
- good_end = max_pfn_mapped << PAGE_SHIFT;
+/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */
+#define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE)
+RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
+void __init early_alloc_pgt_buf(void)
+{
+ unsigned long tables = INIT_PGT_BUF_SIZE;
+ phys_addr_t base;
- base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
- if (!base)
- panic("Cannot find space for the kernel page tables");
+ base = __pa(extend_brk(tables, PAGE_SIZE));
pgt_buf_start = base >> PAGE_SHIFT;
pgt_buf_end = pgt_buf_start;
pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
+}
- printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n",
- mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT,
- (pgt_buf_top << PAGE_SHIFT) - 1);
+int after_bootmem;
+
+int direct_gbpages
+#ifdef CONFIG_DIRECT_GBPAGES
+ = 1
+#endif
+;
+
+static void __init init_gbpages(void)
+{
+#ifdef CONFIG_X86_64
+ if (direct_gbpages && cpu_has_gbpages)
+ printk(KERN_INFO "Using GB pages for direct mapping\n");
+ else
+ direct_gbpages = 0;
+#endif
}
-void __init native_pagetable_reserve(u64 start, u64 end)
+struct map_range {
+ unsigned long start;
+ unsigned long end;
+ unsigned page_size_mask;
+};
+
+static int page_size_mask;
+
+static void __init probe_page_size_mask(void)
{
- memblock_reserve(start, end - start);
+ init_gbpages();
+
+#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
+ /*
+ * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * This will simplify cpa(), which otherwise needs to support splitting
+ * large pages into small in interrupt context, etc.
+ */
+ if (direct_gbpages)
+ page_size_mask |= 1 << PG_LEVEL_1G;
+ if (cpu_has_pse)
+ page_size_mask |= 1 << PG_LEVEL_2M;
+#endif
+
+ /* Enable PSE if available */
+ if (cpu_has_pse)
+ set_in_cr4(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+ if (cpu_has_pge) {
+ set_in_cr4(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ }
}
#ifdef CONFIG_X86_32
@@ -122,58 +169,51 @@ static int __meminit save_mr(struct map_range *mr, int nr_range,
}
/*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
+ * adjust the page_size_mask for small range to go with
+ * big page size instead small one if nearby are ram too.
*/
-unsigned long __init_refok init_memory_mapping(unsigned long start,
- unsigned long end)
+static void __init_refok adjust_range_page_size_mask(struct map_range *mr,
+ int nr_range)
{
- unsigned long page_size_mask = 0;
- unsigned long start_pfn, end_pfn;
- unsigned long ret = 0;
- unsigned long pos;
-
- struct map_range mr[NR_RANGE_MR];
- int nr_range, i;
- int use_pse, use_gbpages;
+ int i;
- printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n",
- start, end - 1);
+ for (i = 0; i < nr_range; i++) {
+ if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
+ !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
+ unsigned long start = round_down(mr[i].start, PMD_SIZE);
+ unsigned long end = round_up(mr[i].end, PMD_SIZE);
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
- /*
- * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
- * This will simplify cpa(), which otherwise needs to support splitting
- * large pages into small in interrupt context, etc.
- */
- use_pse = use_gbpages = 0;
-#else
- use_pse = cpu_has_pse;
- use_gbpages = direct_gbpages;
+#ifdef CONFIG_X86_32
+ if ((end >> PAGE_SHIFT) > max_low_pfn)
+ continue;
#endif
- /* Enable PSE if available */
- if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
+ if (memblock_is_region_memory(start, end - start))
+ mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
+ }
+ if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
+ !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
+ unsigned long start = round_down(mr[i].start, PUD_SIZE);
+ unsigned long end = round_up(mr[i].end, PUD_SIZE);
- /* Enable PGE if available */
- if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
- __supported_pte_mask |= _PAGE_GLOBAL;
+ if (memblock_is_region_memory(start, end - start))
+ mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
+ }
}
+}
- if (use_gbpages)
- page_size_mask |= 1 << PG_LEVEL_1G;
- if (use_pse)
- page_size_mask |= 1 << PG_LEVEL_2M;
+static int __meminit split_mem_range(struct map_range *mr, int nr_range,
+ unsigned long start,
+ unsigned long end)
+{
+ unsigned long start_pfn, end_pfn, limit_pfn;
+ unsigned long pfn;
+ int i;
- memset(mr, 0, sizeof(mr));
- nr_range = 0;
+ limit_pfn = PFN_DOWN(end);
/* head if not big page alignment ? */
- start_pfn = start >> PAGE_SHIFT;
- pos = start_pfn << PAGE_SHIFT;
+ pfn = start_pfn = PFN_DOWN(start);
#ifdef CONFIG_X86_32
/*
* Don't use a large page for the first 2/4MB of memory
@@ -181,66 +221,60 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
* and overlapping MTRRs into large pages can cause
* slowdowns.
*/
- if (pos == 0)
- end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+ if (pfn == 0)
+ end_pfn = PFN_DOWN(PMD_SIZE);
else
- end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
- end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#endif
- if (end_pfn > (end >> PAGE_SHIFT))
- end_pfn = end >> PAGE_SHIFT;
+ if (end_pfn > limit_pfn)
+ end_pfn = limit_pfn;
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
/* big page (2M) range */
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
+ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#ifdef CONFIG_X86_32
- end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
- end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
- end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+ end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+ if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#endif
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
#ifdef CONFIG_X86_64
/* big page (1G) range */
- start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+ start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask &
((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
/* tail is not big page (1G) alignment */
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
#endif
/* tail is not big page (2M) alignment */
- start_pfn = pos>>PAGE_SHIFT;
- end_pfn = end>>PAGE_SHIFT;
+ start_pfn = pfn;
+ end_pfn = limit_pfn;
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
/* try to merge same page size and continuous */
@@ -257,59 +291,169 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
nr_range--;
}
+ if (!after_bootmem)
+ adjust_range_page_size_mask(mr, nr_range);
+
for (i = 0; i < nr_range; i++)
printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
mr[i].start, mr[i].end - 1,
(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
(mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
- /*
- * Find space for the kernel direct mapping tables.
- *
- * Later we should allocate these tables in the local node of the
- * memory mapped. Unfortunately this is done currently before the
- * nodes are discovered.
- */
- if (!after_bootmem)
- find_early_table_space(mr, nr_range);
+ return nr_range;
+}
+
+struct range pfn_mapped[E820_X_MAX];
+int nr_pfn_mapped;
+
+static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+ nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
+ nr_pfn_mapped, start_pfn, end_pfn);
+ nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);
+
+ max_pfn_mapped = max(max_pfn_mapped, end_pfn);
+
+ if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
+ max_low_pfn_mapped = max(max_low_pfn_mapped,
+ min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
+}
+
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+ int i;
+
+ for (i = 0; i < nr_pfn_mapped; i++)
+ if ((start_pfn >= pfn_mapped[i].start) &&
+ (end_pfn <= pfn_mapped[i].end))
+ return true;
+
+ return false;
+}
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+ unsigned long end)
+{
+ struct map_range mr[NR_RANGE_MR];
+ unsigned long ret = 0;
+ int nr_range, i;
+
+ pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n",
+ start, end - 1);
+
+ memset(mr, 0, sizeof(mr));
+ nr_range = split_mem_range(mr, 0, start, end);
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
mr[i].page_size_mask);
-#ifdef CONFIG_X86_32
- early_ioremap_page_table_range_init();
+ add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
- load_cr3(swapper_pg_dir);
-#endif
+ return ret >> PAGE_SHIFT;
+}
- __flush_tlb_all();
+/*
+ * would have hole in the middle or ends, and only ram parts will be mapped.
+ */
+static unsigned long __init init_range_memory_mapping(
+ unsigned long r_start,
+ unsigned long r_end)
+{
+ unsigned long start_pfn, end_pfn;
+ unsigned long mapped_ram_size = 0;
+ int i;
- /*
- * Reserve the kernel pagetable pages we used (pgt_buf_start -
- * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
- * so that they can be reused for other purposes.
- *
- * On native it just means calling memblock_reserve, on Xen it also
- * means marking RW the pagetable pages that we allocated before
- * but that haven't been used.
- *
- * In fact on xen we mark RO the whole range pgt_buf_start -
- * pgt_buf_top, because we have to make sure that when
- * init_memory_mapping reaches the pagetable pages area, it maps
- * RO all the pagetable pages, including the ones that are beyond
- * pgt_buf_end at that time.
- */
- if (!after_bootmem && pgt_buf_end > pgt_buf_start)
- x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
- PFN_PHYS(pgt_buf_end));
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+ u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
+ u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
+ if (start >= end)
+ continue;
- if (!after_bootmem)
- early_memtest(start, end);
+ /*
+ * if it is overlapping with brk pgt, we need to
+ * alloc pgt buf from memblock instead.
+ */
+ can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
+ min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
+ init_memory_mapping(start, end);
+ mapped_ram_size += end - start;
+ can_use_brk_pgt = true;
+ }
- return ret >> PAGE_SHIFT;
+ return mapped_ram_size;
}
+/* (PUD_SHIFT-PMD_SHIFT)/2 */
+#define STEP_SIZE_SHIFT 5
+void __init init_mem_mapping(void)
+{
+ unsigned long end, real_end, start, last_start;
+ unsigned long step_size;
+ unsigned long addr;
+ unsigned long mapped_ram_size = 0;
+ unsigned long new_mapped_ram_size;
+
+ probe_page_size_mask();
+
+#ifdef CONFIG_X86_64
+ end = max_pfn << PAGE_SHIFT;
+#else
+ end = max_low_pfn << PAGE_SHIFT;
+#endif
+
+ /* the ISA range is always mapped regardless of memory holes */
+ init_memory_mapping(0, ISA_END_ADDRESS);
+
+ /* xen has big range in reserved near end of ram, skip it at first */
+ addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE,
+ PAGE_SIZE);
+ real_end = addr + PMD_SIZE;
+
+ /* step_size need to be small so pgt_buf from BRK could cover it */
+ step_size = PMD_SIZE;
+ max_pfn_mapped = 0; /* will get exact value next */
+ min_pfn_mapped = real_end >> PAGE_SHIFT;
+ last_start = start = real_end;
+ while (last_start > ISA_END_ADDRESS) {
+ if (last_start > step_size) {
+ start = round_down(last_start - 1, step_size);
+ if (start < ISA_END_ADDRESS)
+ start = ISA_END_ADDRESS;
+ } else
+ start = ISA_END_ADDRESS;
+ new_mapped_ram_size = init_range_memory_mapping(start,
+ last_start);
+ last_start = start;
+ min_pfn_mapped = last_start >> PAGE_SHIFT;
+ /* only increase step_size after big range get mapped */
+ if (new_mapped_ram_size > mapped_ram_size)
+ step_size <<= STEP_SIZE_SHIFT;
+ mapped_ram_size += new_mapped_ram_size;
+ }
+
+ if (real_end < end)
+ init_range_memory_mapping(real_end, end);
+
+#ifdef CONFIG_X86_64
+ if (max_pfn > max_low_pfn) {
+ /* can we preseve max_low_pfn ?*/
+ max_low_pfn = max_pfn;
+ }
+#else
+ early_ioremap_page_table_range_init();
+#endif
+
+ load_cr3(swapper_pg_dir);
+ __flush_tlb_all();
+
+ early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
+}
/*
* devmem_is_allowed() checks to see if /dev/mem access to a certain address
@@ -391,6 +535,15 @@ void free_initmem(void)
#ifdef CONFIG_BLK_DEV_INITRD
void __init free_initrd_mem(unsigned long start, unsigned long end)
{
+#ifdef CONFIG_MICROCODE_EARLY
+ /*
+ * Remember, initrd memory may contain microcode or other useful things.
+ * Before we lose initrd mem, we need to find a place to hold them
+ * now that normal virtual memory is enabled.
+ */
+ save_microcode_in_initrd();
+#endif
+
/*
* end could be not aligned, and We can not align that,
* decompresser could be confused by aligned initrd_end
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 745d66b843c8..2d19001151d5 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -53,25 +53,14 @@
#include <asm/page_types.h>
#include <asm/init.h>
+#include "mm_internal.h"
+
unsigned long highstart_pfn, highend_pfn;
static noinline int do_test_wp_bit(void);
bool __read_mostly __vmalloc_start_set = false;
-static __init void *alloc_low_page(void)
-{
- unsigned long pfn = pgt_buf_end++;
- void *adr;
-
- if (pfn >= pgt_buf_top)
- panic("alloc_low_page: ran out of memory");
-
- adr = __va(pfn * PAGE_SIZE);
- clear_page(adr);
- return adr;
-}
-
/*
* Creates a middle page table and puts a pointer to it in the
* given global directory entry. This only returns the gd entry
@@ -84,10 +73,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
#ifdef CONFIG_X86_PAE
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
- if (after_bootmem)
- pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
- else
- pmd_table = (pmd_t *)alloc_low_page();
+ pmd_table = (pmd_t *)alloc_low_page();
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
@@ -109,17 +95,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
static pte_t * __init one_page_table_init(pmd_t *pmd)
{
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
- pte_t *page_table = NULL;
-
- if (after_bootmem) {
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
- page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
-#endif
- if (!page_table)
- page_table =
- (pte_t *)alloc_bootmem_pages(PAGE_SIZE);
- } else
- page_table = (pte_t *)alloc_low_page();
+ pte_t *page_table = (pte_t *)alloc_low_page();
paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -146,8 +122,39 @@ pte_t * __init populate_extra_pte(unsigned long vaddr)
return one_page_table_init(pmd) + pte_idx;
}
+static unsigned long __init
+page_table_range_init_count(unsigned long start, unsigned long end)
+{
+ unsigned long count = 0;
+#ifdef CONFIG_HIGHMEM
+ int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+ int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+ int pgd_idx, pmd_idx;
+ unsigned long vaddr;
+
+ if (pmd_idx_kmap_begin == pmd_idx_kmap_end)
+ return 0;
+
+ vaddr = start;
+ pgd_idx = pgd_index(vaddr);
+
+ for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
+ for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+ pmd_idx++) {
+ if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin &&
+ (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end)
+ count++;
+ vaddr += PMD_SIZE;
+ }
+ pmd_idx = 0;
+ }
+#endif
+ return count;
+}
+
static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
- unsigned long vaddr, pte_t *lastpte)
+ unsigned long vaddr, pte_t *lastpte,
+ void **adr)
{
#ifdef CONFIG_HIGHMEM
/*
@@ -161,16 +168,15 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
if (pmd_idx_kmap_begin != pmd_idx_kmap_end
&& (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
- && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
- && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
- || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
+ && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {
pte_t *newpte;
int i;
BUG_ON(after_bootmem);
- newpte = alloc_low_page();
+ newpte = *adr;
for (i = 0; i < PTRS_PER_PTE; i++)
set_pte(newpte + i, pte[i]);
+ *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);
paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
@@ -204,6 +210,11 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte = NULL;
+ unsigned long count = page_table_range_init_count(start, end);
+ void *adr = NULL;
+
+ if (count)
+ adr = alloc_low_pages(count);
vaddr = start;
pgd_idx = pgd_index(vaddr);
@@ -216,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
pmd++, pmd_idx++) {
pte = page_table_kmap_check(one_page_table_init(pmd),
- pmd, vaddr, pte);
+ pmd, vaddr, pte, &adr);
vaddr += PMD_SIZE;
}
@@ -310,6 +321,7 @@ repeat:
__pgprot(PTE_IDENT_ATTR |
_PAGE_PSE);
+ pfn &= PMD_MASK >> PAGE_SHIFT;
addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
PAGE_OFFSET + PAGE_SIZE-1;
@@ -455,9 +467,14 @@ void __init native_pagetable_init(void)
/*
* Remove any mappings which extend past the end of physical
- * memory from the boot time page table:
+ * memory from the boot time page table.
+ * In virtual address space, we should have at least two pages
+ * from VMALLOC_END to pkmap or fixmap according to VMALLOC_END
+ * definition. And max_low_pfn is set to VMALLOC_END physical
+ * address. If initial memory mapping is doing right job, we
+ * should have pte used near max_low_pfn or one pmd is not present.
*/
- for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
+ for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
pgd = base + pgd_index(va);
if (!pgd_present(*pgd))
@@ -468,10 +485,19 @@ void __init native_pagetable_init(void)
if (!pmd_present(*pmd))
break;
+ /* should not be large page here */
+ if (pmd_large(*pmd)) {
+ pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n",
+ pfn, pmd, __pa(pmd));
+ BUG_ON(1);
+ }
+
pte = pte_offset_kernel(pmd, va);
if (!pte_present(*pte))
break;
+ printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n",
+ pfn, pmd, __pa(pmd), pte, __pa(pte));
pte_clear(NULL, va, pte);
}
paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
@@ -550,7 +576,7 @@ early_param("highmem", parse_highmem);
* artificially via the highmem=x boot parameter then create
* it:
*/
-void __init lowmem_pfn_init(void)
+static void __init lowmem_pfn_init(void)
{
/* max_low_pfn is 0, we already have early_res support */
max_low_pfn = max_pfn;
@@ -586,7 +612,7 @@ void __init lowmem_pfn_init(void)
* We have more RAM than fits into lowmem - we try to put it into
* highmem, also taking the highmem=x boot parameter into account:
*/
-void __init highmem_pfn_init(void)
+static void __init highmem_pfn_init(void)
{
max_low_pfn = MAXMEM_PFN;
@@ -669,8 +695,6 @@ void __init setup_bootmem_allocator(void)
printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
max_pfn_mapped<<PAGE_SHIFT);
printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
-
- after_bootmem = 1;
}
/*
@@ -753,6 +777,8 @@ void __init mem_init(void)
if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
reservedpages++;
+ after_bootmem = 1;
+
codesize = (unsigned long) &_etext - (unsigned long) &_text;
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
@@ -836,6 +862,18 @@ int arch_add_memory(int nid, u64 start, u64 size)
return __add_pages(nid, zone, start_pfn, nr_pages);
}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ struct zone *zone;
+
+ zone = page_zone(pfn_to_page(start_pfn));
+ return __remove_pages(zone, start_pfn, nr_pages);
+}
+#endif
#endif
/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d6eeead43758..474e28f10815 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -54,6 +54,82 @@
#include <asm/uv/uv.h>
#include <asm/setup.h>
+#include "mm_internal.h"
+
+static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
+ unsigned long addr, unsigned long end)
+{
+ addr &= PMD_MASK;
+ for (; addr < end; addr += PMD_SIZE) {
+ pmd_t *pmd = pmd_page + pmd_index(addr);
+
+ if (!pmd_present(*pmd))
+ set_pmd(pmd, __pmd(addr | pmd_flag));
+ }
+}
+static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next) {
+ pud_t *pud = pud_page + pud_index(addr);
+ pmd_t *pmd;
+
+ next = (addr & PUD_MASK) + PUD_SIZE;
+ if (next > end)
+ next = end;
+
+ if (pud_present(*pud)) {
+ pmd = pmd_offset(pud, 0);
+ ident_pmd_init(info->pmd_flag, pmd, addr, next);
+ continue;
+ }
+ pmd = (pmd_t *)info->alloc_pgt_page(info->context);
+ if (!pmd)
+ return -ENOMEM;
+ ident_pmd_init(info->pmd_flag, pmd, addr, next);
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ }
+
+ return 0;
+}
+
+int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ int result;
+ int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
+
+ for (; addr < end; addr = next) {
+ pgd_t *pgd = pgd_page + pgd_index(addr) + off;
+ pud_t *pud;
+
+ next = (addr & PGDIR_MASK) + PGDIR_SIZE;
+ if (next > end)
+ next = end;
+
+ if (pgd_present(*pgd)) {
+ pud = pud_offset(pgd, 0);
+ result = ident_pud_init(info, pud, addr, next);
+ if (result)
+ return result;
+ continue;
+ }
+
+ pud = (pud_t *)info->alloc_pgt_page(info->context);
+ if (!pud)
+ return -ENOMEM;
+ result = ident_pud_init(info, pud, addr, next);
+ if (result)
+ return result;
+ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+ }
+
+ return 0;
+}
+
static int __init parse_direct_gbpages_off(char *arg)
{
direct_gbpages = 0;
@@ -302,10 +378,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
void __init cleanup_highmap(void)
{
unsigned long vaddr = __START_KERNEL_map;
- unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
+ unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
pmd_t *pmd = level2_kernel_pgt;
+ /*
+ * Native path, max_pfn_mapped is not set yet.
+ * Xen has valid max_pfn_mapped set in
+ * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
+ */
+ if (max_pfn_mapped)
+ vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
+
for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
if (pmd_none(*pmd))
continue;
@@ -314,69 +398,24 @@ void __init cleanup_highmap(void)
}
}
-static __ref void *alloc_low_page(unsigned long *phys)
-{
- unsigned long pfn = pgt_buf_end++;
- void *adr;
-
- if (after_bootmem) {
- adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
- *phys = __pa(adr);
-
- return adr;
- }
-
- if (pfn >= pgt_buf_top)
- panic("alloc_low_page: ran out of memory");
-
- adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
- clear_page(adr);
- *phys = pfn * PAGE_SIZE;
- return adr;
-}
-
-static __ref void *map_low_page(void *virt)
-{
- void *adr;
- unsigned long phys, left;
-
- if (after_bootmem)
- return virt;
-
- phys = __pa(virt);
- left = phys & (PAGE_SIZE - 1);
- adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
- adr = (void *)(((unsigned long)adr) | left);
-
- return adr;
-}
-
-static __ref void unmap_low_page(void *adr)
-{
- if (after_bootmem)
- return;
-
- early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
-}
-
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
pgprot_t prot)
{
- unsigned pages = 0;
+ unsigned long pages = 0, next;
unsigned long last_map_addr = end;
int i;
pte_t *pte = pte_page + pte_index(addr);
- for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
-
+ for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) {
+ next = (addr & PAGE_MASK) + PAGE_SIZE;
if (addr >= end) {
- if (!after_bootmem) {
- for(; i < PTRS_PER_PTE; i++, pte++)
- set_pte(pte, __pte(0));
- }
- break;
+ if (!after_bootmem &&
+ !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) &&
+ !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN))
+ set_pte(pte, __pte(0));
+ continue;
}
/*
@@ -414,28 +453,25 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
int i = pmd_index(address);
for (; i < PTRS_PER_PMD; i++, address = next) {
- unsigned long pte_phys;
pmd_t *pmd = pmd_page + pmd_index(address);
pte_t *pte;
pgprot_t new_prot = prot;
+ next = (address & PMD_MASK) + PMD_SIZE;
if (address >= end) {
- if (!after_bootmem) {
- for (; i < PTRS_PER_PMD; i++, pmd++)
- set_pmd(pmd, __pmd(0));
- }
- break;
+ if (!after_bootmem &&
+ !e820_any_mapped(address & PMD_MASK, next, E820_RAM) &&
+ !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN))
+ set_pmd(pmd, __pmd(0));
+ continue;
}
- next = (address & PMD_MASK) + PMD_SIZE;
-
if (pmd_val(*pmd)) {
if (!pmd_large(*pmd)) {
spin_lock(&init_mm.page_table_lock);
- pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
+ pte = (pte_t *)pmd_page_vaddr(*pmd);
last_map_addr = phys_pte_init(pte, address,
end, prot);
- unmap_low_page(pte);
spin_unlock(&init_mm.page_table_lock);
continue;
}
@@ -464,19 +500,18 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
pages++;
spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pmd,
- pfn_pte(address >> PAGE_SHIFT,
+ pfn_pte((address & PMD_MASK) >> PAGE_SHIFT,
__pgprot(pgprot_val(prot) | _PAGE_PSE)));
spin_unlock(&init_mm.page_table_lock);
last_map_addr = next;
continue;
}
- pte = alloc_low_page(&pte_phys);
+ pte = alloc_low_page();
last_map_addr = phys_pte_init(pte, address, end, new_prot);
- unmap_low_page(pte);
spin_lock(&init_mm.page_table_lock);
- pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
+ pmd_populate_kernel(&init_mm, pmd, pte);
spin_unlock(&init_mm.page_table_lock);
}
update_page_count(PG_LEVEL_2M, pages);
@@ -492,27 +527,24 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
int i = pud_index(addr);
for (; i < PTRS_PER_PUD; i++, addr = next) {
- unsigned long pmd_phys;
pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
pgprot_t prot = PAGE_KERNEL;
- if (addr >= end)
- break;
-
next = (addr & PUD_MASK) + PUD_SIZE;
-
- if (!after_bootmem && !e820_any_mapped(addr, next, 0)) {
- set_pud(pud, __pud(0));
+ if (addr >= end) {
+ if (!after_bootmem &&
+ !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) &&
+ !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN))
+ set_pud(pud, __pud(0));
continue;
}
if (pud_val(*pud)) {
if (!pud_large(*pud)) {
- pmd = map_low_page(pmd_offset(pud, 0));
+ pmd = pmd_offset(pud, 0);
last_map_addr = phys_pmd_init(pmd, addr, end,
page_size_mask, prot);
- unmap_low_page(pmd);
__flush_tlb_all();
continue;
}
@@ -541,19 +573,19 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
pages++;
spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pud,
- pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE));
spin_unlock(&init_mm.page_table_lock);
last_map_addr = next;
continue;
}
- pmd = alloc_low_page(&pmd_phys);
+ pmd = alloc_low_page();
last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
prot);
- unmap_low_page(pmd);
spin_lock(&init_mm.page_table_lock);
- pud_populate(&init_mm, pud, __va(pmd_phys));
+ pud_populate(&init_mm, pud, pmd);
spin_unlock(&init_mm.page_table_lock);
}
__flush_tlb_all();
@@ -578,28 +610,23 @@ kernel_physical_mapping_init(unsigned long start,
for (; start < end; start = next) {
pgd_t *pgd = pgd_offset_k(start);
- unsigned long pud_phys;
pud_t *pud;
- next = (start + PGDIR_SIZE) & PGDIR_MASK;
- if (next > end)
- next = end;
+ next = (start & PGDIR_MASK) + PGDIR_SIZE;
if (pgd_val(*pgd)) {
- pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
+ pud = (pud_t *)pgd_page_vaddr(*pgd);
last_map_addr = phys_pud_init(pud, __pa(start),
__pa(end), page_size_mask);
- unmap_low_page(pud);
continue;
}
- pud = alloc_low_page(&pud_phys);
- last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
+ pud = alloc_low_page();
+ last_map_addr = phys_pud_init(pud, __pa(start), __pa(end),
page_size_mask);
- unmap_low_page(pud);
spin_lock(&init_mm.page_table_lock);
- pgd_populate(&init_mm, pgd, __va(pud_phys));
+ pgd_populate(&init_mm, pgd, pud);
spin_unlock(&init_mm.page_table_lock);
pgd_changed = true;
}
@@ -664,13 +691,11 @@ int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdat = NODE_DATA(nid);
struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
- unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
+ unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
- last_mapped_pfn = init_memory_mapping(start, start + size);
- if (last_mapped_pfn > max_pfn_mapped)
- max_pfn_mapped = last_mapped_pfn;
+ init_memory_mapping(start, start + size);
ret = __add_pages(nid, zone, start_pfn, nr_pages);
WARN_ON_ONCE(ret);
@@ -682,10 +707,357 @@ int arch_add_memory(int nid, u64 start, u64 size)
}
EXPORT_SYMBOL_GPL(arch_add_memory);
+#define PAGE_INUSE 0xFD
+
+static void __meminit free_pagetable(struct page *page, int order)
+{
+ struct zone *zone;
+ bool bootmem = false;
+ unsigned long magic;
+ unsigned int nr_pages = 1 << order;
+
+ /* bootmem page has reserved flag */
+ if (PageReserved(page)) {
+ __ClearPageReserved(page);
+ bootmem = true;
+
+ magic = (unsigned long)page->lru.next;
+ if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+ while (nr_pages--)
+ put_page_bootmem(page++);
+ } else
+ __free_pages_bootmem(page, order);
+ } else
+ free_pages((unsigned long)page_address(page), order);
+
+ /*
+ * SECTION_INFO pages and MIX_SECTION_INFO pages
+ * are all allocated by bootmem.
+ */
+ if (bootmem) {
+ zone = page_zone(page);
+ zone_span_writelock(zone);
+ zone->present_pages += nr_pages;
+ zone_span_writeunlock(zone);
+ totalram_pages += nr_pages;
+ }
+}
+
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+ pte_t *pte;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte = pte_start + i;
+ if (pte_val(*pte))
+ return;
+ }
+
+ /* free a pte talbe */
+ free_pagetable(pmd_page(*pmd), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pmd_clear(pmd);
+ spin_unlock(&init_mm.page_table_lock);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+ pmd_t *pmd;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd = pmd_start + i;
+ if (pmd_val(*pmd))
+ return;
+ }
+
+ /* free a pmd talbe */
+ free_pagetable(pud_page(*pud), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+}
+
+/* Return true if pgd is changed, otherwise return false. */
+static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+ pud_t *pud;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ pud = pud_start + i;
+ if (pud_val(*pud))
+ return false;
+ }
+
+ /* free a pud table */
+ free_pagetable(pgd_page(*pgd), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pgd_clear(pgd);
+ spin_unlock(&init_mm.page_table_lock);
+
+ return true;
+}
+
+static void __meminit
+remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
+ bool direct)
+{
+ unsigned long next, pages = 0;
+ pte_t *pte;
+ void *page_addr;
+ phys_addr_t phys_addr;
+
+ pte = pte_start + pte_index(addr);
+ for (; addr < end; addr = next, pte++) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ if (next > end)
+ next = end;
+
+ if (!pte_present(*pte))
+ continue;
+
+ /*
+ * We mapped [0,1G) memory as identity mapping when
+ * initializing, in arch/x86/kernel/head_64.S. These
+ * pagetables cannot be removed.
+ */
+ phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
+ if (phys_addr < (phys_addr_t)0x40000000)
+ return;
+
+ if (IS_ALIGNED(addr, PAGE_SIZE) &&
+ IS_ALIGNED(next, PAGE_SIZE)) {
+ /*
+ * Do not free direct mapping pages since they were
+ * freed when offlining, or simplely not in use.
+ */
+ if (!direct)
+ free_pagetable(pte_page(*pte), 0);
+
+ spin_lock(&init_mm.page_table_lock);
+ pte_clear(&init_mm, addr, pte);
+ spin_unlock(&init_mm.page_table_lock);
+
+ /* For non-direct mapping, pages means nothing. */
+ pages++;
+ } else {
+ /*
+ * If we are here, we are freeing vmemmap pages since
+ * direct mapped memory ranges to be freed are aligned.
+ *
+ * If we are not removing the whole page, it means
+ * other page structs in this page are being used and
+ * we canot remove them. So fill the unused page_structs
+ * with 0xFD, and remove the page when it is wholly
+ * filled with 0xFD.
+ */
+ memset((void *)addr, PAGE_INUSE, next - addr);
+
+ page_addr = page_address(pte_page(*pte));
+ if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
+ free_pagetable(pte_page(*pte), 0);
+
+ spin_lock(&init_mm.page_table_lock);
+ pte_clear(&init_mm, addr, pte);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
+ }
+
+ /* Call free_pte_table() in remove_pmd_table(). */
+ flush_tlb_all();
+ if (direct)
+ update_page_count(PG_LEVEL_4K, -pages);
+}
+
+static void __meminit
+remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
+ bool direct)
+{
+ unsigned long next, pages = 0;
+ pte_t *pte_base;
+ pmd_t *pmd;
+ void *page_addr;
+
+ pmd = pmd_start + pmd_index(addr);
+ for (; addr < end; addr = next, pmd++) {
+ next = pmd_addr_end(addr, end);
+
+ if (!pmd_present(*pmd))
+ continue;
+
+ if (pmd_large(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE)) {
+ if (!direct)
+ free_pagetable(pmd_page(*pmd),
+ get_order(PMD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pmd_clear(pmd);
+ spin_unlock(&init_mm.page_table_lock);
+ pages++;
+ } else {
+ /* If here, we are freeing vmemmap pages. */
+ memset((void *)addr, PAGE_INUSE, next - addr);
+
+ page_addr = page_address(pmd_page(*pmd));
+ if (!memchr_inv(page_addr, PAGE_INUSE,
+ PMD_SIZE)) {
+ free_pagetable(pmd_page(*pmd),
+ get_order(PMD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pmd_clear(pmd);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
+
+ continue;
+ }
+
+ pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+ remove_pte_table(pte_base, addr, next, direct);
+ free_pte_table(pte_base, pmd);
+ }
+
+ /* Call free_pmd_table() in remove_pud_table(). */
+ if (direct)
+ update_page_count(PG_LEVEL_2M, -pages);
+}
+
+static void __meminit
+remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
+ bool direct)
+{
+ unsigned long next, pages = 0;
+ pmd_t *pmd_base;
+ pud_t *pud;
+ void *page_addr;
+
+ pud = pud_start + pud_index(addr);
+ for (; addr < end; addr = next, pud++) {
+ next = pud_addr_end(addr, end);
+
+ if (!pud_present(*pud))
+ continue;
+
+ if (pud_large(*pud)) {
+ if (IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE)) {
+ if (!direct)
+ free_pagetable(pud_page(*pud),
+ get_order(PUD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+ pages++;
+ } else {
+ /* If here, we are freeing vmemmap pages. */
+ memset((void *)addr, PAGE_INUSE, next - addr);
+
+ page_addr = page_address(pud_page(*pud));
+ if (!memchr_inv(page_addr, PAGE_INUSE,
+ PUD_SIZE)) {
+ free_pagetable(pud_page(*pud),
+ get_order(PUD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
+
+ continue;
+ }
+
+ pmd_base = (pmd_t *)pud_page_vaddr(*pud);
+ remove_pmd_table(pmd_base, addr, next, direct);
+ free_pmd_table(pmd_base, pud);
+ }
+
+ if (direct)
+ update_page_count(PG_LEVEL_1G, -pages);
+}
+
+/* start and end are both virtual address. */
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+ unsigned long next;
+ pgd_t *pgd;
+ pud_t *pud;
+ bool pgd_changed = false;
+
+ for (; start < end; start = next) {
+ next = pgd_addr_end(start, end);
+
+ pgd = pgd_offset_k(start);
+ if (!pgd_present(*pgd))
+ continue;
+
+ pud = (pud_t *)pgd_page_vaddr(*pgd);
+ remove_pud_table(pud, start, next, direct);
+ if (free_pud_table(pud, pgd))
+ pgd_changed = true;
+ }
+
+ if (pgd_changed)
+ sync_global_pgds(start, end - 1);
+
+ flush_tlb_all();
+}
+
+void __ref vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+ unsigned long start = (unsigned long)memmap;
+ unsigned long end = (unsigned long)(memmap + nr_pages);
+
+ remove_pagetable(start, end, false);
+}
+
+static void __meminit
+kernel_physical_mapping_remove(unsigned long start, unsigned long end)
+{
+ start = (unsigned long)__va(start);
+ end = (unsigned long)__va(end);
+
+ remove_pagetable(start, end, true);
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int __ref arch_remove_memory(u64 start, u64 size)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ struct zone *zone;
+ int ret;
+
+ zone = page_zone(pfn_to_page(start_pfn));
+ kernel_physical_mapping_remove(start, start + size);
+ ret = __remove_pages(zone, start_pfn, nr_pages);
+ WARN_ON_ONCE(ret);
+
+ return ret;
+}
+#endif
#endif /* CONFIG_MEMORY_HOTPLUG */
static struct kcore_list kcore_vsyscall;
+static void __init register_page_bootmem_info(void)
+{
+#ifdef CONFIG_NUMA
+ int i;
+
+ for_each_online_node(i)
+ register_page_bootmem_info_node(NODE_DATA(i));
+#endif
+}
+
void __init mem_init(void)
{
long codesize, reservedpages, datasize, initsize;
@@ -698,11 +1070,8 @@ void __init mem_init(void)
reservedpages = 0;
/* this will put all low memory onto the freelists */
-#ifdef CONFIG_NUMA
- totalram_pages = numa_free_all_bootmem();
-#else
+ register_page_bootmem_info();
totalram_pages = free_all_bootmem();
-#endif
absent_pages = absent_pages_in_range(0, max_pfn);
reservedpages = max_pfn - totalram_pages - absent_pages;
@@ -772,12 +1141,11 @@ void set_kernel_text_ro(void)
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
- unsigned long rodata_start =
- ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+ unsigned long rodata_start = PFN_ALIGN(__start_rodata);
unsigned long end = (unsigned long) &__end_rodata_hpage_align;
- unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
- unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
- unsigned long data_start = (unsigned long) &_sdata;
+ unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
+ unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
+ unsigned long all_end = PFN_ALIGN(&_end);
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
@@ -786,10 +1154,10 @@ void mark_rodata_ro(void)
kernel_set_to_readonly = 1;
/*
- * The rodata section (but not the kernel text!) should also be
- * not-executable.
+ * The rodata/data/bss/brk section (but not the kernel text!)
+ * should also be not-executable.
*/
- set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
+ set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
rodata_test();
@@ -802,12 +1170,12 @@ void mark_rodata_ro(void)
#endif
free_init_pages("unused kernel memory",
- (unsigned long) page_address(virt_to_page(text_end)),
- (unsigned long)
- page_address(virt_to_page(rodata_start)));
+ (unsigned long) __va(__pa_symbol(text_end)),
+ (unsigned long) __va(__pa_symbol(rodata_start)));
+
free_init_pages("unused kernel memory",
- (unsigned long) page_address(virt_to_page(rodata_end)),
- (unsigned long) page_address(virt_to_page(data_start)));
+ (unsigned long) __va(__pa_symbol(rodata_end)),
+ (unsigned long) __va(__pa_symbol(_sdata)));
}
#endif
@@ -988,6 +1356,66 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
return 0;
}
+#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+ unsigned long addr = (unsigned long)start_page;
+ unsigned long end = (unsigned long)(start_page + size);
+ unsigned long next;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ unsigned int nr_pages;
+ struct page *page;
+
+ for (; addr < end; addr = next) {
+ pte_t *pte = NULL;
+
+ pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd)) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ continue;
+ }
+ get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
+
+ pud = pud_offset(pgd, addr);
+ if (pud_none(*pud)) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ continue;
+ }
+ get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
+
+ if (!cpu_has_pse) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ continue;
+ get_page_bootmem(section_nr, pmd_page(*pmd),
+ MIX_SECTION_INFO);
+
+ pte = pte_offset_kernel(pmd, addr);
+ if (pte_none(*pte))
+ continue;
+ get_page_bootmem(section_nr, pte_page(*pte),
+ SECTION_INFO);
+ } else {
+ next = pmd_addr_end(addr, end);
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ continue;
+
+ nr_pages = 1 << (get_order(PMD_SIZE));
+ page = pmd_page(*pmd);
+ while (nr_pages--)
+ get_page_bootmem(section_nr, page++,
+ SECTION_INFO);
+ }
+ }
+}
+#endif
+
void __meminit vmemmap_populate_print_last(void)
{
if (p_start) {
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
new file mode 100644
index 000000000000..6b563a118891
--- /dev/null
+++ b/arch/x86/mm/mm_internal.h
@@ -0,0 +1,19 @@
+#ifndef __X86_MM_INTERNAL_H
+#define __X86_MM_INTERNAL_H
+
+void *alloc_low_pages(unsigned int num);
+static inline void *alloc_low_page(void)
+{
+ return alloc_low_pages(1);
+}
+
+void early_ioremap_page_table_range_init(void);
+
+unsigned long kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask);
+void zone_sizes_init(void);
+
+extern int after_bootmem;
+
+#endif /* __X86_MM_INTERNAL_H */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 21d02f0d7a2c..ff3633c794c6 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -56,7 +56,7 @@ early_param("numa", numa_setup);
/*
* apicid, cpu, node mappings
*/
-s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+s16 __apicid_to_node[MAX_LOCAL_APIC] = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
@@ -78,7 +78,7 @@ EXPORT_SYMBOL(node_to_cpumask_map);
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
-void __cpuinit numa_set_node(int cpu, int node)
+void numa_set_node(int cpu, int node)
{
int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
@@ -100,7 +100,7 @@ void __cpuinit numa_set_node(int cpu, int node)
set_cpu_numa_node(cpu, node);
}
-void __cpuinit numa_clear_node(int cpu)
+void numa_clear_node(int cpu)
{
numa_set_node(cpu, NUMA_NO_NODE);
}
@@ -192,7 +192,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
static void __init setup_node_data(int nid, u64 start, u64 end)
{
const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
- bool remapped = false;
u64 nd_pa;
void *nd;
int tnid;
@@ -204,37 +203,27 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
if (end && (end - start) < NODE_MIN_SIZE)
return;
- /* initialize remap allocator before aligning to ZONE_ALIGN */
- init_alloc_remap(nid, start, end);
-
start = roundup(start, ZONE_ALIGN);
printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
nid, start, end - 1);
/*
- * Allocate node data. Try remap allocator first, node-local
- * memory and then any node. Never allocate in DMA zone.
+ * Allocate node data. Try node-local memory and then any node.
+ * Never allocate in DMA zone.
*/
- nd = alloc_remap(nid, nd_size);
- if (nd) {
- nd_pa = __pa(nd);
- remapped = true;
- } else {
- nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
- if (!nd_pa) {
- pr_err("Cannot find %zu bytes in node %d\n",
- nd_size, nid);
- return;
- }
- nd = __va(nd_pa);
+ nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
+ if (!nd_pa) {
+ pr_err("Cannot find %zu bytes in any node\n", nd_size);
+ return;
}
+ nd = __va(nd_pa);
/* report and initialize */
- printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]%s\n",
- nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
+ printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]\n",
+ nd_pa, nd_pa + nd_size - 1);
tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
- if (!remapped && tnid != nid)
+ if (tnid != nid)
printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
node_data[nid] = nd;
@@ -570,10 +559,12 @@ static int __init numa_init(int (*init_func)(void))
for (i = 0; i < MAX_LOCAL_APIC; i++)
set_apicid_to_node(i, NUMA_NO_NODE);
- nodes_clear(numa_nodes_parsed);
+ /*
+ * Do not clear numa_nodes_parsed or zero numa_meminfo here, because
+ * SRAT was parsed earlier in early_parse_srat().
+ */
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
- memset(&numa_meminfo, 0, sizeof(numa_meminfo));
WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
numa_reset_distance();
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 534255a36b6b..73a6d7395bd3 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -73,167 +73,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
extern unsigned long highend_pfn, highstart_pfn;
-#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
-
-static void *node_remap_start_vaddr[MAX_NUMNODES];
-void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-
-/*
- * Remap memory allocator
- */
-static unsigned long node_remap_start_pfn[MAX_NUMNODES];
-static void *node_remap_end_vaddr[MAX_NUMNODES];
-static void *node_remap_alloc_vaddr[MAX_NUMNODES];
-
-/**
- * alloc_remap - Allocate remapped memory
- * @nid: NUMA node to allocate memory from
- * @size: The size of allocation
- *
- * Allocate @size bytes from the remap area of NUMA node @nid. The
- * size of the remap area is predetermined by init_alloc_remap() and
- * only the callers considered there should call this function. For
- * more info, please read the comment on top of init_alloc_remap().
- *
- * The caller must be ready to handle allocation failure from this
- * function and fall back to regular memory allocator in such cases.
- *
- * CONTEXT:
- * Single CPU early boot context.
- *
- * RETURNS:
- * Pointer to the allocated memory on success, %NULL on failure.
- */
-void *alloc_remap(int nid, unsigned long size)
-{
- void *allocation = node_remap_alloc_vaddr[nid];
-
- size = ALIGN(size, L1_CACHE_BYTES);
-
- if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
- return NULL;
-
- node_remap_alloc_vaddr[nid] += size;
- memset(allocation, 0, size);
-
- return allocation;
-}
-
-#ifdef CONFIG_HIBERNATION
-/**
- * resume_map_numa_kva - add KVA mapping to the temporary page tables created
- * during resume from hibernation
- * @pgd_base - temporary resume page directory
- */
-void resume_map_numa_kva(pgd_t *pgd_base)
-{
- int node;
-
- for_each_online_node(node) {
- unsigned long start_va, start_pfn, nr_pages, pfn;
-
- start_va = (unsigned long)node_remap_start_vaddr[node];
- start_pfn = node_remap_start_pfn[node];
- nr_pages = (node_remap_end_vaddr[node] -
- node_remap_start_vaddr[node]) >> PAGE_SHIFT;
-
- printk(KERN_DEBUG "%s: node %d\n", __func__, node);
-
- for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
- unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
- pgd_t *pgd = pgd_base + pgd_index(vaddr);
- pud_t *pud = pud_offset(pgd, vaddr);
- pmd_t *pmd = pmd_offset(pud, vaddr);
-
- set_pmd(pmd, pfn_pmd(start_pfn + pfn,
- PAGE_KERNEL_LARGE_EXEC));
-
- printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n",
- __func__, vaddr, start_pfn + pfn);
- }
- }
-}
-#endif
-
-/**
- * init_alloc_remap - Initialize remap allocator for a NUMA node
- * @nid: NUMA node to initizlie remap allocator for
- *
- * NUMA nodes may end up without any lowmem. As allocating pgdat and
- * memmap on a different node with lowmem is inefficient, a special
- * remap allocator is implemented which can be used by alloc_remap().
- *
- * For each node, the amount of memory which will be necessary for
- * pgdat and memmap is calculated and two memory areas of the size are
- * allocated - one in the node and the other in lowmem; then, the area
- * in the node is remapped to the lowmem area.
- *
- * As pgdat and memmap must be allocated in lowmem anyway, this
- * doesn't waste lowmem address space; however, the actual lowmem
- * which gets remapped over is wasted. The amount shouldn't be
- * problematic on machines this feature will be used.
- *
- * Initialization failure isn't fatal. alloc_remap() is used
- * opportunistically and the callers will fall back to other memory
- * allocation mechanisms on failure.
- */
-void __init init_alloc_remap(int nid, u64 start, u64 end)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long end_pfn = end >> PAGE_SHIFT;
- unsigned long size, pfn;
- u64 node_pa, remap_pa;
- void *remap_va;
-
- /*
- * The acpi/srat node info can show hot-add memroy zones where
- * memory could be added but not currently present.
- */
- printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
- nid, start_pfn, end_pfn);
-
- /* calculate the necessary space aligned to large page size */
- size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
- size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
- size = ALIGN(size, LARGE_PAGE_BYTES);
-
- /* allocate node memory and the lowmem remap area */
- node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
- if (!node_pa) {
- pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
- size, nid);
- return;
- }
- memblock_reserve(node_pa, size);
-
- remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
- max_low_pfn << PAGE_SHIFT,
- size, LARGE_PAGE_BYTES);
- if (!remap_pa) {
- pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
- size, nid);
- memblock_free(node_pa, size);
- return;
- }
- memblock_reserve(remap_pa, size);
- remap_va = phys_to_virt(remap_pa);
-
- /* perform actual remap */
- for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
- set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
- (node_pa >> PAGE_SHIFT) + pfn,
- PAGE_KERNEL_LARGE);
-
- /* initialize remap allocator parameters */
- node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
- node_remap_start_vaddr[nid] = remap_va;
- node_remap_end_vaddr[nid] = remap_va + size;
- node_remap_alloc_vaddr[nid] = remap_va;
-
- printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
- nid, node_pa, node_pa + size, remap_va, remap_va + size);
-}
-
void __init initmem_init(void)
{
x86_numa_init();
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 92e27119ee1a..9405ffc91502 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -10,16 +10,3 @@ void __init initmem_init(void)
{
x86_numa_init();
}
-
-unsigned long __init numa_free_all_bootmem(void)
-{
- unsigned long pages = 0;
- int i;
-
- for_each_online_node(i)
- pages += free_all_bootmem_node(NODE_DATA(i));
-
- pages += free_low_memory_core_early(MAX_NUMNODES);
-
- return pages;
-}
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index 7178c3afe05e..ad86ec91e640 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -21,12 +21,6 @@ void __init numa_reset_distance(void);
void __init x86_numa_init(void);
-#ifdef CONFIG_X86_64
-static inline void init_alloc_remap(int nid, u64 start, u64 end) { }
-#else
-void __init init_alloc_remap(int nid, u64 start, u64 end);
-#endif
-
#ifdef CONFIG_NUMA_EMU
void __init numa_emulation(struct numa_meminfo *numa_meminfo,
int numa_dist_cnt);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 2713be4bca41..091934e1d0d9 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -94,12 +94,12 @@ static inline void split_page_count(int level) { }
static inline unsigned long highmap_start_pfn(void)
{
- return __pa(_text) >> PAGE_SHIFT;
+ return __pa_symbol(_text) >> PAGE_SHIFT;
}
static inline unsigned long highmap_end_pfn(void)
{
- return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
+ return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
}
#endif
@@ -276,8 +276,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
* The .rodata section needs to be read-only. Using the pfn
* catches all aliases.
*/
- if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
- __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
+ if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
+ __pa_symbol(__end_rodata) >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_RW;
#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
@@ -364,6 +364,37 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
EXPORT_SYMBOL_GPL(lookup_address);
/*
+ * This is necessary because __pa() does not work on some
+ * kinds of memory, like vmalloc() or the alloc_remap()
+ * areas on 32-bit NUMA systems. The percpu areas can
+ * end up in this kind of memory, for instance.
+ *
+ * This could be optimized, but it is only intended to be
+ * used at inititalization time, and keeping it
+ * unoptimized should increase the testing coverage for
+ * the more obscure platforms.
+ */
+phys_addr_t slow_virt_to_phys(void *__virt_addr)
+{
+ unsigned long virt_addr = (unsigned long)__virt_addr;
+ phys_addr_t phys_addr;
+ unsigned long offset;
+ enum pg_level level;
+ unsigned long psize;
+ unsigned long pmask;
+ pte_t *pte;
+
+ pte = lookup_address(virt_addr, &level);
+ BUG_ON(!pte);
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+ offset = virt_addr & ~pmask;
+ phys_addr = pte_pfn(*pte) << PAGE_SHIFT;
+ return (phys_addr | offset);
+}
+EXPORT_SYMBOL_GPL(slow_virt_to_phys);
+
+/*
* Set the new pmd in all the pgds we know about:
*/
static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
@@ -396,7 +427,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
pte_t new_pte, old_pte, *tmp;
pgprot_t old_prot, new_prot, req_prot;
int i, do_split = 1;
- unsigned int level;
+ enum pg_level level;
if (cpa->force_split)
return 1;
@@ -412,15 +443,12 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
switch (level) {
case PG_LEVEL_2M:
- psize = PMD_PAGE_SIZE;
- pmask = PMD_PAGE_MASK;
- break;
#ifdef CONFIG_X86_64
case PG_LEVEL_1G:
- psize = PUD_PAGE_SIZE;
- pmask = PUD_PAGE_MASK;
- break;
#endif
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+ break;
default:
do_split = -EINVAL;
goto out_unlock;
@@ -514,21 +542,13 @@ out_unlock:
return do_split;
}
-static int split_large_page(pte_t *kpte, unsigned long address)
+int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
{
unsigned long pfn, pfninc = 1;
unsigned int i, level;
- pte_t *pbase, *tmp;
+ pte_t *tmp;
pgprot_t ref_prot;
- struct page *base;
-
- if (!debug_pagealloc)
- spin_unlock(&cpa_lock);
- base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
- if (!debug_pagealloc)
- spin_lock(&cpa_lock);
- if (!base)
- return -ENOMEM;
+ struct page *base = virt_to_page(pbase);
spin_lock(&pgd_lock);
/*
@@ -536,10 +556,11 @@ static int split_large_page(pte_t *kpte, unsigned long address)
* up for us already:
*/
tmp = lookup_address(address, &level);
- if (tmp != kpte)
- goto out_unlock;
+ if (tmp != kpte) {
+ spin_unlock(&pgd_lock);
+ return 1;
+ }
- pbase = (pte_t *)page_address(base);
paravirt_alloc_pte(&init_mm, page_to_pfn(base));
ref_prot = pte_pgprot(pte_clrhuge(*kpte));
/*
@@ -583,16 +604,10 @@ static int split_large_page(pte_t *kpte, unsigned long address)
for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot)));
- if (address >= (unsigned long)__va(0) &&
- address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
+ if (pfn_range_is_mapped(PFN_DOWN(__pa(address)),
+ PFN_DOWN(__pa(address)) + 1))
split_page_count(level);
-#ifdef CONFIG_X86_64
- if (address >= (unsigned long)__va(1UL<<32) &&
- address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
- split_page_count(level);
-#endif
-
/*
* Install the new, split up pagetable.
*
@@ -611,17 +626,27 @@ static int split_large_page(pte_t *kpte, unsigned long address)
* going on.
*/
__flush_tlb_all();
+ spin_unlock(&pgd_lock);
+
+ return 0;
+}
- base = NULL;
+static int split_large_page(pte_t *kpte, unsigned long address)
+{
+ pte_t *pbase;
+ struct page *base;
-out_unlock:
- /*
- * If we dropped out via the lookup_address check under
- * pgd_lock then stick the page back into the pool:
- */
- if (base)
+ if (!debug_pagealloc)
+ spin_unlock(&cpa_lock);
+ base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+ if (!debug_pagealloc)
+ spin_lock(&cpa_lock);
+ if (!base)
+ return -ENOMEM;
+
+ pbase = (pte_t *)page_address(base);
+ if (__split_large_page(kpte, address, pbase))
__free_page(base);
- spin_unlock(&pgd_lock);
return 0;
}
@@ -773,13 +798,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
unsigned long vaddr;
int ret;
- if (cpa->pfn >= max_pfn_mapped)
+ if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
return 0;
-#ifdef CONFIG_X86_64
- if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
- return 0;
-#endif
/*
* No need to redo, when the primary call touched the direct
* mapping already:
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 0eb572eda406..2610bd93c896 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -560,10 +560,10 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
{
unsigned long id_sz;
- if (base >= __pa(high_memory))
+ if (base > __pa(high_memory-1))
return 0;
- id_sz = (__pa(high_memory) < base + size) ?
+ id_sz = (__pa(high_memory-1) <= base + size) ?
__pa(high_memory) - base :
size;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index e27fbf887f3b..193350b51f90 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -334,7 +334,12 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
if (changed && dirty) {
*pmdp = entry;
pmd_update_defer(vma->vm_mm, address, pmdp);
- flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ /*
+ * We had a write-protection fault here and changed the pmd
+ * to to more permissive. No need to flush the TLB for that,
+ * #PF is architecturally guaranteed to do that and in the
+ * worst-case we'll generate a spurious fault.
+ */
}
return changed;
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index d2e2735327b4..e666cbbb9261 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -1,3 +1,4 @@
+#include <linux/bootmem.h>
#include <linux/mmdebug.h>
#include <linux/module.h>
#include <linux/mm.h>
@@ -8,33 +9,54 @@
#ifdef CONFIG_X86_64
+#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
- if (x >= __START_KERNEL_map) {
- x -= __START_KERNEL_map;
- VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
- x += phys_base;
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* use the carry flag to determine if x was < __START_KERNEL_map */
+ if (unlikely(x > y)) {
+ x = y + phys_base;
+
+ VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
} else {
- VIRTUAL_BUG_ON(x < PAGE_OFFSET);
- x -= PAGE_OFFSET;
- VIRTUAL_BUG_ON(!phys_addr_valid(x));
+ x = y + (__START_KERNEL_map - PAGE_OFFSET);
+
+ /* carry flag will be set if starting x was >= PAGE_OFFSET */
+ VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x));
}
+
return x;
}
EXPORT_SYMBOL(__phys_addr);
+unsigned long __phys_addr_symbol(unsigned long x)
+{
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* only check upper bounds since lower bounds will trigger carry */
+ VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
+
+ return y + phys_base;
+}
+EXPORT_SYMBOL(__phys_addr_symbol);
+#endif
+
bool __virt_addr_valid(unsigned long x)
{
- if (x >= __START_KERNEL_map) {
- x -= __START_KERNEL_map;
- if (x >= KERNEL_IMAGE_SIZE)
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* use the carry flag to determine if x was < __START_KERNEL_map */
+ if (unlikely(x > y)) {
+ x = y + phys_base;
+
+ if (y >= KERNEL_IMAGE_SIZE)
return false;
- x += phys_base;
} else {
- if (x < PAGE_OFFSET)
- return false;
- x -= PAGE_OFFSET;
- if (!phys_addr_valid(x))
+ x = y + (__START_KERNEL_map - PAGE_OFFSET);
+
+ /* carry flag will be set if starting x was >= PAGE_OFFSET */
+ if ((x > y) || !phys_addr_valid(x))
return false;
}
@@ -47,10 +69,16 @@ EXPORT_SYMBOL(__virt_addr_valid);
#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
+ unsigned long phys_addr = x - PAGE_OFFSET;
/* VMALLOC_* aren't constants */
VIRTUAL_BUG_ON(x < PAGE_OFFSET);
VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
- return x - PAGE_OFFSET;
+ /* max_low_pfn is set early, but not _that_ early */
+ if (max_low_pfn) {
+ VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
+ BUG_ON(slow_virt_to_phys((void *)x) != phys_addr);
+ }
+ return phys_addr;
}
EXPORT_SYMBOL(__phys_addr);
#endif
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index cdd0da9dd530..79836d01f789 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -141,11 +141,126 @@ static inline int save_add_info(void) {return 1;}
static inline int save_add_info(void) {return 0;}
#endif
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+static void __init
+handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable)
+{
+ int overlap, i;
+ unsigned long start_pfn, end_pfn;
+
+ start_pfn = PFN_DOWN(start);
+ end_pfn = PFN_UP(end);
+
+ /*
+ * For movablemem_map=acpi:
+ *
+ * SRAT: |_____| |_____| |_________| |_________| ......
+ * node id: 0 1 1 2
+ * hotpluggable: n y y n
+ * movablemem_map: |_____| |_________|
+ *
+ * Using movablemem_map, we can prevent memblock from allocating memory
+ * on ZONE_MOVABLE at boot time.
+ *
+ * Before parsing SRAT, memblock has already reserve some memory ranges
+ * for other purposes, such as for kernel image. We cannot prevent
+ * kernel from using these memory, so we need to exclude these memory
+ * even if it is hotpluggable.
+ * Furthermore, to ensure the kernel has enough memory to boot, we make
+ * all the memory on the node which the kernel resides in
+ * un-hotpluggable.
+ */
+ if (hotpluggable && movablemem_map.acpi) {
+ /* Exclude ranges reserved by memblock. */
+ struct memblock_type *rgn = &memblock.reserved;
+
+ for (i = 0; i < rgn->cnt; i++) {
+ if (end <= rgn->regions[i].base ||
+ start >= rgn->regions[i].base +
+ rgn->regions[i].size)
+ continue;
+
+ /*
+ * If the memory range overlaps the memory reserved by
+ * memblock, then the kernel resides in this node.
+ */
+ node_set(node, movablemem_map.numa_nodes_kernel);
+
+ goto out;
+ }
+
+ /*
+ * If the kernel resides in this node, then the whole node
+ * should not be hotpluggable.
+ */
+ if (node_isset(node, movablemem_map.numa_nodes_kernel))
+ goto out;
+
+ insert_movablemem_map(start_pfn, end_pfn);
+
+ /*
+ * numa_nodes_hotplug nodemask represents which nodes are put
+ * into movablemem_map.map[].
+ */
+ node_set(node, movablemem_map.numa_nodes_hotplug);
+ goto out;
+ }
+
+ /*
+ * For movablemem_map=nn[KMG]@ss[KMG]:
+ *
+ * SRAT: |_____| |_____| |_________| |_________| ......
+ * node id: 0 1 1 2
+ * user specified: |__| |___|
+ * movablemem_map: |___| |_________| |______| ......
+ *
+ * Using movablemem_map, we can prevent memblock from allocating memory
+ * on ZONE_MOVABLE at boot time.
+ *
+ * NOTE: In this case, SRAT info will be ingored.
+ */
+ overlap = movablemem_map_overlap(start_pfn, end_pfn);
+ if (overlap >= 0) {
+ /*
+ * If part of this range is in movablemem_map, we need to
+ * add the range after it to extend the range to the end
+ * of the node, because from the min address specified to
+ * the end of the node will be ZONE_MOVABLE.
+ */
+ start_pfn = max(start_pfn,
+ movablemem_map.map[overlap].start_pfn);
+ insert_movablemem_map(start_pfn, end_pfn);
+
+ /*
+ * Set the nodemask, so that if the address range on one node
+ * is not continuse, we can add the subsequent ranges on the
+ * same node into movablemem_map.
+ */
+ node_set(node, movablemem_map.numa_nodes_hotplug);
+ } else {
+ if (node_isset(node, movablemem_map.numa_nodes_hotplug))
+ /*
+ * Insert the range if we already have movable ranges
+ * on the same node.
+ */
+ insert_movablemem_map(start_pfn, end_pfn);
+ }
+out:
+ return;
+}
+#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+static inline void
+handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable)
+{
+}
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
int __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
{
u64 start, end;
+ u32 hotpluggable;
int node, pxm;
if (srat_disabled())
@@ -154,7 +269,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
goto out_err_bad_srat;
if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
goto out_err;
- if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
+ hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
+ if (hotpluggable && !save_add_info())
goto out_err;
start = ma->base_address;
@@ -174,9 +290,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
node_set(node, numa_nodes_parsed);
- printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
+ printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n",
node, pxm,
- (unsigned long long) start, (unsigned long long) end - 1);
+ (unsigned long long) start, (unsigned long long) end - 1,
+ hotpluggable ? "Hot Pluggable": "");
+
+ handle_movablemem(node, start, end, hotpluggable);
return 0;
out_err_bad_srat: