aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/amdtopology_64.c (renamed from arch/x86/mm/k8topology_64.c)97
-rw-r--r--arch/x86/mm/gup.c28
-rw-r--r--arch/x86/mm/init.c3
-rw-r--r--arch/x86/mm/init_32.c22
-rw-r--r--arch/x86/mm/kmemcheck/error.c2
-rw-r--r--arch/x86/mm/numa.c22
-rw-r--r--arch/x86/mm/numa_64.c199
-rw-r--r--arch/x86/mm/pageattr.c33
-rw-r--r--arch/x86/mm/pgtable.c66
-rw-r--r--arch/x86/mm/setup_nx.c2
-rw-r--r--arch/x86/mm/srat_32.c2
-rw-r--r--arch/x86/mm/srat_64.c36
-rw-r--r--arch/x86/mm/tlb.c5
14 files changed, 410 insertions, 109 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 55543397a8a7..09df2f9a3d69 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -23,7 +23,7 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
-obj-$(CONFIG_K8_NUMA) += k8topology_64.o
+obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o
obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/amdtopology_64.c
index 804a3b6c6e14..f21962c435ed 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -1,8 +1,8 @@
/*
- * AMD K8 NUMA support.
+ * AMD NUMA support.
* Discover the memory map and associated nodes.
*
- * This version reads it directly from the K8 northbridge.
+ * This version reads it directly from the AMD northbridge.
*
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
*/
@@ -27,6 +27,7 @@
#include <asm/amd_nb.h>
static struct bootnode __initdata nodes[8];
+static unsigned char __initdata nodeids[8];
static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
static __init int find_northbridge(void)
@@ -57,7 +58,7 @@ static __init void early_get_boot_cpu_id(void)
{
/*
* need to get the APIC ID of the BSP so can use that to
- * create apicid_to_node in k8_scan_nodes()
+ * create apicid_to_node in amd_scan_nodes()
*/
#ifdef CONFIG_X86_MPPARSE
/*
@@ -66,23 +67,9 @@ static __init void early_get_boot_cpu_id(void)
if (smp_found_config)
early_get_smp_config();
#endif
- early_init_lapic_mapping();
}
-int __init k8_get_nodes(struct bootnode *physnodes)
-{
- int i;
- int ret = 0;
-
- for_each_node_mask(i, nodes_parsed) {
- physnodes[ret].start = nodes[i].start;
- physnodes[ret].end = nodes[i].end;
- ret++;
- }
- return ret;
-}
-
-int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
+int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long start = PFN_PHYS(start_pfn);
unsigned long end = PFN_PHYS(end_pfn);
@@ -114,7 +101,7 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
base = read_pci_config(0, nb, 1, 0x40 + i*8);
limit = read_pci_config(0, nb, 1, 0x44 + i*8);
- nodeid = limit & 7;
+ nodeids[i] = nodeid = limit & 7;
if ((base & 3) == 0) {
if (i < numnodes)
pr_info("Skipping disabled node %d\n", i);
@@ -194,7 +181,77 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
return 0;
}
-int __init k8_scan_nodes(void)
+#ifdef CONFIG_NUMA_EMU
+static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
+ [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+
+void __init amd_get_nodes(struct bootnode *physnodes)
+{
+ int i;
+
+ for_each_node_mask(i, nodes_parsed) {
+ physnodes[i].start = nodes[i].start;
+ physnodes[i].end = nodes[i].end;
+ }
+}
+
+static int __init find_node_by_addr(unsigned long addr)
+{
+ int ret = NUMA_NO_NODE;
+ int i;
+
+ for (i = 0; i < 8; i++)
+ if (addr >= nodes[i].start && addr < nodes[i].end) {
+ ret = i;
+ break;
+ }
+ return ret;
+}
+
+/*
+ * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be
+ * setup to represent the physical topology but reflect the emulated
+ * environment. For each emulated node, the real node which it appears on is
+ * found and a fake pxm to nid mapping is created which mirrors the actual
+ * locality. node_distance() then represents the correct distances between
+ * emulated nodes by using the fake acpi mappings to pxms.
+ */
+void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
+{
+ unsigned int bits;
+ unsigned int cores;
+ unsigned int apicid_base = 0;
+ int i;
+
+ bits = boot_cpu_data.x86_coreid_bits;
+ cores = 1 << bits;
+ early_get_boot_cpu_id();
+ if (boot_cpu_physical_apicid > 0)
+ apicid_base = boot_cpu_physical_apicid;
+
+ for (i = 0; i < nr_nodes; i++) {
+ int index;
+ int nid;
+ int j;
+
+ nid = find_node_by_addr(nodes[i].start);
+ if (nid == NUMA_NO_NODE)
+ continue;
+
+ index = nodeids[nid] << bits;
+ if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE)
+ for (j = apicid_base; j < cores + apicid_base; j++)
+ fake_apicid_to_node[index + j] = i;
+#ifdef CONFIG_ACPI_NUMA
+ __acpi_map_pxm_to_node(nid, i);
+#endif
+ }
+ memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
+}
+#endif /* CONFIG_NUMA_EMU */
+
+int __init amd_scan_nodes(void)
{
unsigned int bits;
unsigned int cores;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 738e6593799d..dbe34b931374 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/highmem.h>
+#include <linux/swap.h>
#include <asm/pgtable.h>
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
get_page(page);
+ SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@@ -103,6 +105,17 @@ static inline void get_head_page_multiple(struct page *page, int nr)
VM_BUG_ON(page != compound_head(page));
VM_BUG_ON(page_count(page) == 0);
atomic_add(nr, &page->_count);
+ SetPageReferenced(page);
+}
+
+static inline void get_huge_page_tail(struct page *page)
+{
+ /*
+ * __split_huge_page_refcount() cannot run
+ * from under us.
+ */
+ VM_BUG_ON(atomic_read(&page->_count) < 0);
+ atomic_inc(&page->_count);
}
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
@@ -128,6 +141,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
+ if (PageTail(page))
+ get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
@@ -148,7 +163,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmd_t pmd = *pmdp;
next = pmd_addr_end(addr, end);
- if (pmd_none(pmd))
+ /*
+ * The pmd_trans_splitting() check below explains why
+ * pmdp_splitting_flush has to flush the tlb, to stop
+ * this gup-fast code from running while we set the
+ * splitting bit in the pmd. Returning zero will take
+ * the slow path that will call wait_split_huge_page()
+ * if the pmd is still in splitting state. gup-fast
+ * can't because it has irq disabled and
+ * wait_split_huge_page() would never return as the
+ * tlb flush IPI wouldn't run.
+ */
+ if (pmd_none(pmd) || pmd_trans_splitting(pmd))
return 0;
if (unlikely(pmd_large(pmd))) {
if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c0e28a13de7d..947f42abe820 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -364,8 +364,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
/*
* We just marked the kernel text read only above, now that
* we are going to free part of that, we need to make that
- * writeable first.
+ * writeable and non-executable first.
*/
+ set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0e969f9f401b..c821074b7f0b 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -45,6 +45,7 @@
#include <asm/bugs.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
+#include <asm/olpc_ofw.h>
#include <asm/pgalloc.h>
#include <asm/sections.h>
#include <asm/paravirt.h>
@@ -226,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
static inline int is_kernel_text(unsigned long addr)
{
- if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
+ if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
return 1;
return 0;
}
@@ -715,6 +716,7 @@ void __init paging_init(void)
/*
* NOTE: at this point the bootmem allocator is fully available.
*/
+ olpc_dt_build_devicetree();
sparse_init();
zone_sizes_init();
}
@@ -912,6 +914,23 @@ void set_kernel_text_ro(void)
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
}
+static void mark_nxdata_nx(void)
+{
+ /*
+ * When this called, init has already been executed and released,
+ * so everything past _etext sould be NX.
+ */
+ unsigned long start = PFN_ALIGN(_etext);
+ /*
+ * This comes from is_kernel_text upper limit. Also HPAGE where used:
+ */
+ unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
+
+ if (__supported_pte_mask & _PAGE_NX)
+ printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
+ set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
@@ -946,6 +965,7 @@ void mark_rodata_ro(void)
printk(KERN_INFO "Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
#endif
+ mark_nxdata_nx();
}
#endif
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index af3b6c8a436f..704a37cedddb 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
e->trace.entries = e->trace_entries;
e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
e->trace.skip = 0;
- save_stack_trace_bp(&e->trace, regs->bp);
+ save_stack_trace_regs(&e->trace, regs);
/* Round address down to nearest 16 bytes */
shadow_copy = kmemcheck_shadow_lookup(address
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 787c52ca49c3..ebf6d7887a38 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -2,6 +2,28 @@
#include <linux/topology.h>
#include <linux/module.h>
#include <linux/bootmem.h>
+#include <asm/numa.h>
+#include <asm/acpi.h>
+
+int __initdata numa_off;
+
+static __init int numa_setup(char *opt)
+{
+ if (!opt)
+ return -EINVAL;
+ if (!strncmp(opt, "off", 3))
+ numa_off = 1;
+#ifdef CONFIG_NUMA_EMU
+ if (!strncmp(opt, "fake=", 5))
+ numa_emu_cmdline(opt + 5);
+#endif
+#ifdef CONFIG_ACPI_NUMA
+ if (!strncmp(opt, "noacpi", 6))
+ acpi_numa = -1;
+#endif
+ return 0;
+}
+early_param("numa", numa_setup);
/*
* Which logical CPUs are on which nodes
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 7ffc9b727efd..95ea1551eebc 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -30,7 +30,6 @@ s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
-int numa_off __initdata;
static unsigned long __initdata nodemap_addr;
static unsigned long __initdata nodemap_size;
@@ -260,30 +259,35 @@ void __init numa_init_array(void)
#ifdef CONFIG_NUMA_EMU
/* Numa emulation */
static struct bootnode nodes[MAX_NUMNODES] __initdata;
-static struct bootnode physnodes[MAX_NUMNODES] __initdata;
+static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
static char *cmdline __initdata;
+void __init numa_emu_cmdline(char *str)
+{
+ cmdline = str;
+}
+
static int __init setup_physnodes(unsigned long start, unsigned long end,
- int acpi, int k8)
+ int acpi, int amd)
{
- int nr_nodes = 0;
int ret = 0;
int i;
+ memset(physnodes, 0, sizeof(physnodes));
#ifdef CONFIG_ACPI_NUMA
if (acpi)
- nr_nodes = acpi_get_nodes(physnodes);
+ acpi_get_nodes(physnodes, start, end);
#endif
-#ifdef CONFIG_K8_NUMA
- if (k8)
- nr_nodes = k8_get_nodes(physnodes);
+#ifdef CONFIG_AMD_NUMA
+ if (amd)
+ amd_get_nodes(physnodes);
#endif
/*
* Basic sanity checking on the physical node map: there may be errors
- * if the SRAT or K8 incorrectly reported the topology or the mem=
+ * if the SRAT or AMD code incorrectly reported the topology or the mem=
* kernel parameter is used.
*/
- for (i = 0; i < nr_nodes; i++) {
+ for (i = 0; i < MAX_NUMNODES; i++) {
if (physnodes[i].start == physnodes[i].end)
continue;
if (physnodes[i].start > end) {
@@ -298,17 +302,6 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
physnodes[i].start = start;
if (physnodes[i].end > end)
physnodes[i].end = end;
- }
-
- /*
- * Remove all nodes that have no memory or were truncated because of the
- * limited address range.
- */
- for (i = 0; i < nr_nodes; i++) {
- if (physnodes[i].start == physnodes[i].end)
- continue;
- physnodes[ret].start = physnodes[i].start;
- physnodes[ret].end = physnodes[i].end;
ret++;
}
@@ -324,6 +317,24 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
return ret;
}
+static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
+{
+ int i;
+
+ BUG_ON(acpi && amd);
+#ifdef CONFIG_ACPI_NUMA
+ if (acpi)
+ acpi_fake_nodes(nodes, nr_nodes);
+#endif
+#ifdef CONFIG_AMD_NUMA
+ if (amd)
+ amd_fake_nodes(nodes, nr_nodes);
+#endif
+ if (!acpi && !amd)
+ for (i = 0; i < nr_cpu_ids; i++)
+ numa_set_node(i, 0);
+}
+
/*
* Setups up nid to range from addr to addr + size. If the end
* boundary is greater than max_addr, then max_addr is used instead.
@@ -352,8 +363,7 @@ static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
* Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
* to max_addr. The return value is the number of nodes allocated.
*/
-static int __init split_nodes_interleave(u64 addr, u64 max_addr,
- int nr_phys_nodes, int nr_nodes)
+static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
{
nodemask_t physnode_mask = NODE_MASK_NONE;
u64 size;
@@ -384,7 +394,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
return -1;
}
- for (i = 0; i < nr_phys_nodes; i++)
+ for (i = 0; i < MAX_NUMNODES; i++)
if (physnodes[i].start != physnodes[i].end)
node_set(i, physnode_mask);
@@ -549,15 +559,13 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
* numa=fake command-line option.
*/
static int __init numa_emulation(unsigned long start_pfn,
- unsigned long last_pfn, int acpi, int k8)
+ unsigned long last_pfn, int acpi, int amd)
{
u64 addr = start_pfn << PAGE_SHIFT;
u64 max_addr = last_pfn << PAGE_SHIFT;
- int num_phys_nodes;
int num_nodes;
int i;
- num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
/*
* If the numa=fake command-line contains a 'M' or 'G', it represents
* the fixed node size. Otherwise, if it is just a single number N,
@@ -572,7 +580,7 @@ static int __init numa_emulation(unsigned long start_pfn,
unsigned long n;
n = simple_strtoul(cmdline, NULL, 0);
- num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
+ num_nodes = split_nodes_interleave(addr, max_addr, n);
}
if (num_nodes < 0)
@@ -595,14 +603,15 @@ static int __init numa_emulation(unsigned long start_pfn,
nodes[i].end >> PAGE_SHIFT);
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
}
- acpi_fake_nodes(nodes, num_nodes);
+ setup_physnodes(addr, max_addr, acpi, amd);
+ fake_physnodes(acpi, amd, num_nodes);
numa_init_array();
return 0;
}
#endif /* CONFIG_NUMA_EMU */
void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
- int acpi, int k8)
+ int acpi, int amd)
{
int i;
@@ -610,8 +619,12 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
nodes_clear(node_online_map);
#ifdef CONFIG_NUMA_EMU
- if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
+ setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
+ acpi, amd);
+ if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
return;
+ setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
+ acpi, amd);
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
#endif
@@ -624,8 +637,8 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
nodes_clear(node_online_map);
#endif
-#ifdef CONFIG_K8_NUMA
- if (!numa_off && k8 && !k8_scan_nodes())
+#ifdef CONFIG_AMD_NUMA
+ if (!numa_off && amd && !amd_scan_nodes())
return;
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
@@ -661,24 +674,6 @@ unsigned long __init numa_free_all_bootmem(void)
return pages;
}
-static __init int numa_setup(char *opt)
-{
- if (!opt)
- return -EINVAL;
- if (!strncmp(opt, "off", 3))
- numa_off = 1;
-#ifdef CONFIG_NUMA_EMU
- if (!strncmp(opt, "fake=", 5))
- cmdline = opt + 5;
-#endif
-#ifdef CONFIG_ACPI_NUMA
- if (!strncmp(opt, "noacpi", 6))
- acpi_numa = -1;
-#endif
- return 0;
-}
-early_param("numa", numa_setup);
-
#ifdef CONFIG_NUMA
static __init int find_near_online_node(int node)
@@ -767,6 +762,7 @@ void __cpuinit numa_clear_node(int cpu)
#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+#ifndef CONFIG_NUMA_EMU
void __cpuinit numa_add_cpu(int cpu)
{
cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
@@ -776,34 +772,115 @@ void __cpuinit numa_remove_cpu(int cpu)
{
cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
+#else
+void __cpuinit numa_add_cpu(int cpu)
+{
+ unsigned long addr;
+ u16 apicid;
+ int physnid;
+ int nid = NUMA_NO_NODE;
+
+ apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+ if (apicid != BAD_APICID)
+ nid = apicid_to_node[apicid];
+ if (nid == NUMA_NO_NODE)
+ nid = early_cpu_to_node(cpu);
+ BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
+
+ /*
+ * Use the starting address of the emulated node to find which physical
+ * node it is allocated on.
+ */
+ addr = node_start_pfn(nid) << PAGE_SHIFT;
+ for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
+ if (addr >= physnodes[physnid].start &&
+ addr < physnodes[physnid].end)
+ break;
+
+ /*
+ * Map the cpu to each emulated node that is allocated on the physical
+ * node of the cpu's apic id.
+ */
+ for_each_online_node(nid) {
+ addr = node_start_pfn(nid) << PAGE_SHIFT;
+ if (addr >= physnodes[physnid].start &&
+ addr < physnodes[physnid].end)
+ cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
+ }
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+ int i;
+
+ for_each_online_node(i)
+ cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
+}
+#endif /* !CONFIG_NUMA_EMU */
#else /* CONFIG_DEBUG_PER_CPU_MAPS */
+static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
+{
+ int node = early_cpu_to_node(cpu);
+ struct cpumask *mask;
+ char buf[64];
+
+ mask = node_to_cpumask_map[node];
+ if (!mask) {
+ pr_err("node_to_cpumask_map[%i] NULL\n", node);
+ dump_stack();
+ return NULL;
+ }
+
+ cpulist_scnprintf(buf, sizeof(buf), mask);
+ printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+ enable ? "numa_add_cpu" : "numa_remove_cpu",
+ cpu, node, buf);
+ return mask;
+}
/*
* --------- debug versions of the numa functions ---------
*/
+#ifndef CONFIG_NUMA_EMU
static void __cpuinit numa_set_cpumask(int cpu, int enable)
{
- int node = early_cpu_to_node(cpu);
struct cpumask *mask;
- char buf[64];
- mask = node_to_cpumask_map[node];
- if (mask == NULL) {
- printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
- dump_stack();
+ mask = debug_cpumask_set_cpu(cpu, enable);
+ if (!mask)
return;
- }
if (enable)
cpumask_set_cpu(cpu, mask);
else
cpumask_clear_cpu(cpu, mask);
+}
+#else
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+ int node = early_cpu_to_node(cpu);
+ struct cpumask *mask;
+ int i;
- cpulist_scnprintf(buf, sizeof(buf), mask);
- printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
- enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
+ for_each_online_node(i) {
+ unsigned long addr;
+
+ addr = node_start_pfn(i) << PAGE_SHIFT;
+ if (addr < physnodes[node].start ||
+ addr >= physnodes[node].end)
+ continue;
+ mask = debug_cpumask_set_cpu(cpu, enable);
+ if (!mask)
+ return;
+
+ if (enable)
+ cpumask_set_cpu(cpu, mask);
+ else
+ cpumask_clear_cpu(cpu, mask);
+ }
}
+#endif /* CONFIG_NUMA_EMU */
void __cpuinit numa_add_cpu(int cpu)
{
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 532e7933d606..8b830ca14ac4 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -13,6 +13,7 @@
#include <linux/pfn.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
+#include <linux/pci.h>
#include <asm/e820.h>
#include <asm/processor.h>
@@ -255,13 +256,16 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
unsigned long pfn)
{
pgprot_t forbidden = __pgprot(0);
+ pgprot_t required = __pgprot(0);
/*
* The BIOS area between 640k and 1Mb needs to be executable for
* PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
*/
- if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
+#ifdef CONFIG_PCI_BIOS
+ if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_NX;
+#endif
/*
* The kernel text needs to be executable for obvious reasons
@@ -278,6 +282,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
__pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_RW;
+ /*
+ * .data and .bss should always be writable.
+ */
+ if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
+ within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
+ pgprot_val(required) |= _PAGE_RW;
#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
/*
@@ -317,6 +327,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
#endif
prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
+ prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
return prot;
}
@@ -393,7 +404,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
{
unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
pte_t new_pte, old_pte, *tmp;
- pgprot_t old_prot, new_prot;
+ pgprot_t old_prot, new_prot, req_prot;
int i, do_split = 1;
unsigned int level;
@@ -438,10 +449,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* We are safe now. Check whether the new pgprot is the same:
*/
old_pte = *kpte;
- old_prot = new_prot = pte_pgprot(old_pte);
+ old_prot = new_prot = req_prot = pte_pgprot(old_pte);
- pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
- pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
+ pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
+ pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
/*
* old_pte points to the large page base address. So we need
@@ -450,17 +461,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
cpa->pfn = pfn;
- new_prot = static_protections(new_prot, address, pfn);
+ new_prot = static_protections(req_prot, address, pfn);
/*
* We need to check the full range, whether
* static_protection() requires a different pgprot for one of
* the pages in the range we try to preserve:
*/
- addr = address + PAGE_SIZE;
- pfn++;
- for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) {
- pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
+ addr = address & pmask;
+ pfn = pte_pfn(old_pte);
+ for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
+ pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
if (pgprot_val(chk_prot) != pgprot_val(new_prot))
goto out_unlock;
@@ -483,7 +494,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* that we limited the number of possible pages already to
* the number of pages in the large page.
*/
- if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
+ if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
/*
* The address is aligned and the number of pages
* covers the full page.
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8be8c7d7bc89..500242d3c96d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -320,6 +320,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
return changed;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty)
+{
+ int changed = !pmd_same(*pmdp, entry);
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ if (changed && dirty) {
+ *pmdp = entry;
+ pmd_update_defer(vma->vm_mm, address, pmdp);
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ }
+
+ return changed;
+}
+#endif
+
int ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
@@ -335,6 +354,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
return ret;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ int ret = 0;
+
+ if (pmd_young(*pmdp))
+ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+ (unsigned long *)pmdp);
+
+ if (ret)
+ pmd_update(vma->vm_mm, addr, pmdp);
+
+ return ret;
+}
+#endif
+
int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
@@ -347,6 +383,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
return young;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ int young;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ young = pmdp_test_and_clear_young(vma, address, pmdp);
+ if (young)
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+ return young;
+}
+
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ int set;
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
+ (unsigned long *)pmdp);
+ if (set) {
+ pmd_update(vma->vm_mm, address, pmdp);
+ /* need tlb flush only to serialize against gup-fast */
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ }
+}
+#endif
+
/**
* reserve_top_address - reserves a hole in the top of kernel address space
* @reserve - size of hole to reserve
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index a3250aa34086..410531d3c292 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -41,7 +41,7 @@ void __init x86_report_nx(void)
{
if (!cpu_has_nx) {
printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
- "missing in CPU or disabled in BIOS!\n");
+ "missing in CPU!\n");
} else {
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
if (disable_nx) {
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index a17dffd136c1..ae96e7b8051d 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -59,7 +59,6 @@ static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
static int __initdata num_memory_chunks; /* total number of memory chunks */
static u8 __initdata apicid_to_pxm[MAX_APICID];
-int numa_off __initdata;
int acpi_numa __initdata;
static __init void bad_srat(void)
@@ -92,6 +91,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
/* mark this node as "seen" in node bitmap */
BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
+ /* don't need to check apic_id here, because it is always 8 bits */
apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index a35cb9d8b060..603d285d1daa 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -134,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
}
apic_id = pa->apic_id;
+ if (apic_id >= MAX_LOCAL_APIC) {
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+ return;
+ }
apicid_to_node[apic_id] = node;
node_set(node, cpu_nodes_parsed);
acpi_numa = 1;
@@ -168,6 +172,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
else
apic_id = pa->apic_id;
+
+ if (apic_id >= MAX_LOCAL_APIC) {
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+ return;
+ }
+
apicid_to_node[apic_id] = node;
node_set(node, cpu_nodes_parsed);
acpi_numa = 1;
@@ -339,18 +349,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
void __init acpi_numa_arch_fixup(void) {}
-int __init acpi_get_nodes(struct bootnode *physnodes)
+#ifdef CONFIG_NUMA_EMU
+void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
+ unsigned long end)
{
int i;
- int ret = 0;
for_each_node_mask(i, nodes_parsed) {
- physnodes[ret].start = nodes[i].start;
- physnodes[ret].end = nodes[i].end;
- ret++;
+ cutoff_node(i, start, end);
+ physnodes[i].start = nodes[i].start;
+ physnodes[i].end = nodes[i].end;
}
- return ret;
}
+#endif /* CONFIG_NUMA_EMU */
/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
@@ -495,8 +506,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
{
int i, j;
- printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
- "topology.\n");
for (i = 0; i < num_nodes; i++) {
int nid, pxm;
@@ -516,6 +525,17 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
fake_apicid_to_node[j] == NUMA_NO_NODE)
fake_apicid_to_node[j] = i;
}
+
+ /*
+ * If there are apicid-to-node mappings for physical nodes that do not
+ * have a corresponding emulated node, it should default to a guaranteed
+ * value.
+ */
+ for (i = 0; i < MAX_LOCAL_APIC; i++)
+ if (apicid_to_node[i] != NUMA_NO_NODE &&
+ fake_apicid_to_node[i] == NUMA_NO_NODE)
+ fake_apicid_to_node[i] = 0;
+
for (i = 0; i < num_nodes; i++)
__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 12cdbb17ad18..6acc724d5d8f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -223,7 +223,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
static void __cpuinit calculate_tlb_offset(void)
{
- int cpu, node, nr_node_vecs;
+ int cpu, node, nr_node_vecs, idx = 0;
/*
* we are changing tlb_vector_offset for each CPU in runtime, but this
* will not cause inconsistency, as the write is atomic under X86. we
@@ -239,7 +239,7 @@ static void __cpuinit calculate_tlb_offset(void)
nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
for_each_online_node(node) {
- int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) *
+ int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) *
nr_node_vecs;
int cpu_offset = 0;
for_each_cpu(cpu, cpumask_of_node(node)) {
@@ -248,6 +248,7 @@ static void __cpuinit calculate_tlb_offset(void)
cpu_offset++;
cpu_offset = cpu_offset % nr_node_vecs;
}
+ idx++;
}
}