From 99bd0c0fc4b04da54cb311953ef9489931c19c63 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 19 Jun 2009 10:59:09 +0200 Subject: x86: Set cpu_llc_id on AMD CPUs This counts when building sched domains in case NUMA information is not available. ( See cpu_coregroup_mask() which uses llc_shared_map which in turn is created based on cpu_llc_id. ) Currently Linux builds domains as follows: (example from a dual socket quad-core system) CPU0 attaching sched-domain: domain 0: span 0-7 level CPU groups: 0 1 2 3 4 5 6 7 ... CPU7 attaching sched-domain: domain 0: span 0-7 level CPU groups: 7 0 1 2 3 4 5 6 Ever since that is borked for multi-core AMD CPU systems. This patch fixes that and now we get a proper: CPU0 attaching sched-domain: domain 0: span 0-3 level MC groups: 0 1 2 3 domain 1: span 0-7 level CPU groups: 0-3 4-7 ... CPU7 attaching sched-domain: domain 0: span 4-7 level MC groups: 7 4 5 6 domain 1: span 0-7 level CPU groups: 4-7 0-3 This allows scheduler to assign tasks to cores on different sockets (i.e. that don't share last level cache) for performance reasons. Signed-off-by: Andreas Herrmann LKML-Reference: <20090619085909.GJ5218@alberich.amd.com> Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e5b27d8f1b47..28e5f5956042 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -258,13 +258,15 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT unsigned bits; + int cpu = smp_processor_id(); bits = c->x86_coreid_bits; - /* Low order bits define the core id (index of core in socket) */ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); /* Convert the initial APIC ID into the socket ID */ c->phys_proc_id = c->initial_apicid >> bits; + /* use socket ID also for last level cache */ + per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; #endif } -- cgit v1.2.3-59-g8ed1b From 85ae87c1ad8e18a421e7448a99a42ecda183f29f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:23 +0900 Subject: percpu: fix too lazy vunmap cache flushing In pcpu_unmap(), flushing virtual cache on vunmap can't be delayed as the page is going to be returned to the page allocator. Only TLB flushing can be put off such that vmalloc code can handle it lazily. Fix it. [ Impact: fix subtle virtual cache flush bug ] Signed-off-by: Tejun Heo Cc: Nick Piggin Cc: Ingo Molnar --- mm/percpu.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index c0b2c1a76e81..d06f4748271e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -549,14 +549,14 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) * @chunk: chunk of interest * @page_start: page index of the first page to unmap * @page_end: page index of the last page to unmap + 1 - * @flush: whether to flush cache and tlb or not + * @flush_tlb: whether to flush tlb or not * * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. * If @flush is true, vcache is flushed before unmapping and tlb * after. */ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, - bool flush) + bool flush_tlb) { unsigned int last = num_possible_cpus() - 1; unsigned int cpu; @@ -569,9 +569,8 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, * the whole region at once rather than doing it for each cpu. * This could be an overkill but is more scalable. 
*/ - if (flush) - flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); + flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); for_each_possible_cpu(cpu) unmap_kernel_range_noflush( @@ -579,7 +578,7 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, (page_end - page_start) << PAGE_SHIFT); /* ditto as flush_cache_vunmap() */ - if (flush) + if (flush_tlb) flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), pcpu_chunk_addr(chunk, last, page_end)); } -- cgit v1.2.3-59-g8ed1b From c5806df9232d2a7f554b4839b57cac2e664fc256 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: fix duplicate free in setup_pcpu_remap() failure path In the failure path, setup_pcpu_remap() tries to free the area which has already been freed to make holes in the large page. Fix it. [ Impact: fix duplicate free in failure path ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 9c3f0823e6aa..dfbc7e6c64d4 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -228,7 +228,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) enomem: for_each_possible_cpu(cpu) if (pcpur_ptrs[cpu]) - free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); + free_bootmem(__pa(pcpur_ptrs[cpu]), pcpur_size); ret = -ENOMEM; out_free_ar: free_bootmem(__pa(pcpur_ptrs), ptrs_size); -- cgit v1.2.3-59-g8ed1b From 97c9bf0618cd40b05b4859c1f8a90d8ad97fefb2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: rename remap percpu first chunk allocator to lpage The "remap" allocator remaps large pages to build the first chunk; however, the name isn't very good because 4k allocator remaps too and the whole point of the remap allocator is using large page mapping. The allocator will be generalized and exported outside of x86, rename it to lpage before that happens. percpu_alloc kernel parameter is updated to accept both "remap" and "lpage" for lpage allocator. [ Impact: code cleanup, kernel parameter argument updated ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 50 +++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index dfbc7e6c64d4..8794c0c94d2c 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -124,7 +124,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, } /* - * Remap allocator + * Large page remap allocator * * This allocator uses PMD page as unit. A PMD page is allocated for * each cpu and each is remapped into vmalloc area using PMD mapping. @@ -137,20 +137,20 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, * better than only using 4k mappings while still being NUMA friendly. 
*/ #ifdef CONFIG_NEED_MULTIPLE_NODES -static size_t pcpur_size __initdata; -static void **pcpur_ptrs __initdata; +static size_t pcpul_size __initdata; +static void **pcpul_ptrs __initdata; -static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) +static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) { size_t off = (size_t)pageno << PAGE_SHIFT; - if (off >= pcpur_size) + if (off >= pcpul_size) return NULL; - return virt_to_page(pcpur_ptrs[cpu] + off); + return virt_to_page(pcpul_ptrs[cpu] + off); } -static ssize_t __init setup_pcpu_remap(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size) { static struct vm_struct vm; size_t ptrs_size, dyn_size; @@ -170,36 +170,36 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) * Currently supports only single page. Supporting multiple * pages won't be too difficult if it ever becomes necessary. */ - pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE); - if (pcpur_size > PMD_SIZE) { + if (pcpul_size > PMD_SIZE) { pr_warning("PERCPU: static data is larger than large page, " "can't use large page\n"); return -EINVAL; } - dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; + dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; /* allocate pointer array and alloc large pages */ - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); - pcpur_ptrs = alloc_bootmem(ptrs_size); + ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_ptrs[0])); + pcpul_ptrs = alloc_bootmem(ptrs_size); for_each_possible_cpu(cpu) { - pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); - if (!pcpur_ptrs[cpu]) + pcpul_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); + if (!pcpul_ptrs[cpu]) goto enomem; /* - * Only use pcpur_size bytes and give back the rest. + * Only use pcpul_size bytes and give back the rest. * * Ingo: The 2MB up-rounding bootmem is needed to make * sure the partial 2MB page is still fully RAM - it's * not well-specified to have a PAT-incompatible area * (unmapped RAM, device memory, etc.) in that hole. 
*/ - free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), - PMD_SIZE - pcpur_size); + free_bootmem(__pa(pcpul_ptrs[cpu] + pcpul_size), + PMD_SIZE - pcpul_size); - memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); + memcpy(pcpul_ptrs[cpu], __per_cpu_load, static_size); } /* allocate address and map */ @@ -212,7 +212,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) pmd = populate_extra_pmd((unsigned long)vm.addr + cpu * PMD_SIZE); - set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), + set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpul_ptrs[cpu])), PAGE_KERNEL_LARGE)); } @@ -220,22 +220,22 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) pr_info("PERCPU: Remapped at %p with large pages, static data " "%zu bytes\n", vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, + ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, PERCPU_FIRST_CHUNK_RESERVE, dyn_size, PMD_SIZE, vm.addr, NULL); goto out_free_ar; enomem: for_each_possible_cpu(cpu) - if (pcpur_ptrs[cpu]) - free_bootmem(__pa(pcpur_ptrs[cpu]), pcpur_size); + if (pcpul_ptrs[cpu]) + free_bootmem(__pa(pcpul_ptrs[cpu]), pcpul_size); ret = -ENOMEM; out_free_ar: - free_bootmem(__pa(pcpur_ptrs), ptrs_size); + free_bootmem(__pa(pcpul_ptrs), ptrs_size); return ret; } #else -static ssize_t __init setup_pcpu_remap(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size) { return -EINVAL; } @@ -367,7 +367,7 @@ void __init setup_per_cpu_areas(void) * of large page mappings. Please read comments on top of * each allocator for details. */ - ret = setup_pcpu_remap(static_size); + ret = setup_pcpu_lpage(static_size); if (ret < 0) ret = setup_pcpu_embed(static_size); if (ret < 0) -- cgit v1.2.3-59-g8ed1b From 0ff2587fd54bd6f66bc6914ada4eb77a7e819a5b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: prepare setup_pcpu_lpage() for pageattr fix Make the following changes in preparation of coming pageattr updates. * Define and use array of struct pcpul_ent instead of array of pointers. The only difference is ->cpu field which is set but unused yet. * Rename variables according to the above change. * Rename local variable vm to pcpul_vm and move it out of the function. [ Impact: no functional difference ] Signed-off-by: Tejun Heo Cc: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 58 ++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 8794c0c94d2c..7d38941e2b8c 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -137,8 +137,14 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, * better than only using 4k mappings while still being NUMA friendly. 
*/ #ifdef CONFIG_NEED_MULTIPLE_NODES +struct pcpul_ent { + unsigned int cpu; + void *ptr; +}; + static size_t pcpul_size __initdata; -static void **pcpul_ptrs __initdata; +static struct pcpul_ent *pcpul_map __initdata; +static struct vm_struct pcpul_vm; static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) { @@ -147,13 +153,12 @@ static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) if (off >= pcpul_size) return NULL; - return virt_to_page(pcpul_ptrs[cpu] + off); + return virt_to_page(pcpul_map[cpu].ptr + off); } static ssize_t __init setup_pcpu_lpage(size_t static_size) { - static struct vm_struct vm; - size_t ptrs_size, dyn_size; + size_t map_size, dyn_size; unsigned int cpu; ssize_t ret; @@ -180,12 +185,14 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; /* allocate pointer array and alloc large pages */ - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_ptrs[0])); - pcpul_ptrs = alloc_bootmem(ptrs_size); + map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); + pcpul_map = alloc_bootmem(map_size); for_each_possible_cpu(cpu) { - pcpul_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); - if (!pcpul_ptrs[cpu]) + pcpul_map[cpu].cpu = cpu; + pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, + PMD_SIZE); + if (!pcpul_map[cpu].ptr) goto enomem; /* @@ -196,42 +203,43 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) * not well-specified to have a PAT-incompatible area * (unmapped RAM, device memory, etc.) in that hole. */ - free_bootmem(__pa(pcpul_ptrs[cpu] + pcpul_size), + free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), PMD_SIZE - pcpul_size); - memcpy(pcpul_ptrs[cpu], __per_cpu_load, static_size); + memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); } /* allocate address and map */ - vm.flags = VM_ALLOC; - vm.size = num_possible_cpus() * PMD_SIZE; - vm_area_register_early(&vm, PMD_SIZE); + pcpul_vm.flags = VM_ALLOC; + pcpul_vm.size = num_possible_cpus() * PMD_SIZE; + vm_area_register_early(&pcpul_vm, PMD_SIZE); for_each_possible_cpu(cpu) { - pmd_t *pmd; + pmd_t *pmd, pmd_v; - pmd = populate_extra_pmd((unsigned long)vm.addr - + cpu * PMD_SIZE); - set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpul_ptrs[cpu])), - PAGE_KERNEL_LARGE)); + pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + + cpu * PMD_SIZE); + pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), + PAGE_KERNEL_LARGE); + set_pmd(pmd, pmd_v); } /* we're ready, commit */ pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", vm.addr, static_size); + "%zu bytes\n", pcpul_vm.addr, static_size); ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, vm.addr, NULL); - goto out_free_ar; + PMD_SIZE, pcpul_vm.addr, NULL); + goto out_free_map; enomem: for_each_possible_cpu(cpu) - if (pcpul_ptrs[cpu]) - free_bootmem(__pa(pcpul_ptrs[cpu]), pcpul_size); + if (pcpul_map[cpu].ptr) + free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpul_ptrs), ptrs_size); +out_free_map: + free_bootmem(__pa(pcpul_map), map_size); return ret; } #else -- cgit v1.2.3-59-g8ed1b From 992f4c1c2c1583cef3296ec4bf5205843a9a5f3d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: reorganize cpa_process_alias() Reorganize cpa_process_alias() so that new alias condition can be added easily. 
Jan Beulich spotted a problem in the original cleanup thread, which incorrectly assumed the two existing conditions were mutually exclusive. [ Impact: code reorganization ] Signed-off-by: Tejun Heo Cc: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar --- arch/x86/mm/pageattr.c | 50 +++++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3cfe9ced8a4c..2ab058b0947d 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -681,8 +681,9 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; - int ret = 0; - unsigned long temp_cpa_vaddr, vaddr; + unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); + unsigned long vaddr; + int ret; if (cpa->pfn >= max_pfn_mapped) return 0; @@ -706,42 +707,37 @@ static int cpa_process_alias(struct cpa_data *cpa) PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { alias_cpa = *cpa; - temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); - alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.vaddr = &laddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - ret = __change_page_attr_set_clr(&alias_cpa, 0); + if (ret) + return ret; } #ifdef CONFIG_X86_64 - if (ret) - return ret; /* - * No need to redo, when the primary call touched the high - * mapping already: - */ - if (within(vaddr, (unsigned long) _text, _brk_end)) - return 0; - - /* - * If the physical address is inside the kernel map, we need + * If the primary call didn't touch the high mapping already + * and the physical address is inside the kernel map, we need * to touch the high mapped kernel as well: */ - if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) - return 0; - - alias_cpa = *cpa; - temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; - alias_cpa.vaddr = &temp_cpa_vaddr; - alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + if (!within(vaddr, (unsigned long)_text, _brk_end) && + within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) { + unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + + __START_KERNEL_map - phys_base; + alias_cpa = *cpa; + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - /* - * The high mapping range is imprecise, so ignore the return value. - */ - __change_page_attr_set_clr(&alias_cpa, 0); + /* + * The high mapping range is imprecise, so ignore the + * return value. + */ + __change_page_attr_set_clr(&alias_cpa, 0); + } #endif - return ret; + + return 0; } static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) -- cgit v1.2.3-59-g8ed1b From e59a1bb2fdfb745c685f5b40ffbed126331d3223 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: fix pageattr handling for lpage percpu allocator and re-enable it The lpage allocator aliases a PMD page for each cpu and returns whatever is unused to the page allocator. When the pageattr of the recycled pages is changed, this makes the two aliases point to overlapping regions with different attributes, which isn't allowed and is known to cause subtle data corruption in certain cases. This can be handled in a similar manner to the x86_64 highmap alias. pageattr code should detect if the target pages have a PMD alias and split the PMD alias and synchronize the attributes.
pcpur allocator is updated to keep the allocated PMD pages map sorted in ascending address order and provide pcpu_lpage_remapped() function which binary searches the array to determine whether the given address is aliased and if so to which address. pageattr is updated to use pcpu_lpage_remapped() to detect the PMD alias and split it up as necessary from cpa_process_alias(). Jan Beulich spotted the original problem and incorrect usage of vaddr instead of laddr for lookup. With this, lpage percpu allocator should work correctly. Re-enable it. [ Impact: fix subtle lpage pageattr bug and re-enable lpage ] Signed-off-by: Tejun Heo Reported-by: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar --- arch/x86/include/asm/percpu.h | 10 ++++++ arch/x86/kernel/setup_percpu.c | 72 ++++++++++++++++++++++++++++++++++++------ arch/x86/mm/pageattr.c | 21 +++++++++++- 3 files changed, 93 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 02ecb30982a3..103f1ddb0d85 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -42,6 +42,7 @@ #else /* ...!ASSEMBLY */ +#include #include #ifdef CONFIG_SMP @@ -155,6 +156,15 @@ do { \ /* We can use this directly for local CPU (faster). */ DECLARE_PER_CPU(unsigned long, this_cpu_off); +#ifdef CONFIG_NEED_MULTIPLE_NODES +void *pcpu_lpage_remapped(void *kaddr); +#else +static inline void *pcpu_lpage_remapped(void *kaddr) +{ + return NULL; +} +#endif + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 7d38941e2b8c..bad2fd223114 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -142,8 +142,8 @@ struct pcpul_ent { void *ptr; }; -static size_t pcpul_size __initdata; -static struct pcpul_ent *pcpul_map __initdata; +static size_t pcpul_size; +static struct pcpul_ent *pcpul_map; static struct vm_struct pcpul_vm; static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) @@ -160,15 +160,14 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) { size_t map_size, dyn_size; unsigned int cpu; + int i, j; ssize_t ret; /* * If large page isn't supported, there's no benefit in doing * this. Also, on non-NUMA, embedding is better. - * - * NOTE: disabled for now. */ - if (true || !cpu_has_pse || !pcpu_need_numa()) + if (!cpu_has_pse || !pcpu_need_numa()) return -EINVAL; /* @@ -231,16 +230,71 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, PERCPU_FIRST_CHUNK_RESERVE, dyn_size, PMD_SIZE, pcpul_vm.addr, NULL); - goto out_free_map; + + /* sort pcpul_map array for pcpu_lpage_remapped() */ + for (i = 0; i < num_possible_cpus() - 1; i++) + for (j = i + 1; j < num_possible_cpus(); j++) + if (pcpul_map[i].ptr > pcpul_map[j].ptr) { + struct pcpul_ent tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + return ret; enomem: for_each_possible_cpu(cpu) if (pcpul_map[cpu].ptr) free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); - ret = -ENOMEM; -out_free_map: free_bootmem(__pa(pcpul_map), map_size); - return ret; + return -ENOMEM; +} + +/** + * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area + * @kaddr: the kernel address in question + * + * Determine whether @kaddr falls in the pcpul recycled area. This is + * used by pageattr to detect VM aliases and break up the pcpu PMD + * mapping such that the same physical page is not mapped under + * different attributes. 
+ * + * The recycled area is always at the tail of a partially used PMD + * page. + * + * RETURNS: + * Address of corresponding remapped pcpu address if match is found; + * otherwise, NULL. + */ +void *pcpu_lpage_remapped(void *kaddr) +{ + void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); + unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; + int left = 0, right = num_possible_cpus() - 1; + int pos; + + /* pcpul in use at all? */ + if (!pcpul_map) + return NULL; + + /* okay, perform binary search */ + while (left <= right) { + pos = (left + right) / 2; + + if (pcpul_map[pos].ptr < pmd_addr) + left = pos + 1; + else if (pcpul_map[pos].ptr > pmd_addr) + right = pos - 1; + else { + /* it shouldn't be in the area for the first chunk */ + WARN_ON(offset < pcpul_size); + + return pcpul_vm.addr + + pcpul_map[pos].cpu * PMD_SIZE + offset; + } + } + + return NULL; } #else static ssize_t __init setup_pcpu_lpage(size_t static_size) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2ab058b0947d..1b734d7a8966 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -682,7 +683,7 @@ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); - unsigned long vaddr; + unsigned long vaddr, remapped; int ret; if (cpa->pfn >= max_pfn_mapped) @@ -737,6 +738,24 @@ static int cpa_process_alias(struct cpa_data *cpa) } #endif + /* + * If the PMD page was partially used for per-cpu remapping, + * the recycled area needs to be split and modified. Because + * the area is always proper subset of a PMD page + * cpa->numpages is guaranteed to be 1 for these areas, so + * there's no need to loop over and check for further remaps. + */ + remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr); + if (remapped) { + WARN_ON(cpa->numpages > 1); + alias_cpa = *cpa; + alias_cpa.vaddr = &remapped; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + ret = __change_page_attr_set_clr(&alias_cpa, 0); + if (ret) + return ret; + } + return 0; } -- cgit v1.2.3-59-g8ed1b From fa8a7094ba1679b4b9b443e0ac9f5e046c79ee8d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: implement percpu_alloc kernel parameter According to Andi, it isn't clear whether lpage allocator is worth the trouble as there are many processors where PMD TLB is far scarcer than PTE TLB. The advantage or disadvantage probably depends on the actual size of percpu area and specific processor. As performance degradation due to TLB pressure tends to be highly workload specific and subtle, it is difficult to decide which way to go without more data. This patch implements percpu_alloc kernel parameter to allow selecting which first chunk allocator to use to ease debugging and testing. While at it, make sure all the failure paths report why something failed to help determining why certain allocator isn't working. Also, kill the "Great future plan" comment which had already been realized quite some time ago. 
[ Impact: allow explicit percpu first chunk allocator selection ] Signed-off-by: Tejun Heo Reported-by: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar --- Documentation/kernel-parameters.txt | 6 ++++ arch/x86/kernel/setup_percpu.c | 69 +++++++++++++++++++++++++++---------- mm/percpu.c | 13 ++++--- 3 files changed, 65 insertions(+), 23 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 08def8deb5f5..ecad946920d1 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1882,6 +1882,12 @@ and is between 256 and 4096 characters. It is defined in the file Format: { 0 | 1 } See arch/parisc/kernel/pdc_chassis.c + percpu_alloc= [X86] Select which percpu first chunk allocator to use. + Allowed values are one of "lpage", "embed" and "4k". + See comments in arch/x86/kernel/setup_percpu.c for + details on each allocator. This parameter is primarily + for debugging and performance comparison. + pf. [PARIDE] See Documentation/blockdev/paride.txt. diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index bad2fd223114..165ebd5ba83b 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -156,20 +156,23 @@ static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) return virt_to_page(pcpul_map[cpu].ptr + off); } -static ssize_t __init setup_pcpu_lpage(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { size_t map_size, dyn_size; unsigned int cpu; int i, j; ssize_t ret; - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, on non-NUMA, embedding is better. - */ - if (!cpu_has_pse || !pcpu_need_numa()) + /* on non-NUMA, embedding is better */ + if (!chosen && !pcpu_need_numa()) return -EINVAL; + /* need PSE */ + if (!cpu_has_pse) { + pr_warning("PERCPU: lpage allocator requires PSE\n"); + return -EINVAL; + } + /* * Currently supports only single page. Supporting multiple * pages won't be too difficult if it ever becomes necessary. @@ -191,8 +194,11 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) pcpul_map[cpu].cpu = cpu; pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); - if (!pcpul_map[cpu].ptr) + if (!pcpul_map[cpu].ptr) { + pr_warning("PERCPU: failed to allocate large page " + "for cpu%u\n", cpu); goto enomem; + } /* * Only use pcpul_size bytes and give back the rest. @@ -297,7 +303,7 @@ void *pcpu_lpage_remapped(void *kaddr) return NULL; } #else -static ssize_t __init setup_pcpu_lpage(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { return -EINVAL; } @@ -311,7 +317,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) * mapping so that it can use PMD mapping without additional TLB * pressure. */ -static ssize_t __init setup_pcpu_embed(size_t static_size) +static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; @@ -320,7 +326,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) * this. Also, embedding allocation doesn't play well with * NUMA. 
*/ - if (!cpu_has_pse || pcpu_need_numa()) + if (!chosen && (!cpu_has_pse || pcpu_need_numa())) return -EINVAL; return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, @@ -370,8 +376,11 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) void *ptr; ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) + if (!ptr) { + pr_warning("PERCPU: failed to allocate " + "4k page for cpu%u\n", cpu); goto enomem; + } memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); pcpu4k_pages[j++] = virt_to_page(ptr); @@ -395,6 +404,16 @@ out_free_ar: return ret; } +/* for explicit first chunk allocator selection */ +static char pcpu_chosen_alloc[16] __initdata; + +static int __init percpu_alloc_setup(char *str) +{ + strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1); + return 0; +} +early_param("percpu_alloc", percpu_alloc_setup); + static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -408,11 +427,6 @@ static inline void setup_percpu_segment(int cpu) #endif } -/* - * Great future plan: - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. - * Always point %gs to its beginning - */ void __init setup_per_cpu_areas(void) { size_t static_size = __per_cpu_end - __per_cpu_start; @@ -429,9 +443,26 @@ void __init setup_per_cpu_areas(void) * of large page mappings. Please read comments on top of * each allocator for details. */ - ret = setup_pcpu_lpage(static_size); - if (ret < 0) - ret = setup_pcpu_embed(static_size); + ret = -EINVAL; + if (strlen(pcpu_chosen_alloc)) { + if (strcmp(pcpu_chosen_alloc, "4k")) { + if (!strcmp(pcpu_chosen_alloc, "lpage")) + ret = setup_pcpu_lpage(static_size, true); + else if (!strcmp(pcpu_chosen_alloc, "embed")) + ret = setup_pcpu_embed(static_size, true); + else + pr_warning("PERCPU: unknown allocator %s " + "specified\n", pcpu_chosen_alloc); + if (ret < 0) + pr_warning("PERCPU: %s allocator failed (%zd), " + "falling back to 4k\n", + pcpu_chosen_alloc, ret); + } + } else { + ret = setup_pcpu_lpage(static_size, false); + if (ret < 0) + ret = setup_pcpu_embed(static_size, false); + } if (ret < 0) ret = setup_pcpu_4k(static_size); if (ret < 0) diff --git a/mm/percpu.c b/mm/percpu.c index d06f4748271e..b70f2acd8853 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1233,6 +1233,7 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, ssize_t dyn_size, ssize_t unit_size) { + size_t chunk_size; unsigned int cpu; /* determine parameters and allocate */ @@ -1247,11 +1248,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, } else pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - pcpue_ptr = __alloc_bootmem_nopanic( - num_possible_cpus() * pcpue_unit_size, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); - if (!pcpue_ptr) + chunk_size = pcpue_unit_size * num_possible_cpus(); + + pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, + __pa(MAX_DMA_ADDRESS)); + if (!pcpue_ptr) { + pr_warning("PERCPU: failed to allocate %zu bytes for " + "embedding\n", chunk_size); return -ENOMEM; + } /* return the leftover and copy */ for_each_possible_cpu(cpu) { -- cgit v1.2.3-59-g8ed1b From 0017c869ddcb73069905d09f9e98e68627466237 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: ensure percpu lpage doesn't consume too much vmalloc space On extreme configuration (e.g. 
32bit 32-way NUMA machine), lpage percpu first chunk allocator can consume too much of vmalloc space. Make it fall back to 4k allocator if the consumption goes over 20%. [ Impact: add sanity check for lpage percpu first chunk allocator ] Signed-off-by: Tejun Heo Reported-by: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 165ebd5ba83b..29a3eef7cf4a 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -163,9 +163,21 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) int i, j; ssize_t ret; - /* on non-NUMA, embedding is better */ - if (!chosen && !pcpu_need_numa()) - return -EINVAL; + if (!chosen) { + size_t vm_size = VMALLOC_END - VMALLOC_START; + size_t tot_size = num_possible_cpus() * PMD_SIZE; + + /* on non-NUMA, embedding is better */ + if (!pcpu_need_numa()) + return -EINVAL; + + /* don't consume more than 20% of vmalloc area */ + if (tot_size > vm_size / 5) { + pr_info("PERCPU: too large chunk size %zuMB for " + "large page remap\n", tot_size >> 20); + return -EINVAL; + } + } /* need PSE */ if (!cpu_has_pse) { -- cgit v1.2.3-59-g8ed1b From 854c879f5abf309ebd378bea1ee41acf4ddf7194 Mon Sep 17 00:00:00 2001 From: Pekka J Enberg Date: Mon, 22 Jun 2009 17:39:41 +0300 Subject: x86: Move init_gbpages() to setup_arch() The init_gbpages() function is conditionally called from init_memory_mapping() function. There are two call-sites where this 'after_bootmem' condition can be true: setup_arch() and mem_init() via pci_iommu_alloc(). Therefore, it's safe to move the call to init_gbpages() to setup_arch() as it's always called before mem_init(). This removes an after_bootmem use - paving the way to remove all uses of that state variable. Signed-off-by: Pekka Enberg Acked-by: Yinghai Lu LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 16 ++++++++++++++++ arch/x86/mm/init.c | 17 ----------------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index be5ae80f897f..de2cab132844 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -289,6 +289,20 @@ void * __init extend_brk(size_t size, size_t align) return ret; } +#ifdef CONFIG_X86_64 +static void __init init_gbpages(void) +{ + if (direct_gbpages && cpu_has_gbpages) + printk(KERN_INFO "Using GB pages for direct mapping\n"); + else + direct_gbpages = 0; +} +#else +static inline void init_gbpages(void) +{ +} +#endif + static void __init reserve_brk(void) { if (_brk_end > _brk_start) @@ -871,6 +885,8 @@ void __init setup_arch(char **cmdline_p) reserve_brk(); + init_gbpages(); + /* max_pfn_mapped is updated here */ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< Date: Tue, 23 Jun 2009 12:40:54 +0900 Subject: x86, mce: Fix mce resume on 32bit Calling mcheck_init() on resume is required only with CONFIG_X86_OLD_MCE=y. Signed-off-by: Hidetoshi Seto Acked-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/power/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index d277ef1eea51..b3d20b9cac63 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -244,7 +244,7 @@ static void __restore_processor_state(struct saved_context *ctxt) do_fpu_end(); mtrr_ap_init(); -#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_OLD_MCE mcheck_init(&boot_cpu_data); #endif } -- cgit v1.2.3-59-g8ed1b From 9c26f52b900f7207135bafc8789e1a4f5d43e096 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 24 Jun 2009 09:41:59 -0500 Subject: x86: Fix uv bau sending buffer initialization The initialization of the UV Broadcast Assist Unit's sending buffers was making an invalid assumption about the initialization of an MMR that defines its address. The BIOS will not be providing that MMR. So uv_activation_descriptor_init() should unconditionally set it. Tested on UV simulator. Signed-off-by: Cliff Wickman Cc: # for v2.6.30.x LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 124d40c575df..8ccabb8a2f6a 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -711,7 +711,6 @@ uv_activation_descriptor_init(int node, int pnode) unsigned long pa; unsigned long m; unsigned long n; - unsigned long mmr_image; struct bau_desc *adp; struct bau_desc *ad2; @@ -727,12 +726,8 @@ uv_activation_descriptor_init(int node, int pnode) n = pa >> uv_nshift; m = pa & uv_mmask; - mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE); - if (mmr_image) { - uv_write_global_mmr64(pnode, (unsigned long) - UVH_LB_BAU_SB_DESCRIPTOR_BASE, - (n << UV_DESC_BASE_PNODE_SHIFT | m)); - } + uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, + (n << UV_DESC_BASE_PNODE_SHIFT | m)); /* * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each -- cgit v1.2.3-59-g8ed1b From 5211a242d0cbdded372aee59da18f80552b0a80a Mon Sep 17 00:00:00 2001 From: Kurt Garloff Date: Wed, 24 Jun 2009 14:32:11 -0700 Subject: x86: Add sysctl to allow panic on IOCK NMI error This patch introduces a new sysctl: /proc/sys/kernel/panic_on_io_nmi which defaults to 0 (off). When enabled, the kernel panics when it receives an NMI caused by an IO error. An NMI triggered by an IO error indicates a serious system condition, which could result in IO data corruption. Rather than continuing, panicking and dumping might be a better choice, so one can figure out what's causing the IO error. This could be especially important to companies running IO-intensive applications where corruption must be avoided, e.g. a bank's databases. [ SuSE has been shipping it for a while, it was done at the request of a large database vendor, for their users. ] Signed-off-by: Kurt Garloff Signed-off-by: Roberto Angelino Signed-off-by: Greg Kroah-Hartman Cc: "Eric W.
Biederman" LKML-Reference: <20090624213211.GA11291@kroah.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 1 + arch/x86/kernel/traps.c | 3 +++ include/linux/kernel.h | 1 + kernel/sysctl.c | 8 ++++++++ 4 files changed, 13 insertions(+) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 95ea5fa7d444..c8405718a4c3 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -22,6 +22,7 @@ #include "dumpstack.h" int panic_on_unrecovered_nmi; +int panic_on_io_nmi; unsigned int code_bytes = 64; int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a0f48f5671c0..5204332f475d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -346,6 +346,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); + if (panic_on_io_nmi) + panic("NMI IOCK error: Not continuing"); + /* Re-enable the IOCK line, wait for a few seconds */ reason = (reason & 0xf) | 8; outb(reason, 0x61); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index fac104e7186a..d6320a3e8def 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -303,6 +303,7 @@ extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in extern int panic_timeout; extern int panic_on_oops; extern int panic_on_unrecovered_nmi; +extern int panic_on_io_nmi; extern const char *print_tainted(void); extern void add_taint(unsigned flag); extern int test_taint(unsigned flag); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 62e4ff9968b5..fba42eda8de2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -743,6 +743,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "panic_on_io_nmi", + .data = &panic_on_io_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = KERN_BOOTLOADER_TYPE, .procname = "bootloader_type", -- cgit v1.2.3-59-g8ed1b From 5be6066a7f8d917db347d94f1b359b9b70dcb572 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 24 Jun 2009 09:21:10 +0900 Subject: x86, mce: percpu mcheck_timer should be pinned If CONFIG_NO_HZ + CONFIG_SMP, timer added via add_timer() might be migrated on other cpu. Use add_timer_on() instead. Avoids the following failure: Maciej Rutecki wrote: > > After normal boot I try: > > > > echo 1 > /sys/devices/system/machinecheck/machinecheck0/check_interval > > > > I found this in dmesg: > > > > [ 141.704025] ------------[ cut here ]------------ > > [ 141.704039] WARNING: at arch/x86/kernel/cpu/mcheck/mce.c:1102 > > mcheck_timer+0xf5/0x100() Reported-by: Maciej Rutecki Signed-off-by: Hidetoshi Seto Tested-by: Maciej Rutecki Acked-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 284d1de968bc..af425b83202b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1117,7 +1117,7 @@ static void mcheck_timer(unsigned long data) *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); t->expires = jiffies + *n; - add_timer(t); + add_timer_on(t, smp_processor_id()); } static void mce_do_trigger(struct work_struct *work) @@ -1321,7 +1321,7 @@ static void mce_init_timer(void) return; setup_timer(t, mcheck_timer, smp_processor_id()); t->expires = round_jiffies(jiffies + *n); - add_timer(t); + add_timer_on(t, smp_processor_id()); } /* -- cgit v1.2.3-59-g8ed1b From 22f4319d6bc0155e6c0ae560729baa6c09dc09e7 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Thu, 25 Jun 2009 16:20:48 -0400 Subject: x86, setup: Fix typo "CONFIG_x86_64" in CONFIG_X86_64 was misspelled (wrong case), which caused the x86-64 kernel to advertise itself as more relocatable than it really is. This could in theory cause boot failures once bootloaders start support the new relocation fields. Signed-off-by: Robert P. J. Day Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/boot.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 418e632d4a80..6f02b9a53848 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -16,7 +16,7 @@ & ~(CONFIG_PHYSICAL_ALIGN - 1)) /* Minimum kernel alignment, as a power of two */ -#ifdef CONFIG_x86_64 +#ifdef CONFIG_X86_64 #define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT #else #define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT+1) -- cgit v1.2.3-59-g8ed1b From 658dbfeb5e7ab35d440c665d643a6285e43fddcd Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 25 Jun 2009 15:16:06 -0700 Subject: x86, setup: correct include file in needs , not in order to resolve PMD_SHIFT. Also, correct a +1 which really should be + THREAD_ORDER. This is a build error which was masked by a typoed #ifdef. Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/boot.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 6f02b9a53848..7a1065958ba9 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -8,7 +8,7 @@ #ifdef __KERNEL__ -#include +#include /* Physical address where kernel should be loaded. */ #define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ @@ -19,7 +19,7 @@ #ifdef CONFIG_X86_64 #define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT #else -#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT+1) +#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_ORDER) #endif #define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) -- cgit v1.2.3-59-g8ed1b From e888d7facd1f1460a638151036d15b6cfb3ccc74 Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Thu, 25 Jun 2009 16:44:31 -0700 Subject: x86, delay: tsc based udelay should have rdtsc_barrier delay_tsc needs rdtsc_barrier to provide proper delay. Output from a test driver using hpet to cross check delay provided by udelay(). 
Before: [ 86.794363] Expected delay 5us actual 4679ns [ 87.154362] Expected delay 5us actual 698ns [ 87.514162] Expected delay 5us actual 4539ns [ 88.653716] Expected delay 5us actual 4539ns [ 94.664106] Expected delay 10us actual 9638ns [ 95.049351] Expected delay 10us actual 10126ns [ 95.416110] Expected delay 10us actual 9568ns [ 95.799216] Expected delay 10us actual 9638ns [ 103.624104] Expected delay 10us actual 9707ns [ 104.020619] Expected delay 10us actual 768ns [ 104.419951] Expected delay 10us actual 9707ns After: [ 50.983320] Expected delay 5us actual 5587ns [ 51.261807] Expected delay 5us actual 5587ns [ 51.565715] Expected delay 5us actual 5657ns [ 51.861171] Expected delay 5us actual 5587ns [ 52.164704] Expected delay 5us actual 5726ns [ 52.487457] Expected delay 5us actual 5657ns [ 52.789338] Expected delay 5us actual 5726ns [ 57.119680] Expected delay 10us actual 10755ns [ 57.893997] Expected delay 10us actual 10615ns [ 58.261287] Expected delay 10us actual 10755ns [ 58.620505] Expected delay 10us actual 10825ns [ 58.941035] Expected delay 10us actual 10755ns [ 59.320903] Expected delay 10us actual 10615ns [ 61.306311] Expected delay 10us actual 10755ns [ 61.520542] Expected delay 10us actual 10615ns Signed-off-by: Venkatesh Pallipadi Signed-off-by: H. Peter Anvin --- arch/x86/lib/delay.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index f4568605d7d5..ff485d361182 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -55,8 +55,10 @@ static void delay_tsc(unsigned long loops) preempt_disable(); cpu = smp_processor_id(); + rdtsc_barrier(); rdtscl(bclock); for (;;) { + rdtsc_barrier(); rdtscl(now); if ((now - bclock) >= loops) break; @@ -78,6 +80,7 @@ static void delay_tsc(unsigned long loops) if (unlikely(cpu != smp_processor_id())) { loops -= (now - bclock); cpu = smp_processor_id(); + rdtsc_barrier(); rdtscl(bclock); } } -- cgit v1.2.3-59-g8ed1b
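
A stand-alone illustration of the ordering discipline the delay_tsc() fix above enforces: every TSC read used for timing is preceded by a barrier (rdtsc_barrier() in the kernel, which roughly corresponds to an LFENCE/MFENCE), so the CPU cannot execute the read early and under-report the elapsed delay. The sketch below shows the same fence-before-rdtsc pattern in user space using the compiler's x86 intrinsics; the function name and the cycle count are arbitrary examples and are not taken from the patches.

#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>

/*
 * Spin for roughly 'cycles' TSC ticks. Each TSC read is preceded by an
 * LFENCE so it cannot be hoisted ahead of the surrounding work -- the
 * same concern the delay_tsc() patch addresses with rdtsc_barrier().
 */
static void delay_cycles(uint64_t cycles)
{
	uint64_t start, now;

	_mm_lfence();			/* order the initial read */
	start = __rdtsc();
	do {
		_mm_lfence();		/* ...and every read in the loop */
		now = __rdtsc();
	} while (now - start < cycles);
}

int main(void)
{
	delay_cycles(1000000);		/* ~1M TSC ticks; wall time depends on TSC rate */
	puts("delay done");
	return 0;
}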