From d7731c0ff69dc3f18ea020257e627dae4d214fdb Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] i386: i386 rename X86_FEATURE_DTES to X86_FEATURE_DS

Here is a patch (used by perfmon2) that renames X86_FEATURE_DTES to
X86_FEATURE_DS to match Intel's documentation for the Debug Store save
area on i386. The patch also adds cpu_has_ds.

- rename X86_FEATURE_DTES to X86_FEATURE_DS to match documentation
- adds cpu_has_ds to test for X86_FEATURE_DS

Signed-off-by: stephane eranian
Signed-off-by: Andi Kleen
Cc: Andi Kleen
Signed-off-by: Andrew Morton
---
 include/asm-i386/cpufeature.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h
index d314ebb3d59e..69ce35049a07 100644
--- a/include/asm-i386/cpufeature.h
+++ b/include/asm-i386/cpufeature.h
@@ -31,7 +31,7 @@
 #define X86_FEATURE_PSE36	(0*32+17) /* 36-bit PSEs */
 #define X86_FEATURE_PN		(0*32+18) /* Processor serial number */
 #define X86_FEATURE_CLFLSH	(0*32+19) /* Supports the CLFLUSH instruction */
-#define X86_FEATURE_DTES	(0*32+21) /* Debug Trace Store */
+#define X86_FEATURE_DS		(0*32+21) /* Debug Store */
 #define X86_FEATURE_ACPI	(0*32+22) /* ACPI via MSR */
 #define X86_FEATURE_MMX		(0*32+23) /* Multimedia Extensions */
 #define X86_FEATURE_FXSR	(0*32+24) /* FXSAVE and FXRSTOR instructions (fast save and restore */
@@ -134,6 +134,7 @@
 #define cpu_has_phe_enabled	boot_cpu_has(X86_FEATURE_PHE_EN)
 #define cpu_has_pmm		boot_cpu_has(X86_FEATURE_PMM)
 #define cpu_has_pmm_enabled	boot_cpu_has(X86_FEATURE_PMM_EN)
+#define cpu_has_ds		boot_cpu_has(X86_FEATURE_DS)

 #endif /* __ASM_I386_CPUFEATURE_H */
--
cgit v1.2.3-59-g8ed1b
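The X86_FEATURE_* constants encode (word*32+bit) positions in the kernel's
cached capability bitmap; word 0 mirrors CPUID leaf 1's EDX register, so
X86_FEATURE_DS is simply EDX bit 21. A minimal user-space sketch (not part
of the patch) that reads the same bit with GCC's <cpuid.h>:

	/* dscheck.c -- illustrative only: CPUID leaf 1, EDX bit 21 is the
	 * "DS" (Debug Store) flag that X86_FEATURE_DS caches.  x86 only. */
	#include <stdio.h>
	#include <cpuid.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
			fprintf(stderr, "CPUID leaf 1 not supported\n");
			return 1;
		}
		printf("Debug Store (DS): %s\n",
		       (edx >> 21) & 1 ? "present" : "absent");
		return 0;
	}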
From 42ed458aa51337357d7632c64aed4528f923e829 Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] i386: i386 add X86_FEATURE_PEBS and detection

Here is a patch (used by perfmon2) to detect the presence of the Precise
Event Based Sampling (PEBS) feature for i386. The patch also adds the
cpu_has_pebs macro.

- adds X86_FEATURE_PEBS
- adds cpu_has_pebs to test for X86_FEATURE_PEBS

Signed-off-by: stephane eranian
Signed-off-by: Andi Kleen
Cc: Andi Kleen
Signed-off-by: Andrew Morton
---
 arch/i386/kernel/cpu/intel.c  | 8 +++++++-
 include/asm-i386/cpufeature.h | 2 ++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 94a95aa5227e..798c2f617e87 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -195,8 +195,14 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
 	    (c->x86 == 0x6 && c->x86_model >= 0x0e))
 		set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
-}

+	if (cpu_has_ds) {
+		unsigned int l1, l2;
+		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+		if (!(l1 & (1<<12)))
+			set_bit(X86_FEATURE_PEBS, c->x86_capability);
+	}
+}

 static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
 {
diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h
index 69ce35049a07..231672558c1f 100644
--- a/include/asm-i386/cpufeature.h
+++ b/include/asm-i386/cpufeature.h
@@ -73,6 +73,7 @@
 #define X86_FEATURE_UP		(3*32+ 9) /* smp kernel running on up */
 #define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* FXSAVE leaks FOP/FIP/FOP */
 #define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
+#define X86_FEATURE_PEBS	(3*32+12) /* Precise-Event Based Sampling */

 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */
@@ -135,6 +136,7 @@
 #define cpu_has_pmm		boot_cpu_has(X86_FEATURE_PMM)
 #define cpu_has_pmm_enabled	boot_cpu_has(X86_FEATURE_PMM_EN)
 #define cpu_has_ds		boot_cpu_has(X86_FEATURE_DS)
+#define cpu_has_pebs		boot_cpu_has(X86_FEATURE_PEBS)

 #endif /* __ASM_I386_CPUFEATURE_H */
--
cgit v1.2.3-59-g8ed1b
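The detection keys off IA32_MISC_ENABLE bit 12, which Intel defines as
"PEBS unavailable" -- hence the inverted test before the feature bit is
set. The same check can be reproduced from user space through the msr
driver; this sketch is an assumption, not part of the patch, and the
0x1a0 address for IA32_MISC_ENABLE comes from Intel's manuals rather than
from the diff above:

	/* pebscheck.c -- needs 'modprobe msr' and root.  Reads
	 * IA32_MISC_ENABLE (0x1a0 per Intel docs; the address is not
	 * defined in this patch) and applies the kernel's bit-12 test. */
	#include <stdio.h>
	#include <stdint.h>
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		uint64_t val;
		int fd = open("/dev/cpu/0/msr", O_RDONLY);

		if (fd < 0 || pread(fd, &val, sizeof(val), 0x1a0) != sizeof(val)) {
			perror("rdmsr");
			return 1;
		}
		close(fd);
		/* bit 12 set means "PEBS unavailable" */
		printf("PEBS: %s\n", (val & (1 << 12)) ? "unavailable" : "available");
		return 0;
	}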
From e5e3a0428968dcc1f9318ce1c941a918e99f8b84 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] i386: remove default_ldt, and simplify ldt-setting.

This patch removes the default_ldt[] array, as it has been unused since
iBCS stopped being supported. This means it is now possible to actually
set an empty LDT segment.

In order to deal with this, the set_ldt_desc/load_LDT pair has been
replaced with a single set_ldt() operation which is responsible for both
setting up the LDT descriptor in the GDT, and reloading the LDT register.
If there are no LDT entries, the LDT register is loaded with a NULL
descriptor.

Signed-off-by: Jeremy Fitzhardinge
Signed-off-by: Andi Kleen
Cc: Andi Kleen
Acked-by: Zachary Amsden
Signed-off-by: Andrew Morton
---
 arch/i386/kernel/ldt.c         |  4 +---
 arch/i386/kernel/traps.c       |  3 ---
 include/asm-i386/desc.h        | 50 ++++++++++++++++--------------------
 include/asm-i386/mmu_context.h |  4 ++--
 4 files changed, 22 insertions(+), 39 deletions(-)

diff --git a/arch/i386/kernel/ldt.c b/arch/i386/kernel/ldt.c
index 445211eb2d57..b410e5fb034f 100644
--- a/arch/i386/kernel/ldt.c
+++ b/arch/i386/kernel/ldt.c
@@ -160,16 +160,14 @@ static int read_default_ldt(void __user * ptr, unsigned long bytecount)
 {
 	int err;
 	unsigned long size;
-	void *address;

 	err = 0;
-	address = &default_ldt[0];
 	size = 5*sizeof(struct desc_struct);
 	if (size > bytecount)
 		size = bytecount;

 	err = size;
-	if (copy_to_user(ptr, address, size))
+	if (clear_user(ptr, size))
 		err = -EFAULT;

 	return err;
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 48ebfab661b7..56655ea8d98f 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -61,9 +61,6 @@ int panic_on_unrecovered_nmi;

 asmlinkage int system_call(void);

-struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
-		{ 0, 0 }, { 0, 0 } };
-
 /* Do we ignore FPU interrupts ? */
 char ignore_fpu_irq = 0;

diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h
index 5874ef119ffd..a0398f780ca1 100644
--- a/include/asm-i386/desc.h
+++ b/include/asm-i386/desc.h
@@ -33,11 +33,6 @@ static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
 	return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
 }

-/*
- * This is the ldt that every process will get unless we need
- * something other than this.
- */
-extern struct desc_struct default_ldt[];
 extern struct desc_struct idt_table[];
 extern void set_intr_gate(unsigned int irq, void * addr);

@@ -65,7 +60,6 @@ static inline void pack_gate(__u32 *a, __u32 *b,
 #define DESCTYPE_S	0x10	/* !system */

 #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
-#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))

 #define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
 #define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
@@ -115,13 +109,20 @@ static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const vo
 	write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
 }

-static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entries)
+static inline void set_ldt(void *addr, unsigned int entries)
 {
-	__u32 a, b;
-	pack_descriptor(&a, &b, (unsigned long)addr,
-			entries * sizeof(struct desc_struct) - 1,
-			DESCTYPE_LDT, 0);
-	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
+	if (likely(entries == 0))
+		__asm__ __volatile__("lldt %w0"::"q" (0));
+	else {
+		unsigned cpu = smp_processor_id();
+		__u32 a, b;
+
+		pack_descriptor(&a, &b, (unsigned long)addr,
+				entries * sizeof(struct desc_struct) - 1,
+				DESCTYPE_LDT, 0);
+		write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
+		__asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
+	}
 }

 #define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
@@ -153,35 +154,22 @@ static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entri

 static inline void clear_LDT(void)
 {
-	int cpu = get_cpu();
-
-	set_ldt_desc(cpu, &default_ldt[0], 5);
-	load_LDT_desc();
-	put_cpu();
+	set_ldt(NULL, 0);
 }

 /*
  * load one particular LDT into the current CPU
  */
-static inline void load_LDT_nolock(mm_context_t *pc, int cpu)
+static inline void load_LDT_nolock(mm_context_t *pc)
 {
-	void *segments = pc->ldt;
-	int count = pc->size;
-
-	if (likely(!count)) {
-		segments = &default_ldt[0];
-		count = 5;
-	}
-
-	set_ldt_desc(cpu, segments, count);
-	load_LDT_desc();
+	set_ldt(pc->ldt, pc->size);
 }

 static inline void load_LDT(mm_context_t *pc)
 {
-	int cpu = get_cpu();
-	load_LDT_nolock(pc, cpu);
-	put_cpu();
+	preempt_disable();
+	load_LDT_nolock(pc);
+	preempt_enable();
 }

 static inline unsigned long get_desc_base(unsigned long *desc)
diff --git a/include/asm-i386/mmu_context.h b/include/asm-i386/mmu_context.h
index 62b7bf184094..1b1495372c4d 100644
--- a/include/asm-i386/mmu_context.h
+++ b/include/asm-i386/mmu_context.h
@@ -44,7 +44,7 @@ static inline void switch_mm(struct mm_struct *prev,
 		 * load the LDT, if the LDT is different:
 		 */
 		if (unlikely(prev->context.ldt != next->context.ldt))
-			load_LDT_nolock(&next->context, cpu);
+			load_LDT_nolock(&next->context);
 	}
 #ifdef CONFIG_SMP
 	else {
@@ -56,7 +56,7 @@ static inline void switch_mm(struct mm_struct *prev,
 			 * tlb flush IPI delivery. We must reload %cr3.
 			 */
 			load_cr3(next->pgd);
-			load_LDT_nolock(&next->context, cpu);
+			load_LDT_nolock(&next->context);
 		}
 	}
 #endif
--
cgit v1.2.3-59-g8ed1b
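A user-visible side effect of the ldt.c change: a process that never
installed an LDT now reads back zeroed bytes (clear_user) instead of a
copy of the old five-entry default_ldt. A small sketch to observe this
via modify_ldt(2) -- function code 0 ("read") is standard; the rest is
illustrative:

	/* ldtread.c -- dump the first LDT bytes of the current process. */
	#include <stdio.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		unsigned char buf[40];	/* 5 descriptors x 8 bytes */
		long i, n;

		memset(buf, 0xff, sizeof(buf));
		n = syscall(SYS_modify_ldt, 0, buf, sizeof(buf)); /* 0 == read */
		if (n < 0) {
			perror("modify_ldt");
			return 1;
		}
		printf("read %ld bytes:", n);
		for (i = 0; i < n; i++)
			printf(" %02x", buf[i]);
		printf("\n");
		return 0;
	}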
From bb81a09e55eaf7e5f798468ab971469b6f66a259 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] x86: all cpu backtrace

When a spinlock lockup occurs, arrange for the NMI code to emit an
all-cpu backtrace, so we get to see which CPU is holding the lock, and
where.

Cc: Andi Kleen
Cc: Ingo Molnar
Cc: Badari Pulavarty
Signed-off-by: Andrew Morton
Signed-off-by: Andi Kleen
---
 arch/i386/kernel/nmi.c   | 26 ++++++++++++++++++++++++++
 arch/x86_64/kernel/nmi.c | 29 ++++++++++++++++++++++++++++-
 include/asm-i386/nmi.h   |  8 ++++++++
 include/asm-x86_64/nmi.h |  3 +++
 include/linux/nmi.h      |  5 +++++
 lib/spinlock_debug.c     |  4 ++++
 6 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index eaafe233a5da..171194ccb7bc 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -42,6 +43,8 @@ int nmi_watchdog_enabled;
 static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
 static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);

+static cpumask_t backtrace_mask = CPU_MASK_NONE;
+
 /* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
  * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
  */
@@ -907,6 +910,16 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 		touched = 1;
 	}

+	if (cpu_isset(cpu, backtrace_mask)) {
+		static DEFINE_SPINLOCK(lock);	/* Serialise the printks */
+
+		spin_lock(&lock);
+		printk("NMI backtrace for cpu %d\n", cpu);
+		dump_stack();
+		spin_unlock(&lock);
+		cpu_clear(cpu, backtrace_mask);
+	}
+
 	sum = per_cpu(irq_stat, cpu).apic_timer_irqs;

 	/* if the apic timer isn't firing, this cpu isn't doing much */
@@ -1033,6 +1046,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,

 #endif

+void __trigger_all_cpu_backtrace(void)
+{
+	int i;
+
+	backtrace_mask = cpu_online_map;
+	/* Wait for up to 10 seconds for all CPUs to do the backtrace */
+	for (i = 0; i < 10 * 1000; i++) {
+		if (cpus_empty(backtrace_mask))
+			break;
+		mdelay(1);
+	}
+}
+
 EXPORT_SYMBOL(nmi_active);
 EXPORT_SYMBOL(nmi_watchdog);
 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 7af9cb3e2d99..27e95e7922c1 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -12,14 +12,15 @@
  * Mikael Pettersson	: PM converted to driver model. Disable/enable API.
  */

+#include
 #include
 #include
 #include
 #include
 #include
-#include
 #include
 #include
+#include

 #include
 #include
@@ -41,6 +42,8 @@ int panic_on_unrecovered_nmi;
 static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner);
 static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[2]);

+static cpumask_t backtrace_mask = CPU_MASK_NONE;
+
 /* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
  * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
  */
@@ -782,6 +785,7 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 {
 	int sum;
 	int touched = 0;
+	int cpu = smp_processor_id();
 	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
 	u64 dummy;
 	int rc=0;
@@ -799,6 +803,16 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 		touched = 1;
 	}

+	if (cpu_isset(cpu, backtrace_mask)) {
+		static DEFINE_SPINLOCK(lock);	/* Serialise the printks */
+
+		spin_lock(&lock);
+		printk("NMI backtrace for cpu %d\n", cpu);
+		dump_stack();
+		spin_unlock(&lock);
+		cpu_clear(cpu, backtrace_mask);
+	}
+
 #ifdef CONFIG_X86_MCE
 	/* Could check oops_in_progress here too, but it's safer
 	   not too */
@@ -931,6 +945,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,

 #endif

+void __trigger_all_cpu_backtrace(void)
+{
+	int i;
+
+	backtrace_mask = cpu_online_map;
+	/* Wait for up to 10 seconds for all CPUs to do the backtrace */
+	for (i = 0; i < 10 * 1000; i++) {
+		if (cpus_empty(backtrace_mask))
+			break;
+		mdelay(1);
+	}
+}
+
 EXPORT_SYMBOL(nmi_active);
 EXPORT_SYMBOL(nmi_watchdog);
 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h
index 269d315719ca..b04333ea6f31 100644
--- a/include/asm-i386/nmi.h
+++ b/include/asm-i386/nmi.h
@@ -5,6 +5,9 @@
 #define ASM_NMI_H

 #include
+#include
+
+#ifdef ARCH_HAS_NMI_WATCHDOG

 /**
  * do_nmi_callback
@@ -42,4 +45,9 @@ extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
 			void __user *, size_t *, loff_t *);
 extern int unknown_nmi_panic;

+void __trigger_all_cpu_backtrace(void);
+#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace()
+
+#endif
+
 #endif /* ASM_NMI_H */
diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h
index f367d4014b42..72375e7d32a8 100644
--- a/include/asm-x86_64/nmi.h
+++ b/include/asm-x86_64/nmi.h
@@ -77,4 +77,7 @@ extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,

 extern int unknown_nmi_panic;

+void __trigger_all_cpu_backtrace(void);
+#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace()
+
 #endif /* ASM_NMI_H */
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index e16904e28c3a..acb4ed130247 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -15,9 +15,14 @@
  * disables interrupts for a long time. This call is stateless.
  */
 #ifdef ARCH_HAS_NMI_WATCHDOG
+#include <asm/nmi.h>
 extern void touch_nmi_watchdog(void);
 #else
 # define touch_nmi_watchdog() touch_softlockup_watchdog()
 #endif

+#ifndef trigger_all_cpu_backtrace
+#define trigger_all_cpu_backtrace() do { } while (0)
+#endif
+
 #endif
diff --git a/lib/spinlock_debug.c b/lib/spinlock_debug.c
index b6c4f898197c..479fd462eaa9 100644
--- a/lib/spinlock_debug.c
+++ b/lib/spinlock_debug.c
@@ -7,6 +7,7 @@
  */

 #include
+#include <linux/nmi.h>
 #include
 #include
 #include
@@ -117,6 +118,9 @@ static void __spin_lock_debug(spinlock_t *lock)
 				raw_smp_processor_id(), current->comm,
 				current->pid, lock);
 			dump_stack();
+#ifdef CONFIG_SMP
+			trigger_all_cpu_backtrace();
+#endif
 		}
 	}
 }
--
cgit v1.2.3-59-g8ed1b
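The protocol is deliberately simple: the requester publishes a mask of
online CPUs, each CPU's NMI tick notices its own bit, prints a backtrace
under a spinlock so the output isn't interleaved, clears its bit, and the
requester polls the mask for up to 10 seconds. The same shape can be
sketched in user space with threads standing in for CPUs and glibc's
backtrace() standing in for dump_stack() -- purely illustrative, not
kernel code:

	/* btmask.c -- analog of backtrace_mask.
	 * Build: gcc -pthread -rdynamic btmask.c */
	#include <execinfo.h>
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>
	#include <unistd.h>

	#define NWORKERS 4
	static atomic_uint backtrace_mask;
	static pthread_mutex_t print_lock = PTHREAD_MUTEX_INITIALIZER;

	static void *worker(void *arg)
	{
		unsigned int me = 1u << (long)arg;
		void *frames[16];

		for (;;) {
			if (atomic_load(&backtrace_mask) & me) {
				pthread_mutex_lock(&print_lock); /* serialise output */
				printf("backtrace for worker %ld\n", (long)arg);
				backtrace_symbols_fd(frames, backtrace(frames, 16), 1);
				pthread_mutex_unlock(&print_lock);
				atomic_fetch_and(&backtrace_mask, ~me);
			}
			usleep(1000);	/* stand-in for the periodic NMI tick */
		}
		return NULL;
	}

	int main(void)
	{
		pthread_t tid;
		long i;

		for (i = 0; i < NWORKERS; i++)
			pthread_create(&tid, NULL, worker, (void *)i);
		atomic_store(&backtrace_mask, (1u << NWORKERS) - 1);
		for (i = 0; i < 10 * 1000; i++) {	/* wait up to "10 seconds" */
			if (!atomic_load(&backtrace_mask))
				break;
			usleep(1000);
		}
		return 0;
	}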
From be44d2aabce2d62f72d5751d1871b6212bf7a1c7 Mon Sep 17 00:00:00 2001
From: Stas Sergeev
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] i386: espfix cleanup

Clean up the espfix code:

- Introduced PER_CPU() macro to be used from asm
- Introduced GET_DESC_BASE() macro to be used from asm
- Rewrote the fixup code in asm, as calling C code with the altered %ss
  appeared to be unsafe
- No longer altering the stack from a .fixup section
- The 16-bit per-cpu stack is no longer used; instead, the stack segment
  base is patched so that the high word of the kernel and user %esp are
  the same
- Added the limit-patching for the espfix segment (Chuck Ebbert)

[jeremy@goop.org: use the x86 scaling addressing mode rather than shifting]

Signed-off-by: Stas Sergeev
Signed-off-by: Andi Kleen
Acked-by: Zachary Amsden
Acked-by: Chuck Ebbert <76306.1226@compuserve.com>
Acked-by: Jan Beulich
Cc: Andi Kleen
Signed-off-by: Jeremy Fitzhardinge
Signed-off-by: Andrew Morton
---
 arch/i386/kernel/asm-offsets.c |  5 +++
 arch/i386/kernel/cpu/common.c  | 11 -------
 arch/i386/kernel/entry.S       | 73 +++++++++++++++++++-----------------------
 arch/i386/kernel/head.S        |  2 +-
 arch/i386/kernel/traps.c       | 57 +++++++++------------------------
 include/asm-i386/desc.h        | 27 +++++++++++++---
 include/asm-i386/percpu.h      | 25 +++++++++++++++
 7 files changed, 103 insertions(+), 97 deletions(-)

diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index c80271f8f084..e94d910a28bd 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -58,6 +58,11 @@ void foo(void)
 	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
 	BLANK();

+	OFFSET(GDS_size, Xgt_desc_struct, size);
+	OFFSET(GDS_address, Xgt_desc_struct, address);
+	OFFSET(GDS_pad, Xgt_desc_struct, pad);
+	BLANK();
+
 	OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
 	OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
 	BLANK();
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index d9f3e3c31f05..5532fc4e1bf0 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -24,9 +24,6 @@
 DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
 EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);

-DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
-EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
-
 static int cachesize_override __cpuinitdata = -1;
 static int disable_x86_fxsr __cpuinitdata;
 static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -603,7 +600,6 @@ void __cpuinit cpu_init(void)
 	struct tss_struct * t = &per_cpu(init_tss, cpu);
 	struct thread_struct *thread = &current->thread;
 	struct desc_struct *gdt;
-	__u32 stk16_off = (__u32)&per_cpu(cpu_16bit_stack, cpu);
 	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);

 	if (cpu_test_and_set(cpu, cpu_initialized)) {
@@ -651,13 +647,6 @@ old_gdt:
 	 * and set up the GDT descriptor:
 	 */
 	memcpy(gdt, cpu_gdt_table, GDT_SIZE);
-
-	/* Set up GDT entry for 16bit stack */
-	*(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
-		((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
-		((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
-		(CPU_16BIT_STACK_SIZE - 1);
-
 	cpu_gdt_descr->size = GDT_SIZE - 1;
 	cpu_gdt_descr->address = (unsigned long)gdt;
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 5a63d6fdb70e..c38d801ba0bb 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -48,6 +48,7 @@
 #include
 #include
 #include
+#include <asm/percpu.h>
 #include
 #include
 #include "irq_vectors.h"
@@ -418,23 +419,18 @@ ldt_ss:
 	 * This is an "official" bug of all the x86-compatible
 	 * CPUs, which we can try to work around to make
 	 * dosemu and wine happy. */
-	subl $8, %esp		# reserve space for switch16 pointer
-	CFI_ADJUST_CFA_OFFSET 8
+	movl OLDESP(%esp), %eax
+	movl %esp, %edx
+	call patch_espfix_desc
+	pushl $__ESPFIX_SS
+	CFI_ADJUST_CFA_OFFSET 4
+	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
 	DISABLE_INTERRUPTS
 	TRACE_IRQS_OFF
-	movl %esp, %eax
-	/* Set up the 16bit stack frame with switch32 pointer on top,
-	 * and a switch16 pointer on top of the current frame. */
-	call setup_x86_bogus_stack
-	CFI_ADJUST_CFA_OFFSET -8	# frame has moved
-	TRACE_IRQS_IRET
-	RESTORE_REGS
-	lss 20+4(%esp), %esp		# switch to 16bit stack
-1:	INTERRUPT_RETURN
-.section __ex_table,"a"
-	.align 4
-	.long 1b,iret_exc
-.previous
+	lss (%esp), %esp
+	CFI_ADJUST_CFA_OFFSET -8
+	jmp restore_nocheck
 	CFI_ENDPROC

 # perform work that needs to be done immediately before resumption
@@ -524,30 +520,30 @@ syscall_badsys:
 	CFI_ENDPROC

 #define FIXUP_ESPFIX_STACK \
-	movl %esp, %eax; \
-	/* switch to 32bit stack using the pointer on top of 16bit stack */ \
-	lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
-	/* copy data from 16bit stack to 32bit stack */ \
-	call fixup_x86_bogus_stack; \
-	/* put ESP to the proper location */ \
-	movl %eax, %esp;
-#define UNWIND_ESPFIX_STACK \
+	/* since we are on a wrong stack, we cant make it a C code :( */ \
+	GET_THREAD_INFO(%ebp); \
+	movl TI_cpu(%ebp), %ebx; \
+	PER_CPU(cpu_gdt_descr, %ebx); \
+	movl GDS_address(%ebx), %ebx; \
+	GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
+	addl %esp, %eax; \
+	pushl $__KERNEL_DS; \
+	CFI_ADJUST_CFA_OFFSET 4; \
 	pushl %eax; \
 	CFI_ADJUST_CFA_OFFSET 4; \
+	lss (%esp), %esp; \
+	CFI_ADJUST_CFA_OFFSET -8;
+#define UNWIND_ESPFIX_STACK \
 	movl %ss, %eax; \
-	/* see if on 16bit stack */ \
+	/* see if on espfix stack */ \
 	cmpw $__ESPFIX_SS, %ax; \
-	je 28f; \
-27:	popl %eax; \
-	CFI_ADJUST_CFA_OFFSET -4; \
-.section .fixup,"ax"; \
-28:	movl $__KERNEL_DS, %eax; \
+	jne 27f; \
+	movl $__KERNEL_DS, %eax; \
 	movl %eax, %ds; \
 	movl %eax, %es; \
-	/* switch to 32bit stack */ \
+	/* switch to normal stack */ \
 	FIXUP_ESPFIX_STACK; \
-	jmp 27b; \
-.previous
+27:;

 /*
  * Build the entry stubs and pointer table with
@@ -614,7 +610,6 @@ error_code:
 	pushl %eax
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET eax, 0
-	xorl %eax, %eax
 	pushl %ebp
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET ebp, 0
@@ -627,7 +622,6 @@ error_code:
 	pushl %edx
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET edx, 0
-	decl %eax			# eax = -1
 	pushl %ecx
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET ecx, 0
@@ -644,7 +638,7 @@ error_code:
 	/*CFI_REGISTER es, ecx*/
 	movl ES(%esp), %edi		# get the function address
 	movl ORIG_EAX(%esp), %edx	# get the error code
-	movl %eax, ORIG_EAX(%esp)
+	movl $-1, ORIG_EAX(%esp)
 	movl %ecx, ES(%esp)
 	/*CFI_REL_OFFSET es, ES*/
 	movl $(__USER_DS), %ecx
@@ -754,7 +748,7 @@ KPROBE_ENTRY(nmi)
 	cmpw $__ESPFIX_SS, %ax
 	popl %eax
 	CFI_ADJUST_CFA_OFFSET -4
-	je nmi_16bit_stack
+	je nmi_espfix_stack
 	cmpl $sysenter_entry,(%esp)
 	je nmi_stack_fixup
 	pushl %eax
@@ -797,7 +791,7 @@ nmi_debug_stack_check:
 	FIX_STACK(24,nmi_stack_correct, 1)
 	jmp nmi_stack_correct

-nmi_16bit_stack:
+nmi_espfix_stack:
 	/* We have a RING0_INT_FRAME here.
 	 *
 	 * create the pointer to lss back
@@ -806,7 +800,6 @@ nmi_16bit_stack:
 	CFI_ADJUST_CFA_OFFSET 4
 	pushl %esp
 	CFI_ADJUST_CFA_OFFSET 4
-	movzwl %sp, %esp
 	addw $4, (%esp)
 	/* copy the iret frame of 12 bytes */
 	.rept 3
@@ -817,11 +810,11 @@ nmi_16bit_stack:
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
 	FIXUP_ESPFIX_STACK		# %eax == %esp
-	CFI_ADJUST_CFA_OFFSET -20	# the frame has now moved
 	xorl %edx,%edx			# zero error code
 	call do_nmi
 	RESTORE_REGS
-	lss 12+4(%esp), %esp		# back to 16bit stack
+	lss 12+4(%esp), %esp		# back to espfix stack
+	CFI_ADJUST_CFA_OFFSET -24
 1:	INTERRUPT_RETURN
 	CFI_ENDPROC
 .section __ex_table,"a"
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index ca31f18d277c..b1f1df11fcc6 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -584,7 +584,7 @@ ENTRY(cpu_gdt_table)
 	.quad 0x00009a000000ffff	/* 0xc0 APM CS 16 code (16 bit) */
 	.quad 0x004092000000ffff	/* 0xc8 APM DS    data */

-	.quad 0x0000920000000000	/* 0xd0 - ESPFIX 16-bit SS */
+	.quad 0x00c0920000000000	/* 0xd0 - ESPFIX SS */
 	.quad 0x0000000000000000	/* 0xd8 - unused */
 	.quad 0x0000000000000000	/* 0xe0 - unused */
 	.quad 0x0000000000000000	/* 0xe8 - unused */
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 56655ea8d98f..f9bb1f89d687 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -1088,49 +1088,24 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
 #endif
 }

-fastcall void setup_x86_bogus_stack(unsigned char * stk)
+fastcall unsigned long patch_espfix_desc(unsigned long uesp,
+					 unsigned long kesp)
 {
-	unsigned long *switch16_ptr, *switch32_ptr;
-	struct pt_regs *regs;
-	unsigned long stack_top, stack_bot;
-	unsigned short iret_frame16_off;
 	int cpu = smp_processor_id();
-	/* reserve the space on 32bit stack for the magic switch16 pointer */
-	memmove(stk, stk + 8, sizeof(struct pt_regs));
-	switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
-	regs = (struct pt_regs *)stk;
-	/* now the switch32 on 16bit stack */
-	stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
-	stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
-	switch32_ptr = (unsigned long *)(stack_top - 8);
-	iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
-	/* copy iret frame on 16bit stack */
-	memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
-	/* fill in the switch pointers */
-	switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
-	switch16_ptr[1] = __ESPFIX_SS;
-	switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
-		8 - CPU_16BIT_STACK_SIZE;
-	switch32_ptr[1] = __KERNEL_DS;
-}
-
-fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
-{
-	unsigned long *switch32_ptr;
-	unsigned char *stack16, *stack32;
-	unsigned long stack_top, stack_bot;
-	int len;
-	int cpu = smp_processor_id();
-	stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
-	stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
-	switch32_ptr = (unsigned long *)(stack_top - 8);
-	/* copy the data from 16bit stack to 32bit stack */
-	len = CPU_16BIT_STACK_SIZE - 8 - sp;
-	stack16 = (unsigned char *)(stack_bot + sp);
-	stack32 = (unsigned char *)
-		(switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
-	memcpy(stack32, stack16, len);
-	return stack32;
+	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+	struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
+	unsigned long base = (kesp - uesp) & -THREAD_SIZE;
+	unsigned long new_kesp = kesp - base;
+	unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
+	__u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
+	/* Set up base for espfix segment */
+	desc &= 0x00f0ff0000000000ULL;
+	desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
+		((((__u64)base) << 32) & 0xff00000000000000ULL) |
+		((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
+		(lim_pages & 0xffff);
+	*(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
+	return new_kesp;
 }

 /*
diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h
index a0398f780ca1..6cf2ac2bfde7 100644
--- a/include/asm-i386/desc.h
+++ b/include/asm-i386/desc.h
@@ -4,8 +4,6 @@
 #include
 #include

-#define CPU_16BIT_STACK_SIZE 1024
-
 #ifndef __ASSEMBLY__

 #include
@@ -16,8 +14,6 @@

 extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];

-DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
-
 struct Xgt_desc_struct {
 	unsigned short size;
 	unsigned long address __attribute__((packed));
@@ -181,6 +177,29 @@ static inline unsigned long get_desc_base(unsigned long *desc)
 	return base;
 }

+#else /* __ASSEMBLY__ */
+
+/*
+ * GET_DESC_BASE reads the descriptor base of the specified segment.
+ *
+ * Args:
+ *    idx - descriptor index
+ *    gdt - GDT pointer
+ *    base - 32bit register to which the base will be written
+ *    lo_w - lo word of the "base" register
+ *    lo_b - lo byte of the "base" register
+ *    hi_b - hi byte of the low word of the "base" register
+ *
+ * Example:
+ *    GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
+ *    Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
+ */
+#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
+	movb idx*8+4(gdt), lo_b; \
+	movb idx*8+7(gdt), hi_b; \
+	shll $16, base; \
+	movw idx*8+2(gdt), lo_w;
+
 #endif /* !__ASSEMBLY__ */

 #endif
diff --git a/include/asm-i386/percpu.h b/include/asm-i386/percpu.h
index 5764afa4b6a4..510ae1d3486c 100644
--- a/include/asm-i386/percpu.h
+++ b/include/asm-i386/percpu.h
@@ -1,6 +1,31 @@
 #ifndef __ARCH_I386_PERCPU__
 #define __ARCH_I386_PERCPU__

+#ifndef __ASSEMBLY__
 #include
+#else
+
+/*
+ * PER_CPU finds an address of a per-cpu variable.
+ *
+ * Args:
+ *    var - variable name
+ *    cpu - 32bit register containing the current CPU number
+ *
+ * The resulting address is stored in the "cpu" argument.
+ *
+ * Example:
+ *    PER_CPU(cpu_gdt_descr, %ebx)
+ */
+#ifdef CONFIG_SMP
+#define PER_CPU(var, cpu) \
+	movl __per_cpu_offset(,cpu,4), cpu;	\
+	addl $per_cpu__/**/var, cpu;
+#else /* ! SMP */
+#define PER_CPU(var, cpu) \
+	movl $per_cpu__/**/var, cpu;
+#endif	/* SMP */
+
+#endif /* !__ASSEMBLY__ */

 #endif /* __ARCH_I386_PERCPU__ */
--
cgit v1.2.3-59-g8ed1b
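The masks in patch_espfix_desc() are just the standard x86 descriptor
layout: limit 15:0 in bits 15:0, base 23:0 in bits 39:16, access byte in
47:40, limit 19:16 in 51:48, flags in 55:52, base 31:24 in 63:56; the
& 0x00f0ff... step keeps only the access byte and flags from the template
entry. A stand-alone sketch of exactly that bit-twiddling, with a
self-check decode (sample values are arbitrary):

	/* espfix_pack.c -- the descriptor packing from patch_espfix_desc()
	 * in isolation; illustrative only. */
	#include <stdio.h>
	#include <stdint.h>

	static uint64_t pack(uint64_t desc, uint64_t base, uint64_t lim_pages)
	{
		desc &= 0x00f0ff0000000000ULL;	/* keep flags + access byte */
		desc |= ((base << 16) & 0x000000ffffff0000ULL) |      /* base 23:0 */
			((base << 32) & 0xff00000000000000ULL) |      /* base 31:24 */
			((lim_pages << 32) & 0x000f000000000000ULL) | /* limit 19:16 */
			(lim_pages & 0xffff);                         /* limit 15:0 */
		return desc;
	}

	int main(void)
	{
		/* template is the new 0xd0 ESPFIX SS entry from head.S */
		uint64_t d = pack(0x00c0920000000000ULL, 0xc1234000, 0x12345);
		uint64_t base = ((d >> 16) & 0xffffff) | (((d >> 56) & 0xff) << 24);
		uint64_t lim = (d & 0xffff) | (((d >> 48) & 0xf) << 16);

		printf("desc=%016llx base=%08llx limit=%05llx pages\n",
		       (unsigned long long)d, (unsigned long long)base,
		       (unsigned long long)lim);
		return 0;
	}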
From acc207616a91a413a50fdd8847a747c4a7324167 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <76306.1226@compuserve.com>
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] i386: add sleazy FPU optimization

i386 port of the sLeAZY-fpu feature. Chuck reports that this gives him
a +/- 0.4% improvement on his simple benchmark.

x86_64 description follows:

Right now the kernel on x86-64 has a 100% lazy fpu behavior: after
*every* context switch a trap is taken for the first FPU use to restore
the FPU context lazily. This is of course great for applications that
have very sporadic or no FPU use (since then you avoid doing the
expensive save/restore all the time). However for very frequent FPU
users... you take an extra trap every context switch.

The patch below adds a simple heuristic to this code: after 5
consecutive context switches of FPU use, the lazy behavior is disabled
and the context gets restored every context switch. If the app indeed
uses the FPU, the trap is avoided (the chance of the 6th time slice
using FPU after the previous 5 having done so is quite high obviously).

After 256 switches, this is reset and lazy behavior is returned (until
there are 5 consecutive ones again). The reason for this is to give apps
that do longer bursts of FPU use still the lazy behavior back after some
time.

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Signed-off-by: Arjan van de Ven
Signed-off-by: Andrew Morton
Signed-off-by: Andi Kleen
---
 arch/i386/kernel/process.c | 12 ++++++++++++
 arch/i386/kernel/traps.c   |  3 ++-
 include/asm-i386/i387.h    |  5 ++++-
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index dd53c58f64f1..ae924c416b68 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -648,6 +648,11 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas

 	__unlazy_fpu(prev_p);

+	/* we're going to use this soon, after a few expensive things */
+	if (next_p->fpu_counter > 5)
+		prefetch(&next->i387.fxsave);
+
 	/*
 	 * Reload esp0.
 	 */
@@ -697,6 +702,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas

 	disable_tsc(prev_p, next_p);

+	/* If the task has used fpu the last 5 timeslices, just do a full
+	 * restore of the math state immediately to avoid the trap; the
+	 * chances of needing FPU soon are obviously high now
+	 */
+	if (next_p->fpu_counter > 5)
+		math_state_restore();
+
 	return prev_p;
 }

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index f9bb1f89d687..4a6fa2837df2 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -1118,7 +1118,7 @@ fastcall unsigned long patch_espfix_desc(unsigned long uesp,
  * Must be called with kernel preemption disabled (in this case,
  * local interrupts are disabled at the call-site in entry.S).
  */
-asmlinkage void math_state_restore(struct pt_regs regs)
+asmlinkage void math_state_restore(void)
 {
 	struct thread_info *thread = current_thread_info();
 	struct task_struct *tsk = thread->task;
@@ -1128,6 +1128,7 @@ asmlinkage void math_state_restore(struct pt_regs regs)
 	init_fpu(tsk);
 	restore_fpu(tsk);
 	thread->status |= TS_USEDFPU;	/* So we fnsave on switch_to() */
+	tsk->fpu_counter++;
 }

 #ifndef CONFIG_MATH_EMULATION
diff --git a/include/asm-i386/i387.h b/include/asm-i386/i387.h
index bc1d6edae1ed..434936c732d6 100644
--- a/include/asm-i386/i387.h
+++ b/include/asm-i386/i387.h
@@ -76,7 +76,9 @@ static inline void __save_init_fpu( struct task_struct *tsk )

 #define __unlazy_fpu( tsk ) do { \
 	if (task_thread_info(tsk)->status & TS_USEDFPU) \
-		save_init_fpu( tsk ); \
+		save_init_fpu( tsk ); \
+	else \
+		tsk->fpu_counter = 0; \
 } while (0)

 #define __clear_fpu( tsk ) \
@@ -118,6 +120,7 @@ static inline void save_init_fpu( struct task_struct *tsk )
 extern unsigned short get_fpu_cwd( struct task_struct *tsk );
 extern unsigned short get_fpu_swd( struct task_struct *tsk );
 extern unsigned short get_fpu_mxcsr( struct task_struct *tsk );
+extern asmlinkage void math_state_restore(void);

 /*
  * Signal frame handlers...
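The whole heuristic lives in three touch points: __unlazy_fpu() zeroes
fpu_counter whenever a timeslice passes without FPU use,
math_state_restore() increments it on every restore, and __switch_to()
restores eagerly once it exceeds 5. A hypothetical stand-alone simulation
of that state machine ('1' marks a timeslice that touched the FPU; the
counter is assumed narrow so it wraps, matching the 256-switch reset in
the description):

	/* fpusim.c -- illustrative only; mirrors the fpu_counter logic. */
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		const char *slices = "1111111001111111";
		unsigned char fpu_counter = 0;
		size_t i;

		for (i = 0; i < strlen(slices); i++) {
			int used = slices[i] == '1';

			if (fpu_counter > 5)	/* eager path: preload, no trap */
				printf("slice %2zu: preload FPU state\n", i);
			else if (used)		/* lazy path: first use traps */
				printf("slice %2zu: #NM trap -> restore\n", i);

			if (used)
				fpu_counter++;	/* math_state_restore() side */
			else
				fpu_counter = 0;	/* __unlazy_fpu() side */
		}
		return 0;
	}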
--
cgit v1.2.3-59-g8ed1b

From bb0d977ed42c79ed709c79dbab4ff2159941eb2a Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Thu, 7 Dec 2006 02:14:02 +0100
Subject: [PATCH] i386: add Intel Core related PMU MSRs

- add Intel Precise-Event Based Sampling (PEBS) related MSRs
- add Intel Debug Store (DS) Area related MSRs
- add Intel Core microarchitecture performance counter MSRs

Signed-off-by: stephane eranian
Signed-off-by: Andi Kleen
---
 include/asm-i386/msr.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/include/asm-i386/msr.h b/include/asm-i386/msr.h
index 62b76cd96957..1820d9d73af3 100644
--- a/include/asm-i386/msr.h
+++ b/include/asm-i386/msr.h
@@ -141,6 +141,10 @@ static inline void wrmsrl (unsigned long msr, unsigned long long val)
 #define MSR_IA32_MC0_ADDR		0x402
 #define MSR_IA32_MC0_MISC		0x403

+#define MSR_IA32_PEBS_ENABLE		0x3f1
+#define MSR_IA32_DS_AREA		0x600
+#define MSR_IA32_PERF_CAPABILITIES	0x345
+
 /* Pentium IV performance counter MSRs */
 #define MSR_P4_BPU_PERFCTR0		0x300
 #define MSR_P4_BPU_PERFCTR1		0x301
@@ -284,4 +288,13 @@ static inline void wrmsrl (unsigned long msr, unsigned long long val)
 #define MSR_TMTA_LRTI_READOUT		0x80868018
 #define MSR_TMTA_LRTI_VOLT_MHZ		0x8086801a

+/* Intel Core-based CPU performance counters */
+#define MSR_CORE_PERF_FIXED_CTR0	0x309
+#define MSR_CORE_PERF_FIXED_CTR1	0x30a
+#define MSR_CORE_PERF_FIXED_CTR2	0x30b
+#define MSR_CORE_PERF_FIXED_CTR_CTRL	0x38d
+#define MSR_CORE_PERF_GLOBAL_STATUS	0x38e
+#define MSR_CORE_PERF_GLOBAL_CTRL	0x38f
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL	0x390
+
 #endif /* __ASM_MSR_H */
--
cgit v1.2.3-59-g8ed1b

From 9ca36101a8d74704d78f10910f89d62de96f9dc8 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Thu, 7 Dec 2006 02:14:02 +0100
Subject: [PATCH] i386: Basic definitions for i386-pda

This patch has the basic definitions of struct i386_pda, and the segment
selector in the GDT.

asm-i386/pda.h is more or less a direct copy of asm-x86_64/pda.h. The
most interesting difference is the use of _proxy_pda, which is used to
give gcc a model for the actual memory operations on the real pda
structure. No actual reference is ever made to _proxy_pda, so it is
never defined.

Signed-off-by: Jeremy Fitzhardinge
Signed-off-by: Andi Kleen
Cc: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Zachary Amsden
Cc: Jan Beulich
Cc: Andi Kleen
Signed-off-by: Andrew Morton
---
 arch/i386/kernel/head.S    |  2 +-
 include/asm-i386/pda.h     | 95 ++++++++++++++++++++++++++++++++++++++++++++++
 include/asm-i386/segment.h |  5 ++-
 3 files changed, 100 insertions(+), 2 deletions(-)
 create mode 100644 include/asm-i386/pda.h

diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index b1f1df11fcc6..4a83384c5a61 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -585,7 +585,7 @@ ENTRY(cpu_gdt_table)
 	.quad 0x004092000000ffff	/* 0xc8 APM DS    data */

 	.quad 0x00c0920000000000	/* 0xd0 - ESPFIX SS */
-	.quad 0x0000000000000000	/* 0xd8 - unused */
+	.quad 0x0000000000000000	/* 0xd8 - PDA */
 	.quad 0x0000000000000000	/* 0xe0 - unused */
 	.quad 0x0000000000000000	/* 0xe8 - unused */
 	.quad 0x0000000000000000	/* 0xf0 - unused */
diff --git a/include/asm-i386/pda.h b/include/asm-i386/pda.h
new file mode 100644
index 000000000000..4c39ccb1305c
--- /dev/null
+++ b/include/asm-i386/pda.h
@@ -0,0 +1,95 @@
+/*
+   Per-processor Data Areas
+   Jeremy Fitzhardinge 2006
+   Based on asm-x86_64/pda.h by Andi Kleen.
+ */
+#ifndef _I386_PDA_H
+#define _I386_PDA_H
+
+#include
+
+struct i386_pda
+{
+	struct i386_pda *_pda;		/* pointer to self */
+};
+
+extern struct i386_pda *_cpu_pda[];
+
+#define cpu_pda(i)	(_cpu_pda[i])
+
+#define pda_offset(field) offsetof(struct i386_pda, field)
+
+extern void __bad_pda_field(void);
+
+/* This variable is never instantiated.  It is only used as a stand-in
+   for the real per-cpu PDA memory, so that gcc can understand what
+   memory operations the inline asms() below are performing.  This
+   eliminates the need to make the asms volatile or have memory
+   clobbers, so gcc can readily analyse them. */
+extern struct i386_pda _proxy_pda;
+
+#define pda_to_op(op,field,val)						\
+	do {								\
+		typedef typeof(_proxy_pda.field) T__;			\
+		if (0) { T__ tmp__; tmp__ = (val); }			\
+		switch (sizeof(_proxy_pda.field)) {			\
+		case 1:							\
+			asm(op "b %1,%%gs:%c2"				\
+			    : "+m" (_proxy_pda.field)			\
+			    :"ri" ((T__)val),				\
+			     "i"(pda_offset(field)));			\
+			break;						\
+		case 2:							\
+			asm(op "w %1,%%gs:%c2"				\
+			    : "+m" (_proxy_pda.field)			\
+			    :"ri" ((T__)val),				\
+			     "i"(pda_offset(field)));			\
+			break;						\
+		case 4:							\
+			asm(op "l %1,%%gs:%c2"				\
+			    : "+m" (_proxy_pda.field)			\
+			    :"ri" ((T__)val),				\
+			     "i"(pda_offset(field)));			\
+			break;						\
+		default: __bad_pda_field();				\
+		}							\
+	} while (0)
+
+#define pda_from_op(op,field)						\
+	({								\
+		typeof(_proxy_pda.field) ret__;				\
+		switch (sizeof(_proxy_pda.field)) {			\
+		case 1:							\
+			asm(op "b %%gs:%c1,%0"				\
+			    : "=r" (ret__)				\
+			    : "i" (pda_offset(field)),			\
+			      "m" (_proxy_pda.field));			\
+			break;						\
+		case 2:							\
+			asm(op "w %%gs:%c1,%0"				\
+			    : "=r" (ret__)				\
+			    : "i" (pda_offset(field)),			\
+			      "m" (_proxy_pda.field));			\
+			break;						\
+		case 4:							\
+			asm(op "l %%gs:%c1,%0"				\
+			    : "=r" (ret__)				\
+			    : "i" (pda_offset(field)),			\
+			      "m" (_proxy_pda.field));			\
+			break;						\
+		default: __bad_pda_field();				\
+		}							\
+		ret__; })
+
+/* Return a pointer to a pda field */
+#define pda_addr(field)							\
+	((typeof(_proxy_pda.field) *)((unsigned char *)read_pda(_pda) + \
+				      pda_offset(field)))
+
+#define read_pda(field) pda_from_op("mov",field)
+#define write_pda(field,val) pda_to_op("mov",field,val)
+#define add_pda(field,val) pda_to_op("add",field,val)
+#define sub_pda(field,val) pda_to_op("sub",field,val)
+#define or_pda(field,val) pda_to_op("or",field,val)
+
+#endif	/* _I386_PDA_H */
diff --git a/include/asm-i386/segment.h b/include/asm-i386/segment.h
index b7ab59685ba7..5bdda79b6b53 100644
--- a/include/asm-i386/segment.h
+++ b/include/asm-i386/segment.h
@@ -39,7 +39,7 @@
 *  25 - APM BIOS support
 *
 *  26 - ESPFIX small SS
- *  27 - unused
+ *  27 - PDA			[ per-cpu private data area ]
 *  28 - unused
 *  29 - unused
 *  30 - unused
@@ -74,6 +74,9 @@
 #define GDT_ENTRY_ESPFIX_SS		(GDT_ENTRY_KERNEL_BASE + 14)
 #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)

+#define GDT_ENTRY_PDA			(GDT_ENTRY_KERNEL_BASE + 15)
+#define __KERNEL_PDA (GDT_ENTRY_PDA * 8)
+
 #define GDT_ENTRY_DOUBLEFAULT_TSS	31

 /*
--
cgit v1.2.3-59-g8ed1b
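Two compile-time tricks do the heavy lifting here: _proxy_pda gives each
asm a phantom memory operand so gcc orders PDA accesses correctly without
"memory" clobbers, and the dead `if (0) { T__ tmp__; tmp__ = (val); }`
statement type-checks val against the field. Both can be exercised in a
tiny portable sketch that swaps the %gs-relative asm for an ordinary base
pointer (illustrative only; the extra field is hypothetical):

	/* pda_sketch.c -- the typeof/offsetof machinery of pda.h without
	 * segment registers. */
	#include <stdio.h>
	#include <stddef.h>

	struct pda {
		struct pda *_pda;	/* self pointer, as in i386_pda */
		int cpu_number;		/* hypothetical extra field */
	};

	static struct pda boot_pda = { ._pda = &boot_pda };

	#define my_pda() (&boot_pda)	/* the kernel finds this via %gs */

	#define write_pda(field, val)						\
		do {								\
			typedef typeof(boot_pda.field) T__;			\
			if (0) { T__ tmp__; tmp__ = (val); } /* type check */	\
			*(T__ *)((char *)my_pda() +				\
				 offsetof(struct pda, field)) = (val);		\
		} while (0)

	#define read_pda(field)							\
		(*(typeof(boot_pda.field) *)((char *)my_pda() +			\
					     offsetof(struct pda, field)))

	int main(void)
	{
		write_pda(cpu_number, 0);
		printf("cpu %d, pda %p\n", read_pda(cpu_number),
		       (void *)read_pda(_pda));
		/* write_pda(cpu_number, "x") would fail to compile */
		return 0;
	}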
From 62111195800d80c66cdc69063ea3145878c99fbf Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Thu, 7 Dec 2006 02:14:02 +0100
Subject: [PATCH] i386: Initialize the per-CPU data area

When a CPU is brought up, a PDA and GDT are allocated for it. The GDT's
__KERNEL_PDA entry is pointed to the allocated PDA memory, so that all
references using this segment descriptor will refer to the PDA.

This patch rearranges CPU initialization a bit, so that the GDT/PDA are
set up as early as possible in cpu_init(). Also for secondary CPUs, the
GDT+PDA are preallocated and initialized, so all a secondary CPU needs
to do is set up the LDT and load %gs. This will be important once
smp_processor_id() and current use the PDA.

In all cases, the PDA is set up in head.S, before a CPU starts running C
code, so the PDA is always available.

Signed-off-by: Jeremy Fitzhardinge
Signed-off-by: Andi Kleen
Cc: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Zachary Amsden
Cc: Jan Beulich
Cc: Andi Kleen
Cc: James Bottomley
Cc: Matt Tolentino
Signed-off-by: Andrew Morton
---
 arch/i386/kernel/cpu/common.c        | 177 ++++++++++++++++++++++--------
 arch/i386/kernel/smpboot.c           |  28 ++++--
 arch/i386/mach-voyager/voyager_smp.c |  14 ++-
 include/asm-i386/processor.h         |   3 +
 4 files changed, 172 insertions(+), 50 deletions(-)

diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 5532fc4e1bf0..2534e25ed745 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -18,12 +18,16 @@
 #include
 #include
 #endif
+#include <asm/pda.h>

 #include "cpu.h"

 DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
 EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);

+struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(_cpu_pda);
+
 static int cachesize_override __cpuinitdata = -1;
 static int disable_x86_fxsr __cpuinitdata;
 static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -588,41 +592,16 @@ void __init early_cpu_init(void)
 	disable_pse = 1;
 #endif
 }
-/*
- * cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
- */
-void __cpuinit cpu_init(void)
+
+__cpuinit int alloc_gdt(int cpu)
 {
-	int cpu = smp_processor_id();
-	struct tss_struct * t = &per_cpu(init_tss, cpu);
-	struct thread_struct *thread = &current->thread;
-	struct desc_struct *gdt;
 	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+	struct desc_struct *gdt;
+	struct i386_pda *pda;

-	if (cpu_test_and_set(cpu, cpu_initialized)) {
-		printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
-		for (;;) local_irq_enable();
-	}
-	printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+	gdt = (struct desc_struct *)cpu_gdt_descr->address;
+	pda = cpu_pda(cpu);

-	if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
-		clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
-	if (tsc_disable && cpu_has_tsc) {
-		printk(KERN_NOTICE "Disabling TSC...\n");
-		/**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
-		clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
-		set_in_cr4(X86_CR4_TSD);
-	}
-
-	/* The CPU hotplug case */
-	if (cpu_gdt_descr->address) {
-		gdt = (struct desc_struct *)cpu_gdt_descr->address;
-		memset(gdt, 0, PAGE_SIZE);
-		goto old_gdt;
-	}
 	/*
 	 * This is a horrible hack to allocate the GDT.  The problem
 	 * is that cpu_init() is called really early for the boot CPU
 	 * (and hence needs bootmem) but much later for the secondary
 	 * CPUs, when bootmem will have gone away
 	 */
 	if (NODE_DATA(0)->bdata->node_bootmem_map) {
-		gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
-		/* alloc_bootmem_pages panics on failure, so no check */
+		BUG_ON(gdt != NULL || pda != NULL);
+
+		gdt = alloc_bootmem_pages(PAGE_SIZE);
+		pda = alloc_bootmem(sizeof(*pda));
+		/* alloc_bootmem(_pages) panics on failure, so no check */
+
 		memset(gdt, 0, PAGE_SIZE);
+		memset(pda, 0, sizeof(*pda));
 	} else {
-		gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
-		if (unlikely(!gdt)) {
-			printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
-			for (;;)
-				local_irq_enable();
+		/* GDT and PDA might already have been allocated if
+		   this is a CPU hotplug re-insertion. */
+		if (gdt == NULL)
+			gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
+
+		if (pda == NULL)
+			pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
+
+		if (unlikely(!gdt || !pda)) {
+			free_pages((unsigned long)gdt, 0);
+			kfree(pda);
+			return 0;
 		}
 	}
-old_gdt:
+
+	cpu_gdt_descr->address = (unsigned long)gdt;
+	cpu_pda(cpu) = pda;
+
+	return 1;
+}
+
+/* Initial PDA used by boot CPU */
+struct i386_pda boot_pda = {
+	._pda = &boot_pda,
+};
+
+/* Initialize the CPU's GDT and PDA.  The boot CPU does this for
+   itself, but secondaries find this done for them. */
+__cpuinit int init_gdt(int cpu, struct task_struct *idle)
+{
+	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+	struct desc_struct *gdt;
+	struct i386_pda *pda;
+
+	/* For non-boot CPUs, the GDT and PDA should already have been
+	   allocated. */
+	if (!alloc_gdt(cpu)) {
+		printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
+		return 0;
+	}
+
+	gdt = (struct desc_struct *)cpu_gdt_descr->address;
+	pda = cpu_pda(cpu);
+
+	BUG_ON(gdt == NULL || pda == NULL);
+
 	/*
 	 * Initialize the per-CPU GDT with the boot GDT,
 	 * and set up the GDT descriptor:
 	 */
 	memcpy(gdt, cpu_gdt_table, GDT_SIZE);
 	cpu_gdt_descr->size = GDT_SIZE - 1;
-	cpu_gdt_descr->address = (unsigned long)gdt;

+	pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
+			(u32 *)&gdt[GDT_ENTRY_PDA].b,
+			(unsigned long)pda, sizeof(*pda) - 1,
+			0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
+
+	memset(pda, 0, sizeof(*pda));
+	pda->_pda = pda;
+
+	return 1;
+}
+
+/* Common CPU init for both boot and secondary CPUs */
+static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
+{
+	struct tss_struct * t = &per_cpu(init_tss, cpu);
+	struct thread_struct *thread = &curr->thread;
+	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+
+	/* Reinit these anyway, even if they've already been done (on
+	   the boot CPU, this will transition from the boot gdt+pda to
+	   the real ones). */
 	load_gdt(cpu_gdt_descr);
+
+	if (cpu_test_and_set(cpu, cpu_initialized)) {
+		printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
+		for (;;) local_irq_enable();
+	}
+
+	printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+
+	if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
+		clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+	if (tsc_disable && cpu_has_tsc) {
+		printk(KERN_NOTICE "Disabling TSC...\n");
+		/**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
+		clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
+		set_in_cr4(X86_CR4_TSD);
+	}
+
 	load_idt(&idt_descr);

 	/*
 	 * Set up and load the per-CPU TSS and LDT
 	 */
 	atomic_inc(&init_mm.mm_count);
-	current->active_mm = &init_mm;
-	BUG_ON(current->mm);
-	enter_lazy_tlb(&init_mm, current);
+	curr->active_mm = &init_mm;
+	if (curr->mm)
+		BUG();
+	enter_lazy_tlb(&init_mm, curr);

 	load_esp0(t, thread);
 	set_tss_desc(cpu,t);
@@ -690,6 +750,37 @@ old_gdt:
 	mxcsr_feature_mask_init();
 }

+/* Entrypoint to initialize secondary CPU */
+void __cpuinit secondary_cpu_init(void)
+{
+	int cpu = smp_processor_id();
+	struct task_struct *curr = current;
+
+	_cpu_init(cpu, curr);
+}
+
+/*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ */
+void __cpuinit cpu_init(void)
+{
+	int cpu = smp_processor_id();
+	struct task_struct *curr = current;
+
+	/* Set up the real GDT and PDA, so we can transition from the
+	   boot versions. */
+	if (!init_gdt(cpu, curr)) {
+		/* failed to allocate something; not much we can do... */
+		for (;;)
+			local_irq_enable();
+	}
+
+	_cpu_init(cpu, curr);
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 void __cpuinit cpu_uninit(void)
 {
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 4bb8b77cd65b..095636620fa2 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -52,6 +52,7 @@
 #include
 #include
 #include
+#include <asm/pda.h>

 #include
 #include
@@ -536,11 +537,11 @@ set_cpu_sibling_map(int cpu)
 static void __devinit start_secondary(void *unused)
 {
 	/*
-	 * Dont put anything before smp_callin(), SMP
+	 * Don't put *anything* before secondary_cpu_init(), SMP
 	 * booting is too fragile that we want to limit the
 	 * things done here to the most necessary things.
 	 */
-	cpu_init();
+	secondary_cpu_init();
 	preempt_disable();
 	smp_callin();
 	while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
@@ -599,13 +600,16 @@ void __devinit initialize_secondary(void)
 		"movl %0,%%esp\n\t"
 		"jmp *%1"
 		:
-		:"r" (current->thread.esp),"r" (current->thread.eip));
+		:"m" (current->thread.esp),"m" (current->thread.eip));
 }

+/* Static state in head.S used to set up a CPU */
 extern struct {
 	void * esp;
 	unsigned short ss;
 } stack_start;
+extern struct i386_pda *start_pda;
+extern struct Xgt_desc_struct cpu_gdt_descr;

 #ifdef CONFIG_NUMA
@@ -936,9 +940,6 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
 	unsigned long start_eip;
 	unsigned short nmi_high = 0, nmi_low = 0;

-	++cpucount;
-	alternatives_smp_switch(1);
-
 	/*
 	 * We can't use kernel_thread since we must avoid to
 	 * reschedule the child.
 	 */
 	idle = alloc_idle_task(cpu);
 	if (IS_ERR(idle))
 		panic("failed fork for CPU %d", cpu);
+
+	/* Pre-allocate and initialize the CPU's GDT and PDA so it
+	   doesn't have to do any memory allocation during the
+	   delicate CPU-bringup phase. */
+	if (!init_gdt(cpu, idle)) {
+		printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
+		return -1;	/* ? */
+	}
+
 	idle->thread.eip = (unsigned long) start_secondary;
 	/* start_eip had better be page-aligned! */
 	start_eip = setup_trampoline();

+	++cpucount;
+	alternatives_smp_switch(1);
+
 	/* So we see what's up   */
 	printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	stack_start.esp = (void *) idle->thread.esp;

+	start_pda = cpu_pda(cpu);
+	cpu_gdt_descr = per_cpu(cpu_gdt_descr, cpu);
+
 	irq_ctx_init(cpu);

 	x86_cpu_to_apicid[cpu] = apicid;
diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c
index f3fea2ad50fe..55428e656a3f 100644
--- a/arch/i386/mach-voyager/voyager_smp.c
+++ b/arch/i386/mach-voyager/voyager_smp.c
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include <asm/pda.h>

 /* TLB state -- visible externally, indexed physically */
 DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0 };
@@ -422,6 +423,7 @@ find_smp_config(void)
 		VOYAGER_SUS_IN_CONTROL_PORT);

 	current_thread_info()->cpu = boot_cpu_id;
+	write_pda(cpu_number, boot_cpu_id);
 }

 /*
@@ -458,7 +460,7 @@ start_secondary(void *unused)
 	/* external functions not defined in the headers */
 	extern void calibrate_delay(void);

-	cpu_init();
+	secondary_cpu_init();

 	/* OK, we're in the routine */
 	ack_CPI(VIC_CPU_BOOT_CPI);
@@ -578,6 +580,15 @@ do_boot_cpu(__u8 cpu)
 	/* init_tasks (in sched.c) is indexed logically */
 	stack_start.esp = (void *) idle->thread.esp;

+	/* Pre-allocate and initialize the CPU's GDT and PDA so it
+	   doesn't have to do any memory allocation during the
+	   delicate CPU-bringup phase. */
+	if (!init_gdt(cpu, idle)) {
+		printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
+		cpucount--;
+		return;
+	}
+
 	irq_ctx_init(cpu);

 	/* Note: Don't modify initial ss override */
@@ -1963,4 +1974,5 @@ void __init smp_setup_processor_id(void)
 {
 	current_thread_info()->cpu = hard_smp_processor_id();
+	write_pda(cpu_number, hard_smp_processor_id());
 }
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index e0ddca94d50c..a9f2041c7c87 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -727,4 +727,7 @@ extern unsigned long boot_option_idle_override;
 extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);

+extern int init_gdt(int cpu, struct task_struct *idle);
+extern void secondary_cpu_init(void);
+
 #endif /* __ASM_I386_PROCESSOR_H */
--
cgit v1.2.3-59-g8ed1b
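The ordering rule this patch enforces -- allocate and initialize a
secondary's GDT and PDA before the CPU ever executes, so the delicate
bringup path performs no allocation -- is the same pattern as
pre-allocating a worker's context before pthread_create() rather than
inside the thread. A loose user-space analogy (nothing here is kernel
API):

	/* bringup.c -- the "boot CPU allocates, secondary only attaches"
	 * shape of init_gdt()/secondary_cpu_init(), sketched as threads. */
	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct pda { int cpu_number; };

	static void *secondary(void *arg)
	{
		struct pda *pda = arg;	/* already set up for us */

		printf("cpu %d up, pda %p\n", pda->cpu_number, (void *)pda);
		return NULL;
	}

	int main(void)
	{
		pthread_t tid[4];
		int cpu;

		for (cpu = 1; cpu < 4; cpu++) {
			/* do_boot_cpu(): allocate *before* the CPU runs */
			struct pda *pda = calloc(1, sizeof(*pda));

			if (!pda)
				return 1;	/* fail the bringup, not the new CPU */
			pda->cpu_number = cpu;
			pthread_create(&tid[cpu], NULL, secondary, pda);
		}
		for (cpu = 1; cpu < 4; cpu++)
			pthread_join(tid[cpu], NULL);
		return 0;
	}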
From f95d47caae5302a63d92be9a0292abc90e2a14e1 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Thu, 7 Dec 2006 02:14:02 +0100
Subject: [PATCH] i386: Use %gs as the PDA base-segment in the kernel

This patch is the meat of the PDA change. This patch makes several
related changes:

1: Most significantly, %gs is now used in the kernel. This means that on
   entry, the old value of %gs is saved away, and it is reloaded with
   __KERNEL_PDA.

2: entry.S constructs the stack in the shape of struct pt_regs, and this
   is passed around the kernel so that the process's saved register state
   can be accessed. Unfortunately struct pt_regs doesn't currently have
   space for %gs (or %fs). This patch extends pt_regs to add space for gs
   (no space is allocated for %fs, since it won't be used, and it would
   just complicate the code in entry.S to work around the space).

3: Because %gs is now saved on the stack like %ds, %es and the integer
   registers, there are a number of places where it no longer needs to be
   handled specially; namely context switch, and saving/restoring the
   register state in a signal context.

4: And since kernel threads run in kernel space and call normal kernel
   code, they need to be created with their %gs == __KERNEL_PDA.

Signed-off-by: Jeremy Fitzhardinge
Signed-off-by: Andi Kleen
Cc: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Zachary Amsden
Cc: Jan Beulich
Cc: Andi Kleen
Signed-off-by: Andrew Morton
---
 arch/i386/kernel/asm-offsets.c |  1 +
 arch/i386/kernel/cpu/common.c  | 21 +++++++++++--
 arch/i386/kernel/entry.S       | 70 +++++++++++++++++++++++++++-----------
 arch/i386/kernel/head.S        | 31 ++++++++++++++++---
 arch/i386/kernel/process.c     | 26 ++++++++--------
 arch/i386/kernel/signal.c      |  6 ++--
 include/asm-i386/mmu_context.h |  4 +--
 include/asm-i386/processor.h   |  4 ++-
 include/asm-i386/ptrace.h      |  2 ++
 kernel/fork.c                  |  2 +-
 10 files changed, 117 insertions(+), 50 deletions(-)

diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 70b19807acf9..9620872d3534 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -72,6 +72,7 @@ void foo(void)
 	OFFSET(PT_EAX, pt_regs, eax);
 	OFFSET(PT_DS,  pt_regs, xds);
 	OFFSET(PT_ES,  pt_regs, xes);
+	OFFSET(PT_GS,  pt_regs, xgs);
 	OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
 	OFFSET(PT_EIP, pt_regs, eip);
 	OFFSET(PT_CS,  pt_regs, xcs);
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 2534e25ed745..4e63d8ce602b 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -593,6 +593,14 @@ void __init early_cpu_init(void)
 #endif
 }

+/* Make sure %gs is initialized properly in idle threads */
+struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
+{
+	memset(regs, 0, sizeof(struct pt_regs));
+	regs->xgs = __KERNEL_PDA;
+	return regs;
+}
+
 __cpuinit int alloc_gdt(int cpu)
 {
 	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
@@ -644,6 +652,14 @@ struct i386_pda boot_pda = {
 	._pda = &boot_pda,
 };

+static inline void set_kernel_gs(void)
+{
+	/* Set %gs for this CPU's PDA.  Memory clobber is to create a
+	   barrier with respect to any PDA operations, so the compiler
+	   doesn't move any before here. */
+	asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
+}
+
 /* Initialize the CPU's GDT and PDA.  The boot CPU does this for
    itself, but secondaries find this done for them. */
 __cpuinit int init_gdt(int cpu, struct task_struct *idle)
@@ -693,6 +709,7 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
 	   the boot CPU, this will transition from the boot gdt+pda to
 	   the real ones). */
 	load_gdt(cpu_gdt_descr);
+	set_kernel_gs();

 	if (cpu_test_and_set(cpu, cpu_initialized)) {
 		printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
@@ -731,8 +748,8 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
 	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
 #endif

-	/* Clear %fs and %gs. */
-	asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
+	/* Clear %fs. */
+	asm volatile ("mov %0, %%fs" : : "r" (0));

 	/* Clear all 6 debug registers: */
 	set_debugreg(0, 0);
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 0069bf01603e..b99d4a160078 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -30,12 +30,13 @@
 *	18(%esp) - %eax
 *	1C(%esp) - %ds
 *	20(%esp) - %es
- *	24(%esp) - orig_eax
- *	28(%esp) - %eip
- *	2C(%esp) - %cs
- *	30(%esp) - %eflags
- *	34(%esp) - %oldesp
- *	38(%esp) - %oldss
+ *	24(%esp) - %gs
+ *	28(%esp) - orig_eax
+ *	2C(%esp) - %eip
+ *	30(%esp) - %cs
+ *	34(%esp) - %eflags
+ *	38(%esp) - %oldesp
+ *	3C(%esp) - %oldss
 *
 * "current" is in register %ebx during any slow entries.
 */
@@ -92,6 +93,9 @@ VM_MASK		= 0x00020000

 #define SAVE_ALL \
 	cld; \
+	pushl %gs; \
+	CFI_ADJUST_CFA_OFFSET 4;\
+	/*CFI_REL_OFFSET gs, 0;*/\
 	pushl %es; \
 	CFI_ADJUST_CFA_OFFSET 4;\
 	/*CFI_REL_OFFSET es, 0;*/\
@@ -121,7 +125,9 @@ VM_MASK		= 0x00020000
 	CFI_REL_OFFSET ebx, 0;\
 	movl $(__USER_DS), %edx; \
 	movl %edx, %ds; \
-	movl %edx, %es;
+	movl %edx, %es; \
+	movl $(__KERNEL_PDA), %edx; \
+	movl %edx, %gs

 #define RESTORE_INT_REGS \
 	popl %ebx;	\
@@ -154,17 +160,22 @@ VM_MASK		= 0x00020000
 2:	popl %es;	\
 	CFI_ADJUST_CFA_OFFSET -4;\
 	/*CFI_RESTORE es;*/\
-.section .fixup,"ax";	\
-3:	movl $0,(%esp);	\
-	jmp 1b;		\
+3:	popl %gs;	\
+	CFI_ADJUST_CFA_OFFSET -4;\
+	/*CFI_RESTORE gs;*/\
+.pushsection .fixup,"ax";	\
 4:	movl $0,(%esp);	\
+	jmp 1b;		\
+5:	movl $0,(%esp);	\
 	jmp 2b;		\
-.previous;		\
+6:	movl $0,(%esp);	\
+	jmp 3b;		\
 .section __ex_table,"a";\
 	.align 4;	\
-	.long 1b,3b;	\
-	.long 2b,4b;	\
-.previous
+	.long 1b,4b;	\
+	.long 2b,5b;	\
+	.long 3b,6b;	\
+.popsection

 #define RING0_INT_FRAME \
 	CFI_STARTPROC simple;\
@@ -231,6 +242,7 @@ check_userspace:
 	andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
 	cmpl $USER_RPL, %eax
 	jb resume_kernel		# not returning to v8086 or userspace
+
 ENTRY(resume_userspace)
 	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
@@ -327,9 +339,16 @@ sysenter_past_esp:
 	movl PT_OLDESP(%esp), %ecx
 	xorl %ebp,%ebp
 	TRACE_IRQS_ON
+1:	mov  PT_GS(%esp), %gs
 	ENABLE_INTERRUPTS_SYSEXIT
 	CFI_ENDPROC
-
+.pushsection .fixup,"ax"
+2:	movl $0,PT_GS(%esp)
+	jmp 1b
+.section __ex_table,"a"
+	.align 4
+	.long 1b,2b
+.popsection

 	# system call handler stub
 ENTRY(system_call)
@@ -375,7 +394,7 @@ restore_nocheck:
 	TRACE_IRQS_IRET
 restore_nocheck_notrace:
 	RESTORE_REGS
-	addl $4, %esp
+	addl $4, %esp			# skip orig_eax/error_code
 	CFI_ADJUST_CFA_OFFSET -4
 1:	INTERRUPT_RETURN
 .section .fixup,"ax"
@@ -588,6 +607,10 @@ KPROBE_ENTRY(page_fault)
 	CFI_ADJUST_CFA_OFFSET 4
 	ALIGN
 error_code:
+	/* the function address is in %gs's slot on the stack */
+	pushl %es
+	CFI_ADJUST_CFA_OFFSET 4
+	/*CFI_REL_OFFSET es, 0*/
 	pushl %ds
 	CFI_ADJUST_CFA_OFFSET 4
 	/*CFI_REL_OFFSET ds, 0*/
@@ -613,18 +636,20 @@ error_code:
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET ebx, 0
 	cld
-	pushl %es
+	pushl %gs
 	CFI_ADJUST_CFA_OFFSET 4
-	/*CFI_REL_OFFSET es, 0*/
+	/*CFI_REL_OFFSET gs, 0*/
+	movl $(__KERNEL_PDA), %ecx
+	movl %ecx, %gs
 	UNWIND_ESPFIX_STACK
 	popl %ecx
 	CFI_ADJUST_CFA_OFFSET -4
 	/*CFI_REGISTER es, ecx*/
-	movl PT_ES(%esp), %edi		# get the function address
+	movl PT_GS(%esp), %edi		# get the function address
 	movl PT_ORIG_EAX(%esp), %edx	# get the error code
-	movl $-1, PT_ORIG_EAX(%esp)
-	movl %ecx, PT_ES(%esp)
-	/*CFI_REL_OFFSET es, ES*/
+	movl $-1, PT_ORIG_EAX(%esp)	# no syscall to restart
+	mov  %ecx, PT_GS(%esp)
+	/*CFI_REL_OFFSET gs, ES*/
 	movl $(__USER_DS), %ecx
 	movl %ecx, %ds
 	movl %ecx, %es
@@ -936,6 +961,7 @@ ENTRY(arch_unwind_init_running)
 	movl %ebx, PT_EAX(%edx)
 	movl $__USER_DS, PT_DS(%edx)
 	movl $__USER_DS, PT_ES(%edx)
+	movl $0, PT_GS(%edx)
 	movl %ebx, PT_ORIG_EAX(%edx)
 	movl %ecx, PT_EIP(%edx)
 	movl 12(%esp), %ecx
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index 4a83384c5a61..5b14e95ac8b9 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -302,6 +302,7 @@ is386:	movl $2,%ecx		# set MP
 	movl %eax,%cr0

 	call check_x87
+	call setup_pda
 	lgdt cpu_gdt_descr
 	lidt idt_descr
 	ljmp $(__KERNEL_CS),$1f
@@ -312,10 +313,13 @@ is386:	movl $2,%ecx		# set MP
 	movl %eax,%ds
 	movl %eax,%es

-	xorl %eax,%eax		# Clear FS/GS and LDT
+	xorl %eax,%eax		# Clear FS and LDT
 	movl %eax,%fs
-	movl %eax,%gs
 	lldt %ax
+
+	movl $(__KERNEL_PDA),%eax
+	mov  %eax,%gs
+
 	cld			# gcc2 wants the direction flag cleared at all times
 	pushl $0		# fake return address for unwinder
 #ifdef CONFIG_SMP
@@ -345,6 +349,23 @@ check_x87:
 	.byte 0xDB,0xE4		/* fsetpm for 287, ignored by 387 */
 	ret

+/*
+ * Point the GDT at this CPU's PDA.  On boot this will be
+ * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
+ * that CPU's GDT and PDA.
+ */
+setup_pda:
+	/* get the PDA pointer */
+	movl start_pda, %eax
+
+	/* slot the PDA address into the GDT */
+	mov cpu_gdt_descr+2, %ecx
+	mov %ax, (__KERNEL_PDA+0+2)(%ecx)	/* base & 0x0000ffff */
+	shr $16, %eax
+	mov %al, (__KERNEL_PDA+4+0)(%ecx)	/* base & 0x00ff0000 */
+	mov %ah, (__KERNEL_PDA+4+3)(%ecx)	/* base & 0xff000000 */
+	ret
+
 /*
  * setup_idt
  *
@@ -484,6 +505,8 @@ ENTRY(empty_zero_page)
 * This starts the data section.
 */
 .data
+ENTRY(start_pda)
+	.long boot_pda

 ENTRY(stack_start)
 	.long init_thread_union+THREAD_SIZE
@@ -525,7 +548,7 @@ idt_descr:

 # boot GDT descriptor (later on used by CPU#0):
 	.word 0				# 32 bit align gdt_desc.address
-cpu_gdt_descr:
+ENTRY(cpu_gdt_descr)
 	.word GDT_ENTRIES*8-1
 	.long cpu_gdt_table

@@ -585,7 +608,7 @@ ENTRY(cpu_gdt_table)
 	.quad 0x004092000000ffff	/* 0xc8 APM DS    data */

 	.quad 0x00c0920000000000	/* 0xd0 - ESPFIX SS */
-	.quad 0x0000000000000000	/* 0xd8 - PDA */
+	.quad 0x00cf92000000ffff	/* 0xd8 - PDA */
 	.quad 0x0000000000000000	/* 0xe0 - unused */
 	.quad 0x0000000000000000	/* 0xe8 - unused */
 	.quad 0x0000000000000000	/* 0xf0 - unused */
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index ae924c416b68..905364d42847 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -56,6 +56,7 @@

 #include
 #include
+#include

 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");

@@ -346,6 +347,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)

 	regs.xds = __USER_DS;
 	regs.xes = __USER_DS;
+	regs.xgs = __KERNEL_PDA;
 	regs.orig_eax = -1;
 	regs.eip = (unsigned long) kernel_thread_helper;
 	regs.xcs = __KERNEL_CS | get_kernel_rpl();
@@ -431,7 +433,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
 	p->thread.eip = (unsigned long) ret_from_fork;

 	savesegment(fs,p->thread.fs);
-	savesegment(gs,p->thread.gs);

 	tsk = current;
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -659,16 +660,16 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 	load_esp0(tss, next);

 	/*
-	 * Save away %fs and %gs. No need to save %es and %ds, as
-	 * those are always kernel segments while inside the kernel.
-	 * Doing this before setting the new TLS descriptors avoids
-	 * the situation where we temporarily have non-reloadable
-	 * segments in %fs and %gs. This could be an issue if the
-	 * NMI handler ever used %fs or %gs (it does not today), or
-	 * if the kernel is running inside of a hypervisor layer.
+	 * Save away %fs. No need to save %gs, as it was saved on the
+	 * stack on entry.  No need to save %es and %ds, as those are
+	 * always kernel segments while inside the kernel.  Doing this
+	 * before setting the new TLS descriptors avoids the situation
+	 * where we temporarily have non-reloadable segments in %fs
+	 * and %gs.  This could be an issue if the NMI handler ever
+	 * used %fs or %gs (it does not today), or if the kernel is
+	 * running inside of a hypervisor layer.
 	 */
 	savesegment(fs, prev->fs);
-	savesegment(gs, prev->gs);

 	/*
 	 * Load the per-thread Thread-Local Storage descriptor.
@@ -676,16 +677,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 	load_TLS(next, cpu);

 	/*
-	 * Restore %fs and %gs if needed.
+	 * Restore %fs if needed.
 	 *
-	 * Glibc normally makes %fs be zero, and %gs is one of
-	 * the TLS segments.
+	 * Glibc normally makes %fs be zero.
 	 */
 	if (unlikely(prev->fs | next->fs))
 		loadsegment(fs, next->fs);
-	if (prev->gs | next->gs)
-		loadsegment(gs, next->gs);

 	/*
 	 * Restore IOPL if needed.
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 43002cfb40c4..65d7620eaa09 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -128,7 +128,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
 		 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
 		 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)

-	GET_SEG(gs);
+	COPY_SEG(gs);
 	GET_SEG(fs);
 	COPY_SEG(es);
 	COPY_SEG(ds);
@@ -244,9 +244,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
 {
 	int tmp, err = 0;

-	tmp = 0;
-	savesegment(gs, tmp);
-	err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
+	err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs);

 	savesegment(fs, tmp);
 	err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
diff --git a/include/asm-i386/mmu_context.h b/include/asm-i386/mmu_context.h
index 1b1495372c4d..68ff102d6f5e 100644
--- a/include/asm-i386/mmu_context.h
+++ b/include/asm-i386/mmu_context.h
@@ -62,8 +62,8 @@ static inline void switch_mm(struct mm_struct *prev,
 #endif
 }

-#define deactivate_mm(tsk, mm) \
-	asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
+#define deactivate_mm(tsk, mm)			\
+	asm("movl %0,%%fs": :"r" (0));

 #define activate_mm(prev, next) \
 	switch_mm((prev),(next),NULL)
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index a9f2041c7c87..f73cf836e649 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -473,6 +473,7 @@ struct thread_struct {
 	.vm86_info	= NULL,		\
 	.sysenter_cs	= __KERNEL_CS,	\
 	.io_bitmap_ptr	= NULL,		\
+	.gs		= __KERNEL_PDA,	\
 }

 /*
@@ -500,7 +501,8 @@ static inline void load_esp0(struct tss_struct *tss, struct thread_struct *threa
 }

 #define start_thread(regs, new_eip, new_esp) do {		\
-	__asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0));	\
+	__asm__("movl %0,%%fs": :"r" (0));			\
+	regs->xgs = 0;						\
 	set_fs(USER_DS);					\
 	regs->xds = __USER_DS;					\
 	regs->xes = __USER_DS;					\
diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h
index d505f501077a..bdbc894339b4 100644
--- a/include/asm-i386/ptrace.h
+++ b/include/asm-i386/ptrace.h
@@ -16,6 +16,8 @@ struct pt_regs {
 	long eax;
 	int  xds;
 	int  xes;
+	/* int  xfs; */
+	int  xgs;
 	long orig_eax;
 	long eip;
 	int  xcs;
diff --git a/kernel/fork.c b/kernel/fork.c
index 8cdd3e72ba55..fd22245e3881 100644
--- a/kernel/fork.c
+++
b/kernel/fork.c @@ -1303,7 +1303,7 @@ fork_out: return ERR_PTR(retval); } -struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) +noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); return regs; -- cgit v1.2.3-59-g8ed1b From 66e10a44d724f1464b5e8b5a3eae1e2cbbc2cca6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:02 +0100 Subject: [PATCH] i386: Fix places where using %gs changes the usermode ABI There are a few places where the change in struct pt_regs and the use of %gs affect the userspace ABI. These are primarily debugging interfaces where thread state can be inspected or extracted. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/process.c | 6 +++--- arch/i386/kernel/ptrace.c | 18 ++++++------------ include/asm-i386/elf.h | 2 +- include/asm-i386/unwind.h | 1 + 4 files changed, 11 insertions(+), 16 deletions(-) (limited to 'include/asm-i386') diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 905364d42847..dc4272545179 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -315,8 +315,8 @@ void show_regs(struct pt_regs * regs) regs->eax,regs->ebx,regs->ecx,regs->edx); printk("ESI: %08lx EDI: %08lx EBP: %08lx", regs->esi, regs->edi, regs->ebp); - printk(" DS: %04x ES: %04x\n", - 0xffff & regs->xds,0xffff & regs->xes); + printk(" DS: %04x ES: %04x GS: %04x\n", + 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs); cr0 = read_cr0(); cr2 = read_cr2(); @@ -509,7 +509,7 @@ void dump_thread(struct pt_regs * regs, struct user * dump) dump->regs.ds = regs->xds; dump->regs.es = regs->xes; savesegment(fs,dump->regs.fs); - savesegment(gs,dump->regs.gs); + dump->regs.gs = regs->xgs; dump->regs.orig_eax = regs->orig_eax; dump->regs.eip = regs->eip; dump->regs.cs = regs->xcs; diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index 775f50e9395b..f3f94ac5736a 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -94,13 +94,9 @@ static int putreg(struct task_struct *child, return -EIO; child->thread.fs = value; return 0; - case GS: - if (value && (value & 3) != 3) - return -EIO; - child->thread.gs = value; - return 0; case DS: case ES: + case GS: if (value && (value & 3) != 3) return -EIO; value &= 0xffff; @@ -116,8 +112,8 @@ static int putreg(struct task_struct *child, value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; break; } - if (regno > GS*4) - regno -= 2*4; + if (regno > ES*4) + regno -= 1*4; put_stack_long(child, regno - sizeof(struct pt_regs), value); return 0; } @@ -131,18 +127,16 @@ static unsigned long getreg(struct task_struct *child, case FS: retval = child->thread.fs; break; - case GS: - retval = child->thread.gs; - break; case DS: case ES: + case GS: case SS: case CS: retval = 0xffff; /* fall through */ default: - if (regno > GS*4) - regno -= 2*4; + if (regno > ES*4) + regno -= 1*4; regno = regno - sizeof(struct pt_regs); retval &= get_stack_long(child, regno); } diff --git a/include/asm-i386/elf.h b/include/asm-i386/elf.h index 3a05436f31c0..45d21a0c95bf 100644 --- a/include/asm-i386/elf.h +++ b/include/asm-i386/elf.h @@ -91,7 +91,7 @@ typedef struct user_fxsr_struct elf_fpxregset_t; pr_reg[7] = regs->xds; \ pr_reg[8] = regs->xes; \ savesegment(fs,pr_reg[9]); \ - savesegment(gs,pr_reg[10]); \ + pr_reg[10] 
= regs->xgs; \ pr_reg[11] = regs->orig_eax; \ pr_reg[12] = regs->eip; \ pr_reg[13] = regs->xcs; \ diff --git a/include/asm-i386/unwind.h b/include/asm-i386/unwind.h index 5031d693b89d..601fc67bd775 100644 --- a/include/asm-i386/unwind.h +++ b/include/asm-i386/unwind.h @@ -71,6 +71,7 @@ static inline void arch_unw_init_blocked(struct unwind_frame_info *info) info->regs.xss = __KERNEL_DS; info->regs.xds = __USER_DS; info->regs.xes = __USER_DS; + info->regs.xgs = __KERNEL_PDA; } extern asmlinkage int arch_unwind_init_running(struct unwind_frame_info *, -- cgit v1.2.3-59-g8ed1b From 49d26b6eaa8e970c8cf6e299e6ccba2474191bf5 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] i386: Update sys_vm86 to cope with changed pt_regs and %gs usage sys_vm86 uses a struct kernel_vm86_regs, which is identical to pt_regs, but adds an extra space for all the segment registers. Previously this structure was completely independent, so changes in pt_regs had to be reflected in kernel_vm86_regs. This changes just embeds pt_regs in kernel_vm86_regs, and makes the appropriate changes to vm86.c to deal with the new naming. Also, since %gs is dealt with differently in the kernel, this change adjusts vm86.c to reflect this. While making these changes, I also cleaned up some frankly bizarre code which was added when auditing was added to sys_vm86. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen Cc: Al Viro Cc: Jason Baron Cc: Chris Wright Signed-off-by: Andrew Morton --- arch/i386/kernel/vm86.c | 121 +++++++++++++++++++++++++++++------------------- include/asm-i386/vm86.h | 17 +------ 2 files changed, 76 insertions(+), 62 deletions(-) (limited to 'include/asm-i386') diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c index cbcd61d6120b..be2f96e67f78 100644 --- a/arch/i386/kernel/vm86.c +++ b/arch/i386/kernel/vm86.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -72,10 +73,10 @@ /* * 8- and 16-bit register defines.. */ -#define AL(regs) (((unsigned char *)&((regs)->eax))[0]) -#define AH(regs) (((unsigned char *)&((regs)->eax))[1]) -#define IP(regs) (*(unsigned short *)&((regs)->eip)) -#define SP(regs) (*(unsigned short *)&((regs)->esp)) +#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0]) +#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1]) +#define IP(regs) (*(unsigned short *)&((regs)->pt.eip)) +#define SP(regs) (*(unsigned short *)&((regs)->pt.esp)) /* * virtual flags (16 and 32-bit versions) @@ -89,10 +90,37 @@ #define SAFE_MASK (0xDD5) #define RETURN_MASK (0xDFF) -#define VM86_REGS_PART2 orig_eax -#define VM86_REGS_SIZE1 \ - ( (unsigned)( & (((struct kernel_vm86_regs *)0)->VM86_REGS_PART2) ) ) -#define VM86_REGS_SIZE2 (sizeof(struct kernel_vm86_regs) - VM86_REGS_SIZE1) +/* convert kernel_vm86_regs to vm86_regs */ +static int copy_vm86_regs_to_user(struct vm86_regs __user *user, + const struct kernel_vm86_regs *regs) +{ + int ret = 0; + + /* kernel_vm86_regs is missing xfs, so copy everything up to + (but not including) xgs, and then rest after xgs. 
*/ + ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs)); + ret += copy_to_user(&user->__null_gs, ®s->pt.xgs, + sizeof(struct kernel_vm86_regs) - + offsetof(struct kernel_vm86_regs, pt.xgs)); + + return ret; +} + +/* convert vm86_regs to kernel_vm86_regs */ +static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs, + const struct vm86_regs __user *user, + unsigned extra) +{ + int ret = 0; + + ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs)); + ret += copy_from_user(®s->pt.xgs, &user->__null_gs, + sizeof(struct kernel_vm86_regs) - + offsetof(struct kernel_vm86_regs, pt.xgs) + + extra); + + return ret; +} struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) @@ -112,10 +140,8 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); } - set_flags(regs->eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); - tmp = copy_to_user(¤t->thread.vm86_info->regs,regs, VM86_REGS_SIZE1); - tmp += copy_to_user(¤t->thread.vm86_info->regs.VM86_REGS_PART2, - ®s->VM86_REGS_PART2, VM86_REGS_SIZE2); + set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); + tmp = copy_vm86_regs_to_user(¤t->thread.vm86_info->regs,regs); tmp += put_user(current->thread.screen_bitmap,¤t->thread.vm86_info->screen_bitmap); if (tmp) { printk("vm86: could not access userspace vm86_info\n"); @@ -129,9 +155,11 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) current->thread.saved_esp0 = 0; put_cpu(); - loadsegment(fs, current->thread.saved_fs); - loadsegment(gs, current->thread.saved_gs); ret = KVM86->regs32; + + loadsegment(fs, current->thread.saved_fs); + ret->xgs = current->thread.saved_gs; + return ret; } @@ -183,9 +211,9 @@ asmlinkage int sys_vm86old(struct pt_regs regs) tsk = current; if (tsk->thread.saved_esp0) goto out; - tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); - tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, - (long)&info.vm86plus - (long)&info.regs.VM86_REGS_PART2); + tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, + offsetof(struct kernel_vm86_struct, vm86plus) - + sizeof(info.regs)); ret = -EFAULT; if (tmp) goto out; @@ -233,9 +261,9 @@ asmlinkage int sys_vm86(struct pt_regs regs) if (tsk->thread.saved_esp0) goto out; v86 = (struct vm86plus_struct __user *)regs.ecx; - tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); - tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, - (long)&info.regs32 - (long)&info.regs.VM86_REGS_PART2); + tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, + offsetof(struct kernel_vm86_struct, regs32) - + sizeof(info.regs)); ret = -EFAULT; if (tmp) goto out; @@ -252,15 +280,15 @@ out: static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) { struct tss_struct *tss; - long eax; /* * make sure the vm86() system call doesn't try to do anything silly */ - info->regs.__null_ds = 0; - info->regs.__null_es = 0; + info->regs.pt.xds = 0; + info->regs.pt.xes = 0; + info->regs.pt.xgs = 0; -/* we are clearing fs,gs later just before "jmp resume_userspace", - * because starting with Linux 2.1.x they aren't no longer saved/restored +/* we are clearing fs later just before "jmp resume_userspace", + * because it is not saved/restored. 
*/ /* @@ -268,10 +296,10 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk * has set it up safely, so this makes sure interrupt etc flags are * inherited from protected mode. */ - VEFLAGS = info->regs.eflags; - info->regs.eflags &= SAFE_MASK; - info->regs.eflags |= info->regs32->eflags & ~SAFE_MASK; - info->regs.eflags |= VM_MASK; + VEFLAGS = info->regs.pt.eflags; + info->regs.pt.eflags &= SAFE_MASK; + info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK; + info->regs.pt.eflags |= VM_MASK; switch (info->cpu_type) { case CPU_286: @@ -294,7 +322,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk info->regs32->eax = 0; tsk->thread.saved_esp0 = tsk->thread.esp0; savesegment(fs, tsk->thread.saved_fs); - savesegment(gs, tsk->thread.saved_gs); + tsk->thread.saved_gs = info->regs32->xgs; tss = &per_cpu(init_tss, get_cpu()); tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; @@ -306,19 +334,18 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk tsk->thread.screen_bitmap = info->screen_bitmap; if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); - __asm__ __volatile__("xorl %eax,%eax; movl %eax,%fs; movl %eax,%gs\n\t"); - __asm__ __volatile__("movl %%eax, %0\n" :"=r"(eax)); /*call audit_syscall_exit since we do not exit via the normal paths */ if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(eax), eax); + audit_syscall_exit(AUDITSC_RESULT(0), 0); __asm__ __volatile__( "movl %0,%%esp\n\t" "movl %1,%%ebp\n\t" + "mov %2, %%fs\n\t" "jmp resume_userspace" : /* no outputs */ - :"r" (&info->regs), "r" (task_thread_info(tsk))); + :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); /* we never return here */ } @@ -348,12 +375,12 @@ static inline void clear_IF(struct kernel_vm86_regs * regs) static inline void clear_TF(struct kernel_vm86_regs * regs) { - regs->eflags &= ~TF_MASK; + regs->pt.eflags &= ~TF_MASK; } static inline void clear_AC(struct kernel_vm86_regs * regs) { - regs->eflags &= ~AC_MASK; + regs->pt.eflags &= ~AC_MASK; } /* It is correct to call set_IF(regs) from the set_vflags_* @@ -370,7 +397,7 @@ static inline void clear_AC(struct kernel_vm86_regs * regs) static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs) { set_flags(VEFLAGS, eflags, current->thread.v86mask); - set_flags(regs->eflags, eflags, SAFE_MASK); + set_flags(regs->pt.eflags, eflags, SAFE_MASK); if (eflags & IF_MASK) set_IF(regs); else @@ -380,7 +407,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs) { set_flags(VFLAGS, flags, current->thread.v86mask); - set_flags(regs->eflags, flags, SAFE_MASK); + set_flags(regs->pt.eflags, flags, SAFE_MASK); if (flags & IF_MASK) set_IF(regs); else @@ -389,7 +416,7 @@ static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_reg static inline unsigned long get_vflags(struct kernel_vm86_regs * regs) { - unsigned long flags = regs->eflags & RETURN_MASK; + unsigned long flags = regs->pt.eflags & RETURN_MASK; if (VEFLAGS & VIF_MASK) flags |= IF_MASK; @@ -493,7 +520,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i, unsigned long __user *intr_ptr; unsigned long segoffs; - if (regs->cs == BIOSSEG) + if (regs->pt.xcs == BIOSSEG) goto cannot_handle; if (is_revectored(i, &KVM86->int_revectored)) goto cannot_handle; @@ -505,9 +532,9 @@ static void do_int(struct 
kernel_vm86_regs *regs, int i, if ((segoffs >> 16) == BIOSSEG) goto cannot_handle; pushw(ssp, sp, get_vflags(regs), cannot_handle); - pushw(ssp, sp, regs->cs, cannot_handle); + pushw(ssp, sp, regs->pt.xcs, cannot_handle); pushw(ssp, sp, IP(regs), cannot_handle); - regs->cs = segoffs >> 16; + regs->pt.xcs = segoffs >> 16; SP(regs) -= 6; IP(regs) = segoffs & 0xffff; clear_TF(regs); @@ -524,7 +551,7 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno if (VMPI.is_vm86pus) { if ( (trapno==3) || (trapno==1) ) return_to_32bit(regs, VM86_TRAP + (trapno << 8)); - do_int(regs, trapno, (unsigned char __user *) (regs->ss << 4), SP(regs)); + do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs)); return 0; } if (trapno !=1) @@ -560,10 +587,10 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) handle_vm86_trap(regs, 0, 1); \ return; } while (0) - orig_flags = *(unsigned short *)®s->eflags; + orig_flags = *(unsigned short *)®s->pt.eflags; - csp = (unsigned char __user *) (regs->cs << 4); - ssp = (unsigned char __user *) (regs->ss << 4); + csp = (unsigned char __user *) (regs->pt.xcs << 4); + ssp = (unsigned char __user *) (regs->pt.xss << 4); sp = SP(regs); ip = IP(regs); @@ -650,7 +677,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) SP(regs) += 6; } IP(regs) = newip; - regs->cs = newcs; + regs->pt.xcs = newcs; CHECK_IF_IN_TRAP; if (data32) { set_vflags_long(newflags, regs); diff --git a/include/asm-i386/vm86.h b/include/asm-i386/vm86.h index 952fd6957380..a5edf517b992 100644 --- a/include/asm-i386/vm86.h +++ b/include/asm-i386/vm86.h @@ -145,26 +145,13 @@ struct vm86plus_struct { * at the end of the structure. Look at ptrace.h to see the "normal" * setup. For user space layout see 'struct vm86_regs' above. */ +#include struct kernel_vm86_regs { /* * normal regs, with special meaning for the segment descriptors.. */ - long ebx; - long ecx; - long edx; - long esi; - long edi; - long ebp; - long eax; - long __null_ds; - long __null_es; - long orig_eax; - long eip; - unsigned short cs, __csh; - long eflags; - long esp; - unsigned short ss, __ssh; + struct pt_regs pt; /* * these are specific to v86 mode: */ -- cgit v1.2.3-59-g8ed1b From b2938f880890ebfcccad356275e0000193153623 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] i386: Implement smp_processor_id() with the PDA Use the cpu_number in the PDA to implement raw_smp_processor_id. This is a little simpler than using thread_info, though the cpu field in thread_info cannot be removed since it is used for things other than getting the current CPU in common code. 
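As an illustration of the mechanism (a sketch, not part of the patch; the helper name is hypothetical): with %gs carrying __KERNEL_PDA, reading a PDA field is a single %gs-relative load whose offset is a compile-time constant, which is essentially what read_pda(cpu_number) compiles down to.

#include <stddef.h>

/* Mirrors struct i386_pda as of this point in the series (see
 * include/asm-i386/pda.h); reproduced here only to keep the sketch
 * self-contained. */
struct i386_pda {
	struct i386_pda *_pda;		/* pointer to self */
	int cpu_number;
};

/* Hypothetical helper: expands to one instruction on i386,
 * e.g. "movl %gs:4, %eax", since the offsetof() folds to a constant. */
static inline int pda_cpu_number(void)
{
	int cpu;

	asm("movl %%gs:%c1, %0"
	    : "=r" (cpu)
	    : "i" (offsetof(struct i386_pda, cpu_number)));
	return cpu;
}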
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/asm-offsets.c | 4 +++- arch/i386/kernel/cpu/common.c | 2 ++ arch/i386/kernel/entry.S | 3 +-- include/asm-i386/pda.h | 2 ++ include/asm-i386/smp.h | 3 ++- 5 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include/asm-i386') diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 9620872d3534..85f1b038e9c3 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -51,7 +51,6 @@ void foo(void) OFFSET(TI_exec_domain, thread_info, exec_domain); OFFSET(TI_flags, thread_info, flags); OFFSET(TI_status, thread_info, status); - OFFSET(TI_cpu, thread_info, cpu); OFFSET(TI_preempt_count, thread_info, preempt_count); OFFSET(TI_addr_limit, thread_info, addr_limit); OFFSET(TI_restart_block, thread_info, restart_block); @@ -97,4 +96,7 @@ void foo(void) DEFINE(VDSO_PRELINK, VDSO_PRELINK); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); + + BLANK(); + OFFSET(PDA_cpu, i386_pda, cpu_number); } diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 4e63d8ce602b..e476202b887f 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -650,6 +650,7 @@ __cpuinit int alloc_gdt(int cpu) /* Initial PDA used by boot CPU */ struct i386_pda boot_pda = { ._pda = &boot_pda, + .cpu_number = 0, }; static inline void set_kernel_gs(void) @@ -694,6 +695,7 @@ __cpuinit int init_gdt(int cpu, struct task_struct *idle) memset(pda, 0, sizeof(*pda)); pda->_pda = pda; + pda->cpu_number = cpu; return 1; } diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index b99d4a160078..d7423efaeea4 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -524,8 +524,7 @@ syscall_badsys: #define FIXUP_ESPFIX_STACK \ /* since we are on a wrong stack, we cant make it a C code :( */ \ - GET_THREAD_INFO(%ebp); \ - movl TI_cpu(%ebp), %ebx; \ + movl %gs:PDA_cpu, %ebx; \ PER_CPU(cpu_gdt_descr, %ebx); \ movl GDS_address(%ebx), %ebx; \ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ diff --git a/include/asm-i386/pda.h b/include/asm-i386/pda.h index 4c39ccb1305c..f90fde22566d 100644 --- a/include/asm-i386/pda.h +++ b/include/asm-i386/pda.h @@ -11,6 +11,8 @@ struct i386_pda { struct i386_pda *_pda; /* pointer to self */ + + int cpu_number; }; extern struct i386_pda *_cpu_pda[]; diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h index bd59c1508e71..64fe624c02ca 100644 --- a/include/asm-i386/smp.h +++ b/include/asm-i386/smp.h @@ -8,6 +8,7 @@ #include #include #include +#include #endif #ifdef CONFIG_X86_LOCAL_APIC @@ -56,7 +57,7 @@ extern void cpu_uninit(void); * from the initial startup. We map APIC_BASE very early in page_setup(), * so this is correct in the x86 case. */ -#define raw_smp_processor_id() (current_thread_info()->cpu) +#define raw_smp_processor_id() (read_pda(cpu_number)) extern cpumask_t cpu_callout_map; extern cpumask_t cpu_callin_map; -- cgit v1.2.3-59-g8ed1b From ec7fcaabbfb3c5bd5189f857b6ac7bb9745ef291 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] i386: Implement "current" with the PDA Use the pcurrent field in the PDA to implement the "current" macro. This ends up compiling down to a single instruction to get the current task. 
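Again as a sketch rather than part of the patch (hypothetical macro name; the real, type-generic read_pda() lives in include/asm-i386/pda.h): assuming struct i386_pda, as extended by this commit, is in scope, an accessor of this shape makes "current" the single load described above, with none of the stack-masking arithmetic that the old current_thread_info()->task path required.

/* Hypothetical, simplified accessor; the field offset folds to a
 * compile-time constant, so each use is one %gs-relative load. */
#define read_pda_sketch(field)						\
({									\
	typeof(((struct i386_pda *)0)->field) ret__;			\
	asm("movl %%gs:%c1, %0"						\
	    : "=r" (ret__)						\
	    : "i" (__builtin_offsetof(struct i386_pda, field)));	\
	ret__;								\
})

/* get_current() then reduces to "movl %gs:<offset of pcurrent>, %eax". */
#define current_sketch	read_pda_sketch(pcurrent)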
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/asm-offsets.c | 2 ++ arch/i386/kernel/cpu/common.c | 2 ++ arch/i386/kernel/process.c | 1 + include/asm-i386/current.h | 7 ++++--- include/asm-i386/pda.h | 2 ++ 5 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include/asm-i386') diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 85f1b038e9c3..0666eb0ed7bc 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -15,6 +15,7 @@ #include #include #include +#include #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -99,4 +100,5 @@ void foo(void) BLANK(); OFFSET(PDA_cpu, i386_pda, cpu_number); + OFFSET(PDA_pcurrent, i386_pda, pcurrent); } diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index e476202b887f..6958ae5e2fa5 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -651,6 +651,7 @@ __cpuinit int alloc_gdt(int cpu) struct i386_pda boot_pda = { ._pda = &boot_pda, .cpu_number = 0, + .pcurrent = &init_task, }; static inline void set_kernel_gs(void) @@ -696,6 +697,7 @@ __cpuinit int init_gdt(int cpu, struct task_struct *idle) memset(pda, 0, sizeof(*pda)); pda->_pda = pda; pda->cpu_number = cpu; + pda->pcurrent = idle; return 1; } diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index dc4272545179..8749b10d3805 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -684,6 +684,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas if (unlikely(prev->fs | next->fs)) loadsegment(fs, next->fs); + write_pda(pcurrent, next_p); /* * Restore IOPL if needed. 
diff --git a/include/asm-i386/current.h b/include/asm-i386/current.h index 3cbbecd79016..5252ee0f6d7a 100644 --- a/include/asm-i386/current.h +++ b/include/asm-i386/current.h @@ -1,13 +1,14 @@ #ifndef _I386_CURRENT_H #define _I386_CURRENT_H -#include +#include +#include struct task_struct; -static __always_inline struct task_struct * get_current(void) +static __always_inline struct task_struct *get_current(void) { - return current_thread_info()->task; + return read_pda(pcurrent); } #define current get_current() diff --git a/include/asm-i386/pda.h b/include/asm-i386/pda.h index f90fde22566d..08a35c478af2 100644 --- a/include/asm-i386/pda.h +++ b/include/asm-i386/pda.h @@ -7,12 +7,14 @@ #define _I386_PDA_H #include +#include struct i386_pda { struct i386_pda *_pda; /* pointer to self */ int cpu_number; + struct task_struct *pcurrent; /* current process */ }; extern struct i386_pda *_cpu_pda[]; -- cgit v1.2.3-59-g8ed1b From 70463daca852db396ce17f179d2404b257ba0f66 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] i386: Store the interrupt regs pointer in the PDA Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- include/asm-i386/irq_regs.h | 28 +++++++++++++++++++++++++++- include/asm-i386/pda.h | 1 + 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/irq_regs.h b/include/asm-i386/irq_regs.h index 3dd9c0b70270..a1b3f7f594a2 100644 --- a/include/asm-i386/irq_regs.h +++ b/include/asm-i386/irq_regs.h @@ -1 +1,27 @@ -#include +/* + * Per-cpu current frame pointer - the location of the last exception frame on + * the stack, stored in the PDA. + * + * Jeremy Fitzhardinge + */ +#ifndef _ASM_I386_IRQ_REGS_H +#define _ASM_I386_IRQ_REGS_H + +#include + +static inline struct pt_regs *get_irq_regs(void) +{ + return read_pda(irq_regs); +} + +static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs) +{ + struct pt_regs *old_regs; + + old_regs = read_pda(irq_regs); + write_pda(irq_regs, new_regs); + + return old_regs; +} + +#endif /* _ASM_I386_IRQ_REGS_H */ diff --git a/include/asm-i386/pda.h b/include/asm-i386/pda.h index 08a35c478af2..2ba2736aa109 100644 --- a/include/asm-i386/pda.h +++ b/include/asm-i386/pda.h @@ -15,6 +15,7 @@ struct i386_pda int cpu_number; struct task_struct *pcurrent; /* current process */ + struct pt_regs *irq_regs; }; extern struct i386_pda *_cpu_pda[]; -- cgit v1.2.3-59-g8ed1b From 63cb683c6ed56a420ba60df6a5b206a44e3f85fe Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] i386: PDA: Fix math emulator for new pt_regs This patch fixes the math emulator, which had not been adjusted to match the changed struct pt_regs. AK: extracted from larger patch by Jeremy. Signed-off-by: Andi Kleen --- include/asm-i386/math_emu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/asm-i386') diff --git a/include/asm-i386/math_emu.h b/include/asm-i386/math_emu.h index 697673b555ce..a4b0aa3320e6 100644 --- a/include/asm-i386/math_emu.h +++ b/include/asm-i386/math_emu.h @@ -21,6 +21,7 @@ struct info { long ___eax; long ___ds; long ___es; + long ___fs; long ___orig_eax; long ___eip; long ___cs; -- cgit v1.2.3-59-g8ed1b From 9f45accf17efc050ba26bf77cc4f166c950b284e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] i386: define __pa_symbol() On x86_64 we have to be careful with calculating the physical address of kernel symbols. 
This is both because of compiler oddities and because the symbols live in a different range of the virtual address space. Having a definition of __pa_symbol that works on both x86_64 and i386 simplifies writing code with these kinds of dependencies that works on both architectures. So this patch adds the trivial i386 __pa_symbol definition. Added assembly magic similar to RELOC_HIDE as suggested by Andi Kleen. Just picked it up from x86_64.
Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- include/asm-i386/page.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/asm-i386')
diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index f5bf544c729a..5a70501291d6 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -124,6 +124,9 @@ extern int page_is_ram(unsigned long pagenr); #define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE) #define MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE) #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) +/* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ +#define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x),0)) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) #ifdef CONFIG_FLATMEM -- cgit v1.2.3-59-g8ed1b
From 2a43f3ede48ea3d5790b863b719a1e21c90a3697 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 7 Dec 2006 02:14:04 +0100 Subject: [PATCH] i386: CONFIG_PHYSICAL_START cleanup
Defining __PHYSICAL_START and __KERNEL_START in asm-i386/page.h works, but it triggers a full kernel rebuild for the silliest of reasons. This modifies the users to use CONFIG_PHYSICAL_START and linux/config.h directly, which prevents the full-rebuild problem and makes the code much more maintainable and hopefully more user friendly.
Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/i386/boot/compressed/head.S | 7 +++---- arch/i386/boot/compressed/misc.c | 7 +++---- arch/i386/kernel/vmlinux.lds.S | 2 +- include/asm-i386/page.h | 3 --- 4 files changed, 7 insertions(+), 12 deletions(-) (limited to 'include/asm-i386')
diff --git a/arch/i386/boot/compressed/head.S b/arch/i386/boot/compressed/head.S index b5893e4ecd37..40a8de8270a9 100644 --- a/arch/i386/boot/compressed/head.S +++ b/arch/i386/boot/compressed/head.S @@ -25,7 +25,6 @@ #include #include -#include .globl startup_32 @@ -75,7 +74,7 @@ startup_32: popl %esi # discard address popl %esi # real mode pointer xorl %ebx,%ebx - ljmp $(__BOOT_CS), $__PHYSICAL_START + ljmp $(__BOOT_CS), $CONFIG_PHYSICAL_START /* * We come here, if we were loaded high.
@@ -100,7 +99,7 @@ startup_32: popl %ecx # lcount popl %edx # high_buffer_start popl %eax # hcount - movl $__PHYSICAL_START,%edi + movl $CONFIG_PHYSICAL_START,%edi cli # make sure we don't get interrupted ljmp $(__BOOT_CS), $0x1000 # and jump to the move routine @@ -125,5 +124,5 @@ move_routine_start: movsl movl %ebx,%esi # Restore setup pointer xorl %ebx,%ebx - ljmp $(__BOOT_CS), $__PHYSICAL_START + ljmp $(__BOOT_CS), $CONFIG_PHYSICAL_START move_routine_end:
diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index b2ccd543410d..20970ff44119 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -13,7 +13,6 @@ #include #include #include -#include /* * gzip declarations @@ -303,7 +302,7 @@ static void setup_normal_output_buffer(void) #else if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory"); #endif - output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */ + output_data = (unsigned char *)CONFIG_PHYSICAL_START; /* Normally Points to 1M */ free_mem_end_ptr = (long)real_mode; } @@ -326,8 +325,8 @@ static void setup_output_buffer_if_we_run_high(struct moveparams *mv) low_buffer_size = low_buffer_end - LOW_BUFFER_START; high_loaded = 1; free_mem_end_ptr = (long)high_buffer_start; - if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) { - high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size); + if ( (CONFIG_PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) { + high_buffer_start = (uch *)(CONFIG_PHYSICAL_START + low_buffer_size); mv->hcount = 0; /* say: we need not to move high_buffer */ } else mv->hcount = -1;
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index c217e18f1081..f8d61ec4c8cb 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -27,7 +27,7 @@ PHDRS { } SECTIONS { - . = __KERNEL_START; + . = LOAD_OFFSET + CONFIG_PHYSICAL_START; phys_startup_32 = startup_32 - LOAD_OFFSET; /* read-only */ .text : AT(ADDR(.text) - LOAD_OFFSET) {
diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index 5a70501291d6..2b69686107ae 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -112,12 +112,9 @@ extern int page_is_ram(unsigned long pagenr); #ifdef __ASSEMBLY__ #define __PAGE_OFFSET CONFIG_PAGE_OFFSET -#define __PHYSICAL_START CONFIG_PHYSICAL_START #else #define __PAGE_OFFSET ((unsigned long)CONFIG_PAGE_OFFSET) -#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START) #endif -#define __KERNEL_START (__PAGE_OFFSET + __PHYSICAL_START) #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) -- cgit v1.2.3-59-g8ed1b
From e69f202d0a1419219198566e1c22218a5c71a9a6 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 7 Dec 2006 02:14:04 +0100 Subject: [PATCH] i386: Implement CONFIG_PHYSICAL_ALIGN
o Now CONFIG_PHYSICAL_START is being replaced with CONFIG_PHYSICAL_ALIGN. Hardcoding the kernel physical start value creates a problem in a relocatable kernel context due to boot loader limitations. For example, if somebody compiles a relocatable kernel to be run from address 4MB, it will actually run from 1MB, because grub loads the kernel at physical address 1MB and a relocatable kernel runs from wherever it was loaded. So somebody wanting to run the kernel from a 4MB-aligned location (for improved performance reasons) can't do that.
o Hence, Eric proposed that probably CONFIG_PHYSICAL_ALIGN will make more sense in relocatable kernel context. At run time kernel will move itself to a physical addr location which meets user specified alignment restrictions. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/i386/Kconfig | 33 ++++++++++++++++++--------------- arch/i386/boot/compressed/head.S | 26 ++++++++++++++------------ arch/i386/boot/compressed/misc.c | 7 ++++--- arch/i386/kernel/vmlinux.lds.S | 3 ++- include/asm-i386/boot.h | 6 +++++- 5 files changed, 43 insertions(+), 32 deletions(-) (limited to 'include/asm-i386') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index d588ca874bb4..fd2fa7a7ec52 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -785,23 +785,26 @@ config RELOCATABLE must live at a different physical address than the primary kernel. -config PHYSICAL_START - hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) - - default "0x1000000" if CRASH_DUMP +config PHYSICAL_ALIGN + hex "Alignment value to which kernel should be aligned" default "0x100000" + range 0x2000 0x400000 help - This gives the physical address where the kernel is loaded. Normally - for regular kernels this value is 0x100000 (1MB). But in the case - of kexec on panic the fail safe kernel needs to run at a different - address than the panic-ed kernel. This option is used to set the load - address for kernels used to capture crash dump on being kexec'ed - after panic. The default value for crash dump kernels is - 0x1000000 (16MB). This can also be set based on the "X" value as - specified in the "crashkernel=YM@XM" command line boot parameter - passed to the panic-ed kernel. Typically this parameter is set as - crashkernel=64M@16M. Please take a look at - Documentation/kdump/kdump.txt for more details about crash dumps. + This value puts the alignment restrictions on physical address + where kernel is loaded and run from. Kernel is compiled for an + address which meets above alignment restriction. + + If bootloader loads the kernel at a non-aligned address and + CONFIG_RELOCATABLE is set, kernel will move itself to nearest + address aligned to above value and run from there. + + If bootloader loads the kernel at a non-aligned address and + CONFIG_RELOCATABLE is not set, kernel will ignore the run time + load address and decompress itself to the address it has been + compiled for and run from there. The address for which kernel is + compiled already meets above alignment restrictions. Hence the + end result is that kernel runs from a physical address meeting + above alignment restrictions. Don't change this unless you know what you are doing. diff --git a/arch/i386/boot/compressed/head.S b/arch/i386/boot/compressed/head.S index e4dd7a6b9b0f..f395a4bb38bb 100644 --- a/arch/i386/boot/compressed/head.S +++ b/arch/i386/boot/compressed/head.S @@ -26,6 +26,7 @@ #include #include #include +#include .section ".text.head" .globl startup_32 @@ -52,17 +53,17 @@ startup_32: 1: popl %ebp subl $1b, %ebp -/* Compute the delta between where we were compiled to run at - * and where the code will actually run at. +/* %ebp contains the address we are loaded at by the boot loader and %ebx + * contains the address where we should move the kernel image temporarily + * for safe in-place decompression. */ - /* Start with the delta to where the kernel will run at. If we are - * a relocatable kernel this is the delta to our load address otherwise - * this is the delta to CONFIG_PHYSICAL start. 
- */ + #ifdef CONFIG_RELOCATABLE - movl %ebp, %ebx + movl %ebp, %ebx + addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx + andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx #else - movl $(CONFIG_PHYSICAL_START - startup_32), %ebx + movl $LOAD_PHYSICAL_ADDR, %ebx #endif /* Replace the compressed data size with the uncompressed size */ @@ -94,9 +95,10 @@ startup_32: /* Compute the kernel start address. */ #ifdef CONFIG_RELOCATABLE - leal startup_32(%ebp), %ebp + addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp + andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp #else - movl $CONFIG_PHYSICAL_START, %ebp + movl $LOAD_PHYSICAL_ADDR, %ebp #endif /* @@ -150,8 +152,8 @@ relocated: * and where it was actually loaded. */ movl %ebp, %ebx - subl $CONFIG_PHYSICAL_START, %ebx - + subl $LOAD_PHYSICAL_ADDR, %ebx + jz 2f /* Nothing to be done if loaded at compiled addr. */ /* * Process relocations. */ diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index 4eac24e95a10..dc1538931555 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -14,6 +14,7 @@ #include #include #include +#include /* WARNING!! * This code is compiled with -fPIC and it is relocated dynamically @@ -360,12 +361,12 @@ asmlinkage void decompress_kernel(void *rmode, unsigned long end, insize = input_len; inptr = 0; - if (((u32)output - CONFIG_PHYSICAL_START) & 0x3fffff) - error("Destination address not 4M aligned"); + if ((u32)output & (CONFIG_PHYSICAL_ALIGN -1)) + error("Destination address not CONFIG_PHYSICAL_ALIGN aligned"); if (end > ((-__PAGE_OFFSET-(512 <<20)-1) & 0x7fffffff)) error("Destination address too large"); #ifndef CONFIG_RELOCATABLE - if ((u32)output != CONFIG_PHYSICAL_START) + if ((u32)output != LOAD_PHYSICAL_ADDR) error("Wrong destination address"); #endif diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index f8d61ec4c8cb..6860f20aa579 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -14,6 +14,7 @@ #include #include #include +#include OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") OUTPUT_ARCH(i386) @@ -27,7 +28,7 @@ PHDRS { } SECTIONS { - . = LOAD_OFFSET + CONFIG_PHYSICAL_START; + . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; phys_startup_32 = startup_32 - LOAD_OFFSET; /* read-only */ .text : AT(ADDR(.text) - LOAD_OFFSET) { diff --git a/include/asm-i386/boot.h b/include/asm-i386/boot.h index 96b228e6e79c..8ce79a6fa891 100644 --- a/include/asm-i386/boot.h +++ b/include/asm-i386/boot.h @@ -12,4 +12,8 @@ #define EXTENDED_VGA 0xfffe /* 80x50 mode */ #define ASK_VGA 0xfffd /* ask for it at bootup */ -#endif +/* Physical address where kenrel should be loaded. */ +#define LOAD_PHYSICAL_ADDR ((0x100000 + CONFIG_PHYSICAL_ALIGN - 1) \ + & ~(CONFIG_PHYSICAL_ALIGN - 1)) + +#endif /* _LINUX_BOOT_H */ -- cgit v1.2.3-59-g8ed1b From 770d132f03ac15b12919f1bac481f4beda13e094 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:05 +0100 Subject: [PATCH] i386: Retrieve CLFLUSH size from CPUID Also report it in /proc/cpuinfo similar to x86-64. 
Needed for followon patch Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/common.c | 3 +++ arch/i386/kernel/cpu/proc.c | 3 ++- include/asm-i386/processor.h | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/asm-i386') diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 6958ae5e2fa5..cda41aef79ad 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -309,6 +309,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c) #else c->apicid = (ebx >> 24) & 0xFF; #endif + if (c->x86_capability[0] & (1<<19)) + c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8; } else { /* Have CPUID level 0 only - unheard of */ c->x86 = 4; @@ -373,6 +375,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; + c->x86_clflush_size = 32; memset(&c->x86_capability, 0, sizeof c->x86_capability); if (!have_cpuid_p()) { diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c index 76aac088a323..6624d8583c42 100644 --- a/arch/i386/kernel/cpu/proc.c +++ b/arch/i386/kernel/cpu/proc.c @@ -152,9 +152,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, " [%d]", i); } - seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n", + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", c->loops_per_jiffy/(500000/HZ), (c->loops_per_jiffy/(5000/HZ)) % 100); + seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size); return 0; } diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index f73cf836e649..98fa73b71760 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -72,6 +72,7 @@ struct cpuinfo_x86 { #endif unsigned char x86_max_cores; /* cpuid returned max cores value */ unsigned char apicid; + unsigned short x86_clflush_size; #ifdef CONFIG_SMP unsigned char booted_cores; /* number of cores as seen by OS */ __u8 phys_proc_id; /* Physical processor id. */ -- cgit v1.2.3-59-g8ed1b From 3760dd6efa75c98e223643da2eb7040406433053 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:05 +0100 Subject: [PATCH] i386: Use CLFLUSH instead of WBINVD in change_page_attr CLFLUSH is a lot faster than WBINVD so try to use that. Signed-off-by: Andi Kleen --- arch/i386/mm/pageattr.c | 24 +++++++++++++++++------- include/asm-i386/cpufeature.h | 1 + 2 files changed, 18 insertions(+), 7 deletions(-) (limited to 'include/asm-i386') diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index 8564b6ae17e3..ad91528bdc14 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -67,11 +67,17 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, return base; } -static void flush_kernel_map(void *dummy) +static void flush_kernel_map(void *arg) { - /* Could use CLFLUSH here if the CPU supports it (Hammer,P4) */ - if (boot_cpu_data.x86_model >= 4) + unsigned long adr = (unsigned long)arg; + + if (adr && cpu_has_clflush) { + int i; + for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) + asm volatile("clflush (%0)" :: "r" (adr + i)); + } else if (boot_cpu_data.x86_model >= 4) wbinvd(); + /* Flush all to work around Errata in early athlons regarding * large page flushing. 
*/ @@ -173,9 +179,9 @@ __change_page_attr(struct page *page, pgprot_t prot) return 0; } -static inline void flush_map(void) +static inline void flush_map(void *adr) { - on_each_cpu(flush_kernel_map, NULL, 1, 1); + on_each_cpu(flush_kernel_map, adr, 1, 1); } /* @@ -217,9 +223,13 @@ void global_flush_tlb(void) spin_lock_irq(&cpa_lock); list_replace_init(&df_list, &l); spin_unlock_irq(&cpa_lock); - flush_map(); - list_for_each_entry_safe(pg, next, &l, lru) + if (!cpu_has_clflush) + flush_map(0); + list_for_each_entry_safe(pg, next, &l, lru) { + if (cpu_has_clflush) + flush_map(page_address(pg)); __free_page(pg); + } } #ifdef CONFIG_DEBUG_PAGEALLOC diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h index 231672558c1f..4c83e059228f 100644 --- a/include/asm-i386/cpufeature.h +++ b/include/asm-i386/cpufeature.h @@ -137,6 +137,7 @@ #define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN) #define cpu_has_ds boot_cpu_has(X86_FEATURE_DS) #define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS) +#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLSH) #endif /* __ASM_I386_CPUFEATURE_H */ -- cgit v1.2.3-59-g8ed1b From b2dff6a88cbed59d787a8ca7367c76ba385e1187 Mon Sep 17 00:00:00 2001 From: "bibo,mao" Date: Thu, 7 Dec 2006 02:14:06 +0100 Subject: [PATCH] i386: Move find_max_pfn function to e820.c Move more code from setup.c into e820.c Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/e820.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 55 ------------------------------------------------ include/asm-i386/e820.h | 1 + 3 files changed, 53 insertions(+), 55 deletions(-) (limited to 'include/asm-i386') diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index 0db95760b073..be4934f6f85b 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -539,3 +540,54 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map) return 0; } +/* + * Callback for efi_memory_walk. + */ +static int __init +efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) +{ + unsigned long *max_pfn = arg, pfn; + + if (start < end) { + pfn = PFN_UP(end -1); + if (pfn > *max_pfn) + *max_pfn = pfn; + } + return 0; +} + +static int __init +efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) +{ + memory_present(0, PFN_UP(start), PFN_DOWN(end)); + return 0; +} + +/* + * Find the highest page frame number we have available + */ +void __init find_max_pfn(void) +{ + int i; + + max_pfn = 0; + if (efi_enabled) { + efi_memmap_walk(efi_find_max_pfn, &max_pfn); + efi_memmap_walk(efi_memory_present_wrapper, NULL); + return; + } + + for (i = 0; i < e820.nr_map; i++) { + unsigned long start, end; + /* RAM? */ + if (e820.map[i].type != E820_RAM) + continue; + start = PFN_UP(e820.map[i].addr); + end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); + if (start >= end) + continue; + if (end > max_pfn) + max_pfn = end; + memory_present(0, start, end); + } +} diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index b7509aec0eb1..3d808054fdf7 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -63,9 +63,6 @@ #include #include -/* Forward Declaration. */ -void __init find_max_pfn(void); - /* This value is set up by the early boot code to point to the value immediately after the boot time page tables. It contains a *physical* address, and must not be in the .bss segment! 
*/ @@ -387,29 +384,6 @@ static int __init parse_reservetop(char *arg) } early_param("reservetop", parse_reservetop); -/* - * Callback for efi_memory_walk. - */ -static int __init -efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) -{ - unsigned long *max_pfn = arg, pfn; - - if (start < end) { - pfn = PFN_UP(end -1); - if (pfn > *max_pfn) - *max_pfn = pfn; - } - return 0; -} - -static int __init -efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) -{ - memory_present(0, PFN_UP(start), PFN_DOWN(end)); - return 0; -} - /* * This function checks if the entire range is mapped with type. * @@ -442,35 +416,6 @@ e820_all_mapped(unsigned long s, unsigned long e, unsigned type) return 0; } -/* - * Find the highest page frame number we have available - */ -void __init find_max_pfn(void) -{ - int i; - - max_pfn = 0; - if (efi_enabled) { - efi_memmap_walk(efi_find_max_pfn, &max_pfn); - efi_memmap_walk(efi_memory_present_wrapper, NULL); - return; - } - - for (i = 0; i < e820.nr_map; i++) { - unsigned long start, end; - /* RAM? */ - if (e820.map[i].type != E820_RAM) - continue; - start = PFN_UP(e820.map[i].addr); - end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); - if (start >= end) - continue; - if (end > max_pfn) - max_pfn = end; - memory_present(0, start, end); - } -} - /* * Determine low and high memory ranges: */ diff --git a/include/asm-i386/e820.h b/include/asm-i386/e820.h index f7514fb6e8e4..147569425152 100644 --- a/include/asm-i386/e820.h +++ b/include/asm-i386/e820.h @@ -38,6 +38,7 @@ extern struct e820map e820; extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type); +extern void find_max_pfn(void); #endif/*!__ASSEMBLY__*/ -- cgit v1.2.3-59-g8ed1b From b5b2405706005cc7765f6ecd00965d29e93f090a Mon Sep 17 00:00:00 2001 From: "bibo,mao" Date: Thu, 7 Dec 2006 02:14:06 +0100 Subject: [PATCH] i386: Move e820/efi memmap walking code to e820.c This patch moves e820/efi memmap table walking function from setup.c to e820.c, also this patch adds extern declaration in header file. 
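As a sketch of the interface shape being consolidated here (hypothetical names, simplified to 32-bit addresses; real e820 entries carry 64-bit values): efi_find_max_pfn, efi_memory_present_wrapper and free_available_memory all share one callback signature, so a single walker can hand each (start, end) range to any of them, with context threaded through the opaque arg pointer.

struct range {
	unsigned long start, size;
};

typedef int (*range_callback_t)(unsigned long start, unsigned long end,
				void *arg);

/* Hypothetical walker, not the kernel's efi_memmap_walk(); it applies
 * one callback of the shared signature to every range in a table. */
static int walk_ranges(const struct range *map, int nr,
		       range_callback_t cb, void *arg)
{
	int i, ret;

	for (i = 0; i < nr; i++) {
		ret = cb(map[i].start, map[i].start + map[i].size, arg);
		if (ret)
			return ret;	/* in this sketch, non-zero aborts the walk */
	}
	return 0;
}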
Signed-off-by: bibo,mao Signed-off-by: Andi Kleen arch/i386/kernel/e820.c | 115 +++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 118 ----------------------------------- include/asm-i386/e820.h | 2 arch/i386/kernel/e820.c | 115 +++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 118 ----------------------------------------------- include/asm-i386/e820.h | 2 3 files changed, 117 insertions(+), 118 deletions(-) --- arch/i386/kernel/e820.c | 115 +++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 118 ----------------------------------------------- include/asm-i386/e820.h | 2 + 3 files changed, 117 insertions(+), 118 deletions(-) (limited to 'include/asm-i386') diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index be4934f6f85b..47c495bf0cbc 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -28,6 +28,11 @@ static struct change_member change_point_list[2*E820MAX] __initdata; static struct change_member *change_point[2*E820MAX] __initdata; static struct e820entry *overlap_list[E820MAX] __initdata; static struct e820entry new_bios[E820MAX] __initdata; +/* For PCI or other memory-mapped resources */ +unsigned long pci_mem_start = 0x10000000; +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif struct resource data_resource = { .name = "Kernel data", .start = 0, @@ -591,3 +596,113 @@ void __init find_max_pfn(void) memory_present(0, start, end); } } + +/* + * Free all available memory for boot time allocation. Used + * as a callback function by efi_memory_walk() + */ + +static int __init +free_available_memory(unsigned long start, unsigned long end, void *arg) +{ + /* check max_low_pfn */ + if (start >= (max_low_pfn << PAGE_SHIFT)) + return 0; + if (end >= (max_low_pfn << PAGE_SHIFT)) + end = max_low_pfn << PAGE_SHIFT; + if (start < end) + free_bootmem(start, end - start); + + return 0; +} +/* + * Register fully available low RAM pages with the bootmem allocator. + */ +void __init register_bootmem_low_pages(unsigned long max_low_pfn) +{ + int i; + + if (efi_enabled) { + efi_memmap_walk(free_available_memory, NULL); + return; + } + for (i = 0; i < e820.nr_map; i++) { + unsigned long curr_pfn, last_pfn, size; + /* + * Reserve usable low memory + */ + if (e820.map[i].type != E820_RAM) + continue; + /* + * We are rounding up the start address of usable memory: + */ + curr_pfn = PFN_UP(e820.map[i].addr); + if (curr_pfn >= max_low_pfn) + continue; + /* + * ... and at the end of the usable range downwards: + */ + last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); + + if (last_pfn > max_low_pfn) + last_pfn = max_low_pfn; + + /* + * .. finally, did all the rounding and playing + * around just make the area go away? + */ + if (last_pfn <= curr_pfn) + continue; + + size = last_pfn - curr_pfn; + free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size)); + } +} + +void __init register_memory(void) +{ + unsigned long gapstart, gapsize, round; + unsigned long long last; + int i; + + /* + * Search for the bigest gap in the low 32 bits of the e820 + * memory space. 
+ */ + last = 0x100000000ull; + gapstart = 0x10000000; + gapsize = 0x400000; + i = e820.nr_map; + while (--i >= 0) { + unsigned long long start = e820.map[i].addr; + unsigned long long end = start + e820.map[i].size; + + /* + * Since "last" is at most 4GB, we know we'll + * fit in 32 bits if this condition is true + */ + if (last > end) { + unsigned long gap = last - end; + + if (gap > gapsize) { + gapsize = gap; + gapstart = end; + } + } + if (start < last) + last = start; + } + + /* + * See how much we want to round up: start off with + * rounding to the next 1MB area. + */ + round = 0x100000; + while ((gapsize >> 4) > round) + round += round; + /* Fun with two's complement */ + pci_mem_start = (gapstart + round) & -round; + + printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", + pci_mem_start, gapstart, gapsize); +} diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 3d808054fdf7..51ed015a1f35 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -94,12 +94,6 @@ unsigned int machine_submodel_id; unsigned int BIOS_revision; unsigned int mca_pentium_flag; -/* For PCI or other memory-mapped resources */ -unsigned long pci_mem_start = 0x10000000; -#ifdef CONFIG_PCI -EXPORT_SYMBOL(pci_mem_start); -#endif - /* Boot loader ID as an integer, for the benefit of proc_dointvec */ int bootloader_type; @@ -475,68 +469,6 @@ unsigned long __init find_max_low_pfn(void) return max_low_pfn; } -/* - * Free all available memory for boot time allocation. Used - * as a callback function by efi_memory_walk() - */ - -static int __init -free_available_memory(unsigned long start, unsigned long end, void *arg) -{ - /* check max_low_pfn */ - if (start >= (max_low_pfn << PAGE_SHIFT)) - return 0; - if (end >= (max_low_pfn << PAGE_SHIFT)) - end = max_low_pfn << PAGE_SHIFT; - if (start < end) - free_bootmem(start, end - start); - - return 0; -} -/* - * Register fully available low RAM pages with the bootmem allocator. - */ -static void __init register_bootmem_low_pages(unsigned long max_low_pfn) -{ - int i; - - if (efi_enabled) { - efi_memmap_walk(free_available_memory, NULL); - return; - } - for (i = 0; i < e820.nr_map; i++) { - unsigned long curr_pfn, last_pfn, size; - /* - * Reserve usable low memory - */ - if (e820.map[i].type != E820_RAM) - continue; - /* - * We are rounding up the start address of usable memory: - */ - curr_pfn = PFN_UP(e820.map[i].addr); - if (curr_pfn >= max_low_pfn) - continue; - /* - * ... and at the end of the usable range downwards: - */ - last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); - - if (last_pfn > max_low_pfn) - last_pfn = max_low_pfn; - - /* - * .. finally, did all the rounding and playing - * around just make the area go away? - */ - if (last_pfn <= curr_pfn) - continue; - - size = last_pfn - curr_pfn; - free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size)); - } -} - /* * workaround for Dell systems that neglect to reserve EBDA */ @@ -705,56 +637,6 @@ void __init remapped_pgdat_init(void) } } - - -static void __init register_memory(void) -{ - unsigned long gapstart, gapsize, round; - unsigned long long last; - int i; - - /* - * Search for the bigest gap in the low 32 bits of the e820 - * memory space. 
- */
-	last = 0x100000000ull;
-	gapstart = 0x10000000;
-	gapsize = 0x400000;
-	i = e820.nr_map;
-	while (--i >= 0) {
-		unsigned long long start = e820.map[i].addr;
-		unsigned long long end = start + e820.map[i].size;
-
-		/*
-		 * Since "last" is at most 4GB, we know we'll
-		 * fit in 32 bits if this condition is true
-		 */
-		if (last > end) {
-			unsigned long gap = last - end;
-
-			if (gap > gapsize) {
-				gapsize = gap;
-				gapstart = end;
-			}
-		}
-		if (start < last)
-			last = start;
-	}
-
-	/*
-	 * See how much we want to round up: start off with
-	 * rounding to the next 1MB area.
-	 */
-	round = 0x100000;
-	while ((gapsize >> 4) > round)
-		round += round;
-	/* Fun with two's complement */
-	pci_mem_start = (gapstart + round) & -round;
-
-	printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
-		pci_mem_start, gapstart, gapsize);
-}
-
 #ifdef CONFIG_MCA
 static void set_mca_bus(int x)
 {
diff --git a/include/asm-i386/e820.h b/include/asm-i386/e820.h
index 147569425152..8da4175a5533 100644
--- a/include/asm-i386/e820.h
+++ b/include/asm-i386/e820.h
@@ -39,6 +39,8 @@ extern struct e820map e820;
 extern int e820_all_mapped(unsigned long start, unsigned long end,
 			   unsigned type);
 extern void find_max_pfn(void);
+extern void register_bootmem_low_pages(unsigned long max_low_pfn);
+extern void register_memory(void);

 #endif/*!__ASSEMBLY__*/
--
cgit v1.2.3-59-g8ed1b


From cef518e88b8ed94ea483c436ef5e5b151a3fabc6 Mon Sep 17 00:00:00 2001
From: "bibo,mao"
Date: Thu, 7 Dec 2006 02:14:06 +0100
Subject: [PATCH] i386: Move memory map printing and other code to e820.c

This patch moves the e820 memory map printing and the "memmap" boot
parameter parsing functions from setup.c to e820.c, and adds the
limit_regions and print_memory_map declarations to the header file.

Signed-off-by: bibo,mao
Signed-off-by: Andi Kleen

 arch/i386/kernel/e820.c  | 152 +++++++++++++++++++++++++++
 arch/i386/kernel/setup.c | 158 ---------------------------------
 include/asm-i386/e820.h  |   2
 arch/i386/kernel/e820.c  | 152 ++++++++++++++++++++++++++++++++++++++++
 arch/i386/kernel/setup.c | 153 -----------------------------------------------
 include/asm-i386/e820.h  |   2
 3 files changed, 155 insertions(+), 152 deletions(-)
---
 arch/i386/kernel/e820.c  | 152 ++++++++++++++++++++++++++++++++++++++++
 arch/i386/kernel/setup.c | 153 +----------------------------------------------
 include/asm-i386/e820.h  |   2 +
 3 files changed, 155 insertions(+), 152 deletions(-)
(limited to 'include/asm-i386')

diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
index 47c495bf0cbc..b755255f2721 100644
--- a/arch/i386/kernel/e820.c
+++ b/arch/i386/kernel/e820.c
@@ -33,6 +33,7 @@ unsigned long pci_mem_start = 0x10000000;
 #ifdef CONFIG_PCI
 EXPORT_SYMBOL(pci_mem_start);
 #endif
+extern int user_defined_memmap;
 struct resource data_resource = {
 	.name	= "Kernel data",
 	.start	= 0,
@@ -706,3 +707,154 @@ void __init register_memory(void)
 	printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
 		pci_mem_start, gapstart, gapsize);
 }
+
+void __init print_memory_map(char *who)
+{
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		printk(" %s: %016Lx - %016Lx ", who,
+			e820.map[i].addr,
+			e820.map[i].addr + e820.map[i].size);
+		switch (e820.map[i].type) {
+		case E820_RAM:	printk("(usable)\n");
+				break;
+		case E820_RESERVED:
+				printk("(reserved)\n");
+				break;
+		case E820_ACPI:
+				printk("(ACPI data)\n");
+				break;
+		case E820_NVS:
+				printk("(ACPI NVS)\n");
+				break;
+		default:	printk("type %lu\n", e820.map[i].type);
+				break;
+		}
+	}
+}
+
+void __init
limit_regions(unsigned long long size) +{ + unsigned long long current_addr = 0; + int i; + + print_memory_map("limit_regions start"); + if (efi_enabled) { + efi_memory_desc_t *md; + void *p; + + for (p = memmap.map, i = 0; p < memmap.map_end; + p += memmap.desc_size, i++) { + md = p; + current_addr = md->phys_addr + (md->num_pages << 12); + if (md->type == EFI_CONVENTIONAL_MEMORY) { + if (current_addr >= size) { + md->num_pages -= + (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT); + memmap.nr_map = i + 1; + return; + } + } + } + } + for (i = 0; i < e820.nr_map; i++) { + current_addr = e820.map[i].addr + e820.map[i].size; + if (current_addr < size) + continue; + + if (e820.map[i].type != E820_RAM) + continue; + + if (e820.map[i].addr >= size) { + /* + * This region starts past the end of the + * requested size, skip it completely. + */ + e820.nr_map = i; + } else { + e820.nr_map = i + 1; + e820.map[i].size -= current_addr - size; + } + print_memory_map("limit_regions endfor"); + return; + } + print_memory_map("limit_regions endfunc"); +} + + /* + * This function checks if the entire range is mapped with type. + * + * Note: this function only works correct if the e820 table is sorted and + * not-overlapping, which is the case + */ +int __init +e820_all_mapped(unsigned long s, unsigned long e, unsigned type) +{ + u64 start = s; + u64 end = e; + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region ?*/ + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + /* if the region is at the beginning of we move + * start to the end of the region since it's ok until there + */ + if (ei->addr <= start) + start = ei->addr + ei->size; + /* if start is now at or beyond end, we're done, full + * coverage */ + if (start >= end) + return 1; /* we're done */ + } + return 0; +} + +static int __init parse_memmap(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp(arg, "exactmap") == 0) { +#ifdef CONFIG_CRASH_DUMP + /* If we are doing a crash dump, we + * still need to know the real mem + * size before original memory map is + * reset. + */ + find_max_pfn(); + saved_max_pfn = max_pfn; +#endif + e820.nr_map = 0; + user_defined_memmap = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. + */ + unsigned long long start_at, mem_size; + + mem_size = memparse(arg, &arg); + if (*arg == '@') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_RAM); + } else if (*arg == '#') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_ACPI); + } else if (*arg == '$') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_RESERVED); + } else { + limit_regions(mem_size); + user_defined_memmap = 1; + } + } + return 0; +} +early_param("memmap", parse_memmap); diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 51ed015a1f35..e5bb87aa5a45 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -73,7 +73,6 @@ int disable_pse __devinitdata = 0; /* * Machine setup.. 
*/ -extern struct e820map e820; extern struct resource code_resource; extern struct resource data_resource; @@ -137,79 +136,6 @@ static char command_line[COMMAND_LINE_SIZE]; unsigned char __initdata boot_params[PARAM_SIZE]; -static void __init limit_regions(unsigned long long size) -{ - unsigned long long current_addr = 0; - int i; - - if (efi_enabled) { - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map, i = 0; p < memmap.map_end; - p += memmap.desc_size, i++) { - md = p; - current_addr = md->phys_addr + (md->num_pages << 12); - if (md->type == EFI_CONVENTIONAL_MEMORY) { - if (current_addr >= size) { - md->num_pages -= - (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT); - memmap.nr_map = i + 1; - return; - } - } - } - } - for (i = 0; i < e820.nr_map; i++) { - current_addr = e820.map[i].addr + e820.map[i].size; - if (current_addr < size) - continue; - - if (e820.map[i].type != E820_RAM) - continue; - - if (e820.map[i].addr >= size) { - /* - * This region starts past the end of the - * requested size, skip it completely. - */ - e820.nr_map = i; - } else { - e820.nr_map = i + 1; - e820.map[i].size -= current_addr - size; - } - return; - } -} - -#define E820_DEBUG 1 - -static void __init print_memory_map(char *who) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - printk(" %s: %016Lx - %016Lx ", who, - e820.map[i].addr, - e820.map[i].addr + e820.map[i].size); - switch (e820.map[i].type) { - case E820_RAM: printk("(usable)\n"); - break; - case E820_RESERVED: - printk("(reserved)\n"); - break; - case E820_ACPI: - printk("(ACPI data)\n"); - break; - case E820_NVS: - printk("(ACPI NVS)\n"); - break; - default: printk("type %lu\n", e820.map[i].type); - break; - } - } -} - #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) struct edd edd; #ifdef CONFIG_EDD_MODULE @@ -233,7 +159,7 @@ static inline void copy_edd(void) } #endif -static int __initdata user_defined_memmap = 0; +int __initdata user_defined_memmap = 0; /* * "mem=nopentium" disables the 4MB page tables. @@ -270,51 +196,6 @@ static int __init parse_mem(char *arg) } early_param("mem", parse_mem); -static int __init parse_memmap(char *arg) -{ - if (!arg) - return -EINVAL; - - if (strcmp(arg, "exactmap") == 0) { -#ifdef CONFIG_CRASH_DUMP - /* If we are doing a crash dump, we - * still need to know the real mem - * size before original memory map is - * reset. - */ - find_max_pfn(); - saved_max_pfn = max_pfn; -#endif - e820.nr_map = 0; - user_defined_memmap = 1; - } else { - /* If the user specifies memory size, we - * limit the BIOS-provided memory map to - * that size. exactmap can be used to specify - * the exact map. mem=number can be used to - * trim the existing memory map. - */ - unsigned long long start_at, mem_size; - - mem_size = memparse(arg, &arg); - if (*arg == '@') { - start_at = memparse(arg+1, &arg); - add_memory_region(start_at, mem_size, E820_RAM); - } else if (*arg == '#') { - start_at = memparse(arg+1, &arg); - add_memory_region(start_at, mem_size, E820_ACPI); - } else if (*arg == '$') { - start_at = memparse(arg+1, &arg); - add_memory_region(start_at, mem_size, E820_RESERVED); - } else { - limit_regions(mem_size); - user_defined_memmap = 1; - } - } - return 0; -} -early_param("memmap", parse_memmap); - #ifdef CONFIG_PROC_VMCORE /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. @@ -378,38 +259,6 @@ static int __init parse_reservetop(char *arg) } early_param("reservetop", parse_reservetop); - /* - * This function checks if the entire range is mapped with type. 
- * - * Note: this function only works correct if the e820 table is sorted and - * not-overlapping, which is the case - */ -int __init -e820_all_mapped(unsigned long s, unsigned long e, unsigned type) -{ - u64 start = s; - u64 end = e; - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - if (type && ei->type != type) - continue; - /* is the region (part) in overlap with the current region ?*/ - if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - /* if the region is at the beginning of we move - * start to the end of the region since it's ok until there - */ - if (ei->addr <= start) - start = ei->addr + ei->size; - /* if start is now at or beyond end, we're done, full - * coverage */ - if (start >= end) - return 1; /* we're done */ - } - return 0; -} - /* * Determine low and high memory ranges: */ diff --git a/include/asm-i386/e820.h b/include/asm-i386/e820.h index 8da4175a5533..395077aba583 100644 --- a/include/asm-i386/e820.h +++ b/include/asm-i386/e820.h @@ -41,6 +41,8 @@ extern int e820_all_mapped(unsigned long start, unsigned long end, extern void find_max_pfn(void); extern void register_bootmem_low_pages(unsigned long max_low_pfn); extern void register_memory(void); +extern void limit_regions(unsigned long long size); +extern void print_memory_map(char *who); #endif/*!__ASSEMBLY__*/ -- cgit v1.2.3-59-g8ed1b From e6536c1262c56d302e749ab1b44fdb0b9786327d Mon Sep 17 00:00:00 2001 From: Paolo 'Blaisorblade' Giarrusso Date: Thu, 7 Dec 2006 02:14:07 +0100 Subject: [PATCH] x86: comment magic constants in delay.h For both i386 and x86_64, copy from arch/$ARCH/lib/delay.c comments about the used magic constants, plus a few other niceties. Signed-off-by: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andi Kleen include/asm-i386/delay.h | 5 ++++- include/asm-x86_64/delay.h | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) --- include/asm-i386/delay.h | 5 ++++- include/asm-x86_64/delay.h | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/delay.h b/include/asm-i386/delay.h index b1c7650dc7b9..9ae5e3782ed8 100644 --- a/include/asm-i386/delay.h +++ b/include/asm-i386/delay.h @@ -7,6 +7,7 @@ * Delay routines calling functions in arch/i386/lib/delay.c */ +/* Undefined functions to get compile-time errors */ extern void __bad_udelay(void); extern void __bad_ndelay(void); @@ -15,10 +16,12 @@ extern void __ndelay(unsigned long nsecs); extern void __const_udelay(unsigned long usecs); extern void __delay(unsigned long loops); +/* 0x10c7 is 2**32 / 1000000 (rounded up) */ #define udelay(n) (__builtin_constant_p(n) ? \ ((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \ __udelay(n)) - + +/* 0x5 is 2**32 / 1000000000 (rounded up) */ #define ndelay(n) (__builtin_constant_p(n) ? \ ((n) > 20000 ? 
__bad_ndelay() : __const_udelay((n) * 5ul)) : \
	__ndelay(n))

diff --git a/include/asm-x86_64/delay.h b/include/asm-x86_64/delay.h
index 40146f611ccb..c2669f1f5529 100644
--- a/include/asm-x86_64/delay.h
+++ b/include/asm-x86_64/delay.h
@@ -7,18 +7,21 @@
  * Delay routines calling functions in arch/x86_64/lib/delay.c
  */

+/* Undefined functions to get compile-time errors */
 extern void __bad_udelay(void);
 extern void __bad_ndelay(void);

 extern void __udelay(unsigned long usecs);
-extern void __ndelay(unsigned long usecs);
+extern void __ndelay(unsigned long nsecs);
 extern void __const_udelay(unsigned long usecs);
 extern void __delay(unsigned long loops);

+/* 0x10c7 is 2**32 / 1000000 (rounded up) */
 #define udelay(n) (__builtin_constant_p(n) ? \
 	((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
 	__udelay(n))

+/* 0x5 is 2**32 / 1000000000 (rounded up) */
 #define ndelay(n) (__builtin_constant_p(n) ? \
 	((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
 	__ndelay(n))
--
cgit v1.2.3-59-g8ed1b


From d3561b7fa0fb0fc583bab0eeda32bec9e4c4056d Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Thu, 7 Dec 2006 02:14:07 +0100
Subject: [PATCH] paravirt: header and stubs for paravirtualisation

Create a paravirt.h header for all the critical operations which need
to be replaced with hypervisor calls, and include that instead of
defining native operations, when CONFIG_PARAVIRT.

This patch does the dumbest possible replacement of paravirtualized
instructions: calls through a "paravirt_ops" structure.  Currently
these are function implementations of native hardware: hypervisors
will override the ops structure with their own variants.

All the pv-ops functions are declared "fastcall" so that a specific
register-based ABI is used, to make inlining assembler easier.

And:

From: Andy Whitcroft

The paravirt ops introduce a 'weak' attribute onto memory_setup().
Code ordering leads to the following warnings on x86:

  arch/i386/kernel/setup.c:651: warning: weak declaration of
  `memory_setup' after first use results in unspecified behavior

Move memory_setup() to avoid this.
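The scheme rests on a single indirection: a global ops structure whose
members default to the native implementations, and which a hypervisor
backend may overwrite before the operations are used. A minimal
user-space sketch of that dispatch follows; the toy pv_ops structure,
the printed strings and the demo hypervisor functions are invented
here, the real structure being the much larger paravirt_ops below:

#include <stdio.h>

struct pv_ops {
	const char *name;
	void (*irq_disable)(void);
	void (*irq_enable)(void);
};

/* "native" implementations: what bare hardware would execute */
static void native_irq_disable(void) { puts("cli"); }
static void native_irq_enable(void)  { puts("sti"); }

static struct pv_ops pv_ops = {
	.name        = "bare hardware",
	.irq_disable = native_irq_disable,
	.irq_enable  = native_irq_enable,
};

/* every call site goes through the structure, never to the insn directly */
#define raw_local_irq_disable()	pv_ops.irq_disable()
#define raw_local_irq_enable()	pv_ops.irq_enable()

/* a hypervisor backend replaces only the entries it cares about */
static void demo_hv_irq_disable(void) { puts("hypercall: mask vIRQs"); }
static void demo_hv_irq_enable(void)  { puts("hypercall: unmask vIRQs"); }

int main(void)
{
	raw_local_irq_disable();	/* dispatches to native_irq_disable */

	pv_ops.name = "demo hypervisor";
	pv_ops.irq_disable = demo_hv_irq_disable;
	pv_ops.irq_enable  = demo_hv_irq_enable;

	raw_local_irq_enable();		/* now dispatches to the hypercall stub */
	return 0;
}

On bare hardware the indirect calls cost a few cycles per operation,
which is why a later patch in this series adds infrastructure to patch
the hottest call sites inline.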
Signed-off-by: Rusty Russell Signed-off-by: Chris Wright Signed-off-by: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Andy Whitcroft --- arch/i386/Kconfig | 11 + arch/i386/boot/compressed/misc.c | 1 + arch/i386/kernel/Makefile | 1 + arch/i386/kernel/asm-offsets.c | 10 + arch/i386/kernel/entry.S | 34 ++- arch/i386/kernel/i8259.c | 5 +- arch/i386/kernel/paravirt.c | 404 +++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 8 +- arch/i386/kernel/smpboot.c | 5 + arch/i386/kernel/time.c | 15 +- arch/i386/power/cpu.c | 8 +- drivers/net/de600.c | 1 - include/asm-i386/delay.h | 8 + include/asm-i386/desc.h | 9 +- include/asm-i386/io.h | 8 +- include/asm-i386/irq.h | 3 + include/asm-i386/irqflags.h | 42 +-- include/asm-i386/mach-default/setup_arch.h | 2 + include/asm-i386/msr.h | 5 + include/asm-i386/paravirt.h | 281 ++++++++++++++++++++ include/asm-i386/processor.h | 15 +- include/asm-i386/segment.h | 2 + include/asm-i386/setup.h | 1 + include/asm-i386/spinlock.h | 4 + include/asm-i386/suspend.h | 8 +- include/asm-i386/system.h | 16 +- include/asm-i386/time.h | 41 +++ 27 files changed, 890 insertions(+), 58 deletions(-) create mode 100644 arch/i386/kernel/paravirt.c create mode 100644 include/asm-i386/paravirt.h create mode 100644 include/asm-i386/time.h (limited to 'include/asm-i386') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 1f0f7b60995e..bb1fa061c6cf 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -182,6 +182,17 @@ config X86_ES7000 endchoice +config PARAVIRT + bool "Paravirtualization support (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + Paravirtualization is a way of running multiple instances of + Linux on the same machine, under a hypervisor. This option + changes the kernel so it can modify itself when it is run + under a hypervisor, improving performance significantly. + However, when run without a hypervisor the kernel is + theoretically slower. If in doubt, say N. + config ACPI_SRAT bool default y diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index dc1538931555..c6798c75c67d 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -9,6 +9,7 @@ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 
1996 */ +#undef CONFIG_PARAVIRT #include #include #include diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index f614854bd71f..406612136049 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_VM86) += vm86.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o +obj-$(CONFIG_PARAVIRT) += paravirt.o EXTRA_AFLAGS := -traditional diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 0666eb0ed7bc..1b2f3cd33270 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -101,4 +101,14 @@ void foo(void) BLANK(); OFFSET(PDA_cpu, i386_pda, cpu_number); OFFSET(PDA_pcurrent, i386_pda, pcurrent); + +#ifdef CONFIG_PARAVIRT + BLANK(); + OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled); + OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable); + OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable); + OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit); + OFFSET(PARAVIRT_iret, paravirt_ops, iret); + OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); +#endif } diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 0220bc8cbb43..d274612e05cd 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -62,13 +62,6 @@ DF_MASK = 0x00000400 NT_MASK = 0x00004000 VM_MASK = 0x00020000 -/* These are replaces for paravirtualization */ -#define DISABLE_INTERRUPTS cli -#define ENABLE_INTERRUPTS sti -#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit -#define INTERRUPT_RETURN iret -#define GET_CR0_INTO_EAX movl %cr0, %eax - #ifdef CONFIG_PREEMPT #define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF #else @@ -416,6 +409,20 @@ ldt_ss: jnz restore_nocheck testl $0x00400000, %eax # returning to 32bit stack? jnz restore_nocheck # allright, normal return + +#ifdef CONFIG_PARAVIRT + /* + * The kernel can't run on a non-flat stack if paravirt mode + * is active. Rather than try to fixup the high bits of + * ESP, bypass this code entirely. This may break DOSemu + * and/or Wine support in a paravirt VM, although the option + * is still available to implement the setting of the high + * 16-bits in the INTERRUPT_RETURN paravirt-op. + */ + cmpl $0, paravirt_ops+PARAVIRT_enabled + jne restore_nocheck +#endif + /* If returning to userspace with 16bit stack, * try to fix the higher word of ESP, as the CPU * won't restore it. 
@@ -833,6 +840,19 @@ nmi_espfix_stack: .previous KPROBE_END(nmi) +#ifdef CONFIG_PARAVIRT +ENTRY(native_iret) +1: iret +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous + +ENTRY(native_irq_enable_sysexit) + sti + sysexit +#endif + KPROBE_ENTRY(int3) RING0_INT_FRAME pushl $-1 # mark this as an int diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c index 62996cd17084..c8d45821c788 100644 --- a/arch/i386/kernel/i8259.c +++ b/arch/i386/kernel/i8259.c @@ -381,7 +381,10 @@ void __init init_ISA_irqs (void) } } -void __init init_IRQ(void) +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) { int i; diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c new file mode 100644 index 000000000000..478192cd4b90 --- /dev/null +++ b/arch/i386/kernel/paravirt.c @@ -0,0 +1,404 @@ +/* Paravirtualization interfaces + Copyright (C) 2006 Rusty Russell IBM Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* nop stub */ +static void native_nop(void) +{ +} + +static void __init default_banner(void) +{ + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", + paravirt_ops.name); +} + +char *memory_setup(void) +{ + return paravirt_ops.memory_setup(); +} + +static fastcall unsigned long native_get_debugreg(int regno) +{ + unsigned long val = 0; /* Damn you, gcc! 
*/ + + switch (regno) { + case 0: + asm("movl %%db0, %0" :"=r" (val)); break; + case 1: + asm("movl %%db1, %0" :"=r" (val)); break; + case 2: + asm("movl %%db2, %0" :"=r" (val)); break; + case 3: + asm("movl %%db3, %0" :"=r" (val)); break; + case 6: + asm("movl %%db6, %0" :"=r" (val)); break; + case 7: + asm("movl %%db7, %0" :"=r" (val)); break; + default: + BUG(); + } + return val; +} + +static fastcall void native_set_debugreg(int regno, unsigned long value) +{ + switch (regno) { + case 0: + asm("movl %0,%%db0" : /* no output */ :"r" (value)); + break; + case 1: + asm("movl %0,%%db1" : /* no output */ :"r" (value)); + break; + case 2: + asm("movl %0,%%db2" : /* no output */ :"r" (value)); + break; + case 3: + asm("movl %0,%%db3" : /* no output */ :"r" (value)); + break; + case 6: + asm("movl %0,%%db6" : /* no output */ :"r" (value)); + break; + case 7: + asm("movl %0,%%db7" : /* no output */ :"r" (value)); + break; + default: + BUG(); + } +} + +void init_IRQ(void) +{ + paravirt_ops.init_IRQ(); +} + +static fastcall void native_clts(void) +{ + asm volatile ("clts"); +} + +static fastcall unsigned long native_read_cr0(void) +{ + unsigned long val; + asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); + return val; +} + +static fastcall void native_write_cr0(unsigned long val) +{ + asm volatile("movl %0,%%cr0": :"r" (val)); +} + +static fastcall unsigned long native_read_cr2(void) +{ + unsigned long val; + asm volatile("movl %%cr2,%0\n\t" :"=r" (val)); + return val; +} + +static fastcall void native_write_cr2(unsigned long val) +{ + asm volatile("movl %0,%%cr2": :"r" (val)); +} + +static fastcall unsigned long native_read_cr3(void) +{ + unsigned long val; + asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); + return val; +} + +static fastcall void native_write_cr3(unsigned long val) +{ + asm volatile("movl %0,%%cr3": :"r" (val)); +} + +static fastcall unsigned long native_read_cr4(void) +{ + unsigned long val; + asm volatile("movl %%cr4,%0\n\t" :"=r" (val)); + return val; +} + +static fastcall unsigned long native_read_cr4_safe(void) +{ + unsigned long val; + /* This could fault if %cr4 does not exist */ + asm("1: movl %%cr4, %0 \n" + "2: \n" + ".section __ex_table,\"a\" \n" + ".long 1b,2b \n" + ".previous \n" + : "=r" (val): "0" (0)); + return val; +} + +static fastcall void native_write_cr4(unsigned long val) +{ + asm volatile("movl %0,%%cr4": :"r" (val)); +} + +static fastcall unsigned long native_save_fl(void) +{ + unsigned long f; + asm volatile("pushfl ; popl %0":"=g" (f): /* no input */); + return f; +} + +static fastcall void native_restore_fl(unsigned long f) +{ + asm volatile("pushl %0 ; popfl": /* no output */ + :"g" (f) + :"memory", "cc"); +} + +static fastcall void native_irq_disable(void) +{ + asm volatile("cli": : :"memory"); +} + +static fastcall void native_irq_enable(void) +{ + asm volatile("sti": : :"memory"); +} + +static fastcall void native_safe_halt(void) +{ + asm volatile("sti; hlt": : :"memory"); +} + +static fastcall void native_halt(void) +{ + asm volatile("hlt": : :"memory"); +} + +static fastcall void native_wbinvd(void) +{ + asm volatile("wbinvd": : :"memory"); +} + +static fastcall unsigned long long native_read_msr(unsigned int msr, int *err) +{ + unsigned long long val; + + asm volatile("2: rdmsr ; xorl %0,%0\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: movl %3,%0 ; jmp 1b\n\t" + ".previous\n\t" + ".section __ex_table,\"a\"\n" + " .align 4\n\t" + " .long 2b,3b\n\t" + ".previous" + : "=r" (*err), "=A" (val) + : "c" (msr), "i" (-EFAULT)); + + return val; +} + 
+static fastcall int native_write_msr(unsigned int msr, unsigned long long val) +{ + int err; + asm volatile("2: wrmsr ; xorl %0,%0\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: movl %4,%0 ; jmp 1b\n\t" + ".previous\n\t" + ".section __ex_table,\"a\"\n" + " .align 4\n\t" + " .long 2b,3b\n\t" + ".previous" + : "=a" (err) + : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)), + "i" (-EFAULT)); + return err; +} + +static fastcall unsigned long long native_read_tsc(void) +{ + unsigned long long val; + asm volatile("rdtsc" : "=A" (val)); + return val; +} + +static fastcall unsigned long long native_read_pmc(void) +{ + unsigned long long val; + asm volatile("rdpmc" : "=A" (val)); + return val; +} + +static fastcall void native_load_tr_desc(void) +{ + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); +} + +static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr) +{ + asm volatile("lgdt %0"::"m" (*dtr)); +} + +static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr) +{ + asm volatile("lidt %0"::"m" (*dtr)); +} + +static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr) +{ + asm ("sgdt %0":"=m" (*dtr)); +} + +static fastcall void native_store_idt(struct Xgt_desc_struct *dtr) +{ + asm ("sidt %0":"=m" (*dtr)); +} + +static fastcall unsigned long native_store_tr(void) +{ + unsigned long tr; + asm ("str %0":"=r" (tr)); + return tr; +} + +static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu) +{ +#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] + C(0); C(1); C(2); +#undef C +} + +static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high) +{ + u32 *lp = (u32 *)((char *)dt + entry*8); + lp[0] = entry_low; + lp[1] = entry_high; +} + +static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) +{ + native_write_dt_entry(dt, entrynum, low, high); +} + +static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) +{ + native_write_dt_entry(dt, entrynum, low, high); +} + +static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) +{ + native_write_dt_entry(dt, entrynum, low, high); +} + +static fastcall void native_load_esp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + tss->esp0 = thread->esp0; + + /* This can only happen when SEP is enabled, no need to test "SEP"arately */ + if (unlikely(tss->ss1 != thread->sysenter_cs)) { + tss->ss1 = thread->sysenter_cs; + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); + } +} + +static fastcall void native_io_delay(void) +{ + asm volatile("outb %al,$0x80"); +} + +/* These are in entry.S */ +extern fastcall void native_iret(void); +extern fastcall void native_irq_enable_sysexit(void); + +static int __init print_banner(void) +{ + paravirt_ops.banner(); + return 0; +} +core_initcall(print_banner); + +struct paravirt_ops paravirt_ops = { + .name = "bare hardware", + .paravirt_enabled = 0, + .kernel_rpl = 0, + + .banner = default_banner, + .arch_setup = native_nop, + .memory_setup = machine_specific_memory_setup, + .get_wallclock = native_get_wallclock, + .set_wallclock = native_set_wallclock, + .time_init = time_init_hook, + .init_IRQ = native_init_IRQ, + + .cpuid = native_cpuid, + .get_debugreg = native_get_debugreg, + .set_debugreg = native_set_debugreg, + .clts = native_clts, + .read_cr0 = native_read_cr0, + .write_cr0 = native_write_cr0, + .read_cr2 = native_read_cr2, + .write_cr2 = native_write_cr2, + .read_cr3 = native_read_cr3, + .write_cr3 = 
native_write_cr3, + .read_cr4 = native_read_cr4, + .read_cr4_safe = native_read_cr4_safe, + .write_cr4 = native_write_cr4, + .save_fl = native_save_fl, + .restore_fl = native_restore_fl, + .irq_disable = native_irq_disable, + .irq_enable = native_irq_enable, + .safe_halt = native_safe_halt, + .halt = native_halt, + .wbinvd = native_wbinvd, + .read_msr = native_read_msr, + .write_msr = native_write_msr, + .read_tsc = native_read_tsc, + .read_pmc = native_read_pmc, + .load_tr_desc = native_load_tr_desc, + .set_ldt = native_set_ldt, + .load_gdt = native_load_gdt, + .load_idt = native_load_idt, + .store_gdt = native_store_gdt, + .store_idt = native_store_idt, + .store_tr = native_store_tr, + .load_tls = native_load_tls, + .write_ldt_entry = native_write_ldt_entry, + .write_gdt_entry = native_write_gdt_entry, + .write_idt_entry = native_write_idt_entry, + .load_esp0 = native_load_esp0, + + .set_iopl_mask = native_set_iopl_mask, + .io_delay = native_io_delay, + .const_udelay = __const_udelay, + + .irq_enable_sysexit = native_irq_enable_sysexit, + .iret = native_iret, +}; +EXPORT_SYMBOL(paravirt_ops); diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index e5bb87aa5a45..695d53fd14de 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -495,6 +495,12 @@ static void set_mca_bus(int x) static void set_mca_bus(int x) { } #endif +/* Overridden in paravirt.c if CONFIG_PARAVIRT */ +char * __attribute__((weak)) memory_setup(void) +{ + return machine_specific_memory_setup(); +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -547,7 +553,7 @@ void __init setup_arch(char **cmdline_p) efi_init(); else { printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - print_memory_map(machine_specific_memory_setup()); + print_memory_map(memory_setup()); } copy_edd(); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 095636620fa2..cd7de9c9654b 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -33,6 +33,11 @@ * Dave Jones : Report invalid combinations of Athlon CPUs. * Rusty Russell : Hacked into shape for new "hotplug" boot process. */ + +/* SMP boot always wants to use real time delay to allow sufficient time for + * the APs to come online */ +#define USE_REAL_TIME_DELAY + #include #include #include diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 78af572fd17c..c505b16c0990 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "mach_time.h" @@ -116,10 +117,7 @@ static int set_rtc_mmss(unsigned long nowtime) /* gets recalled with irq locally disabled */ /* XXX - does irqsave resolve this? 
-johnstul */ spin_lock_irqsave(&rtc_lock, flags); - if (efi_enabled) - retval = efi_set_rtc_mmss(nowtime); - else - retval = mach_set_rtc_mmss(nowtime); + retval = set_wallclock(nowtime); spin_unlock_irqrestore(&rtc_lock, flags); return retval; @@ -223,10 +221,7 @@ unsigned long get_cmos_time(void) spin_lock_irqsave(&rtc_lock, flags); - if (efi_enabled) - retval = efi_get_time(); - else - retval = mach_get_cmos_time(); + retval = get_wallclock(); spin_unlock_irqrestore(&rtc_lock, flags); @@ -370,7 +365,7 @@ static void __init hpet_time_init(void) printk("Using HPET for base-timer\n"); } - time_init_hook(); + do_time_init(); } #endif @@ -392,5 +387,5 @@ void __init time_init(void) do_settimeofday(&ts); - time_init_hook(); + do_time_init(); } diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c index 5a1abeff033b..2c15500f8713 100644 --- a/arch/i386/power/cpu.c +++ b/arch/i386/power/cpu.c @@ -26,8 +26,8 @@ void __save_processor_state(struct saved_context *ctxt) /* * descriptor tables */ - store_gdt(&ctxt->gdt_limit); - store_idt(&ctxt->idt_limit); + store_gdt(&ctxt->gdt); + store_idt(&ctxt->idt); store_tr(ctxt->tr); /* @@ -99,8 +99,8 @@ void __restore_processor_state(struct saved_context *ctxt) * now restore the descriptor tables to their proper values * ltr is done i fix_processor_context(). */ - load_gdt(&ctxt->gdt_limit); - load_idt(&ctxt->idt_limit); + load_gdt(&ctxt->gdt); + load_idt(&ctxt->idt); /* * segment registers diff --git a/drivers/net/de600.c b/drivers/net/de600.c index 690bb40b353d..8396e411f1ce 100644 --- a/drivers/net/de600.c +++ b/drivers/net/de600.c @@ -43,7 +43,6 @@ static const char version[] = "de600.c: $Revision: 1.41-2.5 $, Bjorn Ekwall (bj * modify the following "#define": (see for more info) #define REALLY_SLOW_IO */ -#define SLOW_IO_BY_JUMPING /* Looks "better" than dummy write to port 0x80 :-) */ /* use 0 for production, 1 for verification, >2 for debug */ #ifdef DE600_DEBUG diff --git a/include/asm-i386/delay.h b/include/asm-i386/delay.h index 9ae5e3782ed8..32d6678d0bbf 100644 --- a/include/asm-i386/delay.h +++ b/include/asm-i386/delay.h @@ -16,6 +16,13 @@ extern void __ndelay(unsigned long nsecs); extern void __const_udelay(unsigned long usecs); extern void __delay(unsigned long loops); +#if defined(CONFIG_PARAVIRT) && !defined(USE_REAL_TIME_DELAY) +#define udelay(n) paravirt_ops.const_udelay((n) * 0x10c7ul) + +#define ndelay(n) paravirt_ops.const_udelay((n) * 5ul) + +#else /* !PARAVIRT || USE_REAL_TIME_DELAY */ + /* 0x10c7 is 2**32 / 1000000 (rounded up) */ #define udelay(n) (__builtin_constant_p(n) ? \ ((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \ @@ -25,6 +32,7 @@ extern void __delay(unsigned long loops); #define ndelay(n) (__builtin_constant_p(n) ? \ ((n) > 20000 ? 
__bad_ndelay() : __const_udelay((n) * 5ul)) : \ __ndelay(n)) +#endif void use_tsc_delay(void); diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index 6cf2ac2bfde7..f19820f0834f 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -55,6 +55,9 @@ static inline void pack_gate(__u32 *a, __u32 *b, #define DESCTYPE_DPL3 0x60 /* DPL-3 */ #define DESCTYPE_S 0x10 /* !system */ +#ifdef CONFIG_PARAVIRT +#include +#else #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8)) #define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr)) @@ -105,7 +108,11 @@ static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const vo write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b); } -static inline void set_ldt(void *addr, unsigned int entries) +#define set_ldt native_set_ldt +#endif /* CONFIG_PARAVIRT */ + +static inline fastcall void native_set_ldt(const void *addr, + unsigned int entries) { if (likely(entries == 0)) __asm__ __volatile__("lldt %w0"::"q" (0)); diff --git a/include/asm-i386/io.h b/include/asm-i386/io.h index 68df0dc3ab8f..86ff5e83be2f 100644 --- a/include/asm-i386/io.h +++ b/include/asm-i386/io.h @@ -256,11 +256,11 @@ static inline void flush_write_buffers(void) #endif /* __KERNEL__ */ -#ifdef SLOW_IO_BY_JUMPING -#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:" +#if defined(CONFIG_PARAVIRT) +#include #else + #define __SLOW_DOWN_IO "outb %%al,$0x80;" -#endif static inline void slow_down_io(void) { __asm__ __volatile__( @@ -271,6 +271,8 @@ static inline void slow_down_io(void) { : : ); } +#endif + #ifdef CONFIG_X86_NUMAQ extern void *xquad_portio; /* Where the IO area was mapped */ #define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h index 331726b41128..9e15ce0006eb 100644 --- a/include/asm-i386/irq.h +++ b/include/asm-i386/irq.h @@ -41,4 +41,7 @@ extern int irqbalance_disable(char *str); extern void fixup_irqs(cpumask_t map); #endif +void init_IRQ(void); +void __init native_init_IRQ(void); + #endif /* _ASM_IRQ_H */ diff --git a/include/asm-i386/irqflags.h b/include/asm-i386/irqflags.h index e1bdb97c07fa..9ce01f3fb7bc 100644 --- a/include/asm-i386/irqflags.h +++ b/include/asm-i386/irqflags.h @@ -10,6 +10,9 @@ #ifndef _ASM_IRQFLAGS_H #define _ASM_IRQFLAGS_H +#ifdef CONFIG_PARAVIRT +#include +#else #ifndef __ASSEMBLY__ static inline unsigned long __raw_local_save_flags(void) @@ -25,9 +28,6 @@ static inline unsigned long __raw_local_save_flags(void) return flags; } -#define raw_local_save_flags(flags) \ - do { (flags) = __raw_local_save_flags(); } while (0) - static inline void raw_local_irq_restore(unsigned long flags) { __asm__ __volatile__( @@ -66,18 +66,6 @@ static inline void halt(void) __asm__ __volatile__("hlt": : :"memory"); } -static inline int raw_irqs_disabled_flags(unsigned long flags) -{ - return !(flags & (1 << 9)); -} - -static inline int raw_irqs_disabled(void) -{ - unsigned long flags = __raw_local_save_flags(); - - return raw_irqs_disabled_flags(flags); -} - /* * For spinlocks, etc: */ @@ -90,9 +78,33 @@ static inline unsigned long __raw_local_irq_save(void) return flags; } +#else +#define DISABLE_INTERRUPTS cli +#define ENABLE_INTERRUPTS sti +#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit +#define INTERRUPT_RETURN iret +#define GET_CR0_INTO_EAX movl %cr0, %eax +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_PARAVIRT */ + +#ifndef __ASSEMBLY__ +#define raw_local_save_flags(flags) \ + do { (flags) = __raw_local_save_flags(); 
} while (0) + #define raw_local_irq_save(flags) \ do { (flags) = __raw_local_irq_save(); } while (0) +static inline int raw_irqs_disabled_flags(unsigned long flags) +{ + return !(flags & (1 << 9)); +} + +static inline int raw_irqs_disabled(void) +{ + unsigned long flags = __raw_local_save_flags(); + + return raw_irqs_disabled_flags(flags); +} #endif /* __ASSEMBLY__ */ /* diff --git a/include/asm-i386/mach-default/setup_arch.h b/include/asm-i386/mach-default/setup_arch.h index fb42099e7bd4..605e3ccb991b 100644 --- a/include/asm-i386/mach-default/setup_arch.h +++ b/include/asm-i386/mach-default/setup_arch.h @@ -2,4 +2,6 @@ /* no action for generic */ +#ifndef ARCH_SETUP #define ARCH_SETUP +#endif diff --git a/include/asm-i386/msr.h b/include/asm-i386/msr.h index 1820d9d73af3..5679d4993072 100644 --- a/include/asm-i386/msr.h +++ b/include/asm-i386/msr.h @@ -1,6 +1,10 @@ #ifndef __ASM_MSR_H #define __ASM_MSR_H +#ifdef CONFIG_PARAVIRT +#include +#else + /* * Access to machine-specific registers (available on 586 and better only) * Note: the rd* operations modify the parameters directly (without using @@ -77,6 +81,7 @@ static inline void wrmsrl (unsigned long msr, unsigned long long val) __asm__ __volatile__("rdpmc" \ : "=a" (low), "=d" (high) \ : "c" (counter)) +#endif /* !CONFIG_PARAVIRT */ /* symbolic names for some interesting MSRs */ /* Intel defined MSRs. */ diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h new file mode 100644 index 000000000000..a7551a44686f --- /dev/null +++ b/include/asm-i386/paravirt.h @@ -0,0 +1,281 @@ +#ifndef __ASM_PARAVIRT_H +#define __ASM_PARAVIRT_H +/* Various instructions on x86 need to be replaced for + * para-virtualization: those hooks are defined here. */ +#include + +#ifdef CONFIG_PARAVIRT +#ifndef __ASSEMBLY__ +struct thread_struct; +struct Xgt_desc_struct; +struct tss_struct; +struct paravirt_ops +{ + unsigned int kernel_rpl; + int paravirt_enabled; + const char *name; + + void (*arch_setup)(void); + char *(*memory_setup)(void); + void (*init_IRQ)(void); + + void (*banner)(void); + + unsigned long (*get_wallclock)(void); + int (*set_wallclock)(unsigned long); + void (*time_init)(void); + + /* All the function pointers here are declared as "fastcall" + so that we get a specific register-based calling + convention. This makes it easier to implement inline + assembler replacements. */ + + void (fastcall *cpuid)(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx); + + unsigned long (fastcall *get_debugreg)(int regno); + void (fastcall *set_debugreg)(int regno, unsigned long value); + + void (fastcall *clts)(void); + + unsigned long (fastcall *read_cr0)(void); + void (fastcall *write_cr0)(unsigned long); + + unsigned long (fastcall *read_cr2)(void); + void (fastcall *write_cr2)(unsigned long); + + unsigned long (fastcall *read_cr3)(void); + void (fastcall *write_cr3)(unsigned long); + + unsigned long (fastcall *read_cr4_safe)(void); + unsigned long (fastcall *read_cr4)(void); + void (fastcall *write_cr4)(unsigned long); + + unsigned long (fastcall *save_fl)(void); + void (fastcall *restore_fl)(unsigned long); + void (fastcall *irq_disable)(void); + void (fastcall *irq_enable)(void); + void (fastcall *safe_halt)(void); + void (fastcall *halt)(void); + void (fastcall *wbinvd)(void); + + /* err = 0/-EFAULT. wrmsr returns 0/-EFAULT. 
*/ + u64 (fastcall *read_msr)(unsigned int msr, int *err); + int (fastcall *write_msr)(unsigned int msr, u64 val); + + u64 (fastcall *read_tsc)(void); + u64 (fastcall *read_pmc)(void); + + void (fastcall *load_tr_desc)(void); + void (fastcall *load_gdt)(const struct Xgt_desc_struct *); + void (fastcall *load_idt)(const struct Xgt_desc_struct *); + void (fastcall *store_gdt)(struct Xgt_desc_struct *); + void (fastcall *store_idt)(struct Xgt_desc_struct *); + void (fastcall *set_ldt)(const void *desc, unsigned entries); + unsigned long (fastcall *store_tr)(void); + void (fastcall *load_tls)(struct thread_struct *t, unsigned int cpu); + void (fastcall *write_ldt_entry)(void *dt, int entrynum, + u32 low, u32 high); + void (fastcall *write_gdt_entry)(void *dt, int entrynum, + u32 low, u32 high); + void (fastcall *write_idt_entry)(void *dt, int entrynum, + u32 low, u32 high); + void (fastcall *load_esp0)(struct tss_struct *tss, + struct thread_struct *thread); + + void (fastcall *set_iopl_mask)(unsigned mask); + + void (fastcall *io_delay)(void); + void (*const_udelay)(unsigned long loops); + + /* These two are jmp to, not actually called. */ + void (fastcall *irq_enable_sysexit)(void); + void (fastcall *iret)(void); +}; + +extern struct paravirt_ops paravirt_ops; + +#define paravirt_enabled() (paravirt_ops.paravirt_enabled) + +static inline void load_esp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + paravirt_ops.load_esp0(tss, thread); +} + +#define ARCH_SETUP paravirt_ops.arch_setup(); +static inline unsigned long get_wallclock(void) +{ + return paravirt_ops.get_wallclock(); +} + +static inline int set_wallclock(unsigned long nowtime) +{ + return paravirt_ops.set_wallclock(nowtime); +} + +static inline void do_time_init(void) +{ + return paravirt_ops.time_init(); +} + +/* The paravirtualized CPUID instruction. 
*/ +static inline void __cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + paravirt_ops.cpuid(eax, ebx, ecx, edx); +} + +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, reg) var = paravirt_ops.get_debugreg(reg) +#define set_debugreg(val, reg) paravirt_ops.set_debugreg(reg, val) + +#define clts() paravirt_ops.clts() + +#define read_cr0() paravirt_ops.read_cr0() +#define write_cr0(x) paravirt_ops.write_cr0(x) + +#define read_cr2() paravirt_ops.read_cr2() +#define write_cr2(x) paravirt_ops.write_cr2(x) + +#define read_cr3() paravirt_ops.read_cr3() +#define write_cr3(x) paravirt_ops.write_cr3(x) + +#define read_cr4() paravirt_ops.read_cr4() +#define read_cr4_safe(x) paravirt_ops.read_cr4_safe() +#define write_cr4(x) paravirt_ops.write_cr4(x) + +static inline unsigned long __raw_local_save_flags(void) +{ + return paravirt_ops.save_fl(); +} + +static inline void raw_local_irq_restore(unsigned long flags) +{ + return paravirt_ops.restore_fl(flags); +} + +static inline void raw_local_irq_disable(void) +{ + paravirt_ops.irq_disable(); +} + +static inline void raw_local_irq_enable(void) +{ + paravirt_ops.irq_enable(); +} + +static inline unsigned long __raw_local_irq_save(void) +{ + unsigned long flags = paravirt_ops.save_fl(); + + paravirt_ops.irq_disable(); + + return flags; +} + +static inline void raw_safe_halt(void) +{ + paravirt_ops.safe_halt(); +} + +static inline void halt(void) +{ + paravirt_ops.safe_halt(); +} +#define wbinvd() paravirt_ops.wbinvd() + +#define get_kernel_rpl() (paravirt_ops.kernel_rpl) + +#define rdmsr(msr,val1,val2) do { \ + int _err; \ + u64 _l = paravirt_ops.read_msr(msr,&_err); \ + val1 = (u32)_l; \ + val2 = _l >> 32; \ +} while(0) + +#define wrmsr(msr,val1,val2) do { \ + u64 _l = ((u64)(val2) << 32) | (val1); \ + paravirt_ops.write_msr((msr), _l); \ +} while(0) + +#define rdmsrl(msr,val) do { \ + int _err; \ + val = paravirt_ops.read_msr((msr),&_err); \ +} while(0) + +#define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val))) +#define wrmsr_safe(msr,a,b) ({ \ + u64 _l = ((u64)(b) << 32) | (a); \ + paravirt_ops.write_msr((msr),_l); \ +}) + +/* rdmsr with exception handling */ +#define rdmsr_safe(msr,a,b) ({ \ + int _err; \ + u64 _l = paravirt_ops.read_msr(msr,&_err); \ + (*a) = (u32)_l; \ + (*b) = _l >> 32; \ + _err; }) + +#define rdtsc(low,high) do { \ + u64 _l = paravirt_ops.read_tsc(); \ + low = (u32)_l; \ + high = _l >> 32; \ +} while(0) + +#define rdtscl(low) do { \ + u64 _l = paravirt_ops.read_tsc(); \ + low = (int)_l; \ +} while(0) + +#define rdtscll(val) (val = paravirt_ops.read_tsc()) + +#define write_tsc(val1,val2) wrmsr(0x10, val1, val2) + +#define rdpmc(counter,low,high) do { \ + u64 _l = paravirt_ops.read_pmc(); \ + low = (u32)_l; \ + high = _l >> 32; \ +} while(0) + +#define load_TR_desc() (paravirt_ops.load_tr_desc()) +#define load_gdt(dtr) (paravirt_ops.load_gdt(dtr)) +#define load_idt(dtr) (paravirt_ops.load_idt(dtr)) +#define set_ldt(addr, entries) (paravirt_ops.set_ldt((addr), (entries))) +#define store_gdt(dtr) (paravirt_ops.store_gdt(dtr)) +#define store_idt(dtr) (paravirt_ops.store_idt(dtr)) +#define store_tr(tr) ((tr) = paravirt_ops.store_tr()) +#define load_TLS(t,cpu) (paravirt_ops.load_tls((t),(cpu))) +#define write_ldt_entry(dt, entry, low, high) \ + (paravirt_ops.write_ldt_entry((dt), (entry), (low), (high))) +#define write_gdt_entry(dt, entry, low, high) \ + (paravirt_ops.write_gdt_entry((dt), (entry), (low), (high))) +#define 
write_idt_entry(dt, entry, low, high) \ + (paravirt_ops.write_idt_entry((dt), (entry), (low), (high))) +#define set_iopl_mask(mask) (paravirt_ops.set_iopl_mask(mask)) + +/* The paravirtualized I/O functions */ +static inline void slow_down_io(void) { + paravirt_ops.io_delay(); +#ifdef REALLY_SLOW_IO + paravirt_ops.io_delay(); + paravirt_ops.io_delay(); + paravirt_ops.io_delay(); +#endif +} + +#define CLI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax" +#define STI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax" +#else /* __ASSEMBLY__ */ + +#define INTERRUPT_RETURN jmp *%cs:paravirt_ops+PARAVIRT_iret +#define DISABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax +#define ENABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *%cs:paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax +#define ENABLE_INTERRUPTS_SYSEXIT jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit +#define GET_CR0_INTO_EAX call *paravirt_ops+PARAVIRT_read_cr0 +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_PARAVIRT */ +#endif /* __ASM_PARAVIRT_H */ diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 98fa73b71760..6c2c4457be0a 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -144,8 +144,8 @@ static inline void detect_ht(struct cpuinfo_x86 *c) {} #define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ #define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ -static inline void __cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +static inline fastcall void native_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) { /* ecx is often an input as well as an output. */ __asm__("cpuid" @@ -491,6 +491,12 @@ struct thread_struct { .io_bitmap = { [ 0 ... 
IO_BITMAP_LONGS] = ~0 }, \ } +#ifdef CONFIG_PARAVIRT +#include +#else +#define paravirt_enabled() 0 +#define __cpuid native_cpuid + static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread) { tss->esp0 = thread->esp0; @@ -524,10 +530,13 @@ static inline void load_esp0(struct tss_struct *tss, struct thread_struct *threa : /* no output */ \ :"r" (value)) +#define set_iopl_mask native_set_iopl_mask +#endif /* CONFIG_PARAVIRT */ + /* * Set IOPL bits in EFLAGS from given mask */ -static inline void set_iopl_mask(unsigned mask) +static fastcall inline void native_set_iopl_mask(unsigned mask) { unsigned int reg; __asm__ __volatile__ ("pushfl;" diff --git a/include/asm-i386/segment.h b/include/asm-i386/segment.h index 5bdda79b6b53..3c796af33776 100644 --- a/include/asm-i386/segment.h +++ b/include/asm-i386/segment.h @@ -131,5 +131,7 @@ #define SEGMENT_LDT 0x4 #define SEGMENT_GDT 0x0 +#ifndef CONFIG_PARAVIRT #define get_kernel_rpl() 0 #endif +#endif diff --git a/include/asm-i386/setup.h b/include/asm-i386/setup.h index 2734909eff84..9930c5a355fc 100644 --- a/include/asm-i386/setup.h +++ b/include/asm-i386/setup.h @@ -70,6 +70,7 @@ extern unsigned char boot_params[PARAM_SIZE]; struct e820entry; char * __init machine_specific_memory_setup(void); +char *memory_setup(void); int __init copy_e820_map(struct e820entry * biosmap, int nr_map); int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map); diff --git a/include/asm-i386/spinlock.h b/include/asm-i386/spinlock.h index c18b71fae6b3..dea60709db29 100644 --- a/include/asm-i386/spinlock.h +++ b/include/asm-i386/spinlock.h @@ -7,8 +7,12 @@ #include #include +#ifdef CONFIG_PARAVIRT +#include +#else #define CLI_STRING "cli" #define STI_STRING "sti" +#endif /* CONFIG_PARAVIRT */ /* * Your basic SMP spinlocks, allowing only a single CPU anywhere diff --git a/include/asm-i386/suspend.h b/include/asm-i386/suspend.h index 08be1e5009d4..30361526d568 100644 --- a/include/asm-i386/suspend.h +++ b/include/asm-i386/suspend.h @@ -23,12 +23,8 @@ arch_prepare_suspend(void) struct saved_context { u16 es, fs, gs, ss; unsigned long cr0, cr2, cr3, cr4; - u16 gdt_pad; - u16 gdt_limit; - unsigned long gdt_base; - u16 idt_pad; - u16 idt_limit; - unsigned long idt_base; + struct Xgt_desc_struct gdt; + struct Xgt_desc_struct idt; u16 ldt; u16 tss; unsigned long tr; diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h index a6dabbcd6e6a..a6d20d9a1a30 100644 --- a/include/asm-i386/system.h +++ b/include/asm-i386/system.h @@ -88,6 +88,9 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t" \ #define savesegment(seg, value) \ asm volatile("mov %%" #seg ",%0":"=rm" (value)) +#ifdef CONFIG_PARAVIRT +#include +#else #define read_cr0() ({ \ unsigned int __dummy; \ __asm__ __volatile__( \ @@ -139,17 +142,18 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t" \ #define write_cr4(x) \ __asm__ __volatile__("movl %0,%%cr4": :"r" (x)) -/* - * Clear and set 'TS' bit respectively - */ +#define wbinvd() \ + __asm__ __volatile__ ("wbinvd": : :"memory") + +/* Clear the 'TS' bit */ #define clts() __asm__ __volatile__ ("clts") +#endif/* CONFIG_PARAVIRT */ + +/* Set the 'TS' bit */ #define stts() write_cr0(8 | read_cr0()) #endif /* __KERNEL__ */ -#define wbinvd() \ - __asm__ __volatile__ ("wbinvd": : :"memory") - static inline unsigned long get_limit(unsigned long segment) { unsigned long __limit; diff --git a/include/asm-i386/time.h b/include/asm-i386/time.h new file mode 100644 index 000000000000..ea8065af825a --- /dev/null +++ b/include/asm-i386/time.h @@ -0,0 
+1,41 @@
+#ifndef _ASMi386_TIME_H
+#define _ASMi386_TIME_H
+
+#include <linux/efi.h>
+#include "mach_time.h"
+
+static inline unsigned long native_get_wallclock(void)
+{
+	unsigned long retval;
+
+	if (efi_enabled)
+		retval = efi_get_time();
+	else
+		retval = mach_get_cmos_time();
+
+	return retval;
+}
+
+static inline int native_set_wallclock(unsigned long nowtime)
+{
+	int retval;
+
+	if (efi_enabled)
+		retval = efi_set_rtc_mmss(nowtime);
+	else
+		retval = mach_set_rtc_mmss(nowtime);
+
+	return retval;
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else /* !CONFIG_PARAVIRT */
+
+#define get_wallclock() native_get_wallclock()
+#define set_wallclock(x) native_set_wallclock(x)
+#define do_time_init() time_init_hook()
+
+#endif /* CONFIG_PARAVIRT */
+
+#endif
--
cgit v1.2.3-59-g8ed1b


From 139ec7c416248b9ea227d21839235344edfee1e0 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Thu, 7 Dec 2006 02:14:08 +0100
Subject: [PATCH] paravirt: Patch inline replacements for paravirt intercepts

It turns out that the most-called ops, by several orders of magnitude,
are the interrupt manipulation ops.  These are obvious candidates for
patching, so mark them up and create infrastructure for it.

The method used is that the ops structure has a patch function, which
is called for each place which needs to be patched: this returns a
number of instructions (the rest are NOP-padded).

Usually we can spare a register (%eax) for the binary patched code to
use, but in a couple of critical places in entry.S we can't: we make
the clobbers explicit at the call site, and manually clobber the
allowed registers in debug mode as an extra check.

And:

Don't abuse CONFIG_DEBUG_KERNEL, add CONFIG_DEBUG_PARAVIRT.

And:

AK: Fix warnings in x86-64 alternative.c build

And:

AK: Fix compilation with defconfig

And:

From: Andrew Morton

Some binutils versions still like to emit references to
__stop_parainstructions and __start_parainstructions.

And:

AK: Fix warnings about unused variables when PARAVIRT is disabled.

Signed-off-by: Rusty Russell
Signed-off-by: Jeremy Fitzhardinge
Signed-off-by: Chris Wright
Signed-off-by: Zachary Amsden
Signed-off-by: Andi Kleen
Signed-off-by: Andrew Morton
---
 arch/i386/Kconfig.debug          |  10 ++
 arch/i386/kernel/alternative.c   |  63 ++++++++---
 arch/i386/kernel/entry.S         |  39 +++++---
 arch/i386/kernel/module.c        |  11 ++-
 arch/i386/kernel/paravirt.c      |  44 +++++++++
 arch/i386/kernel/vmlinux.lds.S   |   6 ++
 include/asm-i386/alternative.h   |  13 ++-
 include/asm-i386/desc.h          |  41 ++++----
 include/asm-i386/irqflags.h      |   4 +-
 include/asm-i386/paravirt.h      | 189 ++++++++++++++++++++++++++++-------
 include/asm-i386/processor.h     | 198 +++++++++++++++++++--------------------
 include/asm-i386/spinlock.h      |  15 ++-
 include/asm-x86_64/alternative.h |  12 +++
 scripts/mod/modpost.c            |   2 +
 14 files changed, 459 insertions(+), 188 deletions(-)
(limited to 'include/asm-i386')

diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug
index b31c0802e1cc..f68cc6f215f8 100644
--- a/arch/i386/Kconfig.debug
+++ b/arch/i386/Kconfig.debug
@@ -85,4 +85,14 @@ config DOUBLEFAULT
 	  option saves about 4k and might cause you much additional grey
 	  hair.

+config DEBUG_PARAVIRT
+	bool "Enable some paravirtualization debugging"
+	default y
+	depends on PARAVIRT && DEBUG_KERNEL
+	help
+	  Currently deliberately clobbers regs which are allowed to be
+	  clobbered in inlined paravirt hooks, even in native mode.
+	  If turning this off solves a problem, then DISABLE_INTERRUPTS() or
+	  ENABLE_INTERRUPTS() is lying about what registers can be clobbered.
+ endmenu diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index 535f9794fba1..9eca21b49f6b 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -124,6 +124,20 @@ static unsigned char** find_nop_table(void) #endif /* CONFIG_X86_64 */ +static void nop_out(void *insns, unsigned int len) +{ + unsigned char **noptable = find_nop_table(); + + while (len > 0) { + unsigned int noplen = len; + if (noplen > ASM_NOP_MAX) + noplen = ASM_NOP_MAX; + memcpy(insns, noptable[noplen], noplen); + insns += noplen; + len -= noplen; + } +} + extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[]; extern u8 *__smp_locks[], *__smp_locks_end[]; @@ -138,10 +152,9 @@ extern u8 __smp_alt_begin[], __smp_alt_end[]; void apply_alternatives(struct alt_instr *start, struct alt_instr *end) { - unsigned char **noptable = find_nop_table(); struct alt_instr *a; u8 *instr; - int diff, i, k; + int diff; DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end); for (a = start; a < end; a++) { @@ -159,13 +172,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end) #endif memcpy(instr, a->replacement, a->replacementlen); diff = a->instrlen - a->replacementlen; - /* Pad the rest with nops */ - for (i = a->replacementlen; diff > 0; diff -= k, i += k) { - k = diff; - if (k > ASM_NOP_MAX) - k = ASM_NOP_MAX; - memcpy(a->instr + i, noptable[k], k); - } + nop_out(instr + a->replacementlen, diff); } } @@ -209,7 +216,6 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) { - unsigned char **noptable = find_nop_table(); u8 **ptr; for (ptr = start; ptr < end; ptr++) { @@ -217,7 +223,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end continue; if (*ptr > text_end) continue; - **ptr = noptable[1][0]; + nop_out(*ptr, 1); }; } @@ -343,6 +349,40 @@ void alternatives_smp_switch(int smp) #endif +#ifdef CONFIG_PARAVIRT +void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) +{ + struct paravirt_patch *p; + + for (p = start; p < end; p++) { + unsigned int used; + + used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr, + p->len); +#ifdef CONFIG_DEBUG_PARAVIRT + { + int i; + /* Deliberately clobber regs using "not %reg" to find bugs. */ + for (i = 0; i < 3; i++) { + if (p->len - used >= 2 && (p->clobbers & (1 << i))) { + memcpy(p->instr + used, "\xf7\xd0", 2); + p->instr[used+1] |= i; + used += 2; + } + } + } +#endif + /* Pad the rest with nops */ + nop_out(p->instr + used, p->len - used); + } + + /* Sync to be conservative, in case we patched following instructions */ + sync_core(); +} +extern struct paravirt_patch __start_parainstructions[], + __stop_parainstructions[]; +#endif /* CONFIG_PARAVIRT */ + void __init alternative_instructions(void) { unsigned long flags; @@ -390,5 +430,6 @@ void __init alternative_instructions(void) alternatives_smp_switch(0); } #endif + apply_paravirt(__start_parainstructions, __stop_parainstructions); local_irq_restore(flags); } diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index d274612e05cd..de34b7fed3c1 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -53,6 +53,19 @@ #include #include "irq_vectors.h" +/* + * We use macros for low-level operations which need to be overridden + * for paravirtualization. 
The following will never clobber any registers: + * INTERRUPT_RETURN (aka. "iret") + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). + * Allowing a register to be clobbered can shrink the paravirt replacement + * enough to patch inline, increasing performance. + */ + #define nr_syscalls ((syscall_table_size)/4) CF_MASK = 0x00000001 @@ -63,9 +76,9 @@ NT_MASK = 0x00004000 VM_MASK = 0x00020000 #ifdef CONFIG_PREEMPT -#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else -#define preempt_stop +#define preempt_stop(clobbers) #define resume_kernel restore_nocheck #endif @@ -226,7 +239,7 @@ ENTRY(ret_from_fork) ALIGN RING0_PTREGS_FRAME ret_from_exception: - preempt_stop + preempt_stop(CLBR_ANY) ret_from_intr: GET_THREAD_INFO(%ebp) check_userspace: @@ -237,7 +250,7 @@ check_userspace: jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) - DISABLE_INTERRUPTS # make sure we don't miss an interrupt + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret movl TI_flags(%ebp), %ecx @@ -248,7 +261,7 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) - DISABLE_INTERRUPTS + DISABLE_INTERRUPTS(CLBR_ANY) cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: @@ -277,7 +290,7 @@ sysenter_past_esp: * No need to follow this irqs on/off section: the syscall * disabled irqs and here we enable it straight after entry: */ - ENABLE_INTERRUPTS + ENABLE_INTERRUPTS(CLBR_NONE) pushl $(__USER_DS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET ss, 0*/ @@ -322,7 +335,7 @@ sysenter_past_esp: jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) - DISABLE_INTERRUPTS + DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx @@ -364,7 +377,7 @@ syscall_call: call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) # store the return value syscall_exit: - DISABLE_INTERRUPTS # make sure we don't miss an interrupt + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -393,7 +406,7 @@ restore_nocheck_notrace: .section .fixup,"ax" iret_exc: TRACE_IRQS_ON - ENABLE_INTERRUPTS + ENABLE_INTERRUPTS(CLBR_NONE) pushl $0 # no error code pushl $do_iret_error jmp error_code @@ -436,7 +449,7 @@ ldt_ss: CFI_ADJUST_CFA_OFFSET 4 pushl %eax CFI_ADJUST_CFA_OFFSET 4 - DISABLE_INTERRUPTS + DISABLE_INTERRUPTS(CLBR_EAX) TRACE_IRQS_OFF lss (%esp), %esp CFI_ADJUST_CFA_OFFSET -8 @@ -451,7 +464,7 @@ work_pending: jz work_notifysig work_resched: call schedule - DISABLE_INTERRUPTS # make sure we don't miss an interrupt + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -509,7 +522,7 @@ syscall_exit_work: testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl jz work_pending TRACE_IRQS_ON - ENABLE_INTERRUPTS # could let do_syscall_trace() call + ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call # schedule() instead movl %esp, %eax movl $1, %edx @@ -693,7 +706,7 @@ ENTRY(device_not_available) GET_CR0_INTO_EAX testl $0x4, 
%eax # EM (math emulation bit) jne device_not_available_emulate - preempt_stop + preempt_stop(CLBR_ANY) call math_state_restore jmp ret_from_exception device_not_available_emulate: diff --git a/arch/i386/kernel/module.c b/arch/i386/kernel/module.c index 470cf97e7cd3..d7d9c8b23f72 100644 --- a/arch/i386/kernel/module.c +++ b/arch/i386/kernel/module.c @@ -108,7 +108,8 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + *para = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -118,6 +119,8 @@ int module_finalize(const Elf_Ehdr *hdr, alt = s; if (!strcmp(".smp_locks", secstrings + s->sh_name)) locks= s; + if (!strcmp(".parainstructions", secstrings + s->sh_name)) + para = s; } if (alt) { @@ -132,6 +135,12 @@ int module_finalize(const Elf_Ehdr *hdr, lseg, lseg + locks->sh_size, tseg, tseg + text->sh_size); } + + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } + return 0; } diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 478192cd4b90..d46460426446 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -45,6 +45,49 @@ char *memory_setup(void) return paravirt_ops.memory_setup(); } +/* Simple instruction patching code. */ +#define DEF_NATIVE(name, code) \ + extern const char start_##name[], end_##name[]; \ + asm("start_" #name ": " code "; end_" #name ":") +DEF_NATIVE(cli, "cli"); +DEF_NATIVE(sti, "sti"); +DEF_NATIVE(popf, "push %eax; popf"); +DEF_NATIVE(pushf, "pushf; pop %eax"); +DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli"); +DEF_NATIVE(iret, "iret"); +DEF_NATIVE(sti_sysexit, "sti; sysexit"); + +static const struct native_insns +{ + const char *start, *end; +} native_insns[] = { + [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli }, + [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti }, + [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf }, + [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf }, + [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli }, + [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret }, + [PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit }, +}; + +static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len) +{ + unsigned int insn_len; + + /* Don't touch it if we don't have a replacement */ + if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start) + return len; + + insn_len = native_insns[type].end - native_insns[type].start; + + /* Similarly if we can't fit replacement. */ + if (len < insn_len) + return len; + + memcpy(insns, native_insns[type].start, insn_len); + return insn_len; +} + static fastcall unsigned long native_get_debugreg(int regno) { unsigned long val = 0; /* Damn you, gcc! */ @@ -349,6 +392,7 @@ struct paravirt_ops paravirt_ops = { .paravirt_enabled = 0, .kernel_rpl = 0, + .patch = native_patch, .banner = default_banner, .arch_setup = native_nop, .memory_setup = machine_specific_memory_setup, diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 6860f20aa579..5c69cf0e5944 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -165,6 +165,12 @@ SECTIONS .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { *(.altinstr_replacement) } + . 
= ALIGN(4); + __start_parainstructions = .; + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + *(.parainstructions) + } + __stop_parainstructions = .; /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } diff --git a/include/asm-i386/alternative.h b/include/asm-i386/alternative.h index b01a7ec409ce..b8fa9557c532 100644 --- a/include/asm-i386/alternative.h +++ b/include/asm-i386/alternative.h @@ -4,7 +4,7 @@ #ifdef __KERNEL__ #include - +#include #include struct alt_instr { @@ -118,4 +118,15 @@ static inline void alternatives_smp_switch(int smp) {} #define LOCK_PREFIX "" #endif +struct paravirt_patch; +#ifdef CONFIG_PARAVIRT +void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end); +#else +static inline void +apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) +{} +#define __start_parainstructions NULL +#define __stop_parainstructions NULL +#endif + #endif /* _I386_ALTERNATIVE_H */ diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index f19820f0834f..f398cc456448 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -81,31 +81,15 @@ static inline void load_TLS(struct thread_struct *t, unsigned int cpu) #undef C } -static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b) -{ - __u32 *lp = (__u32 *)((char *)dt + entry*8); - *lp = entry_a; - *(lp+1) = entry_b; -} - #define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) #define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) #define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) -static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg) -{ - __u32 a, b; - pack_gate(&a, &b, (unsigned long)addr, seg, type, 0); - write_idt_entry(idt_table, gate, a, b); -} - -static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr) +static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b) { - __u32 a, b; - pack_descriptor(&a, &b, (unsigned long)addr, - offsetof(struct tss_struct, __cacheline_filler) - 1, - DESCTYPE_TSS, 0); - write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b); + __u32 *lp = (__u32 *)((char *)dt + entry*8); + *lp = entry_a; + *(lp+1) = entry_b; } #define set_ldt native_set_ldt @@ -128,6 +112,23 @@ static inline fastcall void native_set_ldt(const void *addr, } } +static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg) +{ + __u32 a, b; + pack_gate(&a, &b, (unsigned long)addr, seg, type, 0); + write_idt_entry(idt_table, gate, a, b); +} + +static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr) +{ + __u32 a, b; + pack_descriptor(&a, &b, (unsigned long)addr, + offsetof(struct tss_struct, __cacheline_filler) - 1, + DESCTYPE_TSS, 0); + write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b); +} + + #define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) #define LDT_entry_a(info) \ diff --git a/include/asm-i386/irqflags.h b/include/asm-i386/irqflags.h index 9ce01f3fb7bc..17b18cf4fe9d 100644 --- a/include/asm-i386/irqflags.h +++ b/include/asm-i386/irqflags.h @@ -79,8 +79,8 @@ static inline unsigned long __raw_local_irq_save(void) } #else -#define DISABLE_INTERRUPTS cli -#define ENABLE_INTERRUPTS sti +#define DISABLE_INTERRUPTS(clobbers) cli +#define ENABLE_INTERRUPTS(clobbers) sti #define 
ENABLE_INTERRUPTS_SYSEXIT sti; sysexit #define INTERRUPT_RETURN iret #define GET_CR0_INTO_EAX movl %cr0, %eax diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index a7551a44686f..081194751ade 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -3,8 +3,26 @@ /* Various instructions on x86 need to be replaced for * para-virtualization: those hooks are defined here. */ #include +#include #ifdef CONFIG_PARAVIRT +/* These are the most performance critical ops, so we want to be able to patch + * callers */ +#define PARAVIRT_IRQ_DISABLE 0 +#define PARAVIRT_IRQ_ENABLE 1 +#define PARAVIRT_RESTORE_FLAGS 2 +#define PARAVIRT_SAVE_FLAGS 3 +#define PARAVIRT_SAVE_FLAGS_IRQ_DISABLE 4 +#define PARAVIRT_INTERRUPT_RETURN 5 +#define PARAVIRT_STI_SYSEXIT 6 + +/* Bitmask of what can be clobbered: usually at least eax. */ +#define CLBR_NONE 0x0 +#define CLBR_EAX 0x1 +#define CLBR_ECX 0x2 +#define CLBR_EDX 0x4 +#define CLBR_ANY 0x7 + #ifndef __ASSEMBLY__ struct thread_struct; struct Xgt_desc_struct; @@ -15,6 +33,15 @@ struct paravirt_ops int paravirt_enabled; const char *name; + /* + * Patch may replace one of the defined code sequences with arbitrary + * code, subject to the same register constraints. This generally + * means the code is not free to clobber any registers other than EAX. + * The patch function should return the number of bytes of code + * generated, as we nop pad the rest in generic code. + */ + unsigned (*patch)(u8 type, u16 clobber, void *firstinsn, unsigned len); + void (*arch_setup)(void); char *(*memory_setup)(void); void (*init_IRQ)(void); @@ -147,35 +174,6 @@ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, #define read_cr4_safe(x) paravirt_ops.read_cr4_safe() #define write_cr4(x) paravirt_ops.write_cr4(x) -static inline unsigned long __raw_local_save_flags(void) -{ - return paravirt_ops.save_fl(); -} - -static inline void raw_local_irq_restore(unsigned long flags) -{ - return paravirt_ops.restore_fl(flags); -} - -static inline void raw_local_irq_disable(void) -{ - paravirt_ops.irq_disable(); -} - -static inline void raw_local_irq_enable(void) -{ - paravirt_ops.irq_enable(); -} - -static inline unsigned long __raw_local_irq_save(void) -{ - unsigned long flags = paravirt_ops.save_fl(); - - paravirt_ops.irq_disable(); - - return flags; -} - static inline void raw_safe_halt(void) { paravirt_ops.safe_halt(); @@ -267,15 +265,134 @@ static inline void slow_down_io(void) { #endif } -#define CLI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax" -#define STI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax" +/* These all sit in the .parainstructions section to tell us what to patch. 
*/ +struct paravirt_patch { + u8 *instr; /* original instructions */ + u8 instrtype; /* type of this instruction */ + u8 len; /* length of original instruction */ + u16 clobbers; /* what registers you may clobber */ +}; + +#define paravirt_alt(insn_string, typenum, clobber) \ + "771:\n\t" insn_string "\n" "772:\n" \ + ".pushsection .parainstructions,\"a\"\n" \ + " .long 771b\n" \ + " .byte " __stringify(typenum) "\n" \ + " .byte 772b-771b\n" \ + " .short " __stringify(clobber) "\n" \ + ".popsection" + +static inline unsigned long __raw_local_save_flags(void) +{ + unsigned long f; + + __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" + "call *%1;" + "popl %%edx; popl %%ecx", + PARAVIRT_SAVE_FLAGS, CLBR_NONE) + : "=a"(f): "m"(paravirt_ops.save_fl) + : "memory", "cc"); + return f; +} + +static inline void raw_local_irq_restore(unsigned long f) +{ + __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" + "call *%1;" + "popl %%edx; popl %%ecx", + PARAVIRT_RESTORE_FLAGS, CLBR_EAX) + : "=a"(f) : "m" (paravirt_ops.restore_fl), "0"(f) + : "memory", "cc"); +} + +static inline void raw_local_irq_disable(void) +{ + __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" + "call *%0;" + "popl %%edx; popl %%ecx", + PARAVIRT_IRQ_DISABLE, CLBR_EAX) + : : "m" (paravirt_ops.irq_disable) + : "memory", "eax", "cc"); +} + +static inline void raw_local_irq_enable(void) +{ + __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" + "call *%0;" + "popl %%edx; popl %%ecx", + PARAVIRT_IRQ_ENABLE, CLBR_EAX) + : : "m" (paravirt_ops.irq_enable) + : "memory", "eax", "cc"); +} + +static inline unsigned long __raw_local_irq_save(void) +{ + unsigned long f; + + __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" + "call *%1; pushl %%eax;" + "call *%2; popl %%eax;" + "popl %%edx; popl %%ecx", + PARAVIRT_SAVE_FLAGS_IRQ_DISABLE, + CLBR_NONE) + : "=a"(f) + : "m" (paravirt_ops.save_fl), + "m" (paravirt_ops.irq_disable) + : "memory", "cc"); + return f; +} + +#define CLI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;" \ + "call *paravirt_ops+%c[irq_disable];" \ + "popl %%edx; popl %%ecx", \ + PARAVIRT_IRQ_DISABLE, CLBR_EAX) + +#define STI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;" \ + "call *paravirt_ops+%c[irq_enable];" \ + "popl %%edx; popl %%ecx", \ + PARAVIRT_IRQ_ENABLE, CLBR_EAX) +#define CLI_STI_CLOBBERS , "%eax" +#define CLI_STI_INPUT_ARGS \ + , \ + [irq_disable] "i" (offsetof(struct paravirt_ops, irq_disable)), \ + [irq_enable] "i" (offsetof(struct paravirt_ops, irq_enable)) + #else /* __ASSEMBLY__ */ -#define INTERRUPT_RETURN jmp *%cs:paravirt_ops+PARAVIRT_iret -#define DISABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax -#define ENABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *%cs:paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax -#define ENABLE_INTERRUPTS_SYSEXIT jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit -#define GET_CR0_INTO_EAX call *paravirt_ops+PARAVIRT_read_cr0 +#define PARA_PATCH(ptype, clobbers, ops) \ +771:; \ + ops; \ +772:; \ + .pushsection .parainstructions,"a"; \ + .long 771b; \ + .byte ptype; \ + .byte 772b-771b; \ + .short clobbers; \ + .popsection + +#define INTERRUPT_RETURN \ + PARA_PATCH(PARAVIRT_INTERRUPT_RETURN, CLBR_ANY, \ + jmp *%cs:paravirt_ops+PARAVIRT_iret) + +#define DISABLE_INTERRUPTS(clobbers) \ + PARA_PATCH(PARAVIRT_IRQ_DISABLE, clobbers, \ + pushl %ecx; pushl %edx; \ + call *paravirt_ops+PARAVIRT_irq_disable; \ + popl %edx; popl 
%ecx) \ + +#define ENABLE_INTERRUPTS(clobbers) \ + PARA_PATCH(PARAVIRT_IRQ_ENABLE, clobbers, \ + pushl %ecx; pushl %edx; \ + call *%cs:paravirt_ops+PARAVIRT_irq_enable; \ + popl %edx; popl %ecx) + +#define ENABLE_INTERRUPTS_SYSEXIT \ + PARA_PATCH(PARAVIRT_STI_SYSEXIT, CLBR_ANY, \ + jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit) + +#define GET_CR0_INTO_EAX \ + call *paravirt_ops+PARAVIRT_read_cr0 + #endif /* __ASSEMBLY__ */ #endif /* CONFIG_PARAVIRT */ #endif /* __ASM_PARAVIRT_H */ diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 6c2c4457be0a..5f0418d0078c 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -156,59 +156,6 @@ static inline fastcall void native_cpuid(unsigned int *eax, unsigned int *ebx, : "0" (*eax), "2" (*ecx)); } -/* - * Generic CPUID function - * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx - * resulting in stale register contents being returned. - */ -static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) -{ - *eax = op; - *ecx = 0; - __cpuid(eax, ebx, ecx, edx); -} - -/* Some CPUID calls want 'count' to be placed in ecx */ -static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, - int *edx) -{ - *eax = op; - *ecx = count; - __cpuid(eax, ebx, ecx, edx); -} - -/* - * CPUID functions returning a single datum - */ -static inline unsigned int cpuid_eax(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - return eax; -} -static inline unsigned int cpuid_ebx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - return ebx; -} -static inline unsigned int cpuid_ecx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - return ecx; -} -static inline unsigned int cpuid_edx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - return edx; -} - #define load_cr3(pgdir) write_cr3(__pa(pgdir)) /* @@ -491,22 +438,6 @@ struct thread_struct { .io_bitmap = { [ 0 ... 
IO_BITMAP_LONGS] = ~0 }, \ } -#ifdef CONFIG_PARAVIRT -#include -#else -#define paravirt_enabled() 0 -#define __cpuid native_cpuid - -static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread) -{ - tss->esp0 = thread->esp0; - /* This can only happen when SEP is enabled, no need to test "SEP"arately */ - if (unlikely(tss->ss1 != thread->sysenter_cs)) { - tss->ss1 = thread->sysenter_cs; - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); - } -} - #define start_thread(regs, new_eip, new_esp) do { \ __asm__("movl %0,%%fs": :"r" (0)); \ regs->xgs = 0; \ @@ -519,36 +450,6 @@ static inline void load_esp0(struct tss_struct *tss, struct thread_struct *threa regs->esp = new_esp; \ } while (0) -/* - * These special macros can be used to get or set a debugging register - */ -#define get_debugreg(var, register) \ - __asm__("movl %%db" #register ", %0" \ - :"=r" (var)) -#define set_debugreg(value, register) \ - __asm__("movl %0,%%db" #register \ - : /* no output */ \ - :"r" (value)) - -#define set_iopl_mask native_set_iopl_mask -#endif /* CONFIG_PARAVIRT */ - -/* - * Set IOPL bits in EFLAGS from given mask - */ -static fastcall inline void native_set_iopl_mask(unsigned mask) -{ - unsigned int reg; - __asm__ __volatile__ ("pushfl;" - "popl %0;" - "andl %1, %0;" - "orl %2, %0;" - "pushl %0;" - "popfl" - : "=&r" (reg) - : "i" (~X86_EFLAGS_IOPL), "r" (mask)); -} - /* Forward declaration, a strange C thing */ struct task_struct; struct mm_struct; @@ -640,6 +541,105 @@ static inline void rep_nop(void) #define cpu_relax() rep_nop() +#ifdef CONFIG_PARAVIRT +#include +#else +#define paravirt_enabled() 0 +#define __cpuid native_cpuid + +static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread) +{ + tss->esp0 = thread->esp0; + /* This can only happen when SEP is enabled, no need to test "SEP"arately */ + if (unlikely(tss->ss1 != thread->sysenter_cs)) { + tss->ss1 = thread->sysenter_cs; + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); + } +} + +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, register) \ + __asm__("movl %%db" #register ", %0" \ + :"=r" (var)) +#define set_debugreg(value, register) \ + __asm__("movl %0,%%db" #register \ + : /* no output */ \ + :"r" (value)) + +#define set_iopl_mask native_set_iopl_mask +#endif /* CONFIG_PARAVIRT */ + +/* + * Set IOPL bits in EFLAGS from given mask + */ +static fastcall inline void native_set_iopl_mask(unsigned mask) +{ + unsigned int reg; + __asm__ __volatile__ ("pushfl;" + "popl %0;" + "andl %1, %0;" + "orl %2, %0;" + "pushl %0;" + "popfl" + : "=&r" (reg) + : "i" (~X86_EFLAGS_IOPL), "r" (mask)); +} + +/* + * Generic CPUID function + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx + * resulting in stale register contents being returned. 
+ */ +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = 0; + __cpuid(eax, ebx, ecx, edx); +} + +/* Some CPUID calls want 'count' to be placed in ecx */ +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, + int *edx) +{ + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); +} + +/* + * CPUID functions returning a single datum + */ +static inline unsigned int cpuid_eax(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + return eax; +} +static inline unsigned int cpuid_ebx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + return ebx; +} +static inline unsigned int cpuid_ecx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + return ecx; +} +static inline unsigned int cpuid_edx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + return edx; +} + /* generic versions from gas */ #define GENERIC_NOP1 ".byte 0x90\n" #define GENERIC_NOP2 ".byte 0x89,0xf6\n" diff --git a/include/asm-i386/spinlock.h b/include/asm-i386/spinlock.h index dea60709db29..d3bcebed60ca 100644 --- a/include/asm-i386/spinlock.h +++ b/include/asm-i386/spinlock.h @@ -12,6 +12,8 @@ #else #define CLI_STRING "cli" #define STI_STRING "sti" +#define CLI_STI_CLOBBERS +#define CLI_STI_INPUT_ARGS #endif /* CONFIG_PARAVIRT */ /* @@ -57,25 +59,28 @@ static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long fla { asm volatile( "\n1:\t" - LOCK_PREFIX " ; decb %0\n\t" + LOCK_PREFIX " ; decb %[slock]\n\t" "jns 5f\n" "2:\t" - "testl $0x200, %1\n\t" + "testl $0x200, %[flags]\n\t" "jz 4f\n\t" STI_STRING "\n" "3:\t" "rep;nop\n\t" - "cmpb $0, %0\n\t" + "cmpb $0, %[slock]\n\t" "jle 3b\n\t" CLI_STRING "\n\t" "jmp 1b\n" "4:\t" "rep;nop\n\t" - "cmpb $0, %0\n\t" + "cmpb $0, %[slock]\n\t" "jg 1b\n\t" "jmp 4b\n" "5:\n\t" - : "+m" (lock->slock) : "r" (flags) : "memory"); + : [slock] "+m" (lock->slock) + : [flags] "r" (flags) + CLI_STI_INPUT_ARGS + : "memory" CLI_STI_CLOBBERS); } #endif diff --git a/include/asm-x86_64/alternative.h b/include/asm-x86_64/alternative.h index a584826cc570..a6657b4f3e0e 100644 --- a/include/asm-x86_64/alternative.h +++ b/include/asm-x86_64/alternative.h @@ -4,6 +4,7 @@ #ifdef __KERNEL__ #include +#include #include struct alt_instr { @@ -133,4 +134,15 @@ static inline void alternatives_smp_switch(int smp) {} #define LOCK_PREFIX "" #endif +struct paravirt_patch; +#ifdef CONFIG_PARAVIRT +void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end); +#else +static inline void +apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) +{} +#define __start_parainstructions NULL +#define __stop_parainstructions NULL +#endif + #endif /* _X86_64_ALTERNATIVE_H */ diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 2e1141623147..ac0a58222992 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -911,6 +911,7 @@ static int init_section_ref_ok(const char *name) ".toc1", /* used by ppc64 */ ".stab", ".rodata", + ".parainstructions", ".text.lock", "__bug_table", /* used by powerpc for BUG() */ ".pci_fixup_header", @@ -931,6 +932,7 @@ static int init_section_ref_ok(const char *name) ".altinstructions", ".eh_frame", ".debug", + ".parainstructions", NULL }; /* part of section name */ -- cgit v1.2.3-59-g8ed1b From d7cd56111f30259e1b532a12e06f59f8e0a20355 Mon Sep 17 
00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] i386: cpu_detect extraction Both lhype and Xen want to call the core of the x86 cpu detect code before calling start_kernel. (extracted from larger patch) AK: folded in start_kernel header patch Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/common.c | 37 +++++++++++++++++++++---------------- include/asm-i386/processor.h | 3 +++ include/linux/start_kernel.h | 12 ++++++++++++ init/main.c | 1 + 4 files changed, 37 insertions(+), 16 deletions(-) create mode 100644 include/linux/start_kernel.h (limited to 'include/asm-i386') diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index cda41aef79ad..68bcb687019a 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -236,29 +236,14 @@ static int __cpuinit have_cpuid_p(void) return flag_is_changeable_p(X86_EFLAGS_ID); } -/* Do minimum CPU detection early. - Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. - The others are not touched to avoid unwanted side effects. - - WARNING: this function is only called on the BP. Don't add code here - that is supposed to run on all CPUs. */ -static void __init early_cpu_detect(void) +void __init cpu_detect(struct cpuinfo_x86 *c) { - struct cpuinfo_x86 *c = &boot_cpu_data; - - c->x86_cache_alignment = 32; - - if (!have_cpuid_p()) - return; - /* Get vendor name */ cpuid(0x00000000, &c->cpuid_level, (int *)&c->x86_vendor_id[0], (int *)&c->x86_vendor_id[8], (int *)&c->x86_vendor_id[4]); - get_cpu_vendor(c, 1); - c->x86 = 4; if (c->cpuid_level >= 0x00000001) { u32 junk, tfms, cap0, misc; @@ -275,6 +260,26 @@ static void __init early_cpu_detect(void) } } +/* Do minimum CPU detection early. + Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. + The others are not touched to avoid unwanted side effects. + + WARNING: this function is only called on the BP. Don't add code here + that is supposed to run on all CPUs. */ +static void __init early_cpu_detect(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + c->x86_cache_alignment = 32; + + if (!have_cpuid_p()) + return; + + cpu_detect(c); + + get_cpu_vendor(c, 1); +} + static void __cpuinit generic_identify(struct cpuinfo_x86 * c) { u32 tfms, xlvl; diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 5f0418d0078c..a52d65440429 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -20,6 +20,7 @@ #include #include #include +#include /* flag for disabling the tsc */ extern int tsc_disable; @@ -112,6 +113,8 @@ extern struct cpuinfo_x86 cpu_data[]; extern int cpu_llc_id[NR_CPUS]; extern char ignore_fpu_irq; +void __init cpu_detect(struct cpuinfo_x86 *c); + extern void identify_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); diff --git a/include/linux/start_kernel.h b/include/linux/start_kernel.h new file mode 100644 index 000000000000..d3e5f2756545 --- /dev/null +++ b/include/linux/start_kernel.h @@ -0,0 +1,12 @@ +#ifndef _LINUX_START_KERNEL_H +#define _LINUX_START_KERNEL_H + +#include +#include + +/* Define the prototype for start_kernel here, rather than cluttering + up something else. 
*/ + +extern asmlinkage void __init start_kernel(void); + +#endif /* _LINUX_START_KERNEL_H */ diff --git a/init/main.c b/init/main.c index 36f608a7cfba..985c9ed86050 100644 --- a/init/main.c +++ b/init/main.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-59-g8ed1b From c9ccf30d77f04064fe5436027ab9d2230c7cdd94 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Add startup infrastructure for paravirtualization 1) Each hypervisor writes a probe function to detect whether we are running under that hypervisor. paravirt_probe() registers this function. 2) If vmlinux is booted with ring != 0, we call all the probe functions (with registers except %esp intact) in link order: the winner will not return. Signed-off-by: Rusty Russell Signed-off-by: Chris Wright Signed-off-by: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Zachary Amsden Signed-off-by: Andrew Morton --- arch/i386/kernel/Makefile | 2 ++ arch/i386/kernel/head.S | 33 +++++++++++++++++++++++++++++++++ arch/i386/kernel/paravirt.c | 4 ++++ arch/i386/kernel/vmlinux.lds.S | 6 ++++++ include/asm-i386/paravirt.h | 5 +++++ 5 files changed, 50 insertions(+) (limited to 'include/asm-i386') diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 406612136049..1e8988e558c5 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -39,6 +39,8 @@ obj-$(CONFIG_VM86) += vm86.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o + +# Make sure this is linked after any other paravirt_ops structs: see head.S obj-$(CONFIG_PARAVIRT) += paravirt.o EXTRA_AFLAGS := -traditional diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index 5b14e95ac8b9..edef5084ce17 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -55,6 +55,12 @@ */ ENTRY(startup_32) +#ifdef CONFIG_PARAVIRT + movl %cs, %eax + testl $0x3, %eax + jnz startup_paravirt +#endif + /* * Set segments to known values. */ @@ -486,6 +492,33 @@ ignore_int: #endif iret +#ifdef CONFIG_PARAVIRT +startup_paravirt: + cld + movl $(init_thread_union+THREAD_SIZE),%esp + + /* We take pains to preserve all the regs. */ + pushl %edx + pushl %ecx + pushl %eax + + /* paravirt.o is last in link, and that probe fn never returns */ + pushl $__start_paravirtprobe +1: + movl 0(%esp), %eax + pushl (%eax) + movl 8(%esp), %eax + call *(%esp) + popl %eax + + movl 4(%esp), %eax + movl 8(%esp), %ecx + movl 12(%esp), %edx + + addl $4, (%esp) + jmp 1b +#endif + /* * Real beginning of normal "text" segment */ diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index d46460426446..5a9bd3250a1a 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -387,6 +388,9 @@ static int __init print_banner(void) } core_initcall(print_banner); +/* We simply declare start_kernel to be the paravirt probe of last resort. */ +paravirt_probe(start_kernel); + struct paravirt_ops paravirt_ops = { .name = "bare hardware", .paravirt_enabled = 0, diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 5c69cf0e5944..877dc5cfe3a8 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -65,6 +65,12 @@ SECTIONS CONSTRUCTORS } :data + __start_paravirtprobe = .; + .paravirtprobe : AT(ADDR(.paravirtprobe) - LOAD_OFFSET) { + *(.paravirtprobe) + } + __stop_paravirtprobe = .; + . 
= ALIGN(4096); .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { __nosave_begin = .; diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index 081194751ade..dd707d8c8270 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -120,6 +120,11 @@ struct paravirt_ops void (fastcall *iret)(void); }; +/* Mark a paravirt probe function. */ +#define paravirt_probe(fn) \ + static asmlinkage void (*__paravirtprobe_##fn)(void) __attribute_used__ \ + __attribute__((__section__(".paravirtprobe"))) = fn + extern struct paravirt_ops paravirt_ops; #define paravirt_enabled() (paravirt_ops.paravirt_enabled) -- cgit v1.2.3-59-g8ed1b From 4f205fd45a5c192907188d6f8f6d7e66db859248 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Allow selected bug checks to be Allow selected bug checks to be skipped by paravirt kernels. The two most important are the F00F workaround (which is either done by the hypervisor, or not required), and the 'hlt' instruction check, which can break under some hypervisors. Signed-off-by: Zachary Amsden Signed-off-by: Chris Wright Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/intel.c | 2 +- include/asm-i386/bugs.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/asm-i386') diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 798c2f617e87..3ae795e9056d 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -107,7 +107,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) * Note that the workaround only should be initialized once... */ c->f00f_bug = 0; - if ( c->x86 == 5 ) { + if (!paravirt_enabled() && c->x86 == 5) { static int f00f_workaround_enabled = 0; c->f00f_bug = 1; diff --git a/include/asm-i386/bugs.h b/include/asm-i386/bugs.h index 592ffeeda45e..38f1aebbbdb5 100644 --- a/include/asm-i386/bugs.h +++ b/include/asm-i386/bugs.h @@ -21,6 +21,7 @@ #include #include #include +#include static int __init no_halt(char *s) { @@ -91,6 +92,9 @@ static void __init check_fpu(void) static void __init check_hlt(void) { + if (paravirt_enabled()) + return; + printk(KERN_INFO "Checking 'hlt' instruction... "); if (!boot_cpu_data.hlt_works_ok) { printk("disabled\n"); -- cgit v1.2.3-59-g8ed1b From 13623d79309dd82e1964458fa017979d16f33fa8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Add APIC accessors to paravirt-ops. Add APIC accessors to paravirt-ops. Unfortunately, we need two write functions, as some older broken hardware requires workarounds for Pentium APIC errata - this is the purpose of apic_write_atomic. 
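To make the indirection concrete, a hypervisor backend only has to supply its own accessors and plug them into paravirt_ops. A minimal sketch with hypothetical hv_* names follows (hv_hypercall() and the HC_APIC_* numbers are illustrative, not part of this patch; the native accessors themselves are in the apic.h hunk below):

/* Sketch of a hypothetical hypervisor APIC backend. */
static fastcall void hv_apic_write(unsigned long reg, unsigned long v)
{
	hv_hypercall(HC_APIC_WRITE, reg, v);	/* hypothetical hypercall */
}

static fastcall unsigned long hv_apic_read(unsigned long reg)
{
	return hv_hypercall(HC_APIC_READ, reg, 0);
}

/* At probe time; a virtual APIC has none of the Pentium errata, so
 * both write hooks can point at the same function. */
paravirt_ops.apic_write = hv_apic_write;
paravirt_ops.apic_write_atomic = hv_apic_write;
paravirt_ops.apic_read = hv_apic_read;
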
AK: replaced __inline with inline Signed-off-by: Zachary Amsden Signed-off-by: Chris Wright Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton --- arch/i386/kernel/paravirt.c | 8 ++++++++ include/asm-i386/apic.h | 15 ++++++++++++--- include/asm-i386/paravirt.h | 27 +++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 3 deletions(-) (limited to 'include/asm-i386') diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 5a9bd3250a1a..fe82eb3adf42 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include /* nop stub */ static void native_nop(void) @@ -446,6 +448,12 @@ struct paravirt_ops paravirt_ops = { .io_delay = native_io_delay, .const_udelay = __const_udelay, +#ifdef CONFIG_X86_LOCAL_APIC + .apic_write = native_apic_write, + .apic_write_atomic = native_apic_write_atomic, + .apic_read = native_apic_read, +#endif + .irq_enable_sysexit = native_irq_enable_sysexit, .iret = native_iret, }; diff --git a/include/asm-i386/apic.h b/include/asm-i386/apic.h index b9529578fc37..41a44319905f 100644 --- a/include/asm-i386/apic.h +++ b/include/asm-i386/apic.h @@ -37,18 +37,27 @@ extern void generic_apic_probe(void); /* * Basic functions accessing APICs. */ +#ifdef CONFIG_PARAVIRT +#include +#else +#define apic_write native_apic_write +#define apic_write_atomic native_apic_write_atomic +#define apic_read native_apic_read +#endif -static __inline void apic_write(unsigned long reg, unsigned long v) +static __inline fastcall void native_apic_write(unsigned long reg, + unsigned long v) { *((volatile unsigned long *)(APIC_BASE+reg)) = v; } -static __inline void apic_write_atomic(unsigned long reg, unsigned long v) +static __inline fastcall void native_apic_write_atomic(unsigned long reg, + unsigned long v) { xchg((volatile unsigned long *)(APIC_BASE+reg), v); } -static __inline unsigned long apic_read(unsigned long reg) +static __inline fastcall unsigned long native_apic_read(unsigned long reg) { return *((volatile unsigned long *)(APIC_BASE+reg)); } diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index dd707d8c8270..e2c803fadb14 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -115,6 +115,12 @@ struct paravirt_ops void (fastcall *io_delay)(void); void (*const_udelay)(unsigned long loops); +#ifdef CONFIG_X86_LOCAL_APIC + void (fastcall *apic_write)(unsigned long reg, unsigned long v); + void (fastcall *apic_write_atomic)(unsigned long reg, unsigned long v); + unsigned long (fastcall *apic_read)(unsigned long reg); +#endif + /* These two are jmp to, not actually called. */ void (fastcall *irq_enable_sysexit)(void); void (fastcall *iret)(void); @@ -270,6 +276,27 @@ static inline void slow_down_io(void) { #endif } +#ifdef CONFIG_X86_LOCAL_APIC +/* + * Basic functions accessing APICs. + */ +static inline void apic_write(unsigned long reg, unsigned long v) +{ + paravirt_ops.apic_write(reg,v); +} + +static inline void apic_write_atomic(unsigned long reg, unsigned long v) +{ + paravirt_ops.apic_write_atomic(reg,v); +} + +static inline unsigned long apic_read(unsigned long reg) +{ + return paravirt_ops.apic_read(reg); +} +#endif + + /* These all sit in the .parainstructions section to tell us what to patch. 
*/ struct paravirt_patch { u8 *instr; /* original instructions */ -- cgit v1.2.3-59-g8ed1b From da181a8b3916aa7f2e3c5775d2bd2fe3454cf82d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Add MMU virtualization to paravirt_ops Add the three bare TLB accessor functions to paravirt-ops. Most amusingly, flush_tlb is redefined on SMP, so I can't call the paravirt op flush_tlb. Instead, I chose to indicate the actual flush type, kernel (global) vs. user (non-global). Global in this sense means using the global bit in the page table entry, which makes TLB entries persistent across CR3 reloads, not global as in the SMP sense of invoking remote shootdowns, so the term is confusingly overloaded. AK: folded in fix from Zach for PAE compilation Signed-off-by: Zachary Amsden Signed-off-by: Chris Wright Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton --- arch/i386/kernel/paravirt.c | 109 ++++++++++++++++++++++++++++++++++++++ arch/i386/mm/boot_ioremap.c | 1 + include/asm-i386/paravirt.h | 75 ++++++++++++++++++++++++++ include/asm-i386/pgtable-2level.h | 5 +- include/asm-i386/pgtable-3level.h | 40 +++++++------- include/asm-i386/pgtable.h | 4 +- include/asm-i386/tlbflush.h | 18 +++++-- 7 files changed, 226 insertions(+), 26 deletions(-) (limited to 'include/asm-i386') diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index fe82eb3adf42..3dceab5828f1 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -31,6 +31,7 @@ #include #include #include +#include /* nop stub */ static void native_nop(void) @@ -379,6 +380,97 @@ static fastcall void native_io_delay(void) asm volatile("outb %al,$0x80"); } +static fastcall void native_flush_tlb(void) +{ + __native_flush_tlb(); +} + +/* + * Global pages have to be flushed a bit differently. Not a real + * performance problem because this does not happen often. 
+ */ +static fastcall void native_flush_tlb_global(void) +{ + __native_flush_tlb_global(); +} + +static fastcall void native_flush_tlb_single(u32 addr) +{ + __native_flush_tlb_single(addr); +} + +#ifndef CONFIG_X86_PAE +static fastcall void native_set_pte(pte_t *ptep, pte_t pteval) +{ + *ptep = pteval; +} + +static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) +{ + *ptep = pteval; +} + +static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ + *pmdp = pmdval; +} + +#else /* CONFIG_X86_PAE */ + +static fastcall void native_set_pte(pte_t *ptep, pte_t pte) +{ + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} + +static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) +{ + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} + +static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +{ + ptep->pte_low = 0; + smp_wmb(); + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} + +static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval) +{ + set_64bit((unsigned long long *)ptep,pte_val(pteval)); +} + +static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ + set_64bit((unsigned long long *)pmdp,pmd_val(pmdval)); +} + +static fastcall void native_set_pud(pud_t *pudp, pud_t pudval) +{ + *pudp = pudval; +} + +static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + ptep->pte_low = 0; + smp_wmb(); + ptep->pte_high = 0; +} + +static fastcall void native_pmd_clear(pmd_t *pmd) +{ + u32 *tmp = (u32 *)pmd; + *tmp = 0; + smp_wmb(); + *(tmp + 1) = 0; +} +#endif /* CONFIG_X86_PAE */ + /* These are in entry.S */ extern fastcall void native_iret(void); extern fastcall void native_irq_enable_sysexit(void); @@ -454,6 +546,23 @@ struct paravirt_ops paravirt_ops = { .apic_read = native_apic_read, #endif + .flush_tlb_user = native_flush_tlb, + .flush_tlb_kernel = native_flush_tlb_global, + .flush_tlb_single = native_flush_tlb_single, + + .set_pte = native_set_pte, + .set_pte_at = native_set_pte_at, + .set_pmd = native_set_pmd, + .pte_update = (void *)native_nop, + .pte_update_defer = (void *)native_nop, +#ifdef CONFIG_X86_PAE + .set_pte_atomic = native_set_pte_atomic, + .set_pte_present = native_set_pte_present, + .set_pud = native_set_pud, + .pte_clear = native_pte_clear, + .pmd_clear = native_pmd_clear, +#endif + .irq_enable_sysexit = native_irq_enable_sysexit, .iret = native_iret, }; diff --git a/arch/i386/mm/boot_ioremap.c b/arch/i386/mm/boot_ioremap.c index 4de11f508c3a..4de95a17a7d4 100644 --- a/arch/i386/mm/boot_ioremap.c +++ b/arch/i386/mm/boot_ioremap.c @@ -16,6 +16,7 @@ */ #undef CONFIG_X86_PAE +#undef CONFIG_PARAVIRT #include #include #include diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index e2c803fadb14..9f06265065f4 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -4,6 +4,7 @@ * para-virtualization: those hooks are defined here. 
*/ #include #include +#include #ifdef CONFIG_PARAVIRT /* These are the most performance critical ops, so we want to be able to patch @@ -27,6 +28,7 @@ struct thread_struct; struct Xgt_desc_struct; struct tss_struct; +struct mm_struct; struct paravirt_ops { unsigned int kernel_rpl; @@ -121,6 +123,23 @@ struct paravirt_ops unsigned long (fastcall *apic_read)(unsigned long reg); #endif + void (fastcall *flush_tlb_user)(void); + void (fastcall *flush_tlb_kernel)(void); + void (fastcall *flush_tlb_single)(u32 addr); + + void (fastcall *set_pte)(pte_t *ptep, pte_t pteval); + void (fastcall *set_pte_at)(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval); + void (fastcall *set_pmd)(pmd_t *pmdp, pmd_t pmdval); + void (fastcall *pte_update)(struct mm_struct *mm, u32 addr, pte_t *ptep); + void (fastcall *pte_update_defer)(struct mm_struct *mm, u32 addr, pte_t *ptep); +#ifdef CONFIG_X86_PAE + void (fastcall *set_pte_atomic)(pte_t *ptep, pte_t pteval); + void (fastcall *set_pte_present)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); + void (fastcall *set_pud)(pud_t *pudp, pud_t pudval); + void (fastcall *pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); + void (fastcall *pmd_clear)(pmd_t *pmdp); +#endif + /* These two are jmp to, not actually called. */ void (fastcall *irq_enable_sysexit)(void); void (fastcall *iret)(void); @@ -297,6 +316,62 @@ static inline unsigned long apic_read(unsigned long reg) #endif +#define __flush_tlb() paravirt_ops.flush_tlb_user() +#define __flush_tlb_global() paravirt_ops.flush_tlb_kernel() +#define __flush_tlb_single(addr) paravirt_ops.flush_tlb_single(addr) + +static inline void set_pte(pte_t *ptep, pte_t pteval) +{ + paravirt_ops.set_pte(ptep, pteval); +} + +static inline void set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) +{ + paravirt_ops.set_pte_at(mm, addr, ptep, pteval); +} + +static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ + paravirt_ops.set_pmd(pmdp, pmdval); +} + +static inline void pte_update(struct mm_struct *mm, u32 addr, pte_t *ptep) +{ + paravirt_ops.pte_update(mm, addr, ptep); +} + +static inline void pte_update_defer(struct mm_struct *mm, u32 addr, pte_t *ptep) +{ + paravirt_ops.pte_update_defer(mm, addr, ptep); +} + +#ifdef CONFIG_X86_PAE +static inline void set_pte_atomic(pte_t *ptep, pte_t pteval) +{ + paravirt_ops.set_pte_atomic(ptep, pteval); +} + +static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +{ + paravirt_ops.set_pte_present(mm, addr, ptep, pte); +} + +static inline void set_pud(pud_t *pudp, pud_t pudval) +{ + paravirt_ops.set_pud(pudp, pudval); +} + +static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + paravirt_ops.pte_clear(mm, addr, ptep); +} + +static inline void pmd_clear(pmd_t *pmdp) +{ + paravirt_ops.pmd_clear(pmdp); +} +#endif + /* These all sit in the .parainstructions section to tell us what to patch. */ struct paravirt_patch { u8 *instr; /* original instructions */ diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h index 8d8d3b9ecdb0..04d6186abc22 100644 --- a/include/asm-i386/pgtable-2level.h +++ b/include/asm-i386/pgtable-2level.h @@ -13,11 +13,14 @@ * within a page table are directly modified. Thus, the following * hook is made available. 
*/ +#ifndef CONFIG_PARAVIRT #define set_pte(pteptr, pteval) (*(pteptr) = pteval) #define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) +#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) +#endif + #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval) #define set_pte_present(mm,addr,ptep,pteval) set_pte_at(mm,addr,ptep,pteval) -#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) #define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h index c2d701ea35be..2a6e67db8bc3 100644 --- a/include/asm-i386/pgtable-3level.h +++ b/include/asm-i386/pgtable-3level.h @@ -44,6 +44,7 @@ static inline int pte_exec_kernel(pte_t pte) return pte_x(pte); } +#ifndef CONFIG_PARAVIRT /* Rules for using set_pte: the pte being assigned *must* be * either not present or in a state where the hardware will * not attempt to update the pte. In places where this is @@ -80,25 +81,6 @@ static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, pte #define set_pud(pudptr,pudval) \ (*(pudptr) = (pudval)) -/* - * Pentium-II erratum A13: in PAE mode we explicitly have to flush - * the TLB via cr3 if the top-level pgd is changed... - * We do not let the generic code free and clear pgd entries due to - * this erratum. - */ -static inline void pud_clear (pud_t * pud) { } - -#define pud_page(pud) \ -((struct page *) __va(pud_val(pud) & PAGE_MASK)) - -#define pud_page_vaddr(pud) \ -((unsigned long) __va(pud_val(pud) & PAGE_MASK)) - - -/* Find an entry in the second-level page table.. */ -#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \ - pmd_index(address)) - /* * For PTEs and PDEs, we must clear the P-bit first when clearing a page table * entry, so clear the bottom half first and enforce ordering with a compiler @@ -118,6 +100,26 @@ static inline void pmd_clear(pmd_t *pmd) smp_wmb(); *(tmp + 1) = 0; } +#endif + +/* + * Pentium-II erratum A13: in PAE mode we explicitly have to flush + * the TLB via cr3 if the top-level pgd is changed... + * We do not let the generic code free and clear pgd entries due to + * this erratum. + */ +static inline void pud_clear (pud_t * pud) { } + +#define pud_page(pud) \ +((struct page *) __va(pud_val(pud) & PAGE_MASK)) + +#define pud_page_vaddr(pud) \ +((unsigned long) __va(pud_val(pud) & PAGE_MASK)) + + +/* Find an entry in the second-level page table.. */ +#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \ + pmd_index(address)) #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 7d398f493dde..efd7d90789d0 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -15,6 +15,7 @@ #include #include #include +#include #ifndef _I386_BITOPS_H #include @@ -246,6 +247,7 @@ static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return p # include #endif +#ifndef CONFIG_PARAVIRT /* * Rules for using pte_update - it must be called after any PTE update which * has not been done using the set_pte / clear_pte interfaces. 
It is used by @@ -261,7 +263,7 @@ static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return p */ #define pte_update(mm, addr, ptep) do { } while (0) #define pte_update_defer(mm, addr, ptep) do { } while (0) - +#endif /* * We only update the dirty/accessed state if we set diff --git a/include/asm-i386/tlbflush.h b/include/asm-i386/tlbflush.h index 360648b0f2b3..4dd82840d53b 100644 --- a/include/asm-i386/tlbflush.h +++ b/include/asm-i386/tlbflush.h @@ -4,7 +4,15 @@ #include #include -#define __flush_tlb() \ +#ifdef CONFIG_PARAVIRT +#include +#else +#define __flush_tlb() __native_flush_tlb() +#define __flush_tlb_global() __native_flush_tlb_global() +#define __flush_tlb_single(addr) __native_flush_tlb_single(addr) +#endif + +#define __native_flush_tlb() \ do { \ unsigned int tmpreg; \ \ @@ -19,7 +27,7 @@ * Global pages have to be flushed a bit differently. Not a real * performance problem because this does not happen often. */ -#define __flush_tlb_global() \ +#define __native_flush_tlb_global() \ do { \ unsigned int tmpreg, cr4, cr4_orig; \ \ @@ -36,6 +44,9 @@ : "memory"); \ } while (0) +#define __native_flush_tlb_single(addr) \ + __asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory") + # define __flush_tlb_all() \ do { \ if (cpu_has_pge) \ @@ -46,9 +57,6 @@ #define cpu_has_invlpg (boot_cpu_data.x86 > 3) -#define __flush_tlb_single(addr) \ - __asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory") - #ifdef CONFIG_X86_INVLPG # define __flush_tlb_one(addr) __flush_tlb_single(addr) #else -- cgit v1.2.3-59-g8ed1b From a2952d8949bb0b37c1be92a89c4f180c74292857 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Preparatory mmu header movement Move header includes for the nopud / nopmd types to the location of the actual pte / pgd type definitions. This allows generic 4-level page type code to be written before the split 2/3 level page table headers are included. 
Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Chris Wright Signed-off-by: Andrew Morton --- include/asm-i386/page.h | 2 ++ include/asm-i386/pgtable-2level.h | 2 -- include/asm-i386/pgtable-3level.h | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index 2b69686107ae..fd3f64ace248 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -52,6 +52,7 @@ typedef struct { unsigned long long pgprot; } pgprot_t; #define pte_val(x) ((x).pte_low | ((unsigned long long)(x).pte_high << 32)) #define __pmd(x) ((pmd_t) { (x) } ) #define HPAGE_SHIFT 21 +#include #else typedef struct { unsigned long pte_low; } pte_t; typedef struct { unsigned long pgd; } pgd_t; @@ -59,6 +60,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define boot_pte_t pte_t /* or would you rather have a typedef */ #define pte_val(x) ((x).pte_low) #define HPAGE_SHIFT 22 +#include #endif #define PTE_MASK PAGE_MASK diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h index 04d6186abc22..c08cbc4c2d8b 100644 --- a/include/asm-i386/pgtable-2level.h +++ b/include/asm-i386/pgtable-2level.h @@ -1,8 +1,6 @@ #ifndef _I386_PGTABLE_2LEVEL_H #define _I386_PGTABLE_2LEVEL_H -#include - #define pte_ERROR(e) \ printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low) #define pgd_ERROR(e) \ diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h index 2a6e67db8bc3..54287f148c74 100644 --- a/include/asm-i386/pgtable-3level.h +++ b/include/asm-i386/pgtable-3level.h @@ -1,8 +1,6 @@ #ifndef _I386_PGTABLE_3LEVEL_H #define _I386_PGTABLE_3LEVEL_H -#include - /* * Intel Physical Address Extension (PAE) Mode - three-level page * tables on PPro+ CPUs. -- cgit v1.2.3-59-g8ed1b From dfbea0ad50e08c52539bddce977b07f77a762ba4 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: fix parameter names in mmu operations Make parameter names match function argument names for the yet to be defined pte_update_defer accessor. 
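The hazard being removed is ordinary macro-argument mismatch: the bodies passed (addr) where the macro parameter is named address. That is harmless while pte_update_defer() discards its arguments, but it would silently bind to any addr in the caller's scope (or break the build) once the hook evaluates them. A standalone illustration with hypothetical names:

#define BUMP(counter) do { (count)++; } while (0)	/* body says count, parameter is counter */

void demo(void)
{
	int count = 0, hits = 0;
	BUMP(hits);	/* compiles, but increments count instead of hits */
}
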
Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Chris Wright Signed-off-by: Andrew Morton --- include/asm-i386/pgtable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index efd7d90789d0..04dd39b973ad 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -277,7 +277,7 @@ static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return p do { \ if (dirty) { \ (ptep)->pte_low = (entry).pte_low; \ - pte_update_defer((vma)->vm_mm, (addr), (ptep)); \ + pte_update_defer((vma)->vm_mm, (address), (ptep)); \ flush_tlb_page(vma, address); \ } \ } while (0) @@ -307,7 +307,7 @@ do { \ __dirty = pte_dirty(*(ptep)); \ if (__dirty) { \ clear_bit(_PAGE_BIT_DIRTY, &(ptep)->pte_low); \ - pte_update_defer((vma)->vm_mm, (addr), (ptep)); \ + pte_update_defer((vma)->vm_mm, (address), (ptep)); \ flush_tlb_page(vma, address); \ } \ __dirty; \ @@ -320,7 +320,7 @@ do { \ __young = pte_young(*(ptep)); \ if (__young) { \ clear_bit(_PAGE_BIT_ACCESSED, &(ptep)->pte_low); \ - pte_update_defer((vma)->vm_mm, (addr), (ptep)); \ + pte_update_defer((vma)->vm_mm, (address), (ptep)); \ flush_tlb_page(vma, address); \ } \ __young; \ -- cgit v1.2.3-59-g8ed1b From 8ecb8950695e907ed25acffec9e98c6806e311c8 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 7 Dec 2006 02:14:09 +0100 Subject: [PATCH] paravirt: fix missing pte update The function ptep_get_and_clear uses an atomic instruction sequence to get and clear an active pte. Rather than add such an atomic operator to all virtual machine implementations in paravirt-ops, it is easier to support the raw atomic sequence and use either a trapping writable pagetable approach, or a post-update notification. For the post update notification, we require the pte_update function to be called after the access. Combine the 2-level and 3-level paging operators into one common function which does the post-update notification, and rename the actual atomic sequences to raw_ptep_xxx operators. 
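The payoff is that a hypervisor backend never has to reimplement the lock-prefixed sequence itself; it only hooks the notification that follows. A minimal sketch of such a backend, purely illustrative (the hook signature follows pte_update above; the queuing call is a hypothetical hypercall):

    /* hypothetical paravirt backend: runs after the native atomic
     * get-and-clear has already modified the pte */
    static void shadow_pte_update(struct mm_struct *mm, unsigned long addr,
                                  pte_t *ptep)
    {
            hypervisor_queue_invalidate(mm, addr);  /* hypothetical hypercall */
    }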
Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Chris Wright Signed-off-by: Andrew Morton --- include/asm-i386/pgtable-2level.h | 3 +-- include/asm-i386/pgtable-3level.h | 3 +-- include/asm-i386/pgtable.h | 8 ++++++++ 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h index c08cbc4c2d8b..38c3fcc0676d 100644 --- a/include/asm-i386/pgtable-2level.h +++ b/include/asm-i386/pgtable-2level.h @@ -23,8 +23,7 @@ #define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR -#define ptep_get_and_clear(mm,addr,xp) __pte(xchg(&(xp)->pte_low, 0)) +#define raw_ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte_low, 0)) #define pte_page(x) pfn_to_page(pte_pfn(x)) #define pte_none(x) (!(x).pte_low) diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h index 54287f148c74..7a2318f38303 100644 --- a/include/asm-i386/pgtable-3level.h +++ b/include/asm-i386/pgtable-3level.h @@ -119,8 +119,7 @@ static inline void pud_clear (pud_t * pud) { } #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \ pmd_index(address)) -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +static inline pte_t raw_ptep_get_and_clear(pte_t *ptep) { pte_t res; diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 04dd39b973ad..b4a301f647ba 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -326,6 +326,14 @@ do { \ __young; \ }) +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + pte_t pte = raw_ptep_get_and_clear(ptep); + pte_update(mm, addr, ptep); + return pte; +} + #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full) { -- cgit v1.2.3-59-g8ed1b From c55d92d141b9c40c67db249de91f5c224eb49859 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:09 +0100 Subject: [PATCH] i386: Add support for compilation for Core2 gcc doesn't support -mtune=core2 yet, but will soon. Use -mtune=generic or -mtune=i686 as a fallback. TBD: INTEL_USERCOPY and friends still need benchmarking; so far the same defaults as MPENTIUMM are used. Signed-off-by: Andi Kleen --- arch/i386/Kconfig.cpu | 19 +++++++++++++------ arch/i386/Makefile.cpu | 1 + include/asm-i386/module.h | 2 ++ 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include/asm-i386') diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu index fc4f2abccf06..821fd269ca58 100644 --- a/arch/i386/Kconfig.cpu +++ b/arch/i386/Kconfig.cpu @@ -103,8 +103,15 @@ config MPENTIUMM Select this for Intel Pentium M (not Pentium-4 M) notebook chips. +config MCORE2 + bool "Core 2/newer Xeon" + help + Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and 53xx) + CPUs. You can distinguish newer from older Xeons by the CPU family + in /proc/cpuinfo. Newer ones have 6. + config MPENTIUM4 - bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/Xeon" + bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon" help Select this for Intel Pentium 4 chips.
This includes the Pentium 4, P4-based Celeron and Xeon, and Pentium-4 M @@ -229,7 +236,7 @@ config X86_L1_CACHE_SHIFT default "7" if MPENTIUM4 || X86_GENERIC default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX - default "6" if MK7 || MK8 || MPENTIUMM + default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 config RWSEM_GENERIC_SPINLOCK bool @@ -287,17 +294,17 @@ config X86_ALIGNMENT_16 config X86_GOOD_APIC bool - depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON + depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 default y config X86_INTEL_USERCOPY bool - depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON + depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 default y config X86_USE_PPRO_CHECKSUM bool - depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX + depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 default y config X86_USE_3DNOW @@ -312,5 +319,5 @@ config X86_OOSTORE config X86_TSC bool - depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX) && !X86_NUMAQ + depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ default y diff --git a/arch/i386/Makefile.cpu b/arch/i386/Makefile.cpu index a11befba26d5..a32c031c90d7 100644 --- a/arch/i386/Makefile.cpu +++ b/arch/i386/Makefile.cpu @@ -32,6 +32,7 @@ cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586) cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) +cflags-$(CONFIG_MCORE2) += -march=i686 $(call cc-option,-mtune=core2,$(call cc-option,-mtune=generic,-mtune=i686)) # AMD Elan support cflags-$(CONFIG_X86_ELAN) += -march=i486 diff --git a/include/asm-i386/module.h b/include/asm-i386/module.h index 424661d25bd3..fe5ae42e0273 100644 --- a/include/asm-i386/module.h +++ b/include/asm-i386/module.h @@ -20,6 +20,8 @@ struct mod_arch_specific #define MODULE_PROC_FAMILY "586TSC " #elif defined CONFIG_M586MMX #define MODULE_PROC_FAMILY "586MMX " +#elif defined CONFIG_MCORE2 +#define MODULE_PROC_FAMILY "CORE2 " #elif defined CONFIG_M686 #define MODULE_PROC_FAMILY "686 " #elif defined CONFIG_MPENTIUMII -- cgit v1.2.3-59-g8ed1b From fd6d7d26897dec834d0b9fbdc59819b0332a1257 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Thu, 7 Dec 2006 02:14:10 +0100 Subject: [PATCH] i386: introduce the mechanism of disabling cpu hotplug control Add 
'enable_cpu_hotplug' flag; when it is cleared, the hotplug control file ("online") will not be added under /sys/devices/system/cpu/cpuX/. The next patch, which adds PCI quirks, will use this. Signed-off-by: Suresh Siddha Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: "Li, Shaohua" Signed-off-by: Andrew Morton --- arch/i386/kernel/topology.c | 6 +++++- include/asm-i386/cpu.h | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/asm-i386') diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c index 07d6da36a825..844c08fdb225 100644 --- a/arch/i386/kernel/topology.c +++ b/arch/i386/kernel/topology.c @@ -40,14 +40,18 @@ int arch_register_cpu(int num) * restrictions and assumptions in kernel. This basically * doesnt add a control file, one cannot attempt to offline * BSP. + * + * Also certain PCI quirks require not to enable hotplug control + * for all CPUs. */ - if (!num) + if (!num || !enable_cpu_hotplug) cpu_devices[num].cpu.no_control = 1; return register_cpu(&cpu_devices[num].cpu, num); } #ifdef CONFIG_HOTPLUG_CPU +int enable_cpu_hotplug = 1; void arch_unregister_cpu(int num) { return unregister_cpu(&cpu_devices[num].cpu); diff --git a/include/asm-i386/cpu.h b/include/asm-i386/cpu.h index b1bc7b1b64b0..9d914e1e4aad 100644 --- a/include/asm-i386/cpu.h +++ b/include/asm-i386/cpu.h @@ -13,6 +13,9 @@ struct i386_cpu { extern int arch_register_cpu(int num); #ifdef CONFIG_HOTPLUG_CPU extern void arch_unregister_cpu(int); +extern int enable_cpu_hotplug; +#else +#define enable_cpu_hotplug 0 #endif DECLARE_PER_CPU(int, cpu_state); -- cgit v1.2.3-59-g8ed1b From b0d0a4ba45760b10ecee9035ed45b442c1a6cc84 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Thu, 7 Dec 2006 02:14:10 +0100 Subject: [PATCH] x86: fix the irqbalance quirk for E7320/E7520/E7525 Move the irqbalance quirks for E7320/E7520/E7525 (Errata 23 in http://download.intel.com/design/chipsets/specupdt/30304203.pdf) to early quirks. And add a PCI quirk for these platforms to check (which happens very late during boot) if the APIC routing is indeed set to default flat mode. This fixes the breakage (in x86_64) of this quirk due to cpu hotplug, which selects physical mode instead of the logical flat mode (as needed for this errata workaround).
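What makes the early call possible is the switch, visible in the diff below, from pci_read_config_byte(), which needs a struct pci_dev and therefore a fully initialized PCI subsystem, to read_pci_config_byte(), which drives type-1 configuration cycles directly through I/O ports 0xCF8/0xCFC. The mechanism, sketched here for illustration (not the kernel's exact helper):

    /* type-1 PCI config read: address register at 0xCF8, data at 0xCFC */
    static u32 early_pci_read32(u8 bus, u8 slot, u8 func, u8 offset)
    {
            outl(0x80000000 | (bus << 16) | (slot << 11) | (func << 8)
                 | (offset & ~3), 0xcf8);
            return inl(0xcfc);
    }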
Signed-off-by: Suresh Siddha Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: "Li, Shaohua" Signed-off-by: Andrew Morton --- arch/i386/kernel/acpi/earlyquirk.c | 21 +++++++++++++++++ arch/i386/kernel/quirks.c | 46 +++++++++++++++++++++++++++++--------- arch/i386/kernel/smpboot.c | 7 ++++++ arch/x86_64/kernel/early-quirks.c | 13 +++++++++++ arch/x86_64/kernel/smpboot.c | 8 +++++++ include/asm-i386/genapic.h | 2 +- include/asm-i386/irq.h | 2 ++ include/asm-x86_64/proto.h | 1 + 8 files changed, 88 insertions(+), 12 deletions(-) (limited to 'include/asm-i386') diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c index c9841692bb7c..4b60af7f91dd 100644 --- a/arch/i386/kernel/acpi/earlyquirk.c +++ b/arch/i386/kernel/acpi/earlyquirk.c @@ -10,6 +10,7 @@ #include #include #include +#include #ifdef CONFIG_ACPI @@ -49,6 +50,24 @@ static int __init check_bridge(int vendor, int device) return 0; } +static void check_intel(void) +{ + u16 vendor, device; + + vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID); + + if (vendor != PCI_VENDOR_ID_INTEL) + return; + + device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID); +#ifdef CONFIG_SMP + if (device == PCI_DEVICE_ID_INTEL_E7320_MCH || + device == PCI_DEVICE_ID_INTEL_E7520_MCH || + device == PCI_DEVICE_ID_INTEL_E7525_MCH) + quirk_intel_irqbalance(); +#endif +} + void __init check_acpi_pci(void) { int num, slot, func; @@ -60,6 +79,8 @@ void __init check_acpi_pci(void) if (!early_pci_allowed()) return; + check_intel(); + /* Poor man's PCI discovery */ for (num = 0; num < 32; num++) { for (slot = 0; slot < 32; slot++) { diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c index 9f6ab1789bb0..a01320a7b636 100644 --- a/arch/i386/kernel/quirks.c +++ b/arch/i386/kernel/quirks.c @@ -3,10 +3,23 @@ */ #include #include +#include +#include +#include #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) +static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) +{ +#ifdef CONFIG_X86_64 + if (genapic != &apic_flat) + panic("APIC mode must be flat on this system\n"); +#elif defined(CONFIG_X86_GENERICARCH) + if (genapic != &apic_default) + panic("APIC mode must be default(flat) on this system. Use apic=default\n"); +#endif +} -static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) +void __init quirk_intel_irqbalance(void) { u8 config, rev; u32 word; @@ -16,18 +29,18 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) * based platforms. * Disable SW irqbalance/affinity on those platforms. 
*/ - pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); + rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION); if (rev > 0x9) return; printk(KERN_INFO "Intel E7520/7320/7525 detected."); - /* enable access to config space*/ - pci_read_config_byte(dev, 0xf4, &config); - pci_write_config_byte(dev, 0xf4, config|0x2); + /* enable access to config space */ + config = read_pci_config_byte(0, 0, 0, 0xf4); + write_pci_config_byte(0, 0, 0, 0xf4, config|0x2); /* read xTPR register */ - raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); + word = read_pci_config_16(0, 0, 0x40, 0x4c); if (!(word & (1 << 13))) { printk(KERN_INFO "Disabling irq balancing and affinity\n"); @@ -37,14 +50,25 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) noirqdebug_setup(""); #ifdef CONFIG_PROC_FS no_irq_affinity = 1; +#endif +#ifdef CONFIG_HOTPLUG_CPU + printk(KERN_INFO "Disabling cpu hotplug control\n"); + enable_cpu_hotplug = 0; +#endif +#ifdef CONFIG_X86_64 + /* force the genapic selection to flat mode so that + * interrupts can be redirected to more than one CPU. + */ + genapic_force = &apic_flat; #endif } - /* put back the original value for config space*/ + /* put back the original value for config space */ if (!(config & 0x2)) - pci_write_config_byte(dev, 0xf4, config); + write_pci_config_byte(0, 0, 0, 0xf4, config); } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance); + #endif diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index cd7de9c9654b..346f27f4c79f 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -1482,6 +1483,12 @@ int __devinit __cpu_up(unsigned int cpu) cpu_set(cpu, smp_commenced_mask); while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); + +#ifdef CONFIG_X86_GENERICARCH + if (num_online_cpus() > 8 && genapic == &apic_default) + panic("Default flat APIC routing can't be used with > 8 cpus\n"); +#endif + return 0; } diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c index fb0c6da41b7e..829698f6d049 100644 --- a/arch/x86_64/kernel/early-quirks.c +++ b/arch/x86_64/kernel/early-quirks.c @@ -71,6 +71,18 @@ static void ati_bugs(void) { } +static void intel_bugs(void) +{ + u16 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID); + +#ifdef CONFIG_SMP + if (device == PCI_DEVICE_ID_INTEL_E7320_MCH || + device == PCI_DEVICE_ID_INTEL_E7520_MCH || + device == PCI_DEVICE_ID_INTEL_E7525_MCH) + quirk_intel_irqbalance(); +#endif +} + struct chipset { u16 vendor; void (*f)(void); @@ -80,6 +92,7 @@ static struct chipset early_qrk[] = { { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, { PCI_VENDOR_ID_VIA, via_bugs }, { PCI_VENDOR_ID_ATI, ati_bugs }, + { PCI_VENDOR_ID_INTEL, intel_bugs}, {} }; diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 62c2e747af58..4c161c208d5b 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -60,6 +60,7 @@ 
#include #include #include +#include /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -1167,6 +1168,13 @@ int __cpuinit __cpu_up(unsigned int cpu) while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); + + if (num_online_cpus() > 8 && genapic == &apic_flat) { + printk(KERN_WARNING + "flat APIC routing can't be used with > 8 cpus\n"); + BUG(); + } + err = 0; return err; diff --git a/include/asm-i386/genapic.h b/include/asm-i386/genapic.h index 8ffbb0f07457..fd2be593b06e 100644 --- a/include/asm-i386/genapic.h +++ b/include/asm-i386/genapic.h @@ -122,6 +122,6 @@ struct genapic { APICFUNC(phys_pkg_id) \ } -extern struct genapic *genapic; +extern struct genapic *genapic, apic_default; #endif diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h index 9e15ce0006eb..11761cdaae19 100644 --- a/include/asm-i386/irq.h +++ b/include/asm-i386/irq.h @@ -37,6 +37,8 @@ static __inline__ int irq_canonicalize(int irq) extern int irqbalance_disable(char *str); #endif +extern void quirk_intel_irqbalance(void); + #ifdef CONFIG_HOTPLUG_CPU extern void fixup_irqs(cpumask_t map); #endif diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index 21fa5afa232e..6d324b838972 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -87,6 +87,7 @@ extern void syscall32_cpu_init(void); extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end); extern void early_quirks(void); +extern void quirk_intel_irqbalance(void); extern void check_efer(void); extern int unhandled_signal(struct task_struct *tsk, int sig); -- cgit v1.2.3-59-g8ed1b From 538f188e03c821c93b355c9fc346806cdd34e286 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 7 Dec 2006 02:14:11 +0100 Subject: [PATCH] i386: i386 add Intel BTS cpufeature bit and detection (take 2) Here is a small patch for i386 which adds a cpufeature flag and detection code for Intel's Branch Trace Store (BTS) feature. This feature can be found on Intel P4 and Core 2 processors among others. It can also be used by perfmon. 
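Consumers just test the new flag; for illustration (the setup helper below is hypothetical):

    if (cpu_has_bts)
            setup_bts_area();       /* hypothetical: arm the BTS save area */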
changelog: - add X86_FEATURE_BTS - add Branch Trace Store detection signed-off-by: stephane eranian Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/intel.c | 2 ++ include/asm-i386/cpufeature.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/asm-i386') diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 3ae795e9056d..56fe26584957 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -199,6 +199,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (cpu_has_ds) { unsigned int l1; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); + if (!(l1 & (1<<11))) + set_bit(X86_FEATURE_BTS, c->x86_capability); if (!(l1 & (1<<12))) set_bit(X86_FEATURE_PEBS, c->x86_capability); } diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h index 4c83e059228f..3f92b94e0d75 100644 --- a/include/asm-i386/cpufeature.h +++ b/include/asm-i386/cpufeature.h @@ -74,6 +74,7 @@ #define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* FXSAVE leaks FOP/FIP/FOP */ #define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */ #define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */ +#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* Streaming SIMD Extensions-3 */ @@ -138,6 +139,7 @@ #define cpu_has_ds boot_cpu_has(X86_FEATURE_DS) #define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS) #define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLSH) +#define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS) #endif /* __ASM_I386_CPUFEATURE_H */ -- cgit v1.2.3-59-g8ed1b From a1a70c25bed75ed36ed48bbe18b9029428d2452d Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 7 Dec 2006 02:14:12 +0100 Subject: [PATCH] i386: always enable regparm -mregparm=3 has been enabled by default for some time on i386, and AFAIK there aren't any problems with it left. This patch removes the REGPARM config option and sets -mregparm=3 unconditionally. Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen --- Documentation/stable_api_nonsense.txt | 3 --- arch/i386/Kconfig | 14 -------------- arch/i386/Makefile | 4 +--- include/asm-i386/module.h | 8 +------- 4 files changed, 2 insertions(+), 27 deletions(-) (limited to 'include/asm-i386') diff --git a/Documentation/stable_api_nonsense.txt b/Documentation/stable_api_nonsense.txt index f39c9d714db3..a2afca3b2bab 100644 --- a/Documentation/stable_api_nonsense.txt +++ b/Documentation/stable_api_nonsense.txt @@ -62,9 +62,6 @@ consider the following facts about the Linux kernel: - different structures can contain different fields - Some functions may not be implemented at all, (i.e. some locks compile away to nothing for non-SMP builds.) - - Parameter passing of variables from function to function can be - done in different ways (the CONFIG_REGPARM option controls - this.) - Memory within the kernel can be aligned in different ways, depending on the build options. - Linux runs on a wide range of different processor architectures. diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index bb1fa061c6cf..b6b2df40ca78 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -721,20 +721,6 @@ config BOOT_IOREMAP depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI)) default y -config REGPARM - bool "Use register arguments" - default y - help - Compile the kernel with -mregparm=3.
This instructs gcc to use - a more efficient function call ABI which passes the first three - arguments of a function call via registers, which results in denser - and faster code. - - If this option is disabled, then the default ABI of passing - arguments via the stack is used. - - If unsure, say Y. - config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" depends on PROC_FS diff --git a/arch/i386/Makefile b/arch/i386/Makefile index d1aca52bf690..f7ac1aea1d8a 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -31,7 +31,7 @@ LDFLAGS_vmlinux := --emit-relocs endif CHECKFLAGS += -D__i386__ -CFLAGS += -pipe -msoft-float +CFLAGS += -pipe -msoft-float -mregparm=3 # prevent gcc from keeping the stack 16 byte aligned CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) @@ -39,8 +39,6 @@ CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) # CPU-specific tuning. Anything which can be shared with UML should go here. include $(srctree)/arch/i386/Makefile.cpu -cflags-$(CONFIG_REGPARM) += -mregparm=3 - # temporary until string.h is fixed cflags-y += -ffreestanding diff --git a/include/asm-i386/module.h b/include/asm-i386/module.h index fe5ae42e0273..02f8f541cbe0 100644 --- a/include/asm-i386/module.h +++ b/include/asm-i386/module.h @@ -62,18 +62,12 @@ struct mod_arch_specific #error unknown processor family #endif -#ifdef CONFIG_REGPARM -#define MODULE_REGPARM "REGPARM " -#else -#define MODULE_REGPARM "" -#endif - #ifdef CONFIG_4KSTACKS #define MODULE_STACKSIZE "4KSTACKS " #else #define MODULE_STACKSIZE "" #endif -#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_REGPARM MODULE_STACKSIZE +#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE #endif /* _ASM_I386_MODULE_H */ -- cgit v1.2.3-59-g8ed1b From 359ad0d4015a9ab39243f2ebc4eb07915bd618b2 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Dec 2006 02:14:13 +0100 Subject: [PATCH] unwinder: more sanity checks in Dwarf2 unwinder Tighten the requirements on both input to and output from the Dwarf2 unwinder. 
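One of the new output checks is easiest to read in isolation: within one stack page, successive frames must move the stack pointer strictly upward, since these stacks grow down and unwinding walks outward. As a standalone predicate (a sketch mirroring the dump_trace_unwind hunks in the diff below):

    /* frames inside the same stack page must raise the stack pointer */
    static int sp_makes_progress(unsigned long prev_sp, unsigned long sp)
    {
            if ((prev_sp & ~(PAGE_SIZE - 1)) == (sp & ~(PAGE_SIZE - 1))
                && prev_sp > sp)
                    return 0;       /* walking the wrong way: stop */
            return 1;
    }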
Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 7 +++++++ arch/x86_64/kernel/traps.c | 7 +++++++ include/asm-i386/unwind.h | 12 ++++-------- include/asm-x86_64/unwind.h | 8 ++------ kernel/unwind.c | 16 +++++++++++++++- 5 files changed, 35 insertions(+), 15 deletions(-) (limited to 'include/asm-i386') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 86d8476be4fe..c447807e2a45 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -161,12 +161,19 @@ dump_trace_unwind(struct unwind_frame_info *info, void *data) { struct ops_and_data *oad = (struct ops_and_data *)data; int n = 0; + unsigned long sp = UNW_SP(info); + if (arch_unw_user_mode(info)) + return -1; while (unwind(info) == 0 && UNW_PC(info)) { n++; oad->ops->address(oad->data, UNW_PC(info)); if (arch_unw_user_mode(info)) break; + if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1)) + && sp > UNW_SP(info)) + break; + sp = UNW_SP(info); } return n; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 9864d195c408..4fdd162f0bef 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -225,12 +225,19 @@ static int dump_trace_unwind(struct unwind_frame_info *info, void *context) { struct ops_and_data *oad = (struct ops_and_data *)context; int n = 0; + unsigned long sp = UNW_SP(info); + if (arch_unw_user_mode(info)) + return -1; while (unwind(info) == 0 && UNW_PC(info)) { n++; oad->ops->address(oad->data, UNW_PC(info)); if (arch_unw_user_mode(info)) break; + if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1)) + && sp > UNW_SP(info)) + break; + sp = UNW_SP(info); } return n; } diff --git a/include/asm-i386/unwind.h b/include/asm-i386/unwind.h index 601fc67bd775..aa2c931e30db 100644 --- a/include/asm-i386/unwind.h +++ b/include/asm-i386/unwind.h @@ -79,17 +79,13 @@ extern asmlinkage int arch_unwind_init_running(struct unwind_frame_info *, void *arg), void *arg); -static inline int arch_unw_user_mode(const struct unwind_frame_info *info) +static inline int arch_unw_user_mode(/*const*/ struct unwind_frame_info *info) { -#if 0 /* This can only work when selector register and EFLAGS saves/restores - are properly annotated (and tracked in UNW_REGISTER_INFO). */ - return user_mode_vm(&info->regs); -#else - return info->regs.eip < PAGE_OFFSET + return user_mode_vm(&info->regs) + || info->regs.eip < PAGE_OFFSET || (info->regs.eip >= __fix_to_virt(FIX_VDSO) - && info->regs.eip < __fix_to_virt(FIX_VDSO) + PAGE_SIZE) + && info->regs.eip < __fix_to_virt(FIX_VDSO) + PAGE_SIZE) || info->regs.esp < PAGE_OFFSET; -#endif } #else diff --git a/include/asm-x86_64/unwind.h b/include/asm-x86_64/unwind.h index 2e7ff10fd775..2f6349e48717 100644 --- a/include/asm-x86_64/unwind.h +++ b/include/asm-x86_64/unwind.h @@ -87,14 +87,10 @@ extern int arch_unwind_init_running(struct unwind_frame_info *, static inline int arch_unw_user_mode(const struct unwind_frame_info *info) { -#if 0 /* This can only work when selector register saves/restores - are properly annotated (and tracked in UNW_REGISTER_INFO). 
*/ - return user_mode(&info->regs); -#else - return (long)info->regs.rip >= 0 + return user_mode(&info->regs) + || (long)info->regs.rip >= 0 || (info->regs.rip >= VSYSCALL_START && info->regs.rip < VSYSCALL_END) || (long)info->regs.rsp >= 0; -#endif } #else diff --git a/kernel/unwind.c b/kernel/unwind.c index af48168a3afb..7e721f104105 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -95,6 +95,7 @@ static const struct { typedef unsigned long uleb128_t; typedef signed long sleb128_t; +#define sleb128abs __builtin_labs static struct unwind_table { struct { @@ -787,7 +788,7 @@ int unwind(struct unwind_frame_info *frame) #define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) const u32 *fde = NULL, *cie = NULL; const u8 *ptr = NULL, *end = NULL; - unsigned long pc = UNW_PC(frame) - frame->call_frame; + unsigned long pc = UNW_PC(frame) - frame->call_frame, sp; unsigned long startLoc = 0, endLoc = 0, cfa; unsigned i; signed ptrType = -1; @@ -936,6 +937,9 @@ int unwind(struct unwind_frame_info *frame) state.dataAlign = get_sleb128(&ptr, end); if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) cie = NULL; + else if (UNW_PC(frame) % state.codeAlign + || UNW_SP(frame) % sleb128abs(state.dataAlign)) + return -EPERM; else { retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end); /* skip augmentation */ @@ -968,6 +972,8 @@ int unwind(struct unwind_frame_info *frame) #ifdef CONFIG_FRAME_POINTER unsigned long top, bottom; + if ((UNW_SP(frame) | UNW_FP(frame)) % sizeof(unsigned long)) + return -EPERM; top = STACK_TOP(frame->task); bottom = STACK_BOTTOM(frame->task); # if FRAME_RETADDR_OFFSET < 0 @@ -1018,6 +1024,7 @@ int unwind(struct unwind_frame_info *frame) || state.regs[retAddrReg].where == Nowhere || state.cfa.reg >= ARRAY_SIZE(reg_info) || reg_info[state.cfa.reg].width != sizeof(unsigned long) + || FRAME_REG(state.cfa.reg, unsigned long) % sizeof(unsigned long) || state.cfa.offs % sizeof(unsigned long)) return -EIO; /* update frame */ @@ -1038,6 +1045,8 @@ int unwind(struct unwind_frame_info *frame) #else # define CASES CASE(8); CASE(16); CASE(32); CASE(64) #endif + pc = UNW_PC(frame); + sp = UNW_SP(frame); for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { if (REG_INVALID(i)) { if (state.regs[i].where == Nowhere) @@ -1118,6 +1127,11 @@ int unwind(struct unwind_frame_info *frame) } } + if (UNW_PC(frame) % state.codeAlign + || UNW_SP(frame) % sleb128abs(state.dataAlign) + || (pc == UNW_PC(frame) && sp == UNW_SP(frame))) + return -EIO; + return 0; #undef CASES #undef FRAME_REG -- cgit v1.2.3-59-g8ed1b From e4b522d7ef144fb2ad6a4cb23d9cb5ec154be8bc Mon Sep 17 00:00:00 2001 From: Duncan Sands Date: Thu, 7 Dec 2006 02:14:13 +0100 Subject: [PATCH] x86-64: fix asm constraints in i386 atomic_add_return Since v->counter is both read and written, it should be an output as well as an input for the asm. The current code only gets away with this because counter is volatile. Also, according to Documentation/atomic_ops.txt, atomic_add_return should provide a memory barrier, in particular a compiler barrier, so the asm should be marked as clobbering memory.
Test case: #include <stdio.h> typedef struct { int counter; } atomic_t; /* NB: no "volatile" */ #define ATOMIC_INIT(i) { (i) } #define atomic_read(v) ((v)->counter) static __inline__ int atomic_add_return(int i, atomic_t *v) { int __i = i; __asm__ __volatile__( "lock; xaddl %0, %1;" :"=r"(i) :"m"(v->counter), "0"(i)); /* __asm__ __volatile__( "lock; xaddl %0, %1" :"+r" (i), "+m" (v->counter) : : "memory"); */ return i + __i; } int main (void) { atomic_t a = ATOMIC_INIT(0); int x; x = atomic_add_return (1, &a); if ((x!=1) || (atomic_read(&a)!=1)) printf("fail: %i, %i\n", x, atomic_read(&a)); } Signed-off-by: Duncan Sands Signed-off-by: Andi Kleen Cc: Andi Kleen Acked-by: David Howells Signed-off-by: Andrew Morton --- include/asm-i386/atomic.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/atomic.h b/include/asm-i386/atomic.h index 51a166242522..6aab7a105fad 100644 --- a/include/asm-i386/atomic.h +++ b/include/asm-i386/atomic.h @@ -187,9 +187,9 @@ static __inline__ int atomic_add_return(int i, atomic_t *v) /* Modern 486+ processor */ __i = i; __asm__ __volatile__( - LOCK_PREFIX "xaddl %0, %1;" - :"=r"(i) - :"m"(v->counter), "0"(i)); + LOCK_PREFIX "xaddl %0, %1" + :"+r" (i), "+m" (v->counter) + : : "memory"); return i + __i; #ifdef CONFIG_M386 -- cgit v1.2.3-59-g8ed1b From d7fb02712818643bab79a6b3cb8270a747d0227b Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 7 Dec 2006 02:14:19 +0100 Subject: [PATCH] x86-64: remove remaining pc98 code Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/io_apic.c | 23 ++--------------------- arch/i386/kernel/mpparse.c | 2 -- include/asm-i386/mpspec_def.h | 2 -- 3 files changed, 2 insertions(+), 25 deletions(-) (limited to 'include/asm-i386') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 56f571c9fc0d..7f015a71ab55 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -842,8 +842,7 @@ static int __init find_isa_irq_pin(int irq, int type) if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA || - mp_bus_id_to_type[lbus] == MP_BUS_NEC98 + mp_bus_id_to_type[lbus] == MP_BUS_MCA ) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) @@ -862,8 +861,7 @@ static int __init find_isa_irq_apic(int irq, int type) if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA || - mp_bus_id_to_type[lbus] == MP_BUS_NEC98 + mp_bus_id_to_type[lbus] == MP_BUS_MCA ) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) @@ -993,12 +991,6 @@ static int EISA_ELCR(unsigned int irq) #define default_MCA_trigger(idx) (1) #define default_MCA_polarity(idx) (0) -/* NEC98 interrupts are always polarity zero edge triggered, * when listed as conforming in the MP table.
*/ - -#define default_NEC98_trigger(idx) (0) -#define default_NEC98_polarity(idx) (0) - static int __init MPBIOS_polarity(int idx) { int bus = mp_irqs[idx].mpc_srcbus; @@ -1033,11 +1025,6 @@ static int __init MPBIOS_polarity(int idx) polarity = default_MCA_polarity(idx); break; } - case MP_BUS_NEC98: /* NEC 98 pin */ - { - polarity = default_NEC98_polarity(idx); - break; - } default: { printk(KERN_WARNING "broken BIOS!!\n"); @@ -1107,11 +1094,6 @@ static int MPBIOS_trigger(int idx) trigger = default_MCA_trigger(idx); break; } - case MP_BUS_NEC98: /* NEC 98 pin */ - { - trigger = default_NEC98_trigger(idx); - break; - } default: { printk(KERN_WARNING "broken BIOS!!\n"); @@ -1173,7 +1155,6 @@ static int pin_2_irq(int idx, int apic, int pin) case MP_BUS_ISA: /* ISA pin */ case MP_BUS_EISA: case MP_BUS_MCA: - case MP_BUS_NEC98: { irq = mp_irqs[idx].mpc_srcbusirq; break; diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index 442aaf8c77eb..2ce67228dff8 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -249,8 +249,6 @@ static void __init MP_bus_info (struct mpc_config_bus *m) mp_current_pci_id++; } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; - } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98; } else { printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); } diff --git a/include/asm-i386/mpspec_def.h b/include/asm-i386/mpspec_def.h index 76feedf85a8a..13bafb16e7af 100644 --- a/include/asm-i386/mpspec_def.h +++ b/include/asm-i386/mpspec_def.h @@ -97,7 +97,6 @@ struct mpc_config_bus #define BUSTYPE_TC "TC" #define BUSTYPE_VME "VME" #define BUSTYPE_XPRESS "XPRESS" -#define BUSTYPE_NEC98 "NEC98" struct mpc_config_ioapic { @@ -182,7 +181,6 @@ enum mp_bustype { MP_BUS_EISA, MP_BUS_PCI, MP_BUS_MCA, - MP_BUS_NEC98 }; #endif -- cgit v1.2.3-59-g8ed1b From 116780fc04d9f6cd3ceeab0251681f1dfda53367 Mon Sep 17 00:00:00 2001 From: Burman Yan Date: Thu, 7 Dec 2006 02:14:19 +0100 Subject: [PATCH] i386: replace kmalloc+memset with kzalloc Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/intel_cacheinfo.c | 11 +++-------- arch/i386/kernel/mca.c | 13 ++++--------- arch/i386/kernel/pci-dma.c | 6 ++---- arch/i386/mach-voyager/voyager_cat.c | 6 ++---- include/asm-i386/thread_info.h | 10 +--------- 5 files changed, 12 insertions(+), 34 deletions(-) (limited to 'include/asm-i386') diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index 5c43be47587f..80b4c5d421b1 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -480,12 +480,10 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) if (num_cache_leaves == 0) return -ENOENT; - cpuid4_info[cpu] = kmalloc( + cpuid4_info[cpu] = kzalloc( sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); if (unlikely(cpuid4_info[cpu] == NULL)) return -ENOMEM; - memset(cpuid4_info[cpu], 0, - sizeof(struct _cpuid4_info) * num_cache_leaves); oldmask = current->cpus_allowed; retval = set_cpus_allowed(current, cpumask_of_cpu(cpu)); @@ -658,17 +656,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu) return -ENOENT; /* Allocate all required memory */ - cache_kobject[cpu] = kmalloc(sizeof(struct kobject), GFP_KERNEL); + cache_kobject[cpu] = kzalloc(sizeof(struct kobject), GFP_KERNEL); if (unlikely(cache_kobject[cpu] == NULL)) goto err_out; - 
memset(cache_kobject[cpu], 0, sizeof(struct kobject)); - index_kobject[cpu] = kmalloc( + index_kobject[cpu] = kzalloc( sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); if (unlikely(index_kobject[cpu] == NULL)) goto err_out; - memset(index_kobject[cpu], 0, - sizeof(struct _index_kobject) * num_cache_leaves); return 0; diff --git a/arch/i386/kernel/mca.c b/arch/i386/kernel/mca.c index eb57a851789d..b83672b89527 100644 --- a/arch/i386/kernel/mca.c +++ b/arch/i386/kernel/mca.c @@ -283,10 +283,9 @@ static int __init mca_init(void) bus->f.mca_transform_memory = mca_dummy_transform_memory; /* get the motherboard device */ - mca_dev = kmalloc(sizeof(struct mca_device), GFP_KERNEL); + mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL); if(unlikely(!mca_dev)) goto out_nomem; - memset(mca_dev, 0, sizeof(struct mca_device)); /* * We do not expect many MCA interrupts during initialization, @@ -310,11 +309,9 @@ static int __init mca_init(void) mca_dev->slot = MCA_MOTHERBOARD; mca_register_device(MCA_PRIMARY_BUS, mca_dev); - mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); + mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); if(unlikely(!mca_dev)) goto out_unlock_nomem; - memset(mca_dev, 0, sizeof(struct mca_device)); - /* Put motherboard into video setup mode, read integrated video * POS registers, and turn motherboard setup off. @@ -349,10 +346,9 @@ static int __init mca_init(void) } if(which_scsi) { /* found a scsi card */ - mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); + mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); if(unlikely(!mca_dev)) goto out_unlock_nomem; - memset(mca_dev, 0, sizeof(struct mca_device)); for(j = 0; j < 8; j++) mca_dev->pos[j] = pos[j]; @@ -378,10 +374,9 @@ static int __init mca_init(void) if(!mca_read_and_store_pos(pos)) continue; - mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); + mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); if(unlikely(!mca_dev)) goto out_unlock_nomem; - memset(mca_dev, 0, sizeof(struct mca_device)); for(j=0; j<8; j++) mca_dev->pos[j]=pos[j]; diff --git a/arch/i386/kernel/pci-dma.c b/arch/i386/kernel/pci-dma.c index 5c8c6ef1fc5e..41af692c1584 100644 --- a/arch/i386/kernel/pci-dma.c +++ b/arch/i386/kernel/pci-dma.c @@ -92,14 +92,12 @@ int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, if (!mem_base) goto out; - dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); + dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); if (!dev->dma_mem) goto out; - memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem)); - dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL); + dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); if (!dev->dma_mem->bitmap) goto free1_out; - memset(dev->dma_mem->bitmap, 0, bitmap_size); dev->dma_mem->virt_base = mem_base; dev->dma_mem->device_base = device_addr; diff --git a/arch/i386/mach-voyager/voyager_cat.c b/arch/i386/mach-voyager/voyager_cat.c index f50c6c6ad680..943a9473b138 100644 --- a/arch/i386/mach-voyager/voyager_cat.c +++ b/arch/i386/mach-voyager/voyager_cat.c @@ -776,7 +776,7 @@ voyager_cat_init(void) for(asic=0; asic < (*modpp)->num_asics; asic++) { int j; voyager_asic_t *asicp = *asicpp - = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/ + = kzalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/ voyager_sp_table_t *sp_table; voyager_at_t *asic_table; voyager_jtt_t *jtag_table; @@ -785,7 +785,6 @@ voyager_cat_init(void) 
printk("**WARNING** kmalloc failure in cat_init\n"); continue; } - memset(asicp, 0, sizeof(voyager_asic_t)); asicpp = &(asicp->next); asicp->asic_location = asic; sp_table = (voyager_sp_table_t *)(eprom_buf + sp_offset); @@ -851,8 +850,7 @@ voyager_cat_init(void) #endif { - struct resource *res = kmalloc(sizeof(struct resource),GFP_KERNEL); - memset(res, 0, sizeof(struct resource)); + struct resource *res = kzalloc(sizeof(struct resource),GFP_KERNEL); res->name = kmalloc(128, GFP_KERNEL); sprintf((char *)res->name, "Voyager %s Quad CPI", cat_module_name(i)); res->start = qic_addr; diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h index 54d6d7aea938..46d32ad92082 100644 --- a/include/asm-i386/thread_info.h +++ b/include/asm-i386/thread_info.h @@ -95,15 +95,7 @@ static inline struct thread_info *current_thread_info(void) /* thread information allocation */ #ifdef CONFIG_DEBUG_STACK_USAGE -#define alloc_thread_info(tsk) \ - ({ \ - struct thread_info *ret; \ - \ - ret = kmalloc(THREAD_SIZE, GFP_KERNEL); \ - if (ret) \ - memset(ret, 0, THREAD_SIZE); \ - ret; \ - }) +#define alloc_thread_info(tsk) kzalloc(THREAD_SIZE, GFP_KERNEL) #else #define alloc_thread_info(tsk) kmalloc(THREAD_SIZE, GFP_KERNEL) #endif -- cgit v1.2.3-59-g8ed1b