Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 6
-rw-r--r--  arch/x86/kernel/acpi/Makefile | 9
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 31
-rw-r--r--  arch/x86/kernel/acpi/realmode/.gitignore | 3
-rw-r--r--  arch/x86/kernel/acpi/realmode/Makefile | 59
-rw-r--r--  arch/x86/kernel/acpi/realmode/bioscall.S | 1
-rw-r--r--  arch/x86/kernel/acpi/realmode/copy.S | 1
-rw-r--r--  arch/x86/kernel/acpi/realmode/regs.c | 1
-rw-r--r--  arch/x86/kernel/acpi/realmode/video-bios.c | 1
-rw-r--r--  arch/x86/kernel/acpi/realmode/video-mode.c | 1
-rw-r--r--  arch/x86/kernel/acpi/realmode/video-vesa.c | 1
-rw-r--r--  arch/x86/kernel/acpi/realmode/video-vga.c | 1
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakemain.c | 81
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.S | 170
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.h | 48
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.lds.S | 62
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 37
-rw-r--r--  arch/x86/kernel/acpi/sleep.h | 6
-rw-r--r--  arch/x86/kernel/acpi/wakeup_32.S | 4
-rw-r--r--  arch/x86/kernel/acpi/wakeup_64.S | 4
-rw-r--r--  arch/x86/kernel/acpi/wakeup_rm.S | 12
-rw-r--r--  arch/x86/kernel/aperture_64.c | 6
-rw-r--r--  arch/x86/kernel/apic/apic.c | 76
-rw-r--r--  arch/x86/kernel/apic/apic_flat_64.c | 2
-rw-r--r--  arch/x86/kernel/apic/apic_noop.c | 1
-rw-r--r--  arch/x86/kernel/apic/apic_numachip.c | 8
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 2
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 404
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/probe_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/summit_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c | 1
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c | 7
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 1
-rw-r--r--  arch/x86/kernel/apm_32.c | 2
-rw-r--r--  arch/x86/kernel/check.c | 20
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 29
-rw-r--r--  arch/x86/kernel/cpu/common.c | 19
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 12
-rw-r--r--  arch/x86/kernel/cpu/match.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-apei.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 26
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 110
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 65
-rw-r--r--  arch/x86/kernel/cpu/mkcapflags.pl | 25
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 18
-rw-r--r--  arch/x86/kernel/cpu/perf_event.h | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 22
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd_ibs.c | 570
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 149
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c | 15
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 6
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 2
-rw-r--r--  arch/x86/kernel/dumpstack.c | 23
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 2
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 2
-rw-r--r--  arch/x86/kernel/e820.c | 53
-rw-r--r--  arch/x86/kernel/entry_32.S | 60
-rw-r--r--  arch/x86/kernel/entry_64.S | 60
-rw-r--r--  arch/x86/kernel/ftrace.c | 594
-rw-r--r--  arch/x86/kernel/head32.c | 1
-rw-r--r--  arch/x86/kernel/head64.c | 1
-rw-r--r--  arch/x86/kernel/head_32.S | 228
-rw-r--r--  arch/x86/kernel/head_64.S | 84
-rw-r--r--  arch/x86/kernel/hpet.c | 66
-rw-r--r--  arch/x86/kernel/i387.c | 3
-rw-r--r--  arch/x86/kernel/init_task.c | 42
-rw-r--r--  arch/x86/kernel/irq_32.c | 8
-rw-r--r--  arch/x86/kernel/kgdb.c | 8
-rw-r--r--  arch/x86/kernel/kprobes.c | 4
-rw-r--r--  arch/x86/kernel/kvm.c | 9
-rw-r--r--  arch/x86/kernel/kvmclock.c | 15
-rw-r--r--  arch/x86/kernel/mca_32.c | 476
-rw-r--r--  arch/x86/kernel/microcode_amd.c | 12
-rw-r--r--  arch/x86/kernel/microcode_core.c | 19
-rw-r--r--  arch/x86/kernel/microcode_intel.c | 14
-rw-r--r--  arch/x86/kernel/mpparse.c | 22
-rw-r--r--  arch/x86/kernel/nmi.c | 113
-rw-r--r--  arch/x86/kernel/nmi_selftest.c | 17
-rw-r--r--  arch/x86/kernel/paravirt.c | 12
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 8
-rw-r--r--  arch/x86/kernel/pci-dma.c | 17
-rw-r--r--  arch/x86/kernel/pci-nommu.c | 8
-rw-r--r--  arch/x86/kernel/process.c | 73
-rw-r--r--  arch/x86/kernel/process_32.c | 11
-rw-r--r--  arch/x86/kernel/process_64.c | 20
-rw-r--r--  arch/x86/kernel/ptrace.c | 13
-rw-r--r--  arch/x86/kernel/reboot.c | 270
-rw-r--r--  arch/x86/kernel/reboot_32.S | 135
-rw-r--r--  arch/x86/kernel/setup.c | 42
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 14
-rw-r--r--  arch/x86/kernel/signal.c | 80
-rw-r--r--  arch/x86/kernel/smp.c | 100
-rw-r--r--  arch/x86/kernel/smpboot.c | 227
-rw-r--r--  arch/x86/kernel/tboot.c | 7
-rw-r--r--  arch/x86/kernel/test_rodata.c | 10
-rw-r--r--  arch/x86/kernel/time.c | 6
-rw-r--r--  arch/x86/kernel/trampoline.c | 42
-rw-r--r--  arch/x86/kernel/trampoline_32.S | 83
-rw-r--r--  arch/x86/kernel/trampoline_64.S | 171
-rw-r--r--  arch/x86/kernel/traps.c | 16
-rw-r--r--  arch/x86/kernel/uprobes.c | 674
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 12
-rw-r--r--  arch/x86/kernel/vsmp_64.c | 40
-rw-r--r--  arch/x86/kernel/x86_init.c | 9
-rw-r--r--  arch/x86/kernel/xsave.c | 2
108 files changed, 3172 insertions, 3016 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 532d2e090e6f..8215e5652d97 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -2,7 +2,7 @@
# Makefile for the linux kernel.
#
-extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
+extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds
CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
@@ -35,7 +35,6 @@ obj-y += tsc.o io_delay.o rtc.o
obj-y += pci-iommu_table.o
obj-y += resource.o
-obj-y += trampoline.o trampoline_$(BITS).o
obj-y += process.o
obj-y += i387.o xsave.o
obj-y += ptrace.o
@@ -48,8 +47,6 @@ obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += cpu/
obj-y += acpi/
obj-y += reboot.o
-obj-$(CONFIG_X86_32) += reboot_32.o
-obj-$(CONFIG_MCA) += mca_32.o
obj-$(CONFIG_X86_MSR) += msr.o
obj-$(CONFIG_X86_CPUID) += cpuid.o
obj-$(CONFIG_PCI) += early-quirks.o
@@ -101,6 +98,7 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
obj-$(CONFIG_OF) += devicetree.o
+obj-$(CONFIG_UPROBES) += uprobes.o
###
# 64 bit specific files
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 6f35260bb3ef..163b22581472 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,14 +1,7 @@
-subdir- := realmode
-
obj-$(CONFIG_ACPI) += boot.o
-obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o
+obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
endif
-$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin
-
-$(obj)/realmode/wakeup.bin: FORCE
- $(Q)$(MAKE) $(build)=$(obj)/realmode
-
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index a415b1f44365..b2297e58c6ed 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -422,12 +422,14 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
return 0;
}
- if (intsrc->source_irq == 0 && intsrc->global_irq == 2) {
+ if (intsrc->source_irq == 0) {
if (acpi_skip_timer_override) {
- printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
+ printk(PREFIX "BIOS IRQ0 override ignored.\n");
return 0;
}
- if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
+
+ if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity
+ && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
}
@@ -593,7 +595,7 @@ void __init acpi_set_irq_model_ioapic(void)
#ifdef CONFIG_ACPI_HOTPLUG_CPU
#include <acpi/processor.h>
-static void __cpuinitdata acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+static void __cpuinit acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
{
#ifdef CONFIG_ACPI_NUMA
int nid;
@@ -990,7 +992,7 @@ void __init mp_config_acpi_legacy_irqs(void)
int i;
struct mpc_intsrc mp_irq;
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+#ifdef CONFIG_EISA
/*
* Fabricate the legacy ISA bus (bus #31).
*/
@@ -1334,17 +1336,12 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
}
/*
- * Force ignoring BIOS IRQ0 pin2 override
+ * Force ignoring BIOS IRQ0 override
*/
static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
{
- /*
- * The ati_ixp4x0_rev() early PCI quirk should have set
- * the acpi_skip_timer_override flag already:
- */
if (!acpi_skip_timer_override) {
- WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
- pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
+ pr_notice("%s detected: Ignoring BIOS IRQ0 override\n",
d->ident);
acpi_skip_timer_override = 1;
}
@@ -1438,7 +1435,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
* is enabled. This input is incorrectly designated the
* ISA IRQ 0 via an interrupt source override even though
* it is wired to the output of the master 8259A and INTIN0
- * is not connected at all. Force ignoring BIOS IRQ0 pin2
+ * is not connected at all. Force ignoring BIOS IRQ0
* override in that cases.
*/
{
@@ -1473,6 +1470,14 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
},
},
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "FUJITSU SIEMENS",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"),
+ },
+ },
{}
};
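
[Editor's note] For context, a brief sketch of how a DMI quirk entry like the new AMILO PRO V2030 one is consumed (illustrative only, not part of the patch; the table and helper names below are hypothetical): dmi_check_system() compares each entry's DMI_MATCH fields against the firmware-provided DMI strings and invokes the callback of every entry that fully matches.

/* Illustrative sketch: DMI quirk dispatch (hypothetical table name). */
static struct dmi_system_id quirk_table[] __initdata = {
	{
		.callback = dmi_ignore_irq0_timer_override,
		.ident    = "FUJITSU SIEMENS",
		.matches  = {
			DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
			DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"),
		},
	},
	{ }	/* empty entry terminates the table */
};

static int __init run_dmi_quirks(void)	/* hypothetical helper */
{
	/* Returns the number of entries that matched. */
	return dmi_check_system(quirk_table);
}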
diff --git a/arch/x86/kernel/acpi/realmode/.gitignore b/arch/x86/kernel/acpi/realmode/.gitignore
deleted file mode 100644
index 58f1f48a58f8..000000000000
--- a/arch/x86/kernel/acpi/realmode/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-wakeup.bin
-wakeup.elf
-wakeup.lds
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
deleted file mode 100644
index 6a564ac67ef5..000000000000
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ /dev/null
@@ -1,59 +0,0 @@
-#
-# arch/x86/kernel/acpi/realmode/Makefile
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License. See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-
-always := wakeup.bin
-targets := wakeup.elf wakeup.lds
-
-wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
-
-# The link order of the video-*.o modules can matter. In particular,
-# video-vga.o *must* be listed first, followed by video-vesa.o.
-# Hardware-specific drivers should follow in the order they should be
-# probed, and video-bios.o should typically be last.
-wakeup-y += video-vga.o
-wakeup-y += video-vesa.o
-wakeup-y += video-bios.o
-
-targets += $(wakeup-y)
-
-bootsrc := $(src)/../../../boot
-
-# ---------------------------------------------------------------------------
-
-# How to compile the 16-bit code. Note we always compile for -march=i386,
-# that way we can complain to the user if the CPU is insufficient.
-# Compile with _SETUP since this is similar to the boot-time setup code.
-KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \
- -I$(srctree)/$(bootsrc) \
- $(cflags-y) \
- -Wall -Wstrict-prototypes \
- -march=i386 -mregparm=3 \
- -include $(srctree)/$(bootsrc)/code16gcc.h \
- -fno-strict-aliasing -fomit-frame-pointer \
- $(call cc-option, -ffreestanding) \
- $(call cc-option, -fno-toplevel-reorder,\
- $(call cc-option, -fno-unit-at-a-time)) \
- $(call cc-option, -fno-stack-protector) \
- $(call cc-option, -mpreferred-stack-boundary=2)
-KBUILD_CFLAGS += $(call cc-option, -m32)
-KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
-GCOV_PROFILE := n
-
-WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y))
-
-LDFLAGS_wakeup.elf := -T
-
-CPPFLAGS_wakeup.lds += -P -C
-
-$(obj)/wakeup.elf: $(obj)/wakeup.lds $(WAKEUP_OBJS) FORCE
- $(call if_changed,ld)
-
-OBJCOPYFLAGS_wakeup.bin := -O binary
-
-$(obj)/wakeup.bin: $(obj)/wakeup.elf FORCE
- $(call if_changed,objcopy)
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
deleted file mode 100644
index f51eb0bb56ce..000000000000
--- a/arch/x86/kernel/acpi/realmode/bioscall.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/copy.S b/arch/x86/kernel/acpi/realmode/copy.S
deleted file mode 100644
index dc59ebee69d8..000000000000
--- a/arch/x86/kernel/acpi/realmode/copy.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/copy.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
deleted file mode 100644
index 6206033ba202..000000000000
--- a/arch/x86/kernel/acpi/realmode/regs.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/regs.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-bios.c b/arch/x86/kernel/acpi/realmode/video-bios.c
deleted file mode 100644
index 7deabc144a27..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-bios.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-bios.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-mode.c b/arch/x86/kernel/acpi/realmode/video-mode.c
deleted file mode 100644
index 328ad209f113..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-mode.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-mode.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vesa.c b/arch/x86/kernel/acpi/realmode/video-vesa.c
deleted file mode 100644
index 9dbb9672226a..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-vesa.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-vesa.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vga.c b/arch/x86/kernel/acpi/realmode/video-vga.c
deleted file mode 100644
index bcc81255f374..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-vga.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-vga.c"
diff --git a/arch/x86/kernel/acpi/realmode/wakemain.c b/arch/x86/kernel/acpi/realmode/wakemain.c
deleted file mode 100644
index 883962d9eef2..000000000000
--- a/arch/x86/kernel/acpi/realmode/wakemain.c
+++ /dev/null
@@ -1,81 +0,0 @@
-#include "wakeup.h"
-#include "boot.h"
-
-static void udelay(int loops)
-{
- while (loops--)
- io_delay(); /* Approximately 1 us */
-}
-
-static void beep(unsigned int hz)
-{
- u8 enable;
-
- if (!hz) {
- enable = 0x00; /* Turn off speaker */
- } else {
- u16 div = 1193181/hz;
-
- outb(0xb6, 0x43); /* Ctr 2, squarewave, load, binary */
- io_delay();
- outb(div, 0x42); /* LSB of counter */
- io_delay();
- outb(div >> 8, 0x42); /* MSB of counter */
- io_delay();
-
- enable = 0x03; /* Turn on speaker */
- }
- inb(0x61); /* Dummy read of System Control Port B */
- io_delay();
- outb(enable, 0x61); /* Enable timer 2 output to speaker */
- io_delay();
-}
-
-#define DOT_HZ 880
-#define DASH_HZ 587
-#define US_PER_DOT 125000
-
-/* Okay, this is totally silly, but it's kind of fun. */
-static void send_morse(const char *pattern)
-{
- char s;
-
- while ((s = *pattern++)) {
- switch (s) {
- case '.':
- beep(DOT_HZ);
- udelay(US_PER_DOT);
- beep(0);
- udelay(US_PER_DOT);
- break;
- case '-':
- beep(DASH_HZ);
- udelay(US_PER_DOT * 3);
- beep(0);
- udelay(US_PER_DOT);
- break;
- default: /* Assume it's a space */
- udelay(US_PER_DOT * 3);
- break;
- }
- }
-}
-
-void main(void)
-{
- /* Kill machine if structures are wrong */
- if (wakeup_header.real_magic != 0x12345678)
- while (1);
-
- if (wakeup_header.realmode_flags & 4)
- send_morse("...-");
-
- if (wakeup_header.realmode_flags & 1)
- asm volatile("lcallw $0xc000,$3");
-
- if (wakeup_header.realmode_flags & 2) {
- /* Need to call BIOS */
- probe_cards(0);
- set_mode(wakeup_header.video_mode);
- }
-}
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
deleted file mode 100644
index b4fd836e4053..000000000000
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * ACPI wakeup real mode startup stub
- */
-#include <asm/segment.h>
-#include <asm/msr-index.h>
-#include <asm/page_types.h>
-#include <asm/pgtable_types.h>
-#include <asm/processor-flags.h>
-#include "wakeup.h"
-
- .code16
- .section ".jump", "ax"
- .globl _start
-_start:
- cli
- jmp wakeup_code
-
-/* This should match the structure in wakeup.h */
- .section ".header", "a"
- .globl wakeup_header
-wakeup_header:
-video_mode: .short 0 /* Video mode number */
-pmode_return: .byte 0x66, 0xea /* ljmpl */
- .long 0 /* offset goes here */
- .short __KERNEL_CS
-pmode_cr0: .long 0 /* Saved %cr0 */
-pmode_cr3: .long 0 /* Saved %cr3 */
-pmode_cr4: .long 0 /* Saved %cr4 */
-pmode_efer: .quad 0 /* Saved EFER */
-pmode_gdt: .quad 0
-pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */
-pmode_behavior: .long 0 /* Wakeup behavior flags */
-realmode_flags: .long 0
-real_magic: .long 0
-trampoline_segment: .word 0
-_pad1: .byte 0
-wakeup_jmp: .byte 0xea /* ljmpw */
-wakeup_jmp_off: .word 3f
-wakeup_jmp_seg: .word 0
-wakeup_gdt: .quad 0, 0, 0
-signature: .long WAKEUP_HEADER_SIGNATURE
-
- .text
- .code16
-wakeup_code:
- cld
-
- /* Apparently some dimwit BIOS programmers don't know how to
- program a PM to RM transition, and we might end up here with
- junk in the data segment descriptor registers. The only way
- to repair that is to go into PM and fix it ourselves... */
- movw $16, %cx
- lgdtl %cs:wakeup_gdt
- movl %cr0, %eax
- orb $X86_CR0_PE, %al
- movl %eax, %cr0
- jmp 1f
-1: ljmpw $8, $2f
-2:
- movw %cx, %ds
- movw %cx, %es
- movw %cx, %ss
- movw %cx, %fs
- movw %cx, %gs
-
- andb $~X86_CR0_PE, %al
- movl %eax, %cr0
- jmp wakeup_jmp
-3:
- /* Set up segments */
- movw %cs, %ax
- movw %ax, %ds
- movw %ax, %es
- movw %ax, %ss
- lidtl wakeup_idt
-
- movl $wakeup_stack_end, %esp
-
- /* Clear the EFLAGS */
- pushl $0
- popfl
-
- /* Check header signature... */
- movl signature, %eax
- cmpl $WAKEUP_HEADER_SIGNATURE, %eax
- jne bogus_real_magic
-
- /* Check we really have everything... */
- movl end_signature, %eax
- cmpl $WAKEUP_END_SIGNATURE, %eax
- jne bogus_real_magic
-
- /* Call the C code */
- calll main
-
- /* Restore MISC_ENABLE before entering protected mode, in case
- BIOS decided to clear XD_DISABLE during S3. */
- movl pmode_behavior, %eax
- btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax
- jnc 1f
-
- movl pmode_misc_en, %eax
- movl pmode_misc_en + 4, %edx
- movl $MSR_IA32_MISC_ENABLE, %ecx
- wrmsr
-1:
-
- /* Do any other stuff... */
-
-#ifndef CONFIG_64BIT
- /* This could also be done in C code... */
- movl pmode_cr3, %eax
- movl %eax, %cr3
-
- movl pmode_cr4, %ecx
- jecxz 1f
- movl %ecx, %cr4
-1:
- movl pmode_efer, %eax
- movl pmode_efer + 4, %edx
- movl %eax, %ecx
- orl %edx, %ecx
- jz 1f
- movl $MSR_EFER, %ecx
- wrmsr
-1:
-
- lgdtl pmode_gdt
-
- /* This really couldn't... */
- movl pmode_cr0, %eax
- movl %eax, %cr0
- jmp pmode_return
-#else
- pushw $0
- pushw trampoline_segment
- pushw $0
- lret
-#endif
-
-bogus_real_magic:
-1:
- hlt
- jmp 1b
-
- .data
- .balign 8
-
- /* This is the standard real-mode IDT */
-wakeup_idt:
- .word 0xffff /* limit */
- .long 0 /* address */
- .word 0
-
- .globl HEAP, heap_end
-HEAP:
- .long wakeup_heap
-heap_end:
- .long wakeup_stack
-
- .bss
-wakeup_heap:
- .space 2048
-wakeup_stack:
- .space 2048
-wakeup_stack_end:
-
- .section ".signature","a"
-end_signature:
- .long WAKEUP_END_SIGNATURE
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
deleted file mode 100644
index 97a29e1430e3..000000000000
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Definitions for the wakeup data structure at the head of the
- * wakeup code.
- */
-
-#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
-#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
-
-#ifndef __ASSEMBLY__
-#include <linux/types.h>
-
-/* This must match data at wakeup.S */
-struct wakeup_header {
- u16 video_mode; /* Video mode number */
- u16 _jmp1; /* ljmpl opcode, 32-bit only */
- u32 pmode_entry; /* Protected mode resume point, 32-bit only */
- u16 _jmp2; /* CS value, 32-bit only */
- u32 pmode_cr0; /* Protected mode cr0 */
- u32 pmode_cr3; /* Protected mode cr3 */
- u32 pmode_cr4; /* Protected mode cr4 */
- u32 pmode_efer_low; /* Protected mode EFER */
- u32 pmode_efer_high;
- u64 pmode_gdt;
- u32 pmode_misc_en_low; /* Protected mode MISC_ENABLE */
- u32 pmode_misc_en_high;
- u32 pmode_behavior; /* Wakeup routine behavior flags */
- u32 realmode_flags;
- u32 real_magic;
- u16 trampoline_segment; /* segment with trampoline code, 64-bit only */
- u8 _pad1;
- u8 wakeup_jmp;
- u16 wakeup_jmp_off;
- u16 wakeup_jmp_seg;
- u64 wakeup_gdt[3];
- u32 signature; /* To check we have correct structure */
-} __attribute__((__packed__));
-
-extern struct wakeup_header wakeup_header;
-#endif
-
-#define WAKEUP_HEADER_OFFSET 8
-#define WAKEUP_HEADER_SIGNATURE 0x51ee1111
-#define WAKEUP_END_SIGNATURE 0x65a22c82
-
-/* Wakeup behavior bits */
-#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0
-
-#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
deleted file mode 100644
index d4f8010a5b1b..000000000000
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * wakeup.ld
- *
- * Linker script for the real-mode wakeup code
- */
-#undef i386
-#include "wakeup.h"
-
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(_start)
-
-SECTIONS
-{
- . = 0;
- .jump : {
- *(.jump)
- } = 0x90909090
-
- . = WAKEUP_HEADER_OFFSET;
- .header : {
- *(.header)
- }
-
- . = ALIGN(16);
- .text : {
- *(.text*)
- } = 0x90909090
-
- . = ALIGN(16);
- .rodata : {
- *(.rodata*)
- }
-
- .videocards : {
- video_cards = .;
- *(.videocards)
- video_cards_end = .;
- }
-
- . = ALIGN(16);
- .data : {
- *(.data*)
- }
-
- . = ALIGN(16);
- .bss : {
- __bss_start = .;
- *(.bss)
- __bss_end = .;
- }
-
- .signature : {
- *(.signature)
- }
-
- _end = .;
-
- /DISCARD/ : {
- *(.note*)
- }
-}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 103b6ab368d3..95bf99de9058 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -14,8 +14,9 @@
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
+#include <asm/realmode.h>
-#include "realmode/wakeup.h"
+#include "../../realmode/rm/wakeup.h"
#include "sleep.h"
unsigned long acpi_realmode_flags;
@@ -24,6 +25,10 @@ unsigned long acpi_realmode_flags;
static char temp_stack[4096];
#endif
+asmlinkage void acpi_enter_s3(void)
+{
+ acpi_enter_sleep_state(3, wake_sleep_flags);
+}
/**
* acpi_suspend_lowlevel - save kernel state
*
@@ -32,13 +37,9 @@ static char temp_stack[4096];
*/
int acpi_suspend_lowlevel(void)
{
- struct wakeup_header *header;
- /* address in low memory of the wakeup routine. */
- char *acpi_realmode;
-
- acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code);
+ struct wakeup_header *header =
+ (struct wakeup_header *) __va(real_mode_header->wakeup_header);
- header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET);
if (header->signature != WAKEUP_HEADER_SIGNATURE) {
printk(KERN_ERR "wakeup header does not match\n");
return -EINVAL;
@@ -46,27 +47,6 @@ int acpi_suspend_lowlevel(void)
header->video_mode = saved_video_mode;
- header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
-
- /*
- * Set up the wakeup GDT. We set these up as Big Real Mode,
- * that is, with limits set to 4 GB. At least the Lenovo
- * Thinkpad X61 is known to need this for the video BIOS
- * initialization quirk to work; this is likely to also
- * be the case for other laptops or integrated video devices.
- */
-
- /* GDT[0]: GDT self-pointer */
- header->wakeup_gdt[0] =
- (u64)(sizeof(header->wakeup_gdt) - 1) +
- ((u64)__pa(&header->wakeup_gdt) << 16);
- /* GDT[1]: big real mode-like code segment */
- header->wakeup_gdt[1] =
- GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
- /* GDT[2]: big real mode-like data segment */
- header->wakeup_gdt[2] =
- GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
-
#ifndef CONFIG_64BIT
store_gdt((struct desc_ptr *)&header->pmode_gdt);
@@ -91,7 +71,6 @@ int acpi_suspend_lowlevel(void)
header->pmode_cr3 = (u32)__pa(&initial_page_table);
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
- header->trampoline_segment = trampoline_address() >> 4;
#ifdef CONFIG_SMP
stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
early_gdt_descr.address =
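
[Editor's note] The small acpi_enter_s3() wrapper added above is what lets the assembly stubs in wakeup_32.S and wakeup_64.S (changed below) stop marshalling C arguments themselves. A minimal sketch of the pattern, mirroring the patch:

/* C wrapper with no arguments, callable from assembly, so the
 * wakeup_{32,64}.S stubs need not know the calling convention of
 * acpi_enter_sleep_state() or the value of wake_sleep_flags. */
asmlinkage void acpi_enter_s3(void)
{
	acpi_enter_sleep_state(3, wake_sleep_flags);
}

Both suspend paths then reduce to a bare "call acpi_enter_s3", as the wakeup_32.S and wakeup_64.S hunks below show.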
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index 416d4be13fef..5653a5791ec9 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -2,13 +2,17 @@
* Variables and functions used by the code in sleep.c
*/
-#include <asm/trampoline.h>
+#include <linux/linkage.h>
+#include <asm/realmode.h>
extern unsigned long saved_video_mode;
extern long saved_magic;
extern int wakeup_pmode_return;
+extern u8 wake_sleep_flags;
+extern asmlinkage void acpi_enter_s3(void);
+
extern unsigned long acpi_copy_wakeup_routine(unsigned long);
extern void wakeup_long64(void);
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 13ab720573e3..72610839f03b 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -74,9 +74,7 @@ restore_registers:
ENTRY(do_suspend_lowlevel)
call save_processor_state
call save_registers
- pushl $3
- call acpi_enter_sleep_state
- addl $4, %esp
+ call acpi_enter_s3
# In case of S3 failure, we'll emerge here. Jump
# to ret_point to recover
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 8ea5164cbd04..014d1d28c397 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -71,9 +71,7 @@ ENTRY(do_suspend_lowlevel)
movq %rsi, saved_rsi
addq $8, %rsp
- movl $3, %edi
- xorl %eax, %eax
- call acpi_enter_sleep_state
+ call acpi_enter_s3
/* in case something went wrong, restore the machine status and go on */
jmp resume_point
diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S
deleted file mode 100644
index 63b8ab524f2c..000000000000
--- a/arch/x86/kernel/acpi/wakeup_rm.S
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Wrapper script for the realmode binary as a transport object
- * before copying to low memory.
- */
-#include <asm/page_types.h>
-
- .section ".x86_trampoline","a"
- .balign PAGE_SIZE
- .globl acpi_wakeup_code
-acpi_wakeup_code:
- .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin"
- .size acpi_wakeup_code, .-acpi_wakeup_code
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 6e76c191a835..d5fd66f0d4cd 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -20,7 +20,6 @@
#include <linux/bitops.h>
#include <linux/ioport.h>
#include <linux/suspend.h>
-#include <linux/kmemleak.h>
#include <asm/e820.h>
#include <asm/io.h>
#include <asm/iommu.h>
@@ -95,11 +94,6 @@ static u32 __init allocate_aperture(void)
return 0;
}
memblock_reserve(addr, aper_size);
- /*
- * Kmemleak should not scan this block as it may not be mapped via the
- * kernel direct mapping.
- */
- kmemleak_ignore(phys_to_virt(addr));
printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
aper_size >> 10, addr);
insert_aperture_resource((u32)addr, aper_size);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 11544d8f1e97..39a222e094af 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -35,6 +35,7 @@
#include <linux/smp.h>
#include <linux/mm.h>
+#include <asm/irq_remapping.h>
#include <asm/perf_event.h>
#include <asm/x86_init.h>
#include <asm/pgalloc.h>
@@ -1325,11 +1326,13 @@ void __cpuinit setup_local_APIC(void)
acked);
break;
}
- if (cpu_has_tsc) {
- rdtscll(ntsc);
- max_loops = (cpu_khz << 10) - (ntsc - tsc);
- } else
- max_loops--;
+ if (queued) {
+ if (cpu_has_tsc) {
+ rdtscll(ntsc);
+ max_loops = (cpu_khz << 10) - (ntsc - tsc);
+ } else
+ max_loops--;
+ }
} while (queued && max_loops > 0);
WARN_ON(max_loops <= 0);
@@ -1441,8 +1444,8 @@ void __init bsp_end_local_APIC_setup(void)
* Now that local APIC setup is completed for BP, configure the fault
* handling for interrupt remapping.
*/
- if (intr_remapping_enabled)
- enable_drhd_fault_handling();
+ if (irq_remapping_enabled)
+ irq_remap_enable_fault_handling();
}
@@ -1517,7 +1520,7 @@ void enable_x2apic(void)
int __init enable_IR(void)
{
#ifdef CONFIG_IRQ_REMAP
- if (!intr_remapping_supported()) {
+ if (!irq_remapping_supported()) {
pr_debug("intr-remapping not supported\n");
return -1;
}
@@ -1528,7 +1531,7 @@ int __init enable_IR(void)
return -1;
}
- return enable_intr_remapping();
+ return irq_remapping_enable();
#endif
return -1;
}
@@ -1537,10 +1540,13 @@ void __init enable_IR_x2apic(void)
{
unsigned long flags;
int ret, x2apic_enabled = 0;
- int dmar_table_init_ret;
+ int hardware_init_ret;
+
+ /* Make sure irq_remap_ops are initialized */
+ setup_irq_remapping_ops();
- dmar_table_init_ret = dmar_table_init();
- if (dmar_table_init_ret && !x2apic_supported())
+ hardware_init_ret = irq_remapping_prepare();
+ if (hardware_init_ret && !x2apic_supported())
return;
ret = save_ioapic_entries();
@@ -1556,7 +1562,7 @@ void __init enable_IR_x2apic(void)
if (x2apic_preenabled && nox2apic)
disable_x2apic();
- if (dmar_table_init_ret)
+ if (hardware_init_ret)
ret = -1;
else
ret = enable_IR();
@@ -1637,9 +1643,11 @@ static int __init apic_verify(void)
mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
/* The BIOS may have set up the APIC at some other address */
- rdmsr(MSR_IA32_APICBASE, l, h);
- if (l & MSR_IA32_APICBASE_ENABLE)
- mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (l & MSR_IA32_APICBASE_ENABLE)
+ mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
+ }
pr_info("Found and enabled local APIC!\n");
return 0;
@@ -1657,13 +1665,15 @@ int __init apic_force_enable(unsigned long addr)
* MSR. This can only be done in software for Intel P6 or later
* and AMD K7 (Model > 1) or later.
*/
- rdmsr(MSR_IA32_APICBASE, l, h);
- if (!(l & MSR_IA32_APICBASE_ENABLE)) {
- pr_info("Local APIC disabled by BIOS -- reenabling.\n");
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | addr;
- wrmsr(MSR_IA32_APICBASE, l, h);
- enabled_via_apicbase = 1;
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (!(l & MSR_IA32_APICBASE_ENABLE)) {
+ pr_info("Local APIC disabled by BIOS -- reenabling.\n");
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | addr;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ enabled_via_apicbase = 1;
+ }
}
return apic_verify();
}
@@ -2172,8 +2182,8 @@ static int lapic_suspend(void)
local_irq_save(flags);
disable_local_APIC();
- if (intr_remapping_enabled)
- disable_intr_remapping();
+ if (irq_remapping_enabled)
+ irq_remapping_disable();
local_irq_restore(flags);
return 0;
@@ -2189,7 +2199,7 @@ static void lapic_resume(void)
return;
local_irq_save(flags);
- if (intr_remapping_enabled) {
+ if (irq_remapping_enabled) {
/*
* IO-APIC and PIC have their own resume routines.
* We just mask them here to make sure the interrupt
@@ -2209,10 +2219,12 @@ static void lapic_resume(void)
* FIXME! This will be wrong if we ever support suspend on
* SMP! We'll need to do this as part of the CPU restore!
*/
- rdmsr(MSR_IA32_APICBASE, l, h);
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
- wrmsr(MSR_IA32_APICBASE, l, h);
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ }
}
maxlvt = lapic_get_maxlvt();
@@ -2239,8 +2251,8 @@ static void lapic_resume(void)
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
- if (intr_remapping_enabled)
- reenable_intr_remapping(x2apic_mode);
+ if (irq_remapping_enabled)
+ irq_remapping_reenable(x2apic_mode);
local_irq_restore(flags);
}
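
[Editor's note] The new boot_cpu_data.x86 >= 6 guards above exist because MSR_IA32_APICBASE is only architectural from the P6/K7 generation onward; executing RDMSR against it on older parts raises #GP. An equivalent defensive pattern, shown here only as an illustrative alternative (not what the patch does), is the fault-tolerant accessor:

/* Illustrative alternative to the family check: rdmsr_safe() traps
 * the #GP and returns nonzero instead of faulting on CPUs that lack
 * the MSR. */
u32 l, h;

if (rdmsr_safe(MSR_IA32_APICBASE, &l, &h) == 0) {
	if (l & MSR_IA32_APICBASE_ENABLE)
		mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
}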
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 359b6899a36c..0e881c46e8c8 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -227,6 +227,7 @@ static struct apic apic_flat = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
@@ -386,6 +387,7 @@ static struct apic apic_physflat = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
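
[Editor's note] The .eoi_write callback being added to every APIC driver in this series gives the end-of-interrupt write, by far the hottest APIC register access, its own method: the xAPIC drivers reuse native_apic_mem_write(), while the x2APIC drivers below install native_apic_msr_eoi_write(), which can issue a bare WRMSR without the checks of the general .write path. A sketch of the dispatch (the helper name is an assumption):

/* Sketch of the EOI fast path; the helper name is an assumption.
 * APIC_EOI_ACK (0) is written to the EOI register to acknowledge
 * the interrupt currently being serviced. */
static inline void apic_eoi(void)
{
	apic->eoi_write(APIC_EOI, APIC_EOI_ACK);
}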
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 634ae6cdd5c9..a6e4c6e06c08 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -181,6 +181,7 @@ struct apic apic_noop = {
.read = noop_apic_read,
.write = noop_apic_write,
+ .eoi_write = noop_apic_write,
.icr_read = noop_apic_icr_read,
.icr_write = noop_apic_icr_write,
.wait_icr_idle = noop_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 899803e03214..6ec6d5d297c3 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -207,8 +207,11 @@ static void __init map_csrs(void)
static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
{
- c->phys_proc_id = node;
- per_cpu(cpu_llc_id, smp_processor_id()) = node;
+
+ if (c->phys_proc_id != node) {
+ c->phys_proc_id = node;
+ per_cpu(cpu_llc_id, smp_processor_id()) = node;
+ }
}
static int __init numachip_system_init(void)
@@ -292,6 +295,7 @@ static struct apic apic_numachip __refconst = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 0cdec7065aff..31fbdbfbf960 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -248,6 +248,7 @@ static struct apic apic_bigsmp = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index e42d1d3b9134..db4ab1be3c79 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -678,6 +678,7 @@ static struct apic __refdata apic_es7000_cluster = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
@@ -742,6 +743,7 @@ static struct apic __refdata apic_es7000 = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e88300d8e80a..5f0ff597437c 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -68,23 +68,21 @@
#define for_each_irq_pin(entry, head) \
for (entry = head; entry; entry = entry->next)
-static void __init __ioapic_init_mappings(void);
-
-static unsigned int __io_apic_read (unsigned int apic, unsigned int reg);
-static void __io_apic_write (unsigned int apic, unsigned int reg, unsigned int val);
-static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val);
-
-static struct io_apic_ops io_apic_ops = {
- .init = __ioapic_init_mappings,
- .read = __io_apic_read,
- .write = __io_apic_write,
- .modify = __io_apic_modify,
-};
-
-void __init set_io_apic_ops(const struct io_apic_ops *ops)
+#ifdef CONFIG_IRQ_REMAP
+static void irq_remap_modify_chip_defaults(struct irq_chip *chip);
+static inline bool irq_remapped(struct irq_cfg *cfg)
+{
+ return cfg->irq_2_iommu.iommu != NULL;
+}
+#else
+static inline bool irq_remapped(struct irq_cfg *cfg)
+{
+ return false;
+}
+static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip)
{
- io_apic_ops = *ops;
}
+#endif
/*
* Is the SiS APIC rmw bug present ?
@@ -142,7 +140,7 @@ int mp_irq_entries;
/* GSI interrupts */
static int nr_irqs_gsi = NR_IRQS_LEGACY;
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+#ifdef CONFIG_EISA
int mp_bus_id_to_type[MAX_MP_BUSSES];
#endif
@@ -313,21 +311,6 @@ static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
irq_free_desc(at);
}
-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
-{
- return io_apic_ops.read(apic, reg);
-}
-
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
-{
- io_apic_ops.write(apic, reg, value);
-}
-
-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
-{
- io_apic_ops.modify(apic, reg, value);
-}
-
struct io_apic {
unsigned int index;
@@ -349,14 +332,14 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
writel(vector, &io_apic->eoi);
}
-static unsigned int __io_apic_read(unsigned int apic, unsigned int reg)
+unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
writel(reg, &io_apic->index);
return readl(&io_apic->data);
}
-static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
@@ -370,7 +353,7 @@ static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int va
*
* Older SiS APIC requires we rewrite the index register
*/
-static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
@@ -379,29 +362,6 @@ static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int v
writel(value, &io_apic->data);
}
-static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
-{
- struct irq_pin_list *entry;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- for_each_irq_pin(entry, cfg->irq_2_pin) {
- unsigned int reg;
- int pin;
-
- pin = entry->pin;
- reg = io_apic_read(entry->apic, 0x10 + pin*2);
- /* Is the remote IRR bit set? */
- if (reg & IO_APIC_REDIR_REMOTE_IRR) {
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- return true;
- }
- }
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return false;
-}
-
union entry_union {
struct { u32 w1, w2; };
struct IO_APIC_route_entry entry;
@@ -875,7 +835,7 @@ static int __init find_isa_irq_apic(int irq, int type)
return -1;
}
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
/*
* EISA Edge/Level control register, ELCR
*/
@@ -912,12 +872,6 @@ static int EISA_ELCR(unsigned int irq)
#define default_PCI_trigger(idx) (1)
#define default_PCI_polarity(idx) (1)
-/* MCA interrupts are always polarity zero level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_MCA_trigger(idx) (1)
-#define default_MCA_polarity(idx) default_ISA_polarity(idx)
-
static int irq_polarity(int idx)
{
int bus = mp_irqs[idx].srcbus;
@@ -975,7 +929,7 @@ static int irq_trigger(int idx)
trigger = default_ISA_trigger(idx);
else
trigger = default_PCI_trigger(idx);
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
switch (mp_bus_id_to_type[bus]) {
case MP_BUS_ISA: /* ISA pin */
{
@@ -992,11 +946,6 @@ static int irq_trigger(int idx)
/* set before the switch */
break;
}
- case MP_BUS_MCA: /* MCA pin */
- {
- trigger = default_MCA_trigger(idx);
- break;
- }
default:
{
printk(KERN_WARNING "broken BIOS!!\n");
@@ -1246,7 +1195,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
BUG_ON(!cfg->vector);
vector = cfg->vector;
- for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
+ for_each_cpu(cpu, cfg->domain)
per_cpu(vector_irq, cpu)[vector] = -1;
cfg->vector = 0;
@@ -1254,7 +1203,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
if (likely(!cfg->move_in_progress))
return;
- for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
+ for_each_cpu(cpu, cfg->old_domain) {
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
vector++) {
if (per_cpu(vector_irq, cpu)[vector] != irq)
@@ -1361,77 +1310,13 @@ static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
fasteoi ? "fasteoi" : "edge");
}
-
-static int setup_ir_ioapic_entry(int irq,
- struct IR_IO_APIC_route_entry *entry,
- unsigned int destination, int vector,
- struct io_apic_irq_attr *attr)
-{
- int index;
- struct irte irte;
- int ioapic_id = mpc_ioapic_id(attr->ioapic);
- struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id);
-
- if (!iommu) {
- pr_warn("No mapping iommu for ioapic %d\n", ioapic_id);
- return -ENODEV;
- }
-
- index = alloc_irte(iommu, irq, 1);
- if (index < 0) {
- pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id);
- return -ENOMEM;
- }
-
- prepare_irte(&irte, vector, destination);
-
- /* Set source-id of interrupt request */
- set_ioapic_sid(&irte, ioapic_id);
-
- modify_irte(irq, &irte);
-
- apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
- "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
- "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
- "Avail:%X Vector:%02X Dest:%08X "
- "SID:%04X SQ:%X SVT:%X)\n",
- attr->ioapic, irte.present, irte.fpd, irte.dst_mode,
- irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
- irte.avail, irte.vector, irte.dest_id,
- irte.sid, irte.sq, irte.svt);
-
- memset(entry, 0, sizeof(*entry));
-
- entry->index2 = (index >> 15) & 0x1;
- entry->zero = 0;
- entry->format = 1;
- entry->index = (index & 0x7fff);
- /*
- * IO-APIC RTE will be configured with virtual vector.
- * irq handler will do the explicit EOI to the io-apic.
- */
- entry->vector = attr->ioapic_pin;
- entry->mask = 0; /* enable IRQ */
- entry->trigger = attr->trigger;
- entry->polarity = attr->polarity;
-
- /* Mask level triggered irqs.
- * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
- */
- if (attr->trigger)
- entry->mask = 1;
-
- return 0;
-}
-
static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
unsigned int destination, int vector,
struct io_apic_irq_attr *attr)
{
- if (intr_remapping_enabled)
- return setup_ir_ioapic_entry(irq,
- (struct IR_IO_APIC_route_entry *)entry,
- destination, vector, attr);
+ if (irq_remapping_enabled)
+ return setup_ioapic_remapped_entry(irq, entry, destination,
+ vector, attr);
memset(entry, 0, sizeof(*entry));
@@ -1588,7 +1473,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
{
struct IO_APIC_route_entry entry;
- if (intr_remapping_enabled)
+ if (irq_remapping_enabled)
return;
memset(&entry, 0, sizeof(entry));
@@ -1674,7 +1559,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
printk(KERN_DEBUG ".... IRQ redirection table:\n");
- if (intr_remapping_enabled) {
+ if (irq_remapping_enabled) {
printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
" Pol Stat Indx2 Zero Vect:\n");
} else {
@@ -1683,7 +1568,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
}
for (i = 0; i <= reg_01.bits.entries; i++) {
- if (intr_remapping_enabled) {
+ if (irq_remapping_enabled) {
struct IO_APIC_route_entry entry;
struct IR_IO_APIC_route_entry *ir_entry;
@@ -2050,7 +1935,7 @@ void disable_IO_APIC(void)
* IOAPIC RTE as well as interrupt-remapping table entry).
* As this gets called during crash dump, keep this simple for now.
*/
- if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
+ if (ioapic_i8259.pin != -1 && !irq_remapping_enabled) {
struct IO_APIC_route_entry entry;
memset(&entry, 0, sizeof(entry));
@@ -2074,7 +1959,7 @@ void disable_IO_APIC(void)
* Use virtual wire A mode when interrupt remapping is enabled.
*/
if (cpu_has_apic || apic_from_smp_config())
- disconnect_bsp_APIC(!intr_remapping_enabled &&
+ disconnect_bsp_APIC(!irq_remapping_enabled &&
ioapic_i8259.pin != -1);
}
@@ -2390,71 +2275,6 @@ ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
return ret;
}
-#ifdef CONFIG_IRQ_REMAP
-
-/*
- * Migrate the IO-APIC irq in the presence of intr-remapping.
- *
- * For both level and edge triggered, irq migration is a simple atomic
- * update(of vector and cpu destination) of IRTE and flush the hardware cache.
- *
- * For level triggered, we eliminate the io-apic RTE modification (with the
- * updated vector information), by using a virtual vector (io-apic pin number).
- * Real vector that is used for interrupting cpu will be coming from
- * the interrupt-remapping table entry.
- *
- * As the migration is a simple atomic update of IRTE, the same mechanism
- * is used to migrate MSI irq's in the presence of interrupt-remapping.
- */
-static int
-ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
- bool force)
-{
- struct irq_cfg *cfg = data->chip_data;
- unsigned int dest, irq = data->irq;
- struct irte irte;
-
- if (!cpumask_intersects(mask, cpu_online_mask))
- return -EINVAL;
-
- if (get_irte(irq, &irte))
- return -EBUSY;
-
- if (assign_irq_vector(irq, cfg, mask))
- return -EBUSY;
-
- dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
-
- irte.vector = cfg->vector;
- irte.dest_id = IRTE_DEST(dest);
-
- /*
- * Atomically updates the IRTE with the new destination, vector
- * and flushes the interrupt entry cache.
- */
- modify_irte(irq, &irte);
-
- /*
- * After this point, all the interrupts will start arriving
- * at the new destination. So, time to cleanup the previous
- * vector allocation.
- */
- if (cfg->move_in_progress)
- send_cleanup_vector(cfg);
-
- cpumask_copy(data->affinity, mask);
- return 0;
-}
-
-#else
-static inline int
-ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
- bool force)
-{
- return 0;
-}
-#endif
-
asmlinkage void smp_irq_move_cleanup_interrupt(void)
{
unsigned vector, me;
@@ -2552,6 +2372,29 @@ static void ack_apic_edge(struct irq_data *data)
atomic_t irq_mis_count;
#ifdef CONFIG_GENERIC_PENDING_IRQ
+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
+{
+ struct irq_pin_list *entry;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
+ unsigned int reg;
+ int pin;
+
+ pin = entry->pin;
+ reg = io_apic_read(entry->apic, 0x10 + pin*2);
+ /* Is the remote IRR bit set? */
+ if (reg & IO_APIC_REDIR_REMOTE_IRR) {
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ return true;
+ }
+ }
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return false;
+}
+
static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
{
/* If we are moving the irq we need to mask it */
@@ -2699,7 +2542,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip)
chip->irq_eoi = ir_ack_apic_level;
#ifdef CONFIG_SMP
- chip->irq_set_affinity = ir_ioapic_set_affinity;
+ chip->irq_set_affinity = set_remapped_irq_affinity;
#endif
}
#endif /* CONFIG_IRQ_REMAP */
@@ -2912,7 +2755,7 @@ static inline void __init check_timer(void)
* 8259A.
*/
if (pin1 == -1) {
- if (intr_remapping_enabled)
+ if (irq_remapping_enabled)
panic("BIOS bug: timer not connected to IO-APIC");
pin1 = pin2;
apic1 = apic2;
@@ -2945,7 +2788,7 @@ static inline void __init check_timer(void)
clear_IO_APIC_pin(0, pin1);
goto out;
}
- if (intr_remapping_enabled)
+ if (irq_remapping_enabled)
panic("timer doesn't work through Interrupt-remapped IO-APIC");
local_irq_disable();
clear_IO_APIC_pin(apic1, pin1);
@@ -3169,7 +3012,7 @@ void destroy_irq(unsigned int irq)
irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
if (irq_remapped(cfg))
- free_irte(irq);
+ free_remapped_irq(irq);
raw_spin_lock_irqsave(&vector_lock, flags);
__clear_irq_vector(irq, cfg);
raw_spin_unlock_irqrestore(&vector_lock, flags);
@@ -3198,54 +3041,34 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
if (irq_remapped(cfg)) {
- struct irte irte;
- int ir_index;
- u16 sub_handle;
-
- ir_index = map_irq_to_irte_handle(irq, &sub_handle);
- BUG_ON(ir_index == -1);
-
- prepare_irte(&irte, cfg->vector, dest);
-
- /* Set source-id of interrupt request */
- if (pdev)
- set_msi_sid(&irte, pdev);
- else
- set_hpet_sid(&irte, hpet_id);
-
- modify_irte(irq, &irte);
+ compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id);
+ return err;
+ }
+ if (x2apic_enabled())
+ msg->address_hi = MSI_ADDR_BASE_HI |
+ MSI_ADDR_EXT_DEST_ID(dest);
+ else
msg->address_hi = MSI_ADDR_BASE_HI;
- msg->data = sub_handle;
- msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
- MSI_ADDR_IR_SHV |
- MSI_ADDR_IR_INDEX1(ir_index) |
- MSI_ADDR_IR_INDEX2(ir_index);
- } else {
- if (x2apic_enabled())
- msg->address_hi = MSI_ADDR_BASE_HI |
- MSI_ADDR_EXT_DEST_ID(dest);
- else
- msg->address_hi = MSI_ADDR_BASE_HI;
- msg->address_lo =
- MSI_ADDR_BASE_LO |
- ((apic->irq_dest_mode == 0) ?
- MSI_ADDR_DEST_MODE_PHYSICAL:
- MSI_ADDR_DEST_MODE_LOGICAL) |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- MSI_ADDR_REDIRECTION_CPU:
- MSI_ADDR_REDIRECTION_LOWPRI) |
- MSI_ADDR_DEST_ID(dest);
+ msg->address_lo =
+ MSI_ADDR_BASE_LO |
+ ((apic->irq_dest_mode == 0) ?
+ MSI_ADDR_DEST_MODE_PHYSICAL:
+ MSI_ADDR_DEST_MODE_LOGICAL) |
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
+ MSI_ADDR_REDIRECTION_CPU:
+ MSI_ADDR_REDIRECTION_LOWPRI) |
+ MSI_ADDR_DEST_ID(dest);
+
+ msg->data =
+ MSI_DATA_TRIGGER_EDGE |
+ MSI_DATA_LEVEL_ASSERT |
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
+ MSI_DATA_DELIVERY_FIXED:
+ MSI_DATA_DELIVERY_LOWPRI) |
+ MSI_DATA_VECTOR(cfg->vector);
- msg->data =
- MSI_DATA_TRIGGER_EDGE |
- MSI_DATA_LEVEL_ASSERT |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- MSI_DATA_DELIVERY_FIXED:
- MSI_DATA_DELIVERY_LOWPRI) |
- MSI_DATA_VECTOR(cfg->vector);
- }
return err;
}
@@ -3288,33 +3111,6 @@ static struct irq_chip msi_chip = {
.irq_retrigger = ioapic_retrigger_irq,
};
-/*
- * Map the PCI dev to the corresponding remapping hardware unit
- * and allocate 'nvec' consecutive interrupt-remapping table entries
- * in it.
- */
-static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
-{
- struct intel_iommu *iommu;
- int index;
-
- iommu = map_dev_to_ir(dev);
- if (!iommu) {
- printk(KERN_ERR
- "Unable to map PCI %s to iommu\n", pci_name(dev));
- return -ENOENT;
- }
-
- index = alloc_irte(iommu, irq, nvec);
- if (index < 0) {
- printk(KERN_ERR
- "Unable to allocate %d IRTE for PCI %s\n", nvec,
- pci_name(dev));
- return -ENOSPC;
- }
- return index;
-}
-
static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
{
struct irq_chip *chip = &msi_chip;
@@ -3345,7 +3141,6 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
int node, ret, sub_handle, index = 0;
unsigned int irq, irq_want;
struct msi_desc *msidesc;
- struct intel_iommu *iommu = NULL;
/* x86 doesn't support multiple MSI yet */
if (type == PCI_CAP_ID_MSI && nvec > 1)
@@ -3359,7 +3154,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
if (irq == 0)
return -1;
irq_want = irq + 1;
- if (!intr_remapping_enabled)
+ if (!irq_remapping_enabled)
goto no_ir;
if (!sub_handle) {
@@ -3367,23 +3162,16 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
* allocate the consecutive block of IRTE's
* for 'nvec'
*/
- index = msi_alloc_irte(dev, irq, nvec);
+ index = msi_alloc_remapped_irq(dev, irq, nvec);
if (index < 0) {
ret = index;
goto error;
}
} else {
- iommu = map_dev_to_ir(dev);
- if (!iommu) {
- ret = -ENOENT;
+ ret = msi_setup_remapped_irq(dev, irq, index,
+ sub_handle);
+ if (ret < 0)
goto error;
- }
- /*
- * setup the mapping between the irq and the IRTE
- * base index, the sub_handle pointing to the
- * appropriate interrupt remap table entry.
- */
- set_irte_irq(irq, iommu, index, sub_handle);
}
no_ir:
ret = setup_msi_irq(dev, msidesc, irq);
@@ -3501,15 +3289,8 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
struct msi_msg msg;
int ret;
- if (intr_remapping_enabled) {
- struct intel_iommu *iommu = map_hpet_to_ir(id);
- int index;
-
- if (!iommu)
- return -1;
-
- index = alloc_irte(iommu, irq, 1);
- if (index < 0)
+ if (irq_remapping_enabled) {
+ if (!setup_hpet_msi_remapped(irq, id))
return -1;
}
@@ -3888,8 +3669,8 @@ void __init setup_ioapic_dest(void)
else
mask = apic->target_cpus();
- if (intr_remapping_enabled)
- ir_ioapic_set_affinity(idata, mask, false);
+ if (irq_remapping_enabled)
+ set_remapped_irq_affinity(idata, mask, false);
else
ioapic_set_affinity(idata, mask, false);
}
@@ -3931,12 +3712,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
return res;
}
-void __init ioapic_and_gsi_init(void)
-{
- io_apic_ops.init();
-}
-
-static void __init __ioapic_init_mappings(void)
+void __init native_io_apic_init_mappings(void)
{
unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
struct resource *ioapic_res;
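
[Editor's note] With set_io_apic_ops() and the file-local io_apic_ops structure gone, the accessors are exported as native_io_apic_read/write/modify and routed through an x86_init-style ops structure that paravirt guests such as Xen can override at boot; the exact structure layout below is an assumption for illustration. A sketch of the resulting indirection:

/* Sketch, assuming the x86_io_apic_ops indirection this series
 * introduces (field layout is an assumption): bare metal keeps the
 * native MMIO accessors, paravirt replaces the pointers at boot. */
static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
{
	return x86_io_apic_ops.read(apic, reg);
}

static inline void io_apic_write(unsigned int apic, unsigned int reg,
				 unsigned int value)
{
	x86_io_apic_ops.write(apic, reg, value);
}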
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 00d2422ca7c9..f00a68cca37a 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -530,6 +530,7 @@ static struct apic __refdata apic_numaq = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index ff2c1b9aac4d..1b291da09e60 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -142,6 +142,7 @@ static struct apic apic_default = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index fea000b27f07..659897c00755 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -546,6 +546,7 @@ static struct apic apic_summit = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 48f3103b3c93..ff35cff0e1a7 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -260,6 +260,7 @@ static struct apic apic_x2apic_cluster = {
.read = native_apic_msr_read,
.write = native_apic_msr_write,
+ .eoi_write = native_apic_msr_eoi_write,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
.wait_icr_idle = native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 8a778db45e3a..c17e982db275 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -24,6 +24,12 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
if (x2apic_phys)
return x2apic_enabled();
+ else if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) &&
+ (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) &&
+ x2apic_enabled()) {
+ printk(KERN_DEBUG "System requires x2apic physical mode\n");
+ return 1;
+ }
else
return 0;
}
@@ -166,6 +172,7 @@ static struct apic apic_x2apic_phys = {
.read = native_apic_msr_read,
.write = native_apic_msr_write,
+ .eoi_write = native_apic_msr_eoi_write,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
.wait_icr_idle = native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 87bfa69e216e..c6d03f7a4401 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -404,6 +404,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
.read = native_apic_msr_read,
.write = native_apic_msr_write,
+ .eoi_write = native_apic_msr_eoi_write,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
.wait_icr_idle = native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 459e78cbf61e..07b0c0db466c 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -2401,7 +2401,7 @@ static void __exit apm_exit(void)
* (pm_idle), Wait for all processors to update cached/local
* copies of pm_idle before proceeding.
*/
- cpu_idle_wait();
+ kick_all_cpus_sync();
}
if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
&& (apm_info.connection_version > 0x0100)) {
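
[Editor's note] cpu_idle_wait() was removed in this cycle; kick_all_cpus_sync() provides the same guarantee that every CPU has passed through an interrupt boundary (and thus dropped any cached pm_idle pointer) by doing a synchronous no-op cross-call. A sketch of the idea (the real implementation lives in kernel/smp.c):

/* Sketch: a synchronous no-op IPI to all other CPUs.  The call
 * returns only after every online CPU has taken the interrupt, so
 * none of them can still be executing the old idle routine. */
static void do_nothing(void *unused)
{
}

void kick_all_cpus_sync(void)
{
	smp_call_function(do_nothing, NULL, 1);	/* wait = 1: synchronous */
}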
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 5da1269e8ddc..e2dbcb7dabdd 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -27,21 +27,29 @@ static int num_scan_areas;
static __init int set_corruption_check(char *arg)
{
- char *end;
+ ssize_t ret;
+ unsigned long val;
- memory_corruption_check = simple_strtol(arg, &end, 10);
+ ret = kstrtoul(arg, 10, &val);
+ if (ret)
+ return ret;
- return (*end == 0) ? 0 : -EINVAL;
+ memory_corruption_check = val;
+ return 0;
}
early_param("memory_corruption_check", set_corruption_check);
static __init int set_corruption_check_period(char *arg)
{
- char *end;
+ ssize_t ret;
+ unsigned long val;
- corruption_check_period = simple_strtoul(arg, &end, 10);
+ ret = kstrtoul(arg, 10, &val);
+ if (ret)
+ return ret;
- return (*end == 0) ? 0 : -EINVAL;
+ corruption_check_period = val;
+ return 0;
}
early_param("memory_corruption_check_period", set_corruption_check_period);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 0a44b90602b0..146bb6218eec 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -26,7 +26,8 @@
* contact AMD for precise details and a CPU swap.
*
* See http://www.multimania.com/poulot/k6bug.html
- * http://www.amd.com/K6/k6docs/revgd.html
+ * and section 2.6.2 of "AMD-K6 Processor Revision Guide - Model 6"
+ * (Publication # 21266 Issue Date: August 1998)
*
* The following test is erm.. interesting. AMD neglected to up
* the chip setting when fixing the bug but they also tweaked some
@@ -94,7 +95,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
"system stability may be impaired when more than 32 MB are used.\n");
else
printk(KERN_CONT "probably OK (after B9730xxxx).\n");
- printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
}
/* K6 with old style WHCR */
@@ -353,10 +353,11 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
node = per_cpu(cpu_llc_id, cpu);
/*
- * If core numbers are inconsistent, it's likely a multi-fabric platform,
- * so invoke platform-specific handler
+	 * On a multi-fabric platform (e.g. Numascale NumaChip) a
+	 * platform-specific handler needs to be called to fix up some
+ * IDs of the CPU.
*/
- if (c->phys_proc_id != node)
+ if (x86_cpuinit.fixup_cpu_id)
x86_cpuinit.fixup_cpu_id(c, node);
if (!node_online(node)) {
@@ -579,6 +580,24 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
}
}
+ /* re-enable TopologyExtensions if switched off by BIOS */
+ if ((c->x86 == 0x15) &&
+ (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
+ !cpu_has(c, X86_FEATURE_TOPOEXT)) {
+ u64 val;
+
+ if (!rdmsrl_amd_safe(0xc0011005, &val)) {
+ val |= 1ULL << 54;
+ wrmsrl_amd_safe(0xc0011005, val);
+ rdmsrl(0xc0011005, val);
+ if (val & (1ULL << 54)) {
+ set_cpu_cap(c, X86_FEATURE_TOPOEXT);
+ printk(KERN_INFO FW_INFO "CPU: Re-enabling "
+ "disabled Topology Extensions Support\n");
+ }
+ }
+ }
+
cpu_detect_cache_sizes(c);
/* Multi core CPU? */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 67e258362a3d..6b9333b429ba 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1101,14 +1101,20 @@ int is_debug_stack(unsigned long addr)
addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
}
+static DEFINE_PER_CPU(u32, debug_stack_use_ctr);
+
void debug_stack_set_zero(void)
{
+ this_cpu_inc(debug_stack_use_ctr);
load_idt((const struct desc_ptr *)&nmi_idt_descr);
}
void debug_stack_reset(void)
{
- load_idt((const struct desc_ptr *)&idt_descr);
+ if (WARN_ON(!this_cpu_read(debug_stack_use_ctr)))
+ return;
+ if (this_cpu_dec_return(debug_stack_use_ctr) == 0)
+ load_idt((const struct desc_ptr *)&idt_descr);
}
#else /* CONFIG_X86_64 */
@@ -1163,15 +1169,6 @@ static void dbg_restore_debug_regs(void)
#endif /* ! CONFIG_KGDB */
/*
- * Prints an error where the NUMA and configured core-number mismatch and the
- * platform didn't override this to fix it up
- */
-void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node)
-{
- pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id);
-}
-
-/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
* and IDT. We reload them nevertheless, this function acts as a
@@ -1194,7 +1191,7 @@ void __cpuinit cpu_init(void)
oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA
- if (cpu != 0 && percpu_read(numa_node) == 0 &&
+ if (cpu != 0 && this_cpu_read(numa_node) == 0 &&
early_cpu_to_node(cpu) != NUMA_NO_NODE)
set_numa_node(early_cpu_to_node(cpu));
#endif
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 73d08ed98a64..9a7c90d80bc4 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -433,14 +433,14 @@ int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot,
/* check if @slot is already used or the index is already disabled */
ret = amd_get_l3_disable_slot(nb, slot);
if (ret >= 0)
- return -EINVAL;
+ return -EEXIST;
if (index > nb->l3_cache.indices)
return -EINVAL;
/* check whether the other slot has disabled the same index already */
if (index == amd_get_l3_disable_slot(nb, !slot))
- return -EINVAL;
+ return -EEXIST;
amd_l3_disable_index(nb, cpu, slot, index);
@@ -468,8 +468,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val);
if (err) {
if (err == -EEXIST)
- printk(KERN_WARNING "L3 disable slot %d in use!\n",
- slot);
+ pr_warning("L3 slot %d in use/index already disabled!\n",
+ slot);
return err;
}
return count;
@@ -615,14 +615,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
new_l2 = this_leaf.size/1024;
num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
index_msb = get_count_order(num_threads_sharing);
- l2_id = c->apicid >> index_msb;
+ l2_id = c->apicid & ~((1 << index_msb) - 1);
break;
case 3:
new_l3 = this_leaf.size/1024;
num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
index_msb = get_count_order(
num_threads_sharing);
- l3_id = c->apicid >> index_msb;
+ l3_id = c->apicid & ~((1 << index_msb) - 1);
break;
default:
break;
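
The two hunks above change how the shared-cache ID is derived from the APIC ID: instead of shifting the thread-sibling bits out (which compresses the ID), the low index_msb bits are masked off, so the cache ID stays directly comparable with APIC IDs. Not part of the patch, a small stand-alone sketch with made-up values:

#include <stdio.h>

/* get_count_order(n): smallest k such that (1 << k) >= n */
static int get_count_order(unsigned int n)
{
	int k = 0;

	while ((1u << k) < n)
		k++;
	return k;
}

int main(void)
{
	unsigned int apicid = 0x1b;		/* example APIC ID */
	unsigned int num_threads_sharing = 4;	/* threads sharing the cache */
	int index_msb = get_count_order(num_threads_sharing);	/* 2 */

	/* old: compressed ID; new: APIC ID with the thread bits masked off */
	printf("shifted: %#x\n", apicid >> index_msb);			/* 0x6 */
	printf("masked:  %#x\n", apicid & ~((1u << index_msb) - 1));	/* 0x18 */
	return 0;
}
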
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 5502b289341b..36565373af87 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -23,7 +23,7 @@
* %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
*
* Arrays used to match for this should also be declared using
- * MODULE_DEVICE_TABLE(x86_cpu, ...)
+ * MODULE_DEVICE_TABLE(x86cpu, ...)
*
* This always matches against the boot cpu, assuming models and features are
* consistent over all CPUs.
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 507ea58688e2..cd8b166a1735 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -42,7 +42,8 @@ void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
struct mce m;
/* Only corrected MC is reported */
- if (!corrected)
+ if (!corrected || !(mem_err->validation_bits &
+ CPER_MEM_VALID_PHYSICAL_ADDRESS))
return;
mce_setup(&m);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 0c82091b1652..413c2ced887c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -126,6 +126,16 @@ static struct severity {
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
USER
),
+ MCESEV(
+ KEEP, "HT thread notices Action required: instruction fetch error",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ MCGMASK(MCG_STATUS_EIPV, 0)
+ ),
+ MCESEV(
+ AR, "Action required: instruction fetch error",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ USER
+ ),
#endif
MCESEV(
PANIC, "Action required: unknown MCACOD",
@@ -165,15 +175,19 @@ static struct severity {
};
/*
- * If the EIPV bit is set, it means the saved IP is the
- * instruction which caused the MCE.
+ * If mcgstatus indicated that ip/cs on the stack were
+ * no good, then "m->cs" will be zero and we will have
+ * to assume the worst case (IN_KERNEL) as we actually
+ * have no idea what we were executing when the machine
+ * check hit.
+ * If we do have a good "m->cs" (or a faked one in the
+ * case we were executing in VM86 mode) we can use it to
+ * distinguish an exception taken in user mode from one
+ * taken in the kernel.
*/
static int error_context(struct mce *m)
{
- if (m->mcgstatus & MCG_STATUS_EIPV)
- return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
- /* Unknown, assume kernel */
- return IN_KERNEL;
+ return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
}
int mce_severity(struct mce *m, int tolerant, char **msg)
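
The rewritten error_context() above no longer trusts EIPV; it relies solely on the saved CS, which mce_gather_info() zeroes when the stack values are unusable and forces to ring 3 for VM86 (see the mce.c hunk below). A tiny sketch of the ring test, outside the patch:

#include <stdio.h>

enum context { IN_KERNEL, IN_USER };

/* The low two bits of CS hold the privilege ring; 3 means user mode.
 * mce_gather_info() ORs 3 into cs for VM86 so the test still works. */
static enum context error_context(unsigned int cs)
{
	return ((cs & 3) == 3) ? IN_USER : IN_KERNEL;
}

int main(void)
{
	printf("%d %d\n", error_context(0x10), error_context(0x33));	/* 0 1 */
	return 0;
}
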
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d086a09c087d..da27c5d2168a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -437,6 +437,14 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
m->ip = regs->ip;
m->cs = regs->cs;
+
+ /*
+ * When in VM86 mode, always make the cs look like ring 3.
+ * This is a lie, but it's better than passing
+ * the additional vm86 bit around everywhere.
+ */
+ if (v8086_mode(regs))
+ m->cs |= 3;
}
/* Use accurate RIP reporting if available. */
if (rip_msr)
@@ -583,7 +591,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
struct mce m;
int i;
- percpu_inc(mce_poll_count);
+ this_cpu_inc(mce_poll_count);
mce_gather_info(&m, NULL);
@@ -641,16 +649,18 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
*/
-static int mce_no_way_out(struct mce *m, char **msg)
+static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp)
{
- int i;
+ int i, ret = 0;
for (i = 0; i < banks; i++) {
m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
+ if (m->status & MCI_STATUS_VAL)
+ __set_bit(i, validp);
if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
- return 1;
+ ret = 1;
}
- return 0;
+ return ret;
}
/*
@@ -945,9 +955,10 @@ struct mce_info {
atomic_t inuse;
struct task_struct *t;
__u64 paddr;
+ int restartable;
} mce_info[MCE_INFO_MAX];
-static void mce_save_info(__u64 addr)
+static void mce_save_info(__u64 addr, int c)
{
struct mce_info *mi;
@@ -955,6 +966,7 @@ static void mce_save_info(__u64 addr)
if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
mi->t = current;
mi->paddr = addr;
+ mi->restartable = c;
return;
}
}
@@ -1011,11 +1023,12 @@ void do_machine_check(struct pt_regs *regs, long error_code)
*/
int kill_it = 0;
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
+ DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
char *msg = "Unknown";
atomic_inc(&mce_entry);
- percpu_inc(mce_exception_count);
+ this_cpu_inc(mce_exception_count);
if (!banks)
goto out;
@@ -1025,7 +1038,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
final = &__get_cpu_var(mces_seen);
*final = m;
- no_way_out = mce_no_way_out(&m, &msg);
+ memset(valid_banks, 0, sizeof(valid_banks));
+ no_way_out = mce_no_way_out(&m, &msg, valid_banks);
barrier();
@@ -1045,6 +1059,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
order = mce_start(&no_way_out);
for (i = 0; i < banks; i++) {
__clear_bit(i, toclear);
+ if (!test_bit(i, valid_banks))
+ continue;
if (!mce_banks[i].ctl)
continue;
@@ -1130,7 +1146,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
mce_panic("Fatal machine check on current CPU", &m, msg);
if (worst == MCE_AR_SEVERITY) {
/* schedule action before return to userland */
- mce_save_info(m.addr);
+ mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
set_thread_flag(TIF_MCE_NOTIFY);
} else if (kill_it) {
force_sig(SIGBUS, current);
@@ -1179,7 +1195,13 @@ void mce_notify_process(void)
pr_err("Uncorrected hardware memory error in user-access at %llx",
mi->paddr);
- if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) {
+ /*
+ * We must call memory_failure() here even if the current process is
+ * doomed. We still need to mark the page as poisoned and alert any
+ * other users of the page.
+ */
+ if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 ||
+ mi->restartable == 0) {
pr_err("Memory error not recovered");
force_sig(SIGBUS, current);
}
@@ -1229,15 +1251,15 @@ void mce_log_therm_throt_event(__u64 status)
* poller finds an MCE, poll 2x faster. When the poller finds no more
* errors, poll 2x slower (up to check_interval seconds).
*/
-static int check_interval = 5 * 60; /* 5 minutes */
+static unsigned long check_interval = 5 * 60; /* 5 minutes */
-static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
+static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
-static void mce_start_timer(unsigned long data)
+static void mce_timer_fn(unsigned long data)
{
- struct timer_list *t = &per_cpu(mce_timer, data);
- int *n;
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ unsigned long iv;
WARN_ON(smp_processor_id() != data);
@@ -1250,13 +1272,14 @@ static void mce_start_timer(unsigned long data)
* Alert userspace if needed. If we logged an MCE, reduce the
* polling interval, otherwise increase the polling interval.
*/
- n = &__get_cpu_var(mce_next_interval);
+ iv = __this_cpu_read(mce_next_interval);
if (mce_notify_irq())
- *n = max(*n/2, HZ/100);
+ iv = max(iv / 2, (unsigned long) HZ/100);
else
- *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
+ iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+ __this_cpu_write(mce_next_interval, iv);
- t->expires = jiffies + *n;
+ t->expires = jiffies + iv;
add_timer_on(t, smp_processor_id());
}
@@ -1423,6 +1446,43 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
*/
if (c->x86 == 6 && banks > 0)
mce_banks[0].ctl = 0;
+
+ /*
+ * Turn off MC4_MISC thresholding banks on those models since
+ * they're not supported there.
+ */
+ if (c->x86 == 0x15 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
+ int i;
+ u64 val, hwcr;
+ bool need_toggle;
+ u32 msrs[] = {
+ 0x00000413, /* MC4_MISC0 */
+ 0xc0000408, /* MC4_MISC1 */
+ };
+
+ rdmsrl(MSR_K7_HWCR, hwcr);
+
+ /* McStatusWrEn has to be set */
+ need_toggle = !(hwcr & BIT(18));
+
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+
+ for (i = 0; i < ARRAY_SIZE(msrs); i++) {
+ rdmsrl(msrs[i], val);
+
+ /* CntP bit set? */
+ if (val & BIT_64(62)) {
+ val &= ~BIT_64(62);
+ wrmsrl(msrs[i], val);
+ }
+ }
+
+ /* restore old settings */
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr);
+ }
}
if (c->x86_vendor == X86_VENDOR_INTEL) {
@@ -1497,17 +1557,17 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
static void __mcheck_cpu_init_timer(void)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
- int *n = &__get_cpu_var(mce_next_interval);
+ unsigned long iv = check_interval * HZ;
- setup_timer(t, mce_start_timer, smp_processor_id());
+ setup_timer(t, mce_timer_fn, smp_processor_id());
if (mce_ignore_ce)
return;
- *n = check_interval * HZ;
- if (!*n)
+ __this_cpu_write(mce_next_interval, iv);
+ if (!iv)
return;
- t->expires = round_jiffies(jiffies + *n);
+ t->expires = round_jiffies(jiffies + iv);
add_timer_on(t, smp_processor_id());
}
@@ -2217,7 +2277,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_DOWN_FAILED_FROZEN:
if (!mce_ignore_ce && check_interval) {
t->expires = round_jiffies(jiffies +
- __get_cpu_var(mce_next_interval));
+ per_cpu(mce_next_interval, cpu));
add_timer_on(t, cpu);
}
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
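
The timer rework above switches the per-CPU polling state to unsigned long jiffies but keeps the adaptive policy: halve the interval (floor HZ/100) after the poller logs an event, double it (ceiling check_interval seconds) otherwise. A minimal sketch of that back-off, assuming HZ=1000 and ignoring round_jiffies_relative():

#include <stdio.h>

#define HZ 1000UL				/* assumed tick rate */
static const unsigned long check_interval = 5 * 60;	/* seconds */

/* Next poll interval in jiffies: 2x faster after a logged event,
 * 2x slower otherwise, clamped to [HZ/100, check_interval*HZ]. */
static unsigned long next_interval(unsigned long iv, int logged_event)
{
	if (logged_event)
		return iv / 2 > HZ / 100 ? iv / 2 : HZ / 100;
	return iv * 2 < check_interval * HZ ? iv * 2 : check_interval * HZ;
}

int main(void)
{
	unsigned long iv = check_interval * HZ;	/* 300000 jiffies */

	iv = next_interval(iv, 1);	/* event seen: 150000 */
	iv = next_interval(iv, 0);	/* quiet again: back to 300000 */
	printf("%lu\n", iv);
	return 0;
}
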
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 99b57179f912..f4873a64f46d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -51,6 +51,7 @@ struct threshold_block {
unsigned int cpu;
u32 address;
u16 interrupt_enable;
+ bool interrupt_capable;
u16 threshold_limit;
struct kobject kobj;
struct list_head miscj;
@@ -83,6 +84,21 @@ struct thresh_restart {
u16 old_limit;
};
+static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
+{
+ /*
+ * bank 4 supports APIC LVT interrupts implicitly since forever.
+ */
+ if (bank == 4)
+ return true;
+
+ /*
+ * IntP: interrupt present; if this bit is set, the thresholding
+ * bank can generate APIC LVT interrupts
+ */
+ return msr_high_bits & BIT(28);
+}
+
static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
{
int msr = (hi & MASK_LVTOFF_HI) >> 20;
@@ -104,8 +120,10 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
return 1;
};
-/* must be called with correct cpu affinity */
-/* Called via smp_call_function_single() */
+/*
+ * Called via smp_call_function_single(), must be called with correct
+ * cpu affinity.
+ */
static void threshold_restart_bank(void *_tr)
{
struct thresh_restart *tr = _tr;
@@ -128,6 +146,12 @@ static void threshold_restart_bank(void *_tr)
(new_count & THRESHOLD_MAX);
}
+ /* clear IntType */
+ hi &= ~MASK_INT_TYPE_HI;
+
+ if (!tr->b->interrupt_capable)
+ goto done;
+
if (tr->set_lvt_off) {
if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
/* set new lvt offset */
@@ -136,9 +160,10 @@ static void threshold_restart_bank(void *_tr)
}
}
- tr->b->interrupt_enable ?
- (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
- (hi &= ~MASK_INT_TYPE_HI);
+ if (tr->b->interrupt_enable)
+ hi |= INT_TYPE_APIC;
+
+ done:
hi |= MASK_COUNT_EN_HI;
wrmsr(tr->b->address, lo, hi);
@@ -202,14 +227,17 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
if (shared_bank[bank] && c->cpu_core_id)
break;
- offset = setup_APIC_mce(offset,
- (high & MASK_LVTOFF_HI) >> 20);
-
memset(&b, 0, sizeof(b));
- b.cpu = cpu;
- b.bank = bank;
- b.block = block;
- b.address = address;
+ b.cpu = cpu;
+ b.bank = bank;
+ b.block = block;
+ b.address = address;
+ b.interrupt_capable = lvt_interrupt_supported(bank, high);
+
+ if (b.interrupt_capable) {
+ int new = (high & MASK_LVTOFF_HI) >> 20;
+ offset = setup_APIC_mce(offset, new);
+ }
mce_threshold_block_init(&b, offset);
mce_threshold_vector = amd_threshold_interrupt;
@@ -309,6 +337,9 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
struct thresh_restart tr;
unsigned long new;
+ if (!b->interrupt_capable)
+ return -EINVAL;
+
if (strict_strtoul(buf, 0, &new) < 0)
return -EINVAL;
@@ -390,10 +421,10 @@ RW_ATTR(threshold_limit);
RW_ATTR(error_count);
static struct attribute *default_attrs[] = {
- &interrupt_enable.attr,
&threshold_limit.attr,
&error_count.attr,
- NULL
+ NULL, /* possibly interrupt_enable if supported, see below */
+ NULL,
};
#define to_block(k) container_of(k, struct threshold_block, kobj)
@@ -467,8 +498,14 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
b->cpu = cpu;
b->address = address;
b->interrupt_enable = 0;
+ b->interrupt_capable = lvt_interrupt_supported(bank, high);
b->threshold_limit = THRESHOLD_MAX;
+ if (b->interrupt_capable)
+ threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
+ else
+ threshold_ktype.default_attrs[2] = NULL;
+
INIT_LIST_HEAD(&b->miscj);
if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
index dfea390e1608..c7b3fe2d72e0 100644
--- a/arch/x86/kernel/cpu/mkcapflags.pl
+++ b/arch/x86/kernel/cpu/mkcapflags.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/perl -w
#
# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h
#
@@ -11,22 +11,35 @@ open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
print OUT "#include <asm/cpufeature.h>\n\n";
print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
+%features = ();
+$err = 0;
+
while (defined($line = <IN>)) {
if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) {
$macro = $1;
- $feature = $2;
+ $feature = "\L$2";
$tail = $3;
if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) {
- $feature = $1;
+ $feature = "\L$1";
}
- if ($feature ne '') {
- printf OUT "\t%-32s = \"%s\",\n",
- "[$macro]", "\L$feature";
+ next if ($feature eq '');
+
+ if ($features{$feature}++) {
+ print STDERR "$in: duplicate feature name: $feature\n";
+ $err++;
}
+ printf OUT "\t%-32s = \"%s\",\n", "[$macro]", $feature;
}
}
print OUT "};\n";
close(IN);
close(OUT);
+
+if ($err) {
+ unlink($out);
+ exit(1);
+}
+
+exit(0);
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index ac140c7be396..bdda2e6c673b 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -266,7 +266,7 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
if (align > max_align)
align = max_align;
- sizek = 1 << align;
+ sizek = 1UL << align;
if (debug_print) {
char start_factor = 'K', size_factor = 'K';
unsigned long start_base, size_base;
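
The 1UL fix above matters because a plain 1 is a 32-bit int, so 1 << align is undefined once align reaches 32 and sizek would be truncated on 64-bit kernels; widening the literal makes the shift happen in unsigned long. A quick illustration, not part of the patch:

#include <stdio.h>

int main(void)
{
	unsigned int align = 33;	/* plausible for a large MTRR range */
	unsigned long sizek;

	/* "1 << align" shifts a 32-bit int: undefined for align >= 32
	 * (x86 would effectively compute 1 << 1). The 1UL literal makes
	 * the shift happen in unsigned long width. */
	sizek = 1UL << align;
	printf("sizek = %lu KiB\n", sizek);	/* 8589934592 on LP64 */
	return 0;
}
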
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index bb8e03407e18..c4706cf9c011 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -484,9 +484,6 @@ static int __x86_pmu_event_init(struct perf_event *event)
/* mark unused */
event->hw.extra_reg.idx = EXTRA_REG_NONE;
-
- /* mark not used */
- event->hw.extra_reg.idx = EXTRA_REG_NONE;
event->hw.branch_reg.idx = EXTRA_REG_NONE;
return x86_pmu.hw_config(event);
@@ -1186,8 +1183,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
int idx, handled = 0;
u64 val;
- perf_sample_data_init(&data, 0);
-
cpuc = &__get_cpu_var(cpu_hw_events);
/*
@@ -1222,7 +1217,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
* event overflow
*/
handled++;
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
if (!x86_perf_event_set_period(event))
continue;
@@ -1501,6 +1496,7 @@ static struct cpu_hw_events *allocate_fake_cpuc(void)
if (!cpuc->shared_regs)
goto error;
}
+ cpuc->is_fake = 1;
return cpuc;
error:
free_fake_cpuc(cpuc);
@@ -1761,6 +1757,12 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
}
+static inline int
+valid_user_frame(const void __user *fp, unsigned long size)
+{
+ return (__range_not_ok(fp, size, TASK_SIZE) == 0);
+}
+
#ifdef CONFIG_COMPAT
#include <asm/compat.h>
@@ -1785,7 +1787,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
if (bytes != sizeof(frame))
break;
- if (fp < compat_ptr(regs->sp))
+ if (!valid_user_frame(fp, sizeof(frame)))
break;
perf_callchain_store(entry, frame.return_address);
@@ -1831,7 +1833,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
if (bytes != sizeof(frame))
break;
- if ((unsigned long)fp < regs->sp)
+ if (!valid_user_frame(fp, sizeof(frame)))
break;
perf_callchain_store(entry, frame.return_address);
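
Both user-callchain hunks above replace the ad-hoc fp-below-sp check with valid_user_frame(), which bounds the whole frame against TASK_SIZE and so also rejects kernel pointers smuggled into a user stack. A rough user-space sketch of the predicate (the TASK_SIZE value is illustrative):

#include <stdint.h>
#include <stdio.h>

#define TASK_SIZE 0x7ffffffff000ULL	/* illustrative x86-64 user limit */

/* Dereference a frame pointer only if [fp, fp+size) fits below the
 * user/kernel split; this also catches address wrap-around. */
static int valid_user_frame(uint64_t fp, uint64_t size)
{
	return fp + size >= fp && fp + size <= TASK_SIZE;
}

int main(void)
{
	printf("%d\n", valid_user_frame(0x7fffffffe000ULL, 16));	/* 1 */
	printf("%d\n", valid_user_frame(0xffff880000000000ULL, 16));	/* 0 */
	return 0;
}
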
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 6638aaf54493..7241e2fc3c17 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -117,6 +117,7 @@ struct cpu_hw_events {
struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
unsigned int group_flag;
+ int is_fake;
/*
* Intel DebugStore bits
@@ -364,6 +365,7 @@ struct x86_pmu {
int pebs_record_size;
void (*drain_pebs)(struct pt_regs *regs);
struct event_constraint *pebs_constraints;
+ void (*pebs_aliases)(struct perf_event *event);
/*
* Intel LBR
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 95e7fe1c5f0b..11a4eb9131d5 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -134,8 +134,13 @@ static u64 amd_pmu_event_map(int hw_event)
static int amd_pmu_hw_config(struct perf_event *event)
{
- int ret = x86_pmu_hw_config(event);
+ int ret;
+ /* pass precise event sampling to ibs: */
+ if (event->attr.precise_ip && get_ibs_caps())
+ return -ENOENT;
+
+ ret = x86_pmu_hw_config(event);
if (ret)
return ret;
@@ -205,10 +210,8 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
* when we come here
*/
for (i = 0; i < x86_pmu.num_counters; i++) {
- if (nb->owners[i] == event) {
- cmpxchg(nb->owners+i, event, NULL);
+ if (cmpxchg(nb->owners + i, event, NULL) == event)
break;
- }
}
}
@@ -493,6 +496,7 @@ static __initconst const struct x86_pmu amd_pmu = {
* 0x023 DE PERF_CTL[2:0]
* 0x02D LS PERF_CTL[3]
* 0x02E LS PERF_CTL[3,0]
+ * 0x031 LS PERF_CTL[2:0] (**)
* 0x043 CU PERF_CTL[2:0]
* 0x045 CU PERF_CTL[2:0]
* 0x046 CU PERF_CTL[2:0]
@@ -506,10 +510,12 @@ static __initconst const struct x86_pmu amd_pmu = {
* 0x0DD LS PERF_CTL[5:0]
* 0x0DE LS PERF_CTL[5:0]
* 0x0DF LS PERF_CTL[5:0]
+ * 0x1C0 EX PERF_CTL[5:3]
* 0x1D6 EX PERF_CTL[5:0]
* 0x1D8 EX PERF_CTL[5:0]
*
- * (*) depending on the umask all FPU counters may be used
+ * (*) depending on the umask all FPU counters may be used
+ * (**) only one unitmask enabled at a time
*/
static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
@@ -559,6 +565,12 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev
return &amd_f15_PMC3;
case 0x02E:
return &amd_f15_PMC30;
+ case 0x031:
+ if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
+ return &amd_f15_PMC20;
+ return &emptyconstraint;
+ case 0x1C0:
+ return &amd_f15_PMC53;
default:
return &amd_f15_PMC50;
}
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 3b8a2d30d14e..da9bcdcd9856 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -9,6 +9,7 @@
#include <linux/perf_event.h>
#include <linux/module.h>
#include <linux/pci.h>
+#include <linux/ptrace.h>
#include <asm/apic.h>
@@ -16,36 +17,591 @@ static u32 ibs_caps;
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
-static struct pmu perf_ibs;
+#include <linux/kprobes.h>
+#include <linux/hardirq.h>
+
+#include <asm/nmi.h>
+
+#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
+#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
+
+enum ibs_states {
+ IBS_ENABLED = 0,
+ IBS_STARTED = 1,
+ IBS_STOPPING = 2,
+
+ IBS_MAX_STATES,
+};
+
+struct cpu_perf_ibs {
+ struct perf_event *event;
+ unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)];
+};
+
+struct perf_ibs {
+ struct pmu pmu;
+ unsigned int msr;
+ u64 config_mask;
+ u64 cnt_mask;
+ u64 enable_mask;
+ u64 valid_mask;
+ u64 max_period;
+ unsigned long offset_mask[1];
+ int offset_max;
+ struct cpu_perf_ibs __percpu *pcpu;
+ u64 (*get_count)(u64 config);
+};
+
+struct perf_ibs_data {
+ u32 size;
+ union {
+ u32 data[0]; /* data buffer starts here */
+ u32 caps;
+ };
+ u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX];
+};
+
+static int
+perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
+{
+ s64 left = local64_read(&hwc->period_left);
+ s64 period = hwc->sample_period;
+ int overflow = 0;
+
+ /*
+ * If we are way outside a reasonable range then just skip forward:
+ */
+ if (unlikely(left <= -period)) {
+ left = period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ overflow = 1;
+ }
+
+ if (unlikely(left < (s64)min)) {
+ left += period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ overflow = 1;
+ }
+
+ /*
+ * If the hw period that triggers the sw overflow is too short
+ * we might hit the irq handler. This biases the results.
+ * Thus we shorten the next-to-last period and set the last
+ * period to the max period.
+ */
+ if (left > max) {
+ left -= max;
+ if (left > max)
+ left = max;
+ else if (left < min)
+ left = min;
+ }
+
+ *hw_period = (u64)left;
+
+ return overflow;
+}
+
+static int
+perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int shift = 64 - width;
+ u64 prev_raw_count;
+ u64 delta;
+
+ /*
+ * Careful: an NMI might modify the previous event value.
+ *
+ * Our tactic to handle this is to first atomically read and
+ * exchange a new raw count - then add that new-prev delta
+ * count to the generic event atomically:
+ */
+ prev_raw_count = local64_read(&hwc->prev_count);
+ if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ new_raw_count) != prev_raw_count)
+ return 0;
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (event-)time and add that to the generic event.
+ *
+ * Careful, not all hw sign-extends above the physical width
+ * of the count.
+ */
+ delta = (new_raw_count << shift) - (prev_raw_count << shift);
+ delta >>= shift;
+
+ local64_add(delta, &event->count);
+ local64_sub(delta, &hwc->period_left);
+
+ return 1;
+}
+
+static struct perf_ibs perf_ibs_fetch;
+static struct perf_ibs perf_ibs_op;
+
+static struct perf_ibs *get_ibs_pmu(int type)
+{
+ if (perf_ibs_fetch.pmu.type == type)
+ return &perf_ibs_fetch;
+ if (perf_ibs_op.pmu.type == type)
+ return &perf_ibs_op;
+ return NULL;
+}
+
+/*
+ * Use IBS for precise event sampling:
+ *
+ * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count
+ * perf record -a -e r076:p ... # same as -e cpu-cycles:p
+ * perf record -a -e r0C1:p ... # use ibs op counting micro-ops
+ *
+ * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
+ * MSRC001_1033) is used to select either cycle or micro-ops counting
+ * mode.
+ *
+ * The rip of IBS samples has skid 0. Thus, IBS supports precise
+ * levels 1 and 2 and PERF_EFLAGS_EXACT is set. In rare cases the
+ * rip is invalid when IBS was not able to record the rip correctly.
+ * We clear PERF_EFLAGS_EXACT and then take the rip from pt_regs.
+ *
+ */
+static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
+{
+ switch (event->attr.precise_ip) {
+ case 0:
+ return -ENOENT;
+ case 1:
+ case 2:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ switch (event->attr.type) {
+ case PERF_TYPE_HARDWARE:
+ switch (event->attr.config) {
+ case PERF_COUNT_HW_CPU_CYCLES:
+ *config = 0;
+ return 0;
+ }
+ break;
+ case PERF_TYPE_RAW:
+ switch (event->attr.config) {
+ case 0x0076:
+ *config = 0;
+ return 0;
+ case 0x00C1:
+ *config = IBS_OP_CNT_CTL;
+ return 0;
+ }
+ break;
+ default:
+ return -ENOENT;
+ }
+
+ return -EOPNOTSUPP;
+}
static int perf_ibs_init(struct perf_event *event)
{
- if (perf_ibs.type != event->attr.type)
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_ibs *perf_ibs;
+ u64 max_cnt, config;
+ int ret;
+
+ perf_ibs = get_ibs_pmu(event->attr.type);
+ if (perf_ibs) {
+ config = event->attr.config;
+ } else {
+ perf_ibs = &perf_ibs_op;
+ ret = perf_ibs_precise_event(event, &config);
+ if (ret)
+ return ret;
+ }
+
+ if (event->pmu != &perf_ibs->pmu)
return -ENOENT;
+
+ if (config & ~perf_ibs->config_mask)
+ return -EINVAL;
+
+ if (hwc->sample_period) {
+ if (config & perf_ibs->cnt_mask)
+ /* raw max_cnt may not be set */
+ return -EINVAL;
+ if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
+ /*
+ * lower 4 bits cannot be set in ibs max cnt,
+ * but we allow it in case we adjust the
+ * sample period to set a frequency.
+ */
+ return -EINVAL;
+ hwc->sample_period &= ~0x0FULL;
+ if (!hwc->sample_period)
+ hwc->sample_period = 0x10;
+ } else {
+ max_cnt = config & perf_ibs->cnt_mask;
+ config &= ~perf_ibs->cnt_mask;
+ event->attr.sample_period = max_cnt << 4;
+ hwc->sample_period = event->attr.sample_period;
+ }
+
+ if (!hwc->sample_period)
+ return -EINVAL;
+
+ /*
+ * If we modify hwc->sample_period, we also need to update
+ * hwc->last_period and hwc->period_left.
+ */
+ hwc->last_period = hwc->sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+
+ hwc->config_base = perf_ibs->msr;
+ hwc->config = config;
+
return 0;
}
+static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 *period)
+{
+ int overflow;
+
+ /* ignore lower 4 bits in min count: */
+ overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
+ local64_set(&hwc->prev_count, 0);
+
+ return overflow;
+}
+
+static u64 get_ibs_fetch_count(u64 config)
+{
+ return (config & IBS_FETCH_CNT) >> 12;
+}
+
+static u64 get_ibs_op_count(u64 config)
+{
+ u64 count = 0;
+
+ if (config & IBS_OP_VAL)
+ count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
+
+ if (ibs_caps & IBS_CAPS_RDWROPCNT)
+ count += (config & IBS_OP_CUR_CNT) >> 32;
+
+ return count;
+}
+
+static void
+perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
+ u64 *config)
+{
+ u64 count = perf_ibs->get_count(*config);
+
+ /*
+ * Set width to 64 since we do not overflow on max width but
+ * instead on max count. In perf_ibs_set_period() we clear
+ * prev count manually on overflow.
+ */
+ while (!perf_event_try_update(event, count, 64)) {
+ rdmsrl(event->hw.config_base, *config);
+ count = perf_ibs->get_count(*config);
+ }
+}
+
+static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 config)
+{
+ wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
+}
+
+/*
+ * Erratum #420 Instruction-Based Sampling Engine May Generate
+ * Interrupt that Cannot Be Cleared:
+ *
+ * Must clear counter mask first, then clear the enable bit. See
+ * Revision Guide for AMD Family 10h Processors, Publication #41322.
+ */
+static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 config)
+{
+ config &= ~perf_ibs->cnt_mask;
+ wrmsrl(hwc->config_base, config);
+ config &= ~perf_ibs->enable_mask;
+ wrmsrl(hwc->config_base, config);
+}
+
+/*
+ * We cannot restore the ibs pmu state, so we always need to update
+ * the event while stopping it and then reset the state when starting
+ * again. Thus, we ignore the PERF_EF_RELOAD and PERF_EF_UPDATE flags
+ * in perf_ibs_start()/perf_ibs_stop() and instead always do it.
+ */
+static void perf_ibs_start(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+ u64 period;
+
+ if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+ return;
+
+ WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+ hwc->state = 0;
+
+ perf_ibs_set_period(perf_ibs, hwc, &period);
+ set_bit(IBS_STARTED, pcpu->state);
+ perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
+
+ perf_event_update_userpage(event);
+}
+
+static void perf_ibs_stop(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+ u64 config;
+ int stopping;
+
+ stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
+
+ if (!stopping && (hwc->state & PERF_HES_UPTODATE))
+ return;
+
+ rdmsrl(hwc->config_base, config);
+
+ if (stopping) {
+ set_bit(IBS_STOPPING, pcpu->state);
+ perf_ibs_disable_event(perf_ibs, hwc, config);
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
+
+ if (hwc->state & PERF_HES_UPTODATE)
+ return;
+
+ /*
+ * Clear valid bit to not count rollovers on update, rollovers
+ * are only updated in the irq handler.
+ */
+ config &= ~perf_ibs->valid_mask;
+
+ perf_ibs_event_update(perf_ibs, event, &config);
+ hwc->state |= PERF_HES_UPTODATE;
+}
+
static int perf_ibs_add(struct perf_event *event, int flags)
{
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+ if (test_and_set_bit(IBS_ENABLED, pcpu->state))
+ return -ENOSPC;
+
+ event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ pcpu->event = event;
+
+ if (flags & PERF_EF_START)
+ perf_ibs_start(event, PERF_EF_RELOAD);
+
return 0;
}
static void perf_ibs_del(struct perf_event *event, int flags)
{
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+ if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
+ return;
+
+ perf_ibs_stop(event, PERF_EF_UPDATE);
+
+ pcpu->event = NULL;
+
+ perf_event_update_userpage(event);
}
-static struct pmu perf_ibs = {
- .event_init= perf_ibs_init,
- .add= perf_ibs_add,
- .del= perf_ibs_del,
+static void perf_ibs_read(struct perf_event *event) { }
+
+static struct perf_ibs perf_ibs_fetch = {
+ .pmu = {
+ .task_ctx_nr = perf_invalid_context,
+
+ .event_init = perf_ibs_init,
+ .add = perf_ibs_add,
+ .del = perf_ibs_del,
+ .start = perf_ibs_start,
+ .stop = perf_ibs_stop,
+ .read = perf_ibs_read,
+ },
+ .msr = MSR_AMD64_IBSFETCHCTL,
+ .config_mask = IBS_FETCH_CONFIG_MASK,
+ .cnt_mask = IBS_FETCH_MAX_CNT,
+ .enable_mask = IBS_FETCH_ENABLE,
+ .valid_mask = IBS_FETCH_VAL,
+ .max_period = IBS_FETCH_MAX_CNT << 4,
+ .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK },
+ .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT,
+
+ .get_count = get_ibs_fetch_count,
};
+static struct perf_ibs perf_ibs_op = {
+ .pmu = {
+ .task_ctx_nr = perf_invalid_context,
+
+ .event_init = perf_ibs_init,
+ .add = perf_ibs_add,
+ .del = perf_ibs_del,
+ .start = perf_ibs_start,
+ .stop = perf_ibs_stop,
+ .read = perf_ibs_read,
+ },
+ .msr = MSR_AMD64_IBSOPCTL,
+ .config_mask = IBS_OP_CONFIG_MASK,
+ .cnt_mask = IBS_OP_MAX_CNT,
+ .enable_mask = IBS_OP_ENABLE,
+ .valid_mask = IBS_OP_VAL,
+ .max_period = IBS_OP_MAX_CNT << 4,
+ .offset_mask = { MSR_AMD64_IBSOP_REG_MASK },
+ .offset_max = MSR_AMD64_IBSOP_REG_COUNT,
+
+ .get_count = get_ibs_op_count,
+};
+
+static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
+{
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+ struct perf_event *event = pcpu->event;
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_sample_data data;
+ struct perf_raw_record raw;
+ struct pt_regs regs;
+ struct perf_ibs_data ibs_data;
+ int offset, size, check_rip, offset_max, throttle = 0;
+ unsigned int msr;
+ u64 *buf, *config, period;
+
+ if (!test_bit(IBS_STARTED, pcpu->state)) {
+ /*
+ * Catch spurious interrupts after stopping IBS: After
+ * disabling IBS there could still be incoming NMIs
+ * with samples that even have the valid bit cleared.
+ * Mark all these NMIs as handled.
+ */
+ return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
+ }
+
+ msr = hwc->config_base;
+ buf = ibs_data.regs;
+ rdmsrl(msr, *buf);
+ if (!(*buf++ & perf_ibs->valid_mask))
+ return 0;
+
+ config = &ibs_data.regs[0];
+ perf_ibs_event_update(perf_ibs, event, config);
+ perf_sample_data_init(&data, 0, hwc->last_period);
+ if (!perf_ibs_set_period(perf_ibs, hwc, &period))
+ goto out; /* no sw counter overflow */
+
+ ibs_data.caps = ibs_caps;
+ size = 1;
+ offset = 1;
+ check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
+ if (event->attr.sample_type & PERF_SAMPLE_RAW)
+ offset_max = perf_ibs->offset_max;
+ else if (check_rip)
+ offset_max = 2;
+ else
+ offset_max = 1;
+ do {
+ rdmsrl(msr + offset, *buf++);
+ size++;
+ offset = find_next_bit(perf_ibs->offset_mask,
+ perf_ibs->offset_max,
+ offset + 1);
+ } while (offset < offset_max);
+ ibs_data.size = sizeof(u64) * size;
+
+ regs = *iregs;
+ if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
+ regs.flags &= ~PERF_EFLAGS_EXACT;
+ } else {
+ instruction_pointer_set(&regs, ibs_data.regs[1]);
+ regs.flags |= PERF_EFLAGS_EXACT;
+ }
+
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ raw.size = sizeof(u32) + ibs_data.size;
+ raw.data = ibs_data.data;
+ data.raw = &raw;
+ }
+
+ throttle = perf_event_overflow(event, &data, &regs);
+out:
+ if (throttle)
+ perf_ibs_disable_event(perf_ibs, hwc, *config);
+ else
+ perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
+
+ perf_event_update_userpage(event);
+
+ return 1;
+}
+
+static int __kprobes
+perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+ int handled = 0;
+
+ handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
+ handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
+
+ if (handled)
+ inc_irq_stat(apic_perf_irqs);
+
+ return handled;
+}
+
+static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
+{
+ struct cpu_perf_ibs __percpu *pcpu;
+ int ret;
+
+ pcpu = alloc_percpu(struct cpu_perf_ibs);
+ if (!pcpu)
+ return -ENOMEM;
+
+ perf_ibs->pcpu = pcpu;
+
+ ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
+ if (ret) {
+ perf_ibs->pcpu = NULL;
+ free_percpu(pcpu);
+ }
+
+ return ret;
+}
+
static __init int perf_event_ibs_init(void)
{
if (!ibs_caps)
return -ENODEV; /* ibs not supported by the cpu */
- perf_pmu_register(&perf_ibs, "ibs", -1);
+ perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
+ if (ibs_caps & IBS_CAPS_OPCNT)
+ perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
+ perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
+ register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
return 0;
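
perf_ibs_init() above rounds the requested sample period down to a multiple of 16 because the IBS max-count MSR fields cannot encode the low four bits, and the hardware field stores the count pre-shifted right by 4 (note the period >> 4 in perf_ibs_start()). A small sketch of that rounding and packing, outside the patch:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t sample_period = 0x1234d;	/* user-requested period */

	/* The low 4 bits of the IBS max-count fields do not exist, so
	 * round the period down, with 0x10 as the hardware minimum. */
	sample_period &= ~0xFULL;
	if (!sample_period)
		sample_period = 0x10;

	/* The MSR field stores the count pre-shifted right by 4. */
	uint64_t max_cnt = sample_period >> 4;

	printf("period=%#llx max_cnt=%#llx\n",
	       (unsigned long long)sample_period,
	       (unsigned long long)max_cnt);	/* period=0x12340 max_cnt=0x1234 */
	return 0;
}
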
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 26b3e2fef104..187c294bc658 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1027,8 +1027,6 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
u64 status;
int handled;
- perf_sample_data_init(&data, 0);
-
cpuc = &__get_cpu_var(cpu_hw_events);
/*
@@ -1082,7 +1080,7 @@ again:
if (!intel_pmu_save_and_restart(event))
continue;
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
if (has_branch_stack(event))
data.br_stack = &cpuc->lbr_stack;
@@ -1121,27 +1119,33 @@ intel_bts_constraints(struct perf_event *event)
return NULL;
}
-static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
+static int intel_alt_er(int idx)
{
if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
- return false;
+ return idx;
- if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) {
- event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
- event->hw.config |= 0x01bb;
- event->hw.extra_reg.idx = EXTRA_REG_RSP_1;
- event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
- } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) {
+ if (idx == EXTRA_REG_RSP_0)
+ return EXTRA_REG_RSP_1;
+
+ if (idx == EXTRA_REG_RSP_1)
+ return EXTRA_REG_RSP_0;
+
+ return idx;
+}
+
+static void intel_fixup_er(struct perf_event *event, int idx)
+{
+ event->hw.extra_reg.idx = idx;
+
+ if (idx == EXTRA_REG_RSP_0) {
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
event->hw.config |= 0x01b7;
- event->hw.extra_reg.idx = EXTRA_REG_RSP_0;
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
+ } else if (idx == EXTRA_REG_RSP_1) {
+ event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+ event->hw.config |= 0x01bb;
+ event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
}
-
- if (event->hw.extra_reg.idx == orig_idx)
- return false;
-
- return true;
}
/*
@@ -1159,14 +1163,18 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
struct event_constraint *c = &emptyconstraint;
struct er_account *era;
unsigned long flags;
- int orig_idx = reg->idx;
+ int idx = reg->idx;
- /* already allocated shared msr */
- if (reg->alloc)
+ /*
+ * reg->alloc can be set due to existing state, so for a fake cpuc we
+ * need to ignore this, otherwise we might fail to allocate proper fake
+ * state for this extra reg constraint. Also see the comment below.
+ */
+ if (reg->alloc && !cpuc->is_fake)
return NULL; /* call x86_get_event_constraint() */
again:
- era = &cpuc->shared_regs->regs[reg->idx];
+ era = &cpuc->shared_regs->regs[idx];
/*
* we use spin_lock_irqsave() to avoid lockdep issues when
* passing a fake cpuc
@@ -1175,6 +1183,29 @@ again:
if (!atomic_read(&era->ref) || era->config == reg->config) {
+ /*
+ * If it's a fake cpuc -- as per validate_{group,event}() we
+ * shouldn't touch event state and we can avoid doing so
+ * since both will only call get_event_constraints() once
+ * on each event, this avoids the need for reg->alloc.
+ *
+ * Not doing the ER fixup will only result in era->reg being
+ * wrong, but since we won't actually try and program hardware
+ * this isn't a problem either.
+ */
+ if (!cpuc->is_fake) {
+ if (idx != reg->idx)
+ intel_fixup_er(event, idx);
+
+ /*
+ * x86_schedule_events() can call get_event_constraints()
+ * multiple times on events in the case of incremental
+ * scheduling. reg->alloc ensures we only do the ER
+ * allocation once.
+ */
+ reg->alloc = 1;
+ }
+
/* lock in msr value */
era->config = reg->config;
era->reg = reg->reg;
@@ -1182,17 +1213,17 @@ again:
/* one more user */
atomic_inc(&era->ref);
- /* no need to reallocate during incremental event scheduling */
- reg->alloc = 1;
-
/*
* need to call x86_get_event_constraint()
* to check if associated event has constraints
*/
c = NULL;
- } else if (intel_try_alt_er(event, orig_idx)) {
- raw_spin_unlock_irqrestore(&era->lock, flags);
- goto again;
+ } else {
+ idx = intel_alt_er(idx);
+ if (idx != reg->idx) {
+ raw_spin_unlock_irqrestore(&era->lock, flags);
+ goto again;
+ }
}
raw_spin_unlock_irqrestore(&era->lock, flags);
@@ -1206,11 +1237,14 @@ __intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
struct er_account *era;
/*
- * only put constraint if extra reg was actually
- * allocated. Also takes care of event which do
- * not use an extra shared reg
+ * Only put constraint if extra reg was actually allocated. Also takes
+ * care of events which do not use an extra shared reg.
+ *
+ * Also, if this is a fake cpuc we shouldn't touch any event state
+ * (reg->alloc) and we don't care about leaving inconsistent cpuc state
+ * either since it'll be thrown out.
*/
- if (!reg->alloc)
+ if (!reg->alloc || cpuc->is_fake)
return;
era = &cpuc->shared_regs->regs[reg->idx];
@@ -1302,15 +1336,9 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
intel_put_shared_regs_event_constraints(cpuc, event);
}
-static int intel_pmu_hw_config(struct perf_event *event)
+static void intel_pebs_aliases_core2(struct perf_event *event)
{
- int ret = x86_pmu_hw_config(event);
-
- if (ret)
- return ret;
-
- if (event->attr.precise_ip &&
- (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
/*
* Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
* (0x003c) so that we can use it with PEBS.
@@ -1331,10 +1359,48 @@ static int intel_pmu_hw_config(struct perf_event *event)
*/
u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
+ alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+ event->hw.config = alt_config;
+ }
+}
+
+static void intel_pebs_aliases_snb(struct perf_event *event)
+{
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ /*
+ * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+ * (0x003c) so that we can use it with PEBS.
+ *
+ * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+ * PEBS capable. However we can use UOPS_RETIRED.ALL
+ * (0x01c2), which is a PEBS capable event, to get the same
+ * count.
+ *
+ * UOPS_RETIRED.ALL counts the number of cycles that retires
+ * CNTMASK micro-ops. By setting CNTMASK to a value (16)
+ * larger than the maximum number of micro-ops that can be
+ * retired per cycle (4) and then inverting the condition, we
+ * count all cycles that retire 16 or less micro-ops, which
+ * is every cycle.
+ *
+ * Thereby we gain a PEBS capable cycle counter.
+ */
+ u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
event->hw.config = alt_config;
}
+}
+
+static int intel_pmu_hw_config(struct perf_event *event)
+{
+ int ret = x86_pmu_hw_config(event);
+
+ if (ret)
+ return ret;
+
+ if (event->attr.precise_ip && x86_pmu.pebs_aliases)
+ x86_pmu.pebs_aliases(event);
if (intel_pmu_needs_lbr_smpl(event)) {
ret = intel_pmu_setup_lbr_filter(event);
@@ -1609,6 +1675,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.max_period = (1ULL << 31) - 1,
.get_event_constraints = intel_get_event_constraints,
.put_event_constraints = intel_put_event_constraints,
+ .pebs_aliases = intel_pebs_aliases_core2,
.format_attrs = intel_arch3_formats_attr,
@@ -1842,8 +1909,9 @@ __init int intel_pmu_init(void)
break;
case 42: /* SandyBridge */
- x86_add_quirk(intel_sandybridge_quirk);
case 45: /* SandyBridge, "Romley-EP" */
+ x86_add_quirk(intel_sandybridge_quirk);
+ case 58: /* IvyBridge */
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -1851,6 +1919,7 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_snb_event_constraints;
x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
x86_pmu.extra_regs = intel_snb_extra_regs;
/* all extra regs are per-cpu when HT is on */
x86_pmu.er_flags |= ERF_HAS_RSP_1;
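
intel_pebs_aliases_snb() above hand-builds the UOPS_RETIRED.ALL alias: event 0xc2, umask 0x01, INV set, CMASK 16. Using the architectural PERFEVTSEL layout (event select in bits 7:0, umask in 15:8, INV at bit 23, CMASK in 31:24), the resulting constant can be computed directly; a sketch of the encoding, not the kernel's X86_CONFIG macro:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Architectural PERFEVTSEL fields (Intel SDM):
	 * event select 7:0, unit mask 15:8, INV bit 23, counter mask 31:24. */
	uint64_t event = 0xc2, umask = 0x01, inv = 1, cmask = 16;
	uint64_t alt_config = event | (umask << 8) | (inv << 23) | (cmask << 24);

	/* CMASK=16 with INV counts cycles retiring fewer than 16 uops,
	 * i.e. every cycle, yielding a PEBS-capable cycle counter. */
	printf("alt_config = %#llx\n", (unsigned long long)alt_config);	/* 0x108001c2 */
	return 0;
}
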
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 7f64df19e7dd..35e2192df9f4 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -316,8 +316,7 @@ int intel_pmu_drain_bts_buffer(void)
ds->bts_index = ds->bts_buffer_base;
- perf_sample_data_init(&data, 0);
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
regs.ip = 0;
/*
@@ -401,14 +400,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {
INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */
- INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */
- INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */
- INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */
+ INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
@@ -564,8 +556,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
if (!intel_pmu_save_and_restart(event))
return;
- perf_sample_data_init(&data, 0);
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
/*
* We use the interrupt regs as a base because the PEBS record
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index a2dfacfd7103..47124a73dd73 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1005,8 +1005,6 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
int idx, handled = 0;
u64 val;
- perf_sample_data_init(&data, 0);
-
cpuc = &__get_cpu_var(cpu_hw_events);
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
@@ -1034,10 +1032,12 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
handled += overflow;
/* event overflow for sure */
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, hwc->last_period);
if (!x86_perf_event_set_period(event))
continue;
+
+
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index addf9e82a7f2..ee8e9abc859f 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -31,7 +31,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
const struct cpuid_bit *cb;
static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
- { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 },
+ { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 },
{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
{ X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
{ X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 1b81839b6c88..571246d81edf 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -271,7 +271,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
return 1;
- show_registers(regs);
+ show_regs(regs);
#ifdef CONFIG_X86_32
if (user_mode_vm(regs)) {
sp = regs->sp;
@@ -311,16 +311,33 @@ void die(const char *str, struct pt_regs *regs, long err)
static int __init kstack_setup(char *s)
{
+ ssize_t ret;
+ unsigned long val;
+
if (!s)
return -EINVAL;
- kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+
+ ret = kstrtoul(s, 0, &val);
+ if (ret)
+ return ret;
+ kstack_depth_to_print = val;
return 0;
}
early_param("kstack", kstack_setup);
static int __init code_bytes_setup(char *s)
{
- code_bytes = simple_strtoul(s, NULL, 0);
+ ssize_t ret;
+ unsigned long val;
+
+ if (!s)
+ return -EINVAL;
+
+ ret = kstrtoul(s, 0, &val);
+ if (ret)
+ return ret;
+
+ code_bytes = val;
if (code_bytes > 8192)
code_bytes = 8192;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 88ec9129271d..e0b1d783daab 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -82,7 +82,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
}
-void show_registers(struct pt_regs *regs)
+void show_regs(struct pt_regs *regs)
{
int i;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 17107bd6e1f0..791b76122aa8 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -245,7 +245,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
-void show_registers(struct pt_regs *regs)
+void show_regs(struct pt_regs *regs)
{
int i;
unsigned long sp;
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 62d61e9976eb..41857970517f 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -113,7 +113,9 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
int x = e820x->nr_map;
if (x >= ARRAY_SIZE(e820x->map)) {
- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+ printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n",
+ (unsigned long long) start,
+ (unsigned long long) (start + size - 1));
return;
}
@@ -133,19 +135,19 @@ static void __init e820_print_type(u32 type)
switch (type) {
case E820_RAM:
case E820_RESERVED_KERN:
- printk(KERN_CONT "(usable)");
+ printk(KERN_CONT "usable");
break;
case E820_RESERVED:
- printk(KERN_CONT "(reserved)");
+ printk(KERN_CONT "reserved");
break;
case E820_ACPI:
- printk(KERN_CONT "(ACPI data)");
+ printk(KERN_CONT "ACPI data");
break;
case E820_NVS:
- printk(KERN_CONT "(ACPI NVS)");
+ printk(KERN_CONT "ACPI NVS");
break;
case E820_UNUSABLE:
- printk(KERN_CONT "(unusable)");
+ printk(KERN_CONT "unusable");
break;
default:
printk(KERN_CONT "type %u", type);
@@ -158,10 +160,10 @@ void __init e820_print_map(char *who)
int i;
for (i = 0; i < e820.nr_map; i++) {
- printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
+ printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who,
(unsigned long long) e820.map[i].addr,
(unsigned long long)
- (e820.map[i].addr + e820.map[i].size));
+ (e820.map[i].addr + e820.map[i].size - 1));
e820_print_type(e820.map[i].type);
printk(KERN_CONT "\n");
}
@@ -428,9 +430,8 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
size = ULLONG_MAX - start;
end = start + size;
- printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
- (unsigned long long) start,
- (unsigned long long) end);
+ printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ",
+ (unsigned long long) start, (unsigned long long) (end - 1));
e820_print_type(old_type);
printk(KERN_CONT " ==> ");
e820_print_type(new_type);
@@ -509,9 +510,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
size = ULLONG_MAX - start;
end = start + size;
- printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
- (unsigned long long) start,
- (unsigned long long) end);
+ printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ",
+ (unsigned long long) start, (unsigned long long) (end - 1));
if (checktype)
e820_print_type(old_type);
printk(KERN_CONT "\n");
@@ -567,7 +567,7 @@ void __init update_e820(void)
if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
return;
e820.nr_map = nr_map;
- printk(KERN_INFO "modified physical RAM map:\n");
+ printk(KERN_INFO "e820: modified physical RAM map:\n");
e820_print_map("modified");
}
static void __init update_e820_saved(void)
@@ -637,8 +637,8 @@ __init void e820_setup_gap(void)
if (!found) {
gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
printk(KERN_ERR
- "PCI: Warning: Cannot find a gap in the 32bit address range\n"
- "PCI: Unassigned devices with 32bit resource registers may break!\n");
+ "e820: cannot find a gap in the 32bit address range\n"
+ "e820: PCI devices with unassigned 32bit BARs may break!\n");
}
#endif
@@ -648,8 +648,8 @@ __init void e820_setup_gap(void)
pci_mem_start = gapstart;
printk(KERN_INFO
- "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
- pci_mem_start, gapstart, gapsize);
+ "e820: [mem %#010lx-%#010lx] available for PCI devices\n",
+ gapstart, gapstart + gapsize - 1);
}
/**
@@ -667,7 +667,7 @@ void __init parse_e820_ext(struct setup_data *sdata)
extmap = (struct e820entry *)(sdata->data);
__append_e820_map(extmap, entries);
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
- printk(KERN_INFO "extended physical RAM map:\n");
+ printk(KERN_INFO "e820: extended physical RAM map:\n");
e820_print_map("extended");
}
@@ -734,7 +734,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
if (addr) {
e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED);
- printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
+ printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n");
update_e820_saved();
}
@@ -784,7 +784,7 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
if (last_pfn > max_arch_pfn)
last_pfn = max_arch_pfn;
- printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
+ printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
last_pfn, max_arch_pfn);
return last_pfn;
}
@@ -888,7 +888,7 @@ void __init finish_e820_parsing(void)
early_panic("Invalid user supplied memory map");
e820.nr_map = nr;
- printk(KERN_INFO "user-defined physical RAM map:\n");
+ printk(KERN_INFO "e820: user-defined physical RAM map:\n");
e820_print_map("user");
}
}
@@ -996,8 +996,9 @@ void __init e820_reserve_resources_late(void)
end = MAX_RESOURCE_SIZE;
if (start >= end)
continue;
- printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
- start, end);
+ printk(KERN_DEBUG
+ "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n",
+ start, end);
reserve_region_with_split(&iomem_resource, start, end,
"RAM buffer");
}
@@ -1047,7 +1048,7 @@ void __init setup_memory_map(void)
who = x86_init.resources.memory_setup();
memcpy(&e820_saved, &e820, sizeof(struct e820map));
- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+ printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
e820_print_map(who);
}
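A note on the conversion above: every e820 message now prints an inclusive end address (start + size - 1) in resource-style "[mem %#010Lx-%#010Lx]" notation, and carries an "e820:" prefix so the lines can be filtered in dmesg. A minimal userspace sketch of the old and new formats (illustration only, not part of the patch):

#include <stdio.h>

int main(void)
{
	unsigned long long start = 0x100000, size = 0x7ff00000;

	/* old style: exclusive end, no subsystem prefix */
	printf("e820 update range: %016llx - %016llx\n", start, start + size);
	/* new style: inclusive end, "e820:" prefix */
	printf("e820: update [mem %#010llx-%#010llx]\n",
	       start, start + size - 1);
	return 0;
}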
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 7b784f4ef1e4..623f28837476 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -56,6 +56,7 @@
#include <asm/irq_vectors.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
+#include <asm/asm.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
@@ -151,10 +152,8 @@
.pushsection .fixup, "ax"
99: movl $0, (%esp)
jmp 98b
-.section __ex_table, "a"
- .align 4
- .long 98b, 99b
.popsection
+ _ASM_EXTABLE(98b,99b)
.endm
.macro PTGS_TO_GS
@@ -164,10 +163,8 @@
.pushsection .fixup, "ax"
99: movl $0, PT_GS(%esp)
jmp 98b
-.section __ex_table, "a"
- .align 4
- .long 98b, 99b
.popsection
+ _ASM_EXTABLE(98b,99b)
.endm
.macro GS_TO_REG reg
@@ -249,12 +246,10 @@
jmp 2b
6: movl $0, (%esp)
jmp 3b
-.section __ex_table, "a"
- .align 4
- .long 1b, 4b
- .long 2b, 5b
- .long 3b, 6b
.popsection
+ _ASM_EXTABLE(1b,4b)
+ _ASM_EXTABLE(2b,5b)
+ _ASM_EXTABLE(3b,6b)
POP_GS_EX
.endm
@@ -321,7 +316,6 @@ ret_from_exception:
preempt_stop(CLBR_ANY)
ret_from_intr:
GET_THREAD_INFO(%ebp)
-resume_userspace_sig:
#ifdef CONFIG_VM86
movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
movb PT_CS(%esp), %al
@@ -415,10 +409,7 @@ sysenter_past_esp:
jae syscall_fault
1: movl (%ebp),%ebp
movl %ebp,PT_EBP(%esp)
-.section __ex_table,"a"
- .align 4
- .long 1b,syscall_fault
-.previous
+ _ASM_EXTABLE(1b,syscall_fault)
GET_THREAD_INFO(%ebp)
@@ -485,10 +476,8 @@ sysexit_audit:
.pushsection .fixup,"ax"
2: movl $0,PT_FS(%esp)
jmp 1b
-.section __ex_table,"a"
- .align 4
- .long 1b,2b
.popsection
+ _ASM_EXTABLE(1b,2b)
PTGS_TO_GS_EX
ENDPROC(ia32_sysenter_target)
@@ -543,10 +532,7 @@ ENTRY(iret_exc)
pushl $do_iret_error
jmp error_code
.previous
-.section __ex_table,"a"
- .align 4
- .long irq_return,iret_exc
-.previous
+ _ASM_EXTABLE(irq_return,iret_exc)
CFI_RESTORE_STATE
ldt_ss:
@@ -628,9 +614,13 @@ work_notifysig: # deal with pending signals and
# vm86-space
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
+ movb PT_CS(%esp), %bl
+ andb $SEGMENT_RPL_MASK, %bl
+ cmpb $USER_RPL, %bl
+ jb resume_kernel
xorl %edx, %edx
call do_notify_resume
- jmp resume_userspace_sig
+ jmp resume_userspace
ALIGN
work_notifysig_v86:
@@ -643,9 +633,13 @@ work_notifysig_v86:
#endif
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
+ movb PT_CS(%esp), %bl
+ andb $SEGMENT_RPL_MASK, %bl
+ cmpb $USER_RPL, %bl
+ jb resume_kernel
xorl %edx, %edx
call do_notify_resume
- jmp resume_userspace_sig
+ jmp resume_userspace
END(work_pending)
# perform syscall exit tracing
@@ -901,10 +895,7 @@ END(device_not_available)
#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
iret
-.section __ex_table,"a"
- .align 4
- .long native_iret, iret_exc
-.previous
+ _ASM_EXTABLE(native_iret, iret_exc)
END(native_iret)
ENTRY(native_irq_enable_sysexit)
@@ -1093,13 +1084,10 @@ ENTRY(xen_failsafe_callback)
movl %eax,16(%esp)
jmp 4b
.previous
-.section __ex_table,"a"
- .align 4
- .long 1b,6b
- .long 2b,7b
- .long 3b,8b
- .long 4b,9b
-.previous
+ _ASM_EXTABLE(1b,6b)
+ _ASM_EXTABLE(2b,7b)
+ _ASM_EXTABLE(3b,8b)
+ _ASM_EXTABLE(4b,9b)
ENDPROC(xen_failsafe_callback)
BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
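Each open-coded ".section __ex_table" block removed above emitted one exception-table entry pairing a possibly-faulting instruction with its fixup address; _ASM_EXTABLE(from, to) generates the same entry while keeping the table format in one place (asm/asm.h), so the layout can change without touching every call site. A conceptual C sketch of what such an entry pairs together and how a fault handler consumes it (the field layout here is illustrative, not the kernel's actual format):

struct extable_entry {
	unsigned long insn;	/* address that may fault */
	unsigned long fixup;	/* where to resume if it does */
};

/* Sketch of the lookup a trap handler performs on a kernel-mode fault. */
static unsigned long fixup_ip(const struct extable_entry *tbl, int n,
			      unsigned long faulting_ip)
{
	int i;

	for (i = 0; i < n; i++)
		if (tbl[i].insn == faulting_ip)
			return tbl[i].fixup;	/* recover via the fixup */
	return 0;				/* no entry: a real oops */
}

int main(void)
{
	struct extable_entry tbl[] = { { 0x1000, 0x2000 } };

	return fixup_ip(tbl, 1, 0x1000) == 0x2000 ? 0 : 1;
}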
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index cdc79b5cfcd9..7d65133b51be 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -55,6 +55,7 @@
#include <asm/paravirt.h>
#include <asm/ftrace.h>
#include <asm/percpu.h>
+#include <asm/asm.h>
#include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
@@ -190,6 +191,44 @@ ENDPROC(native_usergs_sysret64)
.endm
/*
+ * When dynamic function tracer is enabled it will add a breakpoint
+ * to all locations that it is about to modify, sync CPUs, update
+ * all the code, sync CPUs, then remove the breakpoints. During this
+ * window, if lockdep is enabled, TRACE_IRQS_ON/OFF can jump back into
+ * the debug handler outside of the IST protection being updated.
+ *
+ * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
+ * make sure the stack pointer does not get reset back to the top
+ * of the debug stack, and instead just reuses the current stack.
+ */
+#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
+
+.macro TRACE_IRQS_OFF_DEBUG
+ call debug_stack_set_zero
+ TRACE_IRQS_OFF
+ call debug_stack_reset
+.endm
+
+.macro TRACE_IRQS_ON_DEBUG
+ call debug_stack_set_zero
+ TRACE_IRQS_ON
+ call debug_stack_reset
+.endm
+
+.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET
+ bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
+ jnc 1f
+ TRACE_IRQS_ON_DEBUG
+1:
+.endm
+
+#else
+# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
+# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
+# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
+#endif
+
+/*
* C code is not supposed to know about undefined top of stack. Every time
* a C function with an pt_regs argument is called from the SYSCALL based
* fast path FIXUP_TOP_OF_STACK is needed.
@@ -900,18 +939,12 @@ restore_args:
irq_return:
INTERRUPT_RETURN
-
- .section __ex_table, "a"
- .quad irq_return, bad_iret
- .previous
+ _ASM_EXTABLE(irq_return, bad_iret)
#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
iretq
-
- .section __ex_table,"a"
- .quad native_iret, bad_iret
- .previous
+ _ASM_EXTABLE(native_iret, bad_iret)
#endif
.section .fixup,"ax"
@@ -1103,7 +1136,7 @@ ENTRY(\sym)
subq $ORIG_RAX-R15, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call save_paranoid
- TRACE_IRQS_OFF
+ TRACE_IRQS_OFF_DEBUG
movq %rsp,%rdi /* pt_regs pointer */
xorl %esi,%esi /* no error code */
subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
@@ -1181,10 +1214,7 @@ gs_change:
CFI_ENDPROC
END(native_load_gs_index)
- .section __ex_table,"a"
- .align 8
- .quad gs_change,bad_gs
- .previous
+ _ASM_EXTABLE(gs_change,bad_gs)
.section .fixup,"ax"
/* running with kernelgs */
bad_gs:
@@ -1401,7 +1431,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
ENTRY(paranoid_exit)
DEFAULT_FRAME
DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
+ TRACE_IRQS_OFF_DEBUG
testl %ebx,%ebx /* swapgs needed? */
jnz paranoid_restore
testl $3,CS(%rsp)
@@ -1412,7 +1442,7 @@ paranoid_swapgs:
RESTORE_ALL 8
jmp irq_return
paranoid_restore:
- TRACE_IRQS_IRETQ 0
+ TRACE_IRQS_IRETQ_DEBUG 0
RESTORE_ALL 8
jmp irq_return
paranoid_userspace:
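The TRACE_IRQS_*_DEBUG wrappers above bracket lockdep's irq-flags hooks with debug_stack_set_zero()/debug_stack_reset() so that a #DB taken while ftrace is rewriting text does not reset the stack pointer to the top of the IST debug stack. The same pattern as a C-shaped sketch with stubbed helpers (illustration only; the real helpers adjust the IST entry for the debug vector):

static void debug_stack_set_zero(void) { /* #DB keeps the current stack */ }
static void debug_stack_reset(void)    { /* restore the normal #DB IST */ }
static void trace_irqs_off(void)       { /* lockdep hook; may trip a #DB */ }

static void trace_irqs_off_debug(void)
{
	debug_stack_set_zero();
	trace_irqs_off();
	debug_stack_reset();
}

int main(void)
{
	trace_irqs_off_debug();
	return 0;
}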
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index c9a281f272fd..c3a7cb4bf6e6 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -24,40 +24,21 @@
#include <trace/syscall.h>
#include <asm/cacheflush.h>
+#include <asm/kprobes.h>
#include <asm/ftrace.h>
#include <asm/nops.h>
-#include <asm/nmi.h>
-
#ifdef CONFIG_DYNAMIC_FTRACE
-/*
- * modifying_code is set to notify NMIs that they need to use
- * memory barriers when entering or exiting. But we don't want
- * to burden NMIs with unnecessary memory barriers when code
- * modification is not being done (which is most of the time).
- *
- * A mutex is already held when ftrace_arch_code_modify_prepare
- * and post_process are called. No locks need to be taken here.
- *
- * Stop machine will make sure currently running NMIs are done
- * and new NMIs will see the updated variable before we need
- * to worry about NMIs doing memory barriers.
- */
-static int modifying_code __read_mostly;
-static DEFINE_PER_CPU(int, save_modifying_code);
-
int ftrace_arch_code_modify_prepare(void)
{
set_kernel_text_rw();
set_all_modules_text_rw();
- modifying_code = 1;
return 0;
}
int ftrace_arch_code_modify_post_process(void)
{
- modifying_code = 0;
set_all_modules_text_ro();
set_kernel_text_ro();
return 0;
@@ -90,134 +71,6 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
return calc.code;
}
-/*
- * Modifying code must take extra care. On an SMP machine, if
- * the code being modified is also being executed on another CPU
- * that CPU will have undefined results and possibly take a GPF.
- * We use kstop_machine to stop other CPUS from exectuing code.
- * But this does not stop NMIs from happening. We still need
- * to protect against that. We separate out the modification of
- * the code to take care of this.
- *
- * Two buffers are added: An IP buffer and a "code" buffer.
- *
- * 1) Put the instruction pointer into the IP buffer
- * and the new code into the "code" buffer.
- * 2) Wait for any running NMIs to finish and set a flag that says
- * we are modifying code, it is done in an atomic operation.
- * 3) Write the code
- * 4) clear the flag.
- * 5) Wait for any running NMIs to finish.
- *
- * If an NMI is executed, the first thing it does is to call
- * "ftrace_nmi_enter". This will check if the flag is set to write
- * and if it is, it will write what is in the IP and "code" buffers.
- *
- * The trick is, it does not matter if everyone is writing the same
- * content to the code location. Also, if a CPU is executing code
- * it is OK to write to that code location if the contents being written
- * are the same as what exists.
- */
-
-#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */
-static atomic_t nmi_running = ATOMIC_INIT(0);
-static int mod_code_status; /* holds return value of text write */
-static void *mod_code_ip; /* holds the IP to write to */
-static const void *mod_code_newcode; /* holds the text to write to the IP */
-
-static unsigned nmi_wait_count;
-static atomic_t nmi_update_count = ATOMIC_INIT(0);
-
-int ftrace_arch_read_dyn_info(char *buf, int size)
-{
- int r;
-
- r = snprintf(buf, size, "%u %u",
- nmi_wait_count,
- atomic_read(&nmi_update_count));
- return r;
-}
-
-static void clear_mod_flag(void)
-{
- int old = atomic_read(&nmi_running);
-
- for (;;) {
- int new = old & ~MOD_CODE_WRITE_FLAG;
-
- if (old == new)
- break;
-
- old = atomic_cmpxchg(&nmi_running, old, new);
- }
-}
-
-static void ftrace_mod_code(void)
-{
- /*
- * Yes, more than one CPU process can be writing to mod_code_status.
- * (and the code itself)
- * But if one were to fail, then they all should, and if one were
- * to succeed, then they all should.
- */
- mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
- MCOUNT_INSN_SIZE);
-
- /* if we fail, then kill any new writers */
- if (mod_code_status)
- clear_mod_flag();
-}
-
-void ftrace_nmi_enter(void)
-{
- __this_cpu_write(save_modifying_code, modifying_code);
-
- if (!__this_cpu_read(save_modifying_code))
- return;
-
- if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
- smp_rmb();
- ftrace_mod_code();
- atomic_inc(&nmi_update_count);
- }
- /* Must have previous changes seen before executions */
- smp_mb();
-}
-
-void ftrace_nmi_exit(void)
-{
- if (!__this_cpu_read(save_modifying_code))
- return;
-
- /* Finish all executions before clearing nmi_running */
- smp_mb();
- atomic_dec(&nmi_running);
-}
-
-static void wait_for_nmi_and_set_mod_flag(void)
-{
- if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG))
- return;
-
- do {
- cpu_relax();
- } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG));
-
- nmi_wait_count++;
-}
-
-static void wait_for_nmi(void)
-{
- if (!atomic_read(&nmi_running))
- return;
-
- do {
- cpu_relax();
- } while (atomic_read(&nmi_running));
-
- nmi_wait_count++;
-}
-
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
@@ -238,26 +91,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)
if (within(ip, (unsigned long)_text, (unsigned long)_etext))
ip = (unsigned long)__va(__pa(ip));
- mod_code_ip = (void *)ip;
- mod_code_newcode = new_code;
-
- /* The buffers need to be visible before we let NMIs write them */
- smp_mb();
-
- wait_for_nmi_and_set_mod_flag();
-
- /* Make sure all running NMIs have finished before we write the code */
- smp_mb();
-
- ftrace_mod_code();
-
- /* Make sure the write happens before clearing the bit */
- smp_mb();
-
- clear_mod_flag();
- wait_for_nmi();
-
- return mod_code_status;
+ return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE);
}
static const unsigned char *ftrace_nop_replace(void)
@@ -266,7 +100,7 @@ static const unsigned char *ftrace_nop_replace(void)
}
static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
unsigned const char *new_code)
{
unsigned char replaced[MCOUNT_INSN_SIZE];
@@ -307,7 +141,20 @@ int ftrace_make_nop(struct module *mod,
old = ftrace_call_replace(ip, addr);
new = ftrace_nop_replace();
- return ftrace_modify_code(rec->ip, old, new);
+ /*
+ * On boot up, and when modules are loaded, the MCOUNT_ADDR
+ * is converted to a nop, and will never become MCOUNT_ADDR
+ * again. This code is either running before SMP (on boot up)
+ * or before the code will ever be executed (module load).
+ * We do not want to use the breakpoint version in this case,
+ * just modify the code directly.
+ */
+ if (addr == MCOUNT_ADDR)
+ return ftrace_modify_code_direct(rec->ip, old, new);
+
+ /* Normal cases use add_brk_on_nop */
+ WARN_ONCE(1, "invalid use of ftrace_make_nop");
+ return -EINVAL;
}
int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
@@ -318,9 +165,47 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
old = ftrace_nop_replace();
new = ftrace_call_replace(ip, addr);
- return ftrace_modify_code(rec->ip, old, new);
+ /* Should only be called when module is loaded */
+ return ftrace_modify_code_direct(rec->ip, old, new);
}
+/*
+ * The modifying_ftrace_code is used to tell the breakpoint
+ * handler to call ftrace_int3_handler(). If it fails to
+ * call this handler for a breakpoint added by ftrace, then
+ * the kernel may crash.
+ *
+ * As atomic writes on x86 do not need a barrier, we do not
+ * need to add smp_mb()s for this to work. We also assume the
+ * CPU can not read modifying_ftrace_code before executing the
+ * breakpoint; it would be quite remarkable if it could.
+ * Here's the flow that is required:
+ *
+ * CPU-0 CPU-1
+ *
+ * atomic_inc(mfc);
+ * write int3s
+ * <trap-int3> // implicit (r)mb
+ * if (atomic_read(mfc))
+ * call ftrace_int3_handler()
+ *
+ * Then when we are finished:
+ *
+ * atomic_dec(mfc);
+ *
+ * If we hit a breakpoint that was not set by ftrace, it does not
+ * matter if ftrace_int3_handler() is called or not. It will
+ * simply be ignored. But it is crucial that a ftrace nop/caller
+ * breakpoint is handled. No other user should ever place a
+ * breakpoint on an ftrace nop/caller location. It must only
+ * be done by this code.
+ */
+atomic_t modifying_ftrace_code __read_mostly;
+
+static int
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ unsigned const char *new_code);
+
int ftrace_update_ftrace_func(ftrace_func_t func)
{
unsigned long ip = (unsigned long)(&ftrace_call);
@@ -329,9 +214,376 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
new = ftrace_call_replace(ip, (unsigned long)func);
+
+ /* See comment above by declaration of modifying_ftrace_code */
+ atomic_inc(&modifying_ftrace_code);
+
ret = ftrace_modify_code(ip, old, new);
+ atomic_dec(&modifying_ftrace_code);
+
+ return ret;
+}
+
+/*
+ * A breakpoint was added to the code address we are about to
+ * modify, and this is the handler that will just skip over it.
+ * We are either changing a nop into a trace call, or a trace
+ * call to a nop. While the change is taking place, we treat
+ * it just like it was a nop.
+ */
+int ftrace_int3_handler(struct pt_regs *regs)
+{
+ if (WARN_ON_ONCE(!regs))
+ return 0;
+
+ if (!ftrace_location(regs->ip - 1))
+ return 0;
+
+ regs->ip += MCOUNT_INSN_SIZE - 1;
+
+ return 1;
+}
+
+static int ftrace_write(unsigned long ip, const char *val, int size)
+{
+ /*
+ * On x86_64, kernel text mappings are mapped read-only with
+ * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
+ * of the kernel text mapping to modify the kernel text.
+ *
+ * For 32bit kernels, these mappings are same and we can use
+ * kernel identity mapping to modify code.
+ */
+ if (within(ip, (unsigned long)_text, (unsigned long)_etext))
+ ip = (unsigned long)__va(__pa(ip));
+
+ return probe_kernel_write((void *)ip, val, size);
+}
+
+static int add_break(unsigned long ip, const char *old)
+{
+ unsigned char replaced[MCOUNT_INSN_SIZE];
+ unsigned char brk = BREAKPOINT_INSTRUCTION;
+
+ if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+ return -EFAULT;
+
+ /* Make sure it is what we expect it to be */
+ if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
+ return -EINVAL;
+
+ if (ftrace_write(ip, &brk, 1))
+ return -EPERM;
+
+ return 0;
+}
+
+static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned const char *old;
+ unsigned long ip = rec->ip;
+
+ old = ftrace_call_replace(ip, addr);
+
+ return add_break(rec->ip, old);
+}
+
+
+static int add_brk_on_nop(struct dyn_ftrace *rec)
+{
+ unsigned const char *old;
+
+ old = ftrace_nop_replace();
+
+ return add_break(rec->ip, old);
+}
+
+static int add_breakpoints(struct dyn_ftrace *rec, int enable)
+{
+ unsigned long ftrace_addr;
+ int ret;
+
+ ret = ftrace_test_record(rec, enable);
+
+ ftrace_addr = (unsigned long)FTRACE_ADDR;
+
+ switch (ret) {
+ case FTRACE_UPDATE_IGNORE:
+ return 0;
+
+ case FTRACE_UPDATE_MAKE_CALL:
+ /* converting nop to call */
+ return add_brk_on_nop(rec);
+
+ case FTRACE_UPDATE_MAKE_NOP:
+ /* converting a call to a nop */
+ return add_brk_on_call(rec, ftrace_addr);
+ }
+ return 0;
+}
+
+/*
+ * On error, we need to remove breakpoints. This needs to
+ * be done carefully. If the address does not currently have a
+ * breakpoint, we know we are done. Otherwise, we look at the
+ * remaining 4 bytes of the instruction. If they match a nop,
+ * we replace the breakpoint with the nop. Otherwise we replace
+ * it with the call instruction.
+ */
+static int remove_breakpoint(struct dyn_ftrace *rec)
+{
+ unsigned char ins[MCOUNT_INSN_SIZE];
+ unsigned char brk = BREAKPOINT_INSTRUCTION;
+ const unsigned char *nop;
+ unsigned long ftrace_addr;
+ unsigned long ip = rec->ip;
+
+ /* If we fail the read, just give up */
+ if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
+ return -EFAULT;
+
+ /* If this does not have a breakpoint, we are done */
+ if (ins[0] != brk)
+ return -1;
+
+ nop = ftrace_nop_replace();
+
+ /*
+ * If the last 4 bytes of the instruction do not match
+ * a nop, then we assume that this is a call to ftrace_addr.
+ */
+ if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
+ /*
+ * As an extra sanity check, we verify that the breakpoint is on
+ * a call that would actually jump to the ftrace_addr.
+ * If not, don't touch the breakpoint; we would just create
+ * a disaster.
+ */
+ ftrace_addr = (unsigned long)FTRACE_ADDR;
+ nop = ftrace_call_replace(ip, ftrace_addr);
+
+ if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
+ return -EINVAL;
+ }
+
+ return probe_kernel_write((void *)ip, &nop[0], 1);
+}
+
+static int add_update_code(unsigned long ip, unsigned const char *new)
+{
+ /* skip breakpoint */
+ ip++;
+ new++;
+ if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1))
+ return -EPERM;
+ return 0;
+}
+
+static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned long ip = rec->ip;
+ unsigned const char *new;
+
+ new = ftrace_call_replace(ip, addr);
+ return add_update_code(ip, new);
+}
+
+static int add_update_nop(struct dyn_ftrace *rec)
+{
+ unsigned long ip = rec->ip;
+ unsigned const char *new;
+
+ new = ftrace_nop_replace();
+ return add_update_code(ip, new);
+}
+
+static int add_update(struct dyn_ftrace *rec, int enable)
+{
+ unsigned long ftrace_addr;
+ int ret;
+
+ ret = ftrace_test_record(rec, enable);
+
+ ftrace_addr = (unsigned long)FTRACE_ADDR;
+
+ switch (ret) {
+ case FTRACE_UPDATE_IGNORE:
+ return 0;
+
+ case FTRACE_UPDATE_MAKE_CALL:
+ /* converting nop to call */
+ return add_update_call(rec, ftrace_addr);
+
+ case FTRACE_UPDATE_MAKE_NOP:
+ /* converting a call to a nop */
+ return add_update_nop(rec);
+ }
+
+ return 0;
+}
+
+static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned long ip = rec->ip;
+ unsigned const char *new;
+
+ new = ftrace_call_replace(ip, addr);
+
+ if (ftrace_write(ip, new, 1))
+ return -EPERM;
+
+ return 0;
+}
+
+static int finish_update_nop(struct dyn_ftrace *rec)
+{
+ unsigned long ip = rec->ip;
+ unsigned const char *new;
+
+ new = ftrace_nop_replace();
+
+ if (ftrace_write(ip, new, 1))
+ return -EPERM;
+ return 0;
+}
+
+static int finish_update(struct dyn_ftrace *rec, int enable)
+{
+ unsigned long ftrace_addr;
+ int ret;
+
+ ret = ftrace_update_record(rec, enable);
+
+ ftrace_addr = (unsigned long)FTRACE_ADDR;
+
+ switch (ret) {
+ case FTRACE_UPDATE_IGNORE:
+ return 0;
+
+ case FTRACE_UPDATE_MAKE_CALL:
+ /* converting nop to call */
+ return finish_update_call(rec, ftrace_addr);
+
+ case FTRACE_UPDATE_MAKE_NOP:
+ /* converting a call to a nop */
+ return finish_update_nop(rec);
+ }
+
+ return 0;
+}
+
+static void do_sync_core(void *data)
+{
+ sync_core();
+}
+
+static void run_sync(void)
+{
+ int enable_irqs = irqs_disabled();
+
+ /* We may be called with interrupts disabled (on bootup). */
+ if (enable_irqs)
+ local_irq_enable();
+ on_each_cpu(do_sync_core, NULL, 1);
+ if (enable_irqs)
+ local_irq_disable();
+}
+
+void ftrace_replace_code(int enable)
+{
+ struct ftrace_rec_iter *iter;
+ struct dyn_ftrace *rec;
+ const char *report = "adding breakpoints";
+ int count = 0;
+ int ret;
+
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+
+ ret = add_breakpoints(rec, enable);
+ if (ret)
+ goto remove_breakpoints;
+ count++;
+ }
+
+ run_sync();
+
+ report = "updating code";
+
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+
+ ret = add_update(rec, enable);
+ if (ret)
+ goto remove_breakpoints;
+ }
+
+ run_sync();
+
+ report = "removing breakpoints";
+
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+
+ ret = finish_update(rec, enable);
+ if (ret)
+ goto remove_breakpoints;
+ }
+
+ run_sync();
+
+ return;
+
+ remove_breakpoints:
+ ftrace_bug(ret, rec ? rec->ip : 0);
+ printk(KERN_WARNING "Failed on %s (%d):\n", report, count);
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+ remove_breakpoint(rec);
+ }
+}
+
+static int
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ unsigned const char *new_code)
+{
+ int ret;
+
+ ret = add_break(ip, old_code);
+ if (ret)
+ goto out;
+
+ run_sync();
+
+ ret = add_update_code(ip, new_code);
+ if (ret)
+ goto fail_update;
+
+ run_sync();
+
+ ret = ftrace_write(ip, new_code, 1);
+ if (ret) {
+ ret = -EPERM;
+ goto out;
+ }
+ run_sync();
+ out:
return ret;
+
+ fail_update:
+ probe_kernel_write((void *)ip, &old_code[0], 1);
+ goto out;
+}
+
+void arch_ftrace_update_code(int command)
+{
+ /* See comment above by declaration of modifying_ftrace_code */
+ atomic_inc(&modifying_ftrace_code);
+
+ ftrace_modify_all_code(command);
+
+ atomic_dec(&modifying_ftrace_code);
}
int __init ftrace_dyn_arch_init(void *data)
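ftrace_replace_code() above patches every call site in three passes, with a full core sync between passes: the first byte becomes int3, then the four trailing bytes are rewritten behind the breakpoint, then the breakpoint byte is replaced by the final first byte. Any CPU that executes the site meanwhile traps into ftrace_int3_handler(), which steps over the site as if it were a nop. A userspace toy model of one site (illustration only):

#include <stdio.h>
#include <string.h>

#define INSN_SIZE 5		/* MCOUNT_INSN_SIZE on x86 */
#define BRK 0xcc		/* int3 */

static unsigned char site[INSN_SIZE] = { 0x90, 0x90, 0x90, 0x90, 0x90 };

static void sync_cores(void) { /* stands in for run_sync() */ }

int main(void)
{
	const unsigned char call[INSN_SIZE] = { 0xe8, 0x01, 0x02, 0x03, 0x04 };

	site[0] = BRK;				   /* pass 1: add breakpoint */
	sync_cores();
	memcpy(site + 1, call + 1, INSN_SIZE - 1); /* pass 2: tail bytes */
	sync_cores();
	site[0] = call[0];			   /* pass 3: final opcode */
	sync_cores();

	printf("site: %02x %02x %02x %02x %02x\n",
	       site[0], site[1], site[2], site[3], site[4]);
	return 0;
}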
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 51ff18616d50..c18f59d10101 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -14,7 +14,6 @@
#include <asm/sections.h>
#include <asm/e820.h>
#include <asm/page.h>
-#include <asm/trampoline.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/bios_ebda.h>
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 3a3b779f41d3..037df57a99ac 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -24,7 +24,6 @@
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/e820.h>
-#include <asm/trampoline.h>
#include <asm/bios_ebda.h>
static void __init zap_identity_mappings(void)
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index ce0be7cd085e..d42ab17b7397 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -21,6 +21,7 @@
#include <asm/msr-index.h>
#include <asm/cpufeature.h>
#include <asm/percpu.h>
+#include <asm/nops.h>
/* Physical address */
#define pa(X) ((X) - __PAGE_OFFSET)
@@ -273,10 +274,7 @@ num_subarch_entries = (. - subarch_entries) / 4
* If cpu hotplug is not supported then this code can go in init section
* which will be freed later
*/
-
__CPUINIT
-
-#ifdef CONFIG_SMP
ENTRY(startup_32_smp)
cld
movl $(__BOOT_DS),%eax
@@ -287,7 +285,7 @@ ENTRY(startup_32_smp)
movl pa(stack_start),%ecx
movl %eax,%ss
leal -__PAGE_OFFSET(%ecx),%esp
-#endif /* CONFIG_SMP */
+
default_entry:
/*
@@ -363,28 +361,23 @@ default_entry:
pushl $0
popfl
-#ifdef CONFIG_SMP
- cmpb $0, ready
- jnz checkCPUtype
-#endif /* CONFIG_SMP */
-
/*
* start system 32-bit setup. We need to re-do some of the things done
* in 16-bit mode for the "real" operations.
*/
- call setup_idt
-
-checkCPUtype:
-
- movl $-1,X86_CPUID # -1 for no CPUID initially
-
+ movl setup_once_ref,%eax
+ andl %eax,%eax
+ jz 1f # Did we do this already?
+ call *%eax
+1:
+
/* check if it is 486 or 386. */
/*
* XXX - this does a lot of unnecessary setup. Alignment checks don't
* apply at our cpl of 0 and the stack ought to be aligned already, and
* we don't need to preserve eflags.
*/
-
+ movl $-1,X86_CPUID # -1 for no CPUID initially
movb $3,X86 # at least 386
pushfl # push EFLAGS
popl %eax # get EFLAGS
@@ -450,21 +443,6 @@ is386: movl $2,%ecx # set MP
movl $(__KERNEL_PERCPU), %eax
movl %eax,%fs # set this cpu's percpu
-#ifdef CONFIG_CC_STACKPROTECTOR
- /*
- * The linker can't handle this by relocation. Manually set
- * base address in stack canary segment descriptor.
- */
- cmpb $0,ready
- jne 1f
- movl $gdt_page,%eax
- movl $stack_canary,%ecx
- movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
- shrl $16, %ecx
- movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
- movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
-1:
-#endif
movl $(__KERNEL_STACK_CANARY),%eax
movl %eax,%gs
@@ -473,7 +451,6 @@ is386: movl $2,%ecx # set MP
cld # gcc2 wants the direction flag cleared at all times
pushl $0 # fake return address for unwinder
- movb $1, ready
jmp *(initial_code)
/*
@@ -495,81 +472,122 @@ check_x87:
.byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */
ret
+
+#include "verify_cpu.S"
+
/*
- * setup_idt
+ * setup_once
*
- * sets up a idt with 256 entries pointing to
- * ignore_int, interrupt gates. It doesn't actually load
- * idt - that can be done only after paging has been enabled
- * and the kernel moved to PAGE_OFFSET. Interrupts
- * are enabled elsewhere, when we can be relatively
- * sure everything is ok.
+ * The setup work we only want to run on the BSP.
*
* Warning: %esi is live across this function.
*/
-setup_idt:
- lea ignore_int,%edx
- movl $(__KERNEL_CS << 16),%eax
- movw %dx,%ax /* selector = 0x0010 = cs */
- movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
+__INIT
+setup_once:
+ /*
+ * Set up an idt with 256 entries pointing to ignore_int,
+ * interrupt gates. It doesn't actually load idt - that needs
+ * to be done on each CPU. Interrupts are enabled elsewhere,
+ * when we can be relatively sure everything is ok.
+ */
- lea idt_table,%edi
- mov $256,%ecx
-rp_sidt:
+ movl $idt_table,%edi
+ movl $early_idt_handlers,%eax
+ movl $NUM_EXCEPTION_VECTORS,%ecx
+1:
movl %eax,(%edi)
- movl %edx,4(%edi)
+ movl %eax,4(%edi)
+ /* interrupt gate, dpl=0, present */
+ movl $(0x8E000000 + __KERNEL_CS),2(%edi)
+ addl $9,%eax
addl $8,%edi
- dec %ecx
- jne rp_sidt
+ loop 1b
-.macro set_early_handler handler,trapno
- lea \handler,%edx
+ movl $256 - NUM_EXCEPTION_VECTORS,%ecx
+ movl $ignore_int,%edx
movl $(__KERNEL_CS << 16),%eax
- movw %dx,%ax
+ movw %dx,%ax /* selector = 0x0010 = cs */
movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
- lea idt_table,%edi
- movl %eax,8*\trapno(%edi)
- movl %edx,8*\trapno+4(%edi)
-.endm
+2:
+ movl %eax,(%edi)
+ movl %edx,4(%edi)
+ addl $8,%edi
+ loop 2b
- set_early_handler handler=early_divide_err,trapno=0
- set_early_handler handler=early_illegal_opcode,trapno=6
- set_early_handler handler=early_protection_fault,trapno=13
- set_early_handler handler=early_page_fault,trapno=14
+#ifdef CONFIG_CC_STACKPROTECTOR
+ /*
+ * Configure the stack canary. The linker can't handle this by
+ * relocation. Manually set base address in stack canary
+ * segment descriptor.
+ */
+ movl $gdt_page,%eax
+ movl $stack_canary,%ecx
+ movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
+ shrl $16, %ecx
+ movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
+ movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
+#endif
+ andl $0,setup_once_ref /* Once is enough, thanks */
ret
-early_divide_err:
- xor %edx,%edx
- pushl $0 /* fake errcode */
- jmp early_fault
+ENTRY(early_idt_handlers)
+ # 36(%esp) %eflags
+ # 32(%esp) %cs
+ # 28(%esp) %eip
+ # 24(%esp) error code
+ i = 0
+ .rept NUM_EXCEPTION_VECTORS
+ .if (EXCEPTION_ERRCODE_MASK >> i) & 1
+ ASM_NOP2
+ .else
+ pushl $0 # Dummy error code, to make stack frame uniform
+ .endif
+ pushl $i # 20(%esp) Vector number
+ jmp early_idt_handler
+ i = i + 1
+ .endr
+ENDPROC(early_idt_handlers)
+
+ /* This is global to keep gas from relaxing the jumps */
+ENTRY(early_idt_handler)
+ cld
+ cmpl $2,%ss:early_recursion_flag
+ je hlt_loop
+ incl %ss:early_recursion_flag
-early_illegal_opcode:
- movl $6,%edx
- pushl $0 /* fake errcode */
- jmp early_fault
+ push %eax # 16(%esp)
+ push %ecx # 12(%esp)
+ push %edx # 8(%esp)
+ push %ds # 4(%esp)
+ push %es # 0(%esp)
+ movl $(__KERNEL_DS),%eax
+ movl %eax,%ds
+ movl %eax,%es
-early_protection_fault:
- movl $13,%edx
- jmp early_fault
+ cmpl $(__KERNEL_CS),32(%esp)
+ jne 10f
-early_page_fault:
- movl $14,%edx
- jmp early_fault
+ leal 28(%esp),%eax # Pointer to %eip
+ call early_fixup_exception
+ andl %eax,%eax
+ jnz ex_entry /* found an exception entry */
-early_fault:
- cld
+10:
#ifdef CONFIG_PRINTK
- pusha
- movl $(__KERNEL_DS),%eax
- movl %eax,%ds
- movl %eax,%es
- cmpl $2,early_recursion_flag
- je hlt_loop
- incl early_recursion_flag
+ xorl %eax,%eax
+ movw %ax,2(%esp) /* clean up the segment values on some cpus */
+ movw %ax,6(%esp)
+ movw %ax,34(%esp)
+ leal 40(%esp),%eax
+ pushl %eax /* %esp before the exception */
+ pushl %ebx
+ pushl %ebp
+ pushl %esi
+ pushl %edi
movl %cr2,%eax
pushl %eax
- pushl %edx /* trapno */
+ pushl (20+6*4)(%esp) /* trapno */
pushl $fault_msg
call printk
#endif
@@ -578,6 +596,17 @@ hlt_loop:
hlt
jmp hlt_loop
+ex_entry:
+ pop %es
+ pop %ds
+ pop %edx
+ pop %ecx
+ pop %eax
+ addl $8,%esp /* drop vector number and error code */
+ decl %ss:early_recursion_flag
+ iret
+ENDPROC(early_idt_handler)
+
/* This is the default interrupt "handler" :-) */
ALIGN
ignore_int:
@@ -611,13 +640,18 @@ ignore_int:
popl %eax
#endif
iret
+ENDPROC(ignore_int)
+__INITDATA
+ .align 4
+early_recursion_flag:
+ .long 0
-#include "verify_cpu.S"
-
- __REFDATA
-.align 4
+__REFDATA
+ .align 4
ENTRY(initial_code)
.long i386_start_kernel
+ENTRY(setup_once_ref)
+ .long setup_once
/*
* BSS section
@@ -670,22 +704,19 @@ ENTRY(initial_page_table)
ENTRY(stack_start)
.long init_thread_union+THREAD_SIZE
-early_recursion_flag:
- .long 0
-
-ready: .byte 0
-
+__INITRODATA
int_msg:
.asciz "Unknown interrupt or fault at: %p %p %p\n"
fault_msg:
/* fault info: */
.ascii "BUG: Int %d: CR2 %p\n"
-/* pusha regs: */
- .ascii " EDI %p ESI %p EBP %p ESP %p\n"
- .ascii " EBX %p EDX %p ECX %p EAX %p\n"
+/* regs pushed in early_idt_handler: */
+ .ascii " EDI %p ESI %p EBP %p EBX %p\n"
+ .ascii " ESP %p ES %p DS %p\n"
+ .ascii " EDX %p ECX %p EAX %p\n"
/* fault frame: */
- .ascii " err %p EIP %p CS %p flg %p\n"
+ .ascii " vec %p err %p EIP %p CS %p flg %p\n"
.ascii "Stack: %p %p %p %p %p %p %p %p\n"
.ascii " %p %p %p %p %p %p %p %p\n"
.asciz " %p %p %p %p %p %p %p %p\n"
@@ -699,6 +730,7 @@ fault_msg:
* segment size, and 32-bit linear address value:
*/
+ .data
.globl boot_gdt_descr
.globl idt_descr
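The early_idt_handlers .rept blocks (in both head_32.S above and head_64.S below) generate one small stub per exception vector with a uniform stack frame: vectors where the CPU already pushed an error code get a 2-byte nop for padding, the rest push a dummy 0. The removed 64-bit code tested the mask 0x27d00 for the same distinction (vectors 8, 10-14 and 17); EXCEPTION_ERRCODE_MASK is the kernel's authoritative constant. A sketch of the per-vector decision (illustration only):

#include <stdio.h>

#define ERRCODE_MASK 0x27d00	/* #DF, #TS, #NP, #SS, #GP, #PF, #AC */

int main(void)
{
	int vec;

	for (vec = 0; vec < 32; vec++)
		printf("vector %2d: %s\n", vec,
		       (ERRCODE_MASK >> vec) & 1 ?
		       "CPU pushes error code (stub pads with ASM_NOP2)" :
		       "no error code (stub pushes dummy 0)");
	return 0;
}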
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 40f4eb3766d1..94bf9cc2c7ee 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,12 +19,15 @@
#include <asm/cache.h>
#include <asm/processor-flags.h>
#include <asm/percpu.h>
+#include <asm/nops.h>
#ifdef CONFIG_PARAVIRT
#include <asm/asm-offsets.h>
#include <asm/paravirt.h>
+#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg
#else
-#define GET_CR2_INTO_RCX movq %cr2, %rcx
+#define GET_CR2_INTO(reg) movq %cr2, reg
+#define INTERRUPT_RETURN iretq
#endif
/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
@@ -136,10 +139,6 @@ ident_complete:
/* Fixup phys_base */
addq %rbp, phys_base(%rip)
- /* Fixup trampoline */
- addq %rbp, trampoline_level4_pgt + 0(%rip)
- addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
-
/* Due to ENTRY(), sometimes the empty space gets filled with
* zeros. Better take a jmp than relying on empty space being
* filled with 0x90 (nop)
@@ -270,36 +269,56 @@ bad_address:
jmp bad_address
.section ".init.text","ax"
-#ifdef CONFIG_EARLY_PRINTK
.globl early_idt_handlers
early_idt_handlers:
+ # 104(%rsp) %rflags
+ # 96(%rsp) %cs
+ # 88(%rsp) %rip
+ # 80(%rsp) error code
i = 0
.rept NUM_EXCEPTION_VECTORS
- movl $i, %esi
+ .if (EXCEPTION_ERRCODE_MASK >> i) & 1
+ ASM_NOP2
+ .else
+ pushq $0 # Dummy error code, to make stack frame uniform
+ .endif
+ pushq $i # 72(%rsp) Vector number
jmp early_idt_handler
i = i + 1
.endr
-#endif
ENTRY(early_idt_handler)
-#ifdef CONFIG_EARLY_PRINTK
+ cld
+
cmpl $2,early_recursion_flag(%rip)
jz 1f
incl early_recursion_flag(%rip)
- GET_CR2_INTO_RCX
- movq %rcx,%r9
- xorl %r8d,%r8d # zero for error code
- movl %esi,%ecx # get vector number
- # Test %ecx against mask of vectors that push error code.
- cmpl $31,%ecx
- ja 0f
- movl $1,%eax
- salq %cl,%rax
- testl $0x27d00,%eax
- je 0f
- popq %r8 # get error code
-0: movq 0(%rsp),%rcx # get ip
- movq 8(%rsp),%rdx # get cs
+
+ pushq %rax # 64(%rsp)
+ pushq %rcx # 56(%rsp)
+ pushq %rdx # 48(%rsp)
+ pushq %rsi # 40(%rsp)
+ pushq %rdi # 32(%rsp)
+ pushq %r8 # 24(%rsp)
+ pushq %r9 # 16(%rsp)
+ pushq %r10 # 8(%rsp)
+ pushq %r11 # 0(%rsp)
+
+ cmpl $__KERNEL_CS,96(%rsp)
+ jne 10f
+
+ leaq 88(%rsp),%rdi # Pointer to %rip
+ call early_fixup_exception
+ andl %eax,%eax
+ jnz 20f # Found an exception entry
+
+10:
+#ifdef CONFIG_EARLY_PRINTK
+ GET_CR2_INTO(%r9) # can clobber any volatile register if pv
+ movl 80(%rsp),%r8d # error code
+ movl 72(%rsp),%esi # vector number
+ movl 96(%rsp),%edx # %cs
+ movq 88(%rsp),%rcx # %rip
xorl %eax,%eax
leaq early_idt_msg(%rip),%rdi
call early_printk
@@ -308,17 +327,32 @@ ENTRY(early_idt_handler)
call dump_stack
#ifdef CONFIG_KALLSYMS
leaq early_idt_ripmsg(%rip),%rdi
- movq 0(%rsp),%rsi # get rip again
+ movq 40(%rsp),%rsi # %rip again
call __print_symbol
#endif
#endif /* EARLY_PRINTK */
1: hlt
jmp 1b
-#ifdef CONFIG_EARLY_PRINTK
+20: # Exception table entry found
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rdi
+ popq %rsi
+ popq %rdx
+ popq %rcx
+ popq %rax
+ addq $16,%rsp # drop vector number and error code
+ decl early_recursion_flag(%rip)
+ INTERRUPT_RETURN
+
+ .balign 4
early_recursion_flag:
.long 0
+#ifdef CONFIG_EARLY_PRINTK
early_idt_msg:
.asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n"
early_idt_ripmsg:
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ad0de0c2714e..1460a5df92f7 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -94,13 +94,18 @@ static int hpet_verbose;
static int __init hpet_setup(char *str)
{
- if (str) {
+ while (str) {
+ char *next = strchr(str, ',');
+
+ if (next)
+ *next++ = 0;
if (!strncmp("disable", str, 7))
boot_hpet_disable = 1;
if (!strncmp("force", str, 5))
hpet_force_user = 1;
if (!strncmp("verbose", str, 7))
hpet_verbose = 1;
+ str = next;
}
return 1;
}
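hpet_setup() now walks a comma-separated option list instead of matching only a single token, so "hpet=force,verbose" works. A userspace sketch of the same strchr()-based tokenizing (illustration only):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char arg[] = "force,verbose";	/* as passed after "hpet=" */
	char *str = arg;

	while (str) {
		char *next = strchr(str, ',');

		if (next)
			*next++ = 0;	/* terminate this token */
		printf("option: %s\n", str);
		str = next;		/* NULL after the last token */
	}
	return 0;
}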
@@ -319,8 +324,6 @@ static void hpet_set_mode(enum clock_event_mode mode,
now = hpet_readl(HPET_COUNTER);
cmp = now + (unsigned int) delta;
cfg = hpet_readl(HPET_Tn_CFG(timer));
- /* Make sure we use edge triggered interrupts */
- cfg &= ~HPET_TN_LEVEL;
cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
HPET_TN_SETVAL | HPET_TN_32BIT;
hpet_writel(cfg, HPET_Tn_CFG(timer));
@@ -787,15 +790,16 @@ static int hpet_clocksource_register(void)
return 0;
}
+static u32 *hpet_boot_cfg;
+
/**
* hpet_enable - Try to setup the HPET timer. Returns 1 on success.
*/
int __init hpet_enable(void)
{
- unsigned long hpet_period;
- unsigned int id;
+ u32 hpet_period, cfg, id;
u64 freq;
- int i;
+ unsigned int i, last;
if (!is_hpet_capable())
return 0;
@@ -847,15 +851,45 @@ int __init hpet_enable(void)
id = hpet_readl(HPET_ID);
hpet_print_config();
+ last = (id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
+
#ifdef CONFIG_HPET_EMULATE_RTC
/*
* The legacy routing mode needs at least two channels, tick timer
* and the rtc emulation channel.
*/
- if (!(id & HPET_ID_NUMBER))
+ if (!last)
goto out_nohpet;
#endif
+ cfg = hpet_readl(HPET_CFG);
+ hpet_boot_cfg = kmalloc((last + 2) * sizeof(*hpet_boot_cfg),
+ GFP_KERNEL);
+ if (hpet_boot_cfg)
+ *hpet_boot_cfg = cfg;
+ else
+ pr_warn("HPET initial state will not be saved\n");
+ cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
+ hpet_writel(cfg, HPET_CFG);
+ if (cfg)
+ pr_warn("HPET: Unrecognized bits %#x set in global cfg\n",
+ cfg);
+
+ for (i = 0; i <= last; ++i) {
+ cfg = hpet_readl(HPET_Tn_CFG(i));
+ if (hpet_boot_cfg)
+ hpet_boot_cfg[i + 1] = cfg;
+ cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB);
+ hpet_writel(cfg, HPET_Tn_CFG(i));
+ cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP
+ | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE
+ | HPET_TN_FSB | HPET_TN_FSB_CAP);
+ if (cfg)
+ pr_warn("HPET: Unrecognized bits %#x set in cfg#%u\n",
+ cfg, i);
+ }
+ hpet_print_config();
+
if (hpet_clocksource_register())
goto out_nohpet;
@@ -923,14 +957,28 @@ fs_initcall(hpet_late_init);
void hpet_disable(void)
{
if (is_hpet_capable() && hpet_virt_address) {
- unsigned int cfg = hpet_readl(HPET_CFG);
+ unsigned int cfg = hpet_readl(HPET_CFG), id, last;
- if (hpet_legacy_int_enabled) {
+ if (hpet_boot_cfg)
+ cfg = *hpet_boot_cfg;
+ else if (hpet_legacy_int_enabled) {
cfg &= ~HPET_CFG_LEGACY;
hpet_legacy_int_enabled = 0;
}
cfg &= ~HPET_CFG_ENABLE;
hpet_writel(cfg, HPET_CFG);
+
+ if (!hpet_boot_cfg)
+ return;
+
+ id = hpet_readl(HPET_ID);
+ last = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
+
+ for (id = 0; id <= last; ++id)
+ hpet_writel(hpet_boot_cfg[id + 1], HPET_Tn_CFG(id));
+
+ if (*hpet_boot_cfg & HPET_CFG_ENABLE)
+ hpet_writel(*hpet_boot_cfg, HPET_CFG);
}
}
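The save/restore added above keeps the firmware's HPET state in one small array: hpet_boot_cfg[0] holds the global config register and hpet_boot_cfg[i + 1] holds timer i's config, so hpet_disable() can hand the hardware back exactly as the firmware left it. A sketch of the bookkeeping with an in-memory stand-in for the HPET registers (illustration only):

#include <stdio.h>
#include <stdlib.h>

#define TIMERS 3	/* i.e. last == TIMERS - 1 */

static unsigned int global_cfg = 0x5;
static unsigned int timer_cfg[TIMERS] = { 0x30, 0x10, 0x0 };

int main(void)
{
	unsigned int *boot_cfg, i, last = TIMERS - 1;

	boot_cfg = malloc((last + 2) * sizeof(*boot_cfg));
	if (!boot_cfg)
		return 1;		/* kernel would warn and skip saving */

	boot_cfg[0] = global_cfg;	/* HPET_CFG */
	for (i = 0; i <= last; i++)
		boot_cfg[i + 1] = timer_cfg[i];	/* HPET_Tn_CFG(i) */

	global_cfg = 0;			/* ... kernel reprograms the HPET ... */

	global_cfg = boot_cfg[0];	/* hpet_disable(): restore everything */
	for (i = 0; i <= last; i++)
		timer_cfg[i] = boot_cfg[i + 1];

	printf("restored global cfg %#x\n", global_cfg);
	free(boot_cfg);
	return 0;
}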
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 7734bcbb5a3a..f250431fb505 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -88,7 +88,7 @@ void kernel_fpu_begin(void)
__thread_clear_has_fpu(me);
/* We do 'stts()' in kernel_fpu_end() */
} else {
- percpu_write(fpu_owner_task, NULL);
+ this_cpu_write(fpu_owner_task, NULL);
clts();
}
}
@@ -235,6 +235,7 @@ int init_fpu(struct task_struct *tsk)
if (tsk_used_math(tsk)) {
if (HAVE_HWFP && tsk == current)
unlazy_fpu(tsk);
+ tsk->thread.fpu.last_cpu = ~0;
return 0;
}
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
deleted file mode 100644
index 43e9ccf44947..000000000000
--- a/arch/x86/kernel/init_task.c
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/init_task.h>
-#include <linux/fs.h>
-#include <linux/mqueue.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-
-static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
-static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
-
-/*
- * Initial thread structure.
- *
- * We need to make sure that this is THREAD_SIZE aligned due to the
- * way process stacks are handled. This is done by having a special
- * "init_task" linker map entry..
- */
-union thread_union init_thread_union __init_task_data =
- { INIT_THREAD_INFO(init_task) };
-
-/*
- * Initial task structure.
- *
- * All other task structs will be allocated on slabs in fork.c
- */
-struct task_struct init_task = INIT_TASK(init_task);
-EXPORT_SYMBOL(init_task);
-
-/*
- * per-CPU TSS segments. Threads are completely 'soft' on Linux,
- * no more per-task TSS's. The TSS size is kept cacheline-aligned
- * so they are allowed to end up in the .data..cacheline_aligned
- * section. Since TSS's are completely CPU-local, we want them
- * on exact cacheline boundaries, to eliminate cacheline ping-pong.
- */
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
-
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 58b7f27cb3e9..344faf8d0d62 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -127,8 +127,8 @@ void __cpuinit irq_ctx_init(int cpu)
return;
irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
- THREAD_FLAGS,
- THREAD_ORDER));
+ THREADINFO_GFP,
+ THREAD_SIZE_ORDER));
memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
irqctx->tinfo.cpu = cpu;
irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
@@ -137,8 +137,8 @@ void __cpuinit irq_ctx_init(int cpu)
per_cpu(hardirq_ctx, cpu) = irqctx;
irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
- THREAD_FLAGS,
- THREAD_ORDER));
+ THREADINFO_GFP,
+ THREAD_SIZE_ORDER));
memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
irqctx->tinfo.cpu = cpu;
irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8bfb6146f753..3f61904365cf 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -444,12 +444,12 @@ void kgdb_roundup_cpus(unsigned long flags)
/**
* kgdb_arch_handle_exception - Handle architecture specific GDB packets.
- * @vector: The error vector of the exception that happened.
+ * @e_vector: The error vector of the exception that happened.
* @signo: The signal number of the exception that happened.
* @err_code: The error code of the exception that happened.
- * @remcom_in_buffer: The buffer of the packet we have read.
- * @remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into.
- * @regs: The &struct pt_regs of the current process.
+ * @remcomInBuffer: The buffer of the packet we have read.
+ * @remcomOutBuffer: The buffer of %BUFMAX bytes to write a packet into.
+ * @linux_regs: The &struct pt_regs of the current process.
*
* This function MUST handle the 'c' and 's' command packets,
* as well packets to set / remove a hardware breakpoint, if used.
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index e213fc8408d2..e2f751efb7b1 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1037,9 +1037,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
"current sp %p does not match saved sp %p\n",
stack_addr(regs), kcb->jprobe_saved_sp);
printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
- show_registers(saved_regs);
+ show_regs(saved_regs);
printk(KERN_ERR "Current registers\n");
- show_registers(regs);
+ show_regs(regs);
BUG();
}
*regs = kcb->jprobe_saved_regs;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b8ba6e4a27e4..e554e5ad2fe8 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -79,7 +79,6 @@ struct kvm_task_sleep_node {
u32 token;
int cpu;
bool halted;
- struct mm_struct *mm;
};
static struct kvm_task_sleep_head {
@@ -126,9 +125,7 @@ void kvm_async_pf_task_wait(u32 token)
n.token = token;
n.cpu = smp_processor_id();
- n.mm = current->active_mm;
n.halted = idle || preempt_count() > 1;
- atomic_inc(&n.mm->mm_count);
init_waitqueue_head(&n.wq);
hlist_add_head(&n.link, &b->list);
spin_unlock(&b->lock);
@@ -161,9 +158,6 @@ EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
hlist_del_init(&n->link);
- if (!n->mm)
- return;
- mmdrop(n->mm);
if (n->halted)
smp_send_reschedule(n->cpu);
else if (waitqueue_active(&n->wq))
@@ -207,7 +201,7 @@ again:
* async PF was not yet handled.
* Add dummy entry for the token.
*/
- n = kmalloc(sizeof(*n), GFP_ATOMIC);
+ n = kzalloc(sizeof(*n), GFP_ATOMIC);
if (!n) {
/*
* Allocation failed! Busy wait while other cpu
@@ -219,7 +213,6 @@ again:
}
n->token = token;
n->cpu = smp_processor_id();
- n->mm = NULL;
init_waitqueue_head(&n->wq);
hlist_add_head(&n->link, &b->list);
} else
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f8492da65bfc..f1b42b3a186c 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -22,6 +22,7 @@
#include <asm/msr.h>
#include <asm/apic.h>
#include <linux/percpu.h>
+#include <linux/hardirq.h>
#include <asm/x86_init.h>
#include <asm/reboot.h>
@@ -114,6 +115,20 @@ static void kvm_get_preset_lpj(void)
preset_lpj = lpj;
}
+bool kvm_check_and_clear_guest_paused(void)
+{
+ bool ret = false;
+ struct pvclock_vcpu_time_info *src;
+
+ src = &__get_cpu_var(hv_clock);
+ if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
+ __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED);
+ ret = true;
+ }
+
+ return ret;
+}
+
static struct clocksource kvm_clock = {
.name = "kvm-clock",
.read = kvm_clock_get_cycles,
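kvm_check_and_clear_guest_paused() above is a test-and-clear on the per-cpu pvclock flags: the host sets PVCLOCK_GUEST_STOPPED while the guest was paused, and the guest consumes the flag exactly once (e.g. so a watchdog can be told not to fire over lost time). A toy model (illustration only; the bit value is assumed for the sketch):

#include <stdbool.h>
#include <stdio.h>

#define GUEST_STOPPED 0x2	/* assumed value of PVCLOCK_GUEST_STOPPED */

static unsigned char flags = GUEST_STOPPED;	/* stands in for hv_clock.flags */

static bool check_and_clear_guest_paused(void)
{
	if (flags & GUEST_STOPPED) {
		flags &= ~GUEST_STOPPED;	/* report it only once */
		return true;
	}
	return false;
}

int main(void)
{
	printf("%d %d\n", check_and_clear_guest_paused(),
	       check_and_clear_guest_paused());	/* prints "1 0" */
	return 0;
}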
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
deleted file mode 100644
index 7eb1e2b97827..000000000000
--- a/arch/x86/kernel/mca_32.c
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Written by Martin Kolinek, February 1996
- *
- * Changes:
- *
- * Chris Beauregard July 28th, 1996
- * - Fixed up integrated SCSI detection
- *
- * Chris Beauregard August 3rd, 1996
- * - Made mca_info local
- * - Made integrated registers accessible through standard function calls
- * - Added name field
- * - More sanity checking
- *
- * Chris Beauregard August 9th, 1996
- * - Rewrote /proc/mca
- *
- * Chris Beauregard January 7th, 1997
- * - Added basic NMI-processing
- * - Added more information to mca_info structure
- *
- * David Weinehall October 12th, 1998
- * - Made a lot of cleaning up in the source
- * - Added use of save_flags / restore_flags
- * - Added the 'driver_loaded' flag in MCA_adapter
- * - Added an alternative implemention of ZP Gu's mca_find_unused_adapter
- *
- * David Weinehall March 24th, 1999
- * - Fixed the output of 'Driver Installed' in /proc/mca/pos
- * - Made the Integrated Video & SCSI show up even if they have id 0000
- *
- * Alexander Viro November 9th, 1999
- * - Switched to regular procfs methods
- *
- * Alfred Arnold & David Weinehall August 23rd, 2000
- * - Added support for Planar POS-registers
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/mca.h>
-#include <linux/kprobes.h>
-#include <linux/slab.h>
-#include <asm/io.h>
-#include <linux/proc_fs.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/ioport.h>
-#include <asm/uaccess.h>
-#include <linux/init.h>
-
-static unsigned char which_scsi;
-
-int MCA_bus;
-EXPORT_SYMBOL(MCA_bus);
-
-/*
- * Motherboard register spinlock. Untested on SMP at the moment, but
- * are there any MCA SMP boxes?
- *
- * Yes - Alan
- */
-static DEFINE_SPINLOCK(mca_lock);
-
-/* Build the status info for the adapter */
-
-static void mca_configure_adapter_status(struct mca_device *mca_dev)
-{
- mca_dev->status = MCA_ADAPTER_NONE;
-
- mca_dev->pos_id = mca_dev->pos[0]
- + (mca_dev->pos[1] << 8);
-
- if (!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) {
-
- /*
- * id = 0x0000 usually indicates hardware failure,
- * however, ZP Gu (zpg@castle.net> reports that his 9556
- * has 0x0000 as id and everything still works. There
- * also seem to be an adapter with id = 0x0000; the
- * NCR Parallel Bus Memory Card. Until this is confirmed,
- * however, this code will stay.
- */
-
- mca_dev->status = MCA_ADAPTER_ERROR;
-
- return;
- } else if (mca_dev->pos_id != 0xffff) {
-
- /*
- * 0xffff usually indicates that there's no adapter,
- * however, some integrated adapters may have 0xffff as
- * their id and still be valid. Examples are on-board
- * VGA of the 55sx, the integrated SCSI of the 56 & 57,
- * and possibly also the 95 ULTIMEDIA.
- */
-
- mca_dev->status = MCA_ADAPTER_NORMAL;
- }
-
- if ((mca_dev->pos_id == 0xffff ||
- mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) {
- int j;
-
- for (j = 2; j < 8; j++) {
- if (mca_dev->pos[j] != 0xff) {
- mca_dev->status = MCA_ADAPTER_NORMAL;
- break;
- }
- }
- }
-
- if (!(mca_dev->pos[2] & MCA_ENABLED)) {
-
- /* enabled bit is in POS 2 */
-
- mca_dev->status = MCA_ADAPTER_DISABLED;
- }
-} /* mca_configure_adapter_status */
-
-/*--------------------------------------------------------------------*/
-
-static struct resource mca_standard_resources[] = {
- { .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" },
- { .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" },
- { .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" },
- { .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" },
- { .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" },
- { .start = 0x96, .end = 0x97, .name = "POS (MCA)" },
- { .start = 0x100, .end = 0x107, .name = "POS (MCA)" }
-};
-
-#define MCA_STANDARD_RESOURCES ARRAY_SIZE(mca_standard_resources)
-
-/*
- * mca_read_and_store_pos - read the POS registers into a memory buffer
- * @pos: a char pointer to 8 bytes, contains the POS register value on
- * successful return
- *
- * Returns 1 if a card actually exists (i.e. the pos isn't
- * all 0xff) or 0 otherwise
- */
-static int mca_read_and_store_pos(unsigned char *pos)
-{
- int j;
- int found = 0;
-
- for (j = 0; j < 8; j++) {
- pos[j] = inb_p(MCA_POS_REG(j));
- if (pos[j] != 0xff) {
- /* 0xff all across means no device. 0x00 means
- * something's broken, but a device is
- * probably there. However, if you get 0x00
- * from a motherboard register it won't matter
- * what we find. For the record, on the
- * 57SLC, the integrated SCSI adapter has
- * 0xffff for the adapter ID, but nonzero for
- * other registers. */
-
- found = 1;
- }
- }
- return found;
-}
-
-static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg)
-{
- unsigned char byte;
- unsigned long flags;
-
- if (reg < 0 || reg >= 8)
- return 0;
-
- spin_lock_irqsave(&mca_lock, flags);
- if (mca_dev->pos_register) {
- /* Disable adapter setup, enable motherboard setup */
-
- outb_p(0, MCA_ADAPTER_SETUP_REG);
- outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
-
- byte = inb_p(MCA_POS_REG(reg));
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
- } else {
-
- /* Make sure motherboard setup is off */
-
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
- /* Read the appropriate register */
-
- outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG);
- byte = inb_p(MCA_POS_REG(reg));
- outb_p(0, MCA_ADAPTER_SETUP_REG);
- }
- spin_unlock_irqrestore(&mca_lock, flags);
-
- mca_dev->pos[reg] = byte;
-
- return byte;
-}
-
-static void mca_pc_write_pos(struct mca_device *mca_dev, int reg,
- unsigned char byte)
-{
- unsigned long flags;
-
- if (reg < 0 || reg >= 8)
- return;
-
- spin_lock_irqsave(&mca_lock, flags);
-
- /* Make sure motherboard setup is off */
-
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
- /* Read in the appropriate register */
-
- outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG);
- outb_p(byte, MCA_POS_REG(reg));
- outb_p(0, MCA_ADAPTER_SETUP_REG);
-
- spin_unlock_irqrestore(&mca_lock, flags);
-
- /* Update the global register list, while we have the byte */
-
- mca_dev->pos[reg] = byte;
-
-}
-
-/* for the primary MCA bus, we have identity transforms */
-static int mca_dummy_transform_irq(struct mca_device *mca_dev, int irq)
-{
- return irq;
-}
-
-static int mca_dummy_transform_ioport(struct mca_device *mca_dev, int port)
-{
- return port;
-}
-
-static void *mca_dummy_transform_memory(struct mca_device *mca_dev, void *mem)
-{
- return mem;
-}
-
-
-static int __init mca_init(void)
-{
- unsigned int i, j;
- struct mca_device *mca_dev;
- unsigned char pos[8];
- short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00};
- struct mca_bus *bus;
-
- /*
- * WARNING: Be careful when making changes here. Putting an adapter
- * and the motherboard simultaneously into setup mode may result in
- * damage to chips (according to The Indispensable PC Hardware Book
- * by Hans-Peter Messmer). Also, we disable system interrupts (so
- * that we are not disturbed in the middle of this).
- */
-
- /* Make sure the MCA bus is present */
-
- if (mca_system_init()) {
- printk(KERN_ERR "MCA bus system initialisation failed\n");
- return -ENODEV;
- }
-
- if (!MCA_bus)
- return -ENODEV;
-
- printk(KERN_INFO "Micro Channel bus detected.\n");
-
- /* All MCA systems have at least a primary bus */
- bus = mca_attach_bus(MCA_PRIMARY_BUS);
- if (!bus)
- goto out_nomem;
- bus->default_dma_mask = 0xffffffffLL;
- bus->f.mca_write_pos = mca_pc_write_pos;
- bus->f.mca_read_pos = mca_pc_read_pos;
- bus->f.mca_transform_irq = mca_dummy_transform_irq;
- bus->f.mca_transform_ioport = mca_dummy_transform_ioport;
- bus->f.mca_transform_memory = mca_dummy_transform_memory;
-
- /* get the motherboard device */
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
- if (unlikely(!mca_dev))
- goto out_nomem;
-
- /*
- * We do not expect many MCA interrupts during initialization,
- * but let us be safe:
- */
- spin_lock_irq(&mca_lock);
-
- /* Make sure adapter setup is off */
-
- outb_p(0, MCA_ADAPTER_SETUP_REG);
-
- /* Read motherboard POS registers */
-
- mca_dev->pos_register = 0x7f;
- outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
- mca_dev->name[0] = 0;
- mca_read_and_store_pos(mca_dev->pos);
- mca_configure_adapter_status(mca_dev);
- /* fake POS and slot for a motherboard */
- mca_dev->pos_id = MCA_MOTHERBOARD_POS;
- mca_dev->slot = MCA_MOTHERBOARD;
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
-
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
- if (unlikely(!mca_dev))
- goto out_unlock_nomem;
-
- /* Put motherboard into video setup mode, read integrated video
- * POS registers, and turn motherboard setup off.
- */
-
- mca_dev->pos_register = 0xdf;
- outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
- mca_dev->name[0] = 0;
- mca_read_and_store_pos(mca_dev->pos);
- mca_configure_adapter_status(mca_dev);
- /* fake POS and slot for the integrated video */
- mca_dev->pos_id = MCA_INTEGVIDEO_POS;
- mca_dev->slot = MCA_INTEGVIDEO;
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
-
- /*
- * Put motherboard into scsi setup mode, read integrated scsi
- * POS registers, and turn motherboard setup off.
- *
- * It seems there are two possible SCSI registers. Martin says that
- * for the 56,57, 0xf7 is the one, but fails on the 76.
- * Alfredo (apena@vnet.ibm.com) says
- * 0xfd works on his machine. We'll try both of them. I figure it's
- * a good bet that only one could be valid at a time. This could
- * screw up though if one is used for something else on the other
- * machine.
- */
-
- for (i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) {
- outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG);
- if (mca_read_and_store_pos(pos))
- break;
- }
- if (which_scsi) {
- /* found a scsi card */
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
- if (unlikely(!mca_dev))
- goto out_unlock_nomem;
-
- for (j = 0; j < 8; j++)
- mca_dev->pos[j] = pos[j];
-
- mca_configure_adapter_status(mca_dev);
- /* fake POS and slot for integrated SCSI controller */
- mca_dev->pos_id = MCA_INTEGSCSI_POS;
- mca_dev->slot = MCA_INTEGSCSI;
- mca_dev->pos_register = which_scsi;
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
- }
-
- /* Turn off motherboard setup */
-
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
- /*
- * Now loop over MCA slots: put each adapter into setup mode, and
- * read its POS registers. Then put adapter setup off.
- */
-
- for (i = 0; i < MCA_MAX_SLOT_NR; i++) {
- outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG);
- if (!mca_read_and_store_pos(pos))
- continue;
-
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
- if (unlikely(!mca_dev))
- goto out_unlock_nomem;
-
- for (j = 0; j < 8; j++)
- mca_dev->pos[j] = pos[j];
-
- mca_dev->driver_loaded = 0;
- mca_dev->slot = i;
- mca_dev->pos_register = 0;
- mca_configure_adapter_status(mca_dev);
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
- }
- outb_p(0, MCA_ADAPTER_SETUP_REG);
-
- /* Enable interrupts and return memory start */
- spin_unlock_irq(&mca_lock);
-
- for (i = 0; i < MCA_STANDARD_RESOURCES; i++)
- request_resource(&ioport_resource, mca_standard_resources + i);
-
- mca_do_proc_init();
-
- return 0;
-
- out_unlock_nomem:
- spin_unlock_irq(&mca_lock);
- out_nomem:
- printk(KERN_EMERG "Failed memory allocation in MCA setup!\n");
- return -ENOMEM;
-}
-
-subsys_initcall(mca_init);
-
-/*--------------------------------------------------------------------*/
-
-static __kprobes void
-mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
-{
- int slot = mca_dev->slot;
-
- if (slot == MCA_INTEGSCSI) {
- printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n",
- mca_dev->name);
- } else if (slot == MCA_INTEGVIDEO) {
- printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n",
- mca_dev->name);
- } else if (slot == MCA_MOTHERBOARD) {
- printk(KERN_CRIT "NMI: caused by motherboard (%s)\n",
- mca_dev->name);
- }
-
- /* More info available in POS 6 and 7? */
-
- if (check_flag) {
- unsigned char pos6, pos7;
-
- pos6 = mca_device_read_pos(mca_dev, 6);
- pos7 = mca_device_read_pos(mca_dev, 7);
-
- printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7);
- }
-
-} /* mca_handle_nmi_slot */
-
-/*--------------------------------------------------------------------*/
-
-static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data)
-{
- struct mca_device *mca_dev = to_mca_device(dev);
- unsigned char pos5;
-
- pos5 = mca_device_read_pos(mca_dev, 5);
-
- if (!(pos5 & 0x80)) {
- /*
- * Bit 7 of POS 5 is reset when this adapter has a hardware
- * error. Bit 6 is reset if there's error information
- * available in POS 6 and 7.
- */
- mca_handle_nmi_device(mca_dev, !(pos5 & 0x40));
- return 1;
- }
- return 0;
-}
-
-void __kprobes mca_handle_nmi(void)
-{
- /*
- * First try - scan the various adapters and see if a specific
- * adapter was responsible for the error.
- */
- bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback);
-}
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 73465aab28f8..8a2ce8fd41c0 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -82,11 +82,6 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
- if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
- pr_warning("CPU%d: family %d not supported\n", cpu, c->x86);
- return -1;
- }
-
csig->rev = c->microcode;
pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
@@ -380,6 +375,13 @@ static struct microcode_ops microcode_amd_ops = {
struct microcode_ops * __init init_amd_microcode(void)
{
+ struct cpuinfo_x86 *c = &cpu_data(0);
+
+ if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
+ pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
+ return NULL;
+ }
+
patch = (void *)get_zeroed_page(GFP_KERNEL);
if (!patch)
return NULL;
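
The two hunks above move the vendor/family gate from the per-CPU collect_cpu_info_amd() path to one-time driver initialization: returning NULL from init_amd_microcode() tells the core loader there is nothing to register. A minimal sketch of the resulting init routine (the family cutoff 0x10 is from the hunk; the trailing return of &microcode_amd_ops is inferred from the surrounding context):

struct microcode_ops * __init init_amd_microcode(void)
{
	struct cpuinfo_x86 *c = &cpu_data(0);	/* boot CPU stands in for all */

	/* Unsupported CPUs now bail out once, here, not on every CPU. */
	if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
		pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
		return NULL;			/* core maps this to -ENODEV */
	}

	patch = (void *)get_zeroed_page(GFP_KERNEL);
	if (!patch)
		return NULL;

	return &microcode_amd_ops;
}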
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 87a0f8688301..fbdfc6917180 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -299,12 +299,11 @@ static ssize_t reload_store(struct device *dev,
{
unsigned long val;
int cpu = dev->id;
- int ret = 0;
- char *end;
+ ssize_t ret = 0;
- val = simple_strtoul(buf, &end, 0);
- if (end == buf)
- return -EINVAL;
+ ret = kstrtoul(buf, 0, &val);
+ if (ret)
+ return ret;
if (val == 1) {
get_online_cpus();
@@ -419,10 +418,8 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif)
if (err)
return err;
- if (microcode_init_cpu(cpu) == UCODE_ERROR) {
- sysfs_remove_group(&dev->kobj, &mc_attr_group);
+ if (microcode_init_cpu(cpu) == UCODE_ERROR)
return -EINVAL;
- }
return err;
}
@@ -528,11 +525,11 @@ static int __init microcode_init(void)
microcode_ops = init_intel_microcode();
else if (c->x86_vendor == X86_VENDOR_AMD)
microcode_ops = init_amd_microcode();
-
- if (!microcode_ops) {
+ else
pr_err("no support for this CPU vendor\n");
+
+ if (!microcode_ops)
return -ENODEV;
- }
microcode_pdev = platform_device_register_simple("microcode", -1,
NULL, 0);
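
The reload_store() hunk is the stock simple_strtoul() to kstrtoul() conversion: kstrtoul() rejects trailing garbage and reports overflow as -ERANGE instead of silently wrapping, and its error code can be returned directly. The same pattern in a self-contained sysfs store method (names here are illustrative, not from the tree):

static ssize_t demo_store(struct device *dev, struct device_attribute *attr,
			  const char *buf, size_t count)
{
	unsigned long val;
	int err;

	err = kstrtoul(buf, 0, &val);	/* base 0: decimal, octal or hex */
	if (err)
		return err;		/* -EINVAL/-ERANGE, not a bogus 0 */

	if (val != 1)
		return -EINVAL;

	/* ... act on the request here ... */
	return count;
}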
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 3ca42d0e43a2..0327e2b3c408 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -147,12 +147,6 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
memset(csig, 0, sizeof(*csig));
- if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
- cpu_has(c, X86_FEATURE_IA64)) {
- pr_err("CPU%d not a capable Intel processor\n", cpu_num);
- return -1;
- }
-
csig->sig = cpuid_eax(0x00000001);
if ((c->x86_model >= 5) || (c->x86 > 6)) {
@@ -463,6 +457,14 @@ static struct microcode_ops microcode_intel_ops = {
struct microcode_ops * __init init_intel_microcode(void)
{
+ struct cpuinfo_x86 *c = &cpu_data(0);
+
+ if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
+ cpu_has(c, X86_FEATURE_IA64)) {
+ pr_err("Intel CPU family 0x%x not supported\n", c->x86);
+ return NULL;
+ }
+
return &microcode_intel_ops;
}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index ca470e4c92dc..d2b56489d70f 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -27,7 +27,6 @@
#include <asm/proto.h>
#include <asm/bios_ebda.h>
#include <asm/e820.h>
-#include <asm/trampoline.h>
#include <asm/setup.h>
#include <asm/smp.h>
@@ -97,7 +96,7 @@ static void __init MP_bus_info(struct mpc_bus *m)
set_bit(m->busid, mp_bus_not_pci);
if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
#endif
} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
@@ -105,12 +104,10 @@ static void __init MP_bus_info(struct mpc_bus *m)
x86_init.mpparse.mpc_oem_pci_bus(m);
clear_bit(m->busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
mp_bus_id_to_type[m->busid] = MP_BUS_PCI;
} else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
mp_bus_id_to_type[m->busid] = MP_BUS_EISA;
- } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
- mp_bus_id_to_type[m->busid] = MP_BUS_MCA;
#endif
} else
printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
@@ -368,9 +365,6 @@ static void __init construct_ioapic_table(int mpc_default_type)
case 3:
memcpy(bus.bustype, "EISA ", 6);
break;
- case 4:
- case 7:
- memcpy(bus.bustype, "MCA ", 6);
}
MP_bus_info(&bus);
if (mpc_default_type > 4) {
@@ -573,8 +567,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
struct mpf_intel *mpf;
unsigned long mem;
- apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
- bp, length);
+ apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n",
+ base, base + length - 1);
BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
@@ -589,8 +583,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
#endif
mpf_found = mpf;
- printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
- mpf, (u64)virt_to_phys(mpf));
+ printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
+ (unsigned long long) virt_to_phys(mpf),
+ (unsigned long long) virt_to_phys(mpf) +
+ sizeof(*mpf) - 1, mpf);
mem = virt_to_phys(mpf);
memblock_reserve(mem, sizeof(*mpf));
@@ -623,7 +619,7 @@ void __init default_find_smp_config(void)
return;
/*
* If it is an SMP machine we should know now, unless the
- * configuration is in an EISA/MCA bus machine with an
+ * configuration is in an EISA bus machine with an
* extended bios data area.
*
* there is a real-mode segmented pointer pointing to the
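
The printk changes in smp_scan_config() adopt the kernel's canonical physical-range format, "[mem start-end]" with an inclusive end address, in place of ad-hoc "start - end" strings. Reduced to a helper (the function name is made up for illustration):

/* Print a physical address range in the canonical inclusive form. */
static void print_mem_range(const char *what, u64 base, u64 len)
{
	printk(KERN_INFO "%s [mem %#010llx-%#010llx]\n", what,
	       (unsigned long long)base,
	       (unsigned long long)(base + len - 1));	/* inclusive end */
}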
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 47acaf319165..a0b2f84457be 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -19,8 +19,6 @@
#include <linux/slab.h>
#include <linux/export.h>
-#include <linux/mca.h>
-
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif
@@ -31,14 +29,6 @@
#include <asm/nmi.h>
#include <asm/x86_init.h>
-#define NMI_MAX_NAMELEN 16
-struct nmiaction {
- struct list_head list;
- nmi_handler_t handler;
- unsigned int flags;
- char *name;
-};
-
struct nmi_desc {
spinlock_t lock;
struct list_head head;
@@ -54,6 +44,14 @@ static struct nmi_desc nmi_desc[NMI_MAX] =
.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
.head = LIST_HEAD_INIT(nmi_desc[1].head),
},
+ {
+ .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
+ .head = LIST_HEAD_INIT(nmi_desc[2].head),
+ },
+ {
+ .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
+ .head = LIST_HEAD_INIT(nmi_desc[3].head),
+ },
};
@@ -84,7 +82,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic);
#define nmi_to_desc(type) (&nmi_desc[type])
-static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
+static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
{
struct nmi_desc *desc = nmi_to_desc(type);
struct nmiaction *a;
@@ -107,11 +105,14 @@ static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs,
return handled;
}
-static int __setup_nmi(unsigned int type, struct nmiaction *action)
+int __register_nmi_handler(unsigned int type, struct nmiaction *action)
{
struct nmi_desc *desc = nmi_to_desc(type);
unsigned long flags;
+ if (!action->handler)
+ return -EINVAL;
+
spin_lock_irqsave(&desc->lock, flags);
/*
@@ -120,6 +121,8 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action)
* to manage expectations
*/
WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head));
+ WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
+ WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
/*
* some handlers need to be executed first otherwise a fake
@@ -133,8 +136,9 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action)
spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
+EXPORT_SYMBOL(__register_nmi_handler);
-static struct nmiaction *__free_nmi(unsigned int type, const char *name)
+void unregister_nmi_handler(unsigned int type, const char *name)
{
struct nmi_desc *desc = nmi_to_desc(type);
struct nmiaction *n;
@@ -157,61 +161,16 @@ static struct nmiaction *__free_nmi(unsigned int type, const char *name)
spin_unlock_irqrestore(&desc->lock, flags);
synchronize_rcu();
- return (n);
-}
-
-int register_nmi_handler(unsigned int type, nmi_handler_t handler,
- unsigned long nmiflags, const char *devname)
-{
- struct nmiaction *action;
- int retval = -ENOMEM;
-
- if (!handler)
- return -EINVAL;
-
- action = kzalloc(sizeof(struct nmiaction), GFP_KERNEL);
- if (!action)
- goto fail_action;
-
- action->handler = handler;
- action->flags = nmiflags;
- action->name = kstrndup(devname, NMI_MAX_NAMELEN, GFP_KERNEL);
- if (!action->name)
- goto fail_action_name;
-
- retval = __setup_nmi(type, action);
-
- if (retval)
- goto fail_setup_nmi;
-
- return retval;
-
-fail_setup_nmi:
- kfree(action->name);
-fail_action_name:
- kfree(action);
-fail_action:
-
- return retval;
-}
-EXPORT_SYMBOL_GPL(register_nmi_handler);
-
-void unregister_nmi_handler(unsigned int type, const char *name)
-{
- struct nmiaction *a;
-
- a = __free_nmi(type, name);
- if (a) {
- kfree(a->name);
- kfree(a);
- }
}
-
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
-static notrace __kprobes void
+static __kprobes void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
+ /* check to see if anyone registered against these types of errors */
+ if (nmi_handle(NMI_SERR, regs, false))
+ return;
+
pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
reason, smp_processor_id());
@@ -236,15 +195,19 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
outb(reason, NMI_REASON_PORT);
}
-static notrace __kprobes void
+static __kprobes void
io_check_error(unsigned char reason, struct pt_regs *regs)
{
unsigned long i;
+ /* check to see if anyone registered against these types of errors */
+ if (nmi_handle(NMI_IO_CHECK, regs, false))
+ return;
+
pr_emerg(
"NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
reason, smp_processor_id());
- show_registers(regs);
+ show_regs(regs);
if (panic_on_io_nmi)
panic("NMI IOCK error: Not continuing");
@@ -263,7 +226,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
outb(reason, NMI_REASON_PORT);
}
-static notrace __kprobes void
+static __kprobes void
unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
int handled;
@@ -282,16 +245,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
__this_cpu_add(nmi_stats.unknown, 1);
-#ifdef CONFIG_MCA
- /*
- * Might actually be able to figure out what the guilty party
- * is:
- */
- if (MCA_bus) {
- mca_handle_nmi();
- return;
- }
-#endif
pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
reason, smp_processor_id());
@@ -305,7 +258,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
static DEFINE_PER_CPU(bool, swallow_nmi);
static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
-static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
+static __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
int handled;
@@ -491,14 +444,16 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs)
*/
if (unlikely(is_debug_stack(regs->sp))) {
debug_stack_set_zero();
- __get_cpu_var(update_debug_stack) = 1;
+ this_cpu_write(update_debug_stack, 1);
}
}
static inline void nmi_nesting_postprocess(void)
{
- if (unlikely(__get_cpu_var(update_debug_stack)))
+ if (unlikely(this_cpu_read(update_debug_stack))) {
debug_stack_reset();
+ this_cpu_write(update_debug_stack, 0);
+ }
}
#endif
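
Since __register_nmi_handler() now takes a caller-supplied struct nmiaction, the old kzalloc()/kstrndup() failure paths vanish and registration cannot fail for lack of memory. In the tree this is wrapped by a register_nmi_handler() macro that declares the action statically, roughly along these lines (a sketch; the real macro in <asm/nmi.h> may differ in detail):

#define register_nmi_handler(t, fn, fg, n)		\
({							\
	static struct nmiaction fn##_na = {		\
		.handler = (fn),			\
		.name	 = (n),				\
		.flags	 = (fg),			\
	};						\
	__register_nmi_handler((t), &fn##_na);		\
})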
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 2c39dcd510fa..149b8d9c6ad4 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -13,6 +13,7 @@
#include <linux/cpumask.h>
#include <linux/delay.h>
#include <linux/init.h>
+#include <linux/percpu.h>
#include <asm/apic.h>
#include <asm/nmi.h>
@@ -41,7 +42,7 @@ static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)
static void __init init_nmi_testsuite(void)
{
/* trap all the unknown NMIs we may generate */
- register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
+ register_nmi_handler_initonly(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
}
static void __init cleanup_nmi_testsuite(void)
@@ -63,7 +64,7 @@ static void __init test_nmi_ipi(struct cpumask *mask)
{
unsigned long timeout;
- if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
+ if (register_nmi_handler_initonly(NMI_LOCAL, test_nmi_ipi_callback,
NMI_FLAG_FIRST, "nmi_selftest")) {
nmi_fail = FAILURE;
return;
@@ -117,15 +118,15 @@ static void __init dotest(void (*testcase_fn)(void), int expected)
unexpected_testcase_failures++;
if (nmi_fail == FAILURE)
- printk("FAILED |");
+ printk(KERN_CONT "FAILED |");
else if (nmi_fail == TIMEOUT)
- printk("TIMEOUT|");
+ printk(KERN_CONT "TIMEOUT|");
else
- printk("ERROR |");
+ printk(KERN_CONT "ERROR |");
dump_stack();
} else {
testcase_successes++;
- printk(" ok |");
+ printk(KERN_CONT " ok |");
}
testcase_total++;
@@ -150,10 +151,10 @@ void __init nmi_selftest(void)
print_testname("remote IPI");
dotest(remote_ipi, SUCCESS);
- printk("\n");
+ printk(KERN_CONT "\n");
print_testname("local IPI");
dotest(local_ipi, SUCCESS);
- printk("\n");
+ printk(KERN_CONT "\n");
cleanup_nmi_testsuite();
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index ab137605e694..9ce885996fd7 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -241,16 +241,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
static inline void enter_lazy(enum paravirt_lazy_mode mode)
{
- BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
+ BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
- percpu_write(paravirt_lazy_mode, mode);
+ this_cpu_write(paravirt_lazy_mode, mode);
}
static void leave_lazy(enum paravirt_lazy_mode mode)
{
- BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
+ BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode);
- percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
+ this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
}
void paravirt_enter_lazy_mmu(void)
@@ -267,7 +267,7 @@ void paravirt_start_context_switch(struct task_struct *prev)
{
BUG_ON(preemptible());
- if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
+ if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
arch_leave_lazy_mmu_mode();
set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
}
@@ -289,7 +289,7 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
if (in_interrupt())
return PARAVIRT_LAZY_NONE;
- return percpu_read(paravirt_lazy_mode);
+ return this_cpu_read(paravirt_lazy_mode);
}
void arch_flush_lazy_mmu_mode(void)
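
The paravirt hunks are part of the tree-wide percpu_read()/percpu_write() to this_cpu_read()/this_cpu_write() conversion; the this_cpu accessors operate on the variable itself, compile to a single %gs-relative instruction on x86, and are safe without disabling preemption. A minimal usage sketch:

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, demo_count);

static void demo_tick(void)
{
	/* Read-modify-write of this CPU's copy of demo_count. */
	this_cpu_write(demo_count, this_cpu_read(demo_count) + 1);
}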
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index d0b2fb9ccbb1..b72838bae64a 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1480,8 +1480,9 @@ cleanup:
static int __init calgary_parse_options(char *p)
{
unsigned int bridge;
+ unsigned long val;
size_t len;
- char* endp;
+ ssize_t ret;
while (*p) {
if (!strncmp(p, "64k", 3))
@@ -1512,10 +1513,11 @@ static int __init calgary_parse_options(char *p)
++p;
if (*p == '\0')
break;
- bridge = simple_strtoul(p, &endp, 0);
- if (p == endp)
+ ret = kstrtoul(p, 0, &val);
+ if (ret)
break;
+ bridge = val;
if (bridge < MAX_PHB_BUS_NUM) {
printk(KERN_INFO "Calgary: disabling "
"translation for PHB %#x\n", bridge);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 3003250ac51d..c0f420f76cd3 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -101,13 +101,18 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
{
unsigned long dma_mask;
struct page *page;
+ unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
dma_addr_t addr;
dma_mask = dma_alloc_coherent_mask(dev, flag);
flag |= __GFP_ZERO;
again:
- page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
+ page = NULL;
+ if (!(flag & GFP_ATOMIC))
+ page = dma_alloc_from_contiguous(dev, count, get_order(size));
+ if (!page)
+ page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
if (!page)
return NULL;
@@ -127,6 +132,16 @@ again:
return page_address(page);
}
+void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_addr, struct dma_attrs *attrs)
+{
+ unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ struct page *page = virt_to_page(vaddr);
+
+ if (!dma_release_from_contiguous(dev, page, count))
+ free_pages((unsigned long)vaddr, get_order(size));
+}
+
/*
* See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
* parameter documentation.
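
dma_alloc_from_contiguous() may sleep, so the allocator above consults CMA only for non-atomic requests and falls back to the page allocator when CMA is absent or exhausted; dma_generic_free_coherent() mirrors this by trying dma_release_from_contiguous() before free_pages(). The guard, reduced to its essentials:

struct page *page = NULL;

if (!(flag & GFP_ATOMIC))	/* the CMA allocator may sleep */
	page = dma_alloc_from_contiguous(dev, count, get_order(size));
if (!page)			/* no CMA region, or CMA exhausted */
	page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));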
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index f96050685b46..871be4a84c7d 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -74,12 +74,6 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
return nents;
}
-static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_addr, struct dma_attrs *attrs)
-{
- free_pages((unsigned long)vaddr, get_order(size));
-}
-
static void nommu_sync_single_for_device(struct device *dev,
dma_addr_t addr, size_t size,
enum dma_data_direction dir)
@@ -97,7 +91,7 @@ static void nommu_sync_sg_for_device(struct device *dev,
struct dma_map_ops nommu_dma_ops = {
.alloc = dma_generic_alloc_coherent,
- .free = nommu_free_coherent,
+ .free = dma_generic_free_coherent,
.map_sg = nommu_map_sg,
.map_page = nommu_map_page,
.sync_single_for_device = nommu_sync_single_for_device,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1d92a5ab6e8b..735279e54e59 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -27,6 +27,15 @@
#include <asm/debugreg.h>
#include <asm/nmi.h>
+/*
+ * per-CPU TSS segments. Threads are completely 'soft' on Linux,
+ * no more per-task TSS's. The TSS size is kept cacheline-aligned
+ * so they are allowed to end up in the .data..cacheline_aligned
+ * section. Since TSS's are completely CPU-local, we want them
+ * on exact cacheline boundaries, to eliminate cacheline ping-pong.
+ */
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+
#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -47,10 +56,16 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
struct kmem_cache *task_xstate_cachep;
EXPORT_SYMBOL_GPL(task_xstate_cachep);
+/*
+ * this gets called so that we can store lazy state into memory and copy the
+ * current task into the new thread.
+ */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
int ret;
+ unlazy_fpu(src);
+
*dst = *src;
if (fpu_allocated(&src->thread.fpu)) {
memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
@@ -67,10 +82,9 @@ void free_thread_xstate(struct task_struct *tsk)
fpu_free(&tsk->thread.fpu);
}
-void free_thread_info(struct thread_info *ti)
+void arch_release_task_struct(struct task_struct *tsk)
{
- free_thread_xstate(ti->task);
- free_pages((unsigned long)ti, THREAD_ORDER);
+ free_thread_xstate(tsk);
}
void arch_task_cache_init(void)
@@ -81,6 +95,16 @@ void arch_task_cache_init(void)
SLAB_PANIC | SLAB_NOTRACK, NULL);
}
+static inline void drop_fpu(struct task_struct *tsk)
+{
+ /*
+ * Forget coprocessor state..
+ */
+ tsk->fpu_counter = 0;
+ clear_fpu(tsk);
+ clear_used_math();
+}
+
/*
* Free current thread data structures etc..
*/
@@ -103,12 +127,8 @@ void exit_thread(void)
put_cpu();
kfree(bp);
}
-}
-void show_regs(struct pt_regs *regs)
-{
- show_registers(regs);
- show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0);
+ drop_fpu(me);
}
void show_regs_common(void)
@@ -143,12 +163,7 @@ void flush_thread(void)
flush_ptrace_hw_breakpoint(tsk);
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
- /*
- * Forget coprocessor state..
- */
- tsk->fpu_counter = 0;
- clear_fpu(tsk);
- clear_used_math();
+ drop_fpu(tsk);
}
static void hard_disable_TSC(void)
@@ -377,7 +392,7 @@ static inline void play_dead(void)
#ifdef CONFIG_X86_64
void enter_idle(void)
{
- percpu_write(is_idle, 1);
+ this_cpu_write(is_idle, 1);
atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
@@ -516,26 +531,6 @@ void stop_this_cpu(void *dummy)
}
}
-static void do_nothing(void *unused)
-{
-}
-
-/*
- * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
- * pm_idle and update to new pm_idle value. Required while changing pm_idle
- * handler on SMP systems.
- *
- * Caller must have changed pm_idle to the new value before the call. Old
- * pm_idle value will not be used by any CPU after the return of this function.
- */
-void cpu_idle_wait(void)
-{
- smp_mb();
- /* kick all the CPUs so that they exit out of pm_idle */
- smp_call_function(do_nothing, NULL, 1);
-}
-EXPORT_SYMBOL_GPL(cpu_idle_wait);
-
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
@@ -594,9 +589,17 @@ int mwait_usable(const struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
+ /* Use mwait if idle=mwait boot option is given */
if (boot_option_idle_override == IDLE_FORCE_MWAIT)
return 1;
+ /*
+ * Any idle= boot option other than idle=mwait means that we must not
+ * use mwait. E.g. idle=halt, idle=poll or idle=nomwait.
+ */
+ if (boot_option_idle_override != IDLE_NO_OVERRIDE)
+ return 0;
+
if (c->cpuid_level < MWAIT_INFO)
return 0;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ae6847303e26..516fa186121b 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -126,15 +126,6 @@ void release_thread(struct task_struct *dead_task)
release_vm86_irqs(dead_task);
}
-/*
- * This gets called before we allocate a new thread and copy
- * the current task into it.
- */
-void prepare_to_copy(struct task_struct *tsk)
-{
- unlazy_fpu(tsk);
-}
-
int copy_thread(unsigned long clone_flags, unsigned long sp,
unsigned long unused,
struct task_struct *p, struct pt_regs *regs)
@@ -302,7 +293,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
switch_fpu_finish(next_p, fpu);
- percpu_write(current_task, next_p);
+ this_cpu_write(current_task, next_p);
return prev_p;
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 733ca39f367e..61cdf7fdf099 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -145,15 +145,6 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls)
return get_desc_base(&t->thread.tls_array[tls]);
}
-/*
- * This gets called before we allocate a new thread and copy
- * the current task into it.
- */
-void prepare_to_copy(struct task_struct *tsk)
-{
- unlazy_fpu(tsk);
-}
-
int copy_thread(unsigned long clone_flags, unsigned long sp,
unsigned long unused,
struct task_struct *p, struct pt_regs *regs)
@@ -237,7 +228,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
current->thread.usersp = new_sp;
regs->ip = new_ip;
regs->sp = new_sp;
- percpu_write(old_rsp, new_sp);
+ this_cpu_write(old_rsp, new_sp);
regs->cs = _cs;
regs->ss = _ss;
regs->flags = X86_EFLAGS_IF;
@@ -359,11 +350,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Switch the PDA and FPU contexts.
*/
- prev->usersp = percpu_read(old_rsp);
- percpu_write(old_rsp, next->usersp);
- percpu_write(current_task, next_p);
+ prev->usersp = this_cpu_read(old_rsp);
+ this_cpu_write(old_rsp, next->usersp);
+ this_cpu_write(current_task, next_p);
- percpu_write(kernel_stack,
+ this_cpu_write(kernel_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE - KERNEL_STACK_OFFSET);
@@ -423,6 +414,7 @@ void set_personality_ia32(bool x32)
current_thread_info()->status |= TS_COMPAT;
}
}
+EXPORT_SYMBOL_GPL(set_personality_ia32);
unsigned long get_wchan(struct task_struct *p)
{
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 685845cf16e0..c4c6a5c2bf0f 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1211,12 +1211,6 @@ static long x32_arch_ptrace(struct task_struct *child,
0, sizeof(struct user_i387_struct),
datap);
- /* normal 64bit interface to access TLS data.
- Works just like arch_prctl, except that the arguments
- are reversed. */
- case PTRACE_ARCH_PRCTL:
- return do_arch_prctl(child, data, addr);
-
default:
return compat_ptrace_request(child, request, addr, data);
}
@@ -1480,7 +1474,11 @@ long syscall_trace_enter(struct pt_regs *regs)
regs->flags |= X86_EFLAGS_TF;
/* do the secure computing check first */
- secure_computing(regs->orig_ax);
+ if (secure_computing(regs->orig_ax)) {
+ /* seccomp failures shouldn't expose any additional code. */
+ ret = -1L;
+ goto out;
+ }
if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
ret = -1L;
@@ -1505,6 +1503,7 @@ long syscall_trace_enter(struct pt_regs *regs)
regs->dx, regs->r10);
#endif
+out:
return ret ?: regs->orig_ax;
}
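
The syscall_trace_enter() hunk turns secure_computing() from a fire-and-forget call into a checked one: when seccomp denies the syscall, nothing further on the entry path may run, so the syscall is forced to fail and control jumps past the tracing and audit hooks. The shape of the gate:

/* Do the secure computing check first. */
if (secure_computing(regs->orig_ax)) {
	/* seccomp failures shouldn't expose any additional code. */
	ret = -1L;		/* force the syscall to fail */
	goto out;		/* skip tracehook/audit entry work */
}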
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d840e69a853c..5de92f1abd76 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -24,6 +24,7 @@
#ifdef CONFIG_X86_32
# include <linux/ctype.h>
# include <linux/mc146818rtc.h>
+# include <asm/realmode.h>
#else
# include <asm/x86_init.h>
#endif
@@ -39,7 +40,8 @@ static int reboot_mode;
enum reboot_type reboot_type = BOOT_ACPI;
int reboot_force;
-/* This variable is used privately to keep track of whether or not
+/*
+ * This variable is used privately to keep track of whether or not
* reboot_type is still set to its default value (i.e., reboot= hasn't
* been set on the command line). This is needed so that we can
* suppress DMI scanning for reboot quirks. Without it, it's
@@ -51,7 +53,8 @@ static int reboot_default = 1;
static int reboot_cpu = -1;
#endif
-/* This is set if we need to go through the 'emergency' path.
+/*
+ * This is set if we need to go through the 'emergency' path.
 * When machine_emergency_restart() is called, we may be in
* an inconsistent state and won't be able to do a clean cleanup
*/
@@ -60,22 +63,24 @@ static int reboot_emergency;
/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
bool port_cf9_safe = false;
-/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
- warm Don't set the cold reboot flag
- cold Set the cold reboot flag
- bios Reboot by jumping through the BIOS (only for X86_32)
- smp Reboot by executing reset on BSP or other CPU (only for X86_32)
- triple Force a triple fault (init)
- kbd Use the keyboard controller. cold reset (default)
- acpi Use the RESET_REG in the FADT
- efi Use efi reset_system runtime service
- pci Use the so-called "PCI reset register", CF9
- force Avoid anything that could hang.
+/*
+ * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
+ * warm Don't set the cold reboot flag
+ * cold Set the cold reboot flag
+ * bios Reboot by jumping through the BIOS (only for X86_32)
+ * smp Reboot by executing reset on BSP or other CPU (only for X86_32)
+ * triple Force a triple fault (init)
+ * kbd Use the keyboard controller. cold reset (default)
+ * acpi Use the RESET_REG in the FADT
+ * efi Use efi reset_system runtime service
+ * pci Use the so-called "PCI reset register", CF9
+ * force Avoid anything that could hang.
*/
static int __init reboot_setup(char *str)
{
for (;;) {
- /* Having anything passed on the command line via
+ /*
+ * Having anything passed on the command line via
* reboot= will cause us to disable DMI checking
* below.
*/
@@ -98,9 +103,11 @@ static int __init reboot_setup(char *str)
if (isdigit(*(str+2)))
reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
}
- /* we will leave sorting out the final value
- when we are ready to reboot, since we might not
- have detected BSP APIC ID or smp_num_cpu */
+ /*
+ * We will leave sorting out the final value
+ * when we are ready to reboot, since we might not
+ * have detected BSP APIC ID or smp_num_cpu
+ */
break;
#endif /* CONFIG_SMP */
@@ -150,6 +157,62 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)
return 0;
}
+void machine_real_restart(unsigned int type)
+{
+ void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int))
+ real_mode_header->machine_real_restart_asm;
+
+ local_irq_disable();
+
+ /*
+ * Write zero to CMOS register number 0x0f, which the BIOS POST
+ * routine will recognize as telling it to do a proper reboot. (Well
+ * that's what this book in front of me says -- it may only apply to
+ * the Phoenix BIOS though, it's not clear). At the same time,
+ * disable NMIs by setting the top bit in the CMOS address register,
+ * as we're about to do peculiar things to the CPU. I'm not sure if
+ * `outb_p' is needed instead of just `outb'. Use it to be on the
+ * safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
+ */
+ spin_lock(&rtc_lock);
+ CMOS_WRITE(0x00, 0x8f);
+ spin_unlock(&rtc_lock);
+
+ /*
+ * Switch back to the initial page table.
+ */
+ load_cr3(initial_page_table);
+
+ /*
+ * Write 0x1234 to absolute memory location 0x472. The BIOS reads
+ * this on booting to tell it to "Bypass memory test (also warm
+ * boot)". This seems like a fairly standard thing that gets set by
+ * REBOOT.COM programs, and the previous reset routine did this
+ * too.
+ */
+ *((unsigned short *)0x472) = reboot_mode;
+
+ /* Jump to the identity-mapped low memory code */
+ restart_lowmem(type);
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(machine_real_restart);
+#endif
+
+#endif /* CONFIG_X86_32 */
+
+/*
+ * Some Apple MacBooks and MacBookPros need reboot=p to be able to reboot
+ */
+static int __init set_pci_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_CF9) {
+ reboot_type = BOOT_CF9;
+ printk(KERN_INFO "%s series board detected. "
+ "Selecting PCI-method for reboots.\n", d->ident);
+ }
+ return 0;
+}
+
static int __init set_kbd_reboot(const struct dmi_system_id *d)
{
if (reboot_type != BOOT_KBD) {
@@ -159,7 +222,12 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d)
return 0;
}
+/*
+ * This is a single dmi_table handling all reboot quirks. Note that
+ * REBOOT_BIOS is only available for 32bit
+ */
static struct dmi_system_id __initdata reboot_dmi_table[] = {
+#ifdef CONFIG_X86_32
{ /* Handle problems with rebooting on Dell E520's */
.callback = set_bios_reboot,
.ident = "Dell E520",
@@ -184,7 +252,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/
+ { /* Handle problems with rebooting on Dell Optiplex 745's SFF */
.callback = set_bios_reboot,
.ident = "Dell OptiPlex 745",
.matches = {
@@ -192,7 +260,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 745's DFF*/
+ { /* Handle problems with rebooting on Dell Optiplex 745's DFF */
.callback = set_bios_reboot,
.ident = "Dell OptiPlex 745",
.matches = {
@@ -201,7 +269,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0MM599"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
+ { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
.callback = set_bios_reboot,
.ident = "Dell OptiPlex 745",
.matches = {
@@ -210,7 +278,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
+ { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
.callback = set_bios_reboot,
.ident = "Dell OptiPlex 330",
.matches = {
@@ -219,7 +287,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
+ { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
.callback = set_bios_reboot,
.ident = "Dell OptiPlex 360",
.matches = {
@@ -228,7 +296,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
},
},
- { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/
+ { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */
.callback = set_bios_reboot,
.ident = "Dell OptiPlex 760",
.matches = {
@@ -301,7 +369,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
},
},
- { /* Handle problems with rebooting on ASUS P4S800 */
+ { /* Handle problems with rebooting on ASUS P4S800 */
.callback = set_bios_reboot,
.ident = "ASUS P4S800",
.matches = {
@@ -309,7 +377,9 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
},
},
- { /* Handle reboot issue on Acer Aspire one */
+#endif /* CONFIG_X86_32 */
+
+ { /* Handle reboot issue on Acer Aspire one */
.callback = set_kbd_reboot,
.ident = "Acer Aspire One A110",
.matches = {
@@ -317,96 +387,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
},
},
- { }
-};
-
-static int __init reboot_init(void)
-{
- /* Only do the DMI check if reboot_type hasn't been overridden
- * on the command line
- */
- if (reboot_default) {
- dmi_check_system(reboot_dmi_table);
- }
- return 0;
-}
-core_initcall(reboot_init);
-
-extern const unsigned char machine_real_restart_asm[];
-extern const u64 machine_real_restart_gdt[3];
-
-void machine_real_restart(unsigned int type)
-{
- void *restart_va;
- unsigned long restart_pa;
- void (*restart_lowmem)(unsigned int);
- u64 *lowmem_gdt;
-
- local_irq_disable();
-
- /* Write zero to CMOS register number 0x0f, which the BIOS POST
- routine will recognize as telling it to do a proper reboot. (Well
- that's what this book in front of me says -- it may only apply to
- the Phoenix BIOS though, it's not clear). At the same time,
- disable NMIs by setting the top bit in the CMOS address register,
- as we're about to do peculiar things to the CPU. I'm not sure if
- `outb_p' is needed instead of just `outb'. Use it to be on the
- safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
- */
- spin_lock(&rtc_lock);
- CMOS_WRITE(0x00, 0x8f);
- spin_unlock(&rtc_lock);
-
- /*
- * Switch back to the initial page table.
- */
- load_cr3(initial_page_table);
-
- /* Write 0x1234 to absolute memory location 0x472. The BIOS reads
- this on booting to tell it to "Bypass memory test (also warm
- boot)". This seems like a fairly standard thing that gets set by
- REBOOT.COM programs, and the previous reset routine did this
- too. */
- *((unsigned short *)0x472) = reboot_mode;
-
- /* Patch the GDT in the low memory trampoline */
- lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt);
-
- restart_va = TRAMPOLINE_SYM(machine_real_restart_asm);
- restart_pa = virt_to_phys(restart_va);
- restart_lowmem = (void (*)(unsigned int))restart_pa;
-
- /* GDT[0]: GDT self-pointer */
- lowmem_gdt[0] =
- (u64)(sizeof(machine_real_restart_gdt) - 1) +
- ((u64)virt_to_phys(lowmem_gdt) << 16);
- /* GDT[1]: 64K real mode code segment */
- lowmem_gdt[1] =
- GDT_ENTRY(0x009b, restart_pa, 0xffff);
-
- /* Jump to the identity-mapped low memory code */
- restart_lowmem(type);
-}
-#ifdef CONFIG_APM_MODULE
-EXPORT_SYMBOL(machine_real_restart);
-#endif
-
-#endif /* CONFIG_X86_32 */
-
-/*
- * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
- */
-static int __init set_pci_reboot(const struct dmi_system_id *d)
-{
- if (reboot_type != BOOT_CF9) {
- reboot_type = BOOT_CF9;
- printk(KERN_INFO "%s series board detected. "
- "Selecting PCI-method for reboots.\n", d->ident);
- }
- return 0;
-}
-
-static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
{ /* Handle problems with rebooting on Apple MacBook5 */
.callback = set_pci_reboot,
.ident = "Apple MacBook5",
@@ -471,20 +451,28 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
},
},
+ { /* Handle problems with rebooting on the Precision M6600. */
+ .callback = set_pci_reboot,
+ .ident = "Dell OptiPlex 990",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),
+ },
+ },
{ }
};
-static int __init pci_reboot_init(void)
+static int __init reboot_init(void)
{
- /* Only do the DMI check if reboot_type hasn't been overridden
+ /*
+ * Only do the DMI check if reboot_type hasn't been overridden
* on the command line
*/
- if (reboot_default) {
- dmi_check_system(pci_reboot_dmi_table);
- }
+ if (reboot_default)
+ dmi_check_system(reboot_dmi_table);
return 0;
}
-core_initcall(pci_reboot_init);
+core_initcall(reboot_init);
static inline void kb_wait(void)
{
@@ -502,14 +490,14 @@ static void vmxoff_nmi(int cpu, struct pt_regs *regs)
cpu_emergency_vmxoff();
}
-/* Use NMIs as IPIs to tell all CPUs to disable virtualization
- */
+/* Use NMIs as IPIs to tell all CPUs to disable virtualization */
static void emergency_vmx_disable_all(void)
{
/* Just make sure we won't change CPUs while doing this */
local_irq_disable();
- /* We need to disable VMX on all CPUs before rebooting, otherwise
+ /*
+ * We need to disable VMX on all CPUs before rebooting, otherwise
* we risk hanging up the machine, because the CPU ignore INIT
* signals when VMX is enabled.
*
@@ -528,8 +516,7 @@ static void emergency_vmx_disable_all(void)
* is still enabling VMX.
*/
if (cpu_has_vmx() && cpu_vmx_enabled()) {
- /* Disable VMX on this CPU.
- */
+ /* Disable VMX on this CPU. */
cpu_vmxoff();
/* Halt and disable VMX on the other CPUs */
@@ -574,12 +561,12 @@ static void native_machine_emergency_restart(void)
/* Could also try the reset bit in the Hammer NB */
switch (reboot_type) {
case BOOT_KBD:
- mach_reboot_fixups(); /* for board specific fixups */
+ mach_reboot_fixups(); /* For board specific fixups */
for (i = 0; i < 10; i++) {
kb_wait();
udelay(50);
- outb(0xfe, 0x64); /* pulse reset low */
+ outb(0xfe, 0x64); /* Pulse reset low */
udelay(50);
}
if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
@@ -621,7 +608,7 @@ static void native_machine_emergency_restart(void)
case BOOT_CF9:
port_cf9_safe = true;
- /* fall through */
+ /* Fall through */
case BOOT_CF9_COND:
if (port_cf9_safe) {
@@ -659,9 +646,12 @@ void native_machine_shutdown(void)
/* Make certain I only run on the appropriate processor */
set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id));
- /* O.K Now that I'm on the appropriate processor,
- * stop all of the others.
+ /*
+ * OK, now that I'm on the appropriate processor, stop all of the
+ * others. Also disable local interrupts so we don't receive the
+ * per-cpu timer interrupt, which may trigger the scheduler's load
+ * balancing.
*/
+ local_irq_disable();
stop_other_cpus();
#endif
@@ -697,12 +687,11 @@ static void native_machine_restart(char *__unused)
static void native_machine_halt(void)
{
- /* stop other cpus and apics */
+ /* Stop other cpus and apics */
machine_shutdown();
tboot_shutdown(TB_SHUTDOWN_HALT);
- /* stop this cpu */
stop_this_cpu(NULL);
}
@@ -713,7 +702,7 @@ static void native_machine_power_off(void)
machine_shutdown();
pm_power_off();
}
- /* a fallback in case there is no PM info available */
+ /* A fallback in case there is no PM info available */
tboot_shutdown(TB_SHUTDOWN_HALT);
}
@@ -775,7 +764,8 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
cpu = raw_smp_processor_id();
- /* Don't do anything if this handler is invoked on crashing cpu.
+ /*
+ * Don't do anything if this handler is invoked on crashing cpu.
* Otherwise, system will completely hang. Crashing cpu can get
* an NMI if system was initially booted with nmi_watchdog parameter.
*/
@@ -799,7 +789,8 @@ static void smp_send_nmi_allbutself(void)
apic->send_IPI_allbutself(NMI_VECTOR);
}
-/* Halt all other CPUs, calling the specified function on each of them
+/*
+ * Halt all other CPUs, calling the specified function on each of them
*
* This function can be used to halt all other CPUs on crash
* or emergency reboot time. The function passed as parameter
@@ -810,7 +801,7 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
unsigned long msecs;
local_irq_disable();
- /* Make a note of crashing cpu. Will be used in NMI callback.*/
+ /* Make a note of crashing cpu. Will be used in NMI callback. */
crashing_cpu = safe_smp_processor_id();
shootdown_callback = callback;
@@ -819,8 +810,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
/* Would it be better to replace the trap vector here? */
if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,
NMI_FLAG_FIRST, "crash"))
- return; /* return what? */
- /* Ensure the new callback function is set before sending
+ return; /* Return what? */
+ /*
+ * Ensure the new callback function is set before sending
* out the NMI
*/
wmb();
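
The rewritten machine_real_restart() reaches the 16-bit stub through the unified real-mode blob: real_mode_header publishes the physical, identity-mapped address of machine_real_restart_asm, so the GDT patching and virt_to_phys() gymnastics of the deleted version are gone. The call path in isolation (the field name is taken from the hunk; the rest is illustrative):

void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int))
	real_mode_header->machine_real_restart_asm;

/* CMOS reboot flag, CR3 switch and warm-boot word set as above, then: */
restart_lowmem(type);	/* jump to identity-mapped low memory; no return */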
diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S
deleted file mode 100644
index 1d5c46df0d78..000000000000
--- a/arch/x86/kernel/reboot_32.S
+++ /dev/null
@@ -1,135 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/segment.h>
-#include <asm/page_types.h>
-
-/*
- * The following code and data reboots the machine by switching to real
- * mode and jumping to the BIOS reset entry point, as if the CPU has
- * really been reset. The previous version asked the keyboard
- * controller to pulse the CPU reset line, which is more thorough, but
- * doesn't work with at least one type of 486 motherboard. It is easy
- * to stop this code working; hence the copious comments.
- *
- * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax.
- */
- .section ".x86_trampoline","a"
- .balign 16
- .code32
-ENTRY(machine_real_restart_asm)
-r_base = .
- /* Get our own relocated address */
- call 1f
-1: popl %ebx
- subl $(1b - r_base), %ebx
-
- /* Compute the equivalent real-mode segment */
- movl %ebx, %ecx
- shrl $4, %ecx
-
- /* Patch post-real-mode segment jump */
- movw (dispatch_table - r_base)(%ebx,%eax,2),%ax
- movw %ax, (101f - r_base)(%ebx)
- movw %cx, (102f - r_base)(%ebx)
-
- /* Set up the IDT for real mode. */
- lidtl (machine_real_restart_idt - r_base)(%ebx)
-
- /*
- * Set up a GDT from which we can load segment descriptors for real
- * mode. The GDT is not used in real mode; it is just needed here to
- * prepare the descriptors.
- */
- lgdtl (machine_real_restart_gdt - r_base)(%ebx)
-
- /*
- * Load the data segment registers with 16-bit compatible values
- */
- movl $16, %ecx
- movl %ecx, %ds
- movl %ecx, %es
- movl %ecx, %fs
- movl %ecx, %gs
- movl %ecx, %ss
- ljmpl $8, $1f - r_base
-
-/*
- * This is 16-bit protected mode code to disable paging and the cache,
- * switch to real mode and jump to the BIOS reset code.
- *
- * The instruction that switches to real mode by writing to CR0 must be
- * followed immediately by a far jump instruction, which set CS to a
- * valid value for real mode, and flushes the prefetch queue to avoid
- * running instructions that have already been decoded in protected
- * mode.
- *
- * Clears all the flags except ET, especially PG (paging), PE
- * (protected-mode enable) and TS (task switch for coprocessor state
- * save). Flushes the TLB after paging has been disabled. Sets CD and
- * NW, to disable the cache on a 486, and invalidates the cache. This
- * is more like the state of a 486 after reset. I don't know if
- * something else should be done for other chips.
- *
- * More could be done here to set up the registers as if a CPU reset had
- * occurred; hopefully real BIOSs don't assume much. This is not the
- * actual BIOS entry point, anyway (that is at 0xfffffff0).
- *
- * Most of this work is probably excessive, but it is what is tested.
- */
- .code16
-1:
- xorl %ecx, %ecx
- movl %cr0, %eax
- andl $0x00000011, %eax
- orl $0x60000000, %eax
- movl %eax, %cr0
- movl %ecx, %cr3
- movl %cr0, %edx
- andl $0x60000000, %edx /* If no cache bits -> no wbinvd */
- jz 2f
- wbinvd
-2:
- andb $0x10, %al
- movl %eax, %cr0
- .byte 0xea /* ljmpw */
-101: .word 0 /* Offset */
-102: .word 0 /* Segment */
-
-bios:
- ljmpw $0xf000, $0xfff0
-
-apm:
- movw $0x1000, %ax
- movw %ax, %ss
- movw $0xf000, %sp
- movw $0x5307, %ax
- movw $0x0001, %bx
- movw $0x0003, %cx
- int $0x15
-
-END(machine_real_restart_asm)
-
- .balign 16
- /* These must match <asm/reboot.h> */
-dispatch_table:
- .word bios - r_base
- .word apm - r_base
-END(dispatch_table)
-
- .balign 16
-machine_real_restart_idt:
- .word 0xffff /* Length - real mode default value */
- .long 0 /* Base - real mode default value */
-END(machine_real_restart_idt)
-
- .balign 16
-ENTRY(machine_real_restart_gdt)
- .quad 0 /* Self-pointer, filled in by PM code */
- .quad 0 /* 16-bit code segment, filled in by PM code */
- /*
- * 16-bit data segment with the selector value 16 = 0x10 and
- * base value 0x100; since this is consistent with real mode
- * semantics we don't have to reload the segments once CR0.PE = 0.
- */
- .quad GDT_ENTRY(0x0093, 0x100, 0xffff)
-END(machine_real_restart_gdt)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 1a2901562059..16be6dc14db1 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -34,7 +34,6 @@
#include <linux/memblock.h>
#include <linux/seq_file.h>
#include <linux/console.h>
-#include <linux/mca.h>
#include <linux/root_dev.h>
#include <linux/highmem.h>
#include <linux/module.h>
@@ -50,6 +49,7 @@
#include <asm/pci-direct.h>
#include <linux/init_ohci1394_dma.h>
#include <linux/kvm_para.h>
+#include <linux/dma-contiguous.h>
#include <linux/errno.h>
#include <linux/kernel.h>
@@ -73,7 +73,7 @@
#include <asm/mtrr.h>
#include <asm/apic.h>
-#include <asm/trampoline.h>
+#include <asm/realmode.h>
#include <asm/e820.h>
#include <asm/mpspec.h>
#include <asm/setup.h>
@@ -179,12 +179,6 @@ struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
/* common cpu data for all cpus */
struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
EXPORT_SYMBOL(boot_cpu_data);
-static void set_mca_bus(int x)
-{
-#ifdef CONFIG_MCA
- MCA_bus = x;
-#endif
-}
unsigned int def_to_bigsmp;
@@ -340,8 +334,8 @@ static void __init relocate_initrd(void)
memblock_reserve(ramdisk_here, area_size);
initrd_start = ramdisk_here + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
- printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
- ramdisk_here, ramdisk_here + ramdisk_size);
+ printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
+ ramdisk_here, ramdisk_here + ramdisk_size - 1);
q = (char *)initrd_start;
@@ -372,8 +366,8 @@ static void __init relocate_initrd(void)
/* high pages is not converted by early_res_to_bootmem */
ramdisk_image = boot_params.hdr.ramdisk_image;
ramdisk_size = boot_params.hdr.ramdisk_size;
- printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
- " %08llx - %08llx\n",
+ printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
+ " [mem %#010llx-%#010llx]\n",
ramdisk_image, ramdisk_image + ramdisk_size - 1,
ramdisk_here, ramdisk_here + ramdisk_size - 1);
}
@@ -393,14 +387,13 @@ static void __init reserve_initrd(void)
initrd_start = 0;
if (ramdisk_size >= (end_of_lowmem>>1)) {
- memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
- printk(KERN_ERR "initrd too large to handle, "
- "disabling initrd\n");
- return;
+ panic("initrd too large to handle, "
+ "disabling initrd (%lld needed, %lld available)\n",
+ ramdisk_size, end_of_lowmem>>1);
}
- printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image,
- ramdisk_end);
+ printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
+ ramdisk_end - 1);
if (ramdisk_end <= end_of_lowmem) {
@@ -717,7 +710,6 @@ void __init setup_arch(char **cmdline_p)
apm_info.bios = boot_params.apm_bios_info;
ist_info = boot_params.ist_info;
if (boot_params.sys_desc_table.length != 0) {
- set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
machine_id = boot_params.sys_desc_table.table[0];
machine_submodel_id = boot_params.sys_desc_table.table[1];
BIOS_revision = boot_params.sys_desc_table.table[2];
@@ -914,10 +906,10 @@ void __init setup_arch(char **cmdline_p)
setup_bios_corruption_check();
#endif
- printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
- max_pfn_mapped<<PAGE_SHIFT);
+ printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
+ (max_pfn_mapped<<PAGE_SHIFT) - 1);
- setup_trampolines();
+ setup_real_mode();
init_gbpages();
@@ -934,6 +926,7 @@ void __init setup_arch(char **cmdline_p)
}
#endif
memblock.current_limit = get_max_mapped();
+ dma_contiguous_reserve(0);
/*
* NOTE: On x86-32, only from this point on, fixmaps are ready for use.
@@ -975,6 +968,8 @@ void __init setup_arch(char **cmdline_p)
if (boot_cpu_data.cpuid_level >= 0) {
/* A CPU has %cr4 if and only if it has CPUID */
mmu_cr4_features = read_cr4();
+ if (trampoline_cr4_features)
+ *trampoline_cr4_features = mmu_cr4_features;
}
#ifdef CONFIG_X86_32
@@ -1012,7 +1007,8 @@ void __init setup_arch(char **cmdline_p)
init_cpu_to_node();
init_apic_mappings();
- ioapic_and_gsi_init();
+ if (x86_io_apic_ops.init)
+ x86_io_apic_ops.init();
kvm_guest_init();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 71f4727da373..5a98aa272184 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -185,10 +185,22 @@ void __init setup_per_cpu_areas(void)
#endif
rc = -EINVAL;
if (pcpu_chosen_fc != PCPU_FC_PAGE) {
- const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
const size_t dyn_size = PERCPU_MODULE_RESERVE +
PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
+ size_t atom_size;
+ /*
+ * On 64bit, use PMD_SIZE for atom_size so that embedded
+ * percpu areas are aligned to PMD. This, in the future,
+ * can also allow using PMD mappings in vmalloc area. Use
+ * PAGE_SIZE on 32bit as vmalloc space is highly contended
+ * and large vmalloc area allocs can easily fail.
+ */
+#ifdef CONFIG_X86_64
+ atom_size = PMD_SIZE;
+#else
+ atom_size = PAGE_SIZE;
+#endif
rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
dyn_size, atom_size,
pcpu_cpu_distance,
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 115eac431483..21af737053aa 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -18,6 +18,7 @@
#include <linux/personality.h>
#include <linux/uaccess.h>
#include <linux/user-return-notifier.h>
+#include <linux/uprobes.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
@@ -478,18 +479,8 @@ asmlinkage int
sys_sigsuspend(int history0, int history1, old_sigset_t mask)
{
sigset_t blocked;
-
- current->saved_sigmask = current->blocked;
-
- mask &= _BLOCKABLE;
siginitset(&blocked, mask);
- set_current_blocked(&blocked);
-
- current->state = TASK_INTERRUPTIBLE;
- schedule();
-
- set_restore_sigmask();
- return -ERESTARTNOHAND;
+ return sigsuspend(&blocked);
}
asmlinkage int
@@ -564,7 +555,6 @@ unsigned long sys_sigreturn(struct pt_regs *regs)
sizeof(frame->extramask))))
goto badframe;
- sigdelsetmask(&set, ~_BLOCKABLE);
set_current_blocked(&set);
if (restore_sigcontext(regs, &frame->sc, &ax))
@@ -590,7 +580,6 @@ long sys_rt_sigreturn(struct pt_regs *regs)
if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
goto badframe;
- sigdelsetmask(&set, ~_BLOCKABLE);
set_current_blocked(&set);
if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
@@ -656,42 +645,28 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
struct pt_regs *regs)
{
int usig = signr_convert(sig);
- sigset_t *set = &current->blocked;
- int ret;
-
- if (current_thread_info()->status & TS_RESTORE_SIGMASK)
- set = &current->saved_sigmask;
+ sigset_t *set = sigmask_to_save();
/* Set up the stack frame */
if (is_ia32) {
if (ka->sa.sa_flags & SA_SIGINFO)
- ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
+ return ia32_setup_rt_frame(usig, ka, info, set, regs);
else
- ret = ia32_setup_frame(usig, ka, set, regs);
+ return ia32_setup_frame(usig, ka, set, regs);
#ifdef CONFIG_X86_X32_ABI
} else if (is_x32) {
- ret = x32_setup_rt_frame(usig, ka, info,
+ return x32_setup_rt_frame(usig, ka, info,
(compat_sigset_t *)set, regs);
#endif
} else {
- ret = __setup_rt_frame(sig, ka, info, set, regs);
- }
-
- if (ret) {
- force_sigsegv(sig, current);
- return -EFAULT;
+ return __setup_rt_frame(sig, ka, info, set, regs);
}
-
- current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- return ret;
}
-static int
+static void
handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
struct pt_regs *regs)
{
- int ret;
-
/* Are we from a system call? */
if (syscall_get_nr(current, regs) >= 0) {
/* If so, check system call restarting.. */
@@ -722,10 +697,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
regs->flags &= ~X86_EFLAGS_TF;
- ret = setup_rt_frame(sig, ka, info, regs);
-
- if (ret)
- return ret;
+ if (setup_rt_frame(sig, ka, info, regs) < 0) {
+ force_sigsegv(sig, current);
+ return;
+ }
/*
* Clear the direction flag as per the ABI for function entry.
@@ -740,12 +715,8 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
*/
regs->flags &= ~X86_EFLAGS_TF;
- block_sigmask(ka, sig);
-
- tracehook_signal_handler(sig, info, ka, regs,
- test_thread_flag(TIF_SINGLESTEP));
-
- return 0;
+ signal_delivered(sig, info, ka, regs,
+ test_thread_flag(TIF_SINGLESTEP));
}
#ifdef CONFIG_X86_32
@@ -766,16 +737,6 @@ static void do_signal(struct pt_regs *regs)
siginfo_t info;
int signr;
- /*
- * We want the common case to go fast, which is why we may in certain
- * cases get here from kernel mode. Just return without doing anything
- * if so.
- * X86_32: vm86 regs switched out by assembly code before reaching
- * here, so testing against kernel CS suffices.
- */
- if (!user_mode(regs))
- return;
-
signr = get_signal_to_deliver(&info, &ka, regs, NULL);
if (signr > 0) {
/* Whee! Actually deliver the signal. */
@@ -805,10 +766,7 @@ static void do_signal(struct pt_regs *regs)
* If there's no signal to deliver, we just put the saved sigmask
* back.
*/
- if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
- current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- set_current_blocked(&current->saved_sigmask);
- }
+ restore_saved_sigmask();
}
/*
@@ -824,6 +782,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
mce_notify_process();
#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
+ if (thread_info_flags & _TIF_UPROBE) {
+ clear_thread_flag(TIF_UPROBE);
+ uprobe_notify_resume(regs);
+ }
+
/* deal with pending signal delivery */
if (thread_info_flags & _TIF_SIGPENDING)
do_signal(regs);
@@ -831,8 +794,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
if (thread_info_flags & _TIF_NOTIFY_RESUME) {
clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
- if (current->replacement_session_keyring)
- key_replace_session_keyring();
}
if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
fire_user_return_notifiers();
@@ -940,7 +901,6 @@ asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs)
if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
goto badframe;
- sigdelsetmask(&set, ~_BLOCKABLE);
set_current_blocked(&set);
if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 66c74f481cab..48d2b7ded422 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -109,6 +109,9 @@
* about nothing of note with C stepping upwards.
*/
+static atomic_t stopping_cpu = ATOMIC_INIT(-1);
+static bool smp_no_nmi_ipi = false;
+
/*
* this function sends a 'reschedule' IPI to another CPU.
* it goes straight through and wastes no time serializing
@@ -149,8 +152,6 @@ void native_send_call_func_ipi(const struct cpumask *mask)
free_cpumask_var(allbutself);
}
-static atomic_t stopping_cpu = ATOMIC_INIT(-1);
-
static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
{
/* We are registered on stopping cpu too, avoid spurious NMI */
@@ -162,7 +163,19 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
return NMI_HANDLED;
}
-static void native_nmi_stop_other_cpus(int wait)
+/*
+ * this function calls the 'stop' function on all other CPUs in the system.
+ */
+
+asmlinkage void smp_reboot_interrupt(void)
+{
+ ack_APIC_irq();
+ irq_enter();
+ stop_this_cpu(NULL);
+ irq_exit();
+}
+
+static void native_stop_other_cpus(int wait)
{
unsigned long flags;
unsigned long timeout;
@@ -174,20 +187,25 @@ static void native_nmi_stop_other_cpus(int wait)
 * Use our own vector here because smp_call_function
* does lots of things not suitable in a panic situation.
*/
+
+ /*
+ * We start by using the REBOOT_VECTOR irq.
+ * The irq is treated as a sync point to allow critical
+ * regions of code on other cpus to release their spin locks
+ * and re-enable irqs. Jumping straight to an NMI might
+ * accidentally cause deadlocks with further shutdown/panic
+ * code. By syncing, we give the cpus up to one second to
+ * finish their work before we force them off with the NMI.
+ */
if (num_online_cpus() > 1) {
/* did someone beat us here? */
if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
return;
- if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
- NMI_FLAG_FIRST, "smp_stop"))
- /* Note: we ignore failures here */
- return;
-
- /* sync above data before sending NMI */
+ /* sync above data before sending IRQ */
wmb();
- apic->send_IPI_allbutself(NMI_VECTOR);
+ apic->send_IPI_allbutself(REBOOT_VECTOR);
/*
* Don't wait longer than a second if the caller
@@ -197,63 +215,37 @@ static void native_nmi_stop_other_cpus(int wait)
while (num_online_cpus() > 1 && (wait || timeout--))
udelay(1);
}
+
+ /* if the REBOOT_VECTOR didn't work, try with the NMI */
+ if ((num_online_cpus() > 1) && (!smp_no_nmi_ipi)) {
+ if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
+ NMI_FLAG_FIRST, "smp_stop"))
+ /* Note: we ignore failures here */
+ /* Hope the REBOOT_IRQ is good enough */
+ goto finish;
- local_irq_save(flags);
- disable_local_APIC();
- local_irq_restore(flags);
-}
-
-/*
- * this function calls the 'stop' function on all other CPUs in the system.
- */
-
-asmlinkage void smp_reboot_interrupt(void)
-{
- ack_APIC_irq();
- irq_enter();
- stop_this_cpu(NULL);
- irq_exit();
-}
-
-static void native_irq_stop_other_cpus(int wait)
-{
- unsigned long flags;
- unsigned long timeout;
+ /* sync above data before sending IRQ */
+ wmb();
- if (reboot_force)
- return;
+ pr_emerg("Shutting down cpus with NMI\n");
- /*
- * Use an own vector here because smp_call_function
- * does lots of things not suitable in a panic situation.
- * On most systems we could also use an NMI here,
- * but there are a few systems around where NMI
- * is problematic so stay with an non NMI for now
- * (this implies we cannot stop CPUs spinning with irq off
- * currently)
- */
- if (num_online_cpus() > 1) {
- apic->send_IPI_allbutself(REBOOT_VECTOR);
+ apic->send_IPI_allbutself(NMI_VECTOR);
/*
- * Don't wait longer than a second if the caller
+ * Don't wait longer than 10 ms if the caller
* didn't ask us to wait.
*/
- timeout = USEC_PER_SEC;
+ timeout = USEC_PER_MSEC * 10;
while (num_online_cpus() > 1 && (wait || timeout--))
udelay(1);
}
+finish:
local_irq_save(flags);
disable_local_APIC();
local_irq_restore(flags);
}
-static void native_smp_disable_nmi_ipi(void)
-{
- smp_ops.stop_other_cpus = native_irq_stop_other_cpus;
-}
-
/*
* Reschedule call back.
*/
@@ -287,8 +279,8 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
static int __init nonmi_ipi_setup(char *str)
{
- native_smp_disable_nmi_ipi();
- return 1;
+ smp_no_nmi_ipi = true;
+ return 1;
}
__setup("nonmi_ipi", nonmi_ipi_setup);
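Taken together, the hunks above turn CPU shutdown into a two-phase escalation. A condensed sketch of the flow; send_ipi_allbutself() and spin_until_down() are placeholder names, not kernel APIs:

static void stop_other_cpus_sketch(int wait, bool no_nmi_ipi)
{
	/* Phase 1: REBOOT_VECTOR IPI as a sync point -- other CPUs get
	 * up to one second to drop spinlocks and re-enable IRQs */
	send_ipi_allbutself(REBOOT_VECTOR);
	spin_until_down(USEC_PER_SEC, wait);

	/* Phase 2: stragglers (e.g. CPUs spinning with IRQs off) are
	 * forced off with an NMI, unless booted with "nonmi_ipi" */
	if (num_online_cpus() > 1 && !no_nmi_ipi) {
		send_ipi_allbutself(NMI_VECTOR);
		spin_until_down(10 * USEC_PER_MSEC, wait);
	}

	disable_local_APIC();
}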
@@ -298,7 +290,7 @@ struct smp_ops smp_ops = {
.smp_prepare_cpus = native_smp_prepare_cpus,
.smp_cpus_done = native_smp_cpus_done,
- .stop_other_cpus = native_nmi_stop_other_cpus,
+ .stop_other_cpus = native_stop_other_cpus,
.smp_send_reschedule = native_smp_send_reschedule,
.cpu_up = native_cpu_up,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6e1e406038c2..7bd8a0823654 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -57,7 +57,7 @@
#include <asm/nmi.h>
#include <asm/irq.h>
#include <asm/idle.h>
-#include <asm/trampoline.h>
+#include <asm/realmode.h>
#include <asm/cpu.h>
#include <asm/numa.h>
#include <asm/pgtable.h>
@@ -73,23 +73,13 @@
#include <asm/smpboot_hooks.h>
#include <asm/i8259.h>
+#include <asm/realmode.h>
+
/* State of each CPU */
DEFINE_PER_CPU(int, cpu_state) = { 0 };
-/* Store all idle threads, this can be reused instead of creating
-* a new thread. Also avoids complicated thread destroy functionality
-* for idle threads.
-*/
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
- * removed after init for !CONFIG_HOTPLUG_CPU.
- */
-static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
-#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
-#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
-
-/*
* We need this for trampoline_base protection from concurrent accesses when
* off- and onlining cores wildly.
*/
@@ -97,20 +87,16 @@ static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
void cpu_hotplug_driver_lock(void)
{
- mutex_lock(&x86_cpu_hotplug_driver_mutex);
+ mutex_lock(&x86_cpu_hotplug_driver_mutex);
}
void cpu_hotplug_driver_unlock(void)
{
- mutex_unlock(&x86_cpu_hotplug_driver_mutex);
+ mutex_unlock(&x86_cpu_hotplug_driver_mutex);
}
ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
-#else
-static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
-#define get_idle_for_cpu(x) (idle_thread_array[(x)])
-#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p))
#endif
/* Number of siblings per CPU package */
@@ -315,59 +301,102 @@ void __cpuinit smp_store_cpu_info(int id)
identify_secondary_cpu(c);
}
-static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
+static bool __cpuinit
+topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
+{
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2),
+ "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
+ "[node: %d != %d]. Ignoring dependency.\n",
+ cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
+}
+
+#define link_mask(_m, c1, c2) \
+do { \
+ cpumask_set_cpu((c1), cpu_##_m##_mask(c2)); \
+ cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \
+} while (0)
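Read concretely, link_mask(sibling, 2, 3) -- CPU numbers chosen purely for illustration -- expands to the symmetric pair:

	cpumask_set_cpu(2, cpu_sibling_mask(3));
	cpumask_set_cpu(3, cpu_sibling_mask(2));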
+
+static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ if (c->phys_proc_id == o->phys_proc_id &&
+ per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) &&
+ c->compute_unit_id == o->compute_unit_id)
+ return topology_sane(c, o, "smt");
+
+ } else if (c->phys_proc_id == o->phys_proc_id &&
+ c->cpu_core_id == o->cpu_core_id) {
+ return topology_sane(c, o, "smt");
+ }
+
+ return false;
+}
+
+static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
- cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
- cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
- cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
- cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
- cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
+ per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
+ return topology_sane(c, o, "llc");
+
+ return false;
}
+static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ if (c->phys_proc_id == o->phys_proc_id) {
+ if (cpu_has(c, X86_FEATURE_AMD_DCM))
+ return true;
+
+ return topology_sane(c, o, "mc");
+ }
+ return false;
+}
void __cpuinit set_cpu_sibling_map(int cpu)
{
- int i;
+ bool has_mc = boot_cpu_data.x86_max_cores > 1;
+ bool has_smt = smp_num_siblings > 1;
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct cpuinfo_x86 *o;
+ int i;
cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
- if (smp_num_siblings > 1) {
- for_each_cpu(i, cpu_sibling_setup_mask) {
- struct cpuinfo_x86 *o = &cpu_data(i);
-
- if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
- if (c->phys_proc_id == o->phys_proc_id &&
- per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
- c->compute_unit_id == o->compute_unit_id)
- link_thread_siblings(cpu, i);
- } else if (c->phys_proc_id == o->phys_proc_id &&
- c->cpu_core_id == o->cpu_core_id) {
- link_thread_siblings(cpu, i);
- }
- }
- } else {
+ if (!has_smt && !has_mc) {
cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_core_mask(cpu));
+ c->booted_cores = 1;
+ return;
}
- cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+ for_each_cpu(i, cpu_sibling_setup_mask) {
+ o = &cpu_data(i);
+
+ if ((i == cpu) || (has_smt && match_smt(c, o)))
+ link_mask(sibling, cpu, i);
+
+ if ((i == cpu) || (has_mc && match_llc(c, o)))
+ link_mask(llc_shared, cpu, i);
- if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
- cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
- c->booted_cores = 1;
- return;
}
+ /*
+ * This needs a separate iteration over the cpus because we rely on all
+ * cpu_sibling_mask links to be set-up.
+ */
for_each_cpu(i, cpu_sibling_setup_mask) {
- if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
- per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
- cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
- cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
- }
- if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
- cpumask_set_cpu(i, cpu_core_mask(cpu));
- cpumask_set_cpu(cpu, cpu_core_mask(i));
+ o = &cpu_data(i);
+
+ if ((i == cpu) || (has_mc && match_mc(c, o))) {
+ link_mask(core, cpu, i);
+
/*
* Does this new cpu bringup a new core?
*/
@@ -393,16 +422,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
/* maps the cpu to the sched domain representing multi-core */
const struct cpumask *cpu_coregroup_mask(int cpu)
{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- /*
- * For perf, we return last level cache shared map.
- * And for power savings, we return cpu_core_map
- */
- if ((sched_mc_power_savings || sched_smt_power_savings) &&
- !(cpu_has(c, X86_FEATURE_AMD_DCM)))
- return cpu_core_mask(cpu);
- else
- return cpu_llc_shared_mask(cpu);
+ return cpu_llc_shared_mask(cpu);
}
static void impress_friends(void)
@@ -618,22 +638,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
return (send_status | accept_status);
}
-struct create_idle {
- struct work_struct work;
- struct task_struct *idle;
- struct completion done;
- int cpu;
-};
-
-static void __cpuinit do_fork_idle(struct work_struct *work)
-{
- struct create_idle *c_idle =
- container_of(work, struct create_idle, work);
-
- c_idle->idle = fork_idle(c_idle->cpu);
- complete(&c_idle->done);
-}
-
/* reduce the number of lines printed when booting a large cpu count system */
static void __cpuinit announce_cpu(int cpu, int apicid)
{
@@ -660,61 +664,35 @@ static void __cpuinit announce_cpu(int cpu, int apicid)
* Returns zero if CPU booted OK, else error code from
* ->wakeup_secondary_cpu.
*/
-static int __cpuinit do_boot_cpu(int apicid, int cpu)
+static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
{
+ volatile u32 *trampoline_status =
+ (volatile u32 *) __va(real_mode_header->trampoline_status);
+ /* start_ip had better be page-aligned! */
+ unsigned long start_ip = real_mode_header->trampoline_start;
+
unsigned long boot_error = 0;
- unsigned long start_ip;
int timeout;
- struct create_idle c_idle = {
- .cpu = cpu,
- .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
- };
-
- INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle);
alternatives_smp_switch(1);
- c_idle.idle = get_idle_for_cpu(cpu);
+ idle->thread.sp = (unsigned long) (((struct pt_regs *)
+ (THREAD_SIZE + task_stack_page(idle))) - 1);
+ per_cpu(current_task, cpu) = idle;
- /*
- * We can't use kernel_thread since we must avoid to
- * reschedule the child.
- */
- if (c_idle.idle) {
- c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
- (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
- init_idle(c_idle.idle, cpu);
- goto do_rest;
- }
-
- schedule_work(&c_idle.work);
- wait_for_completion(&c_idle.done);
-
- if (IS_ERR(c_idle.idle)) {
- printk("failed fork for CPU %d\n", cpu);
- destroy_work_on_stack(&c_idle.work);
- return PTR_ERR(c_idle.idle);
- }
-
- set_idle_for_cpu(cpu, c_idle.idle);
-do_rest:
- per_cpu(current_task, cpu) = c_idle.idle;
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
#else
- clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
+ clear_tsk_thread_flag(idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
per_cpu(kernel_stack, cpu) =
- (unsigned long)task_stack_page(c_idle.idle) -
+ (unsigned long)task_stack_page(idle) -
KERNEL_STACK_OFFSET + THREAD_SIZE;
#endif
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
- stack_start = c_idle.idle->thread.sp;
-
- /* start_ip had better be page-aligned! */
- start_ip = trampoline_address();
+ stack_start = idle->thread.sp;
/* So we see what's up */
announce_cpu(cpu, apicid);
@@ -778,8 +756,7 @@ do_rest:
pr_debug("CPU%d: has booted.\n", cpu);
} else {
boot_error = 1;
- if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status)
- == 0xA5A5A5A5)
+ if (*trampoline_status == 0xA5A5A5A5)
/* trampoline started but...? */
pr_err("CPU%d: Stuck ??\n", cpu);
else
@@ -805,7 +782,7 @@ do_rest:
}
/* mark "stuck" area as not stuck */
- *(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0;
+ *trampoline_status = 0;
if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
/*
@@ -813,12 +790,10 @@ do_rest:
*/
smpboot_restore_warm_reset_vector();
}
-
- destroy_work_on_stack(&c_idle.work);
return boot_error;
}
-int __cpuinit native_cpu_up(unsigned int cpu)
+int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)
{
int apicid = apic->cpu_present_to_apicid(cpu);
unsigned long flags;
@@ -851,7 +826,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
- err = do_boot_cpu(apicid, cpu);
+ err = do_boot_cpu(apicid, cpu, tidle);
if (err) {
pr_debug("do_boot_cpu failed %d\n", err);
return -EIO;
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 6410744ac5cb..f84fe00fad48 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -32,7 +32,7 @@
#include <linux/mm.h>
#include <linux/tboot.h>
-#include <asm/trampoline.h>
+#include <asm/realmode.h>
#include <asm/processor.h>
#include <asm/bootparam.h>
#include <asm/pgtable.h>
@@ -44,7 +44,7 @@
#include <asm/e820.h>
#include <asm/io.h>
-#include "acpi/realmode/wakeup.h"
+#include "../realmode/rm/wakeup.h"
/* Global pointer to shared data; NULL means no measured launch. */
struct tboot *tboot __read_mostly;
@@ -201,7 +201,8 @@ static int tboot_setup_sleep(void)
add_mac_region(e820.map[i].addr, e820.map[i].size);
}
- tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
+ tboot->acpi_sinfo.kernel_s3_resume_vector =
+ real_mode_header->wakeup_start;
return 0;
}
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
index c29e235792af..b79133abda48 100644
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -12,6 +12,7 @@
#include <linux/module.h>
#include <asm/cacheflush.h>
#include <asm/sections.h>
+#include <asm/asm.h>
int rodata_test(void)
{
@@ -42,14 +43,7 @@ int rodata_test(void)
".section .fixup,\"ax\"\n"
"2: jmp 1b\n"
".previous\n"
- ".section __ex_table,\"a\"\n"
- " .align 16\n"
-#ifdef CONFIG_X86_32
- " .long 0b,2b\n"
-#else
- " .quad 0b,2b\n"
-#endif
- ".previous"
+ _ASM_EXTABLE(0b,2b)
: [rslt] "=r" (result)
: [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL)
);
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index c6eba2b42673..24d3c91e9812 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -14,7 +14,6 @@
#include <linux/i8253.h>
#include <linux/time.h>
#include <linux/export.h>
-#include <linux/mca.h>
#include <asm/vsyscall.h>
#include <asm/x86_init.h>
@@ -58,11 +57,6 @@ EXPORT_SYMBOL(profile_pc);
static irqreturn_t timer_interrupt(int irq, void *dev_id)
{
global_clock_event->event_handler(global_clock_event);
-
- /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
- if (MCA_bus)
- outb_p(inb_p(0x61)| 0x80, 0x61);
-
return IRQ_HANDLED;
}
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
deleted file mode 100644
index a73b61055ad6..000000000000
--- a/arch/x86/kernel/trampoline.c
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <linux/io.h>
-#include <linux/memblock.h>
-
-#include <asm/trampoline.h>
-#include <asm/cacheflush.h>
-#include <asm/pgtable.h>
-
-unsigned char *x86_trampoline_base;
-
-void __init setup_trampolines(void)
-{
- phys_addr_t mem;
- size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
-
- /* Has to be in very low memory so we can execute real-mode AP code. */
- mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
- if (!mem)
- panic("Cannot allocate trampoline\n");
-
- x86_trampoline_base = __va(mem);
- memblock_reserve(mem, size);
-
- printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
- x86_trampoline_base, (unsigned long long)mem, size);
-
- memcpy(x86_trampoline_base, x86_trampoline_start, size);
-}
-
-/*
- * setup_trampolines() gets called very early, to guarantee the
- * availability of low memory. This is before the proper kernel page
- * tables are set up, so we cannot set page permissions in that
- * function. Thus, we use an arch_initcall instead.
- */
-static int __init configure_trampolines(void)
-{
- size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
-
- set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT);
- return 0;
-}
-arch_initcall(configure_trampolines);
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
deleted file mode 100644
index 451c0a7ef7fd..000000000000
--- a/arch/x86/kernel/trampoline_32.S
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- *
- * Trampoline.S Derived from Setup.S by Linus Torvalds
- *
- * 4 Jan 1997 Michael Chastain: changed to gnu as.
- *
- * This is only used for booting secondary CPUs in SMP machine
- *
- * Entry: CS:IP point to the start of our code, we are
- * in real mode with no stack, but the rest of the
- * trampoline page to make our stack and everything else
- * is a mystery.
- *
- * We jump into arch/x86/kernel/head_32.S.
- *
- * On entry to trampoline_data, the processor is in real mode
- * with 16-bit addressing and 16-bit data. CS has some value
- * and IP is zero. Thus, data addresses need to be absolute
- * (no relocation) and are taken with regard to r_base.
- *
- * If you work on this file, check the object module with
- * objdump --reloc to make sure there are no relocation
- * entries except for:
- *
- * TYPE VALUE
- * R_386_32 startup_32_smp
- * R_386_32 boot_gdt
- */
-
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/segment.h>
-#include <asm/page_types.h>
-
-#ifdef CONFIG_SMP
-
- .section ".x86_trampoline","a"
- .balign PAGE_SIZE
- .code16
-
-ENTRY(trampoline_data)
-r_base = .
- wbinvd # Needed for NUMA-Q should be harmless for others
- mov %cs, %ax # Code and data in the same place
- mov %ax, %ds
-
- cli # We should be safe anyway
-
- movl $0xA5A5A5A5, trampoline_status - r_base
- # write marker for master knows we're running
-
- /* GDT tables in non default location kernel can be beyond 16MB and
- * lgdt will not be able to load the address as in real mode default
- * operand size is 16bit. Use lgdtl instead to force operand size
- * to 32 bit.
- */
-
- lidtl boot_idt_descr - r_base # load idt with 0, 0
- lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate
-
- xor %ax, %ax
- inc %ax # protected mode (PE) bit
- lmsw %ax # into protected mode
- # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S
- ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET)
-
- # These need to be in the same 64K segment as the above;
- # hence we don't use the boot_gdt_descr defined in head.S
-boot_gdt_descr:
- .word __BOOT_DS + 7 # gdt limit
- .long boot_gdt - __PAGE_OFFSET # gdt base
-
-boot_idt_descr:
- .word 0 # idt limit = 0
- .long 0 # idt base = 0L
-
-ENTRY(trampoline_status)
- .long 0
-
-.globl trampoline_end
-trampoline_end:
-
-#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
deleted file mode 100644
index 09ff51799e96..000000000000
--- a/arch/x86/kernel/trampoline_64.S
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- *
- * Trampoline.S Derived from Setup.S by Linus Torvalds
- *
- * 4 Jan 1997 Michael Chastain: changed to gnu as.
- * 15 Sept 2005 Eric Biederman: 64bit PIC support
- *
- * Entry: CS:IP point to the start of our code, we are
- * in real mode with no stack, but the rest of the
- * trampoline page to make our stack and everything else
- * is a mystery.
- *
- * On entry to trampoline_data, the processor is in real mode
- * with 16-bit addressing and 16-bit data. CS has some value
- * and IP is zero. Thus, data addresses need to be absolute
- * (no relocation) and are taken with regard to r_base.
- *
- * With the addition of trampoline_level4_pgt this code can
- * now enter a 64bit kernel that lives at arbitrary 64bit
- * physical addresses.
- *
- * If you work on this file, check the object module with objdump
- * --full-contents --reloc to make sure there are no relocation
- * entries.
- */
-
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/pgtable_types.h>
-#include <asm/page_types.h>
-#include <asm/msr.h>
-#include <asm/segment.h>
-#include <asm/processor-flags.h>
-
- .section ".x86_trampoline","a"
- .balign PAGE_SIZE
- .code16
-
-ENTRY(trampoline_data)
-r_base = .
- cli # We should be safe anyway
- wbinvd
- mov %cs, %ax # Code and data in the same place
- mov %ax, %ds
- mov %ax, %es
- mov %ax, %ss
-
-
- movl $0xA5A5A5A5, trampoline_status - r_base
- # write marker for master knows we're running
-
- # Setup stack
- movw $(trampoline_stack_end - r_base), %sp
-
- call verify_cpu # Verify the cpu supports long mode
- testl %eax, %eax # Check for return code
- jnz no_longmode
-
- mov %cs, %ax
- movzx %ax, %esi # Find the 32bit trampoline location
- shll $4, %esi
-
- # Fixup the absolute vectors
- leal (startup_32 - r_base)(%esi), %eax
- movl %eax, startup_32_vector - r_base
- leal (startup_64 - r_base)(%esi), %eax
- movl %eax, startup_64_vector - r_base
- leal (tgdt - r_base)(%esi), %eax
- movl %eax, (tgdt + 2 - r_base)
-
- /*
- * GDT tables in non default location kernel can be beyond 16MB and
- * lgdt will not be able to load the address as in real mode default
- * operand size is 16bit. Use lgdtl instead to force operand size
- * to 32 bit.
- */
-
- lidtl tidt - r_base # load idt with 0, 0
- lgdtl tgdt - r_base # load gdt with whatever is appropriate
-
- mov $X86_CR0_PE, %ax # protected mode (PE) bit
- lmsw %ax # into protected mode
-
- # flush prefetch and jump to startup_32
- ljmpl *(startup_32_vector - r_base)
-
- .code32
- .balign 4
-startup_32:
- movl $__KERNEL_DS, %eax # Initialize the %ds segment register
- movl %eax, %ds
-
- movl $X86_CR4_PAE, %eax
- movl %eax, %cr4 # Enable PAE mode
-
- # Setup trampoline 4 level pagetables
- leal (trampoline_level4_pgt - r_base)(%esi), %eax
- movl %eax, %cr3
-
- movl $MSR_EFER, %ecx
- movl $(1 << _EFER_LME), %eax # Enable Long Mode
- xorl %edx, %edx
- wrmsr
-
- # Enable paging and in turn activate Long Mode
- # Enable protected mode
- movl $(X86_CR0_PG | X86_CR0_PE), %eax
- movl %eax, %cr0
-
- /*
- * At this point we're in long mode but in 32bit compatibility mode
- * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
- * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
- * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
- */
- ljmp *(startup_64_vector - r_base)(%esi)
-
- .code64
- .balign 4
-startup_64:
- # Now jump into the kernel using virtual addresses
- movq $secondary_startup_64, %rax
- jmp *%rax
-
- .code16
-no_longmode:
- hlt
- jmp no_longmode
-#include "verify_cpu.S"
-
- .balign 4
- # Careful these need to be in the same 64K segment as the above;
-tidt:
- .word 0 # idt limit = 0
- .word 0, 0 # idt base = 0L
-
- # Duplicate the global descriptor table
- # so the kernel can live anywhere
- .balign 4
-tgdt:
- .short tgdt_end - tgdt # gdt limit
- .long tgdt - r_base
- .short 0
- .quad 0x00cf9b000000ffff # __KERNEL32_CS
- .quad 0x00af9b000000ffff # __KERNEL_CS
- .quad 0x00cf93000000ffff # __KERNEL_DS
-tgdt_end:
-
- .balign 4
-startup_32_vector:
- .long startup_32 - r_base
- .word __KERNEL32_CS, 0
-
- .balign 4
-startup_64_vector:
- .long startup_64 - r_base
- .word __KERNEL_CS, 0
-
- .balign 4
-ENTRY(trampoline_status)
- .long 0
-
-trampoline_stack:
- .org 0x1000
-trampoline_stack_end:
-ENTRY(trampoline_level4_pgt)
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .fill 510,8,0
- .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
-
-ENTRY(trampoline_end)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ff9281f16029..05b31d92f69c 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -37,10 +37,6 @@
#include <linux/eisa.h>
#endif
-#ifdef CONFIG_MCA
-#include <linux/mca.h>
-#endif
-
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif
@@ -50,6 +46,7 @@
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <linux/atomic.h>
+#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/i387.h>
@@ -303,8 +300,17 @@ gp_in_kernel:
}
/* May run on IST stack. */
-dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
+dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code)
{
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /*
+ * ftrace must be first, everything else may cause a recursive crash.
+ * See note by declaration of modifying_ftrace_code in ftrace.c
+ */
+ if (unlikely(atomic_read(&modifying_ftrace_code)) &&
+ ftrace_int3_handler(regs))
+ return;
+#endif
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
new file mode 100644
index 000000000000..dc4e910a7d96
--- /dev/null
+++ b/arch/x86/kernel/uprobes.c
@@ -0,0 +1,674 @@
+/*
+ * User-space Probes (UProbes) for x86
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2008-2011
+ * Authors:
+ * Srikar Dronamraju
+ * Jim Keniston
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <linux/uprobes.h>
+#include <linux/uaccess.h>
+
+#include <linux/kdebug.h>
+#include <asm/processor.h>
+#include <asm/insn.h>
+
+/* Post-execution fixups. */
+
+/* No fixup needed */
+#define UPROBE_FIX_NONE 0x0
+
+/* Adjust IP back to vicinity of actual insn */
+#define UPROBE_FIX_IP 0x1
+
+/* Adjust the return address of a call insn */
+#define UPROBE_FIX_CALL 0x2
+
+#define UPROBE_FIX_RIP_AX 0x8000
+#define UPROBE_FIX_RIP_CX 0x4000
+
+#define UPROBE_TRAP_NR UINT_MAX
+
+/* Adaptations for mhiramat x86 decoder v14. */
+#define OPCODE1(insn) ((insn)->opcode.bytes[0])
+#define OPCODE2(insn) ((insn)->opcode.bytes[1])
+#define OPCODE3(insn) ((insn)->opcode.bytes[2])
+#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value)
+
+#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
+ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
+ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
+ (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
+ (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
+ << (row % 32))
+
+/*
+ * Good-instruction tables for 32-bit apps. This is non-const and volatile
+ * to keep gcc from statically optimizing it out, as variable_test_bit makes
+ * some versions of gcc think only *(unsigned long *) is used.
+ */
+static volatile u32 good_insns_32[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */
+ W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
+ W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
+ W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+
+/* Using this for both 64-bit and 32-bit apps */
+static volatile u32 good_2byte_insns[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
+ W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
+ W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
+ W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+
+#ifdef CONFIG_X86_64
+/* Good-instruction tables for 64-bit apps */
+static volatile u32 good_insns_64[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */
+ W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */
+ W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
+ W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
+ W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#endif
+#undef W
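With the W() packing above, each 256-opcode table is eight 32-bit words and a lookup is one bit test. A standalone equivalent of the test_bit() calls used below (a sketch; the kernel casts the table to unsigned long * and calls test_bit() instead):

static int opcode_is_good(const u32 *table, u8 opcode)
{
	/* word opcode/32, bit opcode%32: e.g. opcode 0x07 ("pop %es",
	 * marked 0 in row 0x00 of good_insns_32) tests bit 7 of word 0 */
	return (table[opcode / 32] >> (opcode % 32)) & 1;
}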
+
+/*
+ * opcodes we'll probably never support:
+ *
+ * 6c-6d, e4-e5, ec-ed - in
+ * 6e-6f, e6-e7, ee-ef - out
+ * cc, cd - int3, int
+ * cf - iret
+ * d6 - illegal instruction
+ * f1 - int1/icebp
+ * f4 - hlt
+ * fa, fb - cli, sti
+ * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2
+ *
+ * invalid opcodes in 64-bit mode:
+ *
+ * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5
+ * 63 - we support this opcode in x86_64 but not in i386.
+ *
+ * opcodes we may need to refine support for:
+ *
+ * 0f - 2-byte instructions: For many of these instructions, the validity
+ * depends on the prefix and/or the reg field. On such instructions, we
+ * just consider the opcode combination valid if it corresponds to any
+ * valid instruction.
+ *
+ * 8f - Group 1 - only reg = 0 is OK
+ * c6-c7 - Group 11 - only reg = 0 is OK
+ * d9-df - fpu insns with some illegal encodings
+ * f2, f3 - repnz, repz prefixes. These are also the first byte for
+ * certain floating-point instructions, such as addsd.
+ *
+ * fe - Group 4 - only reg = 0 or 1 is OK
+ * ff - Group 5 - only reg = 0-6 is OK
+ *
+ * others -- Do we need to support these?
+ *
+ * 0f - (floating-point?) prefetch instructions
+ * 07, 17, 1f - pop es, pop ss, pop ds
+ * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
+ * but 64 and 65 (fs: and gs:) seem to be used, so we support them
+ * 67 - addr16 prefix
+ * ce - into
+ * f0 - lock prefix
+ */
+
+/*
+ * TODO:
+ * - Where necessary, examine the modrm byte and allow only valid instructions
+ * in the different Groups and fpu instructions.
+ */
+
+static bool is_prefix_bad(struct insn *insn)
+{
+ int i;
+
+ for (i = 0; i < insn->prefixes.nbytes; i++) {
+ switch (insn->prefixes.bytes[i]) {
+ case 0x26: /* INAT_PFX_ES */
+ case 0x2E: /* INAT_PFX_CS */
+ case 0x36: /* INAT_PFX_DS */
+ case 0x3E: /* INAT_PFX_SS */
+ case 0xF0: /* INAT_PFX_LOCK */
+ return true;
+ }
+ }
+ return false;
+}
+
+static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ insn_init(insn, auprobe->insn, false);
+
+ /* Skip good instruction prefixes; reject "bad" ones. */
+ insn_get_opcode(insn);
+ if (is_prefix_bad(insn))
+ return -ENOTSUPP;
+
+ if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32))
+ return 0;
+
+ if (insn->opcode.nbytes == 2) {
+ if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
+ return 0;
+ }
+
+ return -ENOTSUPP;
+}
+
+/*
+ * Figure out which fixups arch_uprobe_post_xol() will need to perform, and
+ * annotate arch_uprobe->fixups accordingly. To start with,
+ * arch_uprobe->fixups is either zero or it reflects rip-related fixups.
+ */
+static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ bool fix_ip = true, fix_call = false; /* defaults */
+ int reg;
+
+ insn_get_opcode(insn); /* should be a nop */
+
+ switch (OPCODE1(insn)) {
+ case 0xc3: /* ret/lret */
+ case 0xcb:
+ case 0xc2:
+ case 0xca:
+ /* ip is correct */
+ fix_ip = false;
+ break;
+ case 0xe8: /* call relative - Fix return addr */
+ fix_call = true;
+ break;
+ case 0x9a: /* call absolute - Fix return addr, not ip */
+ fix_call = true;
+ fix_ip = false;
+ break;
+ case 0xff:
+ insn_get_modrm(insn);
+ reg = MODRM_REG(insn);
+ if (reg == 2 || reg == 3) {
+ /* call or lcall, indirect */
+ /* Fix return addr; ip is correct. */
+ fix_call = true;
+ fix_ip = false;
+ } else if (reg == 4 || reg == 5) {
+ /* jmp or ljmp, indirect */
+ /* ip is correct. */
+ fix_ip = false;
+ }
+ break;
+ case 0xea: /* jmp absolute -- ip is correct */
+ fix_ip = false;
+ break;
+ default:
+ break;
+ }
+ if (fix_ip)
+ auprobe->fixups |= UPROBE_FIX_IP;
+ if (fix_call)
+ auprobe->fixups |= UPROBE_FIX_CALL;
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * If arch_uprobe->insn doesn't use rip-relative addressing, return
+ * immediately. Otherwise, rewrite the instruction so that it accesses
+ * its memory operand indirectly through a scratch register. Set
+ * arch_uprobe->fixups and arch_uprobe->rip_rela_target_address
+ * accordingly. (The contents of the scratch register will be saved
+ * before we single-step the modified instruction, and restored
+ * afterward.)
+ *
+ * We do this because a rip-relative instruction can access only a
+ * relatively small area (+/- 2 GB from the instruction), and the XOL
+ * area typically lies beyond that area. At least for instructions
+ * that store to memory, we can't execute the original instruction
+ * and "fix things up" later, because the misdirected store could be
+ * disastrous.
+ *
+ * Some useful facts about rip-relative instructions:
+ *
+ * - There's always a modrm byte.
+ * - There's never a SIB byte.
+ * - The displacement is always 4 bytes.
+ */
+static void
+handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
+{
+ u8 *cursor;
+ u8 reg;
+
+ if (mm->context.ia32_compat)
+ return;
+
+ auprobe->rip_rela_target_address = 0x0;
+ if (!insn_rip_relative(insn))
+ return;
+
+ /*
+ * insn_rip_relative() would have decoded rex_prefix, modrm.
+ * Clear REX.b bit (extension of MODRM.rm field):
+ * we want to encode rax/rcx, not r8/r9.
+ */
+ if (insn->rex_prefix.nbytes) {
+ cursor = auprobe->insn + insn_offset_rex_prefix(insn);
+ *cursor &= 0xfe; /* Clearing REX.B bit */
+ }
+
+ /*
+ * Point cursor at the modrm byte. The next 4 bytes are the
+ * displacement. Beyond the displacement, for some instructions,
+ * is the immediate operand.
+ */
+ cursor = auprobe->insn + insn_offset_modrm(insn);
+ insn_get_length(insn);
+
+ /*
+ * Convert from rip-relative addressing to indirect addressing
+ * via a scratch register. Change the r/m field from 0x5 (%rip)
+ * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field.
+ */
+ reg = MODRM_REG(insn);
+ if (reg == 0) {
+ /*
+ * The register operand (if any) is either the A register
+ * (%rax, %eax, etc.) or (if the 0x4 bit is set in the
+ * REX prefix) %r8. In any case, we know the C register
+ * is NOT the register operand, so we use %rcx (register
+ * #1) for the scratch register.
+ */
+ auprobe->fixups = UPROBE_FIX_RIP_CX;
+ /* Change modrm from 00 000 101 to 00 000 001. */
+ *cursor = 0x1;
+ } else {
+ /* Use %rax (register #0) for the scratch register. */
+ auprobe->fixups = UPROBE_FIX_RIP_AX;
+ /* Change modrm from 00 xxx 101 to 00 xxx 000 */
+ *cursor = (reg << 3);
+ }
+
+ /* Target address = address of next instruction + (signed) offset */
+ auprobe->rip_rela_target_address = (long)insn->length + insn->displacement.value;
+
+ /* Displacement field is gone; slide immediate field (if any) over. */
+ if (insn->immediate.nbytes) {
+ cursor++;
+ memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes);
+ }
+ return;
+}
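To make the rewrite concrete: "movl %edx,0x1234(%rip)" (89 15 34 12 00 00, displacement invented) becomes "movl %edx,(%rax)" (89 10); modrm 0x15 (mod=00, reg=010, rm=101) turns into 0x10 and the disp32 is squeezed out. A minimal user-space model of the byte surgery -- illustrative only, the kernel drives this from the decoded struct insn:

#include <string.h>

static size_t derip_sketch(unsigned char *insn, size_t len, size_t modrm_off)
{
	unsigned char reg = (insn[modrm_off] >> 3) & 7;

	/* rm=101 (%rip) -> rm=000 (%rax), or rm=001 (%rcx) when reg is 0
	 * and %rax may therefore be the register operand */
	insn[modrm_off] = reg ? (unsigned char)(reg << 3) : 0x01;

	/* the 4-byte displacement is gone; slide any immediate over it */
	memmove(insn + modrm_off + 1, insn + modrm_off + 5,
		len - modrm_off - 5);
	return len - 4;
}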
+
+static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ insn_init(insn, auprobe->insn, true);
+
+ /* Skip good instruction prefixes; reject "bad" ones. */
+ insn_get_opcode(insn);
+ if (is_prefix_bad(insn))
+ return -ENOTSUPP;
+
+ if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64))
+ return 0;
+
+ if (insn->opcode.nbytes == 2) {
+ if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
+ return 0;
+ }
+ return -ENOTSUPP;
+}
+
+static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
+{
+ if (mm->context.ia32_compat)
+ return validate_insn_32bits(auprobe, insn);
+ return validate_insn_64bits(auprobe, insn);
+}
+#else /* 32-bit: */
+static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
+{
+ /* No RIP-relative addressing on 32-bit */
+}
+
+static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
+{
+ return validate_insn_32bits(auprobe, insn);
+}
+#endif /* CONFIG_X86_64 */
+
+/**
+ * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
+ * @mm: the probed address space.
+ * @auprobe: the probepoint information.
+ * Return 0 on success or a negative errno on error.
+ */
+int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm)
+{
+ int ret;
+ struct insn insn;
+
+ auprobe->fixups = 0;
+ ret = validate_insn_bits(auprobe, mm, &insn);
+ if (ret != 0)
+ return ret;
+
+ handle_riprel_insn(auprobe, mm, &insn);
+ prepare_fixups(auprobe, &insn);
+
+ return 0;
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * If we're emulating a rip-relative instruction, save the contents
+ * of the scratch register and store the target address in that register.
+ */
+static void
+pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
+ struct arch_uprobe_task *autask)
+{
+ if (auprobe->fixups & UPROBE_FIX_RIP_AX) {
+ autask->saved_scratch_register = regs->ax;
+ regs->ax = current->utask->vaddr;
+ regs->ax += auprobe->rip_rela_target_address;
+ } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) {
+ autask->saved_scratch_register = regs->cx;
+ regs->cx = current->utask->vaddr;
+ regs->cx += auprobe->rip_rela_target_address;
+ }
+}
+#else
+static void
+pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
+ struct arch_uprobe_task *autask)
+{
+ /* No RIP-relative addressing on 32-bit */
+}
+#endif
+
+/*
+ * arch_uprobe_pre_xol - prepare to execute out of line.
+ * @auprobe: the probepoint information.
+ * @regs: reflects the saved user state of current task.
+ */
+int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct arch_uprobe_task *autask;
+
+ autask = &current->utask->autask;
+ autask->saved_trap_nr = current->thread.trap_nr;
+ current->thread.trap_nr = UPROBE_TRAP_NR;
+ regs->ip = current->utask->xol_vaddr;
+ pre_xol_rip_insn(auprobe, regs, autask);
+
+ return 0;
+}
+
+/*
+ * This function is called by arch_uprobe_post_xol() to adjust the return
+ * address pushed by a call instruction executed out of line.
+ */
+static int adjust_ret_addr(unsigned long sp, long correction)
+{
+ int rasize, ncopied;
+ long ra = 0;
+
+ if (is_ia32_task())
+ rasize = 4;
+ else
+ rasize = 8;
+
+ ncopied = copy_from_user(&ra, (void __user *)sp, rasize);
+ if (unlikely(ncopied))
+ return -EFAULT;
+
+ ra += correction;
+ ncopied = copy_to_user((void __user *)sp, &ra, rasize);
+ if (unlikely(ncopied))
+ return -EFAULT;
+
+ return 0;
+}
+
+#ifdef CONFIG_X86_64
+static bool is_riprel_insn(struct arch_uprobe *auprobe)
+{
+ return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0);
+}
+
+static void
+handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
+{
+ if (is_riprel_insn(auprobe)) {
+ struct arch_uprobe_task *autask;
+
+ autask = &current->utask->autask;
+ if (auprobe->fixups & UPROBE_FIX_RIP_AX)
+ regs->ax = autask->saved_scratch_register;
+ else
+ regs->cx = autask->saved_scratch_register;
+
+ /*
+ * The original instruction includes a displacement, and so
+ * is 4 bytes longer than what we've just single-stepped.
+ * Fall through to handle stuff like "jmpq *...(%rip)" and
+ * "callq *...(%rip)".
+ */
+ if (correction)
+ *correction += 4;
+ }
+}
+#else
+static void
+handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
+{
+ /* No RIP-relative addressing on 32-bit */
+}
+#endif
+
+/*
+ * If the xol insn itself traps and generates a signal (say,
+ * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped
+ * instruction jumps back to its own address. It is assumed that anything
+ * like do_page_fault/do_trap/etc sets thread.trap_nr != -1.
+ *
+ * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr,
+ * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to
+ * the UPROBE_TRAP_NR (UINT_MAX, i.e. -1) set by arch_uprobe_pre_xol().
+ */
+bool arch_uprobe_xol_was_trapped(struct task_struct *t)
+{
+ if (t->thread.trap_nr != UPROBE_TRAP_NR)
+ return true;
+
+ return false;
+}
+
+/*
+ * Called after single-stepping. To avoid the SMP problems that can
+ * occur when we temporarily put back the original opcode to
+ * single-step, we single-stepped a copy of the instruction.
+ *
+ * This function prepares to resume execution after the single-step.
+ * We have to fix things up as follows:
+ *
+ * Typically, the new ip is relative to the copied instruction. We need
+ * to make it relative to the original instruction (FIX_IP). Exceptions
+ * are return instructions and absolute or indirect jump or call instructions.
+ *
+ * If the single-stepped instruction was a call, the return address that
+ * is atop the stack is the address following the copied instruction. We
+ * need to make it the address following the original instruction (FIX_CALL).
+ *
+ * If the original instruction was a rip-relative instruction such as
+ * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
+ * instruction using a scratch register -- e.g., "movl %edx,(%rax)".
+ * We need to restore the contents of the scratch register and adjust
+ * the ip, keeping in mind that the instruction we executed is 4 bytes
+ * shorter than the original instruction (since we squeezed out the offset
+ * field). (FIX_RIP_AX or FIX_RIP_CX)
+ */
+int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask;
+ long correction;
+ int result = 0;
+
+ WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
+
+ utask = current->utask;
+ current->thread.trap_nr = utask->autask.saved_trap_nr;
+ correction = (long)(utask->vaddr - utask->xol_vaddr);
+ handle_riprel_post_xol(auprobe, regs, &correction);
+ if (auprobe->fixups & UPROBE_FIX_IP)
+ regs->ip += correction;
+
+ if (auprobe->fixups & UPROBE_FIX_CALL)
+ result = adjust_ret_addr(regs->sp, correction);
+
+ return result;
+}
+
+/* callback routine for handling exceptions. */
+int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data)
+{
+ struct die_args *args = data;
+ struct pt_regs *regs = args->regs;
+ int ret = NOTIFY_DONE;
+
+ /* We are only interested in userspace traps */
+ if (regs && !user_mode_vm(regs))
+ return NOTIFY_DONE;
+
+ switch (val) {
+ case DIE_INT3:
+ if (uprobe_pre_sstep_notifier(regs))
+ ret = NOTIFY_STOP;
+
+ break;
+
+ case DIE_DEBUG:
+ if (uprobe_post_sstep_notifier(regs))
+ ret = NOTIFY_STOP;
+
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * This function gets called when the XOL instruction either gets trapped or
+ * the thread has a fatal signal, so reset the instruction pointer to its
+ * probed address.
+ */
+void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ current->thread.trap_nr = utask->autask.saved_trap_nr;
+ handle_riprel_post_xol(auprobe, regs, NULL);
+ instruction_pointer_set(regs, utask->vaddr);
+}
+
+/*
+ * Skip these instructions as per the currently known x86 ISA.
+ * 0x66* { 0x90 | 0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0 }
+ */
+bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ int i;
+
+ for (i = 0; i < MAX_UINSN_BYTES; i++) {
+ if ((auprobe->insn[i] == 0x66))
+ continue;
+
+ if (auprobe->insn[i] == 0x90)
+ return true;
+
+ if (i == (MAX_UINSN_BYTES - 1))
+ break;
+
+ if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x1f))
+ return true;
+
+ if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x19))
+ return true;
+
+ if ((auprobe->insn[i] == 0x87) && (auprobe->insn[i+1] == 0xc0))
+ return true;
+
+ break;
+ }
+ return false;
+}
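A standalone mirror of the loop, useful for testing byte patterns against the grammar in the comment above (MAX_UINSN_BYTES assumed to be 16): "66 90" matches, and so does "66 0f 1f 44 00 00", since the 0f 1f case keys on the first two bytes only:

#include <stdbool.h>
#include <stddef.h>

static bool is_skippable_nop(const unsigned char *insn, size_t max)
{
	size_t i;

	for (i = 0; i < max; i++) {
		if (insn[i] == 0x66)
			continue;		/* operand-size prefix */
		if (insn[i] == 0x90)
			return true;		/* nop */
		if (i + 1 == max)
			break;
		if (insn[i] == 0x0f &&
		    (insn[i + 1] == 0x1f || insn[i + 1] == 0x19))
			return true;		/* multi-byte nops */
		if (insn[i] == 0x87 && insn[i + 1] == 0xc0)
			return true;		/* xchg %ax,%ax */
		break;
	}
	return false;
}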
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 0f703f10901a..22a1530146a8 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -197,18 +197,6 @@ SECTIONS
INIT_DATA_SECTION(16)
- /*
- * Code and data for a variety of lowlevel trampolines, to be
- * copied into base memory (< 1 MiB) during initialization.
- * Since it is copied early, the main copy can be discarded
- * afterwards.
- */
- .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) {
- x86_trampoline_start = .;
- *(.x86_trampoline)
- x86_trampoline_end = .;
- }
-
.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
__x86_cpu_dev_start = .;
*(.x86_cpu_dev.init)
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index a1d804bcd483..8eeb55a551b4 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -15,6 +15,7 @@
#include <linux/init.h>
#include <linux/pci_ids.h>
#include <linux/pci_regs.h>
+#include <linux/smp.h>
#include <asm/apic.h>
#include <asm/pci-direct.h>
@@ -22,6 +23,8 @@
#include <asm/paravirt.h>
#include <asm/setup.h>
+#define TOPOLOGY_REGISTER_OFFSET 0x10
+
#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
/*
* Interrupt control on vSMPowered systems:
@@ -149,12 +152,49 @@ int is_vsmp_box(void)
return 0;
}
#endif
+
+static void __init vsmp_cap_cpus(void)
+{
+#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP)
+ void __iomem *address;
+ unsigned int cfg, topology, node_shift, maxcpus;
+
+ /*
+ * CONFIG_X86_VSMP is not configured, so limit the number of CPUs to the
+ * ones present in the first board, unless explicitly overridden by
+ * setup_max_cpus
+ */
+ if (setup_max_cpus != NR_CPUS)
+ return;
+
+ /* Read the vSMP Foundation topology register */
+ cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0);
+ address = early_ioremap(cfg + TOPOLOGY_REGISTER_OFFSET, 4);
+ if (WARN_ON(!address))
+ return;
+
+ topology = readl(address);
+ node_shift = (topology >> 16) & 0x7;
+ if (!node_shift)
+ /* The value 0 should be decoded as 8 */
+ node_shift = 8;
+ maxcpus = (topology & ((1 << node_shift) - 1)) + 1;
+
+ pr_info("vSMP CTL: Capping CPUs to %d (CONFIG_X86_VSMP is unset)\n",
+ maxcpus);
+ setup_max_cpus = maxcpus;
+ early_iounmap(address, 4);
+#endif
+}
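A worked example of the decode, with an invented topology value of 0x00030005: node_shift = (0x00030005 >> 16) & 0x7 = 3, so maxcpus = (0x00030005 & 0x7) + 1 = 6. As a standalone check:

#include <stdio.h>

int main(void)
{
	unsigned int topology = 0x00030005;	/* invented register value */
	unsigned int node_shift = (topology >> 16) & 0x7;

	if (!node_shift)
		node_shift = 8;		/* 0 decodes as 8, as above */

	printf("maxcpus = %u\n",
	       (topology & ((1u << node_shift) - 1)) + 1);	/* 6 */
	return 0;
}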
+
void __init vsmp_init(void)
{
detect_vsmp_box();
if (!is_vsmp_box())
return;
+ vsmp_cap_cpus();
+
set_vsmp_pv_ops();
return;
}
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index e9f265fd79ae..35c5e543f550 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -18,6 +18,7 @@
#include <asm/e820.h>
#include <asm/time.h>
#include <asm/irq.h>
+#include <asm/io_apic.h>
#include <asm/pat.h>
#include <asm/tsc.h>
#include <asm/iommu.h>
@@ -93,7 +94,6 @@ struct x86_init_ops x86_init __initdata = {
struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
.early_percpu_clock_init = x86_init_noop,
.setup_percpu_clockev = setup_secondary_APIC_clock,
- .fixup_cpu_id = x86_default_fixup_cpu_id,
};
static void default_nmi_init(void) { };
@@ -120,3 +120,10 @@ struct x86_msi_ops x86_msi = {
.teardown_msi_irqs = default_teardown_msi_irqs,
.restore_msi_irqs = default_restore_msi_irqs,
};
+
+struct x86_io_apic_ops x86_io_apic_ops = {
+ .init = native_io_apic_init_mappings,
+ .read = native_io_apic_read,
+ .write = native_io_apic_write,
+ .modify = native_io_apic_modify,
+};
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index e62728e30b01..bd18149b2b0f 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -48,8 +48,6 @@ void __sanitize_i387_state(struct task_struct *tsk)
if (!fx)
return;
- BUG_ON(__thread_has_fpu(tsk));
-
xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
/*