aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile5
-rw-r--r--arch/x86/kernel/acpi/boot.c53
-rw-r--r--arch/x86/kernel/acpi/sleep.c2
-rw-r--r--arch/x86/kernel/apb_timer.c8
-rw-r--r--arch/x86/kernel/apic/apic.c455
-rw-r--r--arch/x86/kernel/apic/io_apic.c13
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/amd.c19
-rw-r--r--arch/x86/kernel/cpu/common.c32
-rw-r--r--arch/x86/kernel/cpu/intel.c6
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c26
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c140
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c9
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c8
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c2
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh2
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/cyrix.c6
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c6
-rw-r--r--arch/x86/kernel/cpu/perf_event.c76
-rw-r--r--arch/x86/kernel/cpu/perf_event.h2
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c1
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c4
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_rapl.c46
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c9
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.h20
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c17
-rw-r--r--arch/x86/kernel/dumpstack.c5
-rw-r--r--arch/x86/kernel/e820.c26
-rw-r--r--arch/x86/kernel/early_printk.c187
-rw-r--r--arch/x86/kernel/entry_64.S317
-rw-r--r--arch/x86/kernel/ftrace.c2
-rw-r--r--arch/x86/kernel/head32.c1
-rw-r--r--arch/x86/kernel/head64.c11
-rw-r--r--arch/x86/kernel/head_64.S30
-rw-r--r--arch/x86/kernel/hpet.c2
-rw-r--r--arch/x86/kernel/hw_breakpoint.c45
-rw-r--r--arch/x86/kernel/i387.c42
-rw-r--r--arch/x86/kernel/irq.c2
-rw-r--r--arch/x86/kernel/irq_32.c13
-rw-r--r--arch/x86/kernel/kprobes/core.c20
-rw-r--r--arch/x86/kernel/kprobes/opt.c3
-rw-r--r--arch/x86/kernel/livepatch.c90
-rw-r--r--arch/x86/kernel/module.c14
-rw-r--r--arch/x86/kernel/perf_regs.c90
-rw-r--r--arch/x86/kernel/pmc_atom.c81
-rw-r--r--arch/x86/kernel/process.c5
-rw-r--r--arch/x86/kernel/process_32.c2
-rw-r--r--arch/x86/kernel/process_64.c2
-rw-r--r--arch/x86/kernel/rtc.c6
-rw-r--r--arch/x86/kernel/setup.c13
-rw-r--r--arch/x86/kernel/signal.c8
-rw-r--r--arch/x86/kernel/smpboot.c113
-rw-r--r--arch/x86/kernel/tls.c25
-rw-r--r--arch/x86/kernel/traps.c131
-rw-r--r--arch/x86/kernel/tsc.c2
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c10
-rw-r--r--arch/x86/kernel/xsave.c3
58 files changed, 1429 insertions, 841 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5d4502c8b983..cdb1b70ddad0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -16,6 +16,10 @@ CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
endif
+KASAN_SANITIZE_head$(BITS).o := n
+KASAN_SANITIZE_dumpstack.o := n
+KASAN_SANITIZE_dumpstack_$(BITS).o := n
+
CFLAGS_irq.o := -I$(src)/../include/asm/trace
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
@@ -63,6 +67,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o
obj-y += apic/
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_LIVEPATCH) += livepatch.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4433a4be8171..ae97ed0873c6 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -611,20 +611,20 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp)
{
- int irq;
-
- if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
- *irqp = gsi;
- } else {
- mutex_lock(&acpi_ioapic_lock);
- irq = mp_map_gsi_to_irq(gsi,
- IOAPIC_MAP_ALLOC | IOAPIC_MAP_CHECK);
- mutex_unlock(&acpi_ioapic_lock);
- if (irq < 0)
- return -1;
- *irqp = irq;
+ int rc, irq, trigger, polarity;
+
+ rc = acpi_get_override_irq(gsi, &trigger, &polarity);
+ if (rc == 0) {
+ trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
+ polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
+ irq = acpi_register_gsi(NULL, gsi, trigger, polarity);
+ if (irq >= 0) {
+ *irqp = irq;
+ return 0;
+ }
}
- return 0;
+
+ return -1;
}
EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
@@ -653,6 +653,7 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
return gsi;
}
+#ifdef CONFIG_X86_LOCAL_APIC
static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
int trigger, int polarity)
{
@@ -675,6 +676,7 @@ static void acpi_unregister_gsi_ioapic(u32 gsi)
mutex_unlock(&acpi_ioapic_lock);
#endif
}
+#endif
int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
int trigger, int polarity) = acpi_register_gsi_pic;
@@ -750,13 +752,13 @@ static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
}
/* wrapper to silence section mismatch warning */
-int __ref acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
+int __ref acpi_map_cpu(acpi_handle handle, int physid, int *pcpu)
{
return _acpi_map_lsapic(handle, physid, pcpu);
}
-EXPORT_SYMBOL(acpi_map_lsapic);
+EXPORT_SYMBOL(acpi_map_cpu);
-int acpi_unmap_lsapic(int cpu)
+int acpi_unmap_cpu(int cpu)
{
#ifdef CONFIG_ACPI_NUMA
set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
@@ -768,8 +770,7 @@ int acpi_unmap_lsapic(int cpu)
return (0);
}
-
-EXPORT_SYMBOL(acpi_unmap_lsapic);
+EXPORT_SYMBOL(acpi_unmap_cpu);
#endif /* CONFIG_ACPI_HOTPLUG_CPU */
int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
@@ -844,13 +845,7 @@ int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base)
static int __init acpi_parse_sbf(struct acpi_table_header *table)
{
- struct acpi_table_boot *sb;
-
- sb = (struct acpi_table_boot *)table;
- if (!sb) {
- printk(KERN_WARNING PREFIX "Unable to map SBF\n");
- return -ENODEV;
- }
+ struct acpi_table_boot *sb = (struct acpi_table_boot *)table;
sbf_port = sb->cmos_index; /* Save CMOS port */
@@ -864,13 +859,7 @@ static struct resource *hpet_res __initdata;
static int __init acpi_parse_hpet(struct acpi_table_header *table)
{
- struct acpi_table_hpet *hpet_tbl;
-
- hpet_tbl = (struct acpi_table_hpet *)table;
- if (!hpet_tbl) {
- printk(KERN_WARNING PREFIX "Unable to map HPET\n");
- return -ENODEV;
- }
+ struct acpi_table_hpet *hpet_tbl = (struct acpi_table_hpet *)table;
if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) {
printk(KERN_WARNING PREFIX "HPET timers must be located in "
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 31368207837c..d1daead5fcdd 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -78,7 +78,7 @@ int x86_acpi_suspend_lowlevel(void)
header->pmode_cr0 = read_cr0();
if (__this_cpu_read(cpu_info.cpuid_level) >= 0) {
- header->pmode_cr4 = read_cr4();
+ header->pmode_cr4 = __read_cr4();
header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4);
}
if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index b708738d016e..6a7c23ff21d3 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -135,14 +135,6 @@ static inline void apbt_clear_mapping(void)
apbt_virt_address = NULL;
}
-/*
- * APBT timer interrupt enable / disable
- */
-static inline int is_apbt_capable(void)
-{
- return apbt_virt_address ? 1 : 0;
-}
-
static int __init apbt_clockevent_register(void)
{
struct sfi_timer_table_entry *mtmr;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 29b5b18afa27..ad3639ae1b9b 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -134,9 +134,6 @@ static inline void imcr_apic_to_pic(void)
*/
static int force_enable_local_apic __initdata;
-/* Control whether x2APIC mode is enabled or not */
-static bool nox2apic __initdata;
-
/*
* APIC command line parameters
*/
@@ -161,33 +158,6 @@ static __init int setup_apicpmtimer(char *s)
__setup("apicpmtimer", setup_apicpmtimer);
#endif
-int x2apic_mode;
-#ifdef CONFIG_X86_X2APIC
-/* x2apic enabled before OS handover */
-int x2apic_preenabled;
-static int x2apic_disabled;
-static int __init setup_nox2apic(char *str)
-{
- if (x2apic_enabled()) {
- int apicid = native_apic_msr_read(APIC_ID);
-
- if (apicid >= 255) {
- pr_warning("Apicid: %08x, cannot enforce nox2apic\n",
- apicid);
- return 0;
- }
-
- pr_warning("x2apic already enabled. will disable it\n");
- } else
- setup_clear_cpu_cap(X86_FEATURE_X2APIC);
-
- nox2apic = true;
-
- return 0;
-}
-early_param("nox2apic", setup_nox2apic);
-#endif
-
unsigned long mp_lapic_addr;
int disable_apic;
/* Disable local APIC timer from the kernel commandline or via dmi quirk */
@@ -1475,7 +1445,7 @@ void setup_local_APIC(void)
#endif
}
-void end_local_APIC_setup(void)
+static void end_local_APIC_setup(void)
{
lapic_setup_esr();
@@ -1492,116 +1462,183 @@ void end_local_APIC_setup(void)
apic_pm_activate();
}
-void __init bsp_end_local_APIC_setup(void)
+/*
+ * APIC setup function for application processors. Called from smpboot.c
+ */
+void apic_ap_setup(void)
{
+ setup_local_APIC();
end_local_APIC_setup();
-
- /*
- * Now that local APIC setup is completed for BP, configure the fault
- * handling for interrupt remapping.
- */
- irq_remap_enable_fault_handling();
-
}
#ifdef CONFIG_X86_X2APIC
-/*
- * Need to disable xapic and x2apic at the same time and then enable xapic mode
- */
-static inline void __disable_x2apic(u64 msr)
-{
- wrmsrl(MSR_IA32_APICBASE,
- msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
- wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
-}
+int x2apic_mode;
-static __init void disable_x2apic(void)
+enum {
+ X2APIC_OFF,
+ X2APIC_ON,
+ X2APIC_DISABLED,
+};
+static int x2apic_state;
+
+static inline void __x2apic_disable(void)
{
u64 msr;
- if (!cpu_has_x2apic)
+ if (cpu_has_apic)
return;
rdmsrl(MSR_IA32_APICBASE, msr);
- if (msr & X2APIC_ENABLE) {
- u32 x2apic_id = read_apic_id();
-
- if (x2apic_id >= 255)
- panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
+ if (!(msr & X2APIC_ENABLE))
+ return;
+ /* Disable xapic and x2apic first and then reenable xapic mode */
+ wrmsrl(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
+ wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
+ printk_once(KERN_INFO "x2apic disabled\n");
+}
- pr_info("Disabling x2apic\n");
- __disable_x2apic(msr);
+static inline void __x2apic_enable(void)
+{
+ u64 msr;
- if (nox2apic) {
- clear_cpu_cap(&cpu_data(0), X86_FEATURE_X2APIC);
- setup_clear_cpu_cap(X86_FEATURE_X2APIC);
- }
+ rdmsrl(MSR_IA32_APICBASE, msr);
+ if (msr & X2APIC_ENABLE)
+ return;
+ wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
+ printk_once(KERN_INFO "x2apic enabled\n");
+}
- x2apic_disabled = 1;
- x2apic_mode = 0;
+static int __init setup_nox2apic(char *str)
+{
+ if (x2apic_enabled()) {
+ int apicid = native_apic_msr_read(APIC_ID);
- register_lapic_address(mp_lapic_addr);
+ if (apicid >= 255) {
+ pr_warning("Apicid: %08x, cannot enforce nox2apic\n",
+ apicid);
+ return 0;
+ }
+ pr_warning("x2apic already enabled.\n");
+ __x2apic_disable();
}
+ setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+ x2apic_state = X2APIC_DISABLED;
+ x2apic_mode = 0;
+ return 0;
}
+early_param("nox2apic", setup_nox2apic);
-void check_x2apic(void)
+/* Called from cpu_init() to enable x2apic on (secondary) cpus */
+void x2apic_setup(void)
{
- if (x2apic_enabled()) {
- pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
- x2apic_preenabled = x2apic_mode = 1;
+ /*
+ * If x2apic is not in ON state, disable it if already enabled
+ * from BIOS.
+ */
+ if (x2apic_state != X2APIC_ON) {
+ __x2apic_disable();
+ return;
}
+ __x2apic_enable();
}
-void enable_x2apic(void)
+static __init void x2apic_disable(void)
{
- u64 msr;
+ u32 x2apic_id;
- rdmsrl(MSR_IA32_APICBASE, msr);
- if (x2apic_disabled) {
- __disable_x2apic(msr);
+ if (x2apic_state != X2APIC_ON)
+ goto out;
+
+ x2apic_id = read_apic_id();
+ if (x2apic_id >= 255)
+ panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
+
+ __x2apic_disable();
+ register_lapic_address(mp_lapic_addr);
+out:
+ x2apic_state = X2APIC_DISABLED;
+ x2apic_mode = 0;
+}
+
+static __init void x2apic_enable(void)
+{
+ if (x2apic_state != X2APIC_OFF)
return;
- }
- if (!x2apic_mode)
+ x2apic_mode = 1;
+ x2apic_state = X2APIC_ON;
+ __x2apic_enable();
+}
+
+static __init void try_to_enable_x2apic(int remap_mode)
+{
+ if (x2apic_state == X2APIC_DISABLED)
return;
- if (!(msr & X2APIC_ENABLE)) {
- printk_once(KERN_INFO "Enabling x2apic\n");
- wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
+ if (remap_mode != IRQ_REMAP_X2APIC_MODE) {
+ /* IR is required if there is APIC ID > 255 even when running
+ * under KVM
+ */
+ if (max_physical_apicid > 255 ||
+ !hypervisor_x2apic_available()) {
+ pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n");
+ x2apic_disable();
+ return;
+ }
+
+ /*
+ * without IR all CPUs can be addressed by IOAPIC/MSI
+ * only in physical mode
+ */
+ x2apic_phys = 1;
}
+ x2apic_enable();
}
-#endif /* CONFIG_X86_X2APIC */
-int __init enable_IR(void)
+void __init check_x2apic(void)
{
-#ifdef CONFIG_IRQ_REMAP
- if (!irq_remapping_supported()) {
- pr_debug("intr-remapping not supported\n");
- return -1;
+ if (x2apic_enabled()) {
+ pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n");
+ x2apic_mode = 1;
+ x2apic_state = X2APIC_ON;
+ } else if (!cpu_has_x2apic) {
+ x2apic_state = X2APIC_DISABLED;
}
+}
+#else /* CONFIG_X86_X2APIC */
+static int __init validate_x2apic(void)
+{
+ if (!apic_is_x2apic_enabled())
+ return 0;
+ /*
+ * Checkme: Can we simply turn off x2apic here instead of panic?
+ */
+ panic("BIOS has enabled x2apic but kernel doesn't support x2apic, please disable x2apic in BIOS.\n");
+}
+early_initcall(validate_x2apic);
- if (!x2apic_preenabled && skip_ioapic_setup) {
- pr_info("Skipped enabling intr-remap because of skipping "
- "io-apic setup\n");
+static inline void try_to_enable_x2apic(int remap_mode) { }
+static inline void __x2apic_enable(void) { }
+#endif /* !CONFIG_X86_X2APIC */
+
+static int __init try_to_enable_IR(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+ if (!x2apic_enabled() && skip_ioapic_setup) {
+ pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n");
return -1;
}
-
- return irq_remapping_enable();
#endif
- return -1;
+ return irq_remapping_enable();
}
void __init enable_IR_x2apic(void)
{
unsigned long flags;
- int ret, x2apic_enabled = 0;
- int hardware_init_ret;
-
- /* Make sure irq_remap_ops are initialized */
- setup_irq_remapping_ops();
+ int ret, ir_stat;
- hardware_init_ret = irq_remapping_prepare();
- if (hardware_init_ret && !x2apic_supported())
+ ir_stat = irq_remapping_prepare();
+ if (ir_stat < 0 && !x2apic_supported())
return;
ret = save_ioapic_entries();
@@ -1614,49 +1651,13 @@ void __init enable_IR_x2apic(void)
legacy_pic->mask_all();
mask_ioapic_entries();
- if (x2apic_preenabled && nox2apic)
- disable_x2apic();
-
- if (hardware_init_ret)
- ret = -1;
- else
- ret = enable_IR();
-
- if (!x2apic_supported())
- goto skip_x2apic;
-
- if (ret < 0) {
- /* IR is required if there is APIC ID > 255 even when running
- * under KVM
- */
- if (max_physical_apicid > 255 ||
- !hypervisor_x2apic_available()) {
- if (x2apic_preenabled)
- disable_x2apic();
- goto skip_x2apic;
- }
- /*
- * without IR all CPUs can be addressed by IOAPIC/MSI
- * only in physical mode
- */
- x2apic_force_phys();
- }
+ /* If irq_remapping_prepare() succeded, try to enable it */
+ if (ir_stat >= 0)
+ ir_stat = try_to_enable_IR();
+ /* ir_stat contains the remap mode or an error code */
+ try_to_enable_x2apic(ir_stat);
- if (ret == IRQ_REMAP_XAPIC_MODE) {
- pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n");
- goto skip_x2apic;
- }
-
- x2apic_enabled = 1;
-
- if (x2apic_supported() && !x2apic_mode) {
- x2apic_mode = 1;
- enable_x2apic();
- pr_info("Enabled x2apic\n");
- }
-
-skip_x2apic:
- if (ret < 0) /* IR enabling failed */
+ if (ir_stat < 0)
restore_ioapic_entries();
legacy_pic->restore_mask();
local_irq_restore(flags);
@@ -1847,82 +1848,8 @@ void __init register_lapic_address(unsigned long address)
}
}
-/*
- * This initializes the IO-APIC and APIC hardware if this is
- * a UP kernel.
- */
int apic_version[MAX_LOCAL_APIC];
-int __init APIC_init_uniprocessor(void)
-{
- if (disable_apic) {
- pr_info("Apic disabled\n");
- return -1;
- }
-#ifdef CONFIG_X86_64
- if (!cpu_has_apic) {
- disable_apic = 1;
- pr_info("Apic disabled by BIOS\n");
- return -1;
- }
-#else
- if (!smp_found_config && !cpu_has_apic)
- return -1;
-
- /*
- * Complain if the BIOS pretends there is one.
- */
- if (!cpu_has_apic &&
- APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
- pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
- boot_cpu_physical_apicid);
- return -1;
- }
-#endif
-
- default_setup_apic_routing();
-
- verify_local_APIC();
- connect_bsp_APIC();
-
-#ifdef CONFIG_X86_64
- apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
-#else
- /*
- * Hack: In case of kdump, after a crash, kernel might be booting
- * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
- * might be zero if read from MP tables. Get it from LAPIC.
- */
-# ifdef CONFIG_CRASH_DUMP
- boot_cpu_physical_apicid = read_apic_id();
-# endif
-#endif
- physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
- setup_local_APIC();
-
-#ifdef CONFIG_X86_IO_APIC
- /*
- * Now enable IO-APICs, actually call clear_IO_APIC
- * We need clear_IO_APIC before enabling error vector
- */
- if (!skip_ioapic_setup && nr_ioapics)
- enable_IO_APIC();
-#endif
-
- bsp_end_local_APIC_setup();
-
-#ifdef CONFIG_X86_IO_APIC
- if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
- setup_IO_APIC();
- else {
- nr_ioapics = 0;
- }
-#endif
-
- x86_init.timers.setup_percpu_clockev();
- return 0;
-}
-
/*
* Local APIC interrupts
*/
@@ -2027,7 +1954,7 @@ __visible void smp_trace_error_interrupt(struct pt_regs *regs)
/**
* connect_bsp_APIC - attach the APIC to the interrupt system
*/
-void __init connect_bsp_APIC(void)
+static void __init connect_bsp_APIC(void)
{
#ifdef CONFIG_X86_32
if (pic_mode) {
@@ -2274,6 +2201,100 @@ void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
}
}
+static void __init apic_bsp_up_setup(void)
+{
+#ifdef CONFIG_X86_64
+ apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
+#else
+ /*
+ * Hack: In case of kdump, after a crash, kernel might be booting
+ * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
+ * might be zero if read from MP tables. Get it from LAPIC.
+ */
+# ifdef CONFIG_CRASH_DUMP
+ boot_cpu_physical_apicid = read_apic_id();
+# endif
+#endif
+ physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
+}
+
+/**
+ * apic_bsp_setup - Setup function for local apic and io-apic
+ * @upmode: Force UP mode (for APIC_init_uniprocessor)
+ *
+ * Returns:
+ * apic_id of BSP APIC
+ */
+int __init apic_bsp_setup(bool upmode)
+{
+ int id;
+
+ connect_bsp_APIC();
+ if (upmode)
+ apic_bsp_up_setup();
+ setup_local_APIC();
+
+ if (x2apic_mode)
+ id = apic_read(APIC_LDR);
+ else
+ id = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
+
+ enable_IO_APIC();
+ end_local_APIC_setup();
+ irq_remap_enable_fault_handling();
+ setup_IO_APIC();
+ /* Setup local timer */
+ x86_init.timers.setup_percpu_clockev();
+ return id;
+}
+
+/*
+ * This initializes the IO-APIC and APIC hardware if this is
+ * a UP kernel.
+ */
+int __init APIC_init_uniprocessor(void)
+{
+ if (disable_apic) {
+ pr_info("Apic disabled\n");
+ return -1;
+ }
+#ifdef CONFIG_X86_64
+ if (!cpu_has_apic) {
+ disable_apic = 1;
+ pr_info("Apic disabled by BIOS\n");
+ return -1;
+ }
+#else
+ if (!smp_found_config && !cpu_has_apic)
+ return -1;
+
+ /*
+ * Complain if the BIOS pretends there is one.
+ */
+ if (!cpu_has_apic &&
+ APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
+ boot_cpu_physical_apicid);
+ return -1;
+ }
+#endif
+
+ if (!smp_found_config)
+ disable_ioapic_support();
+
+ default_setup_apic_routing();
+ verify_local_APIC();
+ apic_bsp_setup(true);
+ return 0;
+}
+
+#ifdef CONFIG_UP_LATE_INIT
+void __init up_late_init(void)
+{
+ APIC_init_uniprocessor();
+}
+#endif
+
/*
* Power management
*/
@@ -2359,9 +2380,9 @@ static void lapic_resume(void)
mask_ioapic_entries();
legacy_pic->mask_all();
- if (x2apic_mode)
- enable_x2apic();
- else {
+ if (x2apic_mode) {
+ __x2apic_enable();
+ } else {
/*
* Make sure the APICBASE points to the right address
*
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 3f5f60406ab1..f4dc2462a1ac 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1507,7 +1507,10 @@ void __init enable_IO_APIC(void)
int i8259_apic, i8259_pin;
int apic, pin;
- if (!nr_legacy_irqs())
+ if (skip_ioapic_setup)
+ nr_ioapics = 0;
+
+ if (!nr_legacy_irqs() || !nr_ioapics)
return;
for_each_ioapic_pin(apic, pin) {
@@ -2295,7 +2298,7 @@ static inline void __init check_timer(void)
}
local_irq_disable();
apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
- if (x2apic_preenabled)
+ if (apic_is_x2apic_enabled())
apic_printk(APIC_QUIET, KERN_INFO
"Perhaps problem with the pre-enabled x2apic mode\n"
"Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
@@ -2373,9 +2376,9 @@ void __init setup_IO_APIC(void)
{
int ioapic;
- /*
- * calling enable_IO_APIC() is moved to setup_local_APIC for BP
- */
+ if (skip_ioapic_setup || !nr_ioapics)
+ return;
+
io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL;
apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index e27b49d7c922..80091ae54c2b 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -66,3 +66,4 @@ targets += capflags.c
$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
$(call if_changed,mkcapflags)
endif
+clean-files += capflags.c
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 15c5df92f74e..a220239cea65 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -869,3 +869,22 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
return false;
}
+
+void set_dr_addr_mask(unsigned long mask, int dr)
+{
+ if (!cpu_has_bpext)
+ return;
+
+ switch (dr) {
+ case 0:
+ wrmsr(MSR_F16H_DR0_ADDR_MASK, mask, 0);
+ break;
+ case 1:
+ case 2:
+ case 3:
+ wrmsr(MSR_F16H_DR1_ADDR_MASK - 1 + dr, mask, 0);
+ break;
+ default:
+ break;
+ }
+}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c6049650c093..b5c8ff5e9dfc 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -19,6 +19,7 @@
#include <asm/archrandom.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/debugreg.h>
#include <asm/sections.h>
#include <asm/vsyscall.h>
@@ -278,7 +279,7 @@ __setup("nosmep", setup_disable_smep);
static __always_inline void setup_smep(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_SMEP))
- set_in_cr4(X86_CR4_SMEP);
+ cr4_set_bits(X86_CR4_SMEP);
}
static __init int setup_disable_smap(char *arg)
@@ -298,9 +299,9 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_SMAP)) {
#ifdef CONFIG_X86_SMAP
- set_in_cr4(X86_CR4_SMAP);
+ cr4_set_bits(X86_CR4_SMAP);
#else
- clear_in_cr4(X86_CR4_SMAP);
+ cr4_clear_bits(X86_CR4_SMAP);
#endif
}
}
@@ -491,17 +492,18 @@ u16 __read_mostly tlb_lld_2m[NR_INFO];
u16 __read_mostly tlb_lld_4m[NR_INFO];
u16 __read_mostly tlb_lld_1g[NR_INFO];
-void cpu_detect_tlb(struct cpuinfo_x86 *c)
+static void cpu_detect_tlb(struct cpuinfo_x86 *c)
{
if (this_cpu->c_detect_tlb)
this_cpu->c_detect_tlb(c);
- printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
- "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
+ pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n",
tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
- tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
- tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
- tlb_lld_1g[ENTRIES]);
+ tlb_lli_4m[ENTRIES]);
+
+ pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
+ tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES],
+ tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]);
}
void detect_ht(struct cpuinfo_x86 *c)
@@ -1294,6 +1296,12 @@ void cpu_init(void)
wait_for_master_cpu(cpu);
/*
+ * Initialize the CR4 shadow before doing anything that could
+ * try to read it.
+ */
+ cr4_init_shadow();
+
+ /*
* Load microcode on this cpu if a valid microcode is available.
* This is early microcode loading procedure.
*/
@@ -1312,7 +1320,7 @@ void cpu_init(void)
pr_debug("Initializing CPU#%d\n", cpu);
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
/*
* Initialize the per-CPU GDT with the boot GDT,
@@ -1332,7 +1340,7 @@ void cpu_init(void)
barrier();
x86_configure_nx();
- enable_x2apic();
+ x2apic_setup();
/*
* set up and load the per-CPU TSS
@@ -1393,7 +1401,7 @@ void cpu_init(void)
printk(KERN_INFO "Initializing CPU#%d\n", cpu);
if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de)
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
load_current_idt();
switch_to_new_gdt(cpu);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 9cc6b6f25f42..94d7dcb12145 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -487,10 +487,8 @@ static void init_intel(struct cpuinfo_x86 *c)
rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
- printk_once(KERN_WARNING "ENERGY_PERF_BIAS:"
- " Set to 'normal', was 'performance'\n"
- "ENERGY_PERF_BIAS: View and update with"
- " x86_energy_perf_policy(8)\n");
+ pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
+ pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n");
epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
}
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index c7035073dfc1..659643376dbf 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -952,20 +952,18 @@ static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
int type, char *buf)
{
- ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
- int n = 0;
-
- if (len > 1) {
- const struct cpumask *mask;
-
- mask = to_cpumask(this_leaf->shared_cpu_map);
- n = type ?
- cpulist_scnprintf(buf, len-2, mask) :
- cpumask_scnprintf(buf, len-2, mask);
- buf[n++] = '\n';
- buf[n] = '\0';
- }
- return n;
+ const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
+ int ret;
+
+ if (type)
+ ret = scnprintf(buf, PAGE_SIZE - 1, "%*pbl",
+ cpumask_pr_args(mask));
+ else
+ ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb",
+ cpumask_pr_args(mask));
+ buf[ret++] = '\n';
+ buf[ret] = '\0';
+ return ret;
}
static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf,
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d2c611699cd9..3be9fa69f875 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -43,6 +43,8 @@
#include <linux/export.h>
#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
@@ -115,7 +117,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
* CPU/chipset specific EDAC code can register a notifier call here to print
* MCE errors in a human-readable form.
*/
-ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
@@ -311,7 +313,7 @@ static void wait_for_panic(void)
panic("Panicing machine check CPU died");
}
-static void mce_panic(char *msg, struct mce *final, char *exp)
+static void mce_panic(const char *msg, struct mce *final, char *exp)
{
int i, apei_err = 0;
@@ -529,7 +531,7 @@ static void mce_schedule_work(void)
schedule_work(this_cpu_ptr(&mce_work));
}
-DEFINE_PER_CPU(struct irq_work, mce_irq_work);
+static DEFINE_PER_CPU(struct irq_work, mce_irq_work);
static void mce_irq_work_cb(struct irq_work *entry)
{
@@ -735,7 +737,7 @@ static atomic_t mce_callin;
/*
* Check if a timeout waiting for other CPUs happened.
*/
-static int mce_timed_out(u64 *t)
+static int mce_timed_out(u64 *t, const char *msg)
{
/*
* The others already did panic for some reason.
@@ -750,8 +752,7 @@ static int mce_timed_out(u64 *t)
goto out;
if ((s64)*t < SPINUNIT) {
if (mca_cfg.tolerant <= 1)
- mce_panic("Timeout synchronizing machine check over CPUs",
- NULL, NULL);
+ mce_panic(msg, NULL, NULL);
cpu_missing = 1;
return 1;
}
@@ -867,7 +868,8 @@ static int mce_start(int *no_way_out)
* Wait for everyone.
*/
while (atomic_read(&mce_callin) != cpus) {
- if (mce_timed_out(&timeout)) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Not all CPUs entered broadcast exception handler")) {
atomic_set(&global_nwo, 0);
return -1;
}
@@ -892,7 +894,8 @@ static int mce_start(int *no_way_out)
* only seen by one CPU before cleared, avoiding duplicates.
*/
while (atomic_read(&mce_executing) < order) {
- if (mce_timed_out(&timeout)) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Subject CPUs unable to finish machine check processing")) {
atomic_set(&global_nwo, 0);
return -1;
}
@@ -936,7 +939,8 @@ static int mce_end(int order)
* loops.
*/
while (atomic_read(&mce_executing) <= cpus) {
- if (mce_timed_out(&timeout))
+ if (mce_timed_out(&timeout,
+ "Timeout: Monarch CPU unable to finish machine check processing"))
goto reset;
ndelay(SPINUNIT);
}
@@ -949,7 +953,8 @@ static int mce_end(int order)
* Subject: Wait for Monarch to finish.
*/
while (atomic_read(&mce_executing) != 0) {
- if (mce_timed_out(&timeout))
+ if (mce_timed_out(&timeout,
+ "Timeout: Monarch CPU did not finish machine check processing"))
goto reset;
ndelay(SPINUNIT);
}
@@ -1003,51 +1008,6 @@ static void mce_clear_state(unsigned long *toclear)
}
/*
- * Need to save faulting physical address associated with a process
- * in the machine check handler some place where we can grab it back
- * later in mce_notify_process()
- */
-#define MCE_INFO_MAX 16
-
-struct mce_info {
- atomic_t inuse;
- struct task_struct *t;
- __u64 paddr;
- int restartable;
-} mce_info[MCE_INFO_MAX];
-
-static void mce_save_info(__u64 addr, int c)
-{
- struct mce_info *mi;
-
- for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
- if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
- mi->t = current;
- mi->paddr = addr;
- mi->restartable = c;
- return;
- }
- }
-
- mce_panic("Too many concurrent recoverable errors", NULL, NULL);
-}
-
-static struct mce_info *mce_find_info(void)
-{
- struct mce_info *mi;
-
- for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
- if (atomic_read(&mi->inuse) && mi->t == current)
- return mi;
- return NULL;
-}
-
-static void mce_clear_info(struct mce_info *mi)
-{
- atomic_set(&mi->inuse, 0);
-}
-
-/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
*
@@ -1063,6 +1023,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
{
struct mca_config *cfg = &mca_cfg;
struct mce m, *final;
+ enum ctx_state prev_state;
int i;
int worst = 0;
int severity;
@@ -1084,6 +1045,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
char *msg = "Unknown";
+ u64 recover_paddr = ~0ull;
+ int flags = MF_ACTION_REQUIRED;
+
+ prev_state = ist_enter(regs);
this_cpu_inc(mce_exception_count);
@@ -1203,9 +1168,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
if (no_way_out)
mce_panic("Fatal machine check on current CPU", &m, msg);
if (worst == MCE_AR_SEVERITY) {
- /* schedule action before return to userland */
- mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
- set_thread_flag(TIF_MCE_NOTIFY);
+ recover_paddr = m.addr;
+ if (!(m.mcgstatus & MCG_STATUS_RIPV))
+ flags |= MF_MUST_KILL;
} else if (kill_it) {
force_sig(SIGBUS, current);
}
@@ -1216,6 +1181,27 @@ void do_machine_check(struct pt_regs *regs, long error_code)
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
sync_core();
+
+ if (recover_paddr == ~0ull)
+ goto done;
+
+ pr_err("Uncorrected hardware memory error in user-access at %llx",
+ recover_paddr);
+ /*
+ * We must call memory_failure() here even if the current process is
+ * doomed. We still need to mark the page as poisoned and alert any
+ * other users of the page.
+ */
+ ist_begin_non_atomic(regs);
+ local_irq_enable();
+ if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
+ pr_err("Memory error not recovered");
+ force_sig(SIGBUS, current);
+ }
+ local_irq_disable();
+ ist_end_non_atomic();
+done:
+ ist_exit(regs, prev_state);
}
EXPORT_SYMBOL_GPL(do_machine_check);
@@ -1233,42 +1219,6 @@ int memory_failure(unsigned long pfn, int vector, int flags)
#endif
/*
- * Called in process context that interrupted by MCE and marked with
- * TIF_MCE_NOTIFY, just before returning to erroneous userland.
- * This code is allowed to sleep.
- * Attempt possible recovery such as calling the high level VM handler to
- * process any corrupted pages, and kill/signal current process if required.
- * Action required errors are handled here.
- */
-void mce_notify_process(void)
-{
- unsigned long pfn;
- struct mce_info *mi = mce_find_info();
- int flags = MF_ACTION_REQUIRED;
-
- if (!mi)
- mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
- pfn = mi->paddr >> PAGE_SHIFT;
-
- clear_thread_flag(TIF_MCE_NOTIFY);
-
- pr_err("Uncorrected hardware memory error in user-access at %llx",
- mi->paddr);
- /*
- * We must call memory_failure() here even if the current process is
- * doomed. We still need to mark the page as poisoned and alert any
- * other users of the page.
- */
- if (!mi->restartable)
- flags |= MF_MUST_KILL;
- if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
- pr_err("Memory error not recovered");
- force_sig(SIGBUS, current);
- }
- mce_clear_info(mi);
-}
-
-/*
* Action optional processing happens here (picking up
* from the list of faulting pages that do_machine_check()
* placed into the "ring").
@@ -1503,7 +1453,7 @@ static void __mcheck_cpu_init_generic(void)
bitmap_fill(all_banks, MAX_NR_BANKS);
machine_check_poll(MCP_UC | m_fl, &all_banks);
- set_in_cr4(X86_CR4_MCE);
+ cr4_set_bits(X86_CR4_MCE);
rdmsrl(MSR_IA32_MCG_CAP, cap);
if (cap & MCG_CTL_P)
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index a3042989398c..737b0ad4e61a 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -8,6 +8,8 @@
#include <linux/smp.h>
#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
@@ -17,8 +19,11 @@ int mce_p5_enabled __read_mostly;
/* Machine check handler for Pentium class Intel CPUs: */
static void pentium_machine_check(struct pt_regs *regs, long error_code)
{
+ enum ctx_state prev_state;
u32 loaddr, hi, lotype;
+ prev_state = ist_enter(regs);
+
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
@@ -33,6 +38,8 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
}
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+ ist_exit(regs, prev_state);
}
/* Set up machine check reporting for processors with Intel style MCE: */
@@ -59,7 +66,7 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
"Intel old style machine check architecture supported.\n");
/* Enable MCE: */
- set_in_cr4(X86_CR4_MCE);
+ cr4_set_bits(X86_CR4_MCE);
printk(KERN_INFO
"Intel old style machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 7dc5564d0cdf..44f138296fbe 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -7,14 +7,20 @@
#include <linux/types.h>
#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
/* Machine check handler for WinChip C6: */
static void winchip_machine_check(struct pt_regs *regs, long error_code)
{
+ enum ctx_state prev_state = ist_enter(regs);
+
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+ ist_exit(regs, prev_state);
}
/* Set up machine check reporting on the Winchip C6 series */
@@ -31,7 +37,7 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
lo &= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);
- set_in_cr4(X86_CR4_MCE);
+ cr4_set_bits(X86_CR4_MCE);
printk(KERN_INFO
"Winchip machine check reporting enabled on CPU#0.\n");
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 15c29096136b..36a83617eb21 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -552,7 +552,7 @@ static int __init microcode_init(void)
int error;
if (paravirt_enabled() || dis_ucode_ldr)
- return 0;
+ return -EINVAL;
if (c->x86_vendor == X86_VENDOR_INTEL)
microcode_ops = init_intel_microcode();
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index e2b22df964cd..36d99a337b49 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -28,7 +28,7 @@ function dump_array()
# If the /* comment */ starts with a quote string, grab that.
VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')"
[ -z "$VALUE" ] && VALUE="\"$NAME\""
- [ "$VALUE" == '""' ] && continue
+ [ "$VALUE" = '""' ] && continue
# Name is uppercase, VALUE is all lowercase
VALUE="$(echo "$VALUE" | tr A-Z a-z)"
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index a450373e8e91..939155ffdece 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -107,6 +107,7 @@ static struct clocksource hyperv_cs = {
.rating = 400, /* use this when running on Hyperv*/
.read = read_hv_clock,
.mask = CLOCKSOURCE_MASK(64),
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
static void __init ms_hyperv_init_platform(void)
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 9e451b0876b5..f8c81ba0b465 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -138,8 +138,8 @@ static void prepare_set(void)
/* Save value of CR4 and clear Page Global Enable (bit 7) */
if (cpu_has_pge) {
- cr4 = read_cr4();
- write_cr4(cr4 & ~X86_CR4_PGE);
+ cr4 = __read_cr4();
+ __write_cr4(cr4 & ~X86_CR4_PGE);
}
/*
@@ -171,7 +171,7 @@ static void post_set(void)
/* Restore value of CR4 */
if (cpu_has_pge)
- write_cr4(cr4);
+ __write_cr4(cr4);
}
static void cyrix_set_arr(unsigned int reg, unsigned long base,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0e25a1bc5ab5..7d74f7b3c6ba 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -678,8 +678,8 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
/* Save value of CR4 and clear Page Global Enable (bit 7) */
if (cpu_has_pge) {
- cr4 = read_cr4();
- write_cr4(cr4 & ~X86_CR4_PGE);
+ cr4 = __read_cr4();
+ __write_cr4(cr4 & ~X86_CR4_PGE);
}
/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
@@ -708,7 +708,7 @@ static void post_set(void) __releases(set_atomicity_lock)
/* Restore value of CR4 */
if (cpu_has_pge)
- write_cr4(cr4);
+ __write_cr4(cr4);
raw_spin_unlock(&set_atomicity_lock);
}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 143e5f5dc855..b71a7f86d68a 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,6 +31,8 @@
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>
@@ -43,6 +45,8 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
.enabled = 1,
};
+struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;
+
u64 __read_mostly hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -1327,8 +1331,6 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
break;
case CPU_STARTING:
- if (x86_pmu.attr_rdpmc)
- set_in_cr4(X86_CR4_PCE);
if (x86_pmu.cpu_starting)
x86_pmu.cpu_starting(cpu);
break;
@@ -1804,14 +1806,44 @@ static int x86_pmu_event_init(struct perf_event *event)
event->destroy(event);
}
+ if (ACCESS_ONCE(x86_pmu.attr_rdpmc))
+ event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
+
return err;
}
+static void refresh_pce(void *ignored)
+{
+ if (current->mm)
+ load_mm_cr4(current->mm);
+}
+
+static void x86_pmu_event_mapped(struct perf_event *event)
+{
+ if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ return;
+
+ if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1)
+ on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+}
+
+static void x86_pmu_event_unmapped(struct perf_event *event)
+{
+ if (!current->mm)
+ return;
+
+ if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ return;
+
+ if (atomic_dec_and_test(&current->mm->context.perf_rdpmc_allowed))
+ on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+}
+
static int x86_pmu_event_idx(struct perf_event *event)
{
int idx = event->hw.idx;
- if (!x86_pmu.attr_rdpmc)
+ if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
return 0;
if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
@@ -1829,16 +1861,6 @@ static ssize_t get_attr_rdpmc(struct device *cdev,
return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
}
-static void change_rdpmc(void *info)
-{
- bool enable = !!(unsigned long)info;
-
- if (enable)
- set_in_cr4(X86_CR4_PCE);
- else
- clear_in_cr4(X86_CR4_PCE);
-}
-
static ssize_t set_attr_rdpmc(struct device *cdev,
struct device_attribute *attr,
const char *buf, size_t count)
@@ -1850,14 +1872,27 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
if (ret)
return ret;
+ if (val > 2)
+ return -EINVAL;
+
if (x86_pmu.attr_rdpmc_broken)
return -ENOTSUPP;
- if (!!val != !!x86_pmu.attr_rdpmc) {
- x86_pmu.attr_rdpmc = !!val;
- on_each_cpu(change_rdpmc, (void *)val, 1);
+ if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) {
+ /*
+ * Changing into or out of always available, aka
+ * perf-event-bypassing mode. This path is extremely slow,
+ * but only root can trigger it, so it's okay.
+ */
+ if (val == 2)
+ static_key_slow_inc(&rdpmc_always_available);
+ else
+ static_key_slow_dec(&rdpmc_always_available);
+ on_each_cpu(refresh_pce, NULL, 1);
}
+ x86_pmu.attr_rdpmc = val;
+
return count;
}
@@ -1900,6 +1935,9 @@ static struct pmu pmu = {
.event_init = x86_pmu_event_init,
+ .event_mapped = x86_pmu_event_mapped,
+ .event_unmapped = x86_pmu_event_unmapped,
+
.add = x86_pmu_add,
.del = x86_pmu_del,
.start = x86_pmu_start,
@@ -1914,13 +1952,15 @@ static struct pmu pmu = {
.flush_branch_stack = x86_pmu_flush_branch_stack,
};
-void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
+void arch_perf_update_userpage(struct perf_event *event,
+ struct perf_event_mmap_page *userpg, u64 now)
{
struct cyc2ns_data *data;
userpg->cap_user_time = 0;
userpg->cap_user_time_zero = 0;
- userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;
+ userpg->cap_user_rdpmc =
+ !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
userpg->pmc_width = x86_pmu.cntval_bits;
if (!sched_clock_stable())
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 4e6cdb0ddc70..df525d2be1e8 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -71,6 +71,8 @@ struct event_constraint {
#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */
#define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */
#define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */
+#define PERF_X86_EVENT_RDPMC_ALLOWED 0x40 /* grant rdpmc permission */
+
struct amd_nb {
int nb_id; /* NorthBridge id */
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 944bf019b74f..498b6d967138 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2431,6 +2431,7 @@ __init int intel_pmu_init(void)
break;
case 55: /* 22nm Atom "Silvermont" */
+ case 76: /* 14nm Atom "Airmont" */
case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 3c895d480cd7..073983398364 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -568,8 +568,8 @@ struct event_constraint intel_atom_pebs_event_constraints[] = {
};
struct event_constraint intel_slm_pebs_event_constraints[] = {
- /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
- INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x1),
/* Allow all events as PEBS with no flags */
INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
EVENT_CONSTRAINT_END
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index 673f930c700f..c4bb8b8e5017 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -103,6 +103,13 @@ static struct kobj_attribute format_attr_##_var = \
#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
+#define RAPL_EVENT_ATTR_STR(_name, v, str) \
+static struct perf_pmu_events_attr event_attr_##v = { \
+ .attr = __ATTR(_name, 0444, rapl_sysfs_show, NULL), \
+ .id = 0, \
+ .event_str = str, \
+};
+
struct rapl_pmu {
spinlock_t lock;
int hw_unit; /* 1/2^hw_unit Joule */
@@ -135,7 +142,7 @@ static inline u64 rapl_scale(u64 v)
* or use ldexp(count, -32).
* Watts = Joules/Time delta
*/
- return v << (32 - __this_cpu_read(rapl_pmu->hw_unit));
+ return v << (32 - __this_cpu_read(rapl_pmu)->hw_unit);
}
static u64 rapl_event_update(struct perf_event *event)
@@ -379,23 +386,36 @@ static struct attribute_group rapl_pmu_attr_group = {
.attrs = rapl_pmu_attrs,
};
-EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
-EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
-EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
-EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
+static ssize_t rapl_sysfs_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct perf_pmu_events_attr *pmu_attr = \
+ container_of(attr, struct perf_pmu_events_attr, attr);
+
+ if (pmu_attr->event_str)
+ return sprintf(page, "%s", pmu_attr->event_str);
+
+ return 0;
+}
+
+RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
+RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
+RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
+RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
-EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
-EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
-EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
-EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
/*
* we compute in 0.23 nJ increments regardless of MSR
*/
-EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
-EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
-EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
-EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
static struct attribute *rapl_events_srv_attr[] = {
EVENT_PTR(rapl_cores),
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 10b8d3eaaf15..c635b8b49e93 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -840,7 +840,6 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
box->phys_id = phys_id;
box->pci_dev = pdev;
box->pmu = pmu;
- uncore_box_init(box);
pci_set_drvdata(pdev, box);
raw_spin_lock(&uncore_box_lock);
@@ -1004,10 +1003,8 @@ static int uncore_cpu_starting(int cpu)
pmu = &type->pmus[j];
box = *per_cpu_ptr(pmu->box, cpu);
/* called by uncore_cpu_init? */
- if (box && box->phys_id >= 0) {
- uncore_box_init(box);
+ if (box && box->phys_id >= 0)
continue;
- }
for_each_online_cpu(k) {
exist = *per_cpu_ptr(pmu->box, k);
@@ -1023,10 +1020,8 @@ static int uncore_cpu_starting(int cpu)
}
}
- if (box) {
+ if (box)
box->phys_id = phys_id;
- uncore_box_init(box);
- }
}
}
return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 18eb78bbdd10..6c8c1e7e69d8 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -17,7 +17,7 @@
#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff)
#define UNCORE_PCI_DEV_IDX(data) (data & 0xff)
#define UNCORE_EXTRA_PCI_DEV 0xff
-#define UNCORE_EXTRA_PCI_DEV_MAX 2
+#define UNCORE_EXTRA_PCI_DEV_MAX 3
/* support up to 8 sockets */
#define UNCORE_SOCKET_MAX 8
@@ -257,6 +257,14 @@ static inline int uncore_num_counters(struct intel_uncore_box *box)
return box->pmu->type->num_counters;
}
+static inline void uncore_box_init(struct intel_uncore_box *box)
+{
+ if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
+ if (box->pmu->type->ops->init_box)
+ box->pmu->type->ops->init_box(box);
+ }
+}
+
static inline void uncore_disable_box(struct intel_uncore_box *box)
{
if (box->pmu->type->ops->disable_box)
@@ -265,6 +273,8 @@ static inline void uncore_disable_box(struct intel_uncore_box *box)
static inline void uncore_enable_box(struct intel_uncore_box *box)
{
+ uncore_box_init(box);
+
if (box->pmu->type->ops->enable_box)
box->pmu->type->ops->enable_box(box);
}
@@ -287,14 +297,6 @@ static inline u64 uncore_read_counter(struct intel_uncore_box *box,
return box->pmu->type->ops->read_counter(box, event);
}
-static inline void uncore_box_init(struct intel_uncore_box *box)
-{
- if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
- if (box->pmu->type->ops->init_box)
- box->pmu->type->ops->init_box(box);
- }
-}
-
static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
{
return (box->phys_id < 0);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index 745b158e9a65..21af6149edf2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -891,6 +891,7 @@ void snbep_uncore_cpu_init(void)
enum {
SNBEP_PCI_QPI_PORT0_FILTER,
SNBEP_PCI_QPI_PORT1_FILTER,
+ HSWEP_PCI_PCU_3,
};
static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
@@ -2026,6 +2027,17 @@ void hswep_uncore_cpu_init(void)
{
if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+
+ /* Detect 6-8 core systems with only two SBOXes */
+ if (uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3]) {
+ u32 capid4;
+
+ pci_read_config_dword(uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3],
+ 0x94, &capid4);
+ if (((capid4 >> 6) & 0x3) == 0)
+ hswep_uncore_sbox.num_boxes = 2;
+ }
+
uncore_msr_uncores = hswep_msr_uncores;
}
@@ -2287,6 +2299,11 @@ static DEFINE_PCI_DEVICE_TABLE(hswep_uncore_pci_ids) = {
.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
SNBEP_PCI_QPI_PORT1_FILTER),
},
+ { /* PCU.3 (for Capability registers) */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ HSWEP_PCI_PCU_3),
+ },
{ /* end: all zeroes */ }
};
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index b74ebc7c4402..cf3df1d8d039 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -265,7 +265,10 @@ int __die(const char *str, struct pt_regs *regs, long err)
printk("SMP ");
#endif
#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC");
+ printk("DEBUG_PAGEALLOC ");
+#endif
+#ifdef CONFIG_KASAN
+ printk("KASAN");
#endif
printk("\n");
if (notify_die(DIE_OOPS, str, regs, err,
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index dd2f07ae9d0c..46201deee923 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -184,9 +184,9 @@ void __init e820_print_map(char *who)
* overwritten in the same location, starting at biosmap.
*
* The integer pointed to by pnr_map must be valid on entry (the
- * current number of valid entries located at biosmap) and will
- * be updated on return, with the new number of valid entries
- * (something no more than max_nr_map.)
+ * current number of valid entries located at biosmap). If the
+ * sanitizing succeeds the *pnr_map will be updated with the new
+ * number of valid entries (something no more than max_nr_map).
*
* The return value from sanitize_e820_map() is zero if it
* successfully 'sanitized' the map entries passed in, and is -1
@@ -561,23 +561,15 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
void __init update_e820(void)
{
- u32 nr_map;
-
- nr_map = e820.nr_map;
- if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
+ if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map))
return;
- e820.nr_map = nr_map;
printk(KERN_INFO "e820: modified physical RAM map:\n");
e820_print_map("modified");
}
static void __init update_e820_saved(void)
{
- u32 nr_map;
-
- nr_map = e820_saved.nr_map;
- if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
- return;
- e820_saved.nr_map = nr_map;
+ sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map),
+ &e820_saved.nr_map);
}
#define MAX_GAP_END 0x100000000ull
/*
@@ -898,11 +890,9 @@ early_param("memmap", parse_memmap_opt);
void __init finish_e820_parsing(void)
{
if (userdef) {
- u32 nr = e820.nr_map;
-
- if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
+ if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map),
+ &e820.nr_map) < 0)
early_panic("Invalid user supplied memory map");
- e820.nr_map = nr;
printk(KERN_INFO "e820: user-defined physical RAM map:\n");
e820_print_map("user");
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 01d1c187c9f9..a62536a1be88 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -19,6 +19,7 @@
#include <linux/usb/ehci_def.h>
#include <linux/efi.h>
#include <asm/efi.h>
+#include <asm/pci_x86.h>
/* Simple VGA output */
#define VGABASE (__ISA_IO_base + 0xb8000)
@@ -76,7 +77,7 @@ static struct console early_vga_console = {
/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
-static int early_serial_base = 0x3f8; /* ttyS0 */
+static unsigned long early_serial_base = 0x3f8; /* ttyS0 */
#define XMTRDY 0x20
@@ -94,13 +95,40 @@ static int early_serial_base = 0x3f8; /* ttyS0 */
#define DLL 0 /* Divisor Latch Low */
#define DLH 1 /* Divisor latch High */
+static void mem32_serial_out(unsigned long addr, int offset, int value)
+{
+ uint32_t *vaddr = (uint32_t *)addr;
+ /* shift implied by pointer type */
+ writel(value, vaddr + offset);
+}
+
+static unsigned int mem32_serial_in(unsigned long addr, int offset)
+{
+ uint32_t *vaddr = (uint32_t *)addr;
+ /* shift implied by pointer type */
+ return readl(vaddr + offset);
+}
+
+static unsigned int io_serial_in(unsigned long addr, int offset)
+{
+ return inb(addr + offset);
+}
+
+static void io_serial_out(unsigned long addr, int offset, int value)
+{
+ outb(value, addr + offset);
+}
+
+static unsigned int (*serial_in)(unsigned long addr, int offset) = io_serial_in;
+static void (*serial_out)(unsigned long addr, int offset, int value) = io_serial_out;
+
static int early_serial_putc(unsigned char ch)
{
unsigned timeout = 0xffff;
- while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
+ while ((serial_in(early_serial_base, LSR) & XMTRDY) == 0 && --timeout)
cpu_relax();
- outb(ch, early_serial_base + TXR);
+ serial_out(early_serial_base, TXR, ch);
return timeout ? 0 : -1;
}
@@ -114,13 +142,28 @@ static void early_serial_write(struct console *con, const char *s, unsigned n)
}
}
+static __init void early_serial_hw_init(unsigned divisor)
+{
+ unsigned char c;
+
+ serial_out(early_serial_base, LCR, 0x3); /* 8n1 */
+ serial_out(early_serial_base, IER, 0); /* no interrupt */
+ serial_out(early_serial_base, FCR, 0); /* no fifo */
+ serial_out(early_serial_base, MCR, 0x3); /* DTR + RTS */
+
+ c = serial_in(early_serial_base, LCR);
+ serial_out(early_serial_base, LCR, c | DLAB);
+ serial_out(early_serial_base, DLL, divisor & 0xff);
+ serial_out(early_serial_base, DLH, (divisor >> 8) & 0xff);
+ serial_out(early_serial_base, LCR, c & ~DLAB);
+}
+
#define DEFAULT_BAUD 9600
static __init void early_serial_init(char *s)
{
- unsigned char c;
unsigned divisor;
- unsigned baud = DEFAULT_BAUD;
+ unsigned long baud = DEFAULT_BAUD;
char *e;
if (*s == ',')
@@ -145,24 +188,124 @@ static __init void early_serial_init(char *s)
s++;
}
- outb(0x3, early_serial_base + LCR); /* 8n1 */
- outb(0, early_serial_base + IER); /* no interrupt */
- outb(0, early_serial_base + FCR); /* no fifo */
- outb(0x3, early_serial_base + MCR); /* DTR + RTS */
+ if (*s) {
+ if (kstrtoul(s, 0, &baud) < 0 || baud == 0)
+ baud = DEFAULT_BAUD;
+ }
+
+ /* Convert from baud to divisor value */
+ divisor = 115200 / baud;
+
+ /* These will always be IO based ports */
+ serial_in = io_serial_in;
+ serial_out = io_serial_out;
+
+ /* Set up the HW */
+ early_serial_hw_init(divisor);
+}
+
+#ifdef CONFIG_PCI
+/*
+ * early_pci_serial_init()
+ *
+ * This function is invoked when the early_printk param starts with "pciserial"
+ * The rest of the param should be ",B:D.F,baud" where B, D & F describe the
+ * location of a PCI device that must be a UART device.
+ */
+static __init void early_pci_serial_init(char *s)
+{
+ unsigned divisor;
+ unsigned long baud = DEFAULT_BAUD;
+ u8 bus, slot, func;
+ uint32_t classcode, bar0;
+ uint16_t cmdreg;
+ char *e;
+
+
+ /*
+ * First, part the param to get the BDF values
+ */
+ if (*s == ',')
+ ++s;
+
+ if (*s == 0)
+ return;
+
+ bus = (u8)simple_strtoul(s, &e, 16);
+ s = e;
+ if (*s != ':')
+ return;
+ ++s;
+ slot = (u8)simple_strtoul(s, &e, 16);
+ s = e;
+ if (*s != '.')
+ return;
+ ++s;
+ func = (u8)simple_strtoul(s, &e, 16);
+ s = e;
+
+ /* A baud might be following */
+ if (*s == ',')
+ s++;
+
+ /*
+ * Second, find the device from the BDF
+ */
+ cmdreg = read_pci_config(bus, slot, func, PCI_COMMAND);
+ classcode = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
+ bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
+
+ /*
+ * Verify it is a UART type device
+ */
+ if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) &&
+ (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) ||
+ (((classcode >> 8) & 0xff) != 0x02)) /* 16550 I/F at BAR0 */
+ return;
+
+ /*
+ * Determine if it is IO or memory mapped
+ */
+ if (bar0 & 0x01) {
+ /* it is IO mapped */
+ serial_in = io_serial_in;
+ serial_out = io_serial_out;
+ early_serial_base = bar0&0xfffffffc;
+ write_pci_config(bus, slot, func, PCI_COMMAND,
+ cmdreg|PCI_COMMAND_IO);
+ } else {
+ /* It is memory mapped - assume 32-bit alignment */
+ serial_in = mem32_serial_in;
+ serial_out = mem32_serial_out;
+ /* WARNING! assuming the address is always in the first 4G */
+ early_serial_base =
+ (unsigned long)early_ioremap(bar0 & 0xfffffff0, 0x10);
+ write_pci_config(bus, slot, func, PCI_COMMAND,
+ cmdreg|PCI_COMMAND_MEMORY);
+ }
+ /*
+ * Lastly, initalize the hardware
+ */
if (*s) {
- baud = simple_strtoul(s, &e, 0);
- if (baud == 0 || s == e)
+ if (strcmp(s, "nocfg") == 0)
+ /* Sometimes, we want to leave the UART alone
+ * and assume the BIOS has set it up correctly.
+ * "nocfg" tells us this is the case, and we
+ * should do no more setup.
+ */
+ return;
+ if (kstrtoul(s, 0, &baud) < 0 || baud == 0)
baud = DEFAULT_BAUD;
}
+ /* Convert from baud to divisor value */
divisor = 115200 / baud;
- c = inb(early_serial_base + LCR);
- outb(c | DLAB, early_serial_base + LCR);
- outb(divisor & 0xff, early_serial_base + DLL);
- outb((divisor >> 8) & 0xff, early_serial_base + DLH);
- outb(c & ~DLAB, early_serial_base + LCR);
+
+ /* Set up the HW */
+ early_serial_hw_init(divisor);
}
+#endif
static struct console early_serial_console = {
.name = "earlyser",
@@ -210,6 +353,13 @@ static int __init setup_early_printk(char *buf)
early_serial_init(buf + 4);
early_console_register(&early_serial_console, keep);
}
+#ifdef CONFIG_PCI
+ if (!strncmp(buf, "pciserial", 9)) {
+ early_pci_serial_init(buf + 9);
+ early_console_register(&early_serial_console, keep);
+ buf += 9; /* Keep from match the above "serial" */
+ }
+#endif
if (!strncmp(buf, "vga", 3) &&
boot_params.screen_info.orig_video_isVGA == 1) {
max_xpos = boot_params.screen_info.orig_video_cols;
@@ -226,11 +376,6 @@ static int __init setup_early_printk(char *buf)
early_console_register(&xenboot_console, keep);
#endif
#ifdef CONFIG_EARLY_PRINTK_INTEL_MID
- if (!strncmp(buf, "mrst", 4)) {
- mrst_early_console_init();
- early_console_register(&early_mrst_console, keep);
- }
-
if (!strncmp(buf, "hsu", 3)) {
hsu_early_console_init(buf + 3);
early_console_register(&early_hsu_console, keep);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 9ebaf63ba182..db13655c3a2a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -143,7 +143,8 @@ ENDPROC(native_usergs_sysret64)
movq \tmp,RSP+\offset(%rsp)
movq $__USER_DS,SS+\offset(%rsp)
movq $__USER_CS,CS+\offset(%rsp)
- movq $-1,RCX+\offset(%rsp)
+ movq RIP+\offset(%rsp),\tmp /* get rip */
+ movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */
movq R11+\offset(%rsp),\tmp /* get eflags */
movq \tmp,EFLAGS+\offset(%rsp)
.endm
@@ -155,27 +156,6 @@ ENDPROC(native_usergs_sysret64)
movq \tmp,R11+\offset(%rsp)
.endm
- .macro FAKE_STACK_FRAME child_rip
- /* push in order ss, rsp, eflags, cs, rip */
- xorl %eax, %eax
- pushq_cfi $__KERNEL_DS /* ss */
- /*CFI_REL_OFFSET ss,0*/
- pushq_cfi %rax /* rsp */
- CFI_REL_OFFSET rsp,0
- pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) /* eflags - interrupts on */
- /*CFI_REL_OFFSET rflags,0*/
- pushq_cfi $__KERNEL_CS /* cs */
- /*CFI_REL_OFFSET cs,0*/
- pushq_cfi \child_rip /* rip */
- CFI_REL_OFFSET rip,0
- pushq_cfi %rax /* orig rax */
- .endm
-
- .macro UNFAKE_STACK_FRAME
- addq $8*6, %rsp
- CFI_ADJUST_CFA_OFFSET -(6*8)
- .endm
-
/*
* initial frame state for interrupts (and exceptions without error code)
*/
@@ -238,51 +218,6 @@ ENDPROC(native_usergs_sysret64)
CFI_REL_OFFSET r15, R15+\offset
.endm
-/* save partial stack frame */
- .macro SAVE_ARGS_IRQ
- cld
- /* start from rbp in pt_regs and jump over */
- movq_cfi rdi, (RDI-RBP)
- movq_cfi rsi, (RSI-RBP)
- movq_cfi rdx, (RDX-RBP)
- movq_cfi rcx, (RCX-RBP)
- movq_cfi rax, (RAX-RBP)
- movq_cfi r8, (R8-RBP)
- movq_cfi r9, (R9-RBP)
- movq_cfi r10, (R10-RBP)
- movq_cfi r11, (R11-RBP)
-
- /* Save rbp so that we can unwind from get_irq_regs() */
- movq_cfi rbp, 0
-
- /* Save previous stack value */
- movq %rsp, %rsi
-
- leaq -RBP(%rsp),%rdi /* arg1 for handler */
- testl $3, CS-RBP(%rsi)
- je 1f
- SWAPGS
- /*
- * irq_count is used to check if a CPU is already on an interrupt stack
- * or not. While this is essentially redundant with preempt_count it is
- * a little cheaper to use a separate counter in the PDA (short of
- * moving irq_enter into assembly, which would be too much work)
- */
-1: incl PER_CPU_VAR(irq_count)
- cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
- CFI_DEF_CFA_REGISTER rsi
-
- /* Store previous stack value */
- pushq %rsi
- CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
- 0x77 /* DW_OP_breg7 */, 0, \
- 0x06 /* DW_OP_deref */, \
- 0x08 /* DW_OP_const1u */, SS+8-RBP, \
- 0x22 /* DW_OP_plus */
- /* We entered an interrupt context - irqs are off: */
- TRACE_IRQS_OFF
- .endm
-
ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
@@ -426,15 +361,12 @@ system_call_fastpath:
* Has incomplete stack frame and undefined top of stack.
*/
ret_from_sys_call:
- movl $_TIF_ALLWORK_MASK,%edi
- /* edi: flagmask */
-sysret_check:
+ testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ jnz int_ret_from_sys_call_fixup /* Go the the slow path */
+
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
- andl %edi,%edx
- jnz sysret_careful
CFI_REMEMBER_STATE
/*
* sysretq will re-enable interrupts:
@@ -448,49 +380,10 @@ sysret_check:
USERGS_SYSRET64
CFI_RESTORE_STATE
- /* Handle reschedules */
- /* edx: work, edi: workmask */
-sysret_careful:
- bt $TIF_NEED_RESCHED,%edx
- jnc sysret_signal
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- pushq_cfi %rdi
- SCHEDULE_USER
- popq_cfi %rdi
- jmp sysret_check
- /* Handle a signal */
-sysret_signal:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
-#ifdef CONFIG_AUDITSYSCALL
- bt $TIF_SYSCALL_AUDIT,%edx
- jc sysret_audit
-#endif
- /*
- * We have a signal, or exit tracing or single-step.
- * These all wind up with the iret return path anyway,
- * so just join that path right now.
- */
+int_ret_from_sys_call_fixup:
FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
- jmp int_check_syscall_exit_work
-
-#ifdef CONFIG_AUDITSYSCALL
- /*
- * Return fast path for syscall audit. Call __audit_syscall_exit()
- * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
- * masked off.
- */
-sysret_audit:
- movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
- cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */
- setbe %al /* 1 if so, 0 if not */
- movzbl %al,%edi /* zero-extend that into %edi */
- call __audit_syscall_exit
- movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
- jmp sysret_check
-#endif /* CONFIG_AUDITSYSCALL */
+ jmp int_ret_from_sys_call
/* Do syscall tracing */
tracesys:
@@ -626,19 +519,6 @@ END(\label)
FORK_LIKE vfork
FIXED_FRAME stub_iopl, sys_iopl
-ENTRY(ptregscall_common)
- DEFAULT_FRAME 1 8 /* offset 8: return address */
- RESTORE_TOP_OF_STACK %r11, 8
- movq_cfi_restore R15+8, r15
- movq_cfi_restore R14+8, r14
- movq_cfi_restore R13+8, r13
- movq_cfi_restore R12+8, r12
- movq_cfi_restore RBP+8, rbp
- movq_cfi_restore RBX+8, rbx
- ret $REST_SKIP /* pop extended registers */
- CFI_ENDPROC
-END(ptregscall_common)
-
ENTRY(stub_execve)
CFI_STARTPROC
addq $8, %rsp
@@ -779,7 +659,48 @@ END(interrupt)
/* reserve pt_regs for scratch regs and rbp */
subq $ORIG_RAX-RBP, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
- SAVE_ARGS_IRQ
+ cld
+ /* start from rbp in pt_regs and jump over */
+ movq_cfi rdi, (RDI-RBP)
+ movq_cfi rsi, (RSI-RBP)
+ movq_cfi rdx, (RDX-RBP)
+ movq_cfi rcx, (RCX-RBP)
+ movq_cfi rax, (RAX-RBP)
+ movq_cfi r8, (R8-RBP)
+ movq_cfi r9, (R9-RBP)
+ movq_cfi r10, (R10-RBP)
+ movq_cfi r11, (R11-RBP)
+
+ /* Save rbp so that we can unwind from get_irq_regs() */
+ movq_cfi rbp, 0
+
+ /* Save previous stack value */
+ movq %rsp, %rsi
+
+ leaq -RBP(%rsp),%rdi /* arg1 for handler */
+ testl $3, CS-RBP(%rsi)
+ je 1f
+ SWAPGS
+ /*
+ * irq_count is used to check if a CPU is already on an interrupt stack
+ * or not. While this is essentially redundant with preempt_count it is
+ * a little cheaper to use a separate counter in the PDA (short of
+ * moving irq_enter into assembly, which would be too much work)
+ */
+1: incl PER_CPU_VAR(irq_count)
+ cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
+ CFI_DEF_CFA_REGISTER rsi
+
+ /* Store previous stack value */
+ pushq %rsi
+ CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
+ 0x77 /* DW_OP_breg7 */, 0, \
+ 0x06 /* DW_OP_deref */, \
+ 0x08 /* DW_OP_const1u */, SS+8-RBP, \
+ 0x22 /* DW_OP_plus */
+ /* We entered an interrupt context - irqs are off: */
+ TRACE_IRQS_OFF
+
call \func
.endm
@@ -831,6 +752,60 @@ retint_swapgs: /* return to user-space */
*/
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_IRETQ
+
+ /*
+ * Try to use SYSRET instead of IRET if we're returning to
+ * a completely clean 64-bit userspace context.
+ */
+ movq (RCX-R11)(%rsp), %rcx
+ cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */
+ jne opportunistic_sysret_failed
+
+ /*
+ * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP. It's not worth
+ * testing for canonicalness exactly -- this check detects any
+ * of the 17 high bits set, which is true for non-canonical
+ * or kernel addresses. (This will pessimize vsyscall=native.
+ * Big deal.)
+ *
+ * If virtual addresses ever become wider, this will need
+ * to be updated to remain correct on both old and new CPUs.
+ */
+ .ifne __VIRTUAL_MASK_SHIFT - 47
+ .error "virtual address width changed -- sysret checks need update"
+ .endif
+ shr $__VIRTUAL_MASK_SHIFT, %rcx
+ jnz opportunistic_sysret_failed
+
+ cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */
+ jne opportunistic_sysret_failed
+
+ movq (R11-ARGOFFSET)(%rsp), %r11
+ cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */
+ jne opportunistic_sysret_failed
+
+ testq $X86_EFLAGS_RF,%r11 /* sysret can't restore RF */
+ jnz opportunistic_sysret_failed
+
+ /* nothing to check for RSP */
+
+ cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */
+ jne opportunistic_sysret_failed
+
+ /*
+ * We win! This label is here just for ease of understanding
+ * perf profiles. Nothing jumps here.
+ */
+irq_return_via_sysret:
+ CFI_REMEMBER_STATE
+ RESTORE_ARGS 1,8,1
+ movq (RSP-RIP)(%rsp),%rsp
+ USERGS_SYSRET64
+ CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
SWAPGS
jmp restore_args
@@ -1048,6 +1023,11 @@ ENTRY(\sym)
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
.if \paranoid
+ .if \paranoid == 1
+ CFI_REMEMBER_STATE
+ testl $3, CS(%rsp) /* If coming from userspace, switch */
+ jnz 1f /* stacks. */
+ .endif
call save_paranoid
.else
call error_entry
@@ -1088,6 +1068,36 @@ ENTRY(\sym)
jmp error_exit /* %ebx: no swapgs flag */
.endif
+ .if \paranoid == 1
+ CFI_RESTORE_STATE
+ /*
+ * Paranoid entry from userspace. Switch stacks and treat it
+ * as a normal entry. This means that paranoid handlers
+ * run in real process context if user_mode(regs).
+ */
+1:
+ call error_entry
+
+ DEFAULT_FRAME 0
+
+ movq %rsp,%rdi /* pt_regs pointer */
+ call sync_regs
+ movq %rax,%rsp /* switch stack */
+
+ movq %rsp,%rdi /* pt_regs pointer */
+
+ .if \has_error_code
+ movq ORIG_RAX(%rsp),%rsi /* get error code */
+ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ .else
+ xorl %esi,%esi /* no error code */
+ .endif
+
+ call \do_sym
+
+ jmp error_exit /* %ebx: no swapgs flag */
+ .endif
+
CFI_ENDPROC
END(\sym)
.endm
@@ -1108,7 +1118,7 @@ idtentry overflow do_overflow has_error_code=0
idtentry bounds do_bounds has_error_code=0
idtentry invalid_op do_invalid_op has_error_code=0
idtentry device_not_available do_device_not_available has_error_code=0
-idtentry double_fault do_double_fault has_error_code=1 paranoid=1
+idtentry double_fault do_double_fault has_error_code=1 paranoid=2
idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
idtentry invalid_TSS do_invalid_TSS has_error_code=1
idtentry segment_not_present do_segment_not_present has_error_code=1
@@ -1289,16 +1299,14 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(
#endif
/*
- * "Paranoid" exit path from exception stack.
- * Paranoid because this is used by NMIs and cannot take
- * any kernel state for granted.
- * We don't do kernel preemption checks here, because only
- * NMI should be common and it does not enable IRQs and
- * cannot get reschedule ticks.
+ * "Paranoid" exit path from exception stack. This is invoked
+ * only on return from non-NMI IST interrupts that came
+ * from kernel space.
*
- * "trace" is 0 for the NMI handler only, because irq-tracing
- * is fundamentally NMI-unsafe. (we cannot change the soft and
- * hard flags at once, atomically)
+ * We may be returning to very strange contexts (e.g. very early
+ * in syscall entry), so checking for preemption here would
+ * be complicated. Fortunately, we there's no good reason
+ * to try to handle preemption here.
*/
/* ebx: no swapgs flag */
@@ -1308,43 +1316,14 @@ ENTRY(paranoid_exit)
TRACE_IRQS_OFF_DEBUG
testl %ebx,%ebx /* swapgs needed? */
jnz paranoid_restore
- testl $3,CS(%rsp)
- jnz paranoid_userspace
-paranoid_swapgs:
TRACE_IRQS_IRETQ 0
SWAPGS_UNSAFE_STACK
RESTORE_ALL 8
- jmp irq_return
+ INTERRUPT_RETURN
paranoid_restore:
TRACE_IRQS_IRETQ_DEBUG 0
RESTORE_ALL 8
- jmp irq_return
-paranoid_userspace:
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%ebx
- andl $_TIF_WORK_MASK,%ebx
- jz paranoid_swapgs
- movq %rsp,%rdi /* &pt_regs */
- call sync_regs
- movq %rax,%rsp /* switch stack for scheduling */
- testl $_TIF_NEED_RESCHED,%ebx
- jnz paranoid_schedule
- movl %ebx,%edx /* arg3: thread flags */
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- xorl %esi,%esi /* arg2: oldset */
- movq %rsp,%rdi /* arg1: &pt_regs */
- call do_notify_resume
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp paranoid_userspace
-paranoid_schedule:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY)
- SCHEDULE_USER
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- jmp paranoid_userspace
+ INTERRUPT_RETURN
CFI_ENDPROC
END(paranoid_exit)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 2142376dc8c6..8b7b0a51e742 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -674,7 +674,7 @@ static inline void *alloc_tramp(unsigned long size)
}
static inline void tramp_free(void *tramp)
{
- module_free(NULL, tramp);
+ module_memfree(tramp);
}
#else
/* Trampolines can only be created if modules are supported */
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index d6c1b9836995..2911ef3a9f1c 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -31,6 +31,7 @@ static void __init i386_default_early_setup(void)
asmlinkage __visible void __init i386_start_kernel(void)
{
+ cr4_init_shadow();
sanitize_boot_params(&boot_params);
/* Call the subarch specific early setup function */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index eda1a865641e..c4f8d4659070 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -27,6 +27,7 @@
#include <asm/bios_ebda.h>
#include <asm/bootparam_utils.h>
#include <asm/microcode.h>
+#include <asm/kasan.h>
/*
* Manage page tables very early on.
@@ -46,7 +47,7 @@ static void __init reset_early_page_tables(void)
next_early_pgt = 0;
- write_cr3(__pa(early_level4_pgt));
+ write_cr3(__pa_nodebug(early_level4_pgt));
}
/* Create a new PMD entry */
@@ -59,7 +60,7 @@ int __init early_make_pgtable(unsigned long address)
pmdval_t pmd, *pmd_p;
/* Invalid address or early pgt is done ? */
- if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
+ if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
return -1;
again:
@@ -155,9 +156,13 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
+ cr4_init_shadow();
+
/* Kill off the identity-map trampoline */
reset_early_page_tables();
+ kasan_map_early_shadow(early_level4_pgt);
+
/* clear bss before set_intr_gate with early_idt_handler */
clear_bss();
@@ -179,6 +184,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
/* set init_level4_pgt kernel high mapping*/
init_level4_pgt[511] = early_level4_pgt[511];
+ kasan_map_early_shadow(init_level4_pgt);
+
x86_64_start_reservations(real_mode_data);
}
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index a468c0a65c42..6fd514d9f69a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -514,8 +514,38 @@ ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
.quad 0x0000000000000000
+#ifdef CONFIG_KASAN
+#define FILL(VAL, COUNT) \
+ .rept (COUNT) ; \
+ .quad (VAL) ; \
+ .endr
+
+NEXT_PAGE(kasan_zero_pte)
+ FILL(kasan_zero_page - __START_KERNEL_map + _KERNPG_TABLE, 512)
+NEXT_PAGE(kasan_zero_pmd)
+ FILL(kasan_zero_pte - __START_KERNEL_map + _KERNPG_TABLE, 512)
+NEXT_PAGE(kasan_zero_pud)
+ FILL(kasan_zero_pmd - __START_KERNEL_map + _KERNPG_TABLE, 512)
+
+#undef FILL
+#endif
+
+
#include "../../x86/xen/xen-head.S"
__PAGE_ALIGNED_BSS
NEXT_PAGE(empty_zero_page)
.skip PAGE_SIZE
+
+#ifdef CONFIG_KASAN
+/*
+ * This page used as early shadow. We don't use empty_zero_page
+ * at early stages, stack instrumentation could write some garbage
+ * to this page.
+ * Latter we reuse it as zero shadow for large ranges of memory
+ * that allowed to access, but not instrumented by kasan
+ * (vmalloc/vmemmap ...).
+ */
+NEXT_PAGE(kasan_zero_page)
+ .skip PAGE_SIZE
+#endif
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 319bcb9372fe..3acbff4716b0 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -168,7 +168,7 @@ static void _hpet_print_config(const char *function, int line)
#define hpet_print_config() \
do { \
if (hpet_verbose) \
- _hpet_print_config(__FUNCTION__, __LINE__); \
+ _hpet_print_config(__func__, __LINE__); \
} while (0)
/*
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 3d5fb509bdeb..7114ba220fd4 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -126,6 +126,8 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
*dr7 |= encode_dr7(i, info->len, info->type);
set_debugreg(*dr7, 7);
+ if (info->mask)
+ set_dr_addr_mask(info->mask, i);
return 0;
}
@@ -161,29 +163,8 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
*dr7 &= ~__encode_dr7(i, info->len, info->type);
set_debugreg(*dr7, 7);
-}
-
-static int get_hbp_len(u8 hbp_len)
-{
- unsigned int len_in_bytes = 0;
-
- switch (hbp_len) {
- case X86_BREAKPOINT_LEN_1:
- len_in_bytes = 1;
- break;
- case X86_BREAKPOINT_LEN_2:
- len_in_bytes = 2;
- break;
- case X86_BREAKPOINT_LEN_4:
- len_in_bytes = 4;
- break;
-#ifdef CONFIG_X86_64
- case X86_BREAKPOINT_LEN_8:
- len_in_bytes = 8;
- break;
-#endif
- }
- return len_in_bytes;
+ if (info->mask)
+ set_dr_addr_mask(0, i);
}
/*
@@ -196,7 +177,7 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp)
struct arch_hw_breakpoint *info = counter_arch_bp(bp);
va = info->address;
- len = get_hbp_len(info->len);
+ len = bp->attr.bp_len;
return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
}
@@ -277,6 +258,8 @@ static int arch_build_bp_info(struct perf_event *bp)
}
/* Len */
+ info->mask = 0;
+
switch (bp->attr.bp_len) {
case HW_BREAKPOINT_LEN_1:
info->len = X86_BREAKPOINT_LEN_1;
@@ -293,11 +276,17 @@ static int arch_build_bp_info(struct perf_event *bp)
break;
#endif
default:
- return -EINVAL;
+ if (!is_power_of_2(bp->attr.bp_len))
+ return -EINVAL;
+ if (!cpu_has_bpext)
+ return -EOPNOTSUPP;
+ info->mask = bp->attr.bp_len - 1;
+ info->len = X86_BREAKPOINT_LEN_1;
}
return 0;
}
+
/*
* Validate the arch-specific HW Breakpoint register settings
*/
@@ -312,11 +301,11 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
if (ret)
return ret;
- ret = -EINVAL;
-
switch (info->len) {
case X86_BREAKPOINT_LEN_1:
align = 0;
+ if (info->mask)
+ align = info->mask;
break;
case X86_BREAKPOINT_LEN_2:
align = 1;
@@ -330,7 +319,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
break;
#endif
default:
- return ret;
+ WARN_ON_ONCE(1);
}
/*
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index a9a4229f6161..d5651fce0b71 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -13,12 +13,26 @@
#include <asm/sigcontext.h>
#include <asm/processor.h>
#include <asm/math_emu.h>
+#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <asm/ptrace.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/user.h>
+static DEFINE_PER_CPU(bool, in_kernel_fpu);
+
+void kernel_fpu_disable(void)
+{
+ WARN_ON(this_cpu_read(in_kernel_fpu));
+ this_cpu_write(in_kernel_fpu, true);
+}
+
+void kernel_fpu_enable(void)
+{
+ this_cpu_write(in_kernel_fpu, false);
+}
+
/*
* Were we in an interrupt that interrupted kernel mode?
*
@@ -33,6 +47,9 @@
*/
static inline bool interrupted_kernel_fpu_idle(void)
{
+ if (this_cpu_read(in_kernel_fpu))
+ return false;
+
if (use_eager_fpu())
return __thread_has_fpu(current);
@@ -73,10 +90,10 @@ void __kernel_fpu_begin(void)
{
struct task_struct *me = current;
+ this_cpu_write(in_kernel_fpu, true);
+
if (__thread_has_fpu(me)) {
- __thread_clear_has_fpu(me);
__save_init_fpu(me);
- /* We do 'stts()' in __kernel_fpu_end() */
} else if (!use_eager_fpu()) {
this_cpu_write(fpu_owner_task, NULL);
clts();
@@ -86,19 +103,16 @@ EXPORT_SYMBOL(__kernel_fpu_begin);
void __kernel_fpu_end(void)
{
- if (use_eager_fpu()) {
- /*
- * For eager fpu, most the time, tsk_used_math() is true.
- * Restore the user math as we are done with the kernel usage.
- * At few instances during thread exit, signal handling etc,
- * tsk_used_math() is false. Those few places will take proper
- * actions, so we don't need to restore the math here.
- */
- if (likely(tsk_used_math(current)))
- math_state_restore();
- } else {
+ struct task_struct *me = current;
+
+ if (__thread_has_fpu(me)) {
+ if (WARN_ON(restore_fpu_checking(me)))
+ drop_init_fpu(me);
+ } else if (!use_eager_fpu()) {
stts();
}
+
+ this_cpu_write(in_kernel_fpu, false);
}
EXPORT_SYMBOL(__kernel_fpu_end);
@@ -180,7 +194,7 @@ void fpu_init(void)
if (cpu_has_xmm)
cr4_mask |= X86_CR4_OSXMMEXCPT;
if (cr4_mask)
- set_in_cr4(cr4_mask);
+ cr4_set_bits(cr4_mask);
cr0 = read_cr0();
cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 6307a0f0cf17..705ef8d48e2d 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -127,7 +127,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_puts(p, " Machine check polls\n");
#endif
#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
- seq_printf(p, "%*s: ", prec, "THR");
+ seq_printf(p, "%*s: ", prec, "HYP");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count);
seq_puts(p, " Hypervisor callback interrupts\n");
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 63ce838e5a54..28d28f5eb8f4 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -69,16 +69,9 @@ static void call_on_stack(void *func, void *stack)
: "memory", "cc", "edx", "ecx", "eax");
}
-/* how to get the current stack pointer from C */
-#define current_stack_pointer ({ \
- unsigned long sp; \
- asm("mov %%esp,%0" : "=g" (sp)); \
- sp; \
-})
-
static inline void *current_stack(void)
{
- return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1));
+ return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1));
}
static inline int
@@ -103,7 +96,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
/* Save the next esp at the bottom of the stack */
prev_esp = (u32 *)irqstk;
- *prev_esp = current_stack_pointer;
+ *prev_esp = current_stack_pointer();
if (unlikely(overflow))
call_on_stack(print_stack_overflow, isp);
@@ -156,7 +149,7 @@ void do_softirq_own_stack(void)
/* Push the previous esp onto the stack */
prev_esp = (u32 *)irqstk;
- *prev_esp = current_stack_pointer;
+ *prev_esp = current_stack_pointer();
call_on_stack(__do_softirq, isp);
}
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index f7e3cd50ece0..98f654d466e5 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -1020,6 +1020,15 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
regs->flags &= ~X86_EFLAGS_IF;
trace_hardirqs_off();
regs->ip = (unsigned long)(jp->entry);
+
+ /*
+ * jprobes use jprobe_return() which skips the normal return
+ * path of the function, and this messes up the accounting of the
+ * function graph tracer to get messed up.
+ *
+ * Pause function graph tracing while performing the jprobe function.
+ */
+ pause_graph_tracing();
return 1;
}
NOKPROBE_SYMBOL(setjmp_pre_handler);
@@ -1048,24 +1057,25 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
u8 *addr = (u8 *) (regs->ip - 1);
struct jprobe *jp = container_of(p, struct jprobe, kp);
+ void *saved_sp = kcb->jprobe_saved_sp;
if ((addr > (u8 *) jprobe_return) &&
(addr < (u8 *) jprobe_return_end)) {
- if (stack_addr(regs) != kcb->jprobe_saved_sp) {
+ if (stack_addr(regs) != saved_sp) {
struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
printk(KERN_ERR
"current sp %p does not match saved sp %p\n",
- stack_addr(regs), kcb->jprobe_saved_sp);
+ stack_addr(regs), saved_sp);
printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
show_regs(saved_regs);
printk(KERN_ERR "Current registers\n");
show_regs(regs);
BUG();
}
+ /* It's OK to start function graph tracing again */
+ unpause_graph_tracing();
*regs = kcb->jprobe_saved_regs;
- memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp),
- kcb->jprobes_stack,
- MIN_STACK_SIZE(kcb->jprobe_saved_sp));
+ memcpy(saved_sp, kcb->jprobes_stack, MIN_STACK_SIZE(saved_sp));
preempt_enable_no_resched();
return 1;
}
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 7c523bbf3dc8..0dd8d089c315 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -322,7 +322,8 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
* Target instructions MUST be relocatable (checked inside)
* This is called when new aggr(opt)probe is allocated or reused.
*/
-int arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
+int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
+ struct kprobe *__unused)
{
u8 *buf;
int ret;
diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c
new file mode 100644
index 000000000000..ff3c3101d003
--- /dev/null
+++ b/arch/x86/kernel/livepatch.c
@@ -0,0 +1,90 @@
+/*
+ * livepatch.c - x86-specific Kernel Live Patching Core
+ *
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2014 SUSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/page_types.h>
+#include <asm/elf.h>
+#include <asm/livepatch.h>
+
+/**
+ * klp_write_module_reloc() - write a relocation in a module
+ * @mod: module in which the section to be modified is found
+ * @type: ELF relocation type (see asm/elf.h)
+ * @loc: address that the relocation should be written to
+ * @value: relocation value (sym address + addend)
+ *
+ * This function writes a relocation to the specified location for
+ * a particular module.
+ */
+int klp_write_module_reloc(struct module *mod, unsigned long type,
+ unsigned long loc, unsigned long value)
+{
+ int ret, numpages, size = 4;
+ bool readonly;
+ unsigned long val;
+ unsigned long core = (unsigned long)mod->module_core;
+ unsigned long core_ro_size = mod->core_ro_size;
+ unsigned long core_size = mod->core_size;
+
+ switch (type) {
+ case R_X86_64_NONE:
+ return 0;
+ case R_X86_64_64:
+ val = value;
+ size = 8;
+ break;
+ case R_X86_64_32:
+ val = (u32)value;
+ break;
+ case R_X86_64_32S:
+ val = (s32)value;
+ break;
+ case R_X86_64_PC32:
+ val = (u32)(value - loc);
+ break;
+ default:
+ /* unsupported relocation type */
+ return -EINVAL;
+ }
+
+ if (loc < core || loc >= core + core_size)
+ /* loc does not point to any symbol inside the module */
+ return -EINVAL;
+
+ if (loc < core + core_ro_size)
+ readonly = true;
+ else
+ readonly = false;
+
+ /* determine if the relocation spans a page boundary */
+ numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2;
+
+ if (readonly)
+ set_memory_rw(loc & PAGE_MASK, numpages);
+
+ ret = probe_kernel_write((void *)loc, &val, size);
+
+ if (readonly)
+ set_memory_ro(loc & PAGE_MASK, numpages);
+
+ return ret;
+}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index e69f9882bf95..d1ac80b72c72 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -24,6 +24,7 @@
#include <linux/fs.h>
#include <linux/string.h>
#include <linux/kernel.h>
+#include <linux/kasan.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/gfp.h>
@@ -83,13 +84,22 @@ static unsigned long int get_module_load_offset(void)
void *module_alloc(unsigned long size)
{
+ void *p;
+
if (PAGE_ALIGN(size) > MODULES_LEN)
return NULL;
- return __vmalloc_node_range(size, 1,
+
+ p = __vmalloc_node_range(size, MODULE_ALIGN,
MODULES_VADDR + get_module_load_offset(),
MODULES_END, GFP_KERNEL | __GFP_HIGHMEM,
- PAGE_KERNEL_EXEC, NUMA_NO_NODE,
+ PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
__builtin_return_address(0));
+ if (p && (kasan_module_alloc(p, size) < 0)) {
+ vfree(p);
+ return NULL;
+ }
+
+ return p;
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index e309cc5c276e..781861cc5ee8 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -78,6 +78,14 @@ u64 perf_reg_abi(struct task_struct *task)
{
return PERF_SAMPLE_REGS_ABI_32;
}
+
+void perf_get_regs_user(struct perf_regs *regs_user,
+ struct pt_regs *regs,
+ struct pt_regs *regs_user_copy)
+{
+ regs_user->regs = task_pt_regs(current);
+ regs_user->abi = perf_reg_abi(current);
+}
#else /* CONFIG_X86_64 */
#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \
(1ULL << PERF_REG_X86_ES) | \
@@ -102,4 +110,86 @@ u64 perf_reg_abi(struct task_struct *task)
else
return PERF_SAMPLE_REGS_ABI_64;
}
+
+void perf_get_regs_user(struct perf_regs *regs_user,
+ struct pt_regs *regs,
+ struct pt_regs *regs_user_copy)
+{
+ struct pt_regs *user_regs = task_pt_regs(current);
+
+ /*
+ * If we're in an NMI that interrupted task_pt_regs setup, then
+ * we can't sample user regs at all. This check isn't really
+ * sufficient, though, as we could be in an NMI inside an interrupt
+ * that happened during task_pt_regs setup.
+ */
+ if (regs->sp > (unsigned long)&user_regs->r11 &&
+ regs->sp <= (unsigned long)(user_regs + 1)) {
+ regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
+ regs_user->regs = NULL;
+ return;
+ }
+
+ /*
+ * RIP, flags, and the argument registers are usually saved.
+ * orig_ax is probably okay, too.
+ */
+ regs_user_copy->ip = user_regs->ip;
+ regs_user_copy->cx = user_regs->cx;
+ regs_user_copy->dx = user_regs->dx;
+ regs_user_copy->si = user_regs->si;
+ regs_user_copy->di = user_regs->di;
+ regs_user_copy->r8 = user_regs->r8;
+ regs_user_copy->r9 = user_regs->r9;
+ regs_user_copy->r10 = user_regs->r10;
+ regs_user_copy->r11 = user_regs->r11;
+ regs_user_copy->orig_ax = user_regs->orig_ax;
+ regs_user_copy->flags = user_regs->flags;
+
+ /*
+ * Don't even try to report the "rest" regs.
+ */
+ regs_user_copy->bx = -1;
+ regs_user_copy->bp = -1;
+ regs_user_copy->r12 = -1;
+ regs_user_copy->r13 = -1;
+ regs_user_copy->r14 = -1;
+ regs_user_copy->r15 = -1;
+
+ /*
+ * For this to be at all useful, we need a reasonable guess for
+ * sp and the ABI. Be careful: we're in NMI context, and we're
+ * considering current to be the current task, so we should
+ * be careful not to look at any other percpu variables that might
+ * change during context switches.
+ */
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) &&
+ task_thread_info(current)->status & TS_COMPAT) {
+ /* Easy case: we're in a compat syscall. */
+ regs_user->abi = PERF_SAMPLE_REGS_ABI_32;
+ regs_user_copy->sp = user_regs->sp;
+ regs_user_copy->cs = user_regs->cs;
+ regs_user_copy->ss = user_regs->ss;
+ } else if (user_regs->orig_ax != -1) {
+ /*
+ * We're probably in a 64-bit syscall.
+ * Warning: this code is severely racy. At least it's better
+ * than just blindly copying user_regs.
+ */
+ regs_user->abi = PERF_SAMPLE_REGS_ABI_64;
+ regs_user_copy->sp = this_cpu_read(old_rsp);
+ regs_user_copy->cs = __USER_CS;
+ regs_user_copy->ss = __USER_DS;
+ regs_user_copy->cx = -1; /* usually contains garbage */
+ } else {
+ /* We're probably in an interrupt or exception. */
+ regs_user->abi = user_64bit_mode(user_regs) ?
+ PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
+ regs_user_copy->sp = user_regs->sp;
+ regs_user_copy->cs = user_regs->cs;
+ regs_user_copy->ss = user_regs->ss;
+ }
+
+ regs_user->regs = regs_user_copy;
+}
#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/kernel/pmc_atom.c
index 0ee5025e0fa4..d66a4fe6caee 100644
--- a/arch/x86/kernel/pmc_atom.c
+++ b/arch/x86/kernel/pmc_atom.c
@@ -25,8 +25,6 @@
#include <asm/pmc_atom.h>
-#define DRIVER_NAME KBUILD_MODNAME
-
struct pmc_dev {
u32 base_addr;
void __iomem *regmap;
@@ -38,12 +36,12 @@ struct pmc_dev {
static struct pmc_dev pmc_device;
static u32 acpi_base_addr;
-struct pmc_dev_map {
+struct pmc_bit_map {
const char *name;
u32 bit_mask;
};
-static const struct pmc_dev_map dev_map[] = {
+static const struct pmc_bit_map dev_map[] = {
{"0 - LPSS1_F0_DMA", BIT_LPSS1_F0_DMA},
{"1 - LPSS1_F1_PWM1", BIT_LPSS1_F1_PWM1},
{"2 - LPSS1_F2_PWM2", BIT_LPSS1_F2_PWM2},
@@ -82,6 +80,27 @@ static const struct pmc_dev_map dev_map[] = {
{"35 - DFX", BIT_DFX},
};
+static const struct pmc_bit_map pss_map[] = {
+ {"0 - GBE", PMC_PSS_BIT_GBE},
+ {"1 - SATA", PMC_PSS_BIT_SATA},
+ {"2 - HDA", PMC_PSS_BIT_HDA},
+ {"3 - SEC", PMC_PSS_BIT_SEC},
+ {"4 - PCIE", PMC_PSS_BIT_PCIE},
+ {"5 - LPSS", PMC_PSS_BIT_LPSS},
+ {"6 - LPE", PMC_PSS_BIT_LPE},
+ {"7 - DFX", PMC_PSS_BIT_DFX},
+ {"8 - USH_CTRL", PMC_PSS_BIT_USH_CTRL},
+ {"9 - USH_SUS", PMC_PSS_BIT_USH_SUS},
+ {"10 - USH_VCCS", PMC_PSS_BIT_USH_VCCS},
+ {"11 - USH_VCCA", PMC_PSS_BIT_USH_VCCA},
+ {"12 - OTG_CTRL", PMC_PSS_BIT_OTG_CTRL},
+ {"13 - OTG_VCCS", PMC_PSS_BIT_OTG_VCCS},
+ {"14 - OTG_VCCA_CLK", PMC_PSS_BIT_OTG_VCCA_CLK},
+ {"15 - OTG_VCCA", PMC_PSS_BIT_OTG_VCCA},
+ {"16 - USB", PMC_PSS_BIT_USB},
+ {"17 - USB_SUS", PMC_PSS_BIT_USB_SUS},
+};
+
static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset)
{
return readl(pmc->regmap + reg_offset);
@@ -169,6 +188,32 @@ static const struct file_operations pmc_dev_state_ops = {
.release = single_release,
};
+static int pmc_pss_state_show(struct seq_file *s, void *unused)
+{
+ struct pmc_dev *pmc = s->private;
+ u32 pss = pmc_reg_read(pmc, PMC_PSS);
+ int pss_index;
+
+ for (pss_index = 0; pss_index < ARRAY_SIZE(pss_map); pss_index++) {
+ seq_printf(s, "Island: %-32s\tState: %s\n",
+ pss_map[pss_index].name,
+ pss_map[pss_index].bit_mask & pss ? "Off" : "On");
+ }
+ return 0;
+}
+
+static int pmc_pss_state_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pmc_pss_state_show, inode->i_private);
+}
+
+static const struct file_operations pmc_pss_state_ops = {
+ .open = pmc_pss_state_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
static int pmc_sleep_tmr_show(struct seq_file *s, void *unused)
{
struct pmc_dev *pmc = s->private;
@@ -202,11 +247,7 @@ static const struct file_operations pmc_sleep_tmr_ops = {
static void pmc_dbgfs_unregister(struct pmc_dev *pmc)
{
- if (!pmc->dbgfs_dir)
- return;
-
debugfs_remove_recursive(pmc->dbgfs_dir);
- pmc->dbgfs_dir = NULL;
}
static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
@@ -217,19 +258,29 @@ static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
if (!dir)
return -ENOMEM;
+ pmc->dbgfs_dir = dir;
+
f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO,
dir, pmc, &pmc_dev_state_ops);
if (!f) {
- dev_err(&pdev->dev, "dev_states register failed\n");
+ dev_err(&pdev->dev, "dev_state register failed\n");
goto err;
}
+
+ f = debugfs_create_file("pss_state", S_IFREG | S_IRUGO,
+ dir, pmc, &pmc_pss_state_ops);
+ if (!f) {
+ dev_err(&pdev->dev, "pss_state register failed\n");
+ goto err;
+ }
+
f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO,
dir, pmc, &pmc_sleep_tmr_ops);
if (!f) {
dev_err(&pdev->dev, "sleep_state register failed\n");
goto err;
}
- pmc->dbgfs_dir = dir;
+
return 0;
err:
pmc_dbgfs_unregister(pmc);
@@ -292,7 +343,6 @@ MODULE_DEVICE_TABLE(pci, pmc_pci_ids);
static int __init pmc_atom_init(void)
{
- int err = -ENODEV;
struct pci_dev *pdev = NULL;
const struct pci_device_id *ent;
@@ -306,14 +356,11 @@ static int __init pmc_atom_init(void)
*/
for_each_pci_dev(pdev) {
ent = pci_match_id(pmc_pci_ids, pdev);
- if (ent) {
- err = pmc_setup_dev(pdev);
- goto out;
- }
+ if (ent)
+ return pmc_setup_dev(pdev);
}
/* Device not found. */
-out:
- return err;
+ return -ENODEV;
}
module_init(pmc_atom_init);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e127ddaa2d5a..046e2d620bbe 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@
#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
+#include <asm/tlbflush.h>
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -141,7 +142,7 @@ void flush_thread(void)
static void hard_disable_TSC(void)
{
- write_cr4(read_cr4() | X86_CR4_TSD);
+ cr4_set_bits(X86_CR4_TSD);
}
void disable_TSC(void)
@@ -158,7 +159,7 @@ void disable_TSC(void)
static void hard_enable_TSC(void)
{
- write_cr4(read_cr4() & ~X86_CR4_TSD);
+ cr4_clear_bits(X86_CR4_TSD);
}
static void enable_TSC(void)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8f3ebfe710d0..603c4f99cb5a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -101,7 +101,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
- cr4 = read_cr4_safe();
+ cr4 = __read_cr4_safe();
printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
cr0, cr2, cr3, cr4);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 5a2c02913af3..67fcc43577d2 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -93,7 +93,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
- cr4 = read_cr4();
+ cr4 = __read_cr4();
printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
fs, fsindex, gs, gsindex, shadowgs);
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index ca9622a25e95..cd9685235df9 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -49,11 +49,11 @@ int mach_set_rtc_mmss(const struct timespec *now)
retval = set_rtc_time(&tm);
if (retval)
printk(KERN_ERR "%s: RTC write failed with error %d\n",
- __FUNCTION__, retval);
+ __func__, retval);
} else {
printk(KERN_ERR
"%s: Invalid RTC value: write of %lx to RTC failed\n",
- __FUNCTION__, nowtime);
+ __func__, nowtime);
retval = -EINVAL;
}
return retval;
@@ -170,7 +170,7 @@ static struct platform_device rtc_device = {
static __init int add_rtc_cmos(void)
{
#ifdef CONFIG_PNP
- static const char * const const ids[] __initconst =
+ static const char * const ids[] __initconst =
{ "PNP0b00", "PNP0b01", "PNP0b02", };
struct pnp_dev *dev;
struct pnp_id *id;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ab4734e5411d..0a2421cca01f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -89,6 +89,7 @@
#include <asm/cacheflush.h>
#include <asm/processor.h>
#include <asm/bugs.h>
+#include <asm/kasan.h>
#include <asm/vsyscall.h>
#include <asm/cpu.h>
@@ -431,15 +432,13 @@ static void __init parse_setup_data(void)
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
- u32 data_len, map_len, data_type;
+ u32 data_len, data_type;
- map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
- (u64)sizeof(struct setup_data));
- data = early_memremap(pa_data, map_len);
+ data = early_memremap(pa_data, sizeof(*data));
data_len = data->len + sizeof(struct setup_data);
data_type = data->type;
pa_next = data->next;
- early_iounmap(data, map_len);
+ early_iounmap(data, sizeof(*data));
switch (data_type) {
case SETUP_E820_EXT:
@@ -1176,9 +1175,11 @@ void __init setup_arch(char **cmdline_p)
x86_init.paging.pagetable_init();
+ kasan_init();
+
if (boot_cpu_data.cpuid_level >= 0) {
/* A CPU has %cr4 if and only if it has CPUID */
- mmu_cr4_features = read_cr4();
+ mmu_cr4_features = __read_cr4();
if (trampoline_cr4_features)
*trampoline_cr4_features = mmu_cr4_features;
}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index ed37a768d0fc..e5042463c1bc 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -69,7 +69,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
unsigned int err = 0;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
get_user_try {
@@ -740,12 +740,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
user_exit();
-#ifdef CONFIG_X86_MCE
- /* notify userspace of pending MCEs */
- if (thread_info_flags & _TIF_MCE_NOTIFY)
- mce_notify_process();
-#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
-
if (thread_info_flags & _TIF_UPROBE)
uprobe_notify_resume(regs);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6d7022c683e3..febc6aabc72e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -73,7 +73,6 @@
#include <asm/setup.h>
#include <asm/uv/uv.h>
#include <linux/mc146818rtc.h>
-#include <asm/smpboot_hooks.h>
#include <asm/i8259.h>
#include <asm/realmode.h>
#include <asm/misc.h>
@@ -104,6 +103,43 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
atomic_t init_deasserted;
+static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ CMOS_WRITE(0xa, 0xf);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ local_flush_tlb();
+ pr_debug("1.\n");
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
+ start_eip >> 4;
+ pr_debug("2.\n");
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
+ start_eip & 0xf;
+ pr_debug("3.\n");
+}
+
+static inline void smpboot_restore_warm_reset_vector(void)
+{
+ unsigned long flags;
+
+ /*
+ * Install writable page 0 entry to set BIOS data area.
+ */
+ local_flush_tlb();
+
+ /*
+ * Paranoid: Set warm reset code and vector here back
+ * to default values.
+ */
+ spin_lock_irqsave(&rtc_lock, flags);
+ CMOS_WRITE(0, 0xf);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+
+ *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
+}
+
/*
* Report back to the Boot Processor during boot time or to the caller processor
* during CPU online.
@@ -136,8 +172,7 @@ static void smp_callin(void)
* CPU, first the APIC. (this is probably redundant on most
* boards)
*/
- setup_local_APIC();
- end_local_APIC_setup();
+ apic_ap_setup();
/*
* Need to setup vector mappings before we enable interrupts.
@@ -955,9 +990,12 @@ void arch_disable_smp_support(void)
*/
static __init void disable_smp(void)
{
+ pr_info("SMP disabled\n");
+
+ disable_ioapic_support();
+
init_cpu_present(cpumask_of(0));
init_cpu_possible(cpumask_of(0));
- smpboot_clear_io_apic_irqs();
if (smp_found_config)
physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
@@ -967,6 +1005,13 @@ static __init void disable_smp(void)
cpumask_set_cpu(0, cpu_core_mask(0));
}
+enum {
+ SMP_OK,
+ SMP_NO_CONFIG,
+ SMP_NO_APIC,
+ SMP_FORCE_UP,
+};
+
/*
* Various sanity checks.
*/
@@ -1014,10 +1059,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
if (!smp_found_config && !acpi_lapic) {
preempt_enable();
pr_notice("SMP motherboard not detected\n");
- disable_smp();
- if (APIC_init_uniprocessor())
- pr_notice("Local APIC not detected. Using dummy APIC emulation.\n");
- return -1;
+ return SMP_NO_CONFIG;
}
/*
@@ -1041,9 +1083,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
boot_cpu_physical_apicid);
pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n");
}
- smpboot_clear_io_apic();
- disable_ioapic_support();
- return -1;
+ return SMP_NO_APIC;
}
verify_local_APIC();
@@ -1053,15 +1093,10 @@ static int __init smp_sanity_check(unsigned max_cpus)
*/
if (!max_cpus) {
pr_info("SMP mode deactivated\n");
- smpboot_clear_io_apic();
-
- connect_bsp_APIC();
- setup_local_APIC();
- bsp_end_local_APIC_setup();
- return -1;
+ return SMP_FORCE_UP;
}
- return 0;
+ return SMP_OK;
}
static void __init smp_cpu_index_default(void)
@@ -1101,10 +1136,21 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
}
set_cpu_sibling_map(0);
- if (smp_sanity_check(max_cpus) < 0) {
- pr_info("SMP disabled\n");
+ switch (smp_sanity_check(max_cpus)) {
+ case SMP_NO_CONFIG:
disable_smp();
+ if (APIC_init_uniprocessor())
+ pr_notice("Local APIC not detected. Using dummy APIC emulation.\n");
return;
+ case SMP_NO_APIC:
+ disable_smp();
+ return;
+ case SMP_FORCE_UP:
+ disable_smp();
+ apic_bsp_setup(false);
+ return;
+ case SMP_OK:
+ break;
}
default_setup_apic_routing();
@@ -1115,33 +1161,10 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
/* Or can we switch back to PIC here? */
}
- connect_bsp_APIC();
-
- /*
- * Switch from PIC to APIC mode.
- */
- setup_local_APIC();
-
- if (x2apic_mode)
- cpu0_logical_apicid = apic_read(APIC_LDR);
- else
- cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
-
- /*
- * Enable IO APIC before setting up error vector
- */
- if (!skip_ioapic_setup && nr_ioapics)
- enable_IO_APIC();
-
- bsp_end_local_APIC_setup();
- smpboot_setup_io_apic();
- /*
- * Set up local APIC timer on boot CPU.
- */
+ cpu0_logical_apicid = apic_bsp_setup(false);
pr_info("CPU%d: ", 0);
print_cpu_info(&cpu_data(0));
- x86_init.timers.setup_percpu_clockev();
if (is_uv_system())
uv_system_init();
@@ -1177,9 +1200,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
nmi_selftest();
impress_friends();
-#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
-#endif
mtrr_aps_init();
}
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 4e942f31b1a7..7fc5e843f247 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -29,7 +29,28 @@ static int get_free_idx(void)
static bool tls_desc_okay(const struct user_desc *info)
{
- if (LDT_empty(info))
+ /*
+ * For historical reasons (i.e. no one ever documented how any
+ * of the segmentation APIs work), user programs can and do
+ * assume that a struct user_desc that's all zeros except for
+ * entry_number means "no segment at all". This never actually
+ * worked. In fact, up to Linux 3.19, a struct user_desc like
+ * this would create a 16-bit read-write segment with base and
+ * limit both equal to zero.
+ *
+ * That was close enough to "no segment at all" until we
+ * hardened this function to disallow 16-bit TLS segments. Fix
+ * it up by interpreting these zeroed segments the way that they
+ * were almost certainly intended to be interpreted.
+ *
+ * The correct way to ask for "no segment at all" is to specify
+ * a user_desc that satisfies LDT_empty. To keep everything
+ * working, we accept both.
+ *
+ * Note that there's a similar kludge in modify_ldt -- look at
+ * the distinction between modes 1 and 0x11.
+ */
+ if (LDT_empty(info) || LDT_zero(info))
return true;
/*
@@ -71,7 +92,7 @@ static void set_tls_desc(struct task_struct *p, int idx,
cpu = get_cpu();
while (n-- > 0) {
- if (LDT_empty(info))
+ if (LDT_empty(info) || LDT_zero(info))
desc->a = desc->b = 0;
else
fill_ldt(desc, info);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 88900e288021..9d2073e2ecc9 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -108,6 +108,88 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
preempt_count_dec();
}
+enum ctx_state ist_enter(struct pt_regs *regs)
+{
+ enum ctx_state prev_state;
+
+ if (user_mode_vm(regs)) {
+ /* Other than that, we're just an exception. */
+ prev_state = exception_enter();
+ } else {
+ /*
+ * We might have interrupted pretty much anything. In
+ * fact, if we're a machine check, we can even interrupt
+ * NMI processing. We don't want in_nmi() to return true,
+ * but we need to notify RCU.
+ */
+ rcu_nmi_enter();
+ prev_state = IN_KERNEL; /* the value is irrelevant. */
+ }
+
+ /*
+ * We are atomic because we're on the IST stack (or we're on x86_32,
+ * in which case we still shouldn't schedule).
+ *
+ * This must be after exception_enter(), because exception_enter()
+ * won't do anything if in_interrupt() returns true.
+ */
+ preempt_count_add(HARDIRQ_OFFSET);
+
+ /* This code is a bit fragile. Test it. */
+ rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
+
+ return prev_state;
+}
+
+void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
+{
+ /* Must be before exception_exit. */
+ preempt_count_sub(HARDIRQ_OFFSET);
+
+ if (user_mode_vm(regs))
+ return exception_exit(prev_state);
+ else
+ rcu_nmi_exit();
+}
+
+/**
+ * ist_begin_non_atomic() - begin a non-atomic section in an IST exception
+ * @regs: regs passed to the IST exception handler
+ *
+ * IST exception handlers normally cannot schedule. As a special
+ * exception, if the exception interrupted userspace code (i.e.
+ * user_mode_vm(regs) would return true) and the exception was not
+ * a double fault, it can be safe to schedule. ist_begin_non_atomic()
+ * begins a non-atomic section within an ist_enter()/ist_exit() region.
+ * Callers are responsible for enabling interrupts themselves inside
+ * the non-atomic section, and callers must call is_end_non_atomic()
+ * before ist_exit().
+ */
+void ist_begin_non_atomic(struct pt_regs *regs)
+{
+ BUG_ON(!user_mode_vm(regs));
+
+ /*
+ * Sanity check: we need to be on the normal thread stack. This
+ * will catch asm bugs and any attempt to use ist_preempt_enable
+ * from double_fault.
+ */
+ BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack))
+ & ~(THREAD_SIZE - 1)) != 0);
+
+ preempt_count_sub(HARDIRQ_OFFSET);
+}
+
+/**
+ * ist_end_non_atomic() - begin a non-atomic section in an IST exception
+ *
+ * Ends a non-atomic section started with ist_begin_non_atomic().
+ */
+void ist_end_non_atomic(void)
+{
+ preempt_count_add(HARDIRQ_OFFSET);
+}
+
static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
struct pt_regs *regs, long error_code)
@@ -251,6 +333,8 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
* end up promoting it to a doublefault. In that case, modify
* the stack to make it look like we just entered the #GP
* handler from user space, similar to bad_iret.
+ *
+ * No need for ist_enter here because we don't use RCU.
*/
if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
regs->cs == __KERNEL_CS &&
@@ -263,12 +347,12 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
regs->ip = (unsigned long)general_protection;
regs->sp = (unsigned long)&normal_regs->orig_ax;
+
return;
}
#endif
- exception_enter();
- /* Return not checked because double check cannot be ignored */
+ ist_enter(regs); /* Discard prev_state because we won't return. */
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
tsk->thread.error_code = error_code;
@@ -434,7 +518,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
if (poke_int3_handler(regs))
return;
- prev_state = exception_enter();
+ prev_state = ist_enter(regs);
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
@@ -460,33 +544,20 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
preempt_conditional_cli(regs);
debug_stack_usage_dec();
exit:
- exception_exit(prev_state);
+ ist_exit(regs, prev_state);
}
NOKPROBE_SYMBOL(do_int3);
#ifdef CONFIG_X86_64
/*
- * Help handler running on IST stack to switch back to user stack
- * for scheduling or signal handling. The actual stack switch is done in
- * entry.S
+ * Help handler running on IST stack to switch off the IST stack if the
+ * interrupted code was in user mode. The actual stack switch is done in
+ * entry_64.S
*/
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
{
- struct pt_regs *regs = eregs;
- /* Did already sync */
- if (eregs == (struct pt_regs *)eregs->sp)
- ;
- /* Exception from user space */
- else if (user_mode(eregs))
- regs = task_pt_regs(current);
- /*
- * Exception from kernel and interrupts are enabled. Move to
- * kernel process stack.
- */
- else if (eregs->flags & X86_EFLAGS_IF)
- regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
- if (eregs != regs)
- *regs = *eregs;
+ struct pt_regs *regs = task_pt_regs(current);
+ *regs = *eregs;
return regs;
}
NOKPROBE_SYMBOL(sync_regs);
@@ -554,7 +625,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
unsigned long dr6;
int si_code;
- prev_state = exception_enter();
+ prev_state = ist_enter(regs);
get_debugreg(dr6, 6);
@@ -629,7 +700,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
debug_stack_usage_dec();
exit:
- exception_exit(prev_state);
+ ist_exit(regs, prev_state);
}
NOKPROBE_SYMBOL(do_debug);
@@ -788,18 +859,16 @@ void math_state_restore(void)
local_irq_disable();
}
+ /* Avoid __kernel_fpu_begin() right after __thread_fpu_begin() */
+ kernel_fpu_disable();
__thread_fpu_begin(tsk);
-
- /*
- * Paranoid restore. send a SIGSEGV if we fail to restore the state.
- */
if (unlikely(restore_fpu_checking(tsk))) {
drop_init_fpu(tsk);
force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
- return;
+ } else {
+ tsk->thread.fpu_counter++;
}
-
- tsk->thread.fpu_counter++;
+ kernel_fpu_enable();
}
EXPORT_SYMBOL_GPL(math_state_restore);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index b7e50bba3bbb..505449700e0c 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -617,7 +617,7 @@ static unsigned long quick_pit_calibrate(void)
goto success;
}
}
- pr_err("Fast TSC calibration failed\n");
+ pr_info("Fast TSC calibration failed\n");
return 0;
success:
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 040681928e9d..37d8fa4438f0 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -50,13 +50,19 @@ EXPORT_SYMBOL(csum_partial);
#undef memset
#undef memmove
+extern void *__memset(void *, int, __kernel_size_t);
+extern void *__memcpy(void *, const void *, __kernel_size_t);
+extern void *__memmove(void *, const void *, __kernel_size_t);
extern void *memset(void *, int, __kernel_size_t);
extern void *memcpy(void *, const void *, __kernel_size_t);
-extern void *__memcpy(void *, const void *, __kernel_size_t);
+extern void *memmove(void *, const void *, __kernel_size_t);
+
+EXPORT_SYMBOL(__memset);
+EXPORT_SYMBOL(__memcpy);
+EXPORT_SYMBOL(__memmove);
EXPORT_SYMBOL(memset);
EXPORT_SYMBOL(memcpy);
-EXPORT_SYMBOL(__memcpy);
EXPORT_SYMBOL(memmove);
#ifndef CONFIG_DEBUG_VIRTUAL
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 0de1fae2bdf0..34f66e58a896 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -12,6 +12,7 @@
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/sigframe.h>
+#include <asm/tlbflush.h>
#include <asm/xcr.h>
/*
@@ -453,7 +454,7 @@ static void prepare_fx_sw_frame(void)
*/
static inline void xstate_enable(void)
{
- set_in_cr4(X86_CR4_OSXSAVE);
+ cr4_set_bits(X86_CR4_OSXSAVE);
xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
}