aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/acpi/boot.c10
-rw-r--r--arch/x86/kernel/amd_nb.c8
-rw-r--r--arch/x86/kernel/aperture_64.c4
-rw-r--r--arch/x86/kernel/apic/Makefile1
-rw-r--r--arch/x86/kernel/apic/apic.c113
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c9
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c294
-rw-r--r--arch/x86/kernel/apic/io_apic.c6
-rw-r--r--arch/x86/kernel/check.c34
-rw-r--r--arch/x86/kernel/cpu/amd.c9
-rw-r--r--arch/x86/kernel/cpu/centaur.c2
-rw-r--r--arch/x86/kernel/cpu/common.c14
-rw-r--r--arch/x86/kernel/cpu/cpu.h5
-rw-r--r--arch/x86/kernel/cpu/intel.c2
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c25
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c34
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c194
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c18
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c94
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event.c262
-rw-r--r--arch/x86/kernel/cpu/perf_event.h51
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c88
-rw-r--r--arch/x86/kernel/cpu/powerflags.c3
-rw-r--r--arch/x86/kernel/cpu/proc.c4
-rw-r--r--arch/x86/kernel/cpuid.c2
-rw-r--r--arch/x86/kernel/e820.c58
-rw-r--r--arch/x86/kernel/early_printk.c2
-rw-r--r--arch/x86/kernel/entry_32.S4
-rw-r--r--arch/x86/kernel/entry_64.S31
-rw-r--r--arch/x86/kernel/head.c2
-rw-r--r--arch/x86/kernel/head32.c7
-rw-r--r--arch/x86/kernel/head64.c7
-rw-r--r--arch/x86/kernel/hpet.c8
-rw-r--r--arch/x86/kernel/irq.c11
-rw-r--r--arch/x86/kernel/irqinit.c2
-rw-r--r--arch/x86/kernel/jump_label.c2
-rw-r--r--arch/x86/kernel/kvm.c181
-rw-r--r--arch/x86/kernel/microcode_amd.c209
-rw-r--r--arch/x86/kernel/microcode_core.c69
-rw-r--r--arch/x86/kernel/mpparse.c12
-rw-r--r--arch/x86/kernel/msr.c2
-rw-r--r--arch/x86/kernel/pci-dma.c11
-rw-r--r--arch/x86/kernel/process.c2
-rw-r--r--arch/x86/kernel/process_32.c6
-rw-r--r--arch/x86/kernel/process_64.c15
-rw-r--r--arch/x86/kernel/ptrace.c3
-rw-r--r--arch/x86/kernel/setup.c21
-rw-r--r--arch/x86/kernel/smpboot.c3
-rw-r--r--arch/x86/kernel/trampoline.c4
-rw-r--r--arch/x86/kernel/traps.c7
-rw-r--r--arch/x86/kernel/tsc.c6
-rw-r--r--arch/x86/kernel/tsc_sync.c4
-rw-r--r--arch/x86/kernel/vsyscall_64.c77
-rw-r--r--arch/x86/kernel/x86_init.c1
57 files changed, 1312 insertions, 749 deletions
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4558f0d0822d..ce664f33ea8e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -219,6 +219,8 @@ static int __init
acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
{
struct acpi_madt_local_x2apic *processor = NULL;
+ int apic_id;
+ u8 enabled;
processor = (struct acpi_madt_local_x2apic *)header;
@@ -227,6 +229,8 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
acpi_table_print_madt_entry(header);
+ apic_id = processor->local_apic_id;
+ enabled = processor->lapic_flags & ACPI_MADT_ENABLED;
#ifdef CONFIG_X86_X2APIC
/*
* We need to register disabled CPU as well to permit
@@ -235,8 +239,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
* to not preallocating memory for all NR_CPUS
* when we use CPU hotplug.
*/
- acpi_register_lapic(processor->local_apic_id, /* APIC ID */
- processor->lapic_flags & ACPI_MADT_ENABLED);
+ if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled)
+ printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
+ else
+ acpi_register_lapic(apic_id, enabled);
#else
printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
#endif
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 4c39baa8facc..013c1810ce72 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -123,16 +123,14 @@ int amd_get_subcaches(int cpu)
{
struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
unsigned int mask;
- int cuid = 0;
+ int cuid;
if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
return 0;
pci_read_config_dword(link, 0x1d4, &mask);
-#ifdef CONFIG_SMP
cuid = cpu_data(cpu).compute_unit_id;
-#endif
return (mask >> (4 * cuid)) & 0xf;
}
@@ -141,7 +139,7 @@ int amd_set_subcaches(int cpu, int mask)
static unsigned int reset, ban;
struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
unsigned int reg;
- int cuid = 0;
+ int cuid;
if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
return -EINVAL;
@@ -159,9 +157,7 @@ int amd_set_subcaches(int cpu, int mask)
pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
}
-#ifdef CONFIG_SMP
cuid = cpu_data(cpu).compute_unit_id;
-#endif
mask <<= 4 * cuid;
mask |= (0xf ^ (1 << cuid)) << 26;
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 3d2661ca6542..6e76c191a835 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -88,13 +88,13 @@ static u32 __init allocate_aperture(void)
*/
addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
aper_size, aper_size);
- if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) {
+ if (!addr || addr + aper_size > GART_MAX_ADDR) {
printk(KERN_ERR
"Cannot allocate aperture memory hole (%lx,%uK)\n",
addr, aper_size>>10);
return 0;
}
- memblock_x86_reserve_range(addr, addr + aper_size, "aperture64");
+ memblock_reserve(addr, aper_size);
/*
* Kmemleak should not scan this block as it may not be mapped via the
* kernel direct mapping.
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 767fd04f2843..0ae0323b1f9c 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_SMP) += ipi.o
ifeq ($(CONFIG_X86_64),y)
# APIC probe will depend on the listing order here
+obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o
obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f98d84caf94c..2eec05b6d1b8 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -146,16 +146,26 @@ __setup("apicpmtimer", setup_apicpmtimer);
int x2apic_mode;
#ifdef CONFIG_X86_X2APIC
/* x2apic enabled before OS handover */
-static int x2apic_preenabled;
+int x2apic_preenabled;
+static int x2apic_disabled;
+static int nox2apic;
static __init int setup_nox2apic(char *str)
{
if (x2apic_enabled()) {
- pr_warning("Bios already enabled x2apic, "
- "can't enforce nox2apic");
- return 0;
- }
+ int apicid = native_apic_msr_read(APIC_ID);
+
+ if (apicid >= 255) {
+ pr_warning("Apicid: %08x, cannot enforce nox2apic\n",
+ apicid);
+ return 0;
+ }
+
+ pr_warning("x2apic already enabled. will disable it\n");
+ } else
+ setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+
+ nox2apic = 1;
- setup_clear_cpu_cap(X86_FEATURE_X2APIC);
return 0;
}
early_param("nox2apic", setup_nox2apic);
@@ -250,6 +260,7 @@ u32 native_safe_apic_wait_icr_idle(void)
send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
if (!send_status)
break;
+ inc_irq_stat(icr_read_retry_count);
udelay(100);
} while (timeout++ < 1000);
@@ -876,8 +887,8 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
* Besides, if we don't timer interrupts ignore the global
* interrupt lock, which is the WrongThing (tm) to do.
*/
- exit_idle();
irq_enter();
+ exit_idle();
local_apic_timer_interrupt();
irq_exit();
@@ -1431,6 +1442,45 @@ void __init bsp_end_local_APIC_setup(void)
}
#ifdef CONFIG_X86_X2APIC
+/*
+ * Need to disable xapic and x2apic at the same time and then enable xapic mode
+ */
+static inline void __disable_x2apic(u64 msr)
+{
+ wrmsrl(MSR_IA32_APICBASE,
+ msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
+ wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
+}
+
+static __init void disable_x2apic(void)
+{
+ u64 msr;
+
+ if (!cpu_has_x2apic)
+ return;
+
+ rdmsrl(MSR_IA32_APICBASE, msr);
+ if (msr & X2APIC_ENABLE) {
+ u32 x2apic_id = read_apic_id();
+
+ if (x2apic_id >= 255)
+ panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
+
+ pr_info("Disabling x2apic\n");
+ __disable_x2apic(msr);
+
+ if (nox2apic) {
+ clear_cpu_cap(&cpu_data(0), X86_FEATURE_X2APIC);
+ setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+ }
+
+ x2apic_disabled = 1;
+ x2apic_mode = 0;
+
+ register_lapic_address(mp_lapic_addr);
+ }
+}
+
void check_x2apic(void)
{
if (x2apic_enabled()) {
@@ -1441,15 +1491,20 @@ void check_x2apic(void)
void enable_x2apic(void)
{
- int msr, msr2;
+ u64 msr;
+
+ rdmsrl(MSR_IA32_APICBASE, msr);
+ if (x2apic_disabled) {
+ __disable_x2apic(msr);
+ return;
+ }
if (!x2apic_mode)
return;
- rdmsr(MSR_IA32_APICBASE, msr, msr2);
if (!(msr & X2APIC_ENABLE)) {
printk_once(KERN_INFO "Enabling x2apic\n");
- wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2);
+ wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
}
}
#endif /* CONFIG_X86_X2APIC */
@@ -1486,25 +1541,34 @@ void __init enable_IR_x2apic(void)
ret = save_ioapic_entries();
if (ret) {
pr_info("Saving IO-APIC state failed: %d\n", ret);
- goto out;
+ return;
}
local_irq_save(flags);
legacy_pic->mask_all();
mask_ioapic_entries();
+ if (x2apic_preenabled && nox2apic)
+ disable_x2apic();
+
if (dmar_table_init_ret)
ret = -1;
else
ret = enable_IR();
+ if (!x2apic_supported())
+ goto skip_x2apic;
+
if (ret < 0) {
/* IR is required if there is APIC ID > 255 even when running
* under KVM
*/
if (max_physical_apicid > 255 ||
- !hypervisor_x2apic_available())
- goto nox2apic;
+ !hypervisor_x2apic_available()) {
+ if (x2apic_preenabled)
+ disable_x2apic();
+ goto skip_x2apic;
+ }
/*
* without IR all CPUs can be addressed by IOAPIC/MSI
* only in physical mode
@@ -1512,8 +1576,10 @@ void __init enable_IR_x2apic(void)
x2apic_force_phys();
}
- if (ret == IRQ_REMAP_XAPIC_MODE)
- goto nox2apic;
+ if (ret == IRQ_REMAP_XAPIC_MODE) {
+ pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n");
+ goto skip_x2apic;
+ }
x2apic_enabled = 1;
@@ -1523,22 +1589,11 @@ void __init enable_IR_x2apic(void)
pr_info("Enabled x2apic\n");
}
-nox2apic:
+skip_x2apic:
if (ret < 0) /* IR enabling failed */
restore_ioapic_entries();
legacy_pic->restore_mask();
local_irq_restore(flags);
-
-out:
- if (x2apic_enabled || !x2apic_supported())
- return;
-
- if (x2apic_preenabled)
- panic("x2apic: enabled by BIOS but kernel init failed.");
- else if (ret == IRQ_REMAP_XAPIC_MODE)
- pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n");
- else if (ret < 0)
- pr_info("x2apic not enabled, IRQ remapping init failed\n");
}
#ifdef CONFIG_X86_64
@@ -1809,8 +1864,8 @@ void smp_spurious_interrupt(struct pt_regs *regs)
{
u32 v;
- exit_idle();
irq_enter();
+ exit_idle();
/*
* Check if this really is a spurious interrupt and ACK it
* if it is a vectored one. Just in case...
@@ -1846,8 +1901,8 @@ void smp_error_interrupt(struct pt_regs *regs)
"Illegal register address", /* APIC Error Bit 7 */
};
- exit_idle();
irq_enter();
+ exit_idle();
/* First tickle the hardware, only then report what went on. -- REW */
v0 = apic_read(APIC_ESR);
apic_write(APIC_ESR, 0);
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index f7a41e4cae47..8c3cdded6f2b 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -62,7 +62,7 @@ static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
* an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
* document number 292116). So here it goes...
*/
-static void flat_init_apic_ldr(void)
+void flat_init_apic_ldr(void)
{
unsigned long val;
unsigned long num, id;
@@ -171,9 +171,14 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
return initial_apic_id >> index_msb;
}
+static int flat_probe(void)
+{
+ return 1;
+}
+
static struct apic apic_flat = {
.name = "flat",
- .probe = NULL,
+ .probe = flat_probe,
.acpi_madt_oem_check = flat_acpi_madt_oem_check,
.apic_id_registered = flat_apic_id_registered,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
new file mode 100644
index 000000000000..09d3d8c1cd99
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -0,0 +1,294 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Numascale NumaConnect-Specific APIC Code
+ *
+ * Copyright (C) 2011 Numascale AS. All rights reserved.
+ *
+ * Send feedback to <support@numascale.com>
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/hardirq.h>
+#include <linux/delay.h>
+
+#include <asm/numachip/numachip_csr.h>
+#include <asm/smp.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+#include <asm/apic_flat_64.h>
+
+static int numachip_system __read_mostly;
+
+static struct apic apic_numachip __read_mostly;
+
+static unsigned int get_apic_id(unsigned long x)
+{
+ unsigned long value;
+ unsigned int id;
+
+ rdmsrl(MSR_FAM10H_NODE_ID, value);
+ id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U);
+
+ return id;
+}
+
+static unsigned long set_apic_id(unsigned int id)
+{
+ unsigned long x;
+
+ x = ((id & 0xffU) << 24);
+ return x;
+}
+
+static unsigned int read_xapic_id(void)
+{
+ return get_apic_id(apic_read(APIC_ID));
+}
+
+static int numachip_apic_id_registered(void)
+{
+ return physid_isset(read_xapic_id(), phys_cpu_present_map);
+}
+
+static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+ return initial_apic_id >> index_msb;
+}
+
+static const struct cpumask *numachip_target_cpus(void)
+{
+ return cpu_online_mask;
+}
+
+static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
+}
+
+static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+{
+ union numachip_csr_g3_ext_irq_gen int_gen;
+
+ int_gen.s._destination_apic_id = phys_apicid;
+ int_gen.s._vector = 0;
+ int_gen.s._msgtype = APIC_DM_INIT >> 8;
+ int_gen.s._index = 0;
+
+ write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
+
+ int_gen.s._msgtype = APIC_DM_STARTUP >> 8;
+ int_gen.s._vector = start_rip >> 12;
+
+ write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
+
+ atomic_set(&init_deasserted, 1);
+ return 0;
+}
+
+static void numachip_send_IPI_one(int cpu, int vector)
+{
+ union numachip_csr_g3_ext_irq_gen int_gen;
+ int apicid = per_cpu(x86_cpu_to_apicid, cpu);
+
+ int_gen.s._destination_apic_id = apicid;
+ int_gen.s._vector = vector;
+ int_gen.s._msgtype = (vector == NMI_VECTOR ? APIC_DM_NMI : APIC_DM_FIXED) >> 8;
+ int_gen.s._index = 0;
+
+ write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
+}
+
+static void numachip_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask)
+ numachip_send_IPI_one(cpu, vector);
+}
+
+static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask,
+ int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask) {
+ if (cpu != this_cpu)
+ numachip_send_IPI_one(cpu, vector);
+ }
+}
+
+static void numachip_send_IPI_allbutself(int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ for_each_online_cpu(cpu) {
+ if (cpu != this_cpu)
+ numachip_send_IPI_one(cpu, vector);
+ }
+}
+
+static void numachip_send_IPI_all(int vector)
+{
+ numachip_send_IPI_mask(cpu_online_mask, vector);
+}
+
+static void numachip_send_IPI_self(int vector)
+{
+ __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
+}
+
+static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ cpu = cpumask_first(cpumask);
+ if (likely((unsigned)cpu < nr_cpu_ids))
+ return per_cpu(x86_cpu_to_apicid, cpu);
+
+ return BAD_APICID;
+}
+
+static unsigned int
+numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ for_each_cpu_and(cpu, cpumask, andmask) {
+ if (cpumask_test_cpu(cpu, cpu_online_mask))
+ break;
+ }
+ return per_cpu(x86_cpu_to_apicid, cpu);
+}
+
+static int __init numachip_probe(void)
+{
+ return apic == &apic_numachip;
+}
+
+static void __init map_csrs(void)
+{
+ printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n",
+ NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1);
+ init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
+
+ printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n",
+ NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1);
+ init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE);
+}
+
+static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
+{
+ c->phys_proc_id = node;
+ per_cpu(cpu_llc_id, smp_processor_id()) = node;
+}
+
+static int __init numachip_system_init(void)
+{
+ unsigned int val;
+
+ if (!numachip_system)
+ return 0;
+
+ x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
+
+ map_csrs();
+
+ val = read_lcsr(CSR_G0_NODE_IDS);
+ printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val);
+
+ return 0;
+}
+early_initcall(numachip_system_init);
+
+static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if (!strncmp(oem_id, "NUMASC", 6)) {
+ numachip_system = 1;
+ return 1;
+ }
+
+ return 0;
+}
+
+static struct apic apic_numachip __refconst = {
+
+ .name = "NumaConnect system",
+ .probe = numachip_probe,
+ .acpi_madt_oem_check = numachip_acpi_madt_oem_check,
+ .apic_id_registered = numachip_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* physical */
+
+ .target_cpus = numachip_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = 0,
+ .check_apicid_used = NULL,
+ .check_apicid_present = NULL,
+
+ .vector_allocation_domain = numachip_vector_allocation_domain,
+ .init_apic_ldr = flat_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = numachip_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = 0xffU << 24,
+
+ .cpu_mask_to_apicid = numachip_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = numachip_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = numachip_send_IPI_mask,
+ .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = numachip_send_IPI_allbutself,
+ .send_IPI_all = numachip_send_IPI_all,
+ .send_IPI_self = numachip_send_IPI_self,
+
+ .wakeup_secondary_cpu = numachip_wakeup_secondary,
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+ .wait_for_init_deassert = NULL,
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = NULL, /* REMRD not supported */
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+};
+apic_driver(apic_numachip);
+
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 6d939d7847e2..fb072754bc1d 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2421,8 +2421,8 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
unsigned vector, me;
ack_APIC_irq();
- exit_idle();
irq_enter();
+ exit_idle();
me = smp_processor_id();
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
@@ -2948,6 +2948,10 @@ static inline void __init check_timer(void)
}
local_irq_disable();
apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
+ if (x2apic_preenabled)
+ apic_printk(APIC_QUIET, KERN_INFO
+ "Perhaps problem with the pre-enabled x2apic mode\n"
+ "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
"report. Then try booting with the 'noapic' option.\n");
out:
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 452932d34730..5da1269e8ddc 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -62,7 +62,8 @@ early_param("memory_corruption_check_size", set_corruption_check_size);
void __init setup_bios_corruption_check(void)
{
- u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
+ phys_addr_t start, end;
+ u64 i;
if (memory_corruption_check == -1) {
memory_corruption_check =
@@ -82,28 +83,23 @@ void __init setup_bios_corruption_check(void)
corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
- while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
- u64 size;
- addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE);
+ for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
+ start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
+ PAGE_SIZE, corruption_check_size);
+ end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
+ PAGE_SIZE, corruption_check_size);
+ if (start >= end)
+ continue;
- if (addr == MEMBLOCK_ERROR)
- break;
-
- if (addr >= corruption_check_size)
- break;
-
- if ((addr + size) > corruption_check_size)
- size = corruption_check_size - addr;
-
- memblock_x86_reserve_range(addr, addr + size, "SCAN RAM");
- scan_areas[num_scan_areas].addr = addr;
- scan_areas[num_scan_areas].size = size;
- num_scan_areas++;
+ memblock_reserve(start, end - start);
+ scan_areas[num_scan_areas].addr = start;
+ scan_areas[num_scan_areas].size = end - start;
/* Assume we've already mapped this early memory */
- memset(__va(addr), 0, size);
+ memset(__va(start), 0, end - start);
- addr += size;
+ if (++num_scan_areas >= MAX_SCAN_AREAS)
+ break;
}
if (num_scan_areas)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 0bab2b18bb20..f4773f4aae35 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -148,7 +148,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_SMP
/* calling is from identify_secondary_cpu() ? */
if (!c->cpu_index)
return;
@@ -192,7 +191,6 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
valid_k7:
;
-#endif
}
static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
@@ -353,6 +351,13 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
if (node == NUMA_NO_NODE)
node = per_cpu(cpu_llc_id, cpu);
+ /*
+ * If core numbers are inconsistent, it's likely a multi-fabric platform,
+ * so invoke platform-specific handler
+ */
+ if (c->phys_proc_id != node)
+ x86_cpuinit.fixup_cpu_id(c, node);
+
if (!node_online(node)) {
/*
* Two possibilities here:
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index e58d978e0758..159103c0b1f4 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -278,7 +278,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
}
#ifdef CONFIG_X86_32
/* Cyrix III family needs CX8 & PGE explicitly enabled. */
- if (c->x86_model >= 6 && c->x86_model <= 9) {
+ if (c->x86_model >= 6 && c->x86_model <= 13) {
rdmsr(MSR_VIA_FCR, lo, hi);
lo |= (1<<1 | 1<<7);
wrmsr(MSR_VIA_FCR, lo, hi);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index aa003b13a831..850f2963a420 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -676,9 +676,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
if (this_cpu->c_early_init)
this_cpu->c_early_init(c);
-#ifdef CONFIG_SMP
c->cpu_index = 0;
-#endif
filter_cpuid_features(c, false);
setup_smep(c);
@@ -764,10 +762,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
c->apicid = c->initial_apicid;
# endif
#endif
-
-#ifdef CONFIG_X86_HT
c->phys_proc_id = c->initial_apicid;
-#endif
}
setup_smep(c);
@@ -1141,6 +1136,15 @@ static void dbg_restore_debug_regs(void)
#endif /* ! CONFIG_KGDB */
/*
+ * Prints an error where the NUMA and configured core-number mismatch and the
+ * platform didn't override this to fix it up
+ */
+void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node)
+{
+ pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id);
+}
+
+/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
* and IDT. We reload them nevertheless, this function acts as a
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 1b22dcc51af4..8bacc7826fb3 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -1,5 +1,4 @@
#ifndef ARCH_X86_CPU_H
-
#define ARCH_X86_CPU_H
struct cpu_model_info {
@@ -35,6 +34,4 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
extern void get_cpu_cap(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
-extern void get_cpu_cap(struct cpuinfo_x86 *c);
-
-#endif
+#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 523131213f08..3e6ff6cbf42a 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -181,7 +181,6 @@ static void __cpuinit trap_init_f00f_bug(void)
static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_SMP
/* calling is from identify_secondary_cpu() ? */
if (!c->cpu_index)
return;
@@ -198,7 +197,6 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
"with B stepping processors.\n");
}
-#endif
}
static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index a3b0811693c9..6b45e5e7a901 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -844,8 +844,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
#include <linux/kobject.h>
#include <linux/sysfs.h>
-
-extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
+#include <linux/cpu.h>
/* pointer to kobject for cpuX/cache */
static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
@@ -1073,9 +1072,9 @@ err_out:
static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
/* Add/Remove cache interface for CPU device */
-static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
+static int __cpuinit cache_add_dev(struct device *dev)
{
- unsigned int cpu = sys_dev->id;
+ unsigned int cpu = dev->id;
unsigned long i, j;
struct _index_kobject *this_object;
struct _cpuid4_info *this_leaf;
@@ -1087,7 +1086,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
&ktype_percpu_entry,
- &sys_dev->kobj, "%s", "cache");
+ &dev->kobj, "%s", "cache");
if (retval < 0) {
cpuid4_cache_sysfs_exit(cpu);
return retval;
@@ -1124,9 +1123,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
return 0;
}
-static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
+static void __cpuinit cache_remove_dev(struct device *dev)
{
- unsigned int cpu = sys_dev->id;
+ unsigned int cpu = dev->id;
unsigned long i;
if (per_cpu(ici_cpuid4_info, cpu) == NULL)
@@ -1145,17 +1144,17 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
- struct sys_device *sys_dev;
+ struct device *dev;
- sys_dev = get_cpu_sysdev(cpu);
+ dev = get_cpu_device(cpu);
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- cache_add_dev(sys_dev);
+ cache_add_dev(dev);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- cache_remove_dev(sys_dev);
+ cache_remove_dev(dev);
break;
}
return NOTIFY_OK;
@@ -1174,9 +1173,9 @@ static int __cpuinit cache_sysfs_init(void)
for_each_online_cpu(i) {
int err;
- struct sys_device *sys_dev = get_cpu_sysdev(i);
+ struct device *dev = get_cpu_device(i);
- err = cache_add_dev(sys_dev);
+ err = cache_add_dev(dev);
if (err)
return err;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 319882ef848d..fc4beb393577 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -17,6 +17,7 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/fs.h>
+#include <linux/preempt.h>
#include <linux/smp.h>
#include <linux/notifier.h>
#include <linux/kdebug.h>
@@ -92,6 +93,18 @@ static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
return NMI_HANDLED;
}
+static void mce_irq_ipi(void *info)
+{
+ int cpu = smp_processor_id();
+ struct mce *m = &__get_cpu_var(injectm);
+
+ if (cpumask_test_cpu(cpu, mce_inject_cpumask) &&
+ m->inject_flags & MCJ_EXCEPTION) {
+ cpumask_clear_cpu(cpu, mce_inject_cpumask);
+ raise_exception(m, NULL);
+ }
+}
+
/* Inject mce on current CPU */
static int raise_local(void)
{
@@ -139,9 +152,10 @@ static void raise_mce(struct mce *m)
return;
#ifdef CONFIG_X86_LOCAL_APIC
- if (m->inject_flags & MCJ_NMI_BROADCAST) {
+ if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) {
unsigned long start;
int cpu;
+
get_online_cpus();
cpumask_copy(mce_inject_cpumask, cpu_online_mask);
cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
@@ -151,13 +165,25 @@ static void raise_mce(struct mce *m)
MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
cpumask_clear_cpu(cpu, mce_inject_cpumask);
}
- if (!cpumask_empty(mce_inject_cpumask))
- apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR);
+ if (!cpumask_empty(mce_inject_cpumask)) {
+ if (m->inject_flags & MCJ_IRQ_BRAODCAST) {
+ /*
+ * don't wait because mce_irq_ipi is necessary
+ * to be sync with following raise_local
+ */
+ preempt_disable();
+ smp_call_function_many(mce_inject_cpumask,
+ mce_irq_ipi, NULL, 0);
+ preempt_enable();
+ } else if (m->inject_flags & MCJ_NMI_BROADCAST)
+ apic->send_IPI_mask(mce_inject_cpumask,
+ NMI_VECTOR);
+ }
start = jiffies;
while (!cpumask_empty(mce_inject_cpumask)) {
if (!time_before(jiffies, start + 2*HZ)) {
printk(KERN_ERR
- "Timeout waiting for mce inject NMI %lx\n",
+ "Timeout waiting for mce inject %lx\n",
*cpumask_bits(mce_inject_cpumask));
break;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index fefcc69ee8b5..ed44c8a65858 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,4 +1,4 @@
-#include <linux/sysdev.h>
+#include <linux/device.h>
#include <asm/mce.h>
enum severity_level {
@@ -17,7 +17,7 @@ enum severity_level {
struct mce_bank {
u64 ctl; /* subevents to enable */
unsigned char init; /* initialise bank? */
- struct sysdev_attribute attr; /* sysdev attribute */
+ struct device_attribute attr; /* device attribute */
char attrname[ATTR_LEN]; /* attribute name */
};
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 2af127d4c3d1..f22a9f7f6390 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -19,7 +19,7 @@
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
@@ -95,13 +95,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;
-/*
- * CPU/chipset specific EDAC code can register a notifier call here to print
- * MCE errors in a human-readable form.
- */
-ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
-EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
-
/* MCA banks polled by the period polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
@@ -109,6 +102,12 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
static DEFINE_PER_CPU(struct work_struct, mce_work);
+/*
+ * CPU/chipset specific EDAC code can register a notifier call here to print
+ * MCE errors in a human-readable form.
+ */
+ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
@@ -119,9 +118,7 @@ void mce_setup(struct mce *m)
m->time = get_seconds();
m->cpuvendor = boot_cpu_data.x86_vendor;
m->cpuid = cpuid_eax(1);
-#ifdef CONFIG_SMP
m->socketid = cpu_data(m->extcpu).phys_proc_id;
-#endif
m->apicid = cpu_data(m->extcpu).initial_apicid;
rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}
@@ -190,6 +187,57 @@ void mce_log(struct mce *mce)
set_bit(0, &mce_need_notify);
}
+static void drain_mcelog_buffer(void)
+{
+ unsigned int next, i, prev = 0;
+
+ next = rcu_dereference_check_mce(mcelog.next);
+
+ do {
+ struct mce *m;
+
+ /* drain what was logged during boot */
+ for (i = prev; i < next; i++) {
+ unsigned long start = jiffies;
+ unsigned retries = 1;
+
+ m = &mcelog.entry[i];
+
+ while (!m->finished) {
+ if (time_after_eq(jiffies, start + 2*retries))
+ retries++;
+
+ cpu_relax();
+
+ if (!m->finished && retries >= 4) {
+ pr_err("MCE: skipping error being logged currently!\n");
+ break;
+ }
+ }
+ smp_rmb();
+ atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
+ }
+
+ memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
+ prev = next;
+ next = cmpxchg(&mcelog.next, prev, 0);
+ } while (next != prev);
+}
+
+
+void mce_register_decode_chain(struct notifier_block *nb)
+{
+ atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
+ drain_mcelog_buffer();
+}
+EXPORT_SYMBOL_GPL(mce_register_decode_chain);
+
+void mce_unregister_decode_chain(struct notifier_block *nb)
+{
+ atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
+
static void print_mce(struct mce *m)
{
int ret = 0;
@@ -1770,7 +1818,7 @@ static struct syscore_ops mce_syscore_ops = {
};
/*
- * mce_sysdev: Sysfs support
+ * mce_device: Sysfs support
*/
static void mce_cpu_restart(void *data)
@@ -1806,27 +1854,28 @@ static void mce_enable_ce(void *all)
__mcheck_cpu_init_timer();
}
-static struct sysdev_class mce_sysdev_class = {
+static struct bus_type mce_subsys = {
.name = "machinecheck",
+ .dev_name = "machinecheck",
};
-DEFINE_PER_CPU(struct sys_device, mce_sysdev);
+DEFINE_PER_CPU(struct device, mce_device);
__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
-static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
+static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
return container_of(attr, struct mce_bank, attr);
}
-static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t show_bank(struct device *s, struct device_attribute *attr,
char *buf)
{
return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}
-static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t set_bank(struct device *s, struct device_attribute *attr,
const char *buf, size_t size)
{
u64 new;
@@ -1841,14 +1890,14 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
}
static ssize_t
-show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
{
strcpy(buf, mce_helper);
strcat(buf, "\n");
return strlen(mce_helper) + 1;
}
-static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
const char *buf, size_t siz)
{
char *p;
@@ -1863,8 +1912,8 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
return strlen(mce_helper) + !!p;
}
-static ssize_t set_ignore_ce(struct sys_device *s,
- struct sysdev_attribute *attr,
+static ssize_t set_ignore_ce(struct device *s,
+ struct device_attribute *attr,
const char *buf, size_t size)
{
u64 new;
@@ -1887,8 +1936,8 @@ static ssize_t set_ignore_ce(struct sys_device *s,
return size;
}
-static ssize_t set_cmci_disabled(struct sys_device *s,
- struct sysdev_attribute *attr,
+static ssize_t set_cmci_disabled(struct device *s,
+ struct device_attribute *attr,
const char *buf, size_t size)
{
u64 new;
@@ -1910,108 +1959,107 @@ static ssize_t set_cmci_disabled(struct sys_device *s,
return size;
}
-static ssize_t store_int_with_restart(struct sys_device *s,
- struct sysdev_attribute *attr,
+static ssize_t store_int_with_restart(struct device *s,
+ struct device_attribute *attr,
const char *buf, size_t size)
{
- ssize_t ret = sysdev_store_int(s, attr, buf, size);
+ ssize_t ret = device_store_int(s, attr, buf, size);
mce_restart();
return ret;
}
-static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
-static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
-static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
-static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
+static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
+static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
+static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
-static struct sysdev_ext_attribute attr_check_interval = {
- _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
- store_int_with_restart),
+static struct dev_ext_attribute dev_attr_check_interval = {
+ __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
&check_interval
};
-static struct sysdev_ext_attribute attr_ignore_ce = {
- _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
+static struct dev_ext_attribute dev_attr_ignore_ce = {
+ __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
&mce_ignore_ce
};
-static struct sysdev_ext_attribute attr_cmci_disabled = {
- _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
+static struct dev_ext_attribute dev_attr_cmci_disabled = {
+ __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
&mce_cmci_disabled
};
-static struct sysdev_attribute *mce_sysdev_attrs[] = {
- &attr_tolerant.attr,
- &attr_check_interval.attr,
- &attr_trigger,
- &attr_monarch_timeout.attr,
- &attr_dont_log_ce.attr,
- &attr_ignore_ce.attr,
- &attr_cmci_disabled.attr,
+static struct device_attribute *mce_device_attrs[] = {
+ &dev_attr_tolerant.attr,
+ &dev_attr_check_interval.attr,
+ &dev_attr_trigger,
+ &dev_attr_monarch_timeout.attr,
+ &dev_attr_dont_log_ce.attr,
+ &dev_attr_ignore_ce.attr,
+ &dev_attr_cmci_disabled.attr,
NULL
};
-static cpumask_var_t mce_sysdev_initialized;
+static cpumask_var_t mce_device_initialized;
-/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
-static __cpuinit int mce_sysdev_create(unsigned int cpu)
+/* Per cpu device init. All of the cpus still share the same ctrl bank: */
+static __cpuinit int mce_device_create(unsigned int cpu)
{
- struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
+ struct device *dev = &per_cpu(mce_device, cpu);
int err;
int i, j;
if (!mce_available(&boot_cpu_data))
return -EIO;
- memset(&sysdev->kobj, 0, sizeof(struct kobject));
- sysdev->id = cpu;
- sysdev->cls = &mce_sysdev_class;
+ memset(&dev->kobj, 0, sizeof(struct kobject));
+ dev->id = cpu;
+ dev->bus = &mce_subsys;
- err = sysdev_register(sysdev);
+ err = device_register(dev);
if (err)
return err;
- for (i = 0; mce_sysdev_attrs[i]; i++) {
- err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]);
+ for (i = 0; mce_device_attrs[i]; i++) {
+ err = device_create_file(dev, mce_device_attrs[i]);
if (err)
goto error;
}
for (j = 0; j < banks; j++) {
- err = sysdev_create_file(sysdev, &mce_banks[j].attr);
+ err = device_create_file(dev, &mce_banks[j].attr);
if (err)
goto error2;
}
- cpumask_set_cpu(cpu, mce_sysdev_initialized);
+ cpumask_set_cpu(cpu, mce_device_initialized);
return 0;
error2:
while (--j >= 0)
- sysdev_remove_file(sysdev, &mce_banks[j].attr);
+ device_remove_file(dev, &mce_banks[j].attr);
error:
while (--i >= 0)
- sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
+ device_remove_file(dev, mce_device_attrs[i]);
- sysdev_unregister(sysdev);
+ device_unregister(dev);
return err;
}
-static __cpuinit void mce_sysdev_remove(unsigned int cpu)
+static __cpuinit void mce_device_remove(unsigned int cpu)
{
- struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
+ struct device *dev = &per_cpu(mce_device, cpu);
int i;
- if (!cpumask_test_cpu(cpu, mce_sysdev_initialized))
+ if (!cpumask_test_cpu(cpu, mce_device_initialized))
return;
- for (i = 0; mce_sysdev_attrs[i]; i++)
- sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
+ for (i = 0; mce_device_attrs[i]; i++)
+ device_remove_file(dev, mce_device_attrs[i]);
for (i = 0; i < banks; i++)
- sysdev_remove_file(sysdev, &mce_banks[i].attr);
+ device_remove_file(dev, &mce_banks[i].attr);
- sysdev_unregister(sysdev);
- cpumask_clear_cpu(cpu, mce_sysdev_initialized);
+ device_unregister(dev);
+ cpumask_clear_cpu(cpu, mce_device_initialized);
}
/* Make sure there are no machine checks on offlined CPUs. */
@@ -2061,7 +2109,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- mce_sysdev_create(cpu);
+ mce_device_create(cpu);
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
break;
@@ -2069,7 +2117,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_DEAD_FROZEN:
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
- mce_sysdev_remove(cpu);
+ mce_device_remove(cpu);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
@@ -2103,7 +2151,7 @@ static __init void mce_init_banks(void)
for (i = 0; i < banks; i++) {
struct mce_bank *b = &mce_banks[i];
- struct sysdev_attribute *a = &b->attr;
+ struct device_attribute *a = &b->attr;
sysfs_attr_init(&a->attr);
a->attr.name = b->attrname;
@@ -2123,16 +2171,16 @@ static __init int mcheck_init_device(void)
if (!mce_available(&boot_cpu_data))
return -EIO;
- zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL);
+ zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
mce_init_banks();
- err = sysdev_class_register(&mce_sysdev_class);
+ err = subsys_system_register(&mce_subsys, NULL);
if (err)
return err;
for_each_online_cpu(i) {
- err = mce_sysdev_create(i);
+ err = mce_device_create(i);
if (err)
return err;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f5474218cffe..ba0b94a7e204 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -17,7 +17,6 @@
#include <linux/notifier.h>
#include <linux/kobject.h>
#include <linux/percpu.h>
-#include <linux/sysdev.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
@@ -64,11 +63,9 @@ struct threshold_bank {
};
static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
-#ifdef CONFIG_SMP
static unsigned char shared_bank[NR_BANKS] = {
0, 0, 0, 0, 1
};
-#endif
static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
@@ -202,10 +199,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
if (!block)
per_cpu(bank_map, cpu) |= (1 << bank);
-#ifdef CONFIG_SMP
if (shared_bank[bank] && c->cpu_core_id)
break;
-#endif
+
offset = setup_APIC_mce(offset,
(high & MASK_LVTOFF_HI) >> 20);
@@ -531,7 +527,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
sprintf(name, "threshold_bank%i", bank);
-#ifdef CONFIG_SMP
if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
i = cpumask_first(cpu_llc_shared_mask(cpu));
@@ -548,7 +543,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (!b)
goto out;
- err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj,
+ err = sysfs_create_link(&per_cpu(mce_device, cpu).kobj,
b->kobj, name);
if (err)
goto out;
@@ -558,7 +553,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
goto out;
}
-#endif
b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
if (!b) {
@@ -571,7 +565,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
goto out;
}
- b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj);
+ b->kobj = kobject_create_and_add(name, &per_cpu(mce_device, cpu).kobj);
if (!b->kobj)
goto out_free;
@@ -591,7 +585,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (i == cpu)
continue;
- err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj,
+ err = sysfs_create_link(&per_cpu(mce_device, i).kobj,
b->kobj, name);
if (err)
goto out;
@@ -669,7 +663,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
#ifdef CONFIG_SMP
/* sibling symlink */
if (shared_bank[bank] && b->blocks->cpu != cpu) {
- sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name);
+ sysfs_remove_link(&per_cpu(mce_device, cpu).kobj, name);
per_cpu(threshold_banks, cpu)[bank] = NULL;
return;
@@ -681,7 +675,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
if (i == cpu)
continue;
- sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name);
+ sysfs_remove_link(&per_cpu(mce_device, i).kobj, name);
per_cpu(threshold_banks, i)[bank] = NULL;
}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 787e06c84ea6..67bb17a37a0a 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -19,7 +19,6 @@
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/export.h>
-#include <linux/sysdev.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/smp.h>
@@ -69,16 +68,16 @@ static atomic_t therm_throt_en = ATOMIC_INIT(0);
static u32 lvtthmr_init __read_mostly;
#ifdef CONFIG_SYSFS
-#define define_therm_throt_sysdev_one_ro(_name) \
- static SYSDEV_ATTR(_name, 0444, \
- therm_throt_sysdev_show_##_name, \
+#define define_therm_throt_device_one_ro(_name) \
+ static DEVICE_ATTR(_name, 0444, \
+ therm_throt_device_show_##_name, \
NULL) \
-#define define_therm_throt_sysdev_show_func(event, name) \
+#define define_therm_throt_device_show_func(event, name) \
\
-static ssize_t therm_throt_sysdev_show_##event##_##name( \
- struct sys_device *dev, \
- struct sysdev_attribute *attr, \
+static ssize_t therm_throt_device_show_##event##_##name( \
+ struct device *dev, \
+ struct device_attribute *attr, \
char *buf) \
{ \
unsigned int cpu = dev->id; \
@@ -95,20 +94,20 @@ static ssize_t therm_throt_sysdev_show_##event##_##name( \
return ret; \
}
-define_therm_throt_sysdev_show_func(core_throttle, count);
-define_therm_throt_sysdev_one_ro(core_throttle_count);
+define_therm_throt_device_show_func(core_throttle, count);
+define_therm_throt_device_one_ro(core_throttle_count);
-define_therm_throt_sysdev_show_func(core_power_limit, count);
-define_therm_throt_sysdev_one_ro(core_power_limit_count);
+define_therm_throt_device_show_func(core_power_limit, count);
+define_therm_throt_device_one_ro(core_power_limit_count);
-define_therm_throt_sysdev_show_func(package_throttle, count);
-define_therm_throt_sysdev_one_ro(package_throttle_count);
+define_therm_throt_device_show_func(package_throttle, count);
+define_therm_throt_device_one_ro(package_throttle_count);
-define_therm_throt_sysdev_show_func(package_power_limit, count);
-define_therm_throt_sysdev_one_ro(package_power_limit_count);
+define_therm_throt_device_show_func(package_power_limit, count);
+define_therm_throt_device_one_ro(package_power_limit_count);
static struct attribute *thermal_throttle_attrs[] = {
- &attr_core_throttle_count.attr,
+ &dev_attr_core_throttle_count.attr,
NULL
};
@@ -223,36 +222,36 @@ static int thresh_event_valid(int event)
#ifdef CONFIG_SYSFS
/* Add/Remove thermal_throttle interface for CPU device: */
-static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
+static __cpuinit int thermal_throttle_add_dev(struct device *dev,
unsigned int cpu)
{
int err;
struct cpuinfo_x86 *c = &cpu_data(cpu);
- err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
+ err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
if (err)
return err;
if (cpu_has(c, X86_FEATURE_PLN))
- err = sysfs_add_file_to_group(&sys_dev->kobj,
- &attr_core_power_limit_count.attr,
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_core_power_limit_count.attr,
thermal_attr_group.name);
if (cpu_has(c, X86_FEATURE_PTS)) {
- err = sysfs_add_file_to_group(&sys_dev->kobj,
- &attr_package_throttle_count.attr,
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_package_throttle_count.attr,
thermal_attr_group.name);
if (cpu_has(c, X86_FEATURE_PLN))
- err = sysfs_add_file_to_group(&sys_dev->kobj,
- &attr_package_power_limit_count.attr,
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_package_power_limit_count.attr,
thermal_attr_group.name);
}
return err;
}
-static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
+static __cpuinit void thermal_throttle_remove_dev(struct device *dev)
{
- sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
+ sysfs_remove_group(&dev->kobj, &thermal_attr_group);
}
/* Mutex protecting device creation against CPU hotplug: */
@@ -265,16 +264,16 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
- struct sys_device *sys_dev;
+ struct device *dev;
int err = 0;
- sys_dev = get_cpu_sysdev(cpu);
+ dev = get_cpu_device(cpu);
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
mutex_lock(&therm_cpu_lock);
- err = thermal_throttle_add_dev(sys_dev, cpu);
+ err = thermal_throttle_add_dev(dev, cpu);
mutex_unlock(&therm_cpu_lock);
WARN_ON(err);
break;
@@ -283,7 +282,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
case CPU_DEAD:
case CPU_DEAD_FROZEN:
mutex_lock(&therm_cpu_lock);
- thermal_throttle_remove_dev(sys_dev);
+ thermal_throttle_remove_dev(dev);
mutex_unlock(&therm_cpu_lock);
break;
}
@@ -310,7 +309,7 @@ static __init int thermal_throttle_init_device(void)
#endif
/* connect live CPUs to sysfs */
for_each_online_cpu(cpu) {
- err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu);
+ err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
WARN_ON(err);
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -323,17 +322,6 @@ device_initcall(thermal_throttle_init_device);
#endif /* CONFIG_SYSFS */
-/*
- * Set up the most two significant bit to notify mce log that this thermal
- * event type.
- * This is a temp solution. May be changed in the future with mce log
- * infrasture.
- */
-#define CORE_THROTTLED (0)
-#define CORE_POWER_LIMIT ((__u64)1 << 62)
-#define PACKAGE_THROTTLED ((__u64)2 << 62)
-#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
-
static void notify_thresholds(__u64 msr_val)
{
/* check whether the interrupt handler is defined;
@@ -363,27 +351,23 @@ static void intel_thermal_interrupt(void)
if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
THERMAL_THROTTLING_EVENT,
CORE_LEVEL) != 0)
- mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
+ mce_log_therm_throt_event(msr_val);
if (this_cpu_has(X86_FEATURE_PLN))
- if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
+ therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
- CORE_LEVEL) != 0)
- mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
+ CORE_LEVEL);
if (this_cpu_has(X86_FEATURE_PTS)) {
rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
- if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
+ therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
THERMAL_THROTTLING_EVENT,
- PACKAGE_LEVEL) != 0)
- mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
+ PACKAGE_LEVEL);
if (this_cpu_has(X86_FEATURE_PLN))
- if (therm_throt_process(msr_val &
+ therm_throt_process(msr_val &
PACKAGE_THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
- PACKAGE_LEVEL) != 0)
- mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
- | msr_val);
+ PACKAGE_LEVEL);
}
}
@@ -397,8 +381,8 @@ static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
{
- exit_idle();
irq_enter();
+ exit_idle();
inc_irq_stat(irq_thermal_count);
smp_thermal_vector();
irq_exit();
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index d746df2909c9..aa578cadb940 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -19,8 +19,8 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt;
asmlinkage void smp_threshold_interrupt(void)
{
- exit_idle();
irq_enter();
+ exit_idle();
inc_irq_stat(irq_threshold_count);
mce_threshold_vector();
irq_exit();
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2bda212a0010..5adce1040b11 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -484,18 +484,195 @@ static inline int is_x86_event(struct perf_event *event)
return event->pmu == &pmu;
}
+/*
+ * Event scheduler state:
+ *
+ * Assign events iterating over all events and counters, beginning
+ * with events with least weights first. Keep the current iterator
+ * state in struct sched_state.
+ */
+struct sched_state {
+ int weight;
+ int event; /* event index */
+ int counter; /* counter index */
+ int unassigned; /* number of events to be assigned left */
+ unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+};
+
+/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
+#define SCHED_STATES_MAX 2
+
+struct perf_sched {
+ int max_weight;
+ int max_events;
+ struct event_constraint **constraints;
+ struct sched_state state;
+ int saved_states;
+ struct sched_state saved[SCHED_STATES_MAX];
+};
+
+/*
+ * Initialize interator that runs through all events and counters.
+ */
+static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c,
+ int num, int wmin, int wmax)
+{
+ int idx;
+
+ memset(sched, 0, sizeof(*sched));
+ sched->max_events = num;
+ sched->max_weight = wmax;
+ sched->constraints = c;
+
+ for (idx = 0; idx < num; idx++) {
+ if (c[idx]->weight == wmin)
+ break;
+ }
+
+ sched->state.event = idx; /* start with min weight */
+ sched->state.weight = wmin;
+ sched->state.unassigned = num;
+}
+
+static void perf_sched_save_state(struct perf_sched *sched)
+{
+ if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
+ return;
+
+ sched->saved[sched->saved_states] = sched->state;
+ sched->saved_states++;
+}
+
+static bool perf_sched_restore_state(struct perf_sched *sched)
+{
+ if (!sched->saved_states)
+ return false;
+
+ sched->saved_states--;
+ sched->state = sched->saved[sched->saved_states];
+
+ /* continue with next counter: */
+ clear_bit(sched->state.counter++, sched->state.used);
+
+ return true;
+}
+
+/*
+ * Select a counter for the current event to schedule. Return true on
+ * success.
+ */
+static bool __perf_sched_find_counter(struct perf_sched *sched)
+{
+ struct event_constraint *c;
+ int idx;
+
+ if (!sched->state.unassigned)
+ return false;
+
+ if (sched->state.event >= sched->max_events)
+ return false;
+
+ c = sched->constraints[sched->state.event];
+
+ /* Prefer fixed purpose counters */
+ if (x86_pmu.num_counters_fixed) {
+ idx = X86_PMC_IDX_FIXED;
+ for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) {
+ if (!__test_and_set_bit(idx, sched->state.used))
+ goto done;
+ }
+ }
+ /* Grab the first unused counter starting with idx */
+ idx = sched->state.counter;
+ for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
+ if (!__test_and_set_bit(idx, sched->state.used))
+ goto done;
+ }
+
+ return false;
+
+done:
+ sched->state.counter = idx;
+
+ if (c->overlap)
+ perf_sched_save_state(sched);
+
+ return true;
+}
+
+static bool perf_sched_find_counter(struct perf_sched *sched)
+{
+ while (!__perf_sched_find_counter(sched)) {
+ if (!perf_sched_restore_state(sched))
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Go through all unassigned events and find the next one to schedule.
+ * Take events with the least weight first. Return true on success.
+ */
+static bool perf_sched_next_event(struct perf_sched *sched)
+{
+ struct event_constraint *c;
+
+ if (!sched->state.unassigned || !--sched->state.unassigned)
+ return false;
+
+ do {
+ /* next event */
+ sched->state.event++;
+ if (sched->state.event >= sched->max_events) {
+ /* next weight */
+ sched->state.event = 0;
+ sched->state.weight++;
+ if (sched->state.weight > sched->max_weight)
+ return false;
+ }
+ c = sched->constraints[sched->state.event];
+ } while (c->weight != sched->state.weight);
+
+ sched->state.counter = 0; /* start with first counter */
+
+ return true;
+}
+
+/*
+ * Assign a counter for each event.
+ */
+static int perf_assign_events(struct event_constraint **constraints, int n,
+ int wmin, int wmax, int *assign)
+{
+ struct perf_sched sched;
+
+ perf_sched_init(&sched, constraints, n, wmin, wmax);
+
+ do {
+ if (!perf_sched_find_counter(&sched))
+ break; /* failed */
+ if (assign)
+ assign[sched.state.event] = sched.state.counter;
+ } while (perf_sched_next_event(&sched));
+
+ return sched.state.unassigned;
+}
+
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- int i, j, w, wmax, num = 0;
+ int i, wmin, wmax, num = 0;
struct hw_perf_event *hwc;
bitmap_zero(used_mask, X86_PMC_IDX_MAX);
- for (i = 0; i < n; i++) {
+ for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
constraints[i] = c;
+ wmin = min(wmin, c->weight);
+ wmax = max(wmax, c->weight);
}
/*
@@ -521,60 +698,12 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
if (assign)
assign[i] = hwc->idx;
}
- if (i == n)
- goto done;
-
- /*
- * begin slow path
- */
-
- bitmap_zero(used_mask, X86_PMC_IDX_MAX);
- /*
- * weight = number of possible counters
- *
- * 1 = most constrained, only works on one counter
- * wmax = least constrained, works on any counter
- *
- * assign events to counters starting with most
- * constrained events.
- */
- wmax = x86_pmu.num_counters;
+ /* slow path */
+ if (i != n)
+ num = perf_assign_events(constraints, n, wmin, wmax, assign);
/*
- * when fixed event counters are present,
- * wmax is incremented by 1 to account
- * for one more choice
- */
- if (x86_pmu.num_counters_fixed)
- wmax++;
-
- for (w = 1, num = n; num && w <= wmax; w++) {
- /* for each event */
- for (i = 0; num && i < n; i++) {
- c = constraints[i];
- hwc = &cpuc->event_list[i]->hw;
-
- if (c->weight != w)
- continue;
-
- for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
- if (!test_bit(j, used_mask))
- break;
- }
-
- if (j == X86_PMC_IDX_MAX)
- break;
-
- __set_bit(j, used_mask);
-
- if (assign)
- assign[i] = j;
- num--;
- }
- }
-done:
- /*
* scheduling failed or is just a simulation,
* free resources if necessary
*/
@@ -1119,6 +1248,7 @@ static void __init pmu_check_apic(void)
static int __init init_hw_perf_events(void)
{
+ struct x86_pmu_quirk *quirk;
struct event_constraint *c;
int err;
@@ -1147,8 +1277,8 @@ static int __init init_hw_perf_events(void)
pr_cont("%s PMU driver.\n", x86_pmu.name);
- if (x86_pmu.quirks)
- x86_pmu.quirks();
+ for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
+ quirk->func();
if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
@@ -1171,12 +1301,18 @@ static int __init init_hw_perf_events(void)
unconstrained = (struct event_constraint)
__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
- 0, x86_pmu.num_counters);
+ 0, x86_pmu.num_counters, 0);
if (x86_pmu.event_constraints) {
+ /*
+ * event on fixed counter2 (REF_CYCLES) only works on this
+ * counter, so do not extend mask to generic counters
+ */
for_each_event_constraint(c, x86_pmu.event_constraints) {
- if (c->cmask != X86_RAW_EVENT_MASK)
+ if (c->cmask != X86_RAW_EVENT_MASK
+ || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) {
continue;
+ }
c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
c->weight += x86_pmu.num_counters;
@@ -1566,3 +1702,15 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
return misc;
}
+
+void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
+{
+ cap->version = x86_pmu.version;
+ cap->num_counters_gp = x86_pmu.num_counters;
+ cap->num_counters_fixed = x86_pmu.num_counters_fixed;
+ cap->bit_width_gp = x86_pmu.cntval_bits;
+ cap->bit_width_fixed = x86_pmu.cntval_bits;
+ cap->events_mask = (unsigned int)x86_pmu.events_maskl;
+ cap->events_mask_len = x86_pmu.events_mask_len;
+}
+EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index b9698d40ac4b..8944062f46e2 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -45,6 +45,7 @@ struct event_constraint {
u64 code;
u64 cmask;
int weight;
+ int overlap;
};
struct amd_nb {
@@ -151,15 +152,40 @@ struct cpu_hw_events {
void *kfree_on_online;
};
-#define __EVENT_CONSTRAINT(c, n, m, w) {\
+#define __EVENT_CONSTRAINT(c, n, m, w, o) {\
{ .idxmsk64 = (n) }, \
.code = (c), \
.cmask = (m), \
.weight = (w), \
+ .overlap = (o), \
}
#define EVENT_CONSTRAINT(c, n, m) \
- __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
+ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0)
+
+/*
+ * The overlap flag marks event constraints with overlapping counter
+ * masks. This is the case if the counter mask of such an event is not
+ * a subset of any other counter mask of a constraint with an equal or
+ * higher weight, e.g.:
+ *
+ * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
+ * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
+ * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
+ *
+ * The event scheduler may not select the correct counter in the first
+ * cycle because it needs to know which subsequent events will be
+ * scheduled. It may fail to schedule the events then. So we set the
+ * overlap flag for such constraints to give the scheduler a hint which
+ * events to select for counter rescheduling.
+ *
+ * Care must be taken as the rescheduling algorithm is O(n!) which
+ * will increase scheduling cycles for an over-commited system
+ * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros
+ * and its counter masks must be kept at a minimum.
+ */
+#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \
+ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1)
/*
* Constraint on the Event code.
@@ -235,6 +261,11 @@ union perf_capabilities {
u64 capabilities;
};
+struct x86_pmu_quirk {
+ struct x86_pmu_quirk *next;
+ void (*func)(void);
+};
+
/*
* struct x86_pmu - generic x86 pmu
*/
@@ -259,6 +290,11 @@ struct x86_pmu {
int num_counters_fixed;
int cntval_bits;
u64 cntval_mask;
+ union {
+ unsigned long events_maskl;
+ unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)];
+ };
+ int events_mask_len;
int apic;
u64 max_period;
struct event_constraint *
@@ -268,7 +304,7 @@ struct x86_pmu {
void (*put_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
struct event_constraint *event_constraints;
- void (*quirks)(void);
+ struct x86_pmu_quirk *quirks;
int perfctr_second_write;
int (*cpu_prepare)(int cpu);
@@ -309,6 +345,15 @@ struct x86_pmu {
struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
};
+#define x86_add_quirk(func_) \
+do { \
+ static struct x86_pmu_quirk __quirk __initdata = { \
+ .func = func_, \
+ }; \
+ __quirk.next = x86_pmu.quirks; \
+ x86_pmu.quirks = &__quirk; \
+} while (0)
+
#define ERF_NO_HT_SHARING 1
#define ERF_HAS_RSP_1 2
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index aeefd45697a2..0397b23be8e9 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -492,7 +492,7 @@ static __initconst const struct x86_pmu amd_pmu = {
static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0);
-static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0);
+static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 121f1be4da19..3bd37bdf1b8e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -28,6 +28,7 @@ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
[PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
[PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
+ [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */
};
static struct event_constraint intel_core_event_constraints[] __read_mostly =
@@ -45,12 +46,7 @@ static struct event_constraint intel_core2_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- /*
- * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event
- * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed
- * ratio between these counters.
- */
- /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
@@ -68,7 +64,7 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
@@ -90,7 +86,7 @@ static struct event_constraint intel_westmere_event_constraints[] __read_mostly
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
@@ -102,7 +98,7 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
@@ -125,7 +121,7 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
EVENT_CONSTRAINT_END
};
@@ -1519,7 +1515,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.guest_get_msrs = intel_guest_get_msrs,
};
-static void intel_clovertown_quirks(void)
+static __init void intel_clovertown_quirk(void)
{
/*
* PEBS is unreliable due to:
@@ -1545,19 +1541,60 @@ static void intel_clovertown_quirks(void)
x86_pmu.pebs_constraints = NULL;
}
-static void intel_sandybridge_quirks(void)
+static __init void intel_sandybridge_quirk(void)
{
printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
x86_pmu.pebs = 0;
x86_pmu.pebs_constraints = NULL;
}
+static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
+ { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
+ { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
+ { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
+ { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
+ { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
+ { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
+ { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
+};
+
+static __init void intel_arch_events_quirk(void)
+{
+ int bit;
+
+ /* disable event that reported as not presend by cpuid */
+ for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
+ intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
+ printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n",
+ intel_arch_events_map[bit].name);
+ }
+}
+
+static __init void intel_nehalem_quirk(void)
+{
+ union cpuid10_ebx ebx;
+
+ ebx.full = x86_pmu.events_maskl;
+ if (ebx.split.no_branch_misses_retired) {
+ /*
+ * Erratum AAJ80 detected, we work it around by using
+ * the BR_MISP_EXEC.ANY event. This will over-count
+ * branch-misses, but it's still much better than the
+ * architectural event which is often completely bogus:
+ */
+ intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
+ ebx.split.no_branch_misses_retired = 0;
+ x86_pmu.events_maskl = ebx.full;
+ printk(KERN_INFO "CPU erratum AAJ80 worked around\n");
+ }
+}
+
__init int intel_pmu_init(void)
{
union cpuid10_edx edx;
union cpuid10_eax eax;
+ union cpuid10_ebx ebx;
unsigned int unused;
- unsigned int ebx;
int version;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
@@ -1574,8 +1611,8 @@ __init int intel_pmu_init(void)
* Check whether the Architectural PerfMon supports
* Branch Misses Retired hw_event or not.
*/
- cpuid(10, &eax.full, &ebx, &unused, &edx.full);
- if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
+ cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+ if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
return -ENODEV;
version = eax.split.version_id;
@@ -1589,6 +1626,9 @@ __init int intel_pmu_init(void)
x86_pmu.cntval_bits = eax.split.bit_width;
x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
+ x86_pmu.events_maskl = ebx.full;
+ x86_pmu.events_mask_len = eax.split.mask_length;
+
/*
* Quirk: v2 perfmon does not report fixed-purpose events, so
* assume at least 3 events:
@@ -1608,6 +1648,8 @@ __init int intel_pmu_init(void)
intel_ds_init();
+ x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
+
/*
* Install the hw-cache-events table:
*/
@@ -1617,7 +1659,7 @@ __init int intel_pmu_init(void)
break;
case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
- x86_pmu.quirks = intel_clovertown_quirks;
+ x86_add_quirk(intel_clovertown_quirk);
case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
case 29: /* six-core 45 nm xeon "Dunnington" */
@@ -1651,17 +1693,8 @@ __init int intel_pmu_init(void)
/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
- if (ebx & 0x40) {
- /*
- * Erratum AAJ80 detected, we work it around by using
- * the BR_MISP_EXEC.ANY event. This will over-count
- * branch-misses, but it's still much better than the
- * architectural event which is often completely bogus:
- */
- intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
+ x86_add_quirk(intel_nehalem_quirk);
- pr_cont("erratum AAJ80 worked around, ");
- }
pr_cont("Nehalem events, ");
break;
@@ -1701,7 +1734,7 @@ __init int intel_pmu_init(void)
break;
case 42: /* SandyBridge */
- x86_pmu.quirks = intel_sandybridge_quirks;
+ x86_add_quirk(intel_sandybridge_quirk);
case 45: /* SandyBridge, "Romely-EP" */
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -1738,5 +1771,6 @@ __init int intel_pmu_init(void)
break;
}
}
+
return 0;
}
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
index 5abbea297e0c..7b3fe56b1c21 100644
--- a/arch/x86/kernel/cpu/powerflags.c
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -16,5 +16,6 @@ const char *const x86_power_flags[32] = {
"100mhzsteps",
"hwpstate",
"", /* tsc invariant mapped to constant_tsc */
- /* nothing */
+ "cpb", /* core performance boost */
+ "eff_freq_ro", /* Readonly aperf/mperf */
};
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 14b23140e81f..8022c6681485 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -64,12 +64,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
static int show_cpuinfo(struct seq_file *m, void *v)
{
struct cpuinfo_x86 *c = v;
- unsigned int cpu = 0;
+ unsigned int cpu;
int i;
-#ifdef CONFIG_SMP
cpu = c->cpu_index;
-#endif
seq_printf(m, "processor\t: %u\n"
"vendor_id\t: %s\n"
"cpu family\t: %d\n"
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 212a6a42527c..a524353d93f2 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -177,7 +177,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
.notifier_call = cpuid_class_cpu_callback,
};
-static char *cpuid_devnode(struct device *dev, mode_t *mode)
+static char *cpuid_devnode(struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 303a0e48f076..8071e2f3d6eb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -738,35 +738,17 @@ core_initcall(e820_mark_nvs_memory);
/*
* pre allocated 4k and reserved it in memblock and e820_saved
*/
-u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
+u64 __init early_reserve_e820(u64 size, u64 align)
{
- u64 size = 0;
u64 addr;
- u64 start;
- for (start = startt; ; start += size) {
- start = memblock_x86_find_in_range_size(start, &size, align);
- if (start == MEMBLOCK_ERROR)
- return 0;
- if (size >= sizet)
- break;
+ addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
+ if (addr) {
+ e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED);
+ printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
+ update_e820_saved();
}
-#ifdef CONFIG_X86_32
- if (start >= MAXMEM)
- return 0;
- if (start + size > MAXMEM)
- size = MAXMEM - start;
-#endif
-
- addr = round_down(start + size - sizet, align);
- if (addr < start)
- return 0;
- memblock_x86_reserve_range(addr, addr + sizet, "new next");
- e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
- printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
- update_e820_saved();
-
return addr;
}
@@ -1090,7 +1072,7 @@ void __init memblock_x86_fill(void)
* We are safe to enable resizing, beause memblock_x86_fill()
* is rather later for x86
*/
- memblock_can_resize = 1;
+ memblock_allow_resize();
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
@@ -1105,22 +1087,36 @@ void __init memblock_x86_fill(void)
memblock_add(ei->addr, ei->size);
}
- memblock_analyze();
memblock_dump_all();
}
void __init memblock_find_dma_reserve(void)
{
#ifdef CONFIG_X86_64
- u64 free_size_pfn;
- u64 mem_size_pfn;
+ u64 nr_pages = 0, nr_free_pages = 0;
+ unsigned long start_pfn, end_pfn;
+ phys_addr_t start, end;
+ int i;
+ u64 u;
+
/*
* need to find out used area below MAX_DMA_PFN
* need to use memblock to get free size in [0, MAX_DMA_PFN]
* at first, and assume boot_mem will not take below MAX_DMA_PFN
*/
- mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
- free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
- set_dma_reserve(mem_size_pfn - free_size_pfn);
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+ start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN);
+ end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN);
+ nr_pages += end_pfn - start_pfn;
+ }
+
+ for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) {
+ start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
+ end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
+ if (start_pfn < end_pfn)
+ nr_free_pages += end_pfn - start_pfn;
+ }
+
+ set_dma_reserve(nr_pages - nr_free_pages);
#endif
}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index cd28a350f7f9..9d42a52d2331 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -247,7 +247,7 @@ static int __init setup_early_printk(char *buf)
}
if (!strncmp(buf, "hsu", 3)) {
- hsu_early_console_init();
+ hsu_early_console_init(buf + 3);
early_console_register(&early_hsu_console, keep);
}
#endif
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f3f6f5344001..22d0e21b4dd7 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -625,6 +625,8 @@ work_notifysig: # deal with pending signals and
movl %esp, %eax
jne work_notifysig_v86 # returning to kernel-space or
# vm86-space
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
xorl %edx, %edx
call do_notify_resume
jmp resume_userspace_sig
@@ -638,6 +640,8 @@ work_notifysig_v86:
#else
movl %esp, %eax
#endif
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
xorl %edx, %edx
call do_notify_resume
jmp resume_userspace_sig
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index faf8d5e74b0b..a20e1cb9dc87 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -221,7 +221,7 @@ ENDPROC(native_usergs_sysret64)
/*CFI_REL_OFFSET ss,0*/
pushq_cfi %rax /* rsp */
CFI_REL_OFFSET rsp,0
- pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */
+ pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */
/*CFI_REL_OFFSET rflags,0*/
pushq_cfi $__KERNEL_CS /* cs */
/*CFI_REL_OFFSET cs,0*/
@@ -411,7 +411,7 @@ ENTRY(ret_from_fork)
RESTORE_REST
testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
- je int_ret_from_sys_call
+ jz retint_restore_args
testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
jnz int_ret_from_sys_call
@@ -465,7 +465,7 @@ ENTRY(system_call)
* after the swapgs, so that it can do the swapgs
* for the guest and jump here on syscall.
*/
-ENTRY(system_call_after_swapgs)
+GLOBAL(system_call_after_swapgs)
movq %rsp,PER_CPU_VAR(old_rsp)
movq PER_CPU_VAR(kernel_stack),%rsp
@@ -478,8 +478,7 @@ ENTRY(system_call_after_swapgs)
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
- GET_THREAD_INFO(%rcx)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jnz tracesys
system_call_fastpath:
cmpq $__NR_syscall_max,%rax
@@ -496,10 +495,9 @@ ret_from_sys_call:
/* edi: flagmask */
sysret_check:
LOCKDEP_SYS_EXIT
- GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- movl TI_flags(%rcx),%edx
+ movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
andl %edi,%edx
jnz sysret_careful
CFI_REMEMBER_STATE
@@ -583,7 +581,7 @@ sysret_audit:
/* Do syscall tracing */
tracesys:
#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jz auditsys
#endif
SAVE_REST
@@ -612,8 +610,6 @@ tracesys:
GLOBAL(int_ret_from_sys_call)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $3,CS-ARGOFFSET(%rsp)
- je retint_restore_args
movl $_TIF_ALLWORK_MASK,%edi
/* edi: mask to check */
GLOBAL(int_with_check)
@@ -953,6 +949,7 @@ END(common_interrupt)
ENTRY(\sym)
INTR_FRAME
pushq_cfi $~(\num)
+.Lcommon_\sym:
interrupt \do_sym
jmp ret_from_intr
CFI_ENDPROC
@@ -976,13 +973,21 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
x86_platform_ipi smp_x86_platform_ipi
#ifdef CONFIG_SMP
-.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
+ ALIGN
+ INTR_FRAME
+.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
.if NUM_INVALIDATE_TLB_VECTORS > \idx
-apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \
- invalidate_interrupt\idx smp_invalidate_interrupt
+ENTRY(invalidate_interrupt\idx)
+ pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
+ jmp .Lcommon_invalidate_interrupt0
+ CFI_ADJUST_CFA_OFFSET -8
+END(invalidate_interrupt\idx)
.endif
.endr
+ CFI_ENDPROC
+apicinterrupt INVALIDATE_TLB_VECTOR_START, \
+ invalidate_interrupt0, smp_invalidate_interrupt
#endif
apicinterrupt THRESHOLD_APIC_VECTOR \
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index af0699ba48cf..48d9d4ea1020 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -52,5 +52,5 @@ void __init reserve_ebda_region(void)
lowmem = 0x9f000;
/* reserve all memory between lowmem and the 1MB mark */
- memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved");
+ memblock_reserve(lowmem, 0x100000 - lowmem);
}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 3bb08509a7a1..51ff18616d50 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -31,9 +31,8 @@ static void __init i386_default_early_setup(void)
void __init i386_start_kernel(void)
{
- memblock_init();
-
- memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
+ memblock_reserve(__pa_symbol(&_text),
+ __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
#ifdef CONFIG_BLK_DEV_INITRD
/* Reserve INITRD */
@@ -42,7 +41,7 @@ void __init i386_start_kernel(void)
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size = boot_params.hdr.ramdisk_size;
u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
+ memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
}
#endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 5655c2272adb..3a3b779f41d3 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -98,9 +98,8 @@ void __init x86_64_start_reservations(char *real_mode_data)
{
copy_bootdata(__va(real_mode_data));
- memblock_init();
-
- memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
+ memblock_reserve(__pa_symbol(&_text),
+ __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
#ifdef CONFIG_BLK_DEV_INITRD
/* Reserve INITRD */
@@ -109,7 +108,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
+ memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
}
#endif
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 1bb0bf4d92cd..ad0de0c2714e 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -2,7 +2,6 @@
#include <linux/clockchips.h>
#include <linux/interrupt.h>
#include <linux/export.h>
-#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/i8253.h>
@@ -32,8 +31,6 @@
#define HPET_MIN_CYCLES 128
#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
-#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
-
/*
* HPET address is set in acpi/boot.c, when an ACPI entry exists
*/
@@ -55,6 +52,11 @@ struct hpet_dev {
char name[10];
};
+inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev)
+{
+ return container_of(evtdev, struct hpet_dev, evt);
+}
+
inline unsigned int hpet_readl(unsigned int a)
{
return readl(hpet_virt_address + a);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 429e0c92924e..7943e0c21bde 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -74,6 +74,10 @@ int arch_show_interrupts(struct seq_file *p, int prec)
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
seq_printf(p, " IRQ work interrupts\n");
+ seq_printf(p, "%*s: ", prec, "RTR");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
+ seq_printf(p, " APIC ICR read retries\n");
#endif
if (x86_platform_ipi_callback) {
seq_printf(p, "%*s: ", prec, "PLT");
@@ -136,6 +140,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += irq_stats(cpu)->irq_spurious_count;
sum += irq_stats(cpu)->apic_perf_irqs;
sum += irq_stats(cpu)->apic_irq_work_irqs;
+ sum += irq_stats(cpu)->icr_read_retry_count;
#endif
if (x86_platform_ipi_callback)
sum += irq_stats(cpu)->x86_platform_ipis;
@@ -181,8 +186,8 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
unsigned vector = ~regs->orig_ax;
unsigned irq;
- exit_idle();
irq_enter();
+ exit_idle();
irq = __this_cpu_read(vector_irq[vector]);
@@ -209,10 +214,10 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
ack_APIC_irq();
- exit_idle();
-
irq_enter();
+ exit_idle();
+
inc_irq_stat(x86_platform_ipis);
if (x86_platform_ipi_callback)
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index b3300e6bacef..313fb5cddbce 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -9,7 +9,7 @@
#include <linux/kprobes.h>
#include <linux/init.h>
#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
#include <linux/bitops.h>
#include <linux/acpi.h>
#include <linux/io.h>
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index ea9d5f2f13ef..2889b3d43882 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -50,7 +50,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
put_online_cpus();
}
-void arch_jump_label_transform_static(struct jump_entry *entry,
+__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
enum jump_label_type type)
{
__jump_label_transform(entry, type, text_poke_early);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a9c2116001d6..f0c6fd6f176b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -39,8 +39,6 @@
#include <asm/desc.h>
#include <asm/tlbflush.h>
-#define MMU_QUEUE_SIZE 1024
-
static int kvmapf = 1;
static int parse_no_kvmapf(char *arg)
@@ -60,21 +58,10 @@ static int parse_no_stealacc(char *arg)
early_param("no-steal-acc", parse_no_stealacc);
-struct kvm_para_state {
- u8 mmu_queue[MMU_QUEUE_SIZE];
- int mmu_queue_len;
-};
-
-static DEFINE_PER_CPU(struct kvm_para_state, para_state);
static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
static int has_steal_clock = 0;
-static struct kvm_para_state *kvm_para_state(void)
-{
- return &per_cpu(para_state, raw_smp_processor_id());
-}
-
/*
* No need for any "IO delay" on KVM
*/
@@ -271,151 +258,6 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
}
}
-static void kvm_mmu_op(void *buffer, unsigned len)
-{
- int r;
- unsigned long a1, a2;
-
- do {
- a1 = __pa(buffer);
- a2 = 0; /* on i386 __pa() always returns <4G */
- r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2);
- buffer += r;
- len -= r;
- } while (len);
-}
-
-static void mmu_queue_flush(struct kvm_para_state *state)
-{
- if (state->mmu_queue_len) {
- kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
- state->mmu_queue_len = 0;
- }
-}
-
-static void kvm_deferred_mmu_op(void *buffer, int len)
-{
- struct kvm_para_state *state = kvm_para_state();
-
- if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
- kvm_mmu_op(buffer, len);
- return;
- }
- if (state->mmu_queue_len + len > sizeof state->mmu_queue)
- mmu_queue_flush(state);
- memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len);
- state->mmu_queue_len += len;
-}
-
-static void kvm_mmu_write(void *dest, u64 val)
-{
- __u64 pte_phys;
- struct kvm_mmu_op_write_pte wpte;
-
-#ifdef CONFIG_HIGHPTE
- struct page *page;
- unsigned long dst = (unsigned long) dest;
-
- page = kmap_atomic_to_page(dest);
- pte_phys = page_to_pfn(page);
- pte_phys <<= PAGE_SHIFT;
- pte_phys += (dst & ~(PAGE_MASK));
-#else
- pte_phys = (unsigned long)__pa(dest);
-#endif
- wpte.header.op = KVM_MMU_OP_WRITE_PTE;
- wpte.pte_val = val;
- wpte.pte_phys = pte_phys;
-
- kvm_deferred_mmu_op(&wpte, sizeof wpte);
-}
-
-/*
- * We only need to hook operations that are MMU writes. We hook these so that
- * we can use lazy MMU mode to batch these operations. We could probably
- * improve the performance of the host code if we used some of the information
- * here to simplify processing of batched writes.
- */
-static void kvm_set_pte(pte_t *ptep, pte_t pte)
-{
- kvm_mmu_write(ptep, pte_val(pte));
-}
-
-static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
-{
- kvm_mmu_write(ptep, pte_val(pte));
-}
-
-static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
-{
- kvm_mmu_write(pmdp, pmd_val(pmd));
-}
-
-#if PAGETABLE_LEVELS >= 3
-#ifdef CONFIG_X86_PAE
-static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
-{
- kvm_mmu_write(ptep, pte_val(pte));
-}
-
-static void kvm_pte_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- kvm_mmu_write(ptep, 0);
-}
-
-static void kvm_pmd_clear(pmd_t *pmdp)
-{
- kvm_mmu_write(pmdp, 0);
-}
-#endif
-
-static void kvm_set_pud(pud_t *pudp, pud_t pud)
-{
- kvm_mmu_write(pudp, pud_val(pud));
-}
-
-#if PAGETABLE_LEVELS == 4
-static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
-{
- kvm_mmu_write(pgdp, pgd_val(pgd));
-}
-#endif
-#endif /* PAGETABLE_LEVELS >= 3 */
-
-static void kvm_flush_tlb(void)
-{
- struct kvm_mmu_op_flush_tlb ftlb = {
- .header.op = KVM_MMU_OP_FLUSH_TLB,
- };
-
- kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
-}
-
-static void kvm_release_pt(unsigned long pfn)
-{
- struct kvm_mmu_op_release_pt rpt = {
- .header.op = KVM_MMU_OP_RELEASE_PT,
- .pt_phys = (u64)pfn << PAGE_SHIFT,
- };
-
- kvm_mmu_op(&rpt, sizeof rpt);
-}
-
-static void kvm_enter_lazy_mmu(void)
-{
- paravirt_enter_lazy_mmu();
-}
-
-static void kvm_leave_lazy_mmu(void)
-{
- struct kvm_para_state *state = kvm_para_state();
-
- mmu_queue_flush(state);
- paravirt_leave_lazy_mmu();
-}
-
static void __init paravirt_ops_setup(void)
{
pv_info.name = "KVM";
@@ -424,29 +266,6 @@ static void __init paravirt_ops_setup(void)
if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
pv_cpu_ops.io_delay = kvm_io_delay;
- if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
- pv_mmu_ops.set_pte = kvm_set_pte;
- pv_mmu_ops.set_pte_at = kvm_set_pte_at;
- pv_mmu_ops.set_pmd = kvm_set_pmd;
-#if PAGETABLE_LEVELS >= 3
-#ifdef CONFIG_X86_PAE
- pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
- pv_mmu_ops.pte_clear = kvm_pte_clear;
- pv_mmu_ops.pmd_clear = kvm_pmd_clear;
-#endif
- pv_mmu_ops.set_pud = kvm_set_pud;
-#if PAGETABLE_LEVELS == 4
- pv_mmu_ops.set_pgd = kvm_set_pgd;
-#endif
-#endif
- pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
- pv_mmu_ops.release_pte = kvm_release_pt;
- pv_mmu_ops.release_pmd = kvm_release_pt;
- pv_mmu_ops.release_pud = kvm_release_pt;
-
- pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
- pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
- }
#ifdef CONFIG_X86_IO_APIC
no_timer_check = 1;
#endif
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index d494799aafcd..fe86493f3ed1 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -1,14 +1,18 @@
/*
* AMD CPU Microcode Update Driver for Linux
- * Copyright (C) 2008 Advanced Micro Devices Inc.
+ * Copyright (C) 2008-2011 Advanced Micro Devices Inc.
*
* Author: Peter Oruba <peter.oruba@amd.com>
*
* Based on work by:
* Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
*
- * This driver allows to upgrade microcode on AMD
- * family 0x10 and 0x11 processors.
+ * Maintainers:
+ * Andreas Herrmann <andreas.herrmann3@amd.com>
+ * Borislav Petkov <borislav.petkov@amd.com>
+ *
+ * This driver allows to upgrade microcode on F10h AMD
+ * CPUs and later.
*
* Licensed under the terms of the GNU General Public
* License version 2. See file COPYING for details.
@@ -71,6 +75,9 @@ struct microcode_amd {
static struct equiv_cpu_entry *equiv_cpu_table;
+/* page-sized ucode patch buffer */
+void *patch;
+
static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -86,27 +93,76 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
return 0;
}
-static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,
- int rev)
+static unsigned int verify_ucode_size(int cpu, u32 patch_size,
+ unsigned int size)
{
- unsigned int current_cpu_id;
- u16 equiv_cpu_id = 0;
- unsigned int i = 0;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ u32 max_size;
+
+#define F1XH_MPB_MAX_SIZE 2048
+#define F14H_MPB_MAX_SIZE 1824
+#define F15H_MPB_MAX_SIZE 4096
+
+ switch (c->x86) {
+ case 0x14:
+ max_size = F14H_MPB_MAX_SIZE;
+ break;
+ case 0x15:
+ max_size = F15H_MPB_MAX_SIZE;
+ break;
+ default:
+ max_size = F1XH_MPB_MAX_SIZE;
+ break;
+ }
+
+ if (patch_size > min_t(u32, size, max_size)) {
+ pr_err("patch size mismatch\n");
+ return 0;
+ }
+
+ return patch_size;
+}
+
+static u16 find_equiv_id(void)
+{
+ unsigned int current_cpu_id, i = 0;
BUG_ON(equiv_cpu_table == NULL);
+
current_cpu_id = cpuid_eax(0x00000001);
while (equiv_cpu_table[i].installed_cpu != 0) {
- if (current_cpu_id == equiv_cpu_table[i].installed_cpu) {
- equiv_cpu_id = equiv_cpu_table[i].equiv_cpu;
- break;
- }
+ if (current_cpu_id == equiv_cpu_table[i].installed_cpu)
+ return equiv_cpu_table[i].equiv_cpu;
+
i++;
}
+ return 0;
+}
+/*
+ * we signal a good patch is found by returning its size > 0
+ */
+static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
+ unsigned int leftover_size, int rev,
+ unsigned int *current_size)
+{
+ struct microcode_header_amd *mc_hdr;
+ unsigned int actual_size;
+ u16 equiv_cpu_id;
+
+ /* size of the current patch we're staring at */
+ *current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE;
+
+ equiv_cpu_id = find_equiv_id();
if (!equiv_cpu_id)
return 0;
+ /*
+ * let's look at the patch header itself now
+ */
+ mc_hdr = (struct microcode_header_amd *)(ucode_ptr + SECTION_HDR_SIZE);
+
if (mc_hdr->processor_rev_id != equiv_cpu_id)
return 0;
@@ -120,7 +176,20 @@ static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,
if (mc_hdr->patch_id <= rev)
return 0;
- return 1;
+ /*
+ * now that the header looks sane, verify its size
+ */
+ actual_size = verify_ucode_size(cpu, *current_size, leftover_size);
+ if (!actual_size)
+ return 0;
+
+ /* clear the patch buffer */
+ memset(patch, 0, PAGE_SIZE);
+
+ /* all looks ok, get the binary patch */
+ get_ucode_data(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size);
+
+ return actual_size;
}
static int apply_microcode_amd(int cpu)
@@ -155,63 +224,6 @@ static int apply_microcode_amd(int cpu)
return 0;
}
-static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
-{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- u32 max_size, actual_size;
-
-#define F1XH_MPB_MAX_SIZE 2048
-#define F14H_MPB_MAX_SIZE 1824
-#define F15H_MPB_MAX_SIZE 4096
-
- switch (c->x86) {
- case 0x14:
- max_size = F14H_MPB_MAX_SIZE;
- break;
- case 0x15:
- max_size = F15H_MPB_MAX_SIZE;
- break;
- default:
- max_size = F1XH_MPB_MAX_SIZE;
- break;
- }
-
- actual_size = *(u32 *)(buf + 4);
-
- if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) {
- pr_err("section size mismatch\n");
- return 0;
- }
-
- return actual_size;
-}
-
-static struct microcode_header_amd *
-get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
-{
- struct microcode_header_amd *mc = NULL;
- unsigned int actual_size = 0;
-
- if (*(u32 *)buf != UCODE_UCODE_TYPE) {
- pr_err("invalid type field in container file section header\n");
- goto out;
- }
-
- actual_size = verify_ucode_size(cpu, buf, size);
- if (!actual_size)
- goto out;
-
- mc = vzalloc(actual_size);
- if (!mc)
- goto out;
-
- get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size);
- *mc_size = actual_size + SECTION_HDR_SIZE;
-
-out:
- return mc;
-}
-
static int install_equiv_cpu_table(const u8 *buf)
{
unsigned int *ibuf = (unsigned int *)buf;
@@ -247,36 +259,38 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
struct microcode_header_amd *mc_hdr = NULL;
- unsigned int mc_size, leftover;
+ unsigned int mc_size, leftover, current_size = 0;
int offset;
const u8 *ucode_ptr = data;
void *new_mc = NULL;
unsigned int new_rev = uci->cpu_sig.rev;
- enum ucode_state state = UCODE_OK;
+ enum ucode_state state = UCODE_ERROR;
offset = install_equiv_cpu_table(ucode_ptr);
if (offset < 0) {
pr_err("failed to create equivalent cpu table\n");
- return UCODE_ERROR;
+ goto out;
}
-
ucode_ptr += offset;
leftover = size - offset;
- while (leftover) {
- mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size);
- if (!mc_hdr)
- break;
+ if (*(u32 *)ucode_ptr != UCODE_UCODE_TYPE) {
+ pr_err("invalid type field in container file section header\n");
+ goto free_table;
+ }
- if (get_matching_microcode(cpu, mc_hdr, new_rev)) {
- vfree(new_mc);
+ while (leftover) {
+ mc_size = get_matching_microcode(cpu, ucode_ptr, leftover,
+ new_rev, &current_size);
+ if (mc_size) {
+ mc_hdr = patch;
+ new_mc = patch;
new_rev = mc_hdr->patch_id;
- new_mc = mc_hdr;
- } else
- vfree(mc_hdr);
+ goto out_ok;
+ }
- ucode_ptr += mc_size;
- leftover -= mc_size;
+ ucode_ptr += current_size;
+ leftover -= current_size;
}
if (!new_mc) {
@@ -284,19 +298,16 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
goto free_table;
}
- if (!leftover) {
- vfree(uci->mc);
- uci->mc = new_mc;
- pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
- cpu, uci->cpu_sig.rev, new_rev);
- } else {
- vfree(new_mc);
- state = UCODE_ERROR;
- }
+out_ok:
+ uci->mc = new_mc;
+ state = UCODE_OK;
+ pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
+ cpu, uci->cpu_sig.rev, new_rev);
free_table:
free_equiv_cpu_table();
+out:
return state;
}
@@ -337,7 +348,6 @@ static void microcode_fini_cpu_amd(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- vfree(uci->mc);
uci->mc = NULL;
}
@@ -351,5 +361,14 @@ static struct microcode_ops microcode_amd_ops = {
struct microcode_ops * __init init_amd_microcode(void)
{
+ patch = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!patch)
+ return NULL;
+
return &microcode_amd_ops;
}
+
+void __exit exit_amd_microcode(void)
+{
+ free_page((unsigned long)patch);
+}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 9d46f5e43b51..fda91c307104 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -292,8 +292,8 @@ static int reload_for_cpu(int cpu)
return err;
}
-static ssize_t reload_store(struct sys_device *dev,
- struct sysdev_attribute *attr,
+static ssize_t reload_store(struct device *dev,
+ struct device_attribute *attr,
const char *buf, size_t size)
{
unsigned long val;
@@ -318,30 +318,30 @@ static ssize_t reload_store(struct sys_device *dev,
return ret;
}
-static ssize_t version_show(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
+static ssize_t version_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
}
-static ssize_t pf_show(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
+static ssize_t pf_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
}
-static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
-static SYSDEV_ATTR(version, 0400, version_show, NULL);
-static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
+static DEVICE_ATTR(reload, 0200, NULL, reload_store);
+static DEVICE_ATTR(version, 0400, version_show, NULL);
+static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
static struct attribute *mc_default_attrs[] = {
- &attr_reload.attr,
- &attr_version.attr,
- &attr_processor_flags.attr,
+ &dev_attr_reload.attr,
+ &dev_attr_version.attr,
+ &dev_attr_processor_flags.attr,
NULL
};
@@ -405,43 +405,45 @@ static enum ucode_state microcode_update_cpu(int cpu)
return ustate;
}
-static int mc_sysdev_add(struct sys_device *sys_dev)
+static int mc_device_add(struct device *dev, struct subsys_interface *sif)
{
- int err, cpu = sys_dev->id;
+ int err, cpu = dev->id;
if (!cpu_online(cpu))
return 0;
pr_debug("CPU%d added\n", cpu);
- err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
+ err = sysfs_create_group(&dev->kobj, &mc_attr_group);
if (err)
return err;
if (microcode_init_cpu(cpu) == UCODE_ERROR) {
- sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+ sysfs_remove_group(&dev->kobj, &mc_attr_group);
return -EINVAL;
}
return err;
}
-static int mc_sysdev_remove(struct sys_device *sys_dev)
+static int mc_device_remove(struct device *dev, struct subsys_interface *sif)
{
- int cpu = sys_dev->id;
+ int cpu = dev->id;
if (!cpu_online(cpu))
return 0;
pr_debug("CPU%d removed\n", cpu);
microcode_fini_cpu(cpu);
- sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+ sysfs_remove_group(&dev->kobj, &mc_attr_group);
return 0;
}
-static struct sysdev_driver mc_sysdev_driver = {
- .add = mc_sysdev_add,
- .remove = mc_sysdev_remove,
+static struct subsys_interface mc_cpu_interface = {
+ .name = "microcode",
+ .subsys = &cpu_subsys,
+ .add_dev = mc_device_add,
+ .remove_dev = mc_device_remove,
};
/**
@@ -464,9 +466,9 @@ static __cpuinit int
mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
- struct sys_device *sys_dev;
+ struct device *dev;
- sys_dev = get_cpu_sysdev(cpu);
+ dev = get_cpu_device(cpu);
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
@@ -474,13 +476,13 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
pr_debug("CPU%d added\n", cpu);
- if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
+ if (sysfs_create_group(&dev->kobj, &mc_attr_group))
pr_err("Failed to create group for CPU%d\n", cpu);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
/* Suspend is in progress, only remove the interface */
- sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+ sysfs_remove_group(&dev->kobj, &mc_attr_group);
pr_debug("CPU%d removed\n", cpu);
break;
@@ -525,7 +527,7 @@ static int __init microcode_init(void)
get_online_cpus();
mutex_lock(&microcode_mutex);
- error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+ error = subsys_interface_register(&mc_cpu_interface);
mutex_unlock(&microcode_mutex);
put_online_cpus();
@@ -535,7 +537,7 @@ static int __init microcode_init(void)
error = microcode_dev_init();
if (error)
- goto out_sysdev_driver;
+ goto out_driver;
register_syscore_ops(&mc_syscore_ops);
register_hotcpu_notifier(&mc_cpu_notifier);
@@ -545,11 +547,11 @@ static int __init microcode_init(void)
return 0;
-out_sysdev_driver:
+out_driver:
get_online_cpus();
mutex_lock(&microcode_mutex);
- sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+ subsys_interface_unregister(&mc_cpu_interface);
mutex_unlock(&microcode_mutex);
put_online_cpus();
@@ -563,6 +565,8 @@ module_init(microcode_init);
static void __exit microcode_exit(void)
{
+ struct cpuinfo_x86 *c = &cpu_data(0);
+
microcode_dev_exit();
unregister_hotcpu_notifier(&mc_cpu_notifier);
@@ -571,7 +575,7 @@ static void __exit microcode_exit(void)
get_online_cpus();
mutex_lock(&microcode_mutex);
- sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+ subsys_interface_unregister(&mc_cpu_interface);
mutex_unlock(&microcode_mutex);
put_online_cpus();
@@ -580,6 +584,9 @@ static void __exit microcode_exit(void)
microcode_ops = NULL;
+ if (c->x86_vendor == X86_VENDOR_AMD)
+ exit_amd_microcode();
+
pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
}
module_exit(microcode_exit);
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 0741b062a304..ca470e4c92dc 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -564,9 +564,7 @@ void __init default_get_smp_config(unsigned int early)
static void __init smp_reserve_memory(struct mpf_intel *mpf)
{
- unsigned long size = get_mpc_size(mpf->physptr);
-
- memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc");
+ memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr));
}
static int __init smp_scan_config(unsigned long base, unsigned long length)
@@ -595,7 +593,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
mpf, (u64)virt_to_phys(mpf));
mem = virt_to_phys(mpf);
- memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf");
+ memblock_reserve(mem, sizeof(*mpf));
if (mpf->physptr)
smp_reserve_memory(mpf);
@@ -836,10 +834,8 @@ early_param("alloc_mptable", parse_alloc_mptable_opt);
void __init early_reserve_e820_mpc_new(void)
{
- if (enable_update_mptable && alloc_mptable) {
- u64 startt = 0;
- mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
- }
+ if (enable_update_mptable && alloc_mptable)
+ mpc_new_phys = early_reserve_e820(mpc_new_length, 4);
}
static int __init update_mp_table(void)
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 12fcbe2c143e..96356762a51d 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -236,7 +236,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
.notifier_call = msr_class_cpu_callback,
};
-static char *msr_devnode(struct device *dev, mode_t *mode)
+static char *msr_devnode(struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 80dc793b3f63..1c4d769e21ea 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -45,6 +45,15 @@ int iommu_detected __read_mostly = 0;
*/
int iommu_pass_through __read_mostly;
+/*
+ * Group multi-function PCI devices into a single device-group for the
+ * iommu_device_group interface. This tells the iommu driver to pretend
+ * it cannot distinguish between functions of a device, exposing only one
+ * group for the device. Useful for disallowing use of individual PCI
+ * functions from userspace drivers.
+ */
+int iommu_group_mf __read_mostly;
+
extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
/* Dummy device used for NULL arguments (normally ISA). */
@@ -169,6 +178,8 @@ static __init int iommu_setup(char *p)
#endif
if (!strncmp(p, "pt", 2))
iommu_pass_through = 1;
+ if (!strncmp(p, "group_mf", 8))
+ iommu_group_mf = 1;
gart_parse_options(p);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ee5d4fbd53b4..15763af7bfe3 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -293,7 +293,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
regs.orig_ax = -1;
regs.ip = (unsigned long) kernel_thread_helper;
regs.cs = __KERNEL_CS | get_kernel_rpl();
- regs.flags = X86_EFLAGS_IF | 0x2;
+ regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
/* Ok, create the new process.. */
return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 795b79f984c2..485204f58cda 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -99,7 +99,8 @@ void cpu_idle(void)
/* endless idle loop with no priority at all */
while (1) {
- tick_nohz_stop_sched_tick(1);
+ tick_nohz_idle_enter();
+ rcu_idle_enter();
while (!need_resched()) {
check_pgt_cache();
@@ -116,7 +117,8 @@ void cpu_idle(void)
pm_idle();
start_critical_timings();
}
- tick_nohz_restart_sched_tick();
+ rcu_idle_exit();
+ tick_nohz_idle_exit();
preempt_enable_no_resched();
schedule();
preempt_disable();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3bd7e6eebf31..9b9fe4a85c87 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -122,7 +122,7 @@ void cpu_idle(void)
/* endless idle loop with no priority at all */
while (1) {
- tick_nohz_stop_sched_tick(1);
+ tick_nohz_idle_enter();
while (!need_resched()) {
rmb();
@@ -139,8 +139,14 @@ void cpu_idle(void)
enter_idle();
/* Don't trace irqs off for idle */
stop_critical_timings();
+
+ /* enter_idle() needs rcu for notifiers */
+ rcu_idle_enter();
+
if (cpuidle_idle_call())
pm_idle();
+
+ rcu_idle_exit();
start_critical_timings();
/* In many cases the interrupt that ended idle
@@ -149,7 +155,7 @@ void cpu_idle(void)
__exit_idle();
}
- tick_nohz_restart_sched_tick();
+ tick_nohz_idle_exit();
preempt_enable_no_resched();
schedule();
preempt_disable();
@@ -293,13 +299,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
- p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+ p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
+ IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr) {
p->thread.io_bitmap_max = 0;
return -ENOMEM;
}
- memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
- IO_BITMAP_BYTES);
set_tsk_thread_flag(p, TIF_IO_BITMAP);
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 82528799c5de..89a04c7b5bb6 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -749,7 +749,8 @@ put:
/*
* Handle PTRACE_POKEUSR calls for the debug register area.
*/
-int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
+static int ptrace_set_debugreg(struct task_struct *tsk, int n,
+ unsigned long val)
{
struct thread_struct *thread = &(tsk->thread);
int rc = 0;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index cf0ef986cb6d..d05444ac2aea 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -306,7 +306,8 @@ static void __init cleanup_highmap(void)
static void __init reserve_brk(void)
{
if (_brk_end > _brk_start)
- memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK");
+ memblock_reserve(__pa(_brk_start),
+ __pa(_brk_end) - __pa(_brk_start));
/* Mark brk area as locked down and no longer taking any
new allocations */
@@ -331,13 +332,13 @@ static void __init relocate_initrd(void)
ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
PAGE_SIZE);
- if (ramdisk_here == MEMBLOCK_ERROR)
+ if (!ramdisk_here)
panic("Cannot find place for new RAMDISK of size %lld\n",
ramdisk_size);
/* Note: this includes all the lowmem currently occupied by
the initrd, we rely on that fact to keep the data intact. */
- memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK");
+ memblock_reserve(ramdisk_here, area_size);
initrd_start = ramdisk_here + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
@@ -393,7 +394,7 @@ static void __init reserve_initrd(void)
initrd_start = 0;
if (ramdisk_size >= (end_of_lowmem>>1)) {
- memblock_x86_free_range(ramdisk_image, ramdisk_end);
+ memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
printk(KERN_ERR "initrd too large to handle, "
"disabling initrd\n");
return;
@@ -416,7 +417,7 @@ static void __init reserve_initrd(void)
relocate_initrd();
- memblock_x86_free_range(ramdisk_image, ramdisk_end);
+ memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
}
#else
static void __init reserve_initrd(void)
@@ -490,15 +491,13 @@ static void __init memblock_x86_reserve_range_setup_data(void)
{
struct setup_data *data;
u64 pa_data;
- char buf[32];
if (boot_params.hdr.version < 0x0209)
return;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
data = early_memremap(pa_data, sizeof(*data));
- sprintf(buf, "setup data %x", data->type);
- memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf);
+ memblock_reserve(pa_data, sizeof(*data) + data->len);
pa_data = data->next;
early_iounmap(data, sizeof(*data));
}
@@ -554,7 +553,7 @@ static void __init reserve_crashkernel(void)
crash_base = memblock_find_in_range(alignment,
CRASH_KERNEL_ADDR_MAX, crash_size, alignment);
- if (crash_base == MEMBLOCK_ERROR) {
+ if (!crash_base) {
pr_info("crashkernel reservation failed - No suitable area found.\n");
return;
}
@@ -568,7 +567,7 @@ static void __init reserve_crashkernel(void)
return;
}
}
- memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL");
+ memblock_reserve(crash_base, crash_size);
printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
"for crashkernel (System RAM: %ldMB)\n",
@@ -626,7 +625,7 @@ static __init void reserve_ibft_region(void)
addr = find_ibft_region(&size);
if (size)
- memblock_x86_reserve_range(addr, addr + size, "* ibft");
+ memblock_reserve(addr, size);
}
static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9f548cb4a958..e38e21754eea 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -840,7 +840,8 @@ int __cpuinit native_cpu_up(unsigned int cpu)
pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
- !physid_isset(apicid, phys_cpu_present_map)) {
+ !physid_isset(apicid, phys_cpu_present_map) ||
+ (!x2apic_mode && apicid >= 255)) {
printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);
return -EINVAL;
}
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index a91ae7709b49..a73b61055ad6 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -14,11 +14,11 @@ void __init setup_trampolines(void)
/* Has to be in very low memory so we can execute real-mode AP code. */
mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
- if (mem == MEMBLOCK_ERROR)
+ if (!mem)
panic("Cannot allocate trampoline\n");
x86_trampoline_base = __va(mem);
- memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE");
+ memblock_reserve(mem, size);
printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
x86_trampoline_base, (unsigned long long)mem, size);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a8e3eb83466c..fa1191fb679d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -306,15 +306,10 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
== NOTIFY_STOP)
return;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
-#ifdef CONFIG_KPROBES
+
if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
== NOTIFY_STOP)
return;
-#else
- if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
- == NOTIFY_STOP)
- return;
-#endif
preempt_conditional_sti(regs);
do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index db483369f10b..2c9cf0fd78f5 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -35,7 +35,7 @@ static int __read_mostly tsc_unstable;
erroneous rdtsc usage on !cpu_has_tsc processors */
static int __read_mostly tsc_disabled = -1;
-static int tsc_clocksource_reliable;
+int tsc_clocksource_reliable;
/*
* Scheduler clock - returns current time in nanosec units.
*/
@@ -178,11 +178,11 @@ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
}
#define CAL_MS 10
-#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS))
+#define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS))
#define CAL_PIT_LOOPS 1000
#define CAL2_MS 50
-#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS))
+#define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS))
#define CAL2_PIT_LOOPS 5000
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 0aa5fed8b9e6..9eba29b46cb7 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -113,7 +113,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
if (unsynchronized_tsc())
return;
- if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
+ if (tsc_clocksource_reliable) {
if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
pr_info(
"Skipped synchronization checks as TSC is reliable.\n");
@@ -172,7 +172,7 @@ void __cpuinit check_tsc_sync_target(void)
{
int cpus = 2;
- if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
+ if (unsynchronized_tsc() || tsc_clocksource_reliable)
return;
/*
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index e4d4a22e8b94..b07ba9393564 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -57,7 +57,7 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
.lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
};
-static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE;
+static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
static int __init vsyscall_setup(char *str)
{
@@ -140,11 +140,40 @@ static int addr_to_vsyscall_nr(unsigned long addr)
return nr;
}
+static bool write_ok_or_segv(unsigned long ptr, size_t size)
+{
+ /*
+ * XXX: if access_ok, get_user, and put_user handled
+ * sig_on_uaccess_error, this could go away.
+ */
+
+ if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
+ siginfo_t info;
+ struct thread_struct *thread = &current->thread;
+
+ thread->error_code = 6; /* user fault, no page, write */
+ thread->cr2 = ptr;
+ thread->trap_no = 14;
+
+ memset(&info, 0, sizeof(info));
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ info.si_code = SEGV_MAPERR;
+ info.si_addr = (void __user *)ptr;
+
+ force_sig_info(SIGSEGV, &info, current);
+ return false;
+ } else {
+ return true;
+ }
+}
+
bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
{
struct task_struct *tsk;
unsigned long caller;
int vsyscall_nr;
+ int prev_sig_on_uaccess_error;
long ret;
/*
@@ -180,35 +209,65 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
if (seccomp_mode(&tsk->seccomp))
do_exit(SIGKILL);
+ /*
+ * With a real vsyscall, page faults cause SIGSEGV. We want to
+ * preserve that behavior to make writing exploits harder.
+ */
+ prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
+ current_thread_info()->sig_on_uaccess_error = 1;
+
+ /*
+ * 0 is a valid user pointer (in the access_ok sense) on 32-bit and
+ * 64-bit, so we don't need to special-case it here. For all the
+ * vsyscalls, 0 means "don't write anything" not "write it at
+ * address 0".
+ */
+ ret = -EFAULT;
switch (vsyscall_nr) {
case 0:
+ if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
+ !write_ok_or_segv(regs->si, sizeof(struct timezone)))
+ break;
+
ret = sys_gettimeofday(
(struct timeval __user *)regs->di,
(struct timezone __user *)regs->si);
break;
case 1:
+ if (!write_ok_or_segv(regs->di, sizeof(time_t)))
+ break;
+
ret = sys_time((time_t __user *)regs->di);
break;
case 2:
+ if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+ !write_ok_or_segv(regs->si, sizeof(unsigned)))
+ break;
+
ret = sys_getcpu((unsigned __user *)regs->di,
(unsigned __user *)regs->si,
0);
break;
}
+ current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
+
if (ret == -EFAULT) {
- /*
- * Bad news -- userspace fed a bad pointer to a vsyscall.
- *
- * With a real vsyscall, that would have caused SIGSEGV.
- * To make writing reliable exploits using the emulated
- * vsyscalls harder, generate SIGSEGV here as well.
- */
+ /* Bad news -- userspace fed a bad pointer to a vsyscall. */
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall fault (exploit attempt?)");
- goto sigsegv;
+
+ /*
+ * If we failed to generate a signal for any reason,
+ * generate one here. (This should be impossible.)
+ */
+ if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
+ !sigismember(&tsk->pending.signal, SIGSEGV)))
+ goto sigsegv;
+
+ return true; /* Don't emulate the ret. */
}
regs->ax = ret;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index c1d6cd549397..91f83e21b989 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -92,6 +92,7 @@ struct x86_init_ops x86_init __initdata = {
struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
.setup_percpu_clockev = setup_secondary_APIC_clock,
+ .fixup_cpu_id = x86_default_fixup_cpu_id,
};
static void default_nmi_init(void) { };