aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/amd_nb.c2
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c26
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c91
-rw-r--r--arch/x86/kernel/apic/io_apic.c8
-rw-r--r--arch/x86/kernel/apm_32.c1
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/amd.c23
-rw-r--r--arch/x86/kernel/cpu/common.c12
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c23
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c72
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c54
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c8
-rw-r--r--arch/x86/kernel/cpu/microcode/amd_early.c53
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c4
-rw-r--r--arch/x86/kernel/cpu/microcode/core_early.c23
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_early.c42
-rw-r--r--arch/x86/kernel/cpu/perf_event.h4
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c15
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c98
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c25
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c57
-rw-r--r--arch/x86/kernel/cpu/proc.c10
-rw-r--r--arch/x86/kernel/cpu/scattered.c5
-rw-r--r--arch/x86/kernel/cpuid.c2
-rw-r--r--arch/x86/kernel/dumpstack_64.c1
-rw-r--r--arch/x86/kernel/entry_64.S81
-rw-r--r--arch/x86/kernel/espfix_64.c3
-rw-r--r--arch/x86/kernel/ftrace.c288
-rw-r--r--arch/x86/kernel/irq.c30
-rw-r--r--arch/x86/kernel/kprobes/core.c8
-rw-r--r--arch/x86/kernel/kprobes/ftrace.c9
-rw-r--r--arch/x86/kernel/kprobes/opt.c4
-rw-r--r--arch/x86/kernel/mcount_64.S224
-rw-r--r--arch/x86/kernel/msr.c11
-rw-r--r--arch/x86/kernel/ptrace.c2
-rw-r--r--arch/x86/kernel/setup.c4
-rw-r--r--arch/x86/kernel/setup_percpu.c2
-rw-r--r--arch/x86/kernel/smpboot.c17
-rw-r--r--arch/x86/kernel/sysfb.c2
-rw-r--r--arch/x86/kernel/sysfb_simplefb.c5
-rw-r--r--arch/x86/kernel/time.c2
-rw-r--r--arch/x86/kernel/traps.c159
-rw-r--r--arch/x86/kernel/uprobes.c2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S2
-rw-r--r--arch/x86/kernel/vsyscall_64.c147
-rw-r--r--arch/x86/kernel/x86_init.c10
48 files changed, 1224 insertions, 455 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8f1e77440b2b..5d4502c8b983 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -28,8 +28,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-$(CONFIG_X86_64) += mcount_64.o
obj-y += syscall_$(BITS).o vsyscall_gtod.o
-obj-$(CONFIG_X86_64) += vsyscall_64.o
-obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
+obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o
obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-y += bootflag.o e820.o
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index f04dbb3069b8..5caed1dd7ccf 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -21,6 +21,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
{}
@@ -30,6 +31,7 @@ EXPORT_SYMBOL(amd_nb_misc_ids);
static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
{}
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 4128b5fcb559..c2fd21fed002 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -40,7 +40,7 @@ static unsigned int get_apic_id(unsigned long x)
unsigned int id;
rdmsrl(MSR_FAM10H_NODE_ID, value);
- id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U);
+ id = ((x >> 24) & 0xffU) | ((value << 2) & 0xff00U);
return id;
}
@@ -145,7 +145,7 @@ static void numachip_send_IPI_all(int vector)
static void numachip_send_IPI_self(int vector)
{
- __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
+ apic_write(APIC_SELF_IPI, vector);
}
static int __init numachip_probe(void)
@@ -153,20 +153,8 @@ static int __init numachip_probe(void)
return apic == &apic_numachip;
}
-static void __init map_csrs(void)
-{
- printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n",
- NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1);
- init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
-
- printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n",
- NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1);
- init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE);
-}
-
static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
{
-
if (c->phys_proc_id != node) {
c->phys_proc_id = node;
per_cpu(cpu_llc_id, smp_processor_id()) = node;
@@ -175,19 +163,15 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
static int __init numachip_system_init(void)
{
- unsigned int val;
-
if (!numachip_system)
return 0;
+ init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
+ init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE);
+
x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
x86_init.pci.arch_init = pci_numachip_init;
- map_csrs();
-
- val = read_lcsr(CSR_G0_NODE_IDS);
- printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val);
-
return 0;
}
early_initcall(numachip_system_init);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 6a1e71bde323..6873ab925d00 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -18,6 +18,7 @@
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/delay.h>
+#include <linux/seq_buf.h>
#ifdef CONFIG_HARDLOCKUP_DETECTOR
u64 hw_nmi_get_sample_period(int watchdog_thresh)
@@ -29,14 +30,35 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
#ifdef arch_trigger_all_cpu_backtrace
/* For reliability, we're prepared to waste bits here. */
static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
+static cpumask_t printtrace_mask;
+
+#define NMI_BUF_SIZE 4096
+
+struct nmi_seq_buf {
+ unsigned char buffer[NMI_BUF_SIZE];
+ struct seq_buf seq;
+};
+
+/* Safe printing in NMI context */
+static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
/* "in progress" flag of arch_trigger_all_cpu_backtrace */
static unsigned long backtrace_flag;
+static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
+{
+ const char *buf = s->buffer + start;
+
+ printk("%.*s", (end - start) + 1, buf);
+}
+
void arch_trigger_all_cpu_backtrace(bool include_self)
{
+ struct nmi_seq_buf *s;
+ int len;
+ int cpu;
int i;
- int cpu = get_cpu();
+ int this_cpu = get_cpu();
if (test_and_set_bit(0, &backtrace_flag)) {
/*
@@ -49,7 +71,17 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
if (!include_self)
- cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
+ cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
+
+ cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
+ /*
+ * Set up per_cpu seq_buf buffers that the NMIs running on the other
+ * CPUs will write to.
+ */
+ for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
+ s = &per_cpu(nmi_print_seq, cpu);
+ seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
+ }
if (!cpumask_empty(to_cpumask(backtrace_mask))) {
pr_info("sending NMI to %s CPUs:\n",
@@ -65,11 +97,58 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
touch_softlockup_watchdog();
}
+ /*
+ * Now that all the NMIs have triggered, we can dump out their
+ * back traces safely to the console.
+ */
+ for_each_cpu(cpu, &printtrace_mask) {
+ int last_i = 0;
+
+ s = &per_cpu(nmi_print_seq, cpu);
+ len = seq_buf_used(&s->seq);
+ if (!len)
+ continue;
+
+ /* Print line by line. */
+ for (i = 0; i < len; i++) {
+ if (s->buffer[i] == '\n') {
+ print_seq_line(s, last_i, i);
+ last_i = i + 1;
+ }
+ }
+ /* Check if there was a partial line. */
+ if (last_i < len) {
+ print_seq_line(s, last_i, len - 1);
+ pr_cont("\n");
+ }
+ }
+
clear_bit(0, &backtrace_flag);
smp_mb__after_atomic();
put_cpu();
}
+/*
+ * It is not safe to call printk() directly from NMI handlers.
+ * It may be fine if the NMI detected a lock up and we have no choice
+ * but to do so, but doing a NMI on all other CPUs to get a back trace
+ * can be done with a sysrq-l. We don't want that to lock up, which
+ * can happen if the NMI interrupts a printk in progress.
+ *
+ * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
+ * the content into a per cpu seq_buf buffer. Then when the NMIs are
+ * all done, we can safely dump the contents of the seq_buf to a printk()
+ * from a non NMI context.
+ */
+static int nmi_vprintk(const char *fmt, va_list args)
+{
+ struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
+ unsigned int len = seq_buf_used(&s->seq);
+
+ seq_buf_vprintf(&s->seq, fmt, args);
+ return seq_buf_used(&s->seq) - len;
+}
+
static int
arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
{
@@ -78,12 +157,14 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
cpu = smp_processor_id();
if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
+ printk_func_t printk_func_save = this_cpu_read(printk_func);
- arch_spin_lock(&lock);
+ /* Replace printk to write into the NMI seq */
+ this_cpu_write(printk_func, nmi_vprintk);
printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
show_regs(regs);
- arch_spin_unlock(&lock);
+ this_cpu_write(printk_func, printk_func_save);
+
cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
return NMI_HANDLED;
}
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 1183d545da1e..7ffe0a2b870f 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3158,7 +3158,7 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
- __write_msi_msg(data->msi_desc, &msg);
+ __pci_write_msi_msg(data->msi_desc, &msg);
return IRQ_SET_MASK_OK_NOCOPY;
}
@@ -3169,8 +3169,8 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
*/
static struct irq_chip msi_chip = {
.name = "PCI-MSI",
- .irq_unmask = unmask_msi_irq,
- .irq_mask = mask_msi_irq,
+ .irq_unmask = pci_msi_unmask_irq,
+ .irq_mask = pci_msi_mask_irq,
.irq_ack = ack_apic_edge,
.irq_set_affinity = msi_set_affinity,
.irq_retrigger = ioapic_retrigger_irq,
@@ -3196,7 +3196,7 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
* MSI message denotes a contiguous group of IRQs, written for 0th IRQ.
*/
if (!irq_offset)
- write_msi_msg(irq, &msg);
+ pci_write_msi_msg(irq, &msg);
setup_remapped_irq(irq, irq_cfg(irq), chip);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 584874451414..927ec9235947 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -378,7 +378,6 @@ static struct cpuidle_driver apm_idle_driver = {
{ /* entry 1 is for APM idle */
.name = "APM",
.desc = "APM idle",
- .flags = CPUIDLE_FLAG_TIME_VALID,
.exit_latency = 250, /* WAG */
.target_residency = 500, /* WAG */
.enter = &apm_cpu_idle
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e7c798b354fa..4f9359f36bb7 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -48,7 +48,6 @@ int main(void)
#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
ENTRY(bx);
- ENTRY(bx);
ENTRY(cx);
ENTRY(dx);
ENTRY(sp);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 813d29d00a17..15c5df92f74e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -566,6 +566,17 @@ static void init_amd_k8(struct cpuinfo_x86 *c)
if (!c->x86_model_id[0])
strcpy(c->x86_model_id, "Hammer");
+
+#ifdef CONFIG_SMP
+ /*
+ * Disable TLB flush filter by setting HWCR.FFDIS on K8
+ * bit 6 of msr C001_0015
+ *
+ * Errata 63 for SH-B3 steppings
+ * Errata 122 for all steppings (F+ have it disabled by default)
+ */
+ msr_set_bit(MSR_K7_HWCR, 6);
+#endif
}
static void init_amd_gh(struct cpuinfo_x86 *c)
@@ -636,18 +647,6 @@ static void init_amd(struct cpuinfo_x86 *c)
{
u32 dummy;
-#ifdef CONFIG_SMP
- /*
- * Disable TLB flush filter by setting HWCR.FFDIS on K8
- * bit 6 of msr C001_0015
- *
- * Errata 63 for SH-B3 steppings
- * Errata 122 for all steppings (F+ have it disabled by default)
- */
- if (c->x86 == 0xf)
- msr_set_bit(MSR_K7_HWCR, 6);
-#endif
-
early_init_amd(c);
/*
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4b4f78c9ba19..c6049650c093 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -146,6 +146,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
static int __init x86_xsave_setup(char *s)
{
+ if (strlen(s))
+ return 0;
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
@@ -956,14 +958,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
}
#ifdef CONFIG_X86_64
-static void vgetcpu_set_mode(void)
-{
- if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
- vgetcpu_mode = VGETCPU_RDTSCP;
- else
- vgetcpu_mode = VGETCPU_LSL;
-}
-
#ifdef CONFIG_IA32_EMULATION
/* May not be __init: called during resume */
static void syscall32_cpu_init(void)
@@ -1006,8 +1000,6 @@ void __init identify_boot_cpu(void)
#ifdef CONFIG_X86_32
sysenter_setup();
enable_sep_cpu();
-#else
- vgetcpu_set_mode();
#endif
cpu_detect_tlb(&boot_cpu_data);
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 09edd0b65fef..10b46906767f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -3,6 +3,8 @@
enum severity_level {
MCE_NO_SEVERITY,
+ MCE_DEFERRED_SEVERITY,
+ MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY,
MCE_KEEP_SEVERITY,
MCE_SOME_SEVERITY,
MCE_AO_SEVERITY,
@@ -21,7 +23,7 @@ struct mce_bank {
char attrname[ATTR_LEN]; /* attribute name */
};
-int mce_severity(struct mce *a, int tolerant, char **msg);
+int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);
extern struct mce_bank *mce_banks;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index c370e1c4468b..8bb433043a7f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -31,6 +31,7 @@
enum context { IN_KERNEL = 1, IN_USER = 2 };
enum ser { SER_REQUIRED = 1, NO_SER = 2 };
+enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
static struct severity {
u64 mask;
@@ -40,6 +41,7 @@ static struct severity {
unsigned char mcgres;
unsigned char ser;
unsigned char context;
+ unsigned char excp;
unsigned char covered;
char *msg;
} severities[] = {
@@ -48,6 +50,8 @@ static struct severity {
#define USER .context = IN_USER
#define SER .ser = SER_REQUIRED
#define NOSER .ser = NO_SER
+#define EXCP .excp = EXCP_CONTEXT
+#define NOEXCP .excp = NO_EXCP
#define BITCLR(x) .mask = x, .result = 0
#define BITSET(x) .mask = x, .result = x
#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
@@ -62,7 +66,7 @@ static struct severity {
),
MCESEV(
NO, "Not enabled",
- BITCLR(MCI_STATUS_EN)
+ EXCP, BITCLR(MCI_STATUS_EN)
),
MCESEV(
PANIC, "Processor context corrupt",
@@ -71,16 +75,20 @@ static struct severity {
/* When MCIP is not set something is very confused */
MCESEV(
PANIC, "MCIP not set in MCA handler",
- MCGMASK(MCG_STATUS_MCIP, 0)
+ EXCP, MCGMASK(MCG_STATUS_MCIP, 0)
),
/* Neither return not error IP -- no chance to recover -> PANIC */
MCESEV(
PANIC, "Neither restart nor error IP",
- MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
+ EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
),
MCESEV(
PANIC, "In kernel and no restart IP",
- KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
+ EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
+ ),
+ MCESEV(
+ DEFERRED, "Deferred error",
+ NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
),
MCESEV(
KEEP, "Corrected error",
@@ -89,7 +97,7 @@ static struct severity {
/* ignore OVER for UCNA */
MCESEV(
- KEEP, "Uncorrected no action required",
+ UCNA, "Uncorrected no action required",
SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
),
MCESEV(
@@ -178,8 +186,9 @@ static int error_context(struct mce *m)
return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
}
-int mce_severity(struct mce *m, int tolerant, char **msg)
+int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
{
+ enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
enum context ctx = error_context(m);
struct severity *s;
@@ -194,6 +203,8 @@ int mce_severity(struct mce *m, int tolerant, char **msg)
continue;
if (s->context && ctx != s->context)
continue;
+ if (s->excp && excp != s->excp)
+ continue;
if (msg)
*msg = s->msg;
s->covered = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 61a9668cebfd..d2c611699cd9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -292,10 +292,10 @@ static void print_mce(struct mce *m)
#define PANIC_TIMEOUT 5 /* 5 seconds */
-static atomic_t mce_paniced;
+static atomic_t mce_panicked;
static int fake_panic;
-static atomic_t mce_fake_paniced;
+static atomic_t mce_fake_panicked;
/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
@@ -319,7 +319,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
/*
* Make sure only one CPU runs in machine check panic
*/
- if (atomic_inc_return(&mce_paniced) > 1)
+ if (atomic_inc_return(&mce_panicked) > 1)
wait_for_panic();
barrier();
@@ -327,7 +327,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
console_verbose();
} else {
/* Don't log too much for fake panic */
- if (atomic_inc_return(&mce_fake_paniced) > 1)
+ if (atomic_inc_return(&mce_fake_panicked) > 1)
return;
}
/* First print corrected ones that are still unlogged */
@@ -575,6 +575,37 @@ static void mce_read_aux(struct mce *m, int i)
}
}
+static bool memory_error(struct mce *m)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ /*
+ * coming soon
+ */
+ return false;
+ } else if (c->x86_vendor == X86_VENDOR_INTEL) {
+ /*
+ * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
+ *
+ * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
+ * indicating a memory error. Bit 8 is used for indicating a
+ * cache hierarchy error. The combination of bit 2 and bit 3
+ * is used for indicating a `generic' cache hierarchy error
+ * But we can't just blindly check the above bits, because if
+ * bit 11 is set, then it is a bus/interconnect error - and
+ * either way the above bits just gives more detail on what
+ * bus/interconnect error happened. Note that bit 12 can be
+ * ignored, as it's the "filter" bit.
+ */
+ return (m->status & 0xef80) == BIT(7) ||
+ (m->status & 0xef00) == BIT(8) ||
+ (m->status & 0xeffc) == 0xc;
+ }
+
+ return false;
+}
+
DEFINE_PER_CPU(unsigned, mce_poll_count);
/*
@@ -595,6 +626,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
struct mce m;
+ int severity;
int i;
this_cpu_inc(mce_poll_count);
@@ -630,6 +662,20 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
if (!(flags & MCP_TIMESTAMP))
m.tsc = 0;
+
+ severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
+
+ /*
+ * In the cases where we don't have a valid address after all,
+ * do not add it into the ring buffer.
+ */
+ if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
+ if (m.status & MCI_STATUS_ADDRV) {
+ mce_ring_add(m.addr >> PAGE_SHIFT);
+ mce_schedule_work();
+ }
+ }
+
/*
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
@@ -668,7 +714,8 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
if (quirk_no_way_out)
quirk_no_way_out(i, m, regs);
}
- if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY)
+ if (mce_severity(m, mca_cfg.tolerant, msg, true) >=
+ MCE_PANIC_SEVERITY)
ret = 1;
}
return ret;
@@ -697,7 +744,7 @@ static int mce_timed_out(u64 *t)
* might have been modified by someone else.
*/
rmb();
- if (atomic_read(&mce_paniced))
+ if (atomic_read(&mce_panicked))
wait_for_panic();
if (!mca_cfg.monarch_timeout)
goto out;
@@ -754,7 +801,7 @@ static void mce_reign(void)
for_each_possible_cpu(cpu) {
int severity = mce_severity(&per_cpu(mces_seen, cpu),
mca_cfg.tolerant,
- &nmsg);
+ &nmsg, true);
if (severity > global_worst) {
msg = nmsg;
global_worst = severity;
@@ -1095,13 +1142,14 @@ void do_machine_check(struct pt_regs *regs, long error_code)
*/
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
- severity = mce_severity(&m, cfg->tolerant, NULL);
+ severity = mce_severity(&m, cfg->tolerant, NULL, true);
/*
- * When machine check was for corrected handler don't touch,
- * unless we're panicing.
+ * When machine check was for corrected/deferred handler don't
+ * touch, unless we're panicing.
*/
- if (severity == MCE_KEEP_SEVERITY && !no_way_out)
+ if ((severity == MCE_KEEP_SEVERITY ||
+ severity == MCE_UCNA_SEVERITY) && !no_way_out)
continue;
__set_bit(i, toclear);
if (severity == MCE_NO_SEVERITY) {
@@ -2520,7 +2568,7 @@ struct dentry *mce_get_debugfs_dir(void)
static void mce_reset(void)
{
cpu_missing = 0;
- atomic_set(&mce_fake_paniced, 0);
+ atomic_set(&mce_fake_panicked, 0);
atomic_set(&mce_executing, 0);
atomic_set(&mce_callin, 0);
atomic_set(&global_nwo, 0);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 5d4999f95aec..f1c3769bbd64 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -212,12 +212,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
unsigned int cpu = smp_processor_id();
u32 low = 0, high = 0, address = 0;
unsigned int bank, block;
- int offset = -1;
+ int offset = -1, new;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
for (block = 0; block < NR_BLOCKS; ++block) {
if (block == 0)
- address = MSR_IA32_MC0_MISC + bank * 4;
+ address = MSR_IA32_MCx_MISC(bank);
else if (block == 1) {
address = (low & MASK_BLKPTR_LO) >> 21;
if (!address)
@@ -247,13 +247,18 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
b.address = address;
b.interrupt_capable = lvt_interrupt_supported(bank, high);
- if (b.interrupt_capable) {
- int new = (high & MASK_LVTOFF_HI) >> 20;
- offset = setup_APIC_mce(offset, new);
- }
+ if (!b.interrupt_capable)
+ goto init;
+
+ new = (high & MASK_LVTOFF_HI) >> 20;
+ offset = setup_APIC_mce(offset, new);
+
+ if ((offset == new) &&
+ (mce_threshold_vector != amd_threshold_interrupt))
+ mce_threshold_vector = amd_threshold_interrupt;
+init:
mce_threshold_block_init(&b, offset);
- mce_threshold_vector = amd_threshold_interrupt;
}
}
}
@@ -270,18 +275,17 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
static void amd_threshold_interrupt(void)
{
u32 low = 0, high = 0, address = 0;
+ int cpu = smp_processor_id();
unsigned int bank, block;
struct mce m;
- mce_setup(&m);
-
/* assume first bank caused it */
for (bank = 0; bank < mca_cfg.banks; ++bank) {
- if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
+ if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
for (block = 0; block < NR_BLOCKS; ++block) {
if (block == 0) {
- address = MSR_IA32_MC0_MISC + bank * 4;
+ address = MSR_IA32_MCx_MISC(bank);
} else if (block == 1) {
address = (low & MASK_BLKPTR_LO) >> 21;
if (!address)
@@ -309,21 +313,20 @@ static void amd_threshold_interrupt(void)
* Log the machine check that caused the threshold
* event.
*/
- machine_check_poll(MCP_TIMESTAMP,
- this_cpu_ptr(&mce_poll_banks));
-
- if (high & MASK_OVERFLOW_HI) {
- rdmsrl(address, m.misc);
- rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
- m.status);
- m.bank = K8_MCE_THRESHOLD_BASE
- + bank * NR_BLOCKS
- + block;
- mce_log(&m);
- return;
- }
+ if (high & MASK_OVERFLOW_HI)
+ goto log;
}
}
+ return;
+
+log:
+ mce_setup(&m);
+ rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status);
+ m.misc = ((u64)high << 32) | low;
+ m.bank = bank;
+ mce_log(&m);
+
+ wrmsrl(MSR_IA32_MCx_STATUS(bank), 0);
}
/*
@@ -617,8 +620,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
}
}
- err = allocate_threshold_blocks(cpu, bank, 0,
- MSR_IA32_MC0_MISC + bank * 4);
+ err = allocate_threshold_blocks(cpu, bank, 0, MSR_IA32_MCx_MISC(bank));
if (!err)
goto out;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 8fffd845e22b..bfbbe6195e2d 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -376,7 +376,7 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
return UCODE_OK;
}
-enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
+enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
{
enum ucode_state ret;
@@ -390,8 +390,8 @@ enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32)
/* save BSP's matching patch for early load */
- if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) {
- struct ucode_patch *p = find_patch(smp_processor_id());
+ if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) {
+ struct ucode_patch *p = find_patch(cpu);
if (p) {
memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
@@ -444,7 +444,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
goto fw_release;
}
- ret = load_microcode_amd(c->x86, fw->data, fw->size);
+ ret = load_microcode_amd(cpu, c->x86, fw->data, fw->size);
fw_release:
release_firmware(fw);
diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c
index 7aa1acc79789..737737edbd1e 100644
--- a/arch/x86/kernel/cpu/microcode/amd_early.c
+++ b/arch/x86/kernel/cpu/microcode/amd_early.c
@@ -108,12 +108,13 @@ static size_t compute_container_size(u8 *data, u32 total_size)
* load_microcode_amd() to save equivalent cpu table and microcode patches in
* kernel heap memory.
*/
-static void apply_ucode_in_initrd(void *ucode, size_t size)
+static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
{
struct equiv_cpu_entry *eq;
size_t *cont_sz;
u32 *header;
u8 *data, **cont;
+ u8 (*patch)[PATCH_MAX_SIZE];
u16 eq_id = 0;
int offset, left;
u32 rev, eax, ebx, ecx, edx;
@@ -123,10 +124,12 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
cont_sz = (size_t *)__pa_nodebug(&container_size);
cont = (u8 **)__pa_nodebug(&container);
+ patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
#else
new_rev = &ucode_new_rev;
cont_sz = &container_size;
cont = &container;
+ patch = &amd_ucode_patch;
#endif
data = ucode;
@@ -213,9 +216,9 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
rev = mc->hdr.patch_id;
*new_rev = rev;
- /* save ucode patch */
- memcpy(amd_ucode_patch, mc,
- min_t(u32, header[1], PATCH_MAX_SIZE));
+ if (save_patch)
+ memcpy(patch, mc,
+ min_t(u32, header[1], PATCH_MAX_SIZE));
}
}
@@ -246,7 +249,7 @@ void __init load_ucode_amd_bsp(void)
*data = cp.data;
*size = cp.size;
- apply_ucode_in_initrd(cp.data, cp.size);
+ apply_ucode_in_initrd(cp.data, cp.size, true);
}
#ifdef CONFIG_X86_32
@@ -263,7 +266,7 @@ void load_ucode_amd_ap(void)
size_t *usize;
void **ucode;
- mc = (struct microcode_amd *)__pa(amd_ucode_patch);
+ mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
__apply_microcode_amd(mc);
return;
@@ -275,7 +278,7 @@ void load_ucode_amd_ap(void)
if (!*ucode || !*usize)
return;
- apply_ucode_in_initrd(*ucode, *usize);
+ apply_ucode_in_initrd(*ucode, *usize, false);
}
static void __init collect_cpu_sig_on_bsp(void *arg)
@@ -339,7 +342,7 @@ void load_ucode_amd_ap(void)
* AP has a different equivalence ID than BSP, looks like
* mixed-steppings silicon so go through the ucode blob anew.
*/
- apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size);
+ apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false);
}
}
#endif
@@ -347,7 +350,9 @@ void load_ucode_amd_ap(void)
int __init save_microcode_in_initrd_amd(void)
{
unsigned long cont;
+ int retval = 0;
enum ucode_state ret;
+ u8 *cont_va;
u32 eax;
if (!container)
@@ -355,13 +360,15 @@ int __init save_microcode_in_initrd_amd(void)
#ifdef CONFIG_X86_32
get_bsp_sig();
- cont = (unsigned long)container;
+ cont = (unsigned long)container;
+ cont_va = __va(container);
#else
/*
* We need the physical address of the container for both bitness since
* boot_params.hdr.ramdisk_image is a physical address.
*/
- cont = __pa(container);
+ cont = __pa(container);
+ cont_va = container;
#endif
/*
@@ -372,6 +379,8 @@ int __init save_microcode_in_initrd_amd(void)
if (relocated_ramdisk)
container = (u8 *)(__va(relocated_ramdisk) +
(cont - boot_params.hdr.ramdisk_image));
+ else
+ container = cont_va;
if (ucode_new_rev)
pr_info("microcode: updated early to new patch_level=0x%08x\n",
@@ -380,9 +389,9 @@ int __init save_microcode_in_initrd_amd(void)
eax = cpuid_eax(0x00000001);
eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
- ret = load_microcode_amd(eax, container, container_size);
+ ret = load_microcode_amd(smp_processor_id(), eax, container, container_size);
if (ret != UCODE_OK)
- return -EINVAL;
+ retval = -EINVAL;
/*
* This will be freed any msec now, stash patches for the current
@@ -391,5 +400,23 @@ int __init save_microcode_in_initrd_amd(void)
container = NULL;
container_size = 0;
- return 0;
+ return retval;
+}
+
+void reload_ucode_amd(void)
+{
+ struct microcode_amd *mc;
+ u32 rev, eax;
+
+ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
+
+ mc = (struct microcode_amd *)amd_ucode_patch;
+
+ if (mc && rev < mc->hdr.patch_id) {
+ if (!__apply_microcode_amd(mc)) {
+ ucode_new_rev = mc->hdr.patch_id;
+ pr_info("microcode: reload patch_level=0x%08x\n",
+ ucode_new_rev);
+ }
+ }
}
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index dd9d6190b08d..15c29096136b 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -465,6 +465,8 @@ static void mc_bp_resume(void)
if (uci->valid && uci->mc)
microcode_ops->apply_microcode(cpu);
+ else if (!uci->mc)
+ reload_early_microcode();
}
static struct syscore_ops mc_syscore_ops = {
@@ -549,7 +551,7 @@ static int __init microcode_init(void)
struct cpuinfo_x86 *c = &cpu_data(0);
int error;
- if (dis_ucode_ldr)
+ if (paravirt_enabled() || dis_ucode_ldr)
return 0;
if (c->x86_vendor == X86_VENDOR_INTEL)
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index 5f28a64e71ea..d45df4bd16ab 100644
--- a/arch/x86/kernel/cpu/microcode/core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
@@ -124,7 +124,7 @@ void __init load_ucode_bsp(void)
static bool check_loader_disabled_ap(void)
{
#ifdef CONFIG_X86_32
- return __pa_nodebug(dis_ucode_ldr);
+ return *((bool *)__pa_nodebug(&dis_ucode_ldr));
#else
return dis_ucode_ldr;
#endif
@@ -176,3 +176,24 @@ int __init save_microcode_in_initrd(void)
return 0;
}
+
+void reload_early_microcode(void)
+{
+ int vendor, x86;
+
+ vendor = x86_vendor();
+ x86 = x86_family();
+
+ switch (vendor) {
+ case X86_VENDOR_INTEL:
+ if (x86 >= 6)
+ reload_ucode_intel();
+ break;
+ case X86_VENDOR_AMD:
+ if (x86 >= 0x10)
+ reload_ucode_amd();
+ break;
+ default:
+ break;
+ }
+}
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index b88343f7a3b3..ec9df6f9cd47 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -650,8 +650,7 @@ static inline void print_ucode(struct ucode_cpu_info *uci)
}
#endif
-static int apply_microcode_early(struct mc_saved_data *mc_saved_data,
- struct ucode_cpu_info *uci)
+static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
{
struct microcode_intel *mc_intel;
unsigned int val[2];
@@ -680,7 +679,10 @@ static int apply_microcode_early(struct mc_saved_data *mc_saved_data,
#endif
uci->cpu_sig.rev = val[1];
- print_ucode(uci);
+ if (early)
+ print_ucode(uci);
+ else
+ print_ucode_info(uci, mc_intel->hdr.date);
return 0;
}
@@ -715,12 +717,17 @@ _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
unsigned long initrd_end_early,
struct ucode_cpu_info *uci)
{
+ enum ucode_state ret;
+
collect_cpu_info_early(uci);
scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data,
mc_saved_in_initrd, uci);
- load_microcode(mc_saved_data, mc_saved_in_initrd,
- initrd_start_early, uci);
- apply_microcode_early(mc_saved_data, uci);
+
+ ret = load_microcode(mc_saved_data, mc_saved_in_initrd,
+ initrd_start_early, uci);
+
+ if (ret == UCODE_OK)
+ apply_microcode_early(uci, true);
}
void __init
@@ -749,7 +756,8 @@ load_ucode_intel_bsp(void)
initrd_end_early = initrd_start_early + ramdisk_size;
_load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd,
- initrd_start_early, initrd_end_early, &uci);
+ initrd_start_early, initrd_end_early,
+ &uci);
#endif
}
@@ -783,5 +791,23 @@ void load_ucode_intel_ap(void)
collect_cpu_info_early(&uci);
load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
initrd_start_addr, &uci);
- apply_microcode_early(mc_saved_data_p, &uci);
+ apply_microcode_early(&uci, true);
+}
+
+void reload_ucode_intel(void)
+{
+ struct ucode_cpu_info uci;
+ enum ucode_state ret;
+
+ if (!mc_saved_data.mc_saved_count)
+ return;
+
+ collect_cpu_info_early(&uci);
+
+ ret = generic_load_microcode_early(mc_saved_data.mc_saved,
+ mc_saved_data.mc_saved_count, &uci);
+ if (ret != UCODE_OK)
+ return;
+
+ apply_microcode_early(&uci, false);
}
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index fc5eb390b368..4e6cdb0ddc70 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -253,6 +253,10 @@ struct cpu_hw_events {
#define INTEL_UEVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
+/* Like UEVENT_CONSTRAINT, but match flags too */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+
#define INTEL_PLD_CONSTRAINT(c, n) \
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index cbb1be3ed9e4..a61f5c6911da 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -565,6 +565,21 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
perf_ibs->offset_max,
offset + 1);
} while (offset < offset_max);
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ /*
+ * Read IbsBrTarget and IbsOpData4 separately
+ * depending on their availability.
+ * Can't add to offset_max as they are staggered
+ */
+ if (ibs_caps & IBS_CAPS_BRNTRGT) {
+ rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++);
+ size++;
+ }
+ if (ibs_caps & IBS_CAPS_OPDATA4) {
+ rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++);
+ size++;
+ }
+ }
ibs_data.size = sizeof(u64) * size;
regs = *iregs;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 46211bcc813e..3c895d480cd7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -552,18 +552,18 @@ int intel_pmu_drain_bts_buffer(void)
* PEBS
*/
struct event_constraint intel_core2_pebs_event_constraints[] = {
- INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
- INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
- INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
- INTEL_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
- INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
EVENT_CONSTRAINT_END
};
struct event_constraint intel_atom_pebs_event_constraints[] = {
- INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
- INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
- INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
EVENT_CONSTRAINT_END
};
@@ -577,36 +577,36 @@ struct event_constraint intel_slm_pebs_event_constraints[] = {
struct event_constraint intel_nehalem_pebs_event_constraints[] = {
INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
- INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */
INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
- INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
- INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
EVENT_CONSTRAINT_END
};
struct event_constraint intel_westmere_pebs_event_constraints[] = {
INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
- INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
- INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
EVENT_CONSTRAINT_END
};
struct event_constraint intel_snb_pebs_event_constraints[] = {
- INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
@@ -617,7 +617,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {
};
struct event_constraint intel_ivb_pebs_event_constraints[] = {
- INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
@@ -628,7 +628,7 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = {
};
struct event_constraint intel_hsw_pebs_event_constraints[] = {
- INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
@@ -724,6 +724,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
unsigned long ip = regs->ip;
int is_64bit = 0;
void *kaddr;
+ int size;
/*
* We don't need to fixup if the PEBS assist is fault like
@@ -758,11 +759,12 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
return 1;
}
+ size = ip - to;
if (!kernel_ip(ip)) {
- int size, bytes;
+ int bytes;
u8 *buf = this_cpu_read(insn_buffer);
- size = ip - to; /* Must fit our buffer, see above */
+ /* 'size' must fit our buffer, see above */
bytes = copy_from_user_nmi(buf, (void __user *)to, size);
if (bytes != 0)
return 0;
@@ -780,11 +782,20 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
#ifdef CONFIG_X86_64
is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
#endif
- insn_init(&insn, kaddr, is_64bit);
+ insn_init(&insn, kaddr, size, is_64bit);
insn_get_length(&insn);
+ /*
+ * Make sure there was not a problem decoding the
+ * instruction and getting the length. This is
+ * doubly important because we have an infinite
+ * loop if insn.length=0.
+ */
+ if (!insn.length)
+ break;
to += insn.length;
kaddr += insn.length;
+ size -= insn.length;
} while (to < ip);
if (to == ip) {
@@ -886,6 +897,29 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
regs.bp = pebs->bp;
regs.sp = pebs->sp;
+ if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ regs.ax = pebs->ax;
+ regs.bx = pebs->bx;
+ regs.cx = pebs->cx;
+ regs.dx = pebs->dx;
+ regs.si = pebs->si;
+ regs.di = pebs->di;
+ regs.bp = pebs->bp;
+ regs.sp = pebs->sp;
+
+ regs.flags = pebs->flags;
+#ifndef CONFIG_X86_32
+ regs.r8 = pebs->r8;
+ regs.r9 = pebs->r9;
+ regs.r10 = pebs->r10;
+ regs.r11 = pebs->r11;
+ regs.r12 = pebs->r12;
+ regs.r13 = pebs->r13;
+ regs.r14 = pebs->r14;
+ regs.r15 = pebs->r15;
+#endif
+ }
+
if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) {
regs.ip = pebs->real_ip;
regs.flags |= PERF_EFLAGS_EXACT;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 45fa730a5283..58f1a94beaf0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -465,7 +465,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
{
struct insn insn;
void *addr;
- int bytes, size = MAX_INSN_SIZE;
+ int bytes_read, bytes_left;
int ret = X86_BR_NONE;
int ext, to_plm, from_plm;
u8 buf[MAX_INSN_SIZE];
@@ -493,8 +493,10 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
return X86_BR_NONE;
/* may fail if text not present */
- bytes = copy_from_user_nmi(buf, (void __user *)from, size);
- if (bytes != 0)
+ bytes_left = copy_from_user_nmi(buf, (void __user *)from,
+ MAX_INSN_SIZE);
+ bytes_read = MAX_INSN_SIZE - bytes_left;
+ if (!bytes_read)
return X86_BR_NONE;
addr = buf;
@@ -505,10 +507,19 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
* Ensure we don't blindy read any address by validating it is
* a known text address.
*/
- if (kernel_text_address(from))
+ if (kernel_text_address(from)) {
addr = (void *)from;
- else
+ /*
+ * Assume we can get the maximum possible size
+ * when grabbing kernel data. This is not
+ * _strictly_ true since we could possibly be
+ * executing up next to a memory hole, but
+ * it is very unlikely to be a problem.
+ */
+ bytes_read = MAX_INSN_SIZE;
+ } else {
return X86_BR_NONE;
+ }
}
/*
@@ -518,8 +529,10 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
#ifdef CONFIG_X86_64
is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
#endif
- insn_init(&insn, addr, is64);
+ insn_init(&insn, addr, bytes_read, is64);
insn_get_opcode(&insn);
+ if (!insn.opcode.got)
+ return X86_BR_ABORT;
switch (insn.opcode.bytes[0]) {
case 0xf:
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index adf138eac85c..745b158e9a65 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -449,7 +449,11 @@ static struct attribute *snbep_uncore_qpi_formats_attr[] = {
static struct uncore_event_desc snbep_uncore_imc_events[] = {
INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
{ /* end: all zeroes */ },
};
@@ -486,14 +490,17 @@ static struct attribute_group snbep_uncore_qpi_format_group = {
.attrs = snbep_uncore_qpi_formats_attr,
};
-#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \
- .init_box = snbep_uncore_msr_init_box, \
+#define __SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \
.disable_box = snbep_uncore_msr_disable_box, \
.enable_box = snbep_uncore_msr_enable_box, \
.disable_event = snbep_uncore_msr_disable_event, \
.enable_event = snbep_uncore_msr_enable_event, \
.read_counter = uncore_msr_read_counter
+#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \
+ __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), \
+ .init_box = snbep_uncore_msr_init_box \
+
static struct intel_uncore_ops snbep_uncore_msr_ops = {
SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
};
@@ -1919,6 +1926,30 @@ static struct intel_uncore_type hswep_uncore_cbox = {
.format_group = &hswep_uncore_cbox_format_group,
};
+/*
+ * Write SBOX Initialization register bit by bit to avoid spurious #GPs
+ */
+static void hswep_uncore_sbox_msr_init_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+
+ if (msr) {
+ u64 init = SNBEP_PMON_BOX_CTL_INT;
+ u64 flags = 0;
+ int i;
+
+ for_each_set_bit(i, (unsigned long *)&init, 64) {
+ flags |= (1ULL << i);
+ wrmsrl(msr, flags);
+ }
+ }
+}
+
+static struct intel_uncore_ops hswep_uncore_sbox_msr_ops = {
+ __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .init_box = hswep_uncore_sbox_msr_init_box
+};
+
static struct attribute *hswep_uncore_sbox_formats_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
@@ -1944,7 +1975,7 @@ static struct intel_uncore_type hswep_uncore_sbox = {
.event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
.box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL,
.msr_offset = HSWEP_SBOX_MSR_OFFSET,
- .ops = &snbep_uncore_msr_ops,
+ .ops = &hswep_uncore_sbox_msr_ops,
.format_group = &hswep_uncore_sbox_format_group,
};
@@ -2009,7 +2040,11 @@ static struct intel_uncore_type hswep_uncore_ha = {
static struct uncore_event_desc hswep_uncore_imc_events[] = {
INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x00,umask=0x00"),
INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
{ /* end: all zeroes */ },
};
@@ -2025,13 +2060,27 @@ static struct intel_uncore_type hswep_uncore_imc = {
SNBEP_UNCORE_PCI_COMMON_INIT(),
};
+static unsigned hswep_uncore_irp_ctrs[] = {0xa0, 0xa8, 0xb0, 0xb8};
+
+static u64 hswep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 count = 0;
+
+ pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
+ pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
+
+ return count;
+}
+
static struct intel_uncore_ops hswep_uncore_irp_ops = {
.init_box = snbep_uncore_pci_init_box,
.disable_box = snbep_uncore_pci_disable_box,
.enable_box = snbep_uncore_pci_enable_box,
.disable_event = ivbep_uncore_irp_disable_event,
.enable_event = ivbep_uncore_irp_enable_event,
- .read_counter = ivbep_uncore_irp_read_counter,
+ .read_counter = hswep_uncore_irp_read_counter,
};
static struct intel_uncore_type hswep_uncore_irp = {
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 5433658e598d..e7d8c7608471 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -72,7 +72,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
if (c->x86_mask || c->cpuid_level >= 0)
seq_printf(m, "stepping\t: %d\n", c->x86_mask);
else
- seq_printf(m, "stepping\t: unknown\n");
+ seq_puts(m, "stepping\t: unknown\n");
if (c->microcode)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
@@ -92,12 +92,12 @@ static int show_cpuinfo(struct seq_file *m, void *v)
show_cpuinfo_core(m, c, cpu);
show_cpuinfo_misc(m, c);
- seq_printf(m, "flags\t\t:");
+ seq_puts(m, "flags\t\t:");
for (i = 0; i < 32*NCAPINTS; i++)
if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
seq_printf(m, " %s", x86_cap_flags[i]);
- seq_printf(m, "\nbugs\t\t:");
+ seq_puts(m, "\nbugs\t\t:");
for (i = 0; i < 32*NBUGINTS; i++) {
unsigned int bug_bit = 32*NCAPINTS + i;
@@ -118,7 +118,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
c->x86_phys_bits, c->x86_virt_bits);
- seq_printf(m, "power management:");
+ seq_puts(m, "power management:");
for (i = 0; i < 32; i++) {
if (c->x86_power & (1 << i)) {
if (i < ARRAY_SIZE(x86_power_flags) &&
@@ -131,7 +131,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
}
}
- seq_printf(m, "\n\n");
+ seq_puts(m, "\n\n");
return 0;
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 4a8013d55947..60639093d536 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -36,6 +36,11 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{ X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
{ X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
{ X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
+ { X86_FEATURE_HWP, CR_EAX, 7, 0x00000006, 0 },
+ { X86_FEATURE_HWP_NOITFY, CR_EAX, 8, 0x00000006, 0 },
+ { X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 },
+ { X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 },
+ { X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 },
{ X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 3225ae6c5180..83741a71558f 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -143,7 +143,7 @@ static int cpuid_device_create(int cpu)
dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL,
"cpu%d", cpu);
- return IS_ERR(dev) ? PTR_ERR(dev) : 0;
+ return PTR_ERR_OR_ZERO(dev);
}
static void cpuid_device_destroy(int cpu)
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 1abcb50b48ae..ff86f19b5758 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -24,7 +24,6 @@ static char x86_stack_ids[][8] = {
[ DEBUG_STACK-1 ] = "#DB",
[ NMI_STACK-1 ] = "NMI",
[ DOUBLEFAULT_STACK-1 ] = "#DF",
- [ STACKFAULT_STACK-1 ] = "#SS",
[ MCE_STACK-1 ] = "#MC",
#if DEBUG_STKSZ > EXCEPTION_STKSZ
[ N_EXCEPTION_STACKS ...
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index df088bb03fb3..c0226ab54106 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -828,9 +828,15 @@ ENTRY(native_iret)
jnz native_irq_return_ldt
#endif
+.global native_irq_return_iret
native_irq_return_iret:
+ /*
+ * This may fault. Non-paranoid faults on return to userspace are
+ * handled by fixup_bad_iret. These include #SS, #GP, and #NP.
+ * Double-faults due to espfix64 are handled in do_double_fault.
+ * Other faults here are fatal.
+ */
iretq
- _ASM_EXTABLE(native_irq_return_iret, bad_iret)
#ifdef CONFIG_X86_ESPFIX64
native_irq_return_ldt:
@@ -858,25 +864,6 @@ native_irq_return_ldt:
jmp native_irq_return_iret
#endif
- .section .fixup,"ax"
-bad_iret:
- /*
- * The iret traps when the %cs or %ss being restored is bogus.
- * We've lost the original trap vector and error code.
- * #GPF is the most likely one to get for an invalid selector.
- * So pretend we completed the iret and took the #GPF in user mode.
- *
- * We are now running with the kernel GS after exception recovery.
- * But error_entry expects us to have user GS to match the user %cs,
- * so swap back.
- */
- pushq $0
-
- SWAPGS
- jmp general_protection
-
- .previous
-
/* edi: workmask, edx: work */
retint_careful:
CFI_RESTORE_STATE
@@ -922,37 +909,6 @@ ENTRY(retint_kernel)
CFI_ENDPROC
END(common_interrupt)
- /*
- * If IRET takes a fault on the espfix stack, then we
- * end up promoting it to a doublefault. In that case,
- * modify the stack to make it look like we just entered
- * the #GP handler from user space, similar to bad_iret.
- */
-#ifdef CONFIG_X86_ESPFIX64
- ALIGN
-__do_double_fault:
- XCPT_FRAME 1 RDI+8
- movq RSP(%rdi),%rax /* Trap on the espfix stack? */
- sarq $PGDIR_SHIFT,%rax
- cmpl $ESPFIX_PGD_ENTRY,%eax
- jne do_double_fault /* No, just deliver the fault */
- cmpl $__KERNEL_CS,CS(%rdi)
- jne do_double_fault
- movq RIP(%rdi),%rax
- cmpq $native_irq_return_iret,%rax
- jne do_double_fault /* This shouldn't happen... */
- movq PER_CPU_VAR(kernel_stack),%rax
- subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */
- movq %rax,RSP(%rdi)
- movq $0,(%rax) /* Missing (lost) #GP error code */
- movq $general_protection,RIP(%rdi)
- retq
- CFI_ENDPROC
-END(__do_double_fault)
-#else
-# define __do_double_fault do_double_fault
-#endif
-
/*
* APIC interrupts.
*/
@@ -1124,7 +1080,7 @@ idtentry overflow do_overflow has_error_code=0
idtentry bounds do_bounds has_error_code=0
idtentry invalid_op do_invalid_op has_error_code=0
idtentry device_not_available do_device_not_available has_error_code=0
-idtentry double_fault __do_double_fault has_error_code=1 paranoid=1
+idtentry double_fault do_double_fault has_error_code=1 paranoid=1
idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
idtentry invalid_TSS do_invalid_TSS has_error_code=1
idtentry segment_not_present do_segment_not_present has_error_code=1
@@ -1289,7 +1245,7 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
-idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1
+idtentry stack_segment do_stack_segment has_error_code=1
#ifdef CONFIG_XEN
idtentry xen_debug do_debug has_error_code=0
idtentry xen_int3 do_int3 has_error_code=0
@@ -1399,17 +1355,16 @@ error_sti:
/*
* There are two places in the kernel that can potentially fault with
- * usergs. Handle them here. The exception handlers after iret run with
- * kernel gs again, so don't set the user space flag. B stepping K8s
- * sometimes report an truncated RIP for IRET exceptions returning to
- * compat mode. Check for these here too.
+ * usergs. Handle them here. B stepping K8s sometimes report a
+ * truncated RIP for IRET exceptions returning to compat mode. Check
+ * for these here too.
*/
error_kernelspace:
CFI_REL_OFFSET rcx, RCX+8
incl %ebx
leaq native_irq_return_iret(%rip),%rcx
cmpq %rcx,RIP+8(%rsp)
- je error_swapgs
+ je error_bad_iret
movl %ecx,%eax /* zero extend */
cmpq %rax,RIP+8(%rsp)
je bstep_iret
@@ -1420,7 +1375,15 @@ error_kernelspace:
bstep_iret:
/* Fix truncated RIP */
movq %rcx,RIP+8(%rsp)
- jmp error_swapgs
+ /* fall through */
+
+error_bad_iret:
+ SWAPGS
+ mov %rsp,%rdi
+ call fixup_bad_iret
+ mov %rax,%rsp
+ decl %ebx /* Return to usergs */
+ jmp error_sti
CFI_ENDPROC
END(error_entry)
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 94d857fb1033..f5d0730e7b08 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -122,9 +122,6 @@ static void init_espfix_random(void)
void __init init_espfix_bsp(void)
{
pgd_t *pgd_p;
- pteval_t ptemask;
-
- ptemask = __supported_pte_mask;
/* Install the espfix pud into the kernel page directory */
pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 3386dc9aa333..2142376dc8c6 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -17,6 +17,7 @@
#include <linux/ftrace.h>
#include <linux/percpu.h>
#include <linux/sched.h>
+#include <linux/slab.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/module.h>
@@ -47,7 +48,7 @@ int ftrace_arch_code_modify_post_process(void)
union ftrace_code_union {
char code[MCOUNT_INSN_SIZE];
struct {
- char e8;
+ unsigned char e8;
int offset;
} __attribute__((packed));
};
@@ -582,7 +583,7 @@ void ftrace_replace_code(int enable)
remove_breakpoints:
pr_warn("Failed on %s (%d):\n", report, count);
- ftrace_bug(ret, rec ? rec->ip : 0);
+ ftrace_bug(ret, rec);
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
/*
@@ -644,13 +645,8 @@ int __init ftrace_dyn_arch_init(void)
{
return 0;
}
-#endif
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-
-#ifdef CONFIG_DYNAMIC_FTRACE
-extern void ftrace_graph_call(void);
+#if defined(CONFIG_X86_64) || defined(CONFIG_FUNCTION_GRAPH_TRACER)
static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
{
static union ftrace_code_union calc;
@@ -664,6 +660,280 @@ static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
*/
return calc.code;
}
+#endif
+
+/* Currently only x86_64 supports dynamic trampolines */
+#ifdef CONFIG_X86_64
+
+#ifdef CONFIG_MODULES
+#include <linux/moduleloader.h>
+/* Module allocation simplifies allocating memory for code */
+static inline void *alloc_tramp(unsigned long size)
+{
+ return module_alloc(size);
+}
+static inline void tramp_free(void *tramp)
+{
+ module_free(NULL, tramp);
+}
+#else
+/* Trampolines can only be created if modules are supported */
+static inline void *alloc_tramp(unsigned long size)
+{
+ return NULL;
+}
+static inline void tramp_free(void *tramp) { }
+#endif
+
+/* Defined as markers to the end of the ftrace default trampolines */
+extern void ftrace_caller_end(void);
+extern void ftrace_regs_caller_end(void);
+extern void ftrace_return(void);
+extern void ftrace_caller_op_ptr(void);
+extern void ftrace_regs_caller_op_ptr(void);
+
+/* movq function_trace_op(%rip), %rdx */
+/* 0x48 0x8b 0x15 <offset-to-ftrace_trace_op (4 bytes)> */
+#define OP_REF_SIZE 7
+
+/*
+ * The ftrace_ops is passed to the function callback. Since the
+ * trampoline only services a single ftrace_ops, we can pass in
+ * that ops directly.
+ *
+ * The ftrace_op_code_union is used to create a pointer to the
+ * ftrace_ops that will be passed to the callback function.
+ */
+union ftrace_op_code_union {
+ char code[OP_REF_SIZE];
+ struct {
+ char op[3];
+ int offset;
+ } __attribute__((packed));
+};
+
+static unsigned long
+create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
+{
+ unsigned const char *jmp;
+ unsigned long start_offset;
+ unsigned long end_offset;
+ unsigned long op_offset;
+ unsigned long offset;
+ unsigned long size;
+ unsigned long ip;
+ unsigned long *ptr;
+ void *trampoline;
+ /* 48 8b 15 <offset> is movq <offset>(%rip), %rdx */
+ unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 };
+ union ftrace_op_code_union op_ptr;
+ int ret;
+
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ start_offset = (unsigned long)ftrace_regs_caller;
+ end_offset = (unsigned long)ftrace_regs_caller_end;
+ op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
+ } else {
+ start_offset = (unsigned long)ftrace_caller;
+ end_offset = (unsigned long)ftrace_caller_end;
+ op_offset = (unsigned long)ftrace_caller_op_ptr;
+ }
+
+ size = end_offset - start_offset;
+
+ /*
+ * Allocate enough size to store the ftrace_caller code,
+ * the jmp to ftrace_return, as well as the address of
+ * the ftrace_ops this trampoline is used for.
+ */
+ trampoline = alloc_tramp(size + MCOUNT_INSN_SIZE + sizeof(void *));
+ if (!trampoline)
+ return 0;
+
+ *tramp_size = size + MCOUNT_INSN_SIZE + sizeof(void *);
+
+ /* Copy ftrace_caller onto the trampoline memory */
+ ret = probe_kernel_read(trampoline, (void *)start_offset, size);
+ if (WARN_ON(ret < 0)) {
+ tramp_free(trampoline);
+ return 0;
+ }
+
+ ip = (unsigned long)trampoline + size;
+
+ /* The trampoline ends with a jmp to ftrace_return */
+ jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_return);
+ memcpy(trampoline + size, jmp, MCOUNT_INSN_SIZE);
+
+ /*
+ * The address of the ftrace_ops that is used for this trampoline
+ * is stored at the end of the trampoline. This will be used to
+ * load the third parameter for the callback. Basically, that
+ * location at the end of the trampoline takes the place of
+ * the global function_trace_op variable.
+ */
+
+ ptr = (unsigned long *)(trampoline + size + MCOUNT_INSN_SIZE);
+ *ptr = (unsigned long)ops;
+
+ op_offset -= start_offset;
+ memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE);
+
+ /* Are we pointing to the reference? */
+ if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0)) {
+ tramp_free(trampoline);
+ return 0;
+ }
+
+ /* Load the contents of ptr into the callback parameter */
+ offset = (unsigned long)ptr;
+ offset -= (unsigned long)trampoline + op_offset + OP_REF_SIZE;
+
+ op_ptr.offset = offset;
+
+ /* put in the new offset to the ftrace_ops */
+ memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
+
+ /* ALLOC_TRAMP flags lets us know we created it */
+ ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
+
+ return (unsigned long)trampoline;
+}
+
+static unsigned long calc_trampoline_call_offset(bool save_regs)
+{
+ unsigned long start_offset;
+ unsigned long call_offset;
+
+ if (save_regs) {
+ start_offset = (unsigned long)ftrace_regs_caller;
+ call_offset = (unsigned long)ftrace_regs_call;
+ } else {
+ start_offset = (unsigned long)ftrace_caller;
+ call_offset = (unsigned long)ftrace_call;
+ }
+
+ return call_offset - start_offset;
+}
+
+void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
+{
+ ftrace_func_t func;
+ unsigned char *new;
+ unsigned long offset;
+ unsigned long ip;
+ unsigned int size;
+ int ret;
+
+ if (ops->trampoline) {
+ /*
+ * The ftrace_ops caller may set up its own trampoline.
+ * In such a case, this code must not modify it.
+ */
+ if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ return;
+ } else {
+ ops->trampoline = create_trampoline(ops, &size);
+ if (!ops->trampoline)
+ return;
+ ops->trampoline_size = size;
+ }
+
+ offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
+ ip = ops->trampoline + offset;
+
+ func = ftrace_ops_get_func(ops);
+
+ /* Do a safe modify in case the trampoline is executing */
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ ret = update_ftrace_func(ip, new);
+
+ /* The update should never fail */
+ WARN_ON(ret);
+}
+
+/* Return the address of the function the trampoline calls */
+static void *addr_from_call(void *ptr)
+{
+ union ftrace_code_union calc;
+ int ret;
+
+ ret = probe_kernel_read(&calc, ptr, MCOUNT_INSN_SIZE);
+ if (WARN_ON_ONCE(ret < 0))
+ return NULL;
+
+ /* Make sure this is a call */
+ if (WARN_ON_ONCE(calc.e8 != 0xe8)) {
+ pr_warn("Expected e8, got %x\n", calc.e8);
+ return NULL;
+ }
+
+ return ptr + MCOUNT_INSN_SIZE + calc.offset;
+}
+
+void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
+ unsigned long frame_pointer);
+
+/*
+ * If the ops->trampoline was not allocated, then it probably
+ * has a static trampoline func, or is the ftrace caller itself.
+ */
+static void *static_tramp_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ unsigned long offset;
+ bool save_regs = rec->flags & FTRACE_FL_REGS_EN;
+ void *ptr;
+
+ if (ops && ops->trampoline) {
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ /*
+ * We only know about function graph tracer setting as static
+ * trampoline.
+ */
+ if (ops->trampoline == FTRACE_GRAPH_ADDR)
+ return (void *)prepare_ftrace_return;
+#endif
+ return NULL;
+ }
+
+ offset = calc_trampoline_call_offset(save_regs);
+
+ if (save_regs)
+ ptr = (void *)FTRACE_REGS_ADDR + offset;
+ else
+ ptr = (void *)FTRACE_ADDR + offset;
+
+ return addr_from_call(ptr);
+}
+
+void *arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ unsigned long offset;
+
+ /* If we didn't allocate this trampoline, consider it static */
+ if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ return static_tramp_func(ops, rec);
+
+ offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
+ return addr_from_call((void *)ops->trampoline + offset);
+}
+
+void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
+{
+ if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ return;
+
+ tramp_free((void *)ops->trampoline);
+ ops->trampoline = 0;
+}
+
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern void ftrace_graph_call(void);
static int ftrace_mod_jmp(unsigned long ip, void *func)
{
@@ -694,7 +964,7 @@ int ftrace_disable_ftrace_graph_caller(void)
* Hook the return address and push it in the stack of return addrs
* in current thread info.
*/
-void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
+void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
unsigned long frame_pointer)
{
unsigned long old;
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 922d28581024..6307a0f0cf17 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -59,78 +59,78 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, "%*s: ", prec, "NMI");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
- seq_printf(p, " Non-maskable interrupts\n");
+ seq_puts(p, " Non-maskable interrupts\n");
#ifdef CONFIG_X86_LOCAL_APIC
seq_printf(p, "%*s: ", prec, "LOC");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
- seq_printf(p, " Local timer interrupts\n");
+ seq_puts(p, " Local timer interrupts\n");
seq_printf(p, "%*s: ", prec, "SPU");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
- seq_printf(p, " Spurious interrupts\n");
+ seq_puts(p, " Spurious interrupts\n");
seq_printf(p, "%*s: ", prec, "PMI");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
- seq_printf(p, " Performance monitoring interrupts\n");
+ seq_puts(p, " Performance monitoring interrupts\n");
seq_printf(p, "%*s: ", prec, "IWI");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
- seq_printf(p, " IRQ work interrupts\n");
+ seq_puts(p, " IRQ work interrupts\n");
seq_printf(p, "%*s: ", prec, "RTR");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
- seq_printf(p, " APIC ICR read retries\n");
+ seq_puts(p, " APIC ICR read retries\n");
#endif
if (x86_platform_ipi_callback) {
seq_printf(p, "%*s: ", prec, "PLT");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
- seq_printf(p, " Platform interrupts\n");
+ seq_puts(p, " Platform interrupts\n");
}
#ifdef CONFIG_SMP
seq_printf(p, "%*s: ", prec, "RES");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
- seq_printf(p, " Rescheduling interrupts\n");
+ seq_puts(p, " Rescheduling interrupts\n");
seq_printf(p, "%*s: ", prec, "CAL");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_call_count -
irq_stats(j)->irq_tlb_count);
- seq_printf(p, " Function call interrupts\n");
+ seq_puts(p, " Function call interrupts\n");
seq_printf(p, "%*s: ", prec, "TLB");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
- seq_printf(p, " TLB shootdowns\n");
+ seq_puts(p, " TLB shootdowns\n");
#endif
#ifdef CONFIG_X86_THERMAL_VECTOR
seq_printf(p, "%*s: ", prec, "TRM");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
- seq_printf(p, " Thermal event interrupts\n");
+ seq_puts(p, " Thermal event interrupts\n");
#endif
#ifdef CONFIG_X86_MCE_THRESHOLD
seq_printf(p, "%*s: ", prec, "THR");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
- seq_printf(p, " Threshold APIC interrupts\n");
+ seq_puts(p, " Threshold APIC interrupts\n");
#endif
#ifdef CONFIG_X86_MCE
seq_printf(p, "%*s: ", prec, "MCE");
for_each_online_cpu(j)
seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
- seq_printf(p, " Machine check exceptions\n");
+ seq_puts(p, " Machine check exceptions\n");
seq_printf(p, "%*s: ", prec, "MCP");
for_each_online_cpu(j)
seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
- seq_printf(p, " Machine check polls\n");
+ seq_puts(p, " Machine check polls\n");
#endif
#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
seq_printf(p, "%*s: ", prec, "THR");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count);
- seq_printf(p, " Hypervisor callback interrupts\n");
+ seq_puts(p, " Hypervisor callback interrupts\n");
#endif
seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 67e6d19ef1be..f7e3cd50ece0 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -285,7 +285,7 @@ static int can_probe(unsigned long paddr)
* normally used, we just go through if there is no kprobe.
*/
__addr = recover_probed_instruction(buf, addr);
- kernel_insn_init(&insn, (void *)__addr);
+ kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE);
insn_get_length(&insn);
/*
@@ -330,8 +330,10 @@ int __copy_instruction(u8 *dest, u8 *src)
{
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
+ unsigned long recovered_insn =
+ recover_probed_instruction(buf, (unsigned long)src);
- kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src));
+ kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
insn_get_length(&insn);
/* Another subsystem puts a breakpoint, failed to recover */
if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
@@ -342,7 +344,7 @@ int __copy_instruction(u8 *dest, u8 *src)
if (insn_rip_relative(&insn)) {
s64 newdisp;
u8 *disp;
- kernel_insn_init(&insn, dest);
+ kernel_insn_init(&insn, dest, insn.length);
insn_get_displacement(&insn);
/*
* The copied instruction uses the %rip-relative addressing
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 717b02a22e67..5f8f0b3cc674 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -27,7 +27,7 @@
static nokprobe_inline
int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
+ struct kprobe_ctlblk *kcb, unsigned long orig_ip)
{
/*
* Emulate singlestep (and also recover regs->ip)
@@ -39,6 +39,8 @@ int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
p->post_handler(p, regs, 0);
}
__this_cpu_write(current_kprobe, NULL);
+ if (orig_ip)
+ regs->ip = orig_ip;
return 1;
}
@@ -46,7 +48,7 @@ int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
struct kprobe_ctlblk *kcb)
{
if (kprobe_ftrace(p))
- return __skip_singlestep(p, regs, kcb);
+ return __skip_singlestep(p, regs, kcb, 0);
else
return 0;
}
@@ -71,13 +73,14 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
if (kprobe_running()) {
kprobes_inc_nmissed_count(p);
} else {
+ unsigned long orig_ip = regs->ip;
/* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */
regs->ip = ip + sizeof(kprobe_opcode_t);
__this_cpu_write(current_kprobe, p);
kcb->kprobe_status = KPROBE_HIT_ACTIVE;
if (!p->pre_handler || !p->pre_handler(p, regs))
- __skip_singlestep(p, regs, kcb);
+ __skip_singlestep(p, regs, kcb, orig_ip);
/*
* If pre_handler returns !0, it sets regs->ip and
* resets current kprobe.
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index f1314d0bcf0a..7c523bbf3dc8 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -251,13 +251,15 @@ static int can_optimize(unsigned long paddr)
/* Decode instructions */
addr = paddr - offset;
while (addr < paddr - offset + size) { /* Decode until function end */
+ unsigned long recovered_insn;
if (search_exception_tables(addr))
/*
* Since some fixup code will jumps into this function,
* we can't optimize kprobe in this function.
*/
return 0;
- kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr));
+ recovered_insn = recover_probed_instruction(buf, addr);
+ kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
insn_get_length(&insn);
/* Another subsystem puts a breakpoint */
if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S
index c73aecf10d34..94ea120fa21f 100644
--- a/arch/x86/kernel/mcount_64.S
+++ b/arch/x86/kernel/mcount_64.S
@@ -21,40 +21,159 @@
# define function_hook mcount
#endif
+/* All cases save the original rbp (8 bytes) */
+#ifdef CONFIG_FRAME_POINTER
+# ifdef CC_USING_FENTRY
+/* Save parent and function stack frames (rip and rbp) */
+# define MCOUNT_FRAME_SIZE (8+16*2)
+# else
+/* Save just function stack frame (rip and rbp) */
+# define MCOUNT_FRAME_SIZE (8+16)
+# endif
+#else
+/* No need to save a stack frame */
+# define MCOUNT_FRAME_SIZE 8
+#endif /* CONFIG_FRAME_POINTER */
+
+/* Size of stack used to save mcount regs in save_mcount_regs */
+#define MCOUNT_REG_SIZE (SS+8 + MCOUNT_FRAME_SIZE)
+
+/*
+ * gcc -pg option adds a call to 'mcount' in most functions.
+ * When -mfentry is used, the call is to 'fentry' and not 'mcount'
+ * and is done before the function's stack frame is set up.
+ * They both require a set of regs to be saved before calling
+ * any C code and restored before returning back to the function.
+ *
+ * On boot up, all these calls are converted into nops. When tracing
+ * is enabled, the call can jump to either ftrace_caller or
+ * ftrace_regs_caller. Callbacks (tracing functions) that require
+ * ftrace_regs_caller (like kprobes) need to have pt_regs passed to
+ * it. For this reason, the size of the pt_regs structure will be
+ * allocated on the stack and the required mcount registers will
+ * be saved in the locations that pt_regs has them in.
+ */
+
+/*
+ * @added: the amount of stack added before calling this
+ *
+ * After this is called, the following registers contain:
+ *
+ * %rdi - holds the address that called the trampoline
+ * %rsi - holds the parent function (traced function's return address)
+ * %rdx - holds the original %rbp
+ */
+.macro save_mcount_regs added=0
+
+ /* Always save the original rbp */
+ pushq %rbp
+
+#ifdef CONFIG_FRAME_POINTER
+ /*
+ * Stack traces will stop at the ftrace trampoline if the frame pointer
+ * is not set up properly. If fentry is used, we need to save a frame
+ * pointer for the parent as well as the function traced, because the
+ * fentry is called before the stack frame is set up, where as mcount
+ * is called afterward.
+ */
+#ifdef CC_USING_FENTRY
+ /* Save the parent pointer (skip orig rbp and our return address) */
+ pushq \added+8*2(%rsp)
+ pushq %rbp
+ movq %rsp, %rbp
+ /* Save the return address (now skip orig rbp, rbp and parent) */
+ pushq \added+8*3(%rsp)
+#else
+ /* Can't assume that rip is before this (unless added was zero) */
+ pushq \added+8(%rsp)
+#endif
+ pushq %rbp
+ movq %rsp, %rbp
+#endif /* CONFIG_FRAME_POINTER */
+
+ /*
+ * We add enough stack to save all regs.
+ */
+ subq $(MCOUNT_REG_SIZE - MCOUNT_FRAME_SIZE), %rsp
+ movq %rax, RAX(%rsp)
+ movq %rcx, RCX(%rsp)
+ movq %rdx, RDX(%rsp)
+ movq %rsi, RSI(%rsp)
+ movq %rdi, RDI(%rsp)
+ movq %r8, R8(%rsp)
+ movq %r9, R9(%rsp)
+ /*
+ * Save the original RBP. Even though the mcount ABI does not
+ * require this, it helps out callers.
+ */
+ movq MCOUNT_REG_SIZE-8(%rsp), %rdx
+ movq %rdx, RBP(%rsp)
+
+ /* Copy the parent address into %rsi (second parameter) */
+#ifdef CC_USING_FENTRY
+ movq MCOUNT_REG_SIZE+8+\added(%rsp), %rsi
+#else
+ /* %rdx contains original %rbp */
+ movq 8(%rdx), %rsi
+#endif
+
+ /* Move RIP to its proper location */
+ movq MCOUNT_REG_SIZE+\added(%rsp), %rdi
+ movq %rdi, RIP(%rsp)
+
+ /*
+ * Now %rdi (the first parameter) has the return address of
+ * where ftrace_call returns. But the callbacks expect the
+ * address of the call itself.
+ */
+ subq $MCOUNT_INSN_SIZE, %rdi
+ .endm
+
+.macro restore_mcount_regs
+ movq R9(%rsp), %r9
+ movq R8(%rsp), %r8
+ movq RDI(%rsp), %rdi
+ movq RSI(%rsp), %rsi
+ movq RDX(%rsp), %rdx
+ movq RCX(%rsp), %rcx
+ movq RAX(%rsp), %rax
+
+ /* ftrace_regs_caller can modify %rbp */
+ movq RBP(%rsp), %rbp
+
+ addq $MCOUNT_REG_SIZE, %rsp
+
+ .endm
+
#ifdef CONFIG_DYNAMIC_FTRACE
ENTRY(function_hook)
retq
END(function_hook)
-/* skip is set if stack has been adjusted */
-.macro ftrace_caller_setup skip=0
- MCOUNT_SAVE_FRAME \skip
+ENTRY(ftrace_caller)
+ /* save_mcount_regs fills in first two parameters */
+ save_mcount_regs
+GLOBAL(ftrace_caller_op_ptr)
/* Load the ftrace_ops into the 3rd parameter */
movq function_trace_op(%rip), %rdx
- /* Load ip into the first parameter */
- movq RIP(%rsp), %rdi
- subq $MCOUNT_INSN_SIZE, %rdi
- /* Load the parent_ip into the second parameter */
-#ifdef CC_USING_FENTRY
- movq SS+16(%rsp), %rsi
-#else
- movq 8(%rbp), %rsi
-#endif
-.endm
-
-ENTRY(ftrace_caller)
- ftrace_caller_setup
/* regs go into 4th parameter (but make it NULL) */
movq $0, %rcx
GLOBAL(ftrace_call)
call ftrace_stub
- MCOUNT_RESTORE_FRAME
-ftrace_return:
+ restore_mcount_regs
+
+ /*
+ * The copied trampoline must call ftrace_return as it
+ * still may need to call the function graph tracer.
+ */
+GLOBAL(ftrace_caller_end)
+
+GLOBAL(ftrace_return)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
GLOBAL(ftrace_graph_call)
@@ -66,11 +185,16 @@ GLOBAL(ftrace_stub)
END(ftrace_caller)
ENTRY(ftrace_regs_caller)
- /* Save the current flags before compare (in SS location)*/
+ /* Save the current flags before any operations that can change them */
pushfq
- /* skip=8 to skip flags saved in SS */
- ftrace_caller_setup 8
+ /* added 8 bytes to save flags */
+ save_mcount_regs 8
+ /* save_mcount_regs fills in first two parameters */
+
+GLOBAL(ftrace_regs_caller_op_ptr)
+ /* Load the ftrace_ops into the 3rd parameter */
+ movq function_trace_op(%rip), %rdx
/* Save the rest of pt_regs */
movq %r15, R15(%rsp)
@@ -79,18 +203,17 @@ ENTRY(ftrace_regs_caller)
movq %r12, R12(%rsp)
movq %r11, R11(%rsp)
movq %r10, R10(%rsp)
- movq %rbp, RBP(%rsp)
movq %rbx, RBX(%rsp)
/* Copy saved flags */
- movq SS(%rsp), %rcx
+ movq MCOUNT_REG_SIZE(%rsp), %rcx
movq %rcx, EFLAGS(%rsp)
/* Kernel segments */
movq $__KERNEL_DS, %rcx
movq %rcx, SS(%rsp)
movq $__KERNEL_CS, %rcx
movq %rcx, CS(%rsp)
- /* Stack - skipping return address */
- leaq SS+16(%rsp), %rcx
+ /* Stack - skipping return address and flags */
+ leaq MCOUNT_REG_SIZE+8*2(%rsp), %rcx
movq %rcx, RSP(%rsp)
/* regs go into 4th parameter */
@@ -101,11 +224,11 @@ GLOBAL(ftrace_regs_call)
/* Copy flags back to SS, to restore them */
movq EFLAGS(%rsp), %rax
- movq %rax, SS(%rsp)
+ movq %rax, MCOUNT_REG_SIZE(%rsp)
/* Handlers can change the RIP */
movq RIP(%rsp), %rax
- movq %rax, SS+8(%rsp)
+ movq %rax, MCOUNT_REG_SIZE+8(%rsp)
/* restore the rest of pt_regs */
movq R15(%rsp), %r15
@@ -113,19 +236,22 @@ GLOBAL(ftrace_regs_call)
movq R13(%rsp), %r13
movq R12(%rsp), %r12
movq R10(%rsp), %r10
- movq RBP(%rsp), %rbp
movq RBX(%rsp), %rbx
- /* skip=8 to skip flags saved in SS */
- MCOUNT_RESTORE_FRAME 8
+ restore_mcount_regs
/* Restore flags */
popfq
- jmp ftrace_return
+ /*
+ * As this jmp to ftrace_return can be a short jump
+ * it must not be copied into the trampoline.
+ * The trampoline will add the code to jump
+ * to the return.
+ */
+GLOBAL(ftrace_regs_caller_end)
- popfq
- jmp ftrace_stub
+ jmp ftrace_return
END(ftrace_regs_caller)
@@ -136,6 +262,7 @@ ENTRY(function_hook)
cmpq $ftrace_stub, ftrace_trace_function
jnz trace
+fgraph_trace:
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
cmpq $ftrace_stub, ftrace_graph_return
jnz ftrace_graph_caller
@@ -148,42 +275,35 @@ GLOBAL(ftrace_stub)
retq
trace:
- MCOUNT_SAVE_FRAME
-
- movq RIP(%rsp), %rdi
-#ifdef CC_USING_FENTRY
- movq SS+16(%rsp), %rsi
-#else
- movq 8(%rbp), %rsi
-#endif
- subq $MCOUNT_INSN_SIZE, %rdi
+ /* save_mcount_regs fills in first two parameters */
+ save_mcount_regs
call *ftrace_trace_function
- MCOUNT_RESTORE_FRAME
+ restore_mcount_regs
- jmp ftrace_stub
+ jmp fgraph_trace
END(function_hook)
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* CONFIG_FUNCTION_TRACER */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
ENTRY(ftrace_graph_caller)
- MCOUNT_SAVE_FRAME
+ /* Saves rbp into %rdx and fills first parameter */
+ save_mcount_regs
#ifdef CC_USING_FENTRY
- leaq SS+16(%rsp), %rdi
+ leaq MCOUNT_REG_SIZE+8(%rsp), %rsi
movq $0, %rdx /* No framepointers needed */
#else
- leaq 8(%rbp), %rdi
- movq (%rbp), %rdx
+ /* Save address of the return address of traced function */
+ leaq 8(%rdx), %rsi
+ /* ftrace does sanity checks against frame pointers */
+ movq (%rdx), %rdx
#endif
- movq RIP(%rsp), %rsi
- subq $MCOUNT_INSN_SIZE, %rsi
-
call prepare_ftrace_return
- MCOUNT_RESTORE_FRAME
+ restore_mcount_regs
retq
END(ftrace_graph_caller)
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index c9603ac80de5..113e70784854 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -22,6 +22,8 @@
* an SMP box will direct the access to CPU %d.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/types.h>
@@ -50,11 +52,11 @@ static loff_t msr_seek(struct file *file, loff_t offset, int orig)
mutex_lock(&inode->i_mutex);
switch (orig) {
- case 0:
+ case SEEK_SET:
file->f_pos = offset;
ret = file->f_pos;
break;
- case 1:
+ case SEEK_CUR:
file->f_pos += offset;
ret = file->f_pos;
break;
@@ -206,7 +208,7 @@ static int msr_device_create(int cpu)
dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL,
"msr%d", cpu);
- return IS_ERR(dev) ? PTR_ERR(dev) : 0;
+ return PTR_ERR_OR_ZERO(dev);
}
static void msr_device_destroy(int cpu)
@@ -248,8 +250,7 @@ static int __init msr_init(void)
i = 0;
if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) {
- printk(KERN_ERR "msr: unable to get major %d for msr\n",
- MSR_MAJOR);
+ pr_err("unable to get major %d for msr\n", MSR_MAJOR);
err = -EBUSY;
goto out;
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 749b0e423419..e510618b2e91 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1484,7 +1484,7 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
*/
if (work & _TIF_NOHZ) {
user_exit();
- work &= ~TIF_NOHZ;
+ work &= ~_TIF_NOHZ;
}
#ifdef CONFIG_SECCOMP
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ab08aa2276fb..ab4734e5411d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -960,6 +960,8 @@ void __init setup_arch(char **cmdline_p)
init_mm.end_data = (unsigned long) _edata;
init_mm.brk = _brk_end;
+ mpx_mm_init(&init_mm);
+
code_resource.start = __pa_symbol(_text);
code_resource.end = __pa_symbol(_etext)-1;
data_resource.start = __pa_symbol(_etext);
@@ -1190,9 +1192,7 @@ void __init setup_arch(char **cmdline_p)
tboot_probe();
-#ifdef CONFIG_X86_64
map_vsyscall();
-#endif
generic_apic_probe();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 5cdff0357746..e4fcb87ba7a6 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -30,7 +30,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number);
#define BOOT_PERCPU_OFFSET 0
#endif
-DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
EXPORT_PER_CPU_SYMBOL(this_cpu_off);
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 4d2128ac70bd..7a8f5845e8eb 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -99,7 +99,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map);
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
/* Per CPU bogomips and other parameters */
-DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
+DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
EXPORT_PER_CPU_SYMBOL(cpu_info);
atomic_t init_deasserted;
@@ -1303,10 +1303,14 @@ static void __ref remove_cpu_from_maps(int cpu)
numa_remove_cpu(cpu);
}
+static DEFINE_PER_CPU(struct completion, die_complete);
+
void cpu_disable_common(void)
{
int cpu = smp_processor_id();
+ init_completion(&per_cpu(die_complete, smp_processor_id()));
+
remove_siblinginfo(cpu);
/* It's now safe to remove this processor from the online map */
@@ -1316,8 +1320,6 @@ void cpu_disable_common(void)
fixup_irqs();
}
-static DEFINE_PER_CPU(struct completion, die_complete);
-
int native_cpu_disable(void)
{
int ret;
@@ -1327,16 +1329,21 @@ int native_cpu_disable(void)
return ret;
clear_local_APIC();
- init_completion(&per_cpu(die_complete, smp_processor_id()));
cpu_disable_common();
return 0;
}
+void cpu_die_common(unsigned int cpu)
+{
+ wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ);
+}
+
void native_cpu_die(unsigned int cpu)
{
/* We don't do anything here: idle task is faking death itself. */
- wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ);
+
+ cpu_die_common(cpu);
/* They ack this in play_dead() by setting CPU_DEAD */
if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
diff --git a/arch/x86/kernel/sysfb.c b/arch/x86/kernel/sysfb.c
index 193ec2ce46c7..160386e9fc17 100644
--- a/arch/x86/kernel/sysfb.c
+++ b/arch/x86/kernel/sysfb.c
@@ -67,7 +67,7 @@ static __init int sysfb_init(void)
pd = platform_device_register_resndata(NULL, name, 0,
NULL, 0, si, sizeof(*si));
- return IS_ERR(pd) ? PTR_ERR(pd) : 0;
+ return PTR_ERR_OR_ZERO(pd);
}
/* must execute after PCI subsystem for EFI quirks */
diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c
index 86179d409893..764a29f84de7 100644
--- a/arch/x86/kernel/sysfb_simplefb.c
+++ b/arch/x86/kernel/sysfb_simplefb.c
@@ -88,8 +88,5 @@ __init int create_simplefb(const struct screen_info *si,
pd = platform_device_register_resndata(NULL, "simple-framebuffer", 0,
&res, 1, mode, sizeof(*mode));
- if (IS_ERR(pd))
- return PTR_ERR(pd);
-
- return 0;
+ return PTR_ERR_OR_ZERO(pd);
}
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 0fa29609b2c4..25adc0e16eaa 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -23,7 +23,7 @@
#include <asm/time.h>
#ifdef CONFIG_X86_64
-__visible DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
+__visible volatile unsigned long jiffies __cacheline_aligned = INITIAL_JIFFIES;
#endif
unsigned long profile_pc(struct pt_regs *regs)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 0d0e922fafc1..a9ae20579895 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -60,6 +60,7 @@
#include <asm/fixmap.h>
#include <asm/mach_traps.h>
#include <asm/alternative.h>
+#include <asm/mpx.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -228,37 +229,44 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error)
DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow)
-DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds)
DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op)
DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun)
DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
-#ifdef CONFIG_X86_32
DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
-#endif
DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check)
#ifdef CONFIG_X86_64
/* Runs on IST stack */
-dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
-{
- enum ctx_state prev_state;
-
- prev_state = exception_enter();
- if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
- X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) {
- preempt_conditional_sti(regs);
- do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL);
- preempt_conditional_cli(regs);
- }
- exception_exit(prev_state);
-}
-
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
{
static const char str[] = "double fault";
struct task_struct *tsk = current;
+#ifdef CONFIG_X86_ESPFIX64
+ extern unsigned char native_irq_return_iret[];
+
+ /*
+ * If IRET takes a non-IST fault on the espfix64 stack, then we
+ * end up promoting it to a doublefault. In that case, modify
+ * the stack to make it look like we just entered the #GP
+ * handler from user space, similar to bad_iret.
+ */
+ if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
+ regs->cs == __KERNEL_CS &&
+ regs->ip == (unsigned long)native_irq_return_iret)
+ {
+ struct pt_regs *normal_regs = task_pt_regs(current);
+
+ /* Fake a #GP(0) from userspace. */
+ memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
+ normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
+ regs->ip = (unsigned long)general_protection;
+ regs->sp = (unsigned long)&normal_regs->orig_ax;
+ return;
+ }
+#endif
+
exception_enter();
/* Return not checked because double check cannot be ignored */
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
@@ -278,6 +286,89 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
}
#endif
+dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
+{
+ struct task_struct *tsk = current;
+ struct xsave_struct *xsave_buf;
+ enum ctx_state prev_state;
+ struct bndcsr *bndcsr;
+ siginfo_t *info;
+
+ prev_state = exception_enter();
+ if (notify_die(DIE_TRAP, "bounds", regs, error_code,
+ X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
+ goto exit;
+ conditional_sti(regs);
+
+ if (!user_mode(regs))
+ die("bounds", regs, error_code);
+
+ if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
+ /* The exception is not from Intel MPX */
+ goto exit_trap;
+ }
+
+ /*
+ * We need to look at BNDSTATUS to resolve this exception.
+ * It is not directly accessible, though, so we need to
+ * do an xsave and then pull it out of the xsave buffer.
+ */
+ fpu_save_init(&tsk->thread.fpu);
+ xsave_buf = &(tsk->thread.fpu.state->xsave);
+ bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR);
+ if (!bndcsr)
+ goto exit_trap;
+
+ /*
+ * The error code field of the BNDSTATUS register communicates status
+ * information of a bound range exception #BR or operation involving
+ * bound directory.
+ */
+ switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) {
+ case 2: /* Bound directory has invalid entry. */
+ if (mpx_handle_bd_fault(xsave_buf))
+ goto exit_trap;
+ break; /* Success, it was handled */
+ case 1: /* Bound violation. */
+ info = mpx_generate_siginfo(regs, xsave_buf);
+ if (PTR_ERR(info)) {
+ /*
+ * We failed to decode the MPX instruction. Act as if
+ * the exception was not caused by MPX.
+ */
+ goto exit_trap;
+ }
+ /*
+ * Success, we decoded the instruction and retrieved
+ * an 'info' containing the address being accessed
+ * which caused the exception. This information
+ * allows and application to possibly handle the
+ * #BR exception itself.
+ */
+ do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, info);
+ kfree(info);
+ break;
+ case 0: /* No exception caused by Intel MPX operations. */
+ goto exit_trap;
+ default:
+ die("bounds", regs, error_code);
+ }
+
+exit:
+ exception_exit(prev_state);
+ return;
+exit_trap:
+ /*
+ * This path out is for all the cases where we could not
+ * handle the exception in some way (like allocating a
+ * table or telling userspace about it. We will also end
+ * up here if the kernel has MPX turned off at compile
+ * time..
+ */
+ do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL);
+ exception_exit(prev_state);
+}
+
dotraplinkage void
do_general_protection(struct pt_regs *regs, long error_code)
{
@@ -379,7 +470,7 @@ NOKPROBE_SYMBOL(do_int3);
* for scheduling or signal handling. The actual stack switch is done in
* entry.S
*/
-asmlinkage __visible struct pt_regs *sync_regs(struct pt_regs *eregs)
+asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = eregs;
/* Did already sync */
@@ -399,6 +490,36 @@ asmlinkage __visible struct pt_regs *sync_regs(struct pt_regs *eregs)
return regs;
}
NOKPROBE_SYMBOL(sync_regs);
+
+struct bad_iret_stack {
+ void *error_entry_ret;
+ struct pt_regs regs;
+};
+
+asmlinkage __visible notrace
+struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
+{
+ /*
+ * This is called from entry_64.S early in handling a fault
+ * caused by a bad iret to user mode. To handle the fault
+ * correctly, we want move our stack frame to task_pt_regs
+ * and we want to pretend that the exception came from the
+ * iret target.
+ */
+ struct bad_iret_stack *new_stack =
+ container_of(task_pt_regs(current),
+ struct bad_iret_stack, regs);
+
+ /* Copy the IRET target to the new stack. */
+ memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
+
+ /* Copy the remainder of the stack from the current stack. */
+ memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
+
+ BUG_ON(!user_mode_vm(&new_stack->regs));
+ return new_stack;
+}
+NOKPROBE_SYMBOL(fixup_bad_iret);
#endif
/*
@@ -778,7 +899,7 @@ void __init trap_init(void)
set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
set_intr_gate(X86_TRAP_TS, invalid_TSS);
set_intr_gate(X86_TRAP_NP, segment_not_present);
- set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK);
+ set_intr_gate(X86_TRAP_SS, stack_segment);
set_intr_gate(X86_TRAP_GP, general_protection);
set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
set_intr_gate(X86_TRAP_MF, coprocessor_error);
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 5d1cbfe4ae58..8b96a947021f 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -219,7 +219,7 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
{
u32 volatile *good_insns;
- insn_init(insn, auprobe->insn, x86_64);
+ insn_init(insn, auprobe->insn, sizeof(auprobe->insn), x86_64);
/* has the side-effect of processing the entire instruction */
insn_get_length(insn);
if (WARN_ON_ONCE(!insn_complete(insn)))
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 49edf2dd3613..00bf300fd846 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -186,6 +186,8 @@ SECTIONS
* start another segment - init.
*/
PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu)
+ ASSERT(SIZEOF(.data..percpu) < CONFIG_PHYSICAL_START,
+ "per-CPU data too large - increase CONFIG_PHYSICAL_START")
#endif
INIT_TEXT_SECTION(PAGE_SIZE)
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 957779f4eb40..2dcc6ff6fdcc 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -1,59 +1,43 @@
/*
+ * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net>
+ *
+ * Based on the original implementation which is:
* Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
* Copyright 2003 Andi Kleen, SuSE Labs.
*
- * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
+ * Parts of the original code have been moved to arch/x86/vdso/vma.c
+ *
+ * This file implements vsyscall emulation. vsyscalls are a legacy ABI:
+ * Userspace can request certain kernel services by calling fixed
+ * addresses. This concept is problematic:
*
- * Thanks to hpa@transmeta.com for some useful hint.
- * Special thanks to Ingo Molnar for his early experience with
- * a different vsyscall implementation for Linux/IA32 and for the name.
+ * - It interferes with ASLR.
+ * - It's awkward to write code that lives in kernel addresses but is
+ * callable by userspace at fixed addresses.
+ * - The whole concept is impossible for 32-bit compat userspace.
+ * - UML cannot easily virtualize a vsyscall.
*
- * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
- * at virtual address -10Mbyte+1024bytes etc... There are at max 4
- * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
- * jumping out of line if necessary. We cannot add more with this
- * mechanism because older kernels won't return -ENOSYS.
+ * As of mid-2014, I believe that there is no new userspace code that
+ * will use a vsyscall if the vDSO is present. I hope that there will
+ * soon be no new userspace code that will ever use a vsyscall.
*
- * Note: the concept clashes with user mode linux. UML users should
- * use the vDSO.
+ * The code in this file emulates vsyscalls when notified of a page
+ * fault to a vsyscall address.
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/time.h>
-#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/timer.h>
-#include <linux/seqlock.h>
-#include <linux/jiffies.h>
-#include <linux/sysctl.h>
-#include <linux/topology.h>
-#include <linux/timekeeper_internal.h>
-#include <linux/getcpu.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/notifier.h>
#include <linux/syscalls.h>
#include <linux/ratelimit.h>
#include <asm/vsyscall.h>
-#include <asm/pgtable.h>
-#include <asm/compat.h>
-#include <asm/page.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
-#include <asm/errno.h>
-#include <asm/io.h>
-#include <asm/segment.h>
-#include <asm/desc.h>
-#include <asm/topology.h>
#include <asm/traps.h>
#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"
-DEFINE_VVAR(int, vgetcpu_mode);
-
static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
static int __init vsyscall_setup(char *str)
@@ -222,6 +206,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
"seccomp tried to change syscall nr or ip");
do_exit(SIGSYS);
}
+ regs->orig_ax = -1;
if (tmp)
goto do_ret; /* skip requested */
@@ -284,46 +269,54 @@ sigsegv:
}
/*
- * Assume __initcall executes before all user space. Hopefully kmod
- * doesn't violate that. We'll find out if it does.
+ * A pseudo VMA to allow ptrace access for the vsyscall page. This only
+ * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
+ * not need special handling anymore:
*/
-static void vsyscall_set_cpu(int cpu)
+static const char *gate_vma_name(struct vm_area_struct *vma)
{
- unsigned long d;
- unsigned long node = 0;
-#ifdef CONFIG_NUMA
- node = cpu_to_node(cpu);
-#endif
- if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
- write_rdtscp_aux((node << 12) | cpu);
-
- /*
- * Store cpu number in limit so that it can be loaded quickly
- * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node)
- */
- d = 0x0f40000000000ULL;
- d |= cpu;
- d |= (node & 0xf) << 12;
- d |= (node >> 4) << 48;
-
- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
+ return "[vsyscall]";
}
-
-static void cpu_vsyscall_init(void *arg)
+static struct vm_operations_struct gate_vma_ops = {
+ .name = gate_vma_name,
+};
+static struct vm_area_struct gate_vma = {
+ .vm_start = VSYSCALL_ADDR,
+ .vm_end = VSYSCALL_ADDR + PAGE_SIZE,
+ .vm_page_prot = PAGE_READONLY_EXEC,
+ .vm_flags = VM_READ | VM_EXEC,
+ .vm_ops = &gate_vma_ops,
+};
+
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
- /* preemption should be already off */
- vsyscall_set_cpu(raw_smp_processor_id());
+#ifdef CONFIG_IA32_EMULATION
+ if (!mm || mm->context.ia32_compat)
+ return NULL;
+#endif
+ if (vsyscall_mode == NONE)
+ return NULL;
+ return &gate_vma;
}
-static int
-cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
+int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
- long cpu = (long)arg;
+ struct vm_area_struct *vma = get_gate_vma(mm);
+
+ if (!vma)
+ return 0;
- if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
- smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
+ return (addr >= vma->vm_start) && (addr < vma->vm_end);
+}
- return NOTIFY_DONE;
+/*
+ * Use this when you have no reliable mm, typically from interrupt
+ * context. It is less reliable than using a task's mm and may give
+ * false positives.
+ */
+int in_gate_area_no_mm(unsigned long addr)
+{
+ return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
}
void __init map_vsyscall(void)
@@ -331,24 +324,12 @@ void __init map_vsyscall(void)
extern char __vsyscall_page;
unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
- __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
- vsyscall_mode == NATIVE
- ? PAGE_KERNEL_VSYSCALL
- : PAGE_KERNEL_VVAR);
+ if (vsyscall_mode != NONE)
+ __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
+ vsyscall_mode == NATIVE
+ ? PAGE_KERNEL_VSYSCALL
+ : PAGE_KERNEL_VVAR);
+
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
(unsigned long)VSYSCALL_ADDR);
}
-
-static int __init vsyscall_init(void)
-{
- cpu_notifier_register_begin();
-
- on_each_cpu(cpu_vsyscall_init, NULL, 1);
- /* notifier priority > KVM */
- __hotcpu_notifier(cpu_vsyscall_notifier, 30);
-
- cpu_notifier_register_done();
-
- return 0;
-}
-__initcall(vsyscall_init);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index e48b674639cc..234b0722de53 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -116,8 +116,6 @@ struct x86_msi_ops x86_msi = {
.teardown_msi_irqs = default_teardown_msi_irqs,
.restore_msi_irqs = default_restore_msi_irqs,
.setup_hpet_msi = default_setup_hpet_msi,
- .msi_mask_irq = default_msi_mask_irq,
- .msix_mask_irq = default_msix_mask_irq,
};
/* MSI arch specific hooks */
@@ -140,14 +138,6 @@ void arch_restore_msi_irqs(struct pci_dev *dev)
{
x86_msi.restore_msi_irqs(dev);
}
-u32 arch_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
-{
- return x86_msi.msi_mask_irq(desc, mask, flag);
-}
-u32 arch_msix_mask_irq(struct msi_desc *desc, u32 flag)
-{
- return x86_msi.msix_mask_irq(desc, flag);
-}
#endif
struct x86_io_apic_ops x86_io_apic_ops = {