aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c105
-rw-r--r--arch/x86/kernel/alternative.c58
-rw-r--r--arch/x86/kernel/amd_iommu.c489
-rw-r--r--arch/x86/kernel/amd_iommu_init.c42
-rw-r--r--arch/x86/kernel/aperture_64.c6
-rw-r--r--arch/x86/kernel/apic/apic.c110
-rw-r--r--arch/x86/kernel/apic/es7000_32.c2
-rw-r--r--arch/x86/kernel/apic/io_apic.c324
-rw-r--r--arch/x86/kernel/apic/ipi.c3
-rw-r--r--arch/x86/kernel/apic/nmi.c20
-rw-r--r--arch/x86/kernel/apic/probe_64.c16
-rw-r--r--arch/x86/kernel/apm_32.c31
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/amd.c80
-rw-r--r--arch/x86/kernel/cpu/common.c60
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c32
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c12
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c8
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c329
-rw-r--r--arch/x86/kernel/cpu/proc.c2
-rw-r--r--arch/x86/kernel/doublefault_32.c4
-rw-r--r--arch/x86/kernel/e820.c2
-rw-r--r--arch/x86/kernel/ftrace.c51
-rw-r--r--arch/x86/kernel/head_32.S1
-rw-r--r--arch/x86/kernel/kvm.c7
-rw-r--r--arch/x86/kernel/kvmclock.c4
-rw-r--r--arch/x86/kernel/mpparse.c10
-rw-r--r--arch/x86/kernel/msr.c61
-rw-r--r--arch/x86/kernel/paravirt.c3
-rw-r--r--arch/x86/kernel/pci-dma.c17
-rw-r--r--arch/x86/kernel/pci-gart_64.c5
-rw-r--r--arch/x86/kernel/pci-nommu.c29
-rw-r--r--arch/x86/kernel/pci-swiotlb.c25
-rw-r--r--arch/x86/kernel/process.c6
-rw-r--r--arch/x86/kernel/process_32.c30
-rw-r--r--arch/x86/kernel/process_64.c36
-rw-r--r--arch/x86/kernel/ptrace.c13
-rw-r--r--arch/x86/kernel/reboot.c7
-rw-r--r--arch/x86/kernel/setup.c3
-rw-r--r--arch/x86/kernel/setup_percpu.c14
-rw-r--r--arch/x86/kernel/signal.c2
-rw-r--r--arch/x86/kernel/smpboot.c5
-rw-r--r--arch/x86/kernel/step.c9
-rw-r--r--arch/x86/kernel/sys_x86_64.c8
-rw-r--r--arch/x86/kernel/tboot.c447
-rw-r--r--arch/x86/kernel/tlb_uv.c4
-rw-r--r--arch/x86/kernel/traps.c52
-rw-r--r--arch/x86/kernel/vmlinux.lds.S126
49 files changed, 1920 insertions, 792 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 430d5b24af7b..832cb838cb48 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
obj-$(CONFIG_X86_32) += tls.o
obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
+obj-$(CONFIG_INTEL_TXT) += tboot.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += cpu/
obj-y += acpi/
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 6b8ca3a0285d..67e929b89875 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -833,106 +833,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
extern int es7000_plat;
#endif
-static struct {
- int gsi_base;
- int gsi_end;
-} mp_ioapic_routing[MAX_IO_APICS];
-
-int mp_find_ioapic(int gsi)
-{
- int i = 0;
-
- /* Find the IOAPIC that manages this GSI. */
- for (i = 0; i < nr_ioapics; i++) {
- if ((gsi >= mp_ioapic_routing[i].gsi_base)
- && (gsi <= mp_ioapic_routing[i].gsi_end))
- return i;
- }
-
- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
- return -1;
-}
-
-int mp_find_ioapic_pin(int ioapic, int gsi)
-{
- if (WARN_ON(ioapic == -1))
- return -1;
- if (WARN_ON(gsi > mp_ioapic_routing[ioapic].gsi_end))
- return -1;
-
- return gsi - mp_ioapic_routing[ioapic].gsi_base;
-}
-
-static u8 __init uniq_ioapic_id(u8 id)
-{
-#ifdef CONFIG_X86_32
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
- !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
- return io_apic_get_unique_id(nr_ioapics, id);
- else
- return id;
-#else
- int i;
- DECLARE_BITMAP(used, 256);
- bitmap_zero(used, 256);
- for (i = 0; i < nr_ioapics; i++) {
- struct mpc_ioapic *ia = &mp_ioapics[i];
- __set_bit(ia->apicid, used);
- }
- if (!test_bit(id, used))
- return id;
- return find_first_zero_bit(used, 256);
-#endif
-}
-
-static int bad_ioapic(unsigned long address)
-{
- if (nr_ioapics >= MAX_IO_APICS) {
- printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
- "(found %d)\n", MAX_IO_APICS, nr_ioapics);
- panic("Recompile kernel with bigger MAX_IO_APICS!\n");
- }
- if (!address) {
- printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
- " found in table, skipping!\n");
- return 1;
- }
- return 0;
-}
-
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
-{
- int idx = 0;
-
- if (bad_ioapic(address))
- return;
-
- idx = nr_ioapics;
-
- mp_ioapics[idx].type = MP_IOAPIC;
- mp_ioapics[idx].flags = MPC_APIC_USABLE;
- mp_ioapics[idx].apicaddr = address;
-
- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
- mp_ioapics[idx].apicid = uniq_ioapic_id(id);
- mp_ioapics[idx].apicver = io_apic_get_version(idx);
-
- /*
- * Build basic GSI lookup table to facilitate gsi->io_apic lookups
- * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
- */
- mp_ioapic_routing[idx].gsi_base = gsi_base;
- mp_ioapic_routing[idx].gsi_end = gsi_base +
- io_apic_get_redir_entries(idx);
-
- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
- "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
- mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
- mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
-
- nr_ioapics++;
-}
-
int __init acpi_probe_gsi(void)
{
int idx;
@@ -947,7 +847,7 @@ int __init acpi_probe_gsi(void)
max_gsi = 0;
for (idx = 0; idx < nr_ioapics; idx++) {
- gsi = mp_ioapic_routing[idx].gsi_end;
+ gsi = mp_gsi_routing[idx].gsi_end;
if (gsi > max_gsi)
max_gsi = gsi;
@@ -1179,9 +1079,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
* If MPS is present, it will handle them,
* otherwise the system will stay in PIC mode
*/
- if (acpi_disabled || acpi_noirq) {
+ if (acpi_disabled || acpi_noirq)
return -ENODEV;
- }
if (!cpu_has_apic)
return -ENODEV;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index f57658702571..de7353c0ce9c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2,6 +2,7 @@
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/list.h>
+#include <linux/stringify.h>
#include <linux/kprobes.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
@@ -32,7 +33,7 @@ __setup("smp-alt-boot", bootonly);
#define smp_alt_once 1
#endif
-static int debug_alternative;
+static int __initdata_or_module debug_alternative;
static int __init debug_alt(char *str)
{
@@ -51,7 +52,7 @@ static int __init setup_noreplace_smp(char *str)
__setup("noreplace-smp", setup_noreplace_smp);
#ifdef CONFIG_PARAVIRT
-static int noreplace_paravirt = 0;
+static int __initdata_or_module noreplace_paravirt = 0;
static int __init setup_noreplace_paravirt(char *str)
{
@@ -64,16 +65,17 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt);
#define DPRINTK(fmt, args...) if (debug_alternative) \
printk(KERN_DEBUG fmt, args)
-#ifdef GENERIC_NOP1
+#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
/* Use inline assembly to define this because the nops are defined
as inline assembly strings in the include files and we cannot
get them easily into strings. */
-asm("\t.section .rodata, \"a\"\nintelnops: "
+asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: "
GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
GENERIC_NOP7 GENERIC_NOP8
"\t.previous");
extern const unsigned char intelnops[];
-static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = {
+static const unsigned char *const __initconst_or_module
+intel_nops[ASM_NOP_MAX+1] = {
NULL,
intelnops,
intelnops + 1,
@@ -87,12 +89,13 @@ static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = {
#endif
#ifdef K8_NOP1
-asm("\t.section .rodata, \"a\"\nk8nops: "
+asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: "
K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
K8_NOP7 K8_NOP8
"\t.previous");
extern const unsigned char k8nops[];
-static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = {
+static const unsigned char *const __initconst_or_module
+k8_nops[ASM_NOP_MAX+1] = {
NULL,
k8nops,
k8nops + 1,
@@ -105,13 +108,14 @@ static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = {
};
#endif
-#ifdef K7_NOP1
-asm("\t.section .rodata, \"a\"\nk7nops: "
+#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
+asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: "
K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
K7_NOP7 K7_NOP8
"\t.previous");
extern const unsigned char k7nops[];
-static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = {
+static const unsigned char *const __initconst_or_module
+k7_nops[ASM_NOP_MAX+1] = {
NULL,
k7nops,
k7nops + 1,
@@ -125,12 +129,13 @@ static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = {
#endif
#ifdef P6_NOP1
-asm("\t.section .rodata, \"a\"\np6nops: "
+asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: "
P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6
P6_NOP7 P6_NOP8
"\t.previous");
extern const unsigned char p6nops[];
-static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
+static const unsigned char *const __initconst_or_module
+p6_nops[ASM_NOP_MAX+1] = {
NULL,
p6nops,
p6nops + 1,
@@ -146,7 +151,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
#ifdef CONFIG_X86_64
extern char __vsyscall_0;
-const unsigned char *const *find_nop_table(void)
+static const unsigned char *const *__init_or_module find_nop_table(void)
{
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
boot_cpu_has(X86_FEATURE_NOPL))
@@ -157,7 +162,7 @@ const unsigned char *const *find_nop_table(void)
#else /* CONFIG_X86_64 */
-const unsigned char *const *find_nop_table(void)
+static const unsigned char *const *__init_or_module find_nop_table(void)
{
if (boot_cpu_has(X86_FEATURE_K8))
return k8_nops;
@@ -172,7 +177,7 @@ const unsigned char *const *find_nop_table(void)
#endif /* CONFIG_X86_64 */
/* Use this to add nops to a buffer, then text_poke the whole buffer. */
-void add_nops(void *insns, unsigned int len)
+static void __init_or_module add_nops(void *insns, unsigned int len)
{
const unsigned char *const *noptable = find_nop_table();
@@ -185,10 +190,10 @@ void add_nops(void *insns, unsigned int len)
len -= noplen;
}
}
-EXPORT_SYMBOL_GPL(add_nops);
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern u8 *__smp_locks[], *__smp_locks_end[];
+static void *text_poke_early(void *addr, const void *opcode, size_t len);
/* Replace instructions with better alternatives for this CPU type.
This runs before SMP is initialized to avoid SMP problems with
@@ -196,7 +201,8 @@ extern u8 *__smp_locks[], *__smp_locks_end[];
APs have less capabilities than the boot processor are not handled.
Tough. Make sure you disable such features by hand. */
-void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
+void __init_or_module apply_alternatives(struct alt_instr *start,
+ struct alt_instr *end)
{
struct alt_instr *a;
char insnbuf[MAX_PATCH_LEN];
@@ -279,9 +285,10 @@ static LIST_HEAD(smp_alt_modules);
static DEFINE_MUTEX(smp_alt);
static int smp_mode = 1; /* protected by smp_alt */
-void alternatives_smp_module_add(struct module *mod, char *name,
- void *locks, void *locks_end,
- void *text, void *text_end)
+void __init_or_module alternatives_smp_module_add(struct module *mod,
+ char *name,
+ void *locks, void *locks_end,
+ void *text, void *text_end)
{
struct smp_alt_module *smp;
@@ -317,7 +324,7 @@ void alternatives_smp_module_add(struct module *mod, char *name,
mutex_unlock(&smp_alt);
}
-void alternatives_smp_module_del(struct module *mod)
+void __init_or_module alternatives_smp_module_del(struct module *mod)
{
struct smp_alt_module *item;
@@ -386,8 +393,8 @@ void alternatives_smp_switch(int smp)
#endif
#ifdef CONFIG_PARAVIRT
-void apply_paravirt(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end)
+void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
+ struct paravirt_patch_site *end)
{
struct paravirt_patch_site *p;
char insnbuf[MAX_PATCH_LEN];
@@ -485,13 +492,14 @@ void __init alternative_instructions(void)
* instructions. And on the local CPU you need to be protected again NMI or MCE
* handlers seeing an inconsistent instruction while you patch.
*/
-void *text_poke_early(void *addr, const void *opcode, size_t len)
+static void *__init_or_module text_poke_early(void *addr, const void *opcode,
+ size_t len)
{
unsigned long flags;
local_irq_save(flags);
memcpy(addr, opcode, len);
- local_irq_restore(flags);
sync_core();
+ local_irq_restore(flags);
/* Could also do a CLFLUSH here to speed up CPU recovery; but
that causes hangs on some VIA CPUs. */
return addr;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 6c99f5037801..98f230f6a28d 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -41,9 +41,13 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock);
static LIST_HEAD(iommu_pd_list);
static DEFINE_SPINLOCK(iommu_pd_list_lock);
-#ifdef CONFIG_IOMMU_API
+/*
+ * Domain for untranslated devices - only allocated
+ * if iommu=pt passed on kernel cmd line.
+ */
+static struct protection_domain *pt_domain;
+
static struct iommu_ops amd_iommu_ops;
-#endif
/*
* general struct to manage commands send to an IOMMU
@@ -55,16 +59,16 @@ struct iommu_cmd {
static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
struct unity_map_entry *e);
static struct dma_ops_domain *find_protection_domain(u16 devid);
-static u64* alloc_pte(struct protection_domain *dom,
- unsigned long address, u64
- **pte_page, gfp_t gfp);
+static u64 *alloc_pte(struct protection_domain *domain,
+ unsigned long address, int end_lvl,
+ u64 **pte_page, gfp_t gfp);
static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
unsigned long start_page,
unsigned int pages);
-
-#ifndef BUS_NOTIFY_UNBOUND_DRIVER
-#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
-#endif
+static void reset_iommu_command_buffer(struct amd_iommu *iommu);
+static u64 *fetch_pte(struct protection_domain *domain,
+ unsigned long address, int map_size);
+static void update_domain(struct protection_domain *domain);
#ifdef CONFIG_AMD_IOMMU_STATS
@@ -138,7 +142,25 @@ static int iommu_has_npcache(struct amd_iommu *iommu)
*
****************************************************************************/
-static void iommu_print_event(void *__evt)
+static void dump_dte_entry(u16 devid)
+{
+ int i;
+
+ for (i = 0; i < 8; ++i)
+ pr_err("AMD-Vi: DTE[%d]: %08x\n", i,
+ amd_iommu_dev_table[devid].data[i]);
+}
+
+static void dump_command(unsigned long phys_addr)
+{
+ struct iommu_cmd *cmd = phys_to_virt(phys_addr);
+ int i;
+
+ for (i = 0; i < 4; ++i)
+ pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
+}
+
+static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
{
u32 *event = __evt;
int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
@@ -147,7 +169,7 @@ static void iommu_print_event(void *__evt)
int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
u64 address = (u64)(((u64)event[3]) << 32) | event[2];
- printk(KERN_ERR "AMD IOMMU: Event logged [");
+ printk(KERN_ERR "AMD-Vi: Event logged [");
switch (type) {
case EVENT_TYPE_ILL_DEV:
@@ -155,6 +177,7 @@ static void iommu_print_event(void *__evt)
"address=0x%016llx flags=0x%04x]\n",
PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
address, flags);
+ dump_dte_entry(devid);
break;
case EVENT_TYPE_IO_FAULT:
printk("IO_PAGE_FAULT device=%02x:%02x.%x "
@@ -176,6 +199,8 @@ static void iommu_print_event(void *__evt)
break;
case EVENT_TYPE_ILL_CMD:
printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
+ reset_iommu_command_buffer(iommu);
+ dump_command(address);
break;
case EVENT_TYPE_CMD_HARD_ERR:
printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
@@ -209,7 +234,7 @@ static void iommu_poll_events(struct amd_iommu *iommu)
tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
while (head != tail) {
- iommu_print_event(iommu->evt_buf + head);
+ iommu_print_event(iommu, iommu->evt_buf + head);
head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
}
@@ -296,8 +321,11 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu)
status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
- if (unlikely(i == EXIT_LOOP_COUNT))
- panic("AMD IOMMU: Completion wait loop failed\n");
+ if (unlikely(i == EXIT_LOOP_COUNT)) {
+ spin_unlock(&iommu->lock);
+ reset_iommu_command_buffer(iommu);
+ spin_lock(&iommu->lock);
+ }
}
/*
@@ -445,47 +473,78 @@ static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid)
}
/*
+ * This function flushes one domain on one IOMMU
+ */
+static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid)
+{
+ struct iommu_cmd cmd;
+ unsigned long flags;
+
+ __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
+ domid, 1, 1);
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ __iommu_queue_command(iommu, &cmd);
+ __iommu_completion_wait(iommu);
+ __iommu_wait_for_completion(iommu);
+ spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static void flush_all_domains_on_iommu(struct amd_iommu *iommu)
+{
+ int i;
+
+ for (i = 1; i < MAX_DOMAIN_ID; ++i) {
+ if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
+ continue;
+ flush_domain_on_iommu(iommu, i);
+ }
+
+}
+
+/*
* This function is used to flush the IO/TLB for a given protection domain
* on every IOMMU in the system
*/
static void iommu_flush_domain(u16 domid)
{
- unsigned long flags;
struct amd_iommu *iommu;
- struct iommu_cmd cmd;
INC_STATS_COUNTER(domain_flush_all);
- __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
- domid, 1, 1);
-
- for_each_iommu(iommu) {
- spin_lock_irqsave(&iommu->lock, flags);
- __iommu_queue_command(iommu, &cmd);
- __iommu_completion_wait(iommu);
- __iommu_wait_for_completion(iommu);
- spin_unlock_irqrestore(&iommu->lock, flags);
- }
+ for_each_iommu(iommu)
+ flush_domain_on_iommu(iommu, domid);
}
void amd_iommu_flush_all_domains(void)
{
+ struct amd_iommu *iommu;
+
+ for_each_iommu(iommu)
+ flush_all_domains_on_iommu(iommu);
+}
+
+static void flush_all_devices_for_iommu(struct amd_iommu *iommu)
+{
int i;
- for (i = 1; i < MAX_DOMAIN_ID; ++i) {
- if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
+ for (i = 0; i <= amd_iommu_last_bdf; ++i) {
+ if (iommu != amd_iommu_rlookup_table[i])
continue;
- iommu_flush_domain(i);
+
+ iommu_queue_inv_dev_entry(iommu, i);
+ iommu_completion_wait(iommu);
}
}
-void amd_iommu_flush_all_devices(void)
+static void flush_devices_by_domain(struct protection_domain *domain)
{
struct amd_iommu *iommu;
int i;
for (i = 0; i <= amd_iommu_last_bdf; ++i) {
- if (amd_iommu_pd_table[i] == NULL)
+ if ((domain == NULL && amd_iommu_pd_table[i] == NULL) ||
+ (amd_iommu_pd_table[i] != domain))
continue;
iommu = amd_iommu_rlookup_table[i];
@@ -497,6 +556,27 @@ void amd_iommu_flush_all_devices(void)
}
}
+static void reset_iommu_command_buffer(struct amd_iommu *iommu)
+{
+ pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
+
+ if (iommu->reset_in_progress)
+ panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
+
+ iommu->reset_in_progress = true;
+
+ amd_iommu_reset_cmd_buffer(iommu);
+ flush_all_devices_for_iommu(iommu);
+ flush_all_domains_on_iommu(iommu);
+
+ iommu->reset_in_progress = false;
+}
+
+void amd_iommu_flush_all_devices(void)
+{
+ flush_devices_by_domain(NULL);
+}
+
/****************************************************************************
*
* The functions below are used the create the page table mappings for
@@ -514,18 +594,21 @@ void amd_iommu_flush_all_devices(void)
static int iommu_map_page(struct protection_domain *dom,
unsigned long bus_addr,
unsigned long phys_addr,
- int prot)
+ int prot,
+ int map_size)
{
u64 __pte, *pte;
bus_addr = PAGE_ALIGN(bus_addr);
phys_addr = PAGE_ALIGN(phys_addr);
- /* only support 512GB address spaces for now */
- if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
+ BUG_ON(!PM_ALIGNED(map_size, bus_addr));
+ BUG_ON(!PM_ALIGNED(map_size, phys_addr));
+
+ if (!(prot & IOMMU_PROT_MASK))
return -EINVAL;
- pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);
+ pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL);
if (IOMMU_PTE_PRESENT(*pte))
return -EBUSY;
@@ -538,29 +621,18 @@ static int iommu_map_page(struct protection_domain *dom,
*pte = __pte;
+ update_domain(dom);
+
return 0;
}
static void iommu_unmap_page(struct protection_domain *dom,
- unsigned long bus_addr)
+ unsigned long bus_addr, int map_size)
{
- u64 *pte;
-
- pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
-
- if (!IOMMU_PTE_PRESENT(*pte))
- return;
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
+ u64 *pte = fetch_pte(dom, bus_addr, map_size);
- if (!IOMMU_PTE_PRESENT(*pte))
- return;
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
-
- *pte = 0;
+ if (pte)
+ *pte = 0;
}
/*
@@ -615,7 +687,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
for (addr = e->address_start; addr < e->address_end;
addr += PAGE_SIZE) {
- ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot);
+ ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
+ PM_MAP_4k);
if (ret)
return ret;
/*
@@ -670,24 +743,29 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
* This function checks if there is a PTE for a given dma address. If
* there is one, it returns the pointer to it.
*/
-static u64* fetch_pte(struct protection_domain *domain,
- unsigned long address)
+static u64 *fetch_pte(struct protection_domain *domain,
+ unsigned long address, int map_size)
{
+ int level;
u64 *pte;
- pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)];
+ level = domain->mode - 1;
+ pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
- if (!IOMMU_PTE_PRESENT(*pte))
- return NULL;
+ while (level > map_size) {
+ if (!IOMMU_PTE_PRESENT(*pte))
+ return NULL;
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+ level -= 1;
- if (!IOMMU_PTE_PRESENT(*pte))
- return NULL;
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[PM_LEVEL_INDEX(level, address)];
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+ if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
+ pte = NULL;
+ break;
+ }
+ }
return pte;
}
@@ -727,7 +805,7 @@ static int alloc_new_range(struct amd_iommu *iommu,
u64 *pte, *pte_page;
for (i = 0; i < num_ptes; ++i) {
- pte = alloc_pte(&dma_dom->domain, address,
+ pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k,
&pte_page, gfp);
if (!pte)
goto out_free;
@@ -760,16 +838,20 @@ static int alloc_new_range(struct amd_iommu *iommu,
for (i = dma_dom->aperture[index]->offset;
i < dma_dom->aperture_size;
i += PAGE_SIZE) {
- u64 *pte = fetch_pte(&dma_dom->domain, i);
+ u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k);
if (!pte || !IOMMU_PTE_PRESENT(*pte))
continue;
dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
}
+ update_domain(&dma_dom->domain);
+
return 0;
out_free:
+ update_domain(&dma_dom->domain);
+
free_page((unsigned long)dma_dom->aperture[index]->bitmap);
kfree(dma_dom->aperture[index]);
@@ -1009,7 +1091,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
dma_dom->domain.id = domain_id_alloc();
if (dma_dom->domain.id == 0)
goto free_dma_dom;
- dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
+ dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
dma_dom->domain.flags = PD_DMA_OPS_MASK;
dma_dom->domain.priv = dma_dom;
@@ -1063,6 +1145,41 @@ static struct protection_domain *domain_for_device(u16 devid)
return dom;
}
+static void set_dte_entry(u16 devid, struct protection_domain *domain)
+{
+ u64 pte_root = virt_to_phys(domain->pt_root);
+
+ pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
+ << DEV_ENTRY_MODE_SHIFT;
+ pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
+
+ amd_iommu_dev_table[devid].data[2] = domain->id;
+ amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
+ amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
+
+ amd_iommu_pd_table[devid] = domain;
+}
+
+/*
+ * If a device is not yet associated with a domain, this function does
+ * assigns it visible for the hardware
+ */
+static void __attach_device(struct amd_iommu *iommu,
+ struct protection_domain *domain,
+ u16 devid)
+{
+ /* lock domain */
+ spin_lock(&domain->lock);
+
+ /* update DTE entry */
+ set_dte_entry(devid, domain);
+
+ domain->dev_cnt += 1;
+
+ /* ready */
+ spin_unlock(&domain->lock);
+}
+
/*
* If a device is not yet associated with a domain, this function does
* assigns it visible for the hardware
@@ -1072,27 +1189,16 @@ static void attach_device(struct amd_iommu *iommu,
u16 devid)
{
unsigned long flags;
- u64 pte_root = virt_to_phys(domain->pt_root);
-
- domain->dev_cnt += 1;
-
- pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
- << DEV_ENTRY_MODE_SHIFT;
- pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
- amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
- amd_iommu_dev_table[devid].data[2] = domain->id;
-
- amd_iommu_pd_table[devid] = domain;
+ __attach_device(iommu, domain, devid);
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
- /*
- * We might boot into a crash-kernel here. The crashed kernel
- * left the caches in the IOMMU dirty. So we have to flush
- * here to evict all dirty stuff.
- */
+ /*
+ * We might boot into a crash-kernel here. The crashed kernel
+ * left the caches in the IOMMU dirty. So we have to flush
+ * here to evict all dirty stuff.
+ */
iommu_queue_inv_dev_entry(iommu, devid);
iommu_flush_tlb_pde(iommu, domain->id);
}
@@ -1119,6 +1225,15 @@ static void __detach_device(struct protection_domain *domain, u16 devid)
/* ready */
spin_unlock(&domain->lock);
+
+ /*
+ * If we run in passthrough mode the device must be assigned to the
+ * passthrough domain if it is detached from any other domain
+ */
+ if (iommu_pass_through) {
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+ __attach_device(iommu, pt_domain, devid);
+ }
}
/*
@@ -1164,6 +1279,8 @@ static int device_change_notifier(struct notifier_block *nb,
case BUS_NOTIFY_UNBOUND_DRIVER:
if (!domain)
goto out;
+ if (iommu_pass_through)
+ break;
detach_device(domain, devid);
break;
case BUS_NOTIFY_ADD_DEVICE:
@@ -1292,39 +1409,91 @@ static int get_device_resources(struct device *dev,
return 1;
}
+static void update_device_table(struct protection_domain *domain)
+{
+ unsigned long flags;
+ int i;
+
+ for (i = 0; i <= amd_iommu_last_bdf; ++i) {
+ if (amd_iommu_pd_table[i] != domain)
+ continue;
+ write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+ set_dte_entry(i, domain);
+ write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+ }
+}
+
+static void update_domain(struct protection_domain *domain)
+{
+ if (!domain->updated)
+ return;
+
+ update_device_table(domain);
+ flush_devices_by_domain(domain);
+ iommu_flush_domain(domain->id);
+
+ domain->updated = false;
+}
+
/*
- * If the pte_page is not yet allocated this function is called
+ * This function is used to add another level to an IO page table. Adding
+ * another level increases the size of the address space by 9 bits to a size up
+ * to 64 bits.
*/
-static u64* alloc_pte(struct protection_domain *dom,
- unsigned long address, u64 **pte_page, gfp_t gfp)
+static bool increase_address_space(struct protection_domain *domain,
+ gfp_t gfp)
+{
+ u64 *pte;
+
+ if (domain->mode == PAGE_MODE_6_LEVEL)
+ /* address space already 64 bit large */
+ return false;
+
+ pte = (void *)get_zeroed_page(gfp);
+ if (!pte)
+ return false;
+
+ *pte = PM_LEVEL_PDE(domain->mode,
+ virt_to_phys(domain->pt_root));
+ domain->pt_root = pte;
+ domain->mode += 1;
+ domain->updated = true;
+
+ return true;
+}
+
+static u64 *alloc_pte(struct protection_domain *domain,
+ unsigned long address,
+ int end_lvl,
+ u64 **pte_page,
+ gfp_t gfp)
{
u64 *pte, *page;
+ int level;
- pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)];
+ while (address > PM_LEVEL_SIZE(domain->mode))
+ increase_address_space(domain, gfp);
- if (!IOMMU_PTE_PRESENT(*pte)) {
- page = (u64 *)get_zeroed_page(gfp);
- if (!page)
- return NULL;
- *pte = IOMMU_L2_PDE(virt_to_phys(page));
- }
+ level = domain->mode - 1;
+ pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+ while (level > end_lvl) {
+ if (!IOMMU_PTE_PRESENT(*pte)) {
+ page = (u64 *)get_zeroed_page(gfp);
+ if (!page)
+ return NULL;
+ *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
+ }
- if (!IOMMU_PTE_PRESENT(*pte)) {
- page = (u64 *)get_zeroed_page(gfp);
- if (!page)
- return NULL;
- *pte = IOMMU_L1_PDE(virt_to_phys(page));
- }
+ level -= 1;
- pte = IOMMU_PTE_PAGE(*pte);
+ pte = IOMMU_PTE_PAGE(*pte);
- if (pte_page)
- *pte_page = pte;
+ if (pte_page && level == end_lvl)
+ *pte_page = pte;
- pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+ pte = &pte[PM_LEVEL_INDEX(level, address)];
+ }
return pte;
}
@@ -1344,10 +1513,13 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
if (!pte) {
- pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
+ pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page,
+ GFP_ATOMIC);
aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
} else
- pte += IOMMU_PTE_L0_INDEX(address);
+ pte += PM_LEVEL_INDEX(0, address);
+
+ update_domain(&dom->domain);
return pte;
}
@@ -1409,7 +1581,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
if (!pte)
return;
- pte += IOMMU_PTE_L0_INDEX(address);
+ pte += PM_LEVEL_INDEX(0, address);
WARN_ON(!*pte);
@@ -1988,19 +2160,47 @@ static void cleanup_domain(struct protection_domain *domain)
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
}
-static int amd_iommu_domain_init(struct iommu_domain *dom)
+static void protection_domain_free(struct protection_domain *domain)
+{
+ if (!domain)
+ return;
+
+ if (domain->id)
+ domain_id_free(domain->id);
+
+ kfree(domain);
+}
+
+static struct protection_domain *protection_domain_alloc(void)
{
struct protection_domain *domain;
domain = kzalloc(sizeof(*domain), GFP_KERNEL);
if (!domain)
- return -ENOMEM;
+ return NULL;
spin_lock_init(&domain->lock);
- domain->mode = PAGE_MODE_3_LEVEL;
domain->id = domain_id_alloc();
if (!domain->id)
+ goto out_err;
+
+ return domain;
+
+out_err:
+ kfree(domain);
+
+ return NULL;
+}
+
+static int amd_iommu_domain_init(struct iommu_domain *dom)
+{
+ struct protection_domain *domain;
+
+ domain = protection_domain_alloc();
+ if (!domain)
goto out_free;
+
+ domain->mode = PAGE_MODE_3_LEVEL;
domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
if (!domain->pt_root)
goto out_free;
@@ -2010,7 +2210,7 @@ static int amd_iommu_domain_init(struct iommu_domain *dom)
return 0;
out_free:
- kfree(domain);
+ protection_domain_free(domain);
return -ENOMEM;
}
@@ -2115,7 +2315,7 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
paddr &= PAGE_MASK;
for (i = 0; i < npages; ++i) {
- ret = iommu_map_page(domain, iova, paddr, prot);
+ ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
if (ret)
return ret;
@@ -2136,7 +2336,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
iova &= PAGE_MASK;
for (i = 0; i < npages; ++i) {
- iommu_unmap_page(domain, iova);
+ iommu_unmap_page(domain, iova, PM_MAP_4k);
iova += PAGE_SIZE;
}
@@ -2151,21 +2351,9 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
phys_addr_t paddr;
u64 *pte;
- pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)];
-
- if (!IOMMU_PTE_PRESENT(*pte))
- return 0;
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L1_INDEX(iova)];
-
- if (!IOMMU_PTE_PRESENT(*pte))
- return 0;
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L0_INDEX(iova)];
+ pte = fetch_pte(domain, iova, PM_MAP_4k);
- if (!IOMMU_PTE_PRESENT(*pte))
+ if (!pte || !IOMMU_PTE_PRESENT(*pte))
return 0;
paddr = *pte & IOMMU_PAGE_MASK;
@@ -2191,3 +2379,46 @@ static struct iommu_ops amd_iommu_ops = {
.domain_has_cap = amd_iommu_domain_has_cap,
};
+/*****************************************************************************
+ *
+ * The next functions do a basic initialization of IOMMU for pass through
+ * mode
+ *
+ * In passthrough mode the IOMMU is initialized and enabled but not used for
+ * DMA-API translation.
+ *
+ *****************************************************************************/
+
+int __init amd_iommu_init_passthrough(void)
+{
+ struct pci_dev *dev = NULL;
+ u16 devid, devid2;
+
+ /* allocate passthroug domain */
+ pt_domain = protection_domain_alloc();
+ if (!pt_domain)
+ return -ENOMEM;
+
+ pt_domain->mode |= PAGE_MODE_NONE;
+
+ while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+ struct amd_iommu *iommu;
+
+ devid = calc_devid(dev->bus->number, dev->devfn);
+ if (devid > amd_iommu_last_bdf)
+ continue;
+
+ devid2 = amd_iommu_alias_table[devid];
+
+ iommu = amd_iommu_rlookup_table[devid2];
+ if (!iommu)
+ continue;
+
+ __attach_device(iommu, pt_domain, devid);
+ __attach_device(iommu, pt_domain, devid2);
+ }
+
+ pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
+
+ return 0;
+}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c1b17e97252e..b4b61d462dcc 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -252,7 +252,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
/* Function to enable the hardware */
static void iommu_enable(struct amd_iommu *iommu)
{
- printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
+ printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n",
dev_name(&iommu->dev->dev), iommu->cap_ptr);
iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
@@ -435,6 +435,20 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
}
/*
+ * This function resets the command buffer if the IOMMU stopped fetching
+ * commands from it.
+ */
+void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
+{
+ iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
+
+ writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+ writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+
+ iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
+}
+
+/*
* This function writes the command buffer address to the hardware and
* enables it.
*/
@@ -450,11 +464,7 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
&entry, sizeof(entry));
- /* set head and tail to zero manually */
- writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
- writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-
- iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
+ amd_iommu_reset_cmd_buffer(iommu);
}
static void __init free_command_buffer(struct amd_iommu *iommu)
@@ -858,7 +868,7 @@ static int __init init_iommu_all(struct acpi_table_header *table)
switch (*p) {
case ACPI_IVHD_TYPE:
- DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x "
+ DUMP_printk("device: %02x:%02x.%01x cap: %04x "
"seg: %d flags: %01x info %04x\n",
PCI_BUS(h->devid), PCI_SLOT(h->devid),
PCI_FUNC(h->devid), h->cap_ptr,
@@ -902,7 +912,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
IRQF_SAMPLE_RANDOM,
- "AMD IOMMU",
+ "AMD-Vi",
NULL);
if (r) {
@@ -1150,7 +1160,7 @@ int __init amd_iommu_init(void)
if (no_iommu) {
- printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n");
+ printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
return 0;
}
@@ -1242,22 +1252,28 @@ int __init amd_iommu_init(void)
if (ret)
goto free;
- ret = amd_iommu_init_dma_ops();
+ if (iommu_pass_through)
+ ret = amd_iommu_init_passthrough();
+ else
+ ret = amd_iommu_init_dma_ops();
if (ret)
goto free;
enable_iommus();
- printk(KERN_INFO "AMD IOMMU: device isolation ");
+ if (iommu_pass_through)
+ goto out;
+
+ printk(KERN_INFO "AMD-Vi: device isolation ");
if (amd_iommu_isolate)
printk("enabled\n");
else
printk("disabled\n");
if (amd_iommu_unmap_flush)
- printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n");
+ printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
else
- printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n");
+ printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
out:
return ret;
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 676debfc1702..128111d8ffe0 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -20,6 +20,7 @@
#include <linux/bitops.h>
#include <linux/ioport.h>
#include <linux/suspend.h>
+#include <linux/kmemleak.h>
#include <asm/e820.h>
#include <asm/io.h>
#include <asm/iommu.h>
@@ -94,6 +95,11 @@ static u32 __init allocate_aperture(void)
* code for safe
*/
p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20);
+ /*
+ * Kmemleak should not scan this block as it may not be mapped via the
+ * kernel direct mapping.
+ */
+ kmemleak_ignore(p);
if (!p || __pa(p)+aper_size > 0xffffffff) {
printk(KERN_ERR
"Cannot allocate aperture memory hole (%p,%uK)\n",
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 0a1c2830ec66..159740decc41 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -49,6 +49,7 @@
#include <asm/mtrr.h>
#include <asm/smp.h>
#include <asm/mce.h>
+#include <asm/kvm_para.h>
unsigned int num_processors;
@@ -1361,52 +1362,80 @@ void enable_x2apic(void)
}
#endif /* CONFIG_X86_X2APIC */
-void __init enable_IR_x2apic(void)
+int __init enable_IR(void)
{
#ifdef CONFIG_INTR_REMAP
- int ret;
- unsigned long flags;
- struct IO_APIC_route_entry **ioapic_entries = NULL;
-
- ret = dmar_table_init();
- if (ret) {
- pr_debug("dmar_table_init() failed with %d:\n", ret);
- goto ir_failed;
- }
-
if (!intr_remapping_supported()) {
pr_debug("intr-remapping not supported\n");
- goto ir_failed;
+ return 0;
}
-
if (!x2apic_preenabled && skip_ioapic_setup) {
pr_info("Skipped enabling intr-remap because of skipping "
"io-apic setup\n");
- return;
+ return 0;
}
+ if (enable_intr_remapping(x2apic_supported()))
+ return 0;
+
+ pr_info("Enabled Interrupt-remapping\n");
+
+ return 1;
+
+#endif
+ return 0;
+}
+
+void __init enable_IR_x2apic(void)
+{
+ unsigned long flags;
+ struct IO_APIC_route_entry **ioapic_entries = NULL;
+ int ret, x2apic_enabled = 0;
+ int dmar_table_init_ret = 0;
+
+#ifdef CONFIG_INTR_REMAP
+ dmar_table_init_ret = dmar_table_init();
+ if (dmar_table_init_ret)
+ pr_debug("dmar_table_init() failed with %d:\n",
+ dmar_table_init_ret);
+#endif
+
ioapic_entries = alloc_ioapic_entries();
if (!ioapic_entries) {
- pr_info("Allocate ioapic_entries failed: %d\n", ret);
- goto end;
+ pr_err("Allocate ioapic_entries failed\n");
+ goto out;
}
ret = save_IO_APIC_setup(ioapic_entries);
if (ret) {
pr_info("Saving IO-APIC state failed: %d\n", ret);
- goto end;
+ goto out;
}
local_irq_save(flags);
- mask_IO_APIC_setup(ioapic_entries);
mask_8259A();
+ mask_IO_APIC_setup(ioapic_entries);
- ret = enable_intr_remapping(x2apic_supported());
- if (ret)
- goto end_restore;
+ if (dmar_table_init_ret)
+ ret = 0;
+ else
+ ret = enable_IR();
- pr_info("Enabled Interrupt-remapping\n");
+ if (!ret) {
+ /* IR is required if there is APIC ID > 255 even when running
+ * under KVM
+ */
+ if (max_physical_apicid > 255 || !kvm_para_available())
+ goto nox2apic;
+ /*
+ * without IR all CPUs can be addressed by IOAPIC/MSI
+ * only in physical mode
+ */
+ x2apic_force_phys();
+ }
+
+ x2apic_enabled = 1;
if (x2apic_supported() && !x2apic_mode) {
x2apic_mode = 1;
@@ -1414,41 +1443,25 @@ void __init enable_IR_x2apic(void)
pr_info("Enabled x2apic\n");
}
-end_restore:
- if (ret)
- /*
- * IR enabling failed
- */
+nox2apic:
+ if (!ret) /* IR enabling failed */
restore_IO_APIC_setup(ioapic_entries);
-
unmask_8259A();
local_irq_restore(flags);
-end:
+out:
if (ioapic_entries)
free_ioapic_entries(ioapic_entries);
- if (!ret)
+ if (x2apic_enabled)
return;
-ir_failed:
if (x2apic_preenabled)
- panic("x2apic enabled by bios. But IR enabling failed");
+ panic("x2apic: enabled by BIOS but kernel init failed.");
else if (cpu_has_x2apic)
- pr_info("Not enabling x2apic,Intr-remapping\n");
-#else
- if (!cpu_has_x2apic)
- return;
-
- if (x2apic_preenabled)
- panic("x2apic enabled prior OS handover,"
- " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");
-#endif
-
- return;
+ pr_info("Not enabling x2apic, Intr-remapping init failed.\n");
}
-
#ifdef CONFIG_X86_64
/*
* Detect and enable local APICs on non-SMP boards.
@@ -1549,8 +1562,6 @@ no_apic:
#ifdef CONFIG_X86_64
void __init early_init_lapic_mapping(void)
{
- unsigned long phys_addr;
-
/*
* If no local APIC can be found then go out
* : it means there is no mpatable and MADT
@@ -1558,11 +1569,9 @@ void __init early_init_lapic_mapping(void)
if (!smp_found_config)
return;
- phys_addr = mp_lapic_addr;
-
- set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
+ set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- APIC_BASE, phys_addr);
+ APIC_BASE, mp_lapic_addr);
/*
* Fetch the APIC ID of the BSP in case we have a
@@ -1651,7 +1660,6 @@ int __init APIC_init_uniprocessor(void)
APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
boot_cpu_physical_apicid);
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
return -1;
}
#endif
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 8952a5890281..89174f847b49 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -167,7 +167,7 @@ static int es7000_apic_is_cluster(void)
{
/* MPENTIUMIII */
if (boot_cpu_data.x86 == 6 &&
- (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11))
+ (boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11))
return 1;
return 0;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index d2ed6c5ddc80..3c8f9e75d038 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -66,6 +66,8 @@
#include <asm/apic.h>
#define __apicdebuginit(type) static type __init
+#define for_each_irq_pin(entry, head) \
+ for (entry = head; entry; entry = entry->next)
/*
* Is the SiS APIC rmw bug present ?
@@ -85,6 +87,9 @@ int nr_ioapic_registers[MAX_IO_APICS];
struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
int nr_ioapics;
+/* IO APIC gsi routing info */
+struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS];
+
/* MP IRQ source entries */
struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
@@ -116,15 +121,6 @@ static int __init parse_noapic(char *str)
}
early_param("noapic", parse_noapic);
-struct irq_pin_list;
-
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
-
struct irq_pin_list {
int apic, pin;
struct irq_pin_list *next;
@@ -139,6 +135,11 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node)
return pin;
}
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * Most irqs are mapped 1:1 with pins.
+ */
struct irq_cfg {
struct irq_pin_list *irq_2_pin;
cpumask_var_t domain;
@@ -414,13 +415,10 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
unsigned long flags;
spin_lock_irqsave(&ioapic_lock, flags);
- entry = cfg->irq_2_pin;
- for (;;) {
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
unsigned int reg;
int pin;
- if (!entry)
- break;
pin = entry->pin;
reg = io_apic_read(entry->apic, 0x10 + pin*2);
/* Is the remote IRR bit set? */
@@ -428,9 +426,6 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
spin_unlock_irqrestore(&ioapic_lock, flags);
return true;
}
- if (!entry->next)
- break;
- entry = entry->next;
}
spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -498,72 +493,68 @@ static void ioapic_mask_entry(int apic, int pin)
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
-static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static int
+add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
{
- struct irq_pin_list *entry;
+ struct irq_pin_list **last, *entry;
- entry = cfg->irq_2_pin;
- if (!entry) {
- entry = get_one_free_irq_2_pin(node);
- if (!entry) {
- printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
- apic, pin);
- return;
- }
- cfg->irq_2_pin = entry;
- entry->apic = apic;
- entry->pin = pin;
- return;
- }
-
- while (entry->next) {
- /* not again, please */
+ /* don't allow duplicates */
+ last = &cfg->irq_2_pin;
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
if (entry->apic == apic && entry->pin == pin)
- return;
-
- entry = entry->next;
+ return 0;
+ last = &entry->next;
}
- entry->next = get_one_free_irq_2_pin(node);
- entry = entry->next;
+ entry = get_one_free_irq_2_pin(node);
+ if (!entry) {
+ printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n",
+ node, apic, pin);
+ return -ENOMEM;
+ }
entry->apic = apic;
entry->pin = pin;
+
+ *last = entry;
+ return 0;
+}
+
+static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+{
+ if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin))
+ panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
}
/*
* Reroute an IRQ to a different pin.
*/
static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
- int oldapic, int oldpin,
- int newapic, int newpin)
+ int oldapic, int oldpin,
+ int newapic, int newpin)
{
- struct irq_pin_list *entry = cfg->irq_2_pin;
- int replaced = 0;
+ struct irq_pin_list *entry;
- while (entry) {
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
if (entry->apic == oldapic && entry->pin == oldpin) {
entry->apic = newapic;
entry->pin = newpin;
- replaced = 1;
/* every one is different, right? */
- break;
+ return;
}
- entry = entry->next;
}
- /* why? call replace before add? */
- if (!replaced)
- add_pin_to_irq_node(cfg, node, newapic, newpin);
+ /* old apic/pin didn't exist, so just add new ones */
+ add_pin_to_irq_node(cfg, node, newapic, newpin);
}
-static inline void io_apic_modify_irq(struct irq_cfg *cfg,
- int mask_and, int mask_or,
- void (*final)(struct irq_pin_list *entry))
+static void io_apic_modify_irq(struct irq_cfg *cfg,
+ int mask_and, int mask_or,
+ void (*final)(struct irq_pin_list *entry))
{
int pin;
struct irq_pin_list *entry;
- for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
unsigned int reg;
pin = entry->pin;
reg = io_apic_read(entry->apic, 0x10 + pin * 2);
@@ -580,7 +571,6 @@ static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
}
-#ifdef CONFIG_X86_64
static void io_apic_sync(struct irq_pin_list *entry)
{
/*
@@ -596,11 +586,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
{
io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
}
-#else /* CONFIG_X86_32 */
-static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
-{
- io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
-}
static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
{
@@ -613,7 +598,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
}
-#endif /* CONFIG_X86_32 */
static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
{
@@ -1702,12 +1686,8 @@ __apicdebuginit(void) print_IO_APIC(void)
if (!entry)
continue;
printk(KERN_DEBUG "IRQ%d ", irq);
- for (;;) {
+ for_each_irq_pin(entry, cfg->irq_2_pin)
printk("-> %d:%d", entry->apic, entry->pin);
- if (!entry->next)
- break;
- entry = entry->next;
- }
printk("\n");
}
@@ -2211,7 +2191,6 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
return was_pending;
}
-#ifdef CONFIG_X86_64
static int ioapic_retrigger_irq(unsigned int irq)
{
@@ -2224,14 +2203,6 @@ static int ioapic_retrigger_irq(unsigned int irq)
return 1;
}
-#else
-static int ioapic_retrigger_irq(unsigned int irq)
-{
- apic->send_IPI_self(irq_cfg(irq)->vector);
-
- return 1;
-}
-#endif
/*
* Level and edge triggered IO-APIC interrupts need different handling,
@@ -2269,13 +2240,9 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
struct irq_pin_list *entry;
u8 vector = cfg->vector;
- entry = cfg->irq_2_pin;
- for (;;) {
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
unsigned int reg;
- if (!entry)
- break;
-
apic = entry->apic;
pin = entry->pin;
/*
@@ -2288,9 +2255,6 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
reg &= ~IO_APIC_REDIR_VECTOR_MASK;
reg |= vector;
io_apic_modify(apic, 0x10 + pin*2, reg);
- if (!entry->next)
- break;
- entry = entry->next;
}
}
@@ -2515,11 +2479,8 @@ atomic_t irq_mis_count;
static void ack_apic_level(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
-
-#ifdef CONFIG_X86_32
unsigned long v;
int i;
-#endif
struct irq_cfg *cfg;
int do_unmask_irq = 0;
@@ -2532,31 +2493,28 @@ static void ack_apic_level(unsigned int irq)
}
#endif
-#ifdef CONFIG_X86_32
/*
- * It appears there is an erratum which affects at least version 0x11
- * of I/O APIC (that's the 82093AA and cores integrated into various
- * chipsets). Under certain conditions a level-triggered interrupt is
- * erroneously delivered as edge-triggered one but the respective IRR
- * bit gets set nevertheless. As a result the I/O unit expects an EOI
- * message but it will never arrive and further interrupts are blocked
- * from the source. The exact reason is so far unknown, but the
- * phenomenon was observed when two consecutive interrupt requests
- * from a given source get delivered to the same CPU and the source is
- * temporarily disabled in between.
- *
- * A workaround is to simulate an EOI message manually. We achieve it
- * by setting the trigger mode to edge and then to level when the edge
- * trigger mode gets detected in the TMR of a local APIC for a
- * level-triggered interrupt. We mask the source for the time of the
- * operation to prevent an edge-triggered interrupt escaping meanwhile.
- * The idea is from Manfred Spraul. --macro
- */
+ * It appears there is an erratum which affects at least version 0x11
+ * of I/O APIC (that's the 82093AA and cores integrated into various
+ * chipsets). Under certain conditions a level-triggered interrupt is
+ * erroneously delivered as edge-triggered one but the respective IRR
+ * bit gets set nevertheless. As a result the I/O unit expects an EOI
+ * message but it will never arrive and further interrupts are blocked
+ * from the source. The exact reason is so far unknown, but the
+ * phenomenon was observed when two consecutive interrupt requests
+ * from a given source get delivered to the same CPU and the source is
+ * temporarily disabled in between.
+ *
+ * A workaround is to simulate an EOI message manually. We achieve it
+ * by setting the trigger mode to edge and then to level when the edge
+ * trigger mode gets detected in the TMR of a local APIC for a
+ * level-triggered interrupt. We mask the source for the time of the
+ * operation to prevent an edge-triggered interrupt escaping meanwhile.
+ * The idea is from Manfred Spraul. --macro
+ */
cfg = desc->chip_data;
i = cfg->vector;
-
v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
-#endif
/*
* We must acknowledge the irq before we move it or the acknowledge will
@@ -2598,7 +2556,7 @@ static void ack_apic_level(unsigned int irq)
unmask_IO_APIC_irq_desc(desc);
}
-#ifdef CONFIG_X86_32
+ /* Tail end of version 0x11 I/O APIC bug workaround */
if (!(v & (1 << (i & 0x1f)))) {
atomic_inc(&irq_mis_count);
spin_lock(&ioapic_lock);
@@ -2606,26 +2564,15 @@ static void ack_apic_level(unsigned int irq)
__unmask_and_level_IO_APIC_irq(cfg);
spin_unlock(&ioapic_lock);
}
-#endif
}
#ifdef CONFIG_INTR_REMAP
static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
{
- int apic, pin;
struct irq_pin_list *entry;
- entry = cfg->irq_2_pin;
- for (;;) {
-
- if (!entry)
- break;
-
- apic = entry->apic;
- pin = entry->pin;
- io_apic_eoi(apic, pin);
- entry = entry->next;
- }
+ for_each_irq_pin(entry, cfg->irq_2_pin)
+ io_apic_eoi(entry->apic, entry->pin);
}
static void
@@ -3241,8 +3188,7 @@ void destroy_irq(unsigned int irq)
cfg = desc->chip_data;
dynamic_irq_cleanup(irq);
/* connect back irq_cfg */
- if (desc)
- desc->chip_data = cfg;
+ desc->chip_data = cfg;
free_irte(irq);
spin_lock_irqsave(&vector_lock, flags);
@@ -3912,7 +3858,11 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
*/
if (irq >= NR_IRQS_LEGACY) {
cfg = desc->chip_data;
- add_pin_to_irq_node(cfg, node, ioapic, pin);
+ if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
+ printk(KERN_INFO "can not add pin %d for irq %d\n",
+ pin, irq);
+ return 0;
+ }
}
setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
@@ -3941,11 +3891,28 @@ int io_apic_set_pci_routing(struct device *dev, int irq,
return __io_apic_set_pci_routing(dev, irq, irq_attr);
}
-/* --------------------------------------------------------------------------
- ACPI-based IOAPIC Configuration
- -------------------------------------------------------------------------- */
+u8 __init io_apic_unique_id(u8 id)
+{
+#ifdef CONFIG_X86_32
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+ !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ return io_apic_get_unique_id(nr_ioapics, id);
+ else
+ return id;
+#else
+ int i;
+ DECLARE_BITMAP(used, 256);
-#ifdef CONFIG_ACPI
+ bitmap_zero(used, 256);
+ for (i = 0; i < nr_ioapics; i++) {
+ struct mpc_ioapic *ia = &mp_ioapics[i];
+ __set_bit(ia->apicid, used);
+ }
+ if (!test_bit(id, used))
+ return id;
+ return find_first_zero_bit(used, 256);
+#endif
+}
#ifdef CONFIG_X86_32
int __init io_apic_get_unique_id(int ioapic, int apic_id)
@@ -4054,8 +4021,6 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
return 0;
}
-#endif /* CONFIG_ACPI */
-
/*
* This function currently is only a helper for the i386 smp boot process where
* we need to reprogram the ioredtbls to cater for the cpus which have come online
@@ -4109,7 +4074,7 @@ void __init setup_ioapic_dest(void)
static struct resource *ioapic_resources;
-static struct resource * __init ioapic_setup_resources(void)
+static struct resource * __init ioapic_setup_resources(int nr_ioapics)
{
unsigned long n;
struct resource *res;
@@ -4125,15 +4090,13 @@ static struct resource * __init ioapic_setup_resources(void)
mem = alloc_bootmem(n);
res = (void *)mem;
- if (mem != NULL) {
- mem += sizeof(struct resource) * nr_ioapics;
+ mem += sizeof(struct resource) * nr_ioapics;
- for (i = 0; i < nr_ioapics; i++) {
- res[i].name = mem;
- res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- sprintf(mem, "IOAPIC %u", i);
- mem += IOAPIC_RESOURCE_NAME_SIZE;
- }
+ for (i = 0; i < nr_ioapics; i++) {
+ res[i].name = mem;
+ res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ sprintf(mem, "IOAPIC %u", i);
+ mem += IOAPIC_RESOURCE_NAME_SIZE;
}
ioapic_resources = res;
@@ -4147,7 +4110,7 @@ void __init ioapic_init_mappings(void)
struct resource *ioapic_res;
int i;
- ioapic_res = ioapic_setup_resources();
+ ioapic_res = ioapic_setup_resources(nr_ioapics);
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
ioapic_phys = mp_ioapics[i].apicaddr;
@@ -4176,11 +4139,9 @@ fake_ioapic_page:
__fix_to_virt(idx), ioapic_phys);
idx++;
- if (ioapic_res != NULL) {
- ioapic_res->start = ioapic_phys;
- ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
- ioapic_res++;
- }
+ ioapic_res->start = ioapic_phys;
+ ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
+ ioapic_res++;
}
}
@@ -4201,3 +4162,76 @@ void __init ioapic_insert_resources(void)
r++;
}
}
+
+int mp_find_ioapic(int gsi)
+{
+ int i = 0;
+
+ /* Find the IOAPIC that manages this GSI. */
+ for (i = 0; i < nr_ioapics; i++) {
+ if ((gsi >= mp_gsi_routing[i].gsi_base)
+ && (gsi <= mp_gsi_routing[i].gsi_end))
+ return i;
+ }
+
+ printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+ return -1;
+}
+
+int mp_find_ioapic_pin(int ioapic, int gsi)
+{
+ if (WARN_ON(ioapic == -1))
+ return -1;
+ if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end))
+ return -1;
+
+ return gsi - mp_gsi_routing[ioapic].gsi_base;
+}
+
+static int bad_ioapic(unsigned long address)
+{
+ if (nr_ioapics >= MAX_IO_APICS) {
+ printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded "
+ "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
+ return 1;
+ }
+ if (!address) {
+ printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address"
+ " found in table, skipping!\n");
+ return 1;
+ }
+ return 0;
+}
+
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+{
+ int idx = 0;
+
+ if (bad_ioapic(address))
+ return;
+
+ idx = nr_ioapics;
+
+ mp_ioapics[idx].type = MP_IOAPIC;
+ mp_ioapics[idx].flags = MPC_APIC_USABLE;
+ mp_ioapics[idx].apicaddr = address;
+
+ set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+ mp_ioapics[idx].apicid = io_apic_unique_id(id);
+ mp_ioapics[idx].apicver = io_apic_get_version(idx);
+
+ /*
+ * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+ * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
+ */
+ mp_gsi_routing[idx].gsi_base = gsi_base;
+ mp_gsi_routing[idx].gsi_end = gsi_base +
+ io_apic_get_redir_entries(idx);
+
+ printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
+ "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
+ mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
+ mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end);
+
+ nr_ioapics++;
+}
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index e6b4f517fcfe..08385e090a6f 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -106,6 +106,9 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
unsigned long mask = cpumask_bits(cpumask)[0];
unsigned long flags;
+ if (WARN_ONCE(!mask, "empty IPI mask"))
+ return;
+
local_irq_save(flags);
WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
__default_send_IPI_dest_field(mask, vector, apic->dest_logical);
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index b3025b43b63a..db7220220d09 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -39,7 +39,7 @@
int unknown_nmi_panic;
int nmi_watchdog_enabled;
-static cpumask_var_t backtrace_mask;
+static cpumask_t backtrace_mask __read_mostly;
/* nmi_active:
* >0: the lapic NMI watchdog is active, but can be disabled
@@ -138,7 +138,6 @@ int __init check_nmi_watchdog(void)
if (!prev_nmi_count)
goto error;
- alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO);
printk(KERN_INFO "Testing NMI watchdog ... ");
#ifdef CONFIG_SMP
@@ -415,14 +414,17 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
}
/* We can be called before check_nmi_watchdog, hence NULL check. */
- if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) {
+ if (cpumask_test_cpu(cpu, &backtrace_mask)) {
static DEFINE_SPINLOCK(lock); /* Serialise the printks */
spin_lock(&lock);
printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
+ show_regs(regs);
dump_stack();
spin_unlock(&lock);
- cpumask_clear_cpu(cpu, backtrace_mask);
+ cpumask_clear_cpu(cpu, &backtrace_mask);
+
+ rc = 1;
}
/* Could check oops_in_progress here too, but it's safer not to */
@@ -552,14 +554,18 @@ int do_nmi_callback(struct pt_regs *regs, int cpu)
return 0;
}
-void __trigger_all_cpu_backtrace(void)
+void arch_trigger_all_cpu_backtrace(void)
{
int i;
- cpumask_copy(backtrace_mask, cpu_online_mask);
+ cpumask_copy(&backtrace_mask, cpu_online_mask);
+
+ printk(KERN_INFO "sending NMI to all CPUs:\n");
+ apic->send_IPI_all(NMI_VECTOR);
+
/* Wait for up to 10 seconds for all CPUs to do the backtrace */
for (i = 0; i < 10 * 1000; i++) {
- if (cpumask_empty(backtrace_mask))
+ if (cpumask_empty(&backtrace_mask))
break;
mdelay(1);
}
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index bc3e880f9b82..65edc180fc82 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -44,17 +44,22 @@ static struct apic *apic_probe[] __initdata = {
NULL,
};
+static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+ return hard_smp_processor_id() >> index_msb;
+}
+
/*
* Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
*/
void __init default_setup_apic_routing(void)
{
#ifdef CONFIG_X86_X2APIC
- if (x2apic_mode && (apic != &apic_x2apic_phys &&
+ if (x2apic_mode
#ifdef CONFIG_X86_UV
- apic != &apic_x2apic_uv_x &&
+ && apic != &apic_x2apic_uv_x
#endif
- apic != &apic_x2apic_cluster)) {
+ ) {
if (x2apic_phys)
apic = &apic_x2apic_phys;
else
@@ -69,6 +74,11 @@ void __init default_setup_apic_routing(void)
printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
}
+ if (is_vsmp_box()) {
+ /* need to update phys_pkg_id */
+ apic->phys_pkg_id = apicid_phys_pkg_id;
+ }
+
/*
* Now that apic routing model is selected, configure the
* fault handling for intr remapping.
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 442b5508893f..151ace69a5aa 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -403,7 +403,15 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
static struct apm_user *user_list;
static DEFINE_SPINLOCK(user_list_lock);
-static const struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } };
+
+/*
+ * Set up a segment that references the real mode segment 0x40
+ * that extends up to the end of page zero (that we have reserved).
+ * This is for buggy BIOS's that refer to (real mode) segment 0x40
+ * even though they are called in protected mode.
+ */
+static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092,
+ (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1);
static const char driver_version[] = "1.16ac"; /* no spaces */
@@ -2332,15 +2340,6 @@ static int __init apm_init(void)
pm_flags |= PM_APM;
/*
- * Set up a segment that references the real mode segment 0x40
- * that extends up to the end of page zero (that we have reserved).
- * This is for buggy BIOS's that refer to (real mode) segment 0x40
- * even though they are called in protected mode.
- */
- set_base(bad_bios_desc, __va((unsigned long)0x40 << 4));
- _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4));
-
- /*
* Set up the long jump entry point to the APM BIOS, which is called
* from inline assembly.
*/
@@ -2358,12 +2357,12 @@ static int __init apm_init(void)
* code to that CPU.
*/
gdt = get_cpu_gdt_table(0);
- set_base(gdt[APM_CS >> 3],
- __va((unsigned long)apm_info.bios.cseg << 4));
- set_base(gdt[APM_CS_16 >> 3],
- __va((unsigned long)apm_info.bios.cseg_16 << 4));
- set_base(gdt[APM_DS >> 3],
- __va((unsigned long)apm_info.bios.dseg << 4));
+ set_desc_base(&gdt[APM_CS >> 3],
+ (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4));
+ set_desc_base(&gdt[APM_CS_16 >> 3],
+ (unsigned long)__va((unsigned long)apm_info.bios.cseg_16 << 4));
+ set_desc_base(&gdt[APM_DS >> 3],
+ (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4));
proc_create("apm", 0, NULL, &apm_file_ops);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 898ecc47e129..4a6aeedcd965 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -3,6 +3,7 @@
* This code generates raw asm output which is post-processed to extract
* and format the required data.
*/
+#define COMPILE_OFFSETS
#include <linux/crypto.h>
#include <linux/sched.h>
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 83b217c7225f..22a47c82f3c0 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -253,6 +253,64 @@ static int __cpuinit nearby_node(int apicid)
#endif
/*
+ * Fixup core topology information for AMD multi-node processors.
+ * Assumption 1: Number of cores in each internal node is the same.
+ * Assumption 2: Mixed systems with both single-node and dual-node
+ * processors are not supported.
+ */
+#ifdef CONFIG_X86_HT
+static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_PCI
+ u32 t, cpn;
+ u8 n, n_id;
+ int cpu = smp_processor_id();
+
+ /* fixup topology information only once for a core */
+ if (cpu_has(c, X86_FEATURE_AMD_DCM))
+ return;
+
+ /* check for multi-node processor on boot cpu */
+ t = read_pci_config(0, 24, 3, 0xe8);
+ if (!(t & (1 << 29)))
+ return;
+
+ set_cpu_cap(c, X86_FEATURE_AMD_DCM);
+
+ /* cores per node: each internal node has half the number of cores */
+ cpn = c->x86_max_cores >> 1;
+
+ /* even-numbered NB_id of this dual-node processor */
+ n = c->phys_proc_id << 1;
+
+ /*
+ * determine internal node id and assign cores fifty-fifty to
+ * each node of the dual-node processor
+ */
+ t = read_pci_config(0, 24 + n, 3, 0xe8);
+ n = (t>>30) & 0x3;
+ if (n == 0) {
+ if (c->cpu_core_id < cpn)
+ n_id = 0;
+ else
+ n_id = 1;
+ } else {
+ if (c->cpu_core_id < cpn)
+ n_id = 1;
+ else
+ n_id = 0;
+ }
+
+ /* compute entire NodeID, use llc_shared_map to store sibling info */
+ per_cpu(cpu_llc_id, cpu) = (c->phys_proc_id << 1) + n_id;
+
+ /* fixup core id to be in range from 0 to cpn */
+ c->cpu_core_id = c->cpu_core_id % cpn;
+#endif
+}
+#endif
+
+/*
* On a AMD dual core setup the lower bits of the APIC id distingush the cores.
* Assumes number of cores is a power of two.
*/
@@ -269,6 +327,9 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
c->phys_proc_id = c->initial_apicid >> bits;
/* use socket ID also for last level cache */
per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
+ /* fixup topology information on multi-node processors */
+ if ((c->x86 == 0x10) && (c->x86_model == 9))
+ amd_fixup_dcm(c);
#endif
}
@@ -277,9 +338,10 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
int cpu = smp_processor_id();
int node;
- unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
+ unsigned apicid = c->apicid;
+
+ node = per_cpu(cpu_llc_id, cpu);
- node = c->phys_proc_id;
if (apicid_to_node[apicid] != NUMA_NO_NODE)
node = apicid_to_node[apicid];
if (!node_online(node)) {
@@ -406,12 +468,24 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
/*
* Some BIOSes incorrectly force this feature, but only K8
* revision D (model = 0x14) and later actually support it.
+ * (AMD Erratum #110, docId: 25759).
*/
- if (c->x86_model < 0x14)
+ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
+ u64 val;
+
clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
+ if (!rdmsrl_amd_safe(0xc001100d, &val)) {
+ val &= ~(1ULL << 32);
+ wrmsrl_amd_safe(0xc001100d, val);
+ }
+ }
+
}
if (c->x86 == 0x10 || c->x86 == 0x11)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /* get apicid instead of initial apic id from cpuid */
+ c->apicid = hard_smp_processor_id();
#else
/*
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 734eaad93656..2055fc2b2e6b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -94,45 +94,45 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
* TLS descriptors are currently at a different place compared to i386.
* Hopefully nobody expects them at a fixed place (Wine?)
*/
- [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
- [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
- [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
- [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
- [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
- [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
+ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),
#else
- [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
- [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
- [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
- [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff),
/*
* Segments used for calling PnP BIOS have byte granularity.
* They code segments and data segments have fixed 64k limits,
* the transfer segment sizes are set at run time.
*/
/* 32-bit code */
- [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
+ [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
/* 16-bit code */
- [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
+ [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
/* 16-bit data */
- [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
+ [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff),
/* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
+ [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0),
/* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
+ [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0),
/*
* The APM segments have byte granularity and their bases
* are set at run time. All have 64k limits.
*/
/* 32-bit code */
- [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
+ [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
/* 16-bit code */
- [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
+ [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
/* data */
- [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
+ [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff),
- [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } },
- [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
+ [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+ [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
GDT_STACK_CANARY_INIT
#endif
} };
@@ -987,13 +987,21 @@ struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
DEFINE_PER_CPU_FIRST(union irq_stack_union,
irq_stack_union) __aligned(PAGE_SIZE);
-DEFINE_PER_CPU(char *, irq_stack_ptr) =
- init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
+/*
+ * The following four percpu variables are hot. Align current_task to
+ * cacheline size such that all four fall in the same cacheline.
+ */
+DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
+ &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
DEFINE_PER_CPU(unsigned long, kernel_stack) =
(unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
EXPORT_PER_CPU_SYMBOL(kernel_stack);
+DEFINE_PER_CPU(char *, irq_stack_ptr) =
+ init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
+
DEFINE_PER_CPU(unsigned int, irq_count) = -1;
/*
@@ -1008,8 +1016,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
};
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
- [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
- __aligned(PAGE_SIZE);
+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
/* May not be marked __init: used by software suspend */
void syscall_init(void)
@@ -1042,8 +1049,11 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist);
#else /* CONFIG_X86_64 */
+DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
#ifdef CONFIG_CC_STACKPROTECTOR
-DEFINE_PER_CPU(unsigned long, stack_canary);
+DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
#endif
/* Make sure %fs and %gs are initialized properly in idle threads */
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 306bf0dca061..804c40e2bc3e 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -241,7 +241,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
case 0:
if (!l1->val)
return;
- assoc = l1->assoc;
+ assoc = assocs[l1->assoc];
line_size = l1->line_size;
lines_per_tag = l1->lines_per_tag;
size_in_kb = l1->size_in_kb;
@@ -249,7 +249,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
case 2:
if (!l2.val)
return;
- assoc = l2.assoc;
+ assoc = assocs[l2.assoc];
line_size = l2.line_size;
lines_per_tag = l2.lines_per_tag;
/* cpu_data has errata corrections for K7 applied */
@@ -258,10 +258,14 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
case 3:
if (!l3.val)
return;
- assoc = l3.assoc;
+ assoc = assocs[l3.assoc];
line_size = l3.line_size;
lines_per_tag = l3.lines_per_tag;
size_in_kb = l3.size_encoded * 512;
+ if (boot_cpu_has(X86_FEATURE_AMD_DCM)) {
+ size_in_kb = size_in_kb >> 1;
+ assoc = assoc >> 1;
+ }
break;
default:
return;
@@ -270,18 +274,14 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
eax->split.is_self_initializing = 1;
eax->split.type = types[leaf];
eax->split.level = levels[leaf];
- if (leaf == 3)
- eax->split.num_threads_sharing =
- current_cpu_data.x86_max_cores - 1;
- else
- eax->split.num_threads_sharing = 0;
+ eax->split.num_threads_sharing = 0;
eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
- if (assoc == 0xf)
+ if (assoc == 0xffff)
eax->split.is_fully_associative = 1;
ebx->split.coherency_line_size = line_size - 1;
- ebx->split.ways_of_associativity = assocs[assoc] - 1;
+ ebx->split.ways_of_associativity = assoc - 1;
ebx->split.physical_line_partition = lines_per_tag - 1;
ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
(ebx->split.ways_of_associativity + 1) - 1;
@@ -523,6 +523,18 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
int index_msb, i;
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
+ struct cpuinfo_x86 *d;
+ for_each_online_cpu(i) {
+ if (!per_cpu(cpuid4_info, i))
+ continue;
+ d = &cpu_data(i);
+ this_leaf = CPUID4_INFO_IDX(i, index);
+ cpumask_copy(to_cpumask(this_leaf->shared_cpu_map),
+ d->llc_shared_map);
+ }
+ return;
+ }
this_leaf = CPUID4_INFO_IDX(cpu, index);
num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 01213048f62f..9bfe9d2ea615 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -183,6 +183,11 @@ void mce_log(struct mce *mce)
set_bit(0, &mce_need_notify);
}
+void __weak decode_mce(struct mce *m)
+{
+ return;
+}
+
static void print_mce(struct mce *m)
{
printk(KERN_EMERG
@@ -205,6 +210,8 @@ static void print_mce(struct mce *m)
printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
m->cpuvendor, m->cpuid, m->time, m->socketid,
m->apicid);
+
+ decode_mce(m);
}
static void print_mce_head(void)
@@ -215,7 +222,10 @@ static void print_mce_head(void)
static void print_mce_tail(void)
{
printk(KERN_EMERG "This is not a software problem!\n"
- "Run through mcelog --ascii to decode and contact your hardware vendor\n");
+#if (!defined(CONFIG_EDAC) || !defined(CONFIG_CPU_SUP_AMD))
+ "Run through mcelog --ascii to decode and contact your hardware vendor\n"
+#endif
+ );
}
#define PANIC_TIMEOUT 5 /* 5 seconds */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index ddae21620bda..1fecba404fd8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -489,12 +489,14 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
int i, err = 0;
struct threshold_bank *b = NULL;
char name[32];
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
sprintf(name, "threshold_bank%i", bank);
#ifdef CONFIG_SMP
if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
- i = cpumask_first(cpu_core_mask(cpu));
+ i = cpumask_first(c->llc_shared_map);
/* first core not up yet */
if (cpu_data(i).cpu_core_id)
@@ -514,7 +516,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (err)
goto out;
- cpumask_copy(b->cpus, cpu_core_mask(cpu));
+ cpumask_copy(b->cpus, c->llc_shared_map);
per_cpu(threshold_banks, cpu)[bank] = b;
goto out;
@@ -539,7 +541,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
#ifndef CONFIG_SMP
cpumask_setall(b->cpus);
#else
- cpumask_copy(b->cpus, cpu_core_mask(cpu));
+ cpumask_copy(b->cpus, c->llc_shared_map);
#endif
per_cpu(threshold_banks, cpu)[bank] = b;
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 900332b800f8..f9cd0849bd42 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -6,6 +6,7 @@
* Copyright (C) 2009 Jaswinder Singh Rajput
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
* Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
*
* For licencing details see kernel-base/COPYING
*/
@@ -20,6 +21,7 @@
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
+#include <linux/cpu.h>
#include <asm/apic.h>
#include <asm/stacktrace.h>
@@ -27,12 +29,52 @@
static u64 perf_counter_mask __read_mostly;
+/* The maximal number of PEBS counters: */
+#define MAX_PEBS_COUNTERS 4
+
+/* The size of a BTS record in bytes: */
+#define BTS_RECORD_SIZE 24
+
+/* The size of a per-cpu BTS buffer in bytes: */
+#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024)
+
+/* The BTS overflow threshold in bytes from the end of the buffer: */
+#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64)
+
+
+/*
+ * Bits in the debugctlmsr controlling branch tracing.
+ */
+#define X86_DEBUGCTL_TR (1 << 6)
+#define X86_DEBUGCTL_BTS (1 << 7)
+#define X86_DEBUGCTL_BTINT (1 << 8)
+#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9)
+#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10)
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+ u64 bts_buffer_base;
+ u64 bts_index;
+ u64 bts_absolute_maximum;
+ u64 bts_interrupt_threshold;
+ u64 pebs_buffer_base;
+ u64 pebs_index;
+ u64 pebs_absolute_maximum;
+ u64 pebs_interrupt_threshold;
+ u64 pebs_counter_reset[MAX_PEBS_COUNTERS];
+};
+
struct cpu_hw_counters {
struct perf_counter *counters[X86_PMC_IDX_MAX];
unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
unsigned long interrupts;
int enabled;
+ struct debug_store *ds;
};
/*
@@ -58,6 +100,8 @@ struct x86_pmu {
int apic;
u64 max_period;
u64 intel_ctrl;
+ void (*enable_bts)(u64 config);
+ void (*disable_bts)(void);
};
static struct x86_pmu x86_pmu __read_mostly;
@@ -577,6 +621,9 @@ x86_perf_counter_update(struct perf_counter *counter,
u64 prev_raw_count, new_raw_count;
s64 delta;
+ if (idx == X86_PMC_IDX_FIXED_BTS)
+ return 0;
+
/*
* Careful: an NMI might modify the previous counter value.
*
@@ -666,10 +713,110 @@ static void release_pmc_hardware(void)
#endif
}
+static inline bool bts_available(void)
+{
+ return x86_pmu.enable_bts != NULL;
+}
+
+static inline void init_debug_store_on_cpu(int cpu)
+{
+ struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
+
+ if (!ds)
+ return;
+
+ wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
+ (u32)((u64)(unsigned long)ds),
+ (u32)((u64)(unsigned long)ds >> 32));
+}
+
+static inline void fini_debug_store_on_cpu(int cpu)
+{
+ if (!per_cpu(cpu_hw_counters, cpu).ds)
+ return;
+
+ wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+}
+
+static void release_bts_hardware(void)
+{
+ int cpu;
+
+ if (!bts_available())
+ return;
+
+ get_online_cpus();
+
+ for_each_online_cpu(cpu)
+ fini_debug_store_on_cpu(cpu);
+
+ for_each_possible_cpu(cpu) {
+ struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
+
+ if (!ds)
+ continue;
+
+ per_cpu(cpu_hw_counters, cpu).ds = NULL;
+
+ kfree((void *)(unsigned long)ds->bts_buffer_base);
+ kfree(ds);
+ }
+
+ put_online_cpus();
+}
+
+static int reserve_bts_hardware(void)
+{
+ int cpu, err = 0;
+
+ if (!bts_available())
+ return 0;
+
+ get_online_cpus();
+
+ for_each_possible_cpu(cpu) {
+ struct debug_store *ds;
+ void *buffer;
+
+ err = -ENOMEM;
+ buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
+ if (unlikely(!buffer))
+ break;
+
+ ds = kzalloc(sizeof(*ds), GFP_KERNEL);
+ if (unlikely(!ds)) {
+ kfree(buffer);
+ break;
+ }
+
+ ds->bts_buffer_base = (u64)(unsigned long)buffer;
+ ds->bts_index = ds->bts_buffer_base;
+ ds->bts_absolute_maximum =
+ ds->bts_buffer_base + BTS_BUFFER_SIZE;
+ ds->bts_interrupt_threshold =
+ ds->bts_absolute_maximum - BTS_OVFL_TH;
+
+ per_cpu(cpu_hw_counters, cpu).ds = ds;
+ err = 0;
+ }
+
+ if (err)
+ release_bts_hardware();
+ else {
+ for_each_online_cpu(cpu)
+ init_debug_store_on_cpu(cpu);
+ }
+
+ put_online_cpus();
+
+ return err;
+}
+
static void hw_perf_counter_destroy(struct perf_counter *counter)
{
if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
release_pmc_hardware();
+ release_bts_hardware();
mutex_unlock(&pmc_reserve_mutex);
}
}
@@ -712,6 +859,42 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
return 0;
}
+static void intel_pmu_enable_bts(u64 config)
+{
+ unsigned long debugctlmsr;
+
+ debugctlmsr = get_debugctlmsr();
+
+ debugctlmsr |= X86_DEBUGCTL_TR;
+ debugctlmsr |= X86_DEBUGCTL_BTS;
+ debugctlmsr |= X86_DEBUGCTL_BTINT;
+
+ if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+ debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
+
+ if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+ debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
+
+ update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_disable_bts(void)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ unsigned long debugctlmsr;
+
+ if (!cpuc->ds)
+ return;
+
+ debugctlmsr = get_debugctlmsr();
+
+ debugctlmsr &=
+ ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
+ X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
+
+ update_debugctlmsr(debugctlmsr);
+}
+
/*
* Setup the hardware configuration for a given attr_type
*/
@@ -728,9 +911,13 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
err = 0;
if (!atomic_inc_not_zero(&active_counters)) {
mutex_lock(&pmc_reserve_mutex);
- if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
- err = -EBUSY;
- else
+ if (atomic_read(&active_counters) == 0) {
+ if (!reserve_pmc_hardware())
+ err = -EBUSY;
+ else
+ err = reserve_bts_hardware();
+ }
+ if (!err)
atomic_inc(&active_counters);
mutex_unlock(&pmc_reserve_mutex);
}
@@ -793,6 +980,20 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
if (config == -1LL)
return -EINVAL;
+ /*
+ * Branch tracing:
+ */
+ if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
+ (hwc->sample_period == 1)) {
+ /* BTS is not supported by this architecture. */
+ if (!bts_available())
+ return -EOPNOTSUPP;
+
+ /* BTS is currently only allowed for user-mode. */
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+ return -EOPNOTSUPP;
+ }
+
hwc->config |= config;
return 0;
@@ -817,7 +1018,18 @@ static void p6_pmu_disable_all(void)
static void intel_pmu_disable_all(void)
{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+ if (!cpuc->enabled)
+ return;
+
+ cpuc->enabled = 0;
+ barrier();
+
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+ if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+ intel_pmu_disable_bts();
}
static void amd_pmu_disable_all(void)
@@ -875,7 +1087,25 @@ static void p6_pmu_enable_all(void)
static void intel_pmu_enable_all(void)
{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+ if (cpuc->enabled)
+ return;
+
+ cpuc->enabled = 1;
+ barrier();
+
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
+ if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+ struct perf_counter *counter =
+ cpuc->counters[X86_PMC_IDX_FIXED_BTS];
+
+ if (WARN_ON_ONCE(!counter))
+ return;
+
+ intel_pmu_enable_bts(counter->hw.config);
+ }
}
static void amd_pmu_enable_all(void)
@@ -962,6 +1192,11 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
static inline void
intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
+ if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+ intel_pmu_disable_bts();
+ return;
+ }
+
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
intel_pmu_disable_fixed(hwc, idx);
return;
@@ -990,6 +1225,9 @@ x86_perf_counter_set_period(struct perf_counter *counter,
s64 period = hwc->sample_period;
int err, ret = 0;
+ if (idx == X86_PMC_IDX_FIXED_BTS)
+ return 0;
+
/*
* If we are way outside a reasoable range then just skip forward:
*/
@@ -1072,6 +1310,14 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
{
+ if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+ if (!__get_cpu_var(cpu_hw_counters).enabled)
+ return;
+
+ intel_pmu_enable_bts(hwc->config);
+ return;
+ }
+
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
intel_pmu_enable_fixed(hwc, idx);
return;
@@ -1093,11 +1339,16 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
{
unsigned int event;
+ event = hwc->config & ARCH_PERFMON_EVENT_MASK;
+
+ if (unlikely((event ==
+ x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
+ (hwc->sample_period == 1)))
+ return X86_PMC_IDX_FIXED_BTS;
+
if (!x86_pmu.num_counters_fixed)
return -1;
- event = hwc->config & ARCH_PERFMON_EVENT_MASK;
-
if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
return X86_PMC_IDX_FIXED_INSTRUCTIONS;
if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
@@ -1118,7 +1369,15 @@ static int x86_pmu_enable(struct perf_counter *counter)
int idx;
idx = fixed_mode_idx(counter, hwc);
- if (idx >= 0) {
+ if (idx == X86_PMC_IDX_FIXED_BTS) {
+ /* BTS is already occupied. */
+ if (test_and_set_bit(idx, cpuc->used_mask))
+ return -EAGAIN;
+
+ hwc->config_base = 0;
+ hwc->counter_base = 0;
+ hwc->idx = idx;
+ } else if (idx >= 0) {
/*
* Try to get the fixed counter, if that is already taken
* then try to get a generic counter:
@@ -1229,6 +1488,44 @@ void perf_counter_print_debug(void)
local_irq_restore(flags);
}
+static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
+ struct perf_sample_data *data)
+{
+ struct debug_store *ds = cpuc->ds;
+ struct bts_record {
+ u64 from;
+ u64 to;
+ u64 flags;
+ };
+ struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS];
+ unsigned long orig_ip = data->regs->ip;
+ struct bts_record *at, *top;
+
+ if (!counter)
+ return;
+
+ if (!ds)
+ return;
+
+ at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+ top = (struct bts_record *)(unsigned long)ds->bts_index;
+
+ ds->bts_index = ds->bts_buffer_base;
+
+ for (; at < top; at++) {
+ data->regs->ip = at->from;
+ data->addr = at->to;
+
+ perf_counter_output(counter, 1, data);
+ }
+
+ data->regs->ip = orig_ip;
+ data->addr = 0;
+
+ /* There's new data available. */
+ counter->pending_kill = POLL_IN;
+}
+
static void x86_pmu_disable(struct perf_counter *counter)
{
struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -1253,6 +1550,15 @@ static void x86_pmu_disable(struct perf_counter *counter)
* that we are disabling:
*/
x86_perf_counter_update(counter, hwc, idx);
+
+ /* Drain the remaining BTS records. */
+ if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+ struct perf_sample_data data;
+ struct pt_regs regs;
+
+ data.regs = &regs;
+ intel_pmu_drain_bts_buffer(cpuc, &data);
+ }
cpuc->counters[idx] = NULL;
clear_bit(idx, cpuc->used_mask);
@@ -1280,6 +1586,7 @@ static int intel_pmu_save_and_restart(struct perf_counter *counter)
static void intel_pmu_reset(void)
{
+ struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds;
unsigned long flags;
int idx;
@@ -1297,6 +1604,8 @@ static void intel_pmu_reset(void)
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
}
+ if (ds)
+ ds->bts_index = ds->bts_buffer_base;
local_irq_restore(flags);
}
@@ -1362,6 +1671,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
cpuc = &__get_cpu_var(cpu_hw_counters);
perf_disable();
+ intel_pmu_drain_bts_buffer(cpuc, &data);
status = intel_pmu_get_status();
if (!status) {
perf_enable();
@@ -1571,6 +1881,8 @@ static struct x86_pmu intel_pmu = {
* the generic counter period:
*/
.max_period = (1ULL << 31) - 1,
+ .enable_bts = intel_pmu_enable_bts,
+ .disable_bts = intel_pmu_disable_bts,
};
static struct x86_pmu amd_pmu = {
@@ -1962,3 +2274,8 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
return entry;
}
+
+void hw_perf_counter_setup_online(int cpu)
+{
+ init_debug_store_on_cpu(cpu);
+}
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 1e904346bbf4..62ac8cb6ba27 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -116,11 +116,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
#endif
seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size);
-#ifdef CONFIG_X86_64
seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
c->x86_phys_bits, c->x86_virt_bits);
-#endif
seq_printf(m, "power management:");
for (i = 0; i < 32; i++) {
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index b4f14c6c09d9..37250fe490b1 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -27,9 +27,7 @@ static void doublefault_fn(void)
if (ptr_ok(gdt)) {
gdt += GDT_ENTRY_TSS << 3;
- tss = *(u16 *)(gdt+2);
- tss += *(u8 *)(gdt+4) << 16;
- tss += *(u8 *)(gdt+7) << 24;
+ tss = get_desc_base((struct desc_struct *)gdt);
printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
if (ptr_ok(tss)) {
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 5cb5725b2bae..147005a1cc3c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -115,7 +115,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
{
int x = e820x->nr_map;
- if (x == ARRAY_SIZE(e820x->map)) {
+ if (x >= ARRAY_SIZE(e820x->map)) {
printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
return;
}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index d94e1ea3b9fe..9dbb527e1652 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -417,10 +417,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
unsigned long return_hooker = (unsigned long)
&return_to_handler;
- /* Nmi's are currently unsupported */
- if (unlikely(in_nmi()))
- return;
-
if (unlikely(atomic_read(&current->tracing_graph_pause)))
return;
@@ -498,37 +494,56 @@ static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
struct syscall_metadata *syscall_nr_to_meta(int nr)
{
- if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0)
+ if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
return NULL;
return syscalls_metadata[nr];
}
-void arch_init_ftrace_syscalls(void)
+int syscall_name_to_nr(char *name)
+{
+ int i;
+
+ if (!syscalls_metadata)
+ return -1;
+
+ for (i = 0; i < NR_syscalls; i++) {
+ if (syscalls_metadata[i]) {
+ if (!strcmp(syscalls_metadata[i]->name, name))
+ return i;
+ }
+ }
+ return -1;
+}
+
+void set_syscall_enter_id(int num, int id)
+{
+ syscalls_metadata[num]->enter_id = id;
+}
+
+void set_syscall_exit_id(int num, int id)
+{
+ syscalls_metadata[num]->exit_id = id;
+}
+
+static int __init arch_init_ftrace_syscalls(void)
{
int i;
struct syscall_metadata *meta;
unsigned long **psys_syscall_table = &sys_call_table;
- static atomic_t refs;
-
- if (atomic_inc_return(&refs) != 1)
- goto end;
syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
- FTRACE_SYSCALL_MAX, GFP_KERNEL);
+ NR_syscalls, GFP_KERNEL);
if (!syscalls_metadata) {
WARN_ON(1);
- return;
+ return -ENOMEM;
}
- for (i = 0; i < FTRACE_SYSCALL_MAX; i++) {
+ for (i = 0; i < NR_syscalls; i++) {
meta = find_syscall_meta(psys_syscall_table[i]);
syscalls_metadata[i] = meta;
}
- return;
-
- /* Paranoid: avoid overflow */
-end:
- atomic_dec(&refs);
+ return 0;
}
+arch_initcall(arch_init_ftrace_syscalls);
#endif
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index cc827ac9e8d3..7ffec6b3b331 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -439,7 +439,6 @@ is386: movl $2,%ecx # set MP
jne 1f
movl $per_cpu__gdt_page,%eax
movl $per_cpu__stack_canary,%ecx
- subl $20, %ecx
movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
shrl $16, %ecx
movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index c664d515f613..63b0ec8d3d4a 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -34,7 +34,6 @@
struct kvm_para_state {
u8 mmu_queue[MMU_QUEUE_SIZE];
int mmu_queue_len;
- enum paravirt_lazy_mode mode;
};
static DEFINE_PER_CPU(struct kvm_para_state, para_state);
@@ -77,7 +76,7 @@ static void kvm_deferred_mmu_op(void *buffer, int len)
{
struct kvm_para_state *state = kvm_para_state();
- if (state->mode != PARAVIRT_LAZY_MMU) {
+ if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
kvm_mmu_op(buffer, len);
return;
}
@@ -185,10 +184,7 @@ static void kvm_release_pt(unsigned long pfn)
static void kvm_enter_lazy_mmu(void)
{
- struct kvm_para_state *state = kvm_para_state();
-
paravirt_enter_lazy_mmu();
- state->mode = paravirt_get_lazy_mode();
}
static void kvm_leave_lazy_mmu(void)
@@ -197,7 +193,6 @@ static void kvm_leave_lazy_mmu(void)
mmu_queue_flush(state);
paravirt_leave_lazy_mmu();
- state->mode = paravirt_get_lazy_mode();
}
static void __init paravirt_ops_setup(void)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 223af43f1526..e5efcdcca31b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -50,8 +50,8 @@ static unsigned long kvm_get_wallclock(void)
struct timespec ts;
int low, high;
- low = (int)__pa(&wall_clock);
- high = ((u64)__pa(&wall_clock) >> 32);
+ low = (int)__pa_symbol(&wall_clock);
+ high = ((u64)__pa_symbol(&wall_clock) >> 32);
native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
vcpu_time = &get_cpu_var(hv_clock);
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 651c93b28862..fcd513bf2846 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -482,11 +482,11 @@ static void __init construct_ioapic_table(int mpc_default_type)
MP_bus_info(&bus);
}
- ioapic.type = MP_IOAPIC;
- ioapic.apicid = 2;
- ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
- ioapic.flags = MPC_APIC_USABLE;
- ioapic.apicaddr = 0xFEC00000;
+ ioapic.type = MP_IOAPIC;
+ ioapic.apicid = 2;
+ ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ ioapic.flags = MPC_APIC_USABLE;
+ ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE;
MP_ioapic_info(&ioapic);
/*
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 98fd6cd4e3a4..7dd950094178 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -1,6 +1,7 @@
/* ----------------------------------------------------------------------- *
*
* Copyright 2000-2008 H. Peter Anvin - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author: H. Peter Anvin
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -80,11 +81,8 @@ static ssize_t msr_read(struct file *file, char __user *buf,
for (; count; count -= 8) {
err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
- if (err) {
- if (err == -EFAULT) /* Fix idiotic error code */
- err = -EIO;
+ if (err)
break;
- }
if (copy_to_user(tmp, &data, 8)) {
err = -EFAULT;
break;
@@ -115,11 +113,8 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
break;
}
err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
- if (err) {
- if (err == -EFAULT) /* Fix idiotic error code */
- err = -EIO;
+ if (err)
break;
- }
tmp += 2;
bytes += 8;
}
@@ -127,6 +122,54 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
return bytes ? bytes : err;
}
+static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
+{
+ u32 __user *uregs = (u32 __user *)arg;
+ u32 regs[8];
+ int cpu = iminor(file->f_path.dentry->d_inode);
+ int err;
+
+ switch (ioc) {
+ case X86_IOC_RDMSR_REGS:
+ if (!(file->f_mode & FMODE_READ)) {
+ err = -EBADF;
+ break;
+ }
+ if (copy_from_user(&regs, uregs, sizeof regs)) {
+ err = -EFAULT;
+ break;
+ }
+ err = rdmsr_safe_regs_on_cpu(cpu, regs);
+ if (err)
+ break;
+ if (copy_to_user(uregs, &regs, sizeof regs))
+ err = -EFAULT;
+ break;
+
+ case X86_IOC_WRMSR_REGS:
+ if (!(file->f_mode & FMODE_WRITE)) {
+ err = -EBADF;
+ break;
+ }
+ if (copy_from_user(&regs, uregs, sizeof regs)) {
+ err = -EFAULT;
+ break;
+ }
+ err = wrmsr_safe_regs_on_cpu(cpu, regs);
+ if (err)
+ break;
+ if (copy_to_user(uregs, &regs, sizeof regs))
+ err = -EFAULT;
+ break;
+
+ default:
+ err = -ENOTTY;
+ break;
+ }
+
+ return err;
+}
+
static int msr_open(struct inode *inode, struct file *file)
{
unsigned int cpu = iminor(file->f_path.dentry->d_inode);
@@ -157,6 +200,8 @@ static const struct file_operations msr_fops = {
.read = msr_read,
.write = msr_write,
.open = msr_open,
+ .unlocked_ioctl = msr_ioctl,
+ .compat_ioctl = msr_ioctl,
};
static int __cpuinit msr_device_create(int cpu)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 70ec9b951d76..f5b0b4a01fb2 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -362,8 +362,9 @@ struct pv_cpu_ops pv_cpu_ops = {
#endif
.wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe,
- .read_msr_amd = native_read_msr_amd_safe,
+ .rdmsr_regs = native_rdmsr_safe_regs,
.write_msr = native_write_msr_safe,
+ .wrmsr_regs = native_wrmsr_safe_regs,
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
.read_tscp = native_read_tscp,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 1a041bcf506b..d71c8655905b 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -3,6 +3,7 @@
#include <linux/dmar.h>
#include <linux/bootmem.h>
#include <linux/pci.h>
+#include <linux/kmemleak.h>
#include <asm/proto.h>
#include <asm/dma.h>
@@ -32,7 +33,14 @@ int no_iommu __read_mostly;
/* Set this to 1 if there is a HW IOMMU in the system */
int iommu_detected __read_mostly = 0;
-int iommu_pass_through;
+/*
+ * This variable becomes 1 if iommu=pt is passed on the kernel command line.
+ * If this variable is 1, IOMMU implementations do no DMA ranslation for
+ * devices and allow every device to access to whole physical memory. This is
+ * useful if a user want to use an IOMMU only for KVM device assignment to
+ * guests and not for driver dma translation.
+ */
+int iommu_pass_through __read_mostly;
dma_addr_t bad_dma_address __read_mostly = 0;
EXPORT_SYMBOL(bad_dma_address);
@@ -88,6 +96,11 @@ void __init dma32_reserve_bootmem(void)
size = roundup(dma32_bootmem_size, align);
dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
512ULL<<20);
+ /*
+ * Kmemleak should not scan this block as it may not be mapped via the
+ * kernel direct mapping.
+ */
+ kmemleak_ignore(dma32_bootmem_ptr);
if (dma32_bootmem_ptr)
dma32_bootmem_size = size;
else
@@ -147,7 +160,7 @@ again:
return NULL;
addr = page_to_phys(page);
- if (!is_buffer_dma_capable(dma_mask, addr, size)) {
+ if (addr + size > dma_mask) {
__free_pages(page, get_order(size));
if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) {
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index d2e56b8f48e7..98a827ee9ed7 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -190,14 +190,13 @@ static void iommu_full(struct device *dev, size_t size, int dir)
static inline int
need_iommu(struct device *dev, unsigned long addr, size_t size)
{
- return force_iommu ||
- !is_buffer_dma_capable(*dev->dma_mask, addr, size);
+ return force_iommu || !dma_capable(dev, addr, size);
}
static inline int
nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
{
- return !is_buffer_dma_capable(*dev->dma_mask, addr, size);
+ return !dma_capable(dev, addr, size);
}
/* Map a single continuous physical area into the IOMMU.
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 71d412a09f30..a3933d4330cd 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -14,7 +14,7 @@
static int
check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
{
- if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) {
+ if (hwdev && !dma_capable(hwdev, bus, size)) {
if (*hwdev->dma_mask >= DMA_BIT_MASK(32))
printk(KERN_ERR
"nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
@@ -79,12 +79,29 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
free_pages((unsigned long)vaddr, get_order(size));
}
+static void nommu_sync_single_for_device(struct device *dev,
+ dma_addr_t addr, size_t size,
+ enum dma_data_direction dir)
+{
+ flush_write_buffers();
+}
+
+
+static void nommu_sync_sg_for_device(struct device *dev,
+ struct scatterlist *sg, int nelems,
+ enum dma_data_direction dir)
+{
+ flush_write_buffers();
+}
+
struct dma_map_ops nommu_dma_ops = {
- .alloc_coherent = dma_generic_alloc_coherent,
- .free_coherent = nommu_free_coherent,
- .map_sg = nommu_map_sg,
- .map_page = nommu_map_page,
- .is_phys = 1,
+ .alloc_coherent = dma_generic_alloc_coherent,
+ .free_coherent = nommu_free_coherent,
+ .map_sg = nommu_map_sg,
+ .map_page = nommu_map_page,
+ .sync_single_for_device = nommu_sync_single_for_device,
+ .sync_sg_for_device = nommu_sync_sg_for_device,
+ .is_phys = 1,
};
void __init no_iommu_init(void)
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 6af96ee44200..e8a35016115f 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -13,31 +13,6 @@
int swiotlb __read_mostly;
-void * __init swiotlb_alloc_boot(size_t size, unsigned long nslabs)
-{
- return alloc_bootmem_low_pages(size);
-}
-
-void *swiotlb_alloc(unsigned order, unsigned long nslabs)
-{
- return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
-}
-
-dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
-{
- return paddr;
-}
-
-phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
-{
- return baddr;
-}
-
-int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size)
-{
- return 0;
-}
-
static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
dma_addr_t *dma_handle, gfp_t flags)
{
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 994dd6a4a2a0..071166a4ba83 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -519,16 +519,12 @@ static void c1e_idle(void)
if (!cpumask_test_cpu(cpu, c1e_mask)) {
cpumask_set_cpu(cpu, c1e_mask);
/*
- * Force broadcast so ACPI can not interfere. Needs
- * to run with interrupts enabled as it uses
- * smp_function_call.
+ * Force broadcast so ACPI can not interfere.
*/
- local_irq_enable();
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
&cpu);
printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
cpu);
- local_irq_disable();
}
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 59f4524984af..4cf79567cdab 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -61,9 +61,6 @@
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
-DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
-EXPORT_PER_CPU_SYMBOL(current_task);
-
/*
* Return saved PC of a blocked thread.
*/
@@ -350,14 +347,21 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*next = &next_p->thread;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ bool preload_fpu;
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
- __unlazy_fpu(prev_p);
+ /*
+ * If the task has used fpu the last 5 timeslices, just do a full
+ * restore of the math state immediately to avoid the trap; the
+ * chances of needing FPU soon are obviously high now
+ */
+ preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
+ __unlazy_fpu(prev_p);
/* we're going to use this soon, after a few expensive things */
- if (next_p->fpu_counter > 5)
+ if (preload_fpu)
prefetch(next->xstate);
/*
@@ -398,6 +402,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
__switch_to_xtra(prev_p, next_p, tss);
+ /* If we're going to preload the fpu context, make sure clts
+ is run while we're batching the cpu state updates. */
+ if (preload_fpu)
+ clts();
+
/*
* Leave lazy mode, flushing any hypercalls made here.
* This must be done before restoring TLS segments so
@@ -407,15 +416,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
arch_end_context_switch(next_p);
- /* If the task has used fpu the last 5 timeslices, just do a full
- * restore of the math state immediately to avoid the trap; the
- * chances of needing FPU soon are obviously high now
- *
- * tsk_used_math() checks prevent calling math_state_restore(),
- * which can sleep in the case of !tsk_used_math()
- */
- if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
- math_state_restore();
+ if (preload_fpu)
+ __math_state_restore();
/*
* Restore %gs if needed (which is common)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ebefb5407b9d..ad535b683170 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -55,9 +55,6 @@
asmlinkage extern void ret_from_fork(void);
-DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
-EXPORT_PER_CPU_SYMBOL(current_task);
-
DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);
@@ -386,9 +383,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(init_tss, cpu);
unsigned fsindex, gsindex;
+ bool preload_fpu;
+
+ /*
+ * If the task has used fpu the last 5 timeslices, just do a full
+ * restore of the math state immediately to avoid the trap; the
+ * chances of needing FPU soon are obviously high now
+ */
+ preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
/* we're going to use this soon, after a few expensive things */
- if (next_p->fpu_counter > 5)
+ if (preload_fpu)
prefetch(next->xstate);
/*
@@ -419,6 +424,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
load_TLS(next, cpu);
+ /* Must be after DS reload */
+ unlazy_fpu(prev_p);
+
+ /* Make sure cpu is ready for new context */
+ if (preload_fpu)
+ clts();
+
/*
* Leave lazy mode, flushing any hypercalls made here.
* This must be done before restoring TLS segments so
@@ -459,9 +471,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
prev->gsindex = gsindex;
- /* Must be after DS reload */
- unlazy_fpu(prev_p);
-
/*
* Switch the PDA and FPU contexts.
*/
@@ -480,15 +489,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
__switch_to_xtra(prev_p, next_p, tss);
- /* If the task has used fpu the last 5 timeslices, just do a full
- * restore of the math state immediately to avoid the trap; the
- * chances of needing FPU soon are obviously high now
- *
- * tsk_used_math() checks prevent calling math_state_restore(),
- * which can sleep in the case of !tsk_used_math()
+ /*
+ * Preload the FPU context, now that we've determined that the
+ * task is likely to be using it.
*/
- if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
- math_state_restore();
+ if (preload_fpu)
+ __math_state_restore();
return prev_p;
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 09ecbde91c13..8d7d5c9c1be3 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -35,10 +35,11 @@
#include <asm/proto.h>
#include <asm/ds.h>
-#include <trace/syscall.h>
-
#include "tls.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
enum x86_regset {
REGSET_GENERAL,
REGSET_FP,
@@ -1497,8 +1498,8 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
tracehook_report_syscall_entry(regs))
ret = -1L;
- if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
- ftrace_syscall_enter(regs);
+ if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+ trace_sys_enter(regs, regs->orig_ax);
if (unlikely(current->audit_context)) {
if (IS_IA32)
@@ -1523,8 +1524,8 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
if (unlikely(current->audit_context))
audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
- if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
- ftrace_syscall_exit(regs);
+ if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+ trace_sys_exit(regs, regs->ax);
if (test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall_exit(regs, 0);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index a06e8d101844..27349f92a6d7 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -4,6 +4,7 @@
#include <linux/pm.h>
#include <linux/efi.h>
#include <linux/dmi.h>
+#include <linux/tboot.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -508,6 +509,8 @@ static void native_machine_emergency_restart(void)
if (reboot_emergency)
emergency_vmx_disable_all();
+ tboot_shutdown(TB_SHUTDOWN_REBOOT);
+
/* Tell the BIOS if we want cold or warm reboot */
*((unsigned short *)__va(0x472)) = reboot_mode;
@@ -634,6 +637,8 @@ static void native_machine_halt(void)
/* stop other cpus and apics */
machine_shutdown();
+ tboot_shutdown(TB_SHUTDOWN_HALT);
+
/* stop this cpu */
stop_this_cpu(NULL);
}
@@ -645,6 +650,8 @@ static void native_machine_power_off(void)
machine_shutdown();
pm_power_off();
}
+ /* a fallback in case there is no PM info available */
+ tboot_shutdown(TB_SHUTDOWN_HALT);
}
struct machine_ops machine_ops = {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index eb1f1e6e52b0..19f15c4076fb 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -66,6 +66,7 @@
#include <linux/percpu.h>
#include <linux/crash_dump.h>
+#include <linux/tboot.h>
#include <video/edid.h>
@@ -987,6 +988,8 @@ void __init setup_arch(char **cmdline_p)
paravirt_pagetable_setup_done(swapper_pg_dir);
paravirt_post_allocator_init();
+ tboot_probe();
+
#ifdef CONFIG_X86_64
map_vsyscall();
#endif
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 29a3eef7cf4a..07d81916f212 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -165,7 +165,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
if (!chosen) {
size_t vm_size = VMALLOC_END - VMALLOC_START;
- size_t tot_size = num_possible_cpus() * PMD_SIZE;
+ size_t tot_size = nr_cpu_ids * PMD_SIZE;
/* on non-NUMA, embedding is better */
if (!pcpu_need_numa())
@@ -199,7 +199,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
/* allocate pointer array and alloc large pages */
- map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0]));
+ map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
pcpul_map = alloc_bootmem(map_size);
for_each_possible_cpu(cpu) {
@@ -228,7 +228,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
/* allocate address and map */
pcpul_vm.flags = VM_ALLOC;
- pcpul_vm.size = num_possible_cpus() * PMD_SIZE;
+ pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
vm_area_register_early(&pcpul_vm, PMD_SIZE);
for_each_possible_cpu(cpu) {
@@ -250,8 +250,8 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
PMD_SIZE, pcpul_vm.addr, NULL);
/* sort pcpul_map array for pcpu_lpage_remapped() */
- for (i = 0; i < num_possible_cpus() - 1; i++)
- for (j = i + 1; j < num_possible_cpus(); j++)
+ for (i = 0; i < nr_cpu_ids - 1; i++)
+ for (j = i + 1; j < nr_cpu_ids; j++)
if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
struct pcpul_ent tmp = pcpul_map[i];
pcpul_map[i] = pcpul_map[j];
@@ -288,7 +288,7 @@ void *pcpu_lpage_remapped(void *kaddr)
{
void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
- int left = 0, right = num_possible_cpus() - 1;
+ int left = 0, right = nr_cpu_ids - 1;
int pos;
/* pcpul in use at all? */
@@ -377,7 +377,7 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
pcpu4k_nr_static_pages = PFN_UP(static_size);
/* unaligned allocations can't be freed, round up to page size */
- pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
+ pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids
* sizeof(pcpu4k_pages[0]));
pcpu4k_pages = alloc_bootmem(pages_size);
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4c578751e94e..81e58238c4ce 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -869,6 +869,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
if (thread_info_flags & _TIF_NOTIFY_RESUME) {
clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
+ if (current->replacement_session_keyring)
+ key_replace_session_keyring();
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d720b7e0cf3d..a25eeec00080 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -47,6 +47,7 @@
#include <linux/bootmem.h>
#include <linux/err.h>
#include <linux/nmi.h>
+#include <linux/tboot.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -434,7 +435,8 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
* For perf, we return last level cache shared map.
* And for power savings, we return cpu_core_map
*/
- if (sched_mc_power_savings || sched_smt_power_savings)
+ if ((sched_mc_power_savings || sched_smt_power_savings) &&
+ !(cpu_has(c, X86_FEATURE_AMD_DCM)))
return cpu_core_mask(cpu);
else
return c->llc_shared_map;
@@ -1331,6 +1333,7 @@ void play_dead_common(void)
void native_play_dead(void)
{
play_dead_common();
+ tboot_shutdown(TB_SHUTDOWN_WFS);
wbinvd_halt();
}
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index e8b9863ef8c4..3149032ff107 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -4,6 +4,7 @@
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/ptrace.h>
+#include <asm/desc.h>
unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
{
@@ -23,7 +24,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
* and APM bios ones we just ignore here.
*/
if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) {
- u32 *desc;
+ struct desc_struct *desc;
unsigned long base;
seg &= ~7UL;
@@ -33,12 +34,10 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
addr = -1L; /* bogus selector, access would fault */
else {
desc = child->mm->context.ldt + seg;
- base = ((desc[0] >> 16) |
- ((desc[1] & 0xff) << 16) |
- (desc[1] & 0xff000000));
+ base = get_desc_base(desc);
/* 16-bit code segment? */
- if (!((desc[1] >> 22) & 1))
+ if (!desc->d)
addr &= 0xffff;
addr += base;
}
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 6bc211accf08..45e00eb09c3a 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -18,9 +18,9 @@
#include <asm/ia32.h>
#include <asm/syscalls.h>
-asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
- unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long off)
+SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
+ unsigned long, prot, unsigned long, flags,
+ unsigned long, fd, unsigned long, off)
{
long error;
struct file *file;
@@ -226,7 +226,7 @@ bottomup:
}
-asmlinkage long sys_uname(struct new_utsname __user *name)
+SYSCALL_DEFINE1(uname, struct new_utsname __user *, name)
{
int err;
down_read(&uts_sem);
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
new file mode 100644
index 000000000000..86c9f91b48ae
--- /dev/null
+++ b/arch/x86/kernel/tboot.c
@@ -0,0 +1,447 @@
+/*
+ * tboot.c: main implementation of helper functions used by kernel for
+ * runtime support of Intel(R) Trusted Execution Technology
+ *
+ * Copyright (c) 2006-2009, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <linux/dma_remapping.h>
+#include <linux/init_task.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/dmar.h>
+#include <linux/cpu.h>
+#include <linux/pfn.h>
+#include <linux/mm.h>
+#include <linux/tboot.h>
+
+#include <asm/trampoline.h>
+#include <asm/processor.h>
+#include <asm/bootparam.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/fixmap.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/e820.h>
+#include <asm/io.h>
+
+#include "acpi/realmode/wakeup.h"
+
+/* Global pointer to shared data; NULL means no measured launch. */
+struct tboot *tboot __read_mostly;
+
+/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
+#define AP_WAIT_TIMEOUT 1
+
+#undef pr_fmt
+#define pr_fmt(fmt) "tboot: " fmt
+
+static u8 tboot_uuid[16] __initdata = TBOOT_UUID;
+
+void __init tboot_probe(void)
+{
+ /* Look for valid page-aligned address for shared page. */
+ if (!boot_params.tboot_addr)
+ return;
+ /*
+ * also verify that it is mapped as we expect it before calling
+ * set_fixmap(), to reduce chance of garbage value causing crash
+ */
+ if (!e820_any_mapped(boot_params.tboot_addr,
+ boot_params.tboot_addr, E820_RESERVED)) {
+ pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n");
+ return;
+ }
+
+ /* only a natively booted kernel should be using TXT */
+ if (paravirt_enabled()) {
+ pr_warning("non-0 tboot_addr but pv_ops is enabled\n");
+ return;
+ }
+
+ /* Map and check for tboot UUID. */
+ set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr);
+ tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE);
+ if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
+ pr_warning("tboot at 0x%llx is invalid\n",
+ boot_params.tboot_addr);
+ tboot = NULL;
+ return;
+ }
+ if (tboot->version < 5) {
+ pr_warning("tboot version is invalid: %u\n", tboot->version);
+ tboot = NULL;
+ return;
+ }
+
+ pr_info("found shared page at phys addr 0x%llx:\n",
+ boot_params.tboot_addr);
+ pr_debug("version: %d\n", tboot->version);
+ pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
+ pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
+ pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
+ pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
+}
+
+static pgd_t *tboot_pg_dir;
+static struct mm_struct tboot_mm = {
+ .mm_rb = RB_ROOT,
+ .pgd = swapper_pg_dir,
+ .mm_users = ATOMIC_INIT(2),
+ .mm_count = ATOMIC_INIT(1),
+ .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
+ .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
+ .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
+ .cpu_vm_mask = CPU_MASK_ALL,
+};
+
+static inline void switch_to_tboot_pt(void)
+{
+ write_cr3(virt_to_phys(tboot_pg_dir));
+}
+
+static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
+ pgprot_t prot)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset(&tboot_mm, vaddr);
+ pud = pud_alloc(&tboot_mm, pgd, vaddr);
+ if (!pud)
+ return -1;
+ pmd = pmd_alloc(&tboot_mm, pud, vaddr);
+ if (!pmd)
+ return -1;
+ pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
+ if (!pte)
+ return -1;
+ set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
+ pte_unmap(pte);
+ return 0;
+}
+
+static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn,
+ unsigned long nr)
+{
+ /* Reuse the original kernel mapping */
+ tboot_pg_dir = pgd_alloc(&tboot_mm);
+ if (!tboot_pg_dir)
+ return -1;
+
+ for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) {
+ if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC))
+ return -1;
+ }
+
+ return 0;
+}
+
+static void tboot_create_trampoline(void)
+{
+ u32 map_base, map_size;
+
+ /* Create identity map for tboot shutdown code. */
+ map_base = PFN_DOWN(tboot->tboot_base);
+ map_size = PFN_UP(tboot->tboot_size);
+ if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size))
+ panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n",
+ map_base, map_size);
+}
+
+#ifdef CONFIG_ACPI_SLEEP
+
+static void add_mac_region(phys_addr_t start, unsigned long size)
+{
+ struct tboot_mac_region *mr;
+ phys_addr_t end = start + size;
+
+ if (start && size) {
+ mr = &tboot->mac_regions[tboot->num_mac_regions++];
+ mr->start = round_down(start, PAGE_SIZE);
+ mr->size = round_up(end, PAGE_SIZE) - mr->start;
+ }
+}
+
+static int tboot_setup_sleep(void)
+{
+ tboot->num_mac_regions = 0;
+
+ /* S3 resume code */
+ add_mac_region(acpi_wakeup_address, WAKEUP_SIZE);
+
+#ifdef CONFIG_X86_TRAMPOLINE
+ /* AP trampoline code */
+ add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE);
+#endif
+
+ /* kernel code + data + bss */
+ add_mac_region(virt_to_phys(_text), _end - _text);
+
+ tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
+
+ return 0;
+}
+
+#else /* no CONFIG_ACPI_SLEEP */
+
+static int tboot_setup_sleep(void)
+{
+ /* S3 shutdown requested, but S3 not supported by the kernel... */
+ BUG();
+ return -1;
+}
+
+#endif
+
+void tboot_shutdown(u32 shutdown_type)
+{
+ void (*shutdown)(void);
+
+ if (!tboot_enabled())
+ return;
+
+ /*
+ * if we're being called before the 1:1 mapping is set up then just
+ * return and let the normal shutdown happen; this should only be
+ * due to very early panic()
+ */
+ if (!tboot_pg_dir)
+ return;
+
+ /* if this is S3 then set regions to MAC */
+ if (shutdown_type == TB_SHUTDOWN_S3)
+ if (tboot_setup_sleep())
+ return;
+
+ tboot->shutdown_type = shutdown_type;
+
+ switch_to_tboot_pt();
+
+ shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry;
+ shutdown();
+
+ /* should not reach here */
+ while (1)
+ halt();
+}
+
+static void tboot_copy_fadt(const struct acpi_table_fadt *fadt)
+{
+#define TB_COPY_GAS(tbg, g) \
+ tbg.space_id = g.space_id; \
+ tbg.bit_width = g.bit_width; \
+ tbg.bit_offset = g.bit_offset; \
+ tbg.access_width = g.access_width; \
+ tbg.address = g.address;
+
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block);
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block);
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block);
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block);
+
+ /*
+ * We need phys addr of waking vector, but can't use virt_to_phys() on
+ * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys
+ * addr.
+ */
+ tboot->acpi_sinfo.wakeup_vector = fadt->facs +
+ offsetof(struct acpi_table_facs, firmware_waking_vector);
+}
+
+void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
+{
+ static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
+ /* S0,1,2: */ -1, -1, -1,
+ /* S3: */ TB_SHUTDOWN_S3,
+ /* S4: */ TB_SHUTDOWN_S4,
+ /* S5: */ TB_SHUTDOWN_S5 };
+
+ if (!tboot_enabled())
+ return;
+
+ tboot_copy_fadt(&acpi_gbl_FADT);
+ tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
+ tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control;
+ /* we always use the 32b wakeup vector */
+ tboot->acpi_sinfo.vector_width = 32;
+
+ if (sleep_state >= ACPI_S_STATE_COUNT ||
+ acpi_shutdown_map[sleep_state] == -1) {
+ pr_warning("unsupported sleep state 0x%x\n", sleep_state);
+ return;
+ }
+
+ tboot_shutdown(acpi_shutdown_map[sleep_state]);
+}
+
+static atomic_t ap_wfs_count;
+
+static int tboot_wait_for_aps(int num_aps)
+{
+ unsigned long timeout;
+
+ timeout = AP_WAIT_TIMEOUT*HZ;
+ while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps &&
+ timeout) {
+ mdelay(1);
+ timeout--;
+ }
+
+ if (timeout)
+ pr_warning("tboot wait for APs timeout\n");
+
+ return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps);
+}
+
+static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action) {
+ case CPU_DYING:
+ atomic_inc(&ap_wfs_count);
+ if (num_online_cpus() == 1)
+ if (tboot_wait_for_aps(atomic_read(&ap_wfs_count)))
+ return NOTIFY_BAD;
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block tboot_cpu_notifier __cpuinitdata =
+{
+ .notifier_call = tboot_cpu_callback,
+};
+
+static __init int tboot_late_init(void)
+{
+ if (!tboot_enabled())
+ return 0;
+
+ tboot_create_trampoline();
+
+ atomic_set(&ap_wfs_count, 0);
+ register_hotcpu_notifier(&tboot_cpu_notifier);
+ return 0;
+}
+
+late_initcall(tboot_late_init);
+
+/*
+ * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE)
+ */
+
+#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000
+#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000
+
+/* # pages for each config regs space - used by fixmap */
+#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \
+ TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT)
+
+/* offsets from pub/priv config space */
+#define TXTCR_HEAP_BASE 0x0300
+#define TXTCR_HEAP_SIZE 0x0308
+
+#define SHA1_SIZE 20
+
+struct sha1_hash {
+ u8 hash[SHA1_SIZE];
+};
+
+struct sinit_mle_data {
+ u32 version; /* currently 6 */
+ struct sha1_hash bios_acm_id;
+ u32 edx_senter_flags;
+ u64 mseg_valid;
+ struct sha1_hash sinit_hash;
+ struct sha1_hash mle_hash;
+ struct sha1_hash stm_hash;
+ struct sha1_hash lcp_policy_hash;
+ u32 lcp_policy_control;
+ u32 rlp_wakeup_addr;
+ u32 reserved;
+ u32 num_mdrs;
+ u32 mdrs_off;
+ u32 num_vtd_dmars;
+ u32 vtd_dmars_off;
+} __packed;
+
+struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl)
+{
+ void *heap_base, *heap_ptr, *config;
+
+ if (!tboot_enabled())
+ return dmar_tbl;
+
+ /*
+ * ACPI tables may not be DMA protected by tboot, so use DMAR copy
+ * SINIT saved in SinitMleData in TXT heap (which is DMA protected)
+ */
+
+ /* map config space in order to get heap addr */
+ config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES *
+ PAGE_SIZE);
+ if (!config)
+ return NULL;
+
+ /* now map TXT heap */
+ heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE),
+ *(u64 *)(config + TXTCR_HEAP_SIZE));
+ iounmap(config);
+ if (!heap_base)
+ return NULL;
+
+ /* walk heap to SinitMleData */
+ /* skip BiosData */
+ heap_ptr = heap_base + *(u64 *)heap_base;
+ /* skip OsMleData */
+ heap_ptr += *(u64 *)heap_ptr;
+ /* skip OsSinitData */
+ heap_ptr += *(u64 *)heap_ptr;
+ /* now points to SinitMleDataSize; set to SinitMleData */
+ heap_ptr += sizeof(u64);
+ /* get addr of DMAR table */
+ dmar_tbl = (struct acpi_table_header *)(heap_ptr +
+ ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off -
+ sizeof(u64));
+
+ /* don't unmap heap because dmar.c needs access to this */
+
+ return dmar_tbl;
+}
+
+int tboot_force_iommu(void)
+{
+ if (!tboot_enabled())
+ return 0;
+
+ if (no_iommu || swiotlb || dmar_disabled)
+ pr_warning("Forcing Intel-IOMMU to enabled\n");
+
+ dmar_disabled = 0;
+#ifdef CONFIG_SWIOTLB
+ swiotlb = 0;
+#endif
+ no_iommu = 0;
+
+ return 1;
+}
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 77b9689f8edb..503c1f2e8835 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -640,13 +640,13 @@ static int __init uv_ptc_init(void)
if (!is_uv_system())
return 0;
- proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL);
+ proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL,
+ &proc_uv_ptc_operations);
if (!proc_uv_ptc) {
printk(KERN_ERR "unable to create %s proc entry\n",
UV_PTC_BASENAME);
return -EINVAL;
}
- proc_uv_ptc->proc_fops = &proc_uv_ptc_operations;
return 0;
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7e4b1f5dec8e..83264922a878 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -786,33 +786,34 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
#endif
}
-#ifdef CONFIG_X86_32
-unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
+asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
{
- struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
- unsigned long base = (kesp - uesp) & -THREAD_SIZE;
- unsigned long new_kesp = kesp - base;
- unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
- __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
-
- /* Set up base for espfix segment */
- desc &= 0x00f0ff0000000000ULL;
- desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
- ((((__u64)base) << 32) & 0xff00000000000000ULL) |
- ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
- (lim_pages & 0xffff);
- *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
-
- return new_kesp;
}
-#endif
-asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
+asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
{
}
-asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
+/*
+ * __math_state_restore assumes that cr0.TS is already clear and the
+ * fpu state is all ready for use. Used during context switch.
+ */
+void __math_state_restore(void)
{
+ struct thread_info *thread = current_thread_info();
+ struct task_struct *tsk = thread->task;
+
+ /*
+ * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+ */
+ if (unlikely(restore_fpu_checking(tsk))) {
+ stts();
+ force_sig(SIGSEGV, tsk);
+ return;
+ }
+
+ thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
+ tsk->fpu_counter++;
}
/*
@@ -846,17 +847,8 @@ asmlinkage void math_state_restore(void)
}
clts(); /* Allow maths ops (or we recurse) */
- /*
- * Paranoid restore. send a SIGSEGV if we fail to restore the state.
- */
- if (unlikely(restore_fpu_checking(tsk))) {
- stts();
- force_sig(SIGSEGV, tsk);
- return;
- }
- thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
- tsk->fpu_counter++;
+ __math_state_restore();
}
EXPORT_SYMBOL_GPL(math_state_restore);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 78d185d797de..9fc178255c04 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -46,11 +46,10 @@ PHDRS {
data PT_LOAD FLAGS(7); /* RWE */
#ifdef CONFIG_X86_64
user PT_LOAD FLAGS(7); /* RWE */
- data.init PT_LOAD FLAGS(7); /* RWE */
#ifdef CONFIG_SMP
percpu PT_LOAD FLAGS(7); /* RWE */
#endif
- data.init2 PT_LOAD FLAGS(7); /* RWE */
+ init PT_LOAD FLAGS(7); /* RWE */
#endif
note PT_NOTE FLAGS(0); /* ___ */
}
@@ -103,65 +102,43 @@ SECTIONS
__stop___ex_table = .;
} :text = 0x9090
- RODATA
+ RO_DATA(PAGE_SIZE)
/* Data */
- . = ALIGN(PAGE_SIZE);
.data : AT(ADDR(.data) - LOAD_OFFSET) {
/* Start of data section */
_sdata = .;
- DATA_DATA
- CONSTRUCTORS
- } :data
+
+ /* init_task */
+ INIT_TASK_DATA(THREAD_SIZE)
#ifdef CONFIG_X86_32
- /* 32 bit has nosave before _edata */
- . = ALIGN(PAGE_SIZE);
- .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
- __nosave_begin = .;
- *(.data.nosave)
- . = ALIGN(PAGE_SIZE);
- __nosave_end = .;
- }
+ /* 32 bit has nosave before _edata */
+ NOSAVE_DATA
#endif
- . = ALIGN(PAGE_SIZE);
- .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
- *(.data.page_aligned)
+ PAGE_ALIGNED_DATA(PAGE_SIZE)
*(.data.idt)
- }
-#ifdef CONFIG_X86_32
- . = ALIGN(32);
-#else
- . = ALIGN(PAGE_SIZE);
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-#endif
- .data.cacheline_aligned :
- AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
- *(.data.cacheline_aligned)
- }
+ CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES)
- /* rarely changed data like cpu maps */
-#ifdef CONFIG_X86_32
- . = ALIGN(32);
-#else
- . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
-#endif
- .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
- *(.data.read_mostly)
+ DATA_DATA
+ CONSTRUCTORS
+
+ /* rarely changed data like cpu maps */
+ READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES)
/* End of data section */
_edata = .;
- }
+ } :data
#ifdef CONFIG_X86_64
#define VSYSCALL_ADDR (-10*1024*1024)
-#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
- SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
- SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \
+ PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
+#define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \
+ PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
@@ -227,35 +204,29 @@ SECTIONS
#endif /* CONFIG_X86_64 */
- /* init_task */
- . = ALIGN(THREAD_SIZE);
- .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
- *(.data.init_task)
+ /* Init code and data - will be freed after init */
+ . = ALIGN(PAGE_SIZE);
+ .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
+ __init_begin = .; /* paired with __init_end */
}
-#ifdef CONFIG_X86_64
- :data.init
-#endif
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
- * smp_locks might be freed after init
- * start/end must be page aligned
+ * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
+ * output PHDR, so the next output section - .init.text - should
+ * start another segment - init.
*/
- . = ALIGN(PAGE_SIZE);
- .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
- __smp_locks = .;
- *(.smp_locks)
- __smp_locks_end = .;
- . = ALIGN(PAGE_SIZE);
- }
+ PERCPU_VADDR(0, :percpu)
+#endif
- /* Init code and data - will be freed after init */
- . = ALIGN(PAGE_SIZE);
.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
- __init_begin = .; /* paired with __init_end */
_sinittext = .;
INIT_TEXT
_einittext = .;
}
+#ifdef CONFIG_X86_64
+ :init
+#endif
.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
INIT_DATA
@@ -326,17 +297,7 @@ SECTIONS
}
#endif
-#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
- /*
- * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
- * output PHDR, so the next output section - __data_nosave - should
- * start another section data.init2. Also, pda should be at the head of
- * percpu area. Preallocate it and define the percpu offset symbol
- * so that it can be accessed as a percpu variable.
- */
- . = ALIGN(PAGE_SIZE);
- PERCPU_VADDR(0, :percpu)
-#else
+#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
PERCPU(PAGE_SIZE)
#endif
@@ -347,15 +308,22 @@ SECTIONS
__init_end = .;
}
+ /*
+ * smp_locks might be freed after init
+ * start/end must be page aligned
+ */
+ . = ALIGN(PAGE_SIZE);
+ .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+ __smp_locks = .;
+ *(.smp_locks)
+ __smp_locks_end = .;
+ . = ALIGN(PAGE_SIZE);
+ }
+
#ifdef CONFIG_X86_64
.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- __nosave_begin = .;
- *(.data.nosave)
- . = ALIGN(PAGE_SIZE);
- __nosave_end = .;
- } :data.init2
- /* use another section data.init2, see PERCPU_VADDR() above */
+ NOSAVE_DATA
+ }
#endif
/* BSS */