aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig21
-rw-r--r--arch/x86/boot/compressed/eboot.c174
-rw-r--r--arch/x86/boot/compressed/head_32.S6
-rw-r--r--arch/x86/boot/compressed/head_64.S8
-rw-r--r--arch/x86/configs/tiny.config2
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_mb.c4
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S7
-rw-r--r--arch/x86/crypto/sha512-mb/sha512_mb.c4
-rw-r--r--arch/x86/entry/Makefile2
-rw-r--r--arch/x86/entry/common.c24
-rw-r--r--arch/x86/entry/entry_32.S68
-rw-r--r--arch/x86/entry/entry_64.S180
-rw-r--r--arch/x86/entry/vdso/vdso2c.h2
-rw-r--r--arch/x86/entry/vdso/vma.c175
-rw-r--r--arch/x86/events/amd/core.c4
-rw-r--r--arch/x86/events/amd/uncore.c22
-rw-r--r--arch/x86/events/core.c60
-rw-r--r--arch/x86/events/intel/bts.c126
-rw-r--r--arch/x86/events/intel/core.c46
-rw-r--r--arch/x86/events/intel/cqm.c9
-rw-r--r--arch/x86/events/intel/ds.c127
-rw-r--r--arch/x86/events/intel/lbr.c70
-rw-r--r--arch/x86/events/intel/pt.c42
-rw-r--r--arch/x86/events/intel/pt.h5
-rw-r--r--arch/x86/events/intel/rapl.c4
-rw-r--r--arch/x86/events/intel/uncore.c11
-rw-r--r--arch/x86/events/intel/uncore.h7
-rw-r--r--arch/x86/events/intel/uncore_snb.c16
-rw-r--r--arch/x86/events/intel/uncore_snbep.c636
-rw-r--r--arch/x86/events/perf_event.h13
-rw-r--r--arch/x86/ia32/ia32_signal.c2
-rw-r--r--arch/x86/include/asm/alternative.h8
-rw-r--r--arch/x86/include/asm/apic.h7
-rw-r--r--arch/x86/include/asm/cmpxchg.h44
-rw-r--r--arch/x86/include/asm/compat.h8
-rw-r--r--arch/x86/include/asm/cpufeatures.h1
-rw-r--r--arch/x86/include/asm/desc.h2
-rw-r--r--arch/x86/include/asm/e820.h6
-rw-r--r--arch/x86/include/asm/efi.h29
-rw-r--r--arch/x86/include/asm/fpu/signal.h6
-rw-r--r--arch/x86/include/asm/fpu/xstate.h3
-rw-r--r--arch/x86/include/asm/ftrace.h3
-rw-r--r--arch/x86/include/asm/hardirq.h4
-rw-r--r--arch/x86/include/asm/hypervisor.h4
-rw-r--r--arch/x86/include/asm/init.h4
-rw-r--r--arch/x86/include/asm/intel-family.h4
-rw-r--r--arch/x86/include/asm/intel-mid.h2
-rw-r--r--arch/x86/include/asm/intel_scu_ipc.h2
-rw-r--r--arch/x86/include/asm/kaslr.h1
-rw-r--r--arch/x86/include/asm/kdebug.h2
-rw-r--r--arch/x86/include/asm/mce.h66
-rw-r--r--arch/x86/include/asm/mpspec.h3
-rw-r--r--arch/x86/include/asm/paravirt.h22
-rw-r--r--arch/x86/include/asm/paravirt_types.h8
-rw-r--r--arch/x86/include/asm/pgtable_64.h4
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h4
-rw-r--r--arch/x86/include/asm/pmem.h5
-rw-r--r--arch/x86/include/asm/processor.h25
-rw-r--r--arch/x86/include/asm/realmode.h12
-rw-r--r--arch/x86/include/asm/rwsem.h2
-rw-r--r--arch/x86/include/asm/signal.h4
-rw-r--r--arch/x86/include/asm/smp.h3
-rw-r--r--arch/x86/include/asm/special_insns.h22
-rw-r--r--arch/x86/include/asm/spinlock.h174
-rw-r--r--arch/x86/include/asm/spinlock_types.h13
-rw-r--r--arch/x86/include/asm/stacktrace.h120
-rw-r--r--arch/x86/include/asm/string_64.h19
-rw-r--r--arch/x86/include/asm/switch_to.h164
-rw-r--r--arch/x86/include/asm/syscall.h20
-rw-r--r--arch/x86/include/asm/thread_info.h115
-rw-r--r--arch/x86/include/asm/tlbflush.h7
-rw-r--r--arch/x86/include/asm/traps.h6
-rw-r--r--arch/x86/include/asm/uaccess.h101
-rw-r--r--arch/x86/include/asm/uaccess_32.h2
-rw-r--r--arch/x86/include/asm/uaccess_64.h2
-rw-r--r--arch/x86/include/asm/unwind.h73
-rw-r--r--arch/x86/include/asm/uv/bios.h5
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h45
-rw-r--r--arch/x86/include/asm/vdso.h2
-rw-r--r--arch/x86/include/uapi/asm/mce.h2
-rw-r--r--arch/x86/include/uapi/asm/prctl.h6
-rw-r--r--arch/x86/kernel/Makefile6
-rw-r--r--arch/x86/kernel/acpi/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c35
-rw-r--r--arch/x86/kernel/acpi/cppc_msr.c58
-rw-r--r--arch/x86/kernel/acpi/sleep.c2
-rw-r--r--arch/x86/kernel/apic/apic.c128
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c22
-rw-r--r--arch/x86/kernel/apic/apic_noop.c2
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c5
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c2
-rw-r--r--arch/x86/kernel/apic/io_apic.c4
-rw-r--r--arch/x86/kernel/apic/msi.c2
-rw-r--r--arch/x86/kernel/apic/probe_32.c6
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c15
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c82
-rw-r--r--arch/x86/kernel/asm-offsets.c7
-rw-r--r--arch/x86/kernel/asm-offsets_32.c5
-rw-r--r--arch/x86/kernel/asm-offsets_64.c5
-rw-r--r--arch/x86/kernel/cpu/amd.c12
-rw-r--r--arch/x86/kernel/cpu/common.c41
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c11
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c44
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c204
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c18
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c52
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h2
-rw-r--r--arch/x86/kernel/dumpstack.c258
-rw-r--r--arch/x86/kernel/dumpstack_32.c154
-rw-r--r--arch/x86/kernel/dumpstack_64.c318
-rw-r--r--arch/x86/kernel/e820.c155
-rw-r--r--arch/x86/kernel/early-quirks.c2
-rw-r--r--arch/x86/kernel/fpu/init.c1
-rw-r--r--arch/x86/kernel/fpu/xstate.c138
-rw-r--r--arch/x86/kernel/ftrace.c2
-rw-r--r--arch/x86/kernel/head32.c2
-rw-r--r--arch/x86/kernel/head64.c1
-rw-r--r--arch/x86/kernel/head_32.S8
-rw-r--r--arch/x86/kernel/head_64.S12
-rw-r--r--arch/x86/kernel/hpet.c96
-rw-r--r--arch/x86/kernel/irq.c3
-rw-r--r--arch/x86/kernel/irq_64.c3
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c4
-rw-r--r--arch/x86/kernel/kgdb.c8
-rw-r--r--arch/x86/kernel/kprobes/core.c2
-rw-r--r--arch/x86/kernel/kprobes/opt.c2
-rw-r--r--arch/x86/kernel/ksysfs.c2
-rw-r--r--arch/x86/kernel/kvm.c288
-rw-r--r--arch/x86/kernel/kvmclock.c3
-rw-r--r--arch/x86/kernel/mpparse.c3
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c7
-rw-r--r--arch/x86/kernel/paravirt.c7
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c4
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c4
-rw-r--r--arch/x86/kernel/process.c42
-rw-r--r--arch/x86/kernel/process_32.c33
-rw-r--r--arch/x86/kernel/process_64.c52
-rw-r--r--arch/x86/kernel/ptrace.c14
-rw-r--r--arch/x86/kernel/quirks.c31
-rw-r--r--arch/x86/kernel/reboot.c2
-rw-r--r--arch/x86/kernel/resource.c4
-rw-r--r--arch/x86/kernel/setup.c54
-rw-r--r--arch/x86/kernel/setup_percpu.c4
-rw-r--r--arch/x86/kernel/signal.c22
-rw-r--r--arch/x86/kernel/signal_compat.c34
-rw-r--r--arch/x86/kernel/smpboot.c98
-rw-r--r--arch/x86/kernel/stacktrace.c79
-rw-r--r--arch/x86/kernel/tboot.c8
-rw-r--r--arch/x86/kernel/traps.c61
-rw-r--r--arch/x86/kernel/tsc.c16
-rw-r--r--arch/x86/kernel/unwind_frame.c93
-rw-r--r--arch/x86/kernel/unwind_guess.c43
-rw-r--r--arch/x86/kernel/uprobes.c22
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c2
-rw-r--r--arch/x86/kernel/x86_init.c6
-rw-r--r--arch/x86/kvm/ioapic.c8
-rw-r--r--arch/x86/kvm/pmu_amd.c4
-rw-r--r--arch/x86/kvm/svm.c2
-rw-r--r--arch/x86/kvm/vmx.c138
-rw-r--r--arch/x86/kvm/x86.c8
-rw-r--r--arch/x86/lib/hweight.S2
-rw-r--r--arch/x86/lib/kaslr.c2
-rw-r--r--arch/x86/lib/memcpy_64.S6
-rw-r--r--arch/x86/mm/amdtopology.c22
-rw-r--r--arch/x86/mm/extable.c2
-rw-r--r--arch/x86/mm/fault.c34
-rw-r--r--arch/x86/mm/ident_map.c19
-rw-r--r--arch/x86/mm/init.c18
-rw-r--r--arch/x86/mm/kaslr.c30
-rw-r--r--arch/x86/mm/numa.c27
-rw-r--r--arch/x86/mm/pageattr.c21
-rw-r--r--arch/x86/mm/pat.c17
-rw-r--r--arch/x86/mm/pat_rbtree.c4
-rw-r--r--arch/x86/mm/tlb.c15
-rw-r--r--arch/x86/oprofile/backtrace.c49
-rw-r--r--arch/x86/pci/fixup.c20
-rw-r--r--arch/x86/pci/pcbios.c7
-rw-r--r--arch/x86/pci/vmd.c10
-rw-r--r--arch/x86/platform/Makefile1
-rw-r--r--arch/x86/platform/atom/punit_atom_debug.c2
-rw-r--r--arch/x86/platform/efi/efi-bgrt.c13
-rw-r--r--arch/x86/platform/efi/efi.c200
-rw-r--r--arch/x86/platform/efi/efi_64.c25
-rw-r--r--arch/x86/platform/efi/quirks.c148
-rw-r--r--arch/x86/platform/intel-mid/device_libs/Makefile4
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c95
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c47
-rw-r--r--arch/x86/platform/intel-mid/intel-mid.c5
-rw-r--r--arch/x86/platform/intel-mid/pwr.c76
-rw-r--r--arch/x86/platform/mellanox/Makefile1
-rw-r--r--arch/x86/platform/mellanox/mlx-platform.c266
-rw-r--r--arch/x86/platform/uv/bios_uv.c18
-rw-r--r--arch/x86/platform/uv/tlb_uv.c184
-rw-r--r--arch/x86/power/cpu.c2
-rw-r--r--arch/x86/power/hibernate_64.c4
-rw-r--r--arch/x86/ras/mce_amd_inj.c54
-rw-r--r--arch/x86/realmode/init.c47
-rw-r--r--arch/x86/um/ptrace_32.c11
-rw-r--r--arch/x86/um/ptrace_64.c4
-rw-r--r--arch/x86/xen/enlighten.c43
-rw-r--r--arch/x86/xen/setup.c2
-rw-r--r--arch/x86/xen/spinlock.c250
204 files changed, 4743 insertions, 3281 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5c6e7471b732..9b2d50a73a11 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -24,7 +24,6 @@ config X86
select ARCH_DISCARD_MEMBLOCK
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
- select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
@@ -80,6 +79,7 @@ config X86
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
select HAVE_AOUT if X86_32
select HAVE_ARCH_AUDITSYSCALL
+ select HAVE_ARCH_HARDENED_USERCOPY
select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
@@ -91,7 +91,9 @@ config X86
select HAVE_ARCH_SOFT_DIRTY if X86_64
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ select HAVE_ARCH_WITHIN_STACK_FRAMES
select HAVE_EBPF_JIT if X86_64
+ select HAVE_ARCH_VMAP_STACK if X86_64
select HAVE_CC_STACKPROTECTOR
select HAVE_CMPXCHG_DOUBLE
select HAVE_CMPXCHG_LOCAL
@@ -108,7 +110,6 @@ config X86
select HAVE_EXIT_THREAD
select HAVE_FENTRY if X86_64
select HAVE_FTRACE_MCOUNT_RECORD
- select HAVE_FUNCTION_GRAPH_FP_TEST
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
select HAVE_GCC_PLUGINS
@@ -156,6 +157,7 @@ config X86
select SPARSE_IRQ
select SRCU
select SYSCTL_EXCEPTION_TRACE
+ select THREAD_INFO_IN_TASK
select USER_STACKTRACE_SUPPORT
select VIRT_TO_BUS
select X86_DEV_DMA_OPS if X86_64
@@ -548,6 +550,18 @@ config X86_INTEL_QUARK
Say Y here if you have a Quark based system such as the Arduino
compatible Intel Galileo.
+config MLX_PLATFORM
+ tristate "Mellanox Technologies platform support"
+ depends on X86_64
+ depends on X86_EXTENDED_PLATFORM
+ ---help---
+ This option enables system support for the Mellanox Technologies
+ platform.
+
+ Say Y here if you are building a kernel for Mellanox system.
+
+ Otherwise, say N.
+
config X86_INTEL_LPSS
bool "Intel Low Power Subsystem Support"
depends on X86 && ACPI
@@ -704,7 +718,6 @@ config PARAVIRT_DEBUG
config PARAVIRT_SPINLOCKS
bool "Paravirtualization layer for spinlocks"
depends on PARAVIRT && SMP
- select UNINLINE_SPIN_UNLOCK if !QUEUED_SPINLOCKS
---help---
Paravirtualized spinlocks allow a pvops backend to replace the
spinlock implementation with something virtualization-friendly
@@ -717,7 +730,7 @@ config PARAVIRT_SPINLOCKS
config QUEUED_LOCK_STAT
bool "Paravirt queued spinlock statistics"
- depends on PARAVIRT_SPINLOCKS && DEBUG_FS && QUEUED_SPINLOCKS
+ depends on PARAVIRT_SPINLOCKS && DEBUG_FS
---help---
Enable the collection of statistical data on the slowpath
behavior of paravirtualized queued spinlocks and report
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index ff574dad95cc..cc69e37548db 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -29,22 +29,11 @@ __pure const struct efi_config *__efi_early(void)
static void setup_boot_services##bits(struct efi_config *c) \
{ \
efi_system_table_##bits##_t *table; \
- efi_boot_services_##bits##_t *bt; \
\
table = (typeof(table))sys_table; \
\
+ c->boot_services = table->boottime; \
c->text_output = table->con_out; \
- \
- bt = (typeof(bt))(unsigned long)(table->boottime); \
- \
- c->allocate_pool = bt->allocate_pool; \
- c->allocate_pages = bt->allocate_pages; \
- c->get_memory_map = bt->get_memory_map; \
- c->free_pool = bt->free_pool; \
- c->free_pages = bt->free_pages; \
- c->locate_handle = bt->locate_handle; \
- c->handle_protocol = bt->handle_protocol; \
- c->exit_boot_services = bt->exit_boot_services; \
}
BOOT_SERVICES(32);
BOOT_SERVICES(64);
@@ -286,29 +275,6 @@ void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
}
}
-static void find_bits(unsigned long mask, u8 *pos, u8 *size)
-{
- u8 first, len;
-
- first = 0;
- len = 0;
-
- if (mask) {
- while (!(mask & 0x1)) {
- mask = mask >> 1;
- first++;
- }
-
- while (mask & 0x1) {
- mask = mask >> 1;
- len++;
- }
- }
-
- *pos = first;
- *size = len;
-}
-
static efi_status_t
__setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom)
{
@@ -578,7 +544,7 @@ setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height)
efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
unsigned long nr_ugas;
u32 *handles = (u32 *)uga_handle;;
- efi_status_t status;
+ efi_status_t status = EFI_INVALID_PARAMETER;
int i;
first_uga = NULL;
@@ -623,7 +589,7 @@ setup_uga64(void **uga_handle, unsigned long size, u32 *width, u32 *height)
efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
unsigned long nr_ugas;
u64 *handles = (u64 *)uga_handle;;
- efi_status_t status;
+ efi_status_t status = EFI_INVALID_PARAMETER;
int i;
first_uga = NULL;
@@ -1004,79 +970,87 @@ static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext,
return status;
}
-static efi_status_t exit_boot(struct boot_params *boot_params,
- void *handle, bool is64)
-{
- struct efi_info *efi = &boot_params->efi_info;
- unsigned long map_sz, key, desc_size;
- efi_memory_desc_t *mem_map;
+struct exit_boot_struct {
+ struct boot_params *boot_params;
+ struct efi_info *efi;
struct setup_data *e820ext;
- const char *signature;
__u32 e820ext_size;
- __u32 nr_desc, prev_nr_desc;
- efi_status_t status;
- __u32 desc_version;
- bool called_exit = false;
- u8 nr_entries;
- int i;
-
- nr_desc = 0;
- e820ext = NULL;
- e820ext_size = 0;
-
-get_map:
- status = efi_get_memory_map(sys_table, &mem_map, &map_sz, &desc_size,
- &desc_version, &key);
-
- if (status != EFI_SUCCESS)
- return status;
-
- prev_nr_desc = nr_desc;
- nr_desc = map_sz / desc_size;
- if (nr_desc > prev_nr_desc &&
- nr_desc > ARRAY_SIZE(boot_params->e820_map)) {
- u32 nr_e820ext = nr_desc - ARRAY_SIZE(boot_params->e820_map);
-
- status = alloc_e820ext(nr_e820ext, &e820ext, &e820ext_size);
- if (status != EFI_SUCCESS)
- goto free_mem_map;
+ bool is64;
+};
- efi_call_early(free_pool, mem_map);
- goto get_map; /* Allocated memory, get map again */
+static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg,
+ struct efi_boot_memmap *map,
+ void *priv)
+{
+ static bool first = true;
+ const char *signature;
+ __u32 nr_desc;
+ efi_status_t status;
+ struct exit_boot_struct *p = priv;
+
+ if (first) {
+ nr_desc = *map->buff_size / *map->desc_size;
+ if (nr_desc > ARRAY_SIZE(p->boot_params->e820_map)) {
+ u32 nr_e820ext = nr_desc -
+ ARRAY_SIZE(p->boot_params->e820_map);
+
+ status = alloc_e820ext(nr_e820ext, &p->e820ext,
+ &p->e820ext_size);
+ if (status != EFI_SUCCESS)
+ return status;
+ }
+ first = false;
}
- signature = is64 ? EFI64_LOADER_SIGNATURE : EFI32_LOADER_SIGNATURE;
- memcpy(&efi->efi_loader_signature, signature, sizeof(__u32));
+ signature = p->is64 ? EFI64_LOADER_SIGNATURE : EFI32_LOADER_SIGNATURE;
+ memcpy(&p->efi->efi_loader_signature, signature, sizeof(__u32));
- efi->efi_systab = (unsigned long)sys_table;
- efi->efi_memdesc_size = desc_size;
- efi->efi_memdesc_version = desc_version;
- efi->efi_memmap = (unsigned long)mem_map;
- efi->efi_memmap_size = map_sz;
+ p->efi->efi_systab = (unsigned long)sys_table_arg;
+ p->efi->efi_memdesc_size = *map->desc_size;
+ p->efi->efi_memdesc_version = *map->desc_ver;
+ p->efi->efi_memmap = (unsigned long)*map->map;
+ p->efi->efi_memmap_size = *map->map_size;
#ifdef CONFIG_X86_64
- efi->efi_systab_hi = (unsigned long)sys_table >> 32;
- efi->efi_memmap_hi = (unsigned long)mem_map >> 32;
+ p->efi->efi_systab_hi = (unsigned long)sys_table_arg >> 32;
+ p->efi->efi_memmap_hi = (unsigned long)*map->map >> 32;
#endif
+ return EFI_SUCCESS;
+}
+
+static efi_status_t exit_boot(struct boot_params *boot_params,
+ void *handle, bool is64)
+{
+ unsigned long map_sz, key, desc_size, buff_size;
+ efi_memory_desc_t *mem_map;
+ struct setup_data *e820ext;
+ __u32 e820ext_size;
+ efi_status_t status;
+ __u32 desc_version;
+ struct efi_boot_memmap map;
+ struct exit_boot_struct priv;
+
+ map.map = &mem_map;
+ map.map_size = &map_sz;
+ map.desc_size = &desc_size;
+ map.desc_ver = &desc_version;
+ map.key_ptr = &key;
+ map.buff_size = &buff_size;
+ priv.boot_params = boot_params;
+ priv.efi = &boot_params->efi_info;
+ priv.e820ext = NULL;
+ priv.e820ext_size = 0;
+ priv.is64 = is64;
+
/* Might as well exit boot services now */
- status = efi_call_early(exit_boot_services, handle, key);
- if (status != EFI_SUCCESS) {
- /*
- * ExitBootServices() will fail if any of the event
- * handlers change the memory map. In which case, we
- * must be prepared to retry, but only once so that
- * we're guaranteed to exit on repeated failures instead
- * of spinning forever.
- */
- if (called_exit)
- goto free_mem_map;
-
- called_exit = true;
- efi_call_early(free_pool, mem_map);
- goto get_map;
- }
+ status = efi_exit_boot_services(sys_table, handle, &map, &priv,
+ exit_boot_func);
+ if (status != EFI_SUCCESS)
+ return status;
+ e820ext = priv.e820ext;
+ e820ext_size = priv.e820ext_size;
/* Historic? */
boot_params->alt_mem_k = 32 * 1024;
@@ -1085,10 +1059,6 @@ get_map:
return status;
return EFI_SUCCESS;
-
-free_mem_map:
- efi_call_early(free_pool, mem_map);
- return status;
}
/*
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 1038524270e7..fd0b6a272dd5 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -82,7 +82,7 @@ ENTRY(efi_pe_entry)
/* Relocate efi_config->call() */
leal efi32_config(%esi), %eax
- add %esi, 88(%eax)
+ add %esi, 32(%eax)
pushl %eax
call make_boot_params
@@ -108,7 +108,7 @@ ENTRY(efi32_stub_entry)
/* Relocate efi_config->call() */
leal efi32_config(%esi), %eax
- add %esi, 88(%eax)
+ add %esi, 32(%eax)
pushl %eax
2:
call efi_main
@@ -264,7 +264,7 @@ relocated:
#ifdef CONFIG_EFI_STUB
.data
efi32_config:
- .fill 11,8,0
+ .fill 4,8,0
.long efi_call_phys
.long 0
.byte 0
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 0d80a7ad65cd..efdfba21a5b2 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -265,7 +265,7 @@ ENTRY(efi_pe_entry)
/*
* Relocate efi_config->call().
*/
- addq %rbp, efi64_config+88(%rip)
+ addq %rbp, efi64_config+32(%rip)
movq %rax, %rdi
call make_boot_params
@@ -285,7 +285,7 @@ handover_entry:
* Relocate efi_config->call().
*/
movq efi_config(%rip), %rax
- addq %rbp, 88(%rax)
+ addq %rbp, 32(%rax)
2:
movq efi_config(%rip), %rdi
call efi_main
@@ -457,14 +457,14 @@ efi_config:
#ifdef CONFIG_EFI_MIXED
.global efi32_config
efi32_config:
- .fill 11,8,0
+ .fill 4,8,0
.quad efi64_thunk
.byte 0
#endif
.global efi64_config
efi64_config:
- .fill 11,8,0
+ .fill 4,8,0
.quad efi_call
.byte 1
#endif /* CONFIG_EFI_STUB */
diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config
index 4e2ecfa23c15..4b429df40d7a 100644
--- a/arch/x86/configs/tiny.config
+++ b/arch/x86/configs/tiny.config
@@ -1 +1,3 @@
CONFIG_NOHIGHMEM=y
+# CONFIG_HIGHMEM4G is not set
+# CONFIG_HIGHMEM64G is not set
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb.c b/arch/x86/crypto/sha256-mb/sha256_mb.c
index 89fa85e8b10c..6f97fb33ae21 100644
--- a/arch/x86/crypto/sha256-mb/sha256_mb.c
+++ b/arch/x86/crypto/sha256-mb/sha256_mb.c
@@ -485,10 +485,10 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
req = cast_mcryptd_ctx_to_req(req_ctx);
if (irqs_disabled())
- rctx->complete(&req->base, ret);
+ req_ctx->complete(&req->base, ret);
else {
local_bh_disable();
- rctx->complete(&req->base, ret);
+ req_ctx->complete(&req->base, ret);
local_bh_enable();
}
}
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
index b691da981cd9..a78a0694ddef 100644
--- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
@@ -265,13 +265,14 @@ ENTRY(sha256_mb_mgr_get_comp_job_avx2)
vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
- movl _args_digest+4*32(state, idx, 4), tmp2_w
+ vmovd _args_digest(state , idx, 4) , %xmm0
vpinsrd $1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
vpinsrd $2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
vpinsrd $3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1
- vmovdqu %xmm0, _result_digest(job_rax)
- movl tmp2_w, _result_digest+1*16(job_rax)
+ vmovdqu %xmm0, _result_digest(job_rax)
+ offset = (_result_digest + 1*16)
+ vmovdqu %xmm1, offset(job_rax)
pop %rbx
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c
index f4cf5b78fd36..d210174a52b0 100644
--- a/arch/x86/crypto/sha512-mb/sha512_mb.c
+++ b/arch/x86/crypto/sha512-mb/sha512_mb.c
@@ -497,10 +497,10 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
req = cast_mcryptd_ctx_to_req(req_ctx);
if (irqs_disabled())
- rctx->complete(&req->base, ret);
+ req_ctx->complete(&req->base, ret);
else {
local_bh_disable();
- rctx->complete(&req->base, ret);
+ req_ctx->complete(&req->base, ret);
local_bh_enable();
}
}
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index fe91c25092da..77f28ce9c646 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -5,6 +5,8 @@
OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y
+CFLAGS_syscall_64.o += -Wno-override-init
+CFLAGS_syscall_32.o += -Wno-override-init
obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
obj-y += common.o
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 1433f6b4607d..bdd9cc59d20f 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -31,13 +31,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
-static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
-{
- unsigned long top_of_stack =
- (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING;
- return (struct thread_info *)(top_of_stack - THREAD_SIZE);
-}
-
#ifdef CONFIG_CONTEXT_TRACKING
/* Called on entry from user mode with IRQs off. */
__visible inline void enter_from_user_mode(void)
@@ -71,7 +64,7 @@ static long syscall_trace_enter(struct pt_regs *regs)
{
u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
- struct thread_info *ti = pt_regs_to_thread_info(regs);
+ struct thread_info *ti = current_thread_info();
unsigned long ret = 0;
bool emulated = false;
u32 work;
@@ -173,18 +166,17 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
/* Disable IRQs and retry */
local_irq_disable();
- cached_flags = READ_ONCE(pt_regs_to_thread_info(regs)->flags);
+ cached_flags = READ_ONCE(current_thread_info()->flags);
if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
break;
-
}
}
/* Called with IRQs disabled. */
__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
{
- struct thread_info *ti = pt_regs_to_thread_info(regs);
+ struct thread_info *ti = current_thread_info();
u32 cached_flags;
if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
@@ -209,7 +201,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
* special case only applies after poking regs and before the
* very next return to user mode.
*/
- ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
+ current->thread.status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
#endif
user_enter_irqoff();
@@ -247,7 +239,7 @@ static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
*/
__visible inline void syscall_return_slowpath(struct pt_regs *regs)
{
- struct thread_info *ti = pt_regs_to_thread_info(regs);
+ struct thread_info *ti = current_thread_info();
u32 cached_flags = READ_ONCE(ti->flags);
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
@@ -270,7 +262,7 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs)
#ifdef CONFIG_X86_64
__visible void do_syscall_64(struct pt_regs *regs)
{
- struct thread_info *ti = pt_regs_to_thread_info(regs);
+ struct thread_info *ti = current_thread_info();
unsigned long nr = regs->orig_ax;
enter_from_user_mode();
@@ -303,11 +295,11 @@ __visible void do_syscall_64(struct pt_regs *regs)
*/
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
{
- struct thread_info *ti = pt_regs_to_thread_info(regs);
+ struct thread_info *ti = current_thread_info();
unsigned int nr = (unsigned int)regs->orig_ax;
#ifdef CONFIG_IA32_EMULATION
- ti->status |= TS_COMPAT;
+ current->thread.status |= TS_COMPAT;
#endif
if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 0b56666e6039..b75a8bcd2d23 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -204,34 +204,70 @@
POP_GS_EX
.endm
+/*
+ * %eax: prev task
+ * %edx: next task
+ */
+ENTRY(__switch_to_asm)
+ /*
+ * Save callee-saved registers
+ * This must match the order in struct inactive_task_frame
+ */
+ pushl %ebp
+ pushl %ebx
+ pushl %edi
+ pushl %esi
+
+ /* switch stack */
+ movl %esp, TASK_threadsp(%eax)
+ movl TASK_threadsp(%edx), %esp
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+ movl TASK_stack_canary(%edx), %ebx
+ movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
+#endif
+
+ /* restore callee-saved registers */
+ popl %esi
+ popl %edi
+ popl %ebx
+ popl %ebp
+
+ jmp __switch_to
+END(__switch_to_asm)
+
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * eax: prev task we switched from
+ * ebx: kernel thread func (NULL for user thread)
+ * edi: kernel thread arg
+ */
ENTRY(ret_from_fork)
pushl %eax
call schedule_tail
popl %eax
+ testl %ebx, %ebx
+ jnz 1f /* kernel threads are uncommon */
+
+2:
/* When we fork, we trace the syscall return in the child, too. */
movl %esp, %eax
call syscall_return_slowpath
jmp restore_all
-END(ret_from_fork)
-
-ENTRY(ret_from_kernel_thread)
- pushl %eax
- call schedule_tail
- popl %eax
- movl PT_EBP(%esp), %eax
- call *PT_EBX(%esp)
- movl $0, PT_EAX(%esp)
+ /* kernel thread */
+1: movl %edi, %eax
+ call *%ebx
/*
- * Kernel threads return to userspace as if returning from a syscall.
- * We should check whether anything actually uses this path and, if so,
- * consider switching it over to ret_from_fork.
+ * A kernel thread is allowed to return here after successfully
+ * calling do_execve(). Exit to userspace to complete the execve()
+ * syscall.
*/
- movl %esp, %eax
- call syscall_return_slowpath
- jmp restore_all
-ENDPROC(ret_from_kernel_thread)
+ movl $0, PT_EAX(%esp)
+ jmp 2b
+END(ret_from_fork)
/*
* Return to user mode is not as complex as all this looks,
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index b846875aeea6..fee1d95902b5 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -179,7 +179,8 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
* If we need to do entry work or if we guess we'll need to do
* exit work, go straight to the slow path.
*/
- testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ movq PER_CPU_VAR(current_task), %r11
+ testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
jnz entry_SYSCALL64_slow_path
entry_SYSCALL_64_fastpath:
@@ -217,7 +218,8 @@ entry_SYSCALL_64_fastpath:
*/
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ movq PER_CPU_VAR(current_task), %r11
+ testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
jnz 1f
LOCKDEP_SYS_EXIT
@@ -288,11 +290,15 @@ return_from_SYSCALL_64:
jne opportunistic_sysret_failed
/*
- * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
- * restoring TF results in a trap from userspace immediately after
- * SYSRET. This would cause an infinite loop whenever #DB happens
- * with register state that satisfies the opportunistic SYSRET
- * conditions. For example, single-stepping this user code:
+ * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
+ * restore RF properly. If the slowpath sets it for whatever reason, we
+ * need to restore it correctly.
+ *
+ * SYSRET can restore TF, but unlike IRET, restoring TF results in a
+ * trap from userspace immediately after SYSRET. This would cause an
+ * infinite loop whenever #DB happens with register state that satisfies
+ * the opportunistic SYSRET conditions. For example, single-stepping
+ * this user code:
*
* movq $stuck_here, %rcx
* pushfq
@@ -347,8 +353,7 @@ ENTRY(stub_ptregs_64)
jmp entry_SYSCALL64_slow_path
1:
- /* Called from C */
- jmp *%rax /* called from C */
+ jmp *%rax /* Called from C */
END(stub_ptregs_64)
.macro ptregs_stub func
@@ -365,41 +370,73 @@ END(ptregs_\func)
#include <asm/syscalls_64.h>
/*
+ * %rdi: prev task
+ * %rsi: next task
+ */
+ENTRY(__switch_to_asm)
+ /*
+ * Save callee-saved registers
+ * This must match the order in inactive_task_frame
+ */
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ /* switch stack */
+ movq %rsp, TASK_threadsp(%rdi)
+ movq TASK_threadsp(%rsi), %rsp
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+ movq TASK_stack_canary(%rsi), %rbx
+ movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset
+#endif
+
+ /* restore callee-saved registers */
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ popq %rbp
+
+ jmp __switch_to
+END(__switch_to_asm)
+
+/*
* A newly forked process directly context switches into this address.
*
- * rdi: prev task we switched from
+ * rax: prev task we switched from
+ * rbx: kernel thread func (NULL for user thread)
+ * r12: kernel thread arg
*/
ENTRY(ret_from_fork)
- LOCK ; btr $TIF_FORK, TI_flags(%r8)
-
+ movq %rax, %rdi
call schedule_tail /* rdi: 'prev' task parameter */
- testb $3, CS(%rsp) /* from kernel_thread? */
- jnz 1f
-
- /*
- * We came from kernel_thread. This code path is quite twisted, and
- * someone should clean it up.
- *
- * copy_thread_tls stashes the function pointer in RBX and the
- * parameter to be passed in RBP. The called function is permitted
- * to call do_execve and thereby jump to user mode.
- */
- movq RBP(%rsp), %rdi
- call *RBX(%rsp)
- movl $0, RAX(%rsp)
-
- /*
- * Fall through as though we're exiting a syscall. This makes a
- * twisted sort of sense if we just called do_execve.
- */
+ testq %rbx, %rbx /* from kernel_thread? */
+ jnz 1f /* kernel threads are uncommon */
-1:
+2:
movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
SWAPGS
jmp restore_regs_and_iret
+
+1:
+ /* kernel thread */
+ movq %r12, %rdi
+ call *%rbx
+ /*
+ * A kernel thread is allowed to return here after successfully
+ * calling do_execve(). Exit to userspace to complete the execve()
+ * syscall.
+ */
+ movq $0, RAX(%rsp)
+ jmp 2b
END(ret_from_fork)
/*
@@ -551,27 +588,69 @@ native_irq_return_iret:
#ifdef CONFIG_X86_ESPFIX64
native_irq_return_ldt:
- pushq %rax
- pushq %rdi
+ /*
+ * We are running with user GSBASE. All GPRs contain their user
+ * values. We have a percpu ESPFIX stack that is eight slots
+ * long (see ESPFIX_STACK_SIZE). espfix_waddr points to the bottom
+ * of the ESPFIX stack.
+ *
+ * We clobber RAX and RDI in this code. We stash RDI on the
+ * normal stack and RAX on the ESPFIX stack.
+ *
+ * The ESPFIX stack layout we set up looks like this:
+ *
+ * --- top of ESPFIX stack ---
+ * SS
+ * RSP
+ * RFLAGS
+ * CS
+ * RIP <-- RSP points here when we're done
+ * RAX <-- espfix_waddr points here
+ * --- bottom of ESPFIX stack ---
+ */
+
+ pushq %rdi /* Stash user RDI */
SWAPGS
movq PER_CPU_VAR(espfix_waddr), %rdi
- movq %rax, (0*8)(%rdi) /* RAX */
- movq (2*8)(%rsp), %rax /* RIP */
+ movq %rax, (0*8)(%rdi) /* user RAX */
+ movq (1*8)(%rsp), %rax /* user RIP */
movq %rax, (1*8)(%rdi)
- movq (3*8)(%rsp), %rax /* CS */
+ movq (2*8)(%rsp), %rax /* user CS */
movq %rax, (2*8)(%rdi)
- movq (4*8)(%rsp), %rax /* RFLAGS */
+ movq (3*8)(%rsp), %rax /* user RFLAGS */
movq %rax, (3*8)(%rdi)
- movq (6*8)(%rsp), %rax /* SS */
+ movq (5*8)(%rsp), %rax /* user SS */
movq %rax, (5*8)(%rdi)
- movq (5*8)(%rsp), %rax /* RSP */
+ movq (4*8)(%rsp), %rax /* user RSP */
movq %rax, (4*8)(%rdi)
- andl $0xffff0000, %eax
- popq %rdi
+ /* Now RAX == RSP. */
+
+ andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */
+ popq %rdi /* Restore user RDI */
+
+ /*
+ * espfix_stack[31:16] == 0. The page tables are set up such that
+ * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of
+ * espfix_waddr for any X. That is, there are 65536 RO aliases of
+ * the same page. Set up RSP so that RSP[31:16] contains the
+ * respective 16 bits of the /userspace/ RSP and RSP nonetheless
+ * still points to an RO alias of the ESPFIX stack.
+ */
orq PER_CPU_VAR(espfix_stack), %rax
SWAPGS
movq %rax, %rsp
- popq %rax
+
+ /*
+ * At this point, we cannot write to the stack any more, but we can
+ * still read.
+ */
+ popq %rax /* Restore user RAX */
+
+ /*
+ * RSP now points to an ordinary IRET frame, except that the page
+ * is read-only and RSP[31:16] are preloaded with the userspace
+ * values. We can now IRET back to userspace.
+ */
jmp native_irq_return_iret
#endif
END(common_interrupt)
@@ -601,9 +680,20 @@ apicinterrupt3 \num trace(\sym) smp_trace(\sym)
.endm
#endif
+/* Make sure APIC interrupt handlers end up in the irqentry section: */
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
+# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
+# define POP_SECTION_IRQENTRY .popsection
+#else
+# define PUSH_SECTION_IRQENTRY
+# define POP_SECTION_IRQENTRY
+#endif
+
.macro apicinterrupt num sym do_sym
+PUSH_SECTION_IRQENTRY
apicinterrupt3 \num \sym \do_sym
trace_apicinterrupt \num \sym
+POP_SECTION_IRQENTRY
.endm
#ifdef CONFIG_SMP
@@ -987,7 +1077,6 @@ ENTRY(error_entry)
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
-.Lerror_entry_from_usermode_swapgs:
/*
* We entered from user mode or we're pretending to have entered
* from user mode due to an IRET fault.
@@ -1030,7 +1119,8 @@ ENTRY(error_entry)
* gsbase and proceed. We'll fix up the exception and land in
* .Lgs_change's error handler with kernel gsbase.
*/
- jmp .Lerror_entry_from_usermode_swapgs
+ SWAPGS
+ jmp .Lerror_entry_done
.Lbstep_iret:
/* Fix truncated RIP */
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index 4f741192846d..3dab75f2a673 100644
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -22,7 +22,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff));
- if (hdr->e_type != ET_DYN)
+ if (GET_LE(&hdr->e_type) != ET_DYN)
fail("input is not a shared object\n");
/* Walk the segment table. */
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index f840766659a8..23c881caabd1 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -37,54 +37,6 @@ void __init init_vdso_image(const struct vdso_image *image)
struct linux_binprm;
-/*
- * Put the vdso above the (randomized) stack with another randomized
- * offset. This way there is no hole in the middle of address space.
- * To save memory make sure it is still in the same PTE as the stack
- * top. This doesn't give that many random bits.
- *
- * Note that this algorithm is imperfect: the distribution of the vdso
- * start address within a PMD is biased toward the end.
- *
- * Only used for the 64-bit and x32 vdsos.
- */
-static unsigned long vdso_addr(unsigned long start, unsigned len)
-{
-#ifdef CONFIG_X86_32
- return 0;
-#else
- unsigned long addr, end;
- unsigned offset;
-
- /*
- * Round up the start address. It can start out unaligned as a result
- * of stack start randomization.
- */
- start = PAGE_ALIGN(start);
-
- /* Round the lowest possible end address up to a PMD boundary. */
- end = (start + len + PMD_SIZE - 1) & PMD_MASK;
- if (end >= TASK_SIZE_MAX)
- end = TASK_SIZE_MAX;
- end -= len;
-
- if (end > start) {
- offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1);
- addr = start + (offset << PAGE_SHIFT);
- } else {
- addr = start;
- }
-
- /*
- * Forcibly align the final address in case we have a hardware
- * issue that requires alignment for performance reasons.
- */
- addr = align_vdso_addr(addr);
-
- return addr;
-#endif
-}
-
static int vdso_fault(const struct vm_special_mapping *sm,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
@@ -176,30 +128,28 @@ static int vvar_fault(const struct vm_special_mapping *sm,
return VM_FAULT_SIGBUS;
}
-static int map_vdso(const struct vdso_image *image, bool calculate_addr)
+static const struct vm_special_mapping vdso_mapping = {
+ .name = "[vdso]",
+ .fault = vdso_fault,
+ .mremap = vdso_mremap,
+};
+static const struct vm_special_mapping vvar_mapping = {
+ .name = "[vvar]",
+ .fault = vvar_fault,
+};
+
+/*
+ * Add vdso and vvar mappings to current process.
+ * @image - blob to map
+ * @addr - request a specific address (zero to map at free addr)
+ */
+static int map_vdso(const struct vdso_image *image, unsigned long addr)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- unsigned long addr, text_start;
+ unsigned long text_start;
int ret = 0;
- static const struct vm_special_mapping vdso_mapping = {
- .name = "[vdso]",
- .fault = vdso_fault,
- .mremap = vdso_mremap,
- };
- static const struct vm_special_mapping vvar_mapping = {
- .name = "[vvar]",
- .fault = vvar_fault,
- };
-
- if (calculate_addr) {
- addr = vdso_addr(current->mm->start_stack,
- image->size - image->sym_vvar_start);
- } else {
- addr = 0;
- }
-
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
@@ -238,24 +188,104 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
- goto up_fail;
+ do_munmap(mm, text_start, image->size);
}
up_fail:
- if (ret)
+ if (ret) {
current->mm->context.vdso = NULL;
+ current->mm->context.vdso_image = NULL;
+ }
up_write(&mm->mmap_sem);
return ret;
}
+#ifdef CONFIG_X86_64
+/*
+ * Put the vdso above the (randomized) stack with another randomized
+ * offset. This way there is no hole in the middle of address space.
+ * To save memory make sure it is still in the same PTE as the stack
+ * top. This doesn't give that many random bits.
+ *
+ * Note that this algorithm is imperfect: the distribution of the vdso
+ * start address within a PMD is biased toward the end.
+ *
+ * Only used for the 64-bit and x32 vdsos.
+ */
+static unsigned long vdso_addr(unsigned long start, unsigned len)
+{
+ unsigned long addr, end;
+ unsigned offset;
+
+ /*
+ * Round up the start address. It can start out unaligned as a result
+ * of stack start randomization.
+ */
+ start = PAGE_ALIGN(start);
+
+ /* Round the lowest possible end address up to a PMD boundary. */
+ end = (start + len + PMD_SIZE - 1) & PMD_MASK;
+ if (end >= TASK_SIZE_MAX)
+ end = TASK_SIZE_MAX;
+ end -= len;
+
+ if (end > start) {
+ offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1);
+ addr = start + (offset << PAGE_SHIFT);
+ } else {
+ addr = start;
+ }
+
+ /*
+ * Forcibly align the final address in case we have a hardware
+ * issue that requires alignment for performance reasons.
+ */
+ addr = align_vdso_addr(addr);
+
+ return addr;
+}
+
+static int map_vdso_randomized(const struct vdso_image *image)
+{
+ unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start);
+
+ return map_vdso(image, addr);
+}
+#endif
+
+int map_vdso_once(const struct vdso_image *image, unsigned long addr)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+
+ down_write(&mm->mmap_sem);
+ /*
+ * Check if we have already mapped vdso blob - fail to prevent
+ * abusing from userspace install_speciall_mapping, which may
+ * not do accounting and rlimit right.
+ * We could search vma near context.vdso, but it's a slowpath,
+ * so let's explicitely check all VMAs to be completely sure.
+ */
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (vma_is_special_mapping(vma, &vdso_mapping) ||
+ vma_is_special_mapping(vma, &vvar_mapping)) {
+ up_write(&mm->mmap_sem);
+ return -EEXIST;
+ }
+ }
+ up_write(&mm->mmap_sem);
+
+ return map_vdso(image, addr);
+}
+
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static int load_vdso32(void)
{
if (vdso32_enabled != 1) /* Other values all mean "disabled" */
return 0;
- return map_vdso(&vdso_image_32, false);
+ return map_vdso(&vdso_image_32, 0);
}
#endif
@@ -265,7 +295,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
if (!vdso64_enabled)
return 0;
- return map_vdso(&vdso_image_64, true);
+ return map_vdso_randomized(&vdso_image_64);
}
#ifdef CONFIG_COMPAT
@@ -276,8 +306,7 @@ int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
if (test_thread_flag(TIF_X32)) {
if (!vdso64_enabled)
return 0;
-
- return map_vdso(&vdso_image_x32, true);
+ return map_vdso_randomized(&vdso_image_x32);
}
#endif
#ifdef CONFIG_IA32_EMULATION
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index e07a22bb9308..f5f4b3fbbbc2 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -119,8 +119,8 @@ static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] =
{
[PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
[PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x077e,
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
[PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index e6131d4454e6..65577f081d07 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -29,6 +29,8 @@
#define COUNTER_SHIFT 16
+static HLIST_HEAD(uncore_unused_list);
+
struct amd_uncore {
int id;
int refcnt;
@@ -39,7 +41,7 @@ struct amd_uncore {
cpumask_t *active_mask;
struct pmu *pmu;
struct perf_event *events[MAX_COUNTERS];
- struct amd_uncore *free_when_cpu_online;
+ struct hlist_node node;
};
static struct amd_uncore * __percpu *amd_uncore_nb;
@@ -306,6 +308,7 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu)
uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
uncore_nb->active_mask = &amd_nb_active_mask;
uncore_nb->pmu = &amd_nb_pmu;
+ uncore_nb->id = -1;
*per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
}
@@ -319,6 +322,7 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu)
uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL;
uncore_l2->active_mask = &amd_l2_active_mask;
uncore_l2->pmu = &amd_l2_pmu;
+ uncore_l2->id = -1;
*per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2;
}
@@ -348,7 +352,7 @@ amd_uncore_find_online_sibling(struct amd_uncore *this,
continue;
if (this->id == that->id) {
- that->free_when_cpu_online = this;
+ hlist_add_head(&this->node, &uncore_unused_list);
this = that;
break;
}
@@ -388,13 +392,23 @@ static int amd_uncore_cpu_starting(unsigned int cpu)
return 0;
}
+static void uncore_clean_online(void)
+{
+ struct amd_uncore *uncore;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(uncore, n, &uncore_unused_list, node) {
+ hlist_del(&uncore->node);
+ kfree(uncore);
+ }
+}
+
static void uncore_online(unsigned int cpu,
struct amd_uncore * __percpu *uncores)
{
struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
- kfree(uncore->free_when_cpu_online);
- uncore->free_when_cpu_online = NULL;
+ uncore_clean_online();
if (cpu == uncore->cpu)
cpumask_set_cpu(cpu, uncore->active_mask);
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index d0efb5cb1b00..d31735f37ed7 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -37,6 +37,7 @@
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>
+#include <asm/unwind.h>
#include "perf_event.h"
@@ -1201,6 +1202,9 @@ static int x86_pmu_add(struct perf_event *event, int flags)
* If group events scheduling transaction was started,
* skip the schedulability test here, it will be performed
* at commit time (->commit_txn) as a whole.
+ *
+ * If commit fails, we'll call ->del() on all events
+ * for which ->add() was called.
*/
if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
goto done_collect;
@@ -1223,6 +1227,14 @@ done_collect:
cpuc->n_added += n - n0;
cpuc->n_txn += n - n0;
+ if (x86_pmu.add) {
+ /*
+ * This is before x86_pmu_enable() will call x86_pmu_start(),
+ * so we enable LBRs before an event needs them etc..
+ */
+ x86_pmu.add(event);
+ }
+
ret = 0;
out:
return ret;
@@ -1346,7 +1358,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
/*
- * If we're called during a txn, we don't need to do anything.
+ * If we're called during a txn, we only need to undo x86_pmu.add.
* The events never got scheduled and ->cancel_txn will truncate
* the event_list.
*
@@ -1354,7 +1366,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
* an event added during that same TXN.
*/
if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
- return;
+ goto do_del;
/*
* Not a TXN, therefore cleanup properly.
@@ -1384,6 +1396,15 @@ static void x86_pmu_del(struct perf_event *event, int flags)
--cpuc->n_events;
perf_event_update_userpage(event);
+
+do_del:
+ if (x86_pmu.del) {
+ /*
+ * This is after x86_pmu_stop(); so we disable LBRs after any
+ * event can need them etc..
+ */
+ x86_pmu.del(event);
+ }
}
int x86_pmu_handle_irq(struct pt_regs *regs)
@@ -2247,39 +2268,26 @@ void arch_perf_update_userpage(struct perf_event *event,
cyc2ns_read_end(data);
}
-/*
- * callchain support
- */
-
-static int backtrace_stack(void *data, char *name)
-{
- return 0;
-}
-
-static int backtrace_address(void *data, unsigned long addr, int reliable)
-{
- struct perf_callchain_entry_ctx *entry = data;
-
- return perf_callchain_store(entry, addr);
-}
-
-static const struct stacktrace_ops backtrace_ops = {
- .stack = backtrace_stack,
- .address = backtrace_address,
- .walk_stack = print_context_stack_bp,
-};
-
void
perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
{
+ struct unwind_state state;
+ unsigned long addr;
+
if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
/* TODO: We don't support guest os callchain now */
return;
}
- perf_callchain_store(entry, regs->ip);
+ if (perf_callchain_store(entry, regs->ip))
+ return;
- dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
+ for (unwind_start(&state, current, regs, NULL); !unwind_done(&state);
+ unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr || perf_callchain_store(entry, addr))
+ return;
+ }
}
static inline int
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 0a6e393a2e62..982c9e31daca 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -31,7 +31,17 @@
struct bts_ctx {
struct perf_output_handle handle;
struct debug_store ds_back;
- int started;
+ int state;
+};
+
+/* BTS context states: */
+enum {
+ /* no ongoing AUX transactions */
+ BTS_STATE_STOPPED = 0,
+ /* AUX transaction is on, BTS tracing is disabled */
+ BTS_STATE_INACTIVE,
+ /* AUX transaction is on, BTS tracing is running */
+ BTS_STATE_ACTIVE,
};
static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
@@ -204,6 +214,15 @@ static void bts_update(struct bts_ctx *bts)
static int
bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle);
+/*
+ * Ordering PMU callbacks wrt themselves and the PMI is done by means
+ * of bts::state, which:
+ * - is set when bts::handle::event is valid, that is, between
+ * perf_aux_output_begin() and perf_aux_output_end();
+ * - is zero otherwise;
+ * - is ordered against bts::handle::event with a compiler barrier.
+ */
+
static void __bts_event_start(struct perf_event *event)
{
struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
@@ -221,10 +240,13 @@ static void __bts_event_start(struct perf_event *event)
/*
* local barrier to make sure that ds configuration made it
- * before we enable BTS
+ * before we enable BTS and bts::state goes ACTIVE
*/
wmb();
+ /* INACTIVE/STOPPED -> ACTIVE */
+ WRITE_ONCE(bts->state, BTS_STATE_ACTIVE);
+
intel_pmu_enable_bts(config);
}
@@ -251,9 +273,6 @@ static void bts_event_start(struct perf_event *event, int flags)
__bts_event_start(event);
- /* PMI handler: this counter is running and likely generating PMIs */
- ACCESS_ONCE(bts->started) = 1;
-
return;
fail_end_stop:
@@ -263,30 +282,34 @@ fail_stop:
event->hw.state = PERF_HES_STOPPED;
}
-static void __bts_event_stop(struct perf_event *event)
+static void __bts_event_stop(struct perf_event *event, int state)
{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+ /* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */
+ WRITE_ONCE(bts->state, state);
+
/*
* No extra synchronization is mandated by the documentation to have
* BTS data stores globally visible.
*/
intel_pmu_disable_bts();
-
- if (event->hw.state & PERF_HES_STOPPED)
- return;
-
- ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED;
}
static void bts_event_stop(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
- struct bts_buffer *buf = perf_get_aux(&bts->handle);
+ struct bts_buffer *buf = NULL;
+ int state = READ_ONCE(bts->state);
- /* PMI handler: don't restart this counter */
- ACCESS_ONCE(bts->started) = 0;
+ if (state == BTS_STATE_ACTIVE)
+ __bts_event_stop(event, BTS_STATE_STOPPED);
- __bts_event_stop(event);
+ if (state != BTS_STATE_STOPPED)
+ buf = perf_get_aux(&bts->handle);
+
+ event->hw.state |= PERF_HES_STOPPED;
if (flags & PERF_EF_UPDATE) {
bts_update(bts);
@@ -296,6 +319,7 @@ static void bts_event_stop(struct perf_event *event, int flags)
bts->handle.head =
local_xchg(&buf->data_size,
buf->nr_pages << PAGE_SHIFT);
+
perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
!!local_xchg(&buf->lost, 0));
}
@@ -310,8 +334,20 @@ static void bts_event_stop(struct perf_event *event, int flags)
void intel_bts_enable_local(void)
{
struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ int state = READ_ONCE(bts->state);
+
+ /*
+ * Here we transition from INACTIVE to ACTIVE;
+ * if we instead are STOPPED from the interrupt handler,
+ * stay that way. Can't be ACTIVE here though.
+ */
+ if (WARN_ON_ONCE(state == BTS_STATE_ACTIVE))
+ return;
+
+ if (state == BTS_STATE_STOPPED)
+ return;
- if (bts->handle.event && bts->started)
+ if (bts->handle.event)
__bts_event_start(bts->handle.event);
}
@@ -319,8 +355,15 @@ void intel_bts_disable_local(void)
{
struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ /*
+ * Here we transition from ACTIVE to INACTIVE;
+ * do nothing for STOPPED or INACTIVE.
+ */
+ if (READ_ONCE(bts->state) != BTS_STATE_ACTIVE)
+ return;
+
if (bts->handle.event)
- __bts_event_stop(bts->handle.event);
+ __bts_event_stop(bts->handle.event, BTS_STATE_INACTIVE);
}
static int
@@ -335,8 +378,6 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
return 0;
head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);
- if (WARN_ON_ONCE(head != local_read(&buf->head)))
- return -EINVAL;
phys = &buf->buf[buf->cur_buf];
space = phys->offset + phys->displacement + phys->size - head;
@@ -403,22 +444,37 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
int intel_bts_interrupt(void)
{
+ struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds;
struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
struct perf_event *event = bts->handle.event;
struct bts_buffer *buf;
s64 old_head;
- int err;
+ int err = -ENOSPC, handled = 0;
- if (!event || !bts->started)
- return 0;
+ /*
+ * The only surefire way of knowing if this NMI is ours is by checking
+ * the write ptr against the PMI threshold.
+ */
+ if (ds && (ds->bts_index >= ds->bts_interrupt_threshold))
+ handled = 1;
+
+ /*
+ * this is wrapped in intel_bts_enable_local/intel_bts_disable_local,
+ * so we can only be INACTIVE or STOPPED
+ */
+ if (READ_ONCE(bts->state) == BTS_STATE_STOPPED)
+ return handled;
buf = perf_get_aux(&bts->handle);
+ if (!buf)
+ return handled;
+
/*
* Skip snapshot counters: they don't use the interrupt, but
* there's no other way of telling, because the pointer will
* keep moving
*/
- if (!buf || buf->snapshot)
+ if (buf->snapshot)
return 0;
old_head = local_read(&buf->head);
@@ -426,18 +482,27 @@ int intel_bts_interrupt(void)
/* no new data */
if (old_head == local_read(&buf->head))
- return 0;
+ return handled;
perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
!!local_xchg(&buf->lost, 0));
buf = perf_aux_output_begin(&bts->handle, event);
- if (!buf)
- return 1;
+ if (buf)
+ err = bts_buffer_reset(buf, &bts->handle);
+
+ if (err) {
+ WRITE_ONCE(bts->state, BTS_STATE_STOPPED);
- err = bts_buffer_reset(buf, &bts->handle);
- if (err)
- perf_aux_output_end(&bts->handle, 0, false);
+ if (buf) {
+ /*
+ * BTS_STATE_STOPPED should be visible before
+ * cleared handle::event
+ */
+ barrier();
+ perf_aux_output_end(&bts->handle, 0, false);
+ }
+ }
return 1;
}
@@ -519,7 +584,8 @@ static __init int bts_init(void)
if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
return -ENODEV;
- bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE;
+ bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE |
+ PERF_PMU_CAP_EXCLUSIVE;
bts_pmu.task_ctx_nr = perf_sw_context;
bts_pmu.event_init = bts_event_init;
bts_pmu.add = bts_event_add;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 2cbde2f449aa..a3a9eb84b5cf 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -1730,9 +1730,11 @@ static __initconst const u64 knl_hw_cache_extra_regs
* disabled state if called consecutively.
*
* During consecutive calls, the same disable value will be written to related
- * registers, so the PMU state remains unchanged. hw.state in
- * intel_bts_disable_local will remain PERF_HES_STOPPED too in consecutive
- * calls.
+ * registers, so the PMU state remains unchanged.
+ *
+ * intel_bts events don't coexist with intel PMU's BTS events because of
+ * x86_add_exclusive(x86_lbr_exclusive_lbr); there's no need to keep them
+ * disabled around intel PMU's event batching etc, only inside the PMI handler.
*/
static void __intel_pmu_disable_all(void)
{
@@ -1742,8 +1744,6 @@ static void __intel_pmu_disable_all(void)
if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
intel_pmu_disable_bts();
- else
- intel_bts_disable_local();
intel_pmu_pebs_disable_all();
}
@@ -1771,8 +1771,7 @@ static void __intel_pmu_enable_all(int added, bool pmi)
return;
intel_pmu_enable_bts(event->hw.config);
- } else
- intel_bts_enable_local();
+ }
}
static void intel_pmu_enable_all(int added)
@@ -1907,13 +1906,6 @@ static void intel_pmu_disable_event(struct perf_event *event)
cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
cpuc->intel_cp_status &= ~(1ull << hwc->idx);
- /*
- * must disable before any actual event
- * because any event may be combined with LBR
- */
- if (needs_branch_stack(event))
- intel_pmu_lbr_disable(event);
-
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
intel_pmu_disable_fixed(hwc);
return;
@@ -1925,6 +1917,14 @@ static void intel_pmu_disable_event(struct perf_event *event)
intel_pmu_pebs_disable(event);
}
+static void intel_pmu_del_event(struct perf_event *event)
+{
+ if (needs_branch_stack(event))
+ intel_pmu_lbr_del(event);
+ if (event->attr.precise_ip)
+ intel_pmu_pebs_del(event);
+}
+
static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
{
int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
@@ -1968,12 +1968,6 @@ static void intel_pmu_enable_event(struct perf_event *event)
intel_pmu_enable_bts(hwc->config);
return;
}
- /*
- * must enabled before any actual event
- * because any event may be combined with LBR
- */
- if (needs_branch_stack(event))
- intel_pmu_lbr_enable(event);
if (event->attr.exclude_host)
cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
@@ -1994,6 +1988,14 @@ static void intel_pmu_enable_event(struct perf_event *event)
__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
}
+static void intel_pmu_add_event(struct perf_event *event)
+{
+ if (event->attr.precise_ip)
+ intel_pmu_pebs_add(event);
+ if (needs_branch_stack(event))
+ intel_pmu_lbr_add(event);
+}
+
/*
* Save and restart an expired event. Called by NMI contexts,
* so it has to be careful about preempting normal event ops:
@@ -2073,6 +2075,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
*/
if (!x86_pmu.late_ack)
apic_write(APIC_LVTPC, APIC_DM_NMI);
+ intel_bts_disable_local();
__intel_pmu_disable_all();
handled = intel_pmu_drain_bts_buffer();
handled += intel_bts_interrupt();
@@ -2172,6 +2175,7 @@ done:
/* Only restore PMU state when it's active. See x86_pmu_disable(). */
if (cpuc->enabled)
__intel_pmu_enable_all(0, true);
+ intel_bts_enable_local();
/*
* Only unmask the NMI after the overflow counters
@@ -3290,6 +3294,8 @@ static __initconst const struct x86_pmu intel_pmu = {
.enable_all = intel_pmu_enable_all,
.enable = intel_pmu_enable_event,
.disable = intel_pmu_disable_event,
+ .add = intel_pmu_add_event,
+ .del = intel_pmu_del_event,
.hw_config = intel_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index 783c49ddef29..8f82b02934fa 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -458,6 +458,11 @@ static void __intel_cqm_event_count(void *info);
static void init_mbm_sample(u32 rmid, u32 evt_type);
static void __intel_mbm_event_count(void *info);
+static bool is_cqm_event(int e)
+{
+ return (e == QOS_L3_OCCUP_EVENT_ID);
+}
+
static bool is_mbm_event(int e)
{
return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID);
@@ -1366,6 +1371,10 @@ static int intel_cqm_event_init(struct perf_event *event)
(event->attr.config > QOS_MBM_LOCAL_EVENT_ID))
return -EINVAL;
+ if ((is_cqm_event(event->attr.config) && !cqm_enabled) ||
+ (is_mbm_event(event->attr.config) && !mbm_enabled))
+ return -EINVAL;
+
/* unsupported modes and filters */
if (event->attr.exclude_user ||
event->attr.exclude_kernel ||
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 7ce9f3f669e6..0319311dbdbb 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -806,9 +806,65 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event)
return &emptyconstraint;
}
-static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc)
+/*
+ * We need the sched_task callback even for per-cpu events when we use
+ * the large interrupt threshold, such that we can provide PID and TID
+ * to PEBS samples.
+ */
+static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc)
+{
+ return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
+}
+
+static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
+{
+ struct debug_store *ds = cpuc->ds;
+ u64 threshold;
+
+ if (cpuc->n_pebs == cpuc->n_large_pebs) {
+ threshold = ds->pebs_absolute_maximum -
+ x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
+ } else {
+ threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+ }
+
+ ds->pebs_interrupt_threshold = threshold;
+}
+
+static void
+pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu)
{
- return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
+ /*
+ * Make sure we get updated with the first PEBS
+ * event. It will trigger also during removal, but
+ * that does not hurt:
+ */
+ bool update = cpuc->n_pebs == 1;
+
+ if (needed_cb != pebs_needs_sched_cb(cpuc)) {
+ if (!needed_cb)
+ perf_sched_cb_inc(pmu);
+ else
+ perf_sched_cb_dec(pmu);
+
+ update = true;
+ }
+
+ if (update)
+ pebs_update_threshold(cpuc);
+}
+
+void intel_pmu_pebs_add(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ bool needed_cb = pebs_needs_sched_cb(cpuc);
+
+ cpuc->n_pebs++;
+ if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+ cpuc->n_large_pebs++;
+
+ pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
}
void intel_pmu_pebs_enable(struct perf_event *event)
@@ -816,12 +872,9 @@ void intel_pmu_pebs_enable(struct perf_event *event)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
struct debug_store *ds = cpuc->ds;
- bool first_pebs;
- u64 threshold;
hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
- first_pebs = !pebs_is_enabled(cpuc);
cpuc->pebs_enabled |= 1ULL << hwc->idx;
if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
@@ -830,46 +883,34 @@ void intel_pmu_pebs_enable(struct perf_event *event)
cpuc->pebs_enabled |= 1ULL << 63;
/*
- * When the event is constrained enough we can use a larger
- * threshold and run the event with less frequent PMI.
+ * Use auto-reload if possible to save a MSR write in the PMI.
+ * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD.
*/
- if (hwc->flags & PERF_X86_EVENT_FREERUNNING) {
- threshold = ds->pebs_absolute_maximum -
- x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
-
- if (first_pebs)
- perf_sched_cb_inc(event->ctx->pmu);
- } else {
- threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
-
- /*
- * If not all events can use larger buffer,
- * roll back to threshold = 1
- */
- if (!first_pebs &&
- (ds->pebs_interrupt_threshold > threshold))
- perf_sched_cb_dec(event->ctx->pmu);
- }
-
- /* Use auto-reload if possible to save a MSR write in the PMI */
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
ds->pebs_event_reset[hwc->idx] =
(u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
}
+}
+
+void intel_pmu_pebs_del(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ bool needed_cb = pebs_needs_sched_cb(cpuc);
- if (first_pebs || ds->pebs_interrupt_threshold > threshold)
- ds->pebs_interrupt_threshold = threshold;
+ cpuc->n_pebs--;
+ if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+ cpuc->n_large_pebs--;
+
+ pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
}
void intel_pmu_pebs_disable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- struct debug_store *ds = cpuc->ds;
- bool large_pebs = ds->pebs_interrupt_threshold >
- ds->pebs_buffer_base + x86_pmu.pebs_record_size;
- if (large_pebs)
+ if (cpuc->n_pebs == cpuc->n_large_pebs)
intel_pmu_drain_pebs_buffer();
cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
@@ -879,9 +920,6 @@ void intel_pmu_pebs_disable(struct perf_event *event)
else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
cpuc->pebs_enabled &= ~(1ULL << 63);
- if (large_pebs && !pebs_is_enabled(cpuc))
- perf_sched_cb_dec(event->ctx->pmu);
-
if (cpuc->enabled)
wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
@@ -1274,18 +1312,18 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
struct pebs_record_nhm *p = at;
u64 pebs_status;
- /* PEBS v3 has accurate status bits */
+ pebs_status = p->status & cpuc->pebs_enabled;
+ pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
+
+ /* PEBS v3 has more accurate status bits */
if (x86_pmu.intel_cap.pebs_format >= 3) {
- for_each_set_bit(bit, (unsigned long *)&p->status,
- MAX_PEBS_EVENTS)
+ for_each_set_bit(bit, (unsigned long *)&pebs_status,
+ x86_pmu.max_pebs_events)
counts[bit]++;
continue;
}
- pebs_status = p->status & cpuc->pebs_enabled;
- pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
-
/*
* On some CPUs the PEBS status can be zero when PEBS is
* racing with clearing of GLOBAL_STATUS.
@@ -1333,8 +1371,11 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
continue;
event = cpuc->events[bit];
- WARN_ON_ONCE(!event);
- WARN_ON_ONCE(!event->attr.precise_ip);
+ if (WARN_ON_ONCE(!event))
+ continue;
+
+ if (WARN_ON_ONCE(!event->attr.precise_ip))
+ continue;
/* log dropped samples number */
if (error[bit])
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 707d358e0dff..fc6cf21c535e 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -380,7 +380,6 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
{
- struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx;
/*
@@ -390,31 +389,21 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
*/
task_ctx = ctx ? ctx->task_ctx_data : NULL;
if (task_ctx) {
- if (sched_in) {
+ if (sched_in)
__intel_pmu_lbr_restore(task_ctx);
- cpuc->lbr_context = ctx;
- } else {
+ else
__intel_pmu_lbr_save(task_ctx);
- }
return;
}
/*
- * When sampling the branck stack in system-wide, it may be
- * necessary to flush the stack on context switch. This happens
- * when the branch stack does not tag its entries with the pid
- * of the current task. Otherwise it becomes impossible to
- * associate a branch entry with a task. This ambiguity is more
- * likely to appear when the branch stack supports priv level
- * filtering and the user sets it to monitor only at the user
- * level (which could be a useful measurement in system-wide
- * mode). In that case, the risk is high of having a branch
- * stack with branch from multiple tasks.
- */
- if (sched_in) {
+ * Since a context switch can flip the address space and LBR entries
+ * are not tagged with an identifier, we need to wipe the LBR, even for
+ * per-cpu events. You simply cannot resolve the branches from the old
+ * address space.
+ */
+ if (sched_in)
intel_pmu_lbr_reset();
- cpuc->lbr_context = ctx;
- }
}
static inline bool branch_user_callstack(unsigned br_sel)
@@ -422,7 +411,7 @@ static inline bool branch_user_callstack(unsigned br_sel)
return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
}
-void intel_pmu_lbr_enable(struct perf_event *event)
+void intel_pmu_lbr_add(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx;
@@ -430,27 +419,38 @@ void intel_pmu_lbr_enable(struct perf_event *event)
if (!x86_pmu.lbr_nr)
return;
- /*
- * Reset the LBR stack if we changed task context to
- * avoid data leaks.
- */
- if (event->ctx->task && cpuc->lbr_context != event->ctx) {
- intel_pmu_lbr_reset();
- cpuc->lbr_context = event->ctx;
- }
cpuc->br_sel = event->hw.branch_reg.reg;
- if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
- event->ctx->task_ctx_data) {
+ if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) {
task_ctx = event->ctx->task_ctx_data;
task_ctx->lbr_callstack_users++;
}
- cpuc->lbr_users++;
+ /*
+ * Request pmu::sched_task() callback, which will fire inside the
+ * regular perf event scheduling, so that call will:
+ *
+ * - restore or wipe; when LBR-callstack,
+ * - wipe; otherwise,
+ *
+ * when this is from __perf_event_task_sched_in().
+ *
+ * However, if this is from perf_install_in_context(), no such callback
+ * will follow and we'll need to reset the LBR here if this is the
+ * first LBR event.
+ *
+ * The problem is, we cannot tell these cases apart... but we can
+ * exclude the biggest chunk of cases by looking at
+ * event->total_time_running. An event that has accrued runtime cannot
+ * be 'new'. Conversely, a new event can get installed through the
+ * context switch path for the first time.
+ */
perf_sched_cb_inc(event->ctx->pmu);
+ if (!cpuc->lbr_users++ && !event->total_time_running)
+ intel_pmu_lbr_reset();
}
-void intel_pmu_lbr_disable(struct perf_event *event)
+void intel_pmu_lbr_del(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx;
@@ -467,12 +467,6 @@ void intel_pmu_lbr_disable(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
perf_sched_cb_dec(event->ctx->pmu);
-
- if (cpuc->enabled && !cpuc->lbr_users) {
- __intel_pmu_lbr_disable();
- /* avoid stale pointer */
- cpuc->lbr_context = NULL;
- }
}
void intel_pmu_lbr_enable_all(bool pmi)
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 04bb5fb5a8d7..c5047b8f777b 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -69,6 +69,8 @@ static struct pt_cap_desc {
PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)),
PT_CAP(ip_filtering, 0, CR_EBX, BIT(2)),
PT_CAP(mtc, 0, CR_EBX, BIT(3)),
+ PT_CAP(ptwrite, 0, CR_EBX, BIT(4)),
+ PT_CAP(power_event_trace, 0, CR_EBX, BIT(5)),
PT_CAP(topa_output, 0, CR_ECX, BIT(0)),
PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)),
PT_CAP(single_range_output, 0, CR_ECX, BIT(2)),
@@ -259,10 +261,16 @@ fail:
#define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \
RTIT_CTL_MTC_RANGE)
+#define RTIT_CTL_PTW (RTIT_CTL_PTW_EN | \
+ RTIT_CTL_FUP_ON_PTW)
+
#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \
RTIT_CTL_DISRETC | \
RTIT_CTL_CYC_PSB | \
- RTIT_CTL_MTC)
+ RTIT_CTL_MTC | \
+ RTIT_CTL_PWR_EVT_EN | \
+ RTIT_CTL_FUP_ON_PTW | \
+ RTIT_CTL_PTW_EN)
static bool pt_event_valid(struct perf_event *event)
{
@@ -311,6 +319,20 @@ static bool pt_event_valid(struct perf_event *event)
return false;
}
+ if (config & RTIT_CTL_PWR_EVT_EN &&
+ !pt_cap_get(PT_CAP_power_event_trace))
+ return false;
+
+ if (config & RTIT_CTL_PTW) {
+ if (!pt_cap_get(PT_CAP_ptwrite))
+ return false;
+
+ /* FUPonPTW without PTW doesn't make sense */
+ if ((config & RTIT_CTL_FUP_ON_PTW) &&
+ !(config & RTIT_CTL_PTW_EN))
+ return false;
+ }
+
return true;
}
@@ -1074,6 +1096,11 @@ static void pt_addr_filters_fini(struct perf_event *event)
event->hw.addr_filters = NULL;
}
+static inline bool valid_kernel_ip(unsigned long ip)
+{
+ return virt_addr_valid(ip) && kernel_ip(ip);
+}
+
static int pt_event_addr_filters_validate(struct list_head *filters)
{
struct perf_addr_filter *filter;
@@ -1081,11 +1108,16 @@ static int pt_event_addr_filters_validate(struct list_head *filters)
list_for_each_entry(filter, filters, entry) {
/* PT doesn't support single address triggers */
- if (!filter->range)
+ if (!filter->range || !filter->size)
return -EOPNOTSUPP;
- if (!filter->inode && !kernel_ip(filter->offset))
- return -EINVAL;
+ if (!filter->inode) {
+ if (!valid_kernel_ip(filter->offset))
+ return -EINVAL;
+
+ if (!valid_kernel_ip(filter->offset + filter->size))
+ return -EINVAL;
+ }
if (++range > pt_cap_get(PT_CAP_num_address_ranges))
return -EOPNOTSUPP;
@@ -1111,7 +1143,7 @@ static void pt_event_addr_filters_sync(struct perf_event *event)
} else {
/* apply the offset */
msr_a = filter->offset + offs[range];
- msr_b = filter->size + msr_a;
+ msr_b = filter->size + msr_a - 1;
}
filters->filter[range].msr_a = msr_a;
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index efffa4a09f68..53473c21b554 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -26,11 +26,14 @@
#define RTIT_CTL_CYCLEACC BIT(1)
#define RTIT_CTL_OS BIT(2)
#define RTIT_CTL_USR BIT(3)
+#define RTIT_CTL_PWR_EVT_EN BIT(4)
+#define RTIT_CTL_FUP_ON_PTW BIT(5)
#define RTIT_CTL_CR3EN BIT(7)
#define RTIT_CTL_TOPA BIT(8)
#define RTIT_CTL_MTC_EN BIT(9)
#define RTIT_CTL_TSC_EN BIT(10)
#define RTIT_CTL_DISRETC BIT(11)
+#define RTIT_CTL_PTW_EN BIT(12)
#define RTIT_CTL_BRANCH_EN BIT(13)
#define RTIT_CTL_MTC_RANGE_OFFSET 14
#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET)
@@ -91,6 +94,8 @@ enum pt_capabilities {
PT_CAP_psb_cyc,
PT_CAP_ip_filtering,
PT_CAP_mtc,
+ PT_CAP_ptwrite,
+ PT_CAP_power_event_trace,
PT_CAP_topa_output,
PT_CAP_topa_multiple_entries,
PT_CAP_single_range_output,
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index 28865938aadf..b0f0e835a770 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -357,6 +357,8 @@ static int rapl_pmu_event_init(struct perf_event *event)
if (event->cpu < 0)
return -EINVAL;
+ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+
/*
* check event is known (determines counter)
*/
@@ -765,6 +767,8 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, skl_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, hsx_rapl_init),
+
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init),
{},
};
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 463dc7a5a6c3..d9844cc74486 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -664,6 +664,8 @@ static int uncore_pmu_event_init(struct perf_event *event)
event->cpu = box->cpu;
event->pmu_private = box;
+ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+
event->hw.idx = -1;
event->hw.last_tag = ~0ULL;
event->hw.extra_reg.idx = EXTRA_REG_NONE;
@@ -683,7 +685,8 @@ static int uncore_pmu_event_init(struct perf_event *event)
/* fixed counters have event field hardcoded to zero */
hwc->config = 0ULL;
} else {
- hwc->config = event->attr.config & pmu->type->event_mask;
+ hwc->config = event->attr.config &
+ (pmu->type->event_mask | ((u64)pmu->type->event_mask_ext << 32));
if (pmu->type->ops->hw_config) {
ret = pmu->type->ops->hw_config(box, event);
if (ret)
@@ -1321,6 +1324,11 @@ static const struct intel_uncore_init_fun skl_uncore_init __initconst = {
.pci_init = skl_uncore_pci_init,
};
+static const struct intel_uncore_init_fun skx_uncore_init __initconst = {
+ .cpu_init = skx_uncore_cpu_init,
+ .pci_init = skx_uncore_pci_init,
+};
+
static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP, nhm_uncore_init),
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM, nhm_uncore_init),
@@ -1343,6 +1351,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_uncore_init),
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP,skl_uncore_init),
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, skx_uncore_init),
{},
};
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 78b9c23e2d8d..ad986c1e29bc 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -44,6 +44,7 @@ struct intel_uncore_type {
unsigned perf_ctr;
unsigned event_ctl;
unsigned event_mask;
+ unsigned event_mask_ext;
unsigned fixed_ctr;
unsigned fixed_ctl;
unsigned box_ctl;
@@ -120,6 +121,7 @@ struct intel_uncore_box {
};
#define UNCORE_BOX_FLAG_INITIATED 0
+#define UNCORE_BOX_FLAG_CTL_OFFS8 1 /* event config registers are 8-byte apart */
struct uncore_event_desc {
struct kobj_attribute attr;
@@ -172,6 +174,9 @@ static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
static inline
unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
{
+ if (test_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags))
+ return idx * 8 + box->pmu->type->event_ctl;
+
return idx * 4 + box->pmu->type->event_ctl;
}
@@ -377,6 +382,8 @@ int bdx_uncore_pci_init(void);
void bdx_uncore_cpu_init(void);
int knl_uncore_pci_init(void);
void knl_uncore_cpu_init(void);
+int skx_uncore_pci_init(void);
+void skx_uncore_cpu_init(void);
/* perf_event_intel_uncore_nhmex.c */
void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 97a69dbba649..5f845eef9a4d 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -100,6 +100,12 @@ static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
}
}
+static void snb_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
+ SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
+}
+
static void snb_uncore_msr_exit_box(struct intel_uncore_box *box)
{
if (box->pmu->pmu_idx == 0)
@@ -127,6 +133,7 @@ static struct attribute_group snb_uncore_format_group = {
static struct intel_uncore_ops snb_uncore_msr_ops = {
.init_box = snb_uncore_msr_init_box,
+ .enable_box = snb_uncore_msr_enable_box,
.exit_box = snb_uncore_msr_exit_box,
.disable_event = snb_uncore_msr_disable_event,
.enable_event = snb_uncore_msr_enable_event,
@@ -192,6 +199,12 @@ static void skl_uncore_msr_init_box(struct intel_uncore_box *box)
}
}
+static void skl_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ wrmsrl(SKL_UNC_PERF_GLOBAL_CTL,
+ SNB_UNC_GLOBAL_CTL_EN | SKL_UNC_GLOBAL_CTL_CORE_ALL);
+}
+
static void skl_uncore_msr_exit_box(struct intel_uncore_box *box)
{
if (box->pmu->pmu_idx == 0)
@@ -200,6 +213,7 @@ static void skl_uncore_msr_exit_box(struct intel_uncore_box *box)
static struct intel_uncore_ops skl_uncore_msr_ops = {
.init_box = skl_uncore_msr_init_box,
+ .enable_box = skl_uncore_msr_enable_box,
.exit_box = skl_uncore_msr_exit_box,
.disable_event = snb_uncore_msr_disable_event,
.enable_event = snb_uncore_msr_enable_event,
@@ -374,6 +388,8 @@ static int snb_uncore_imc_event_init(struct perf_event *event)
event->cpu = box->cpu;
event->pmu_private = box;
+ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+
event->hw.idx = -1;
event->hw.last_tag = ~0ULL;
event->hw.extra_reg.idx = EXTRA_REG_NONE;
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 824e54086e07..272427700d48 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1,6 +1,10 @@
/* SandyBridge-EP/IvyTown uncore support */
#include "uncore.h"
+/* SNB-EP pci bus to socket mapping */
+#define SNBEP_CPUNODEID 0x40
+#define SNBEP_GIDNIDMAP 0x54
+
/* SNB-EP Box level control */
#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0)
#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1)
@@ -264,15 +268,72 @@
SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+/* SKX pci bus to socket mapping */
+#define SKX_CPUNODEID 0xc0
+#define SKX_GIDNIDMAP 0xd4
+
+/* SKX CHA */
+#define SKX_CHA_MSR_PMON_BOX_FILTER_TID (0x1ffULL << 0)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_LINK (0xfULL << 9)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_STATE (0x3ffULL << 17)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_REM (0x1ULL << 32)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_LOC (0x1ULL << 33)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_ALL_OPC (0x1ULL << 35)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_NM (0x1ULL << 36)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_NOT_NM (0x1ULL << 37)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_OPC0 (0x3ffULL << 41)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_OPC1 (0x3ffULL << 51)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63)
+
+/* SKX IIO */
+#define SKX_IIO0_MSR_PMON_CTL0 0xa48
+#define SKX_IIO0_MSR_PMON_CTR0 0xa41
+#define SKX_IIO0_MSR_PMON_BOX_CTL 0xa40
+#define SKX_IIO_MSR_OFFSET 0x20
+
+#define SKX_PMON_CTL_TRESH_MASK (0xff << 24)
+#define SKX_PMON_CTL_TRESH_MASK_EXT (0xf)
+#define SKX_PMON_CTL_CH_MASK (0xff << 4)
+#define SKX_PMON_CTL_FC_MASK (0x7 << 12)
+#define SKX_IIO_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_INVERT | \
+ SKX_PMON_CTL_TRESH_MASK)
+#define SKX_IIO_PMON_RAW_EVENT_MASK_EXT (SKX_PMON_CTL_TRESH_MASK_EXT | \
+ SKX_PMON_CTL_CH_MASK | \
+ SKX_PMON_CTL_FC_MASK)
+
+/* SKX IRP */
+#define SKX_IRP0_MSR_PMON_CTL0 0xa5b
+#define SKX_IRP0_MSR_PMON_CTR0 0xa59
+#define SKX_IRP0_MSR_PMON_BOX_CTL 0xa58
+#define SKX_IRP_MSR_OFFSET 0x20
+
+/* SKX UPI */
+#define SKX_UPI_PCI_PMON_CTL0 0x350
+#define SKX_UPI_PCI_PMON_CTR0 0x318
+#define SKX_UPI_PCI_PMON_BOX_CTL 0x378
+#define SKX_PMON_CTL_UMASK_EXT 0xff
+
+/* SKX M2M */
+#define SKX_M2M_PCI_PMON_CTL0 0x228
+#define SKX_M2M_PCI_PMON_CTR0 0x200
+#define SKX_M2M_PCI_PMON_BOX_CTL 0x258
+
DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7");
DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-39");
DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh9, thresh, "config:24-35");
DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29");
DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
@@ -280,6 +341,8 @@ DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31");
+DEFINE_UNCORE_FORMAT_ATTR(ch_mask, ch_mask, "config:36-43");
+DEFINE_UNCORE_FORMAT_ATTR(fc_mask, fc_mask, "config:44-46");
DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0");
DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5");
@@ -288,18 +351,26 @@ DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5");
DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8");
DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link4, filter_link, "config1:9-12");
DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23");
DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state5, filter_state, "config1:17-26");
+DEFINE_UNCORE_FORMAT_ATTR(filter_rem, filter_rem, "config1:32");
+DEFINE_UNCORE_FORMAT_ATTR(filter_loc, filter_loc, "config1:33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nm, filter_nm, "config1:36");
+DEFINE_UNCORE_FORMAT_ATTR(filter_not_nm, filter_not_nm, "config1:37");
DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33");
DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35");
DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37");
DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc_0, filter_opc0, "config1:41-50");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc_1, filter_opc1, "config1:51-60");
DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62");
DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61");
DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63");
@@ -1153,7 +1224,7 @@ static struct pci_driver snbep_uncore_pci_driver = {
/*
* build pci bus to socket mapping
*/
-static int snbep_pci2phy_map_init(int devid)
+static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse)
{
struct pci_dev *ubox_dev = NULL;
int i, bus, nodeid, segment;
@@ -1168,12 +1239,12 @@ static int snbep_pci2phy_map_init(int devid)
break;
bus = ubox_dev->bus->number;
/* get the Node ID of the local register */
- err = pci_read_config_dword(ubox_dev, 0x40, &config);
+ err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
if (err)
break;
nodeid = config;
/* get the Node ID mapping */
- err = pci_read_config_dword(ubox_dev, 0x54, &config);
+ err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
if (err)
break;
@@ -1207,11 +1278,20 @@ static int snbep_pci2phy_map_init(int devid)
raw_spin_lock(&pci2phy_map_lock);
list_for_each_entry(map, &pci2phy_map_head, list) {
i = -1;
- for (bus = 255; bus >= 0; bus--) {
- if (map->pbus_to_physid[bus] >= 0)
- i = map->pbus_to_physid[bus];
- else
- map->pbus_to_physid[bus] = i;
+ if (reverse) {
+ for (bus = 255; bus >= 0; bus--) {
+ if (map->pbus_to_physid[bus] >= 0)
+ i = map->pbus_to_physid[bus];
+ else
+ map->pbus_to_physid[bus] = i;
+ }
+ } else {
+ for (bus = 0; bus <= 255; bus++) {
+ if (map->pbus_to_physid[bus] >= 0)
+ i = map->pbus_to_physid[bus];
+ else
+ map->pbus_to_physid[bus] = i;
+ }
}
}
raw_spin_unlock(&pci2phy_map_lock);
@@ -1224,7 +1304,7 @@ static int snbep_pci2phy_map_init(int devid)
int snbep_uncore_pci_init(void)
{
- int ret = snbep_pci2phy_map_init(0x3ce0);
+ int ret = snbep_pci2phy_map_init(0x3ce0, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
if (ret)
return ret;
uncore_pci_uncores = snbep_pci_uncores;
@@ -1788,7 +1868,7 @@ static struct pci_driver ivbep_uncore_pci_driver = {
int ivbep_uncore_pci_init(void)
{
- int ret = snbep_pci2phy_map_init(0x0e1e);
+ int ret = snbep_pci2phy_map_init(0x0e1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
if (ret)
return ret;
uncore_pci_uncores = ivbep_pci_uncores;
@@ -2626,7 +2706,7 @@ void hswep_uncore_cpu_init(void)
static struct intel_uncore_type hswep_uncore_ha = {
.name = "ha",
- .num_counters = 5,
+ .num_counters = 4,
.num_boxes = 2,
.perf_ctr_bits = 48,
SNBEP_UNCORE_PCI_COMMON_INIT(),
@@ -2645,7 +2725,7 @@ static struct uncore_event_desc hswep_uncore_imc_events[] = {
static struct intel_uncore_type hswep_uncore_imc = {
.name = "imc",
- .num_counters = 5,
+ .num_counters = 4,
.num_boxes = 8,
.perf_ctr_bits = 48,
.fixed_ctr_bits = 48,
@@ -2691,7 +2771,7 @@ static struct intel_uncore_type hswep_uncore_irp = {
static struct intel_uncore_type hswep_uncore_qpi = {
.name = "qpi",
- .num_counters = 5,
+ .num_counters = 4,
.num_boxes = 3,
.perf_ctr_bits = 48,
.perf_ctr = SNBEP_PCI_PMON_CTR0,
@@ -2773,7 +2853,7 @@ static struct event_constraint hswep_uncore_r3qpi_constraints[] = {
static struct intel_uncore_type hswep_uncore_r3qpi = {
.name = "r3qpi",
- .num_counters = 4,
+ .num_counters = 3,
.num_boxes = 3,
.perf_ctr_bits = 44,
.constraints = hswep_uncore_r3qpi_constraints,
@@ -2897,7 +2977,7 @@ static struct pci_driver hswep_uncore_pci_driver = {
int hswep_uncore_pci_init(void)
{
- int ret = snbep_pci2phy_map_init(0x2f1e);
+ int ret = snbep_pci2phy_map_init(0x2f1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
if (ret)
return ret;
uncore_pci_uncores = hswep_pci_uncores;
@@ -2972,7 +3052,7 @@ static struct intel_uncore_type bdx_uncore_ha = {
static struct intel_uncore_type bdx_uncore_imc = {
.name = "imc",
- .num_counters = 5,
+ .num_counters = 4,
.num_boxes = 8,
.perf_ctr_bits = 48,
.fixed_ctr_bits = 48,
@@ -3186,7 +3266,7 @@ static struct pci_driver bdx_uncore_pci_driver = {
int bdx_uncore_pci_init(void)
{
- int ret = snbep_pci2phy_map_init(0x6f1e);
+ int ret = snbep_pci2phy_map_init(0x6f1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
if (ret)
return ret;
@@ -3196,3 +3276,525 @@ int bdx_uncore_pci_init(void)
}
/* end of BDX uncore support */
+
+/* SKX uncore support */
+
+static struct intel_uncore_type skx_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = HSWEP_U_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_U_MSR_PMON_CTL0,
+ .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_ubox_format_group,
+};
+
+static struct attribute *skx_uncore_cha_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid4.attr,
+ &format_attr_filter_link4.attr,
+ &format_attr_filter_state5.attr,
+ &format_attr_filter_rem.attr,
+ &format_attr_filter_loc.attr,
+ &format_attr_filter_nm.attr,
+ &format_attr_filter_all_op.attr,
+ &format_attr_filter_not_nm.attr,
+ &format_attr_filter_opc_0.attr,
+ &format_attr_filter_opc_1.attr,
+ &format_attr_filter_nc.attr,
+ &format_attr_filter_c6.attr,
+ &format_attr_filter_isoc.attr,
+ NULL,
+};
+
+static struct attribute_group skx_uncore_chabox_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_cha_formats_attr,
+};
+
+static struct event_constraint skx_uncore_chabox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg skx_uncore_cha_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8134, 0xffff, 0x4),
+};
+
+static u64 skx_cha_filter_mask(int fields)
+{
+ u64 mask = 0;
+
+ if (fields & 0x1)
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LINK;
+ if (fields & 0x4)
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_STATE;
+ return mask;
+}
+
+static struct event_constraint *
+skx_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, skx_cha_filter_mask);
+}
+
+static int skx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+
+ for (er = skx_uncore_cha_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+ HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & skx_cha_filter_mask(idx);
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
+static struct intel_uncore_ops skx_uncore_chabox_ops = {
+ /* There is no frz_en for chabox ctl */
+ .init_box = ivbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = hswep_cbox_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = skx_cha_hw_config,
+ .get_constraint = skx_cha_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type skx_uncore_chabox = {
+ .name = "cha",
+ .num_counters = 4,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
+ .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = skx_uncore_chabox_constraints,
+ .ops = &skx_uncore_chabox_ops,
+ .format_group = &skx_uncore_chabox_format_group,
+};
+
+static struct attribute *skx_uncore_iio_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh9.attr,
+ &format_attr_ch_mask.attr,
+ &format_attr_fc_mask.attr,
+ NULL,
+};
+
+static struct attribute_group skx_uncore_iio_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_iio_formats_attr,
+};
+
+static struct event_constraint skx_uncore_iio_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x83, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x88, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x95, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xc5, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd4, 0xc),
+ EVENT_CONSTRAINT_END
+};
+
+static void skx_iio_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops skx_uncore_iio_ops = {
+ .init_box = ivbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = skx_iio_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type skx_uncore_iio = {
+ .name = "iio",
+ .num_counters = 4,
+ .num_boxes = 5,
+ .perf_ctr_bits = 48,
+ .event_ctl = SKX_IIO0_MSR_PMON_CTL0,
+ .perf_ctr = SKX_IIO0_MSR_PMON_CTR0,
+ .event_mask = SKX_IIO_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SKX_IIO_PMON_RAW_EVENT_MASK_EXT,
+ .box_ctl = SKX_IIO0_MSR_PMON_BOX_CTL,
+ .msr_offset = SKX_IIO_MSR_OFFSET,
+ .constraints = skx_uncore_iio_constraints,
+ .ops = &skx_uncore_iio_ops,
+ .format_group = &skx_uncore_iio_format_group,
+};
+
+static struct attribute *skx_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute_group skx_uncore_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_formats_attr,
+};
+
+static struct intel_uncore_type skx_uncore_irp = {
+ .name = "irp",
+ .num_counters = 2,
+ .num_boxes = 5,
+ .perf_ctr_bits = 48,
+ .event_ctl = SKX_IRP0_MSR_PMON_CTL0,
+ .perf_ctr = SKX_IRP0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SKX_IRP0_MSR_PMON_BOX_CTL,
+ .msr_offset = SKX_IRP_MSR_OFFSET,
+ .ops = &skx_uncore_iio_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct intel_uncore_ops skx_uncore_pcu_ops = {
+ IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = hswep_pcu_hw_config,
+ .get_constraint = snbep_pcu_get_constraint,
+ .put_constraint = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type skx_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_PCU_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &skx_uncore_pcu_ops,
+ .format_group = &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *skx_msr_uncores[] = {
+ &skx_uncore_ubox,
+ &skx_uncore_chabox,
+ &skx_uncore_iio,
+ &skx_uncore_irp,
+ &skx_uncore_pcu,
+ NULL,
+};
+
+static int skx_count_chabox(void)
+{
+ struct pci_dev *chabox_dev = NULL;
+ int bus, count = 0;
+
+ while (1) {
+ chabox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x208d, chabox_dev);
+ if (!chabox_dev)
+ break;
+ if (count == 0)
+ bus = chabox_dev->bus->number;
+ if (bus != chabox_dev->bus->number)
+ break;
+ count++;
+ }
+
+ pci_dev_put(chabox_dev);
+ return count;
+}
+
+void skx_uncore_cpu_init(void)
+{
+ skx_uncore_chabox.num_boxes = skx_count_chabox();
+ uncore_msr_uncores = skx_msr_uncores;
+}
+
+static struct intel_uncore_type skx_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = hswep_uncore_imc_events,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivbep_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct attribute *skx_upi_uncore_formats_attr[] = {
+ &format_attr_event_ext.attr,
+ &format_attr_umask_ext.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute_group skx_upi_uncore_format_group = {
+ .name = "format",
+ .attrs = skx_upi_uncore_formats_attr,
+};
+
+static void skx_upi_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+
+ __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags);
+ pci_write_config_dword(pdev, SKX_UPI_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static struct intel_uncore_ops skx_upi_uncore_pci_ops = {
+ .init_box = skx_upi_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_uncore_pci_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+};
+
+static struct intel_uncore_type skx_uncore_upi = {
+ .name = "upi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SKX_UPI_PCI_PMON_CTR0,
+ .event_ctl = SKX_UPI_PCI_PMON_CTL0,
+ .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SKX_PMON_CTL_UMASK_EXT,
+ .box_ctl = SKX_UPI_PCI_PMON_BOX_CTL,
+ .ops = &skx_upi_uncore_pci_ops,
+ .format_group = &skx_upi_uncore_format_group,
+};
+
+static void skx_m2m_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+
+ __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags);
+ pci_write_config_dword(pdev, SKX_M2M_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static struct intel_uncore_ops skx_m2m_uncore_pci_ops = {
+ .init_box = skx_m2m_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_uncore_pci_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+};
+
+static struct intel_uncore_type skx_uncore_m2m = {
+ .name = "m2m",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SKX_M2M_PCI_PMON_CTR0,
+ .event_ctl = SKX_M2M_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SKX_M2M_PCI_PMON_BOX_CTL,
+ .ops = &skx_m2m_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct event_constraint skx_uncore_m2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type skx_uncore_m2pcie = {
+ .name = "m2pcie",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 48,
+ .constraints = skx_uncore_m2pcie_constraints,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivbep_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct event_constraint skx_uncore_m3upi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x1d, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x1e, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x40, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x4e, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x4f, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x50, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x51, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x52, 0x7),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type skx_uncore_m3upi = {
+ .name = "m3upi",
+ .num_counters = 3,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .constraints = skx_uncore_m3upi_constraints,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivbep_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+enum {
+ SKX_PCI_UNCORE_IMC,
+ SKX_PCI_UNCORE_M2M,
+ SKX_PCI_UNCORE_UPI,
+ SKX_PCI_UNCORE_M2PCIE,
+ SKX_PCI_UNCORE_M3UPI,
+};
+
+static struct intel_uncore_type *skx_pci_uncores[] = {
+ [SKX_PCI_UNCORE_IMC] = &skx_uncore_imc,
+ [SKX_PCI_UNCORE_M2M] = &skx_uncore_m2m,
+ [SKX_PCI_UNCORE_UPI] = &skx_uncore_upi,
+ [SKX_PCI_UNCORE_M2PCIE] = &skx_uncore_m2pcie,
+ [SKX_PCI_UNCORE_M3UPI] = &skx_uncore_m3upi,
+ NULL,
+};
+
+static const struct pci_device_id skx_uncore_pci_ids[] = {
+ { /* MC0 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2042),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 2, SKX_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC0 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2046),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 6, SKX_PCI_UNCORE_IMC, 1),
+ },
+ { /* MC0 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(11, 2, SKX_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC1 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2042),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 2, SKX_PCI_UNCORE_IMC, 3),
+ },
+ { /* MC1 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2046),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 6, SKX_PCI_UNCORE_IMC, 4),
+ },
+ { /* MC1 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(13, 2, SKX_PCI_UNCORE_IMC, 5),
+ },
+ { /* M2M0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2066),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 0, SKX_PCI_UNCORE_M2M, 0),
+ },
+ { /* M2M1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2066),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 0, SKX_PCI_UNCORE_M2M, 1),
+ },
+ { /* UPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(14, 0, SKX_PCI_UNCORE_UPI, 0),
+ },
+ { /* UPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(15, 0, SKX_PCI_UNCORE_UPI, 1),
+ },
+ { /* UPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(16, 0, SKX_PCI_UNCORE_UPI, 2),
+ },
+ { /* M2PCIe 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 1, SKX_PCI_UNCORE_M2PCIE, 0),
+ },
+ { /* M2PCIe 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(22, 1, SKX_PCI_UNCORE_M2PCIE, 1),
+ },
+ { /* M2PCIe 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(23, 1, SKX_PCI_UNCORE_M2PCIE, 2),
+ },
+ { /* M2PCIe 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 5, SKX_PCI_UNCORE_M2PCIE, 3),
+ },
+ { /* M3UPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 0, SKX_PCI_UNCORE_M3UPI, 0),
+ },
+ { /* M3UPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 1, SKX_PCI_UNCORE_M3UPI, 1),
+ },
+ { /* M3UPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 4, SKX_PCI_UNCORE_M3UPI, 2),
+ },
+ { /* end: all zeroes */ }
+};
+
+
+static struct pci_driver skx_uncore_pci_driver = {
+ .name = "skx_uncore",
+ .id_table = skx_uncore_pci_ids,
+};
+
+int skx_uncore_pci_init(void)
+{
+ /* need to double check pci address */
+ int ret = snbep_pci2phy_map_init(0x2014, SKX_CPUNODEID, SKX_GIDNIDMAP, false);
+
+ if (ret)
+ return ret;
+
+ uncore_pci_uncores = skx_pci_uncores;
+ uncore_pci_driver = &skx_uncore_pci_driver;
+ return 0;
+}
+
+/* end of SKX uncore support */
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 8c4a47706296..5874d8de1f8d 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -194,12 +194,13 @@ struct cpu_hw_events {
*/
struct debug_store *ds;
u64 pebs_enabled;
+ int n_pebs;
+ int n_large_pebs;
/*
* Intel LBR bits
*/
int lbr_users;
- void *lbr_context;
struct perf_branch_stack lbr_stack;
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
struct er_account *lbr_sel;
@@ -508,6 +509,8 @@ struct x86_pmu {
void (*enable_all)(int added);
void (*enable)(struct perf_event *);
void (*disable)(struct perf_event *);
+ void (*add)(struct perf_event *);
+ void (*del)(struct perf_event *);
int (*hw_config)(struct perf_event *event);
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
unsigned eventsel;
@@ -888,6 +891,10 @@ extern struct event_constraint intel_skl_pebs_event_constraints[];
struct event_constraint *intel_pebs_constraints(struct perf_event *event);
+void intel_pmu_pebs_add(struct perf_event *event);
+
+void intel_pmu_pebs_del(struct perf_event *event);
+
void intel_pmu_pebs_enable(struct perf_event *event);
void intel_pmu_pebs_disable(struct perf_event *event);
@@ -906,9 +913,9 @@ u64 lbr_from_signext_quirk_wr(u64 val);
void intel_pmu_lbr_reset(void);
-void intel_pmu_lbr_enable(struct perf_event *event);
+void intel_pmu_lbr_add(struct perf_event *event);
-void intel_pmu_lbr_disable(struct perf_event *event);
+void intel_pmu_lbr_del(struct perf_event *event);
void intel_pmu_lbr_enable_all(bool pmi);
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 2f29f4e407c3..cb13c0564ea7 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -378,7 +378,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode);
} put_user_catch(err);
- err |= copy_siginfo_to_user32(&frame->info, &ksig->info);
+ err |= __copy_siginfo_to_user32(&frame->info, &ksig->info, false);
err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
regs, set->sig[0]);
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index e77a6443104f..1b020381ab38 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -217,10 +217,14 @@ static inline int alternatives_text_reserved(void *start, void *end)
*/
#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \
output, input...) \
+{ \
+ register void *__sp asm(_ASM_SP); \
asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\
"call %P[new2]", feature2) \
- : output : [old] "i" (oldfunc), [new1] "i" (newfunc1), \
- [new2] "i" (newfunc2), ## input)
+ : output, "+r" (__sp) \
+ : [old] "i" (oldfunc), [new1] "i" (newfunc1), \
+ [new2] "i" (newfunc2), ## input); \
+}
/*
* use this macro(s) if you need more than one output parameter
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index f5befd4945f2..f5aaf6c83222 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -135,6 +135,7 @@ extern void init_apic_mappings(void);
void register_lapic_address(unsigned long address);
extern void setup_boot_APIC_clock(void);
extern void setup_secondary_APIC_clock(void);
+extern void lapic_update_tsc_freq(void);
extern int APIC_init_uniprocessor(void);
#ifdef CONFIG_X86_64
@@ -170,6 +171,7 @@ static inline void init_apic_mappings(void) { }
static inline void disable_local_APIC(void) { }
# define setup_boot_APIC_clock x86_init_noop
# define setup_secondary_APIC_clock x86_init_noop
+static inline void lapic_update_tsc_freq(void) { }
#endif /* !CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_X2APIC
@@ -648,8 +650,8 @@ static inline void entering_ack_irq(void)
static inline void ipi_entering_ack_irq(void)
{
- ack_APIC_irq();
irq_enter();
+ ack_APIC_irq();
}
static inline void exiting_irq(void)
@@ -659,9 +661,8 @@ static inline void exiting_irq(void)
static inline void exiting_ack_irq(void)
{
- irq_exit();
- /* Ack only at the end to avoid potential reentry */
ack_APIC_irq();
+ irq_exit();
}
extern void ioapic_zap_locks(void);
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 9733361fed6f..97848cdfcb1a 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -158,53 +158,9 @@ extern void __add_wrong_size(void)
* value of "*ptr".
*
* xadd() is locked when multiple CPUs are online
- * xadd_sync() is always locked
- * xadd_local() is never locked
*/
#define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock)
#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX)
-#define xadd_sync(ptr, inc) __xadd((ptr), (inc), "lock; ")
-#define xadd_local(ptr, inc) __xadd((ptr), (inc), "")
-
-#define __add(ptr, inc, lock) \
- ({ \
- __typeof__ (*(ptr)) __ret = (inc); \
- switch (sizeof(*(ptr))) { \
- case __X86_CASE_B: \
- asm volatile (lock "addb %b1, %0\n" \
- : "+m" (*(ptr)) : "qi" (inc) \
- : "memory", "cc"); \
- break; \
- case __X86_CASE_W: \
- asm volatile (lock "addw %w1, %0\n" \
- : "+m" (*(ptr)) : "ri" (inc) \
- : "memory", "cc"); \
- break; \
- case __X86_CASE_L: \
- asm volatile (lock "addl %1, %0\n" \
- : "+m" (*(ptr)) : "ri" (inc) \
- : "memory", "cc"); \
- break; \
- case __X86_CASE_Q: \
- asm volatile (lock "addq %1, %0\n" \
- : "+m" (*(ptr)) : "ri" (inc) \
- : "memory", "cc"); \
- break; \
- default: \
- __add_wrong_size(); \
- } \
- __ret; \
- })
-
-/*
- * add_*() adds "inc" to "*ptr"
- *
- * __add() takes a lock prefix
- * add_smp() is locked when multiple CPUs are online
- * add_sync() is always locked
- */
-#define add_smp(ptr, inc) __add((ptr), (inc), LOCK_PREFIX)
-#define add_sync(ptr, inc) __add((ptr), (inc), "lock; ")
#define __cmpxchg_double(pfx, p1, p2, o1, o2, n1, n2) \
({ \
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index a18806165fe4..03d269bed941 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -275,10 +275,10 @@ struct compat_shmid64_ds {
#ifdef CONFIG_X86_X32_ABI
typedef struct user_regs_struct compat_elf_gregset_t;
-#define PR_REG_SIZE(S) (test_thread_flag(TIF_IA32) ? 68 : 216)
-#define PRSTATUS_SIZE(S) (test_thread_flag(TIF_IA32) ? 144 : 296)
-#define SET_PR_FPVALID(S,V) \
- do { *(int *) (((void *) &((S)->pr_reg)) + PR_REG_SIZE(0)) = (V); } \
+/* Full regset -- prstatus on x32, otherwise on ia32 */
+#define PRSTATUS_SIZE(S, R) (R != sizeof(S.pr_reg) ? 144 : 296)
+#define SET_PR_FPVALID(S, V, R) \
+ do { *(int *) (((void *) &((S)->pr_reg)) + R) = (V); } \
while (0)
#define COMPAT_USE_64BIT_TIME \
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 92a8308b96f6..1188bc849ee3 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -106,7 +106,6 @@
#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
-#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 4e10d73cf018..12080d87da3b 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -36,7 +36,7 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
extern struct desc_ptr idt_descr;
extern gate_desc idt_table[];
-extern struct desc_ptr debug_idt_descr;
+extern const struct desc_ptr debug_idt_descr;
extern gate_desc debug_idt_table[];
struct gdt_page {
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 3ab0537872fb..476b574de99e 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -10,8 +10,8 @@
#include <uapi/asm/e820.h>
#ifndef __ASSEMBLY__
/* see comment in arch/x86/kernel/e820.c */
-extern struct e820map e820;
-extern struct e820map e820_saved;
+extern struct e820map *e820;
+extern struct e820map *e820_saved;
extern unsigned long pci_mem_start;
extern int e820_any_mapped(u64 start, u64 end, unsigned type);
@@ -53,6 +53,8 @@ extern void e820_reserve_resources_late(void);
extern void setup_memory_map(void);
extern char *default_machine_specific_memory_setup(void);
+extern void e820_reallocate_tables(void);
+
/*
* Returns true iff the specified range [s,e) is completely contained inside
* the ISA region.
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index d0bb76d81402..389d700b961e 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -117,7 +117,6 @@ extern int __init efi_memblock_x86_reserve_range(void);
extern pgd_t * __init efi_call_phys_prolog(void);
extern void __init efi_call_phys_epilog(pgd_t *save_pgd);
extern void __init efi_print_memmap(void);
-extern void __init efi_unmap_memmap(void);
extern void __init efi_memory_uc(u64 addr, unsigned long size);
extern void __init efi_map_region(efi_memory_desc_t *md);
extern void __init efi_map_region_fixed(efi_memory_desc_t *md);
@@ -192,14 +191,7 @@ static inline efi_status_t efi_thunk_set_virtual_address_map(
struct efi_config {
u64 image_handle;
u64 table;
- u64 allocate_pool;
- u64 allocate_pages;
- u64 get_memory_map;
- u64 free_pool;
- u64 free_pages;
- u64 locate_handle;
- u64 handle_protocol;
- u64 exit_boot_services;
+ u64 boot_services;
u64 text_output;
efi_status_t (*call)(unsigned long, ...);
bool is64;
@@ -207,14 +199,27 @@ struct efi_config {
__pure const struct efi_config *__efi_early(void);
+static inline bool efi_is_64bit(void)
+{
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return false;
+
+ if (!IS_ENABLED(CONFIG_EFI_MIXED))
+ return true;
+
+ return __efi_early()->is64;
+}
+
#define efi_call_early(f, ...) \
- __efi_early()->call(__efi_early()->f, __VA_ARGS__);
+ __efi_early()->call(efi_is_64bit() ? \
+ ((efi_boot_services_64_t *)(unsigned long) \
+ __efi_early()->boot_services)->f : \
+ ((efi_boot_services_32_t *)(unsigned long) \
+ __efi_early()->boot_services)->f, __VA_ARGS__)
#define __efi_call_early(f, ...) \
__efi_early()->call((unsigned long)f, __VA_ARGS__);
-#define efi_is_64bit() __efi_early()->is64
-
extern bool efi_reboot_required(void);
#else
diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h
index 0e970d00dfcd..20a1fbf7fe4e 100644
--- a/arch/x86/include/asm/fpu/signal.h
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -19,6 +19,12 @@ int ia32_setup_frame(int sig, struct ksignal *ksig,
# define ia32_setup_rt_frame __setup_rt_frame
#endif
+#ifdef CONFIG_COMPAT
+int __copy_siginfo_to_user32(compat_siginfo_t __user *to,
+ const siginfo_t *from, bool x32_ABI);
+#endif
+
+
extern void convert_from_fxsr(struct user_i387_ia32_struct *env,
struct task_struct *tsk);
extern void convert_to_fxsr(struct task_struct *tsk,
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index ae55a43e09c0..d4957ac72b48 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -45,7 +45,8 @@
extern u64 xfeatures_mask;
extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
-extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
+extern void __init update_regset_xstate_info(unsigned int size,
+ u64 xstate_mask);
void fpu__xstate_clear_all_cpu_caps(void);
void *get_xsave_addr(struct xregs_state *xsave, int xstate);
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index a4820d4df617..eccd0ac6bc38 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -6,6 +6,7 @@
# define MCOUNT_ADDR ((unsigned long)(__fentry__))
#else
# define MCOUNT_ADDR ((unsigned long)(mcount))
+# define HAVE_FUNCTION_GRAPH_FP_TEST
#endif
#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
@@ -13,6 +14,8 @@
#define ARCH_SUPPORTS_FTRACE_OPS 1
#endif
+#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+
#ifndef __ASSEMBLY__
extern void mcount(void);
extern atomic_t modifying_ftrace_code;
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 7178043b0e1d..59405a248fc2 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -22,10 +22,6 @@ typedef struct {
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
- /*
- * irq_tlb_count is double-counted in irq_call_count, so it must be
- * subtracted from irq_call_count when displaying irq_call_count
- */
unsigned int irq_tlb_count;
#endif
#ifdef CONFIG_X86_THERMAL_VECTOR
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 055ea9941dd5..67942b6ad4b7 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -43,6 +43,9 @@ struct hypervisor_x86 {
/* X2APIC detection (run once per boot) */
bool (*x2apic_available)(void);
+
+ /* pin current vcpu to specified physical cpu (run rarely) */
+ void (*pin_vcpu)(int);
};
extern const struct hypervisor_x86 *x86_hyper;
@@ -56,6 +59,7 @@ extern const struct hypervisor_x86 x86_hyper_kvm;
extern void init_hypervisor(struct cpuinfo_x86 *c);
extern void init_hypervisor_platform(void);
extern bool hypervisor_x2apic_available(void);
+extern void hypervisor_pin_vcpu(int cpu);
#else
static inline void init_hypervisor(struct cpuinfo_x86 *c) { }
static inline void init_hypervisor_platform(void) { }
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 223042086f4e..737da62bfeb0 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -5,10 +5,10 @@ struct x86_mapping_info {
void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
void *context; /* context for alloc_pgt_page */
unsigned long pmd_flag; /* page flag for PMD entry */
- bool kernel_mapping; /* kernel mapping or ident mapping */
+ unsigned long offset; /* ident mapping offset */
};
int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
- unsigned long addr, unsigned long end);
+ unsigned long pstart, unsigned long pend);
#endif /* _ASM_X86_INIT_H */
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 627719475457..9ae5ab80a497 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -56,8 +56,8 @@
#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */
#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */
#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */
-#define INTEL_FAM6_ATOM_MERRIFIELD1 0x4A /* Tangier */
-#define INTEL_FAM6_ATOM_MERRIFIELD2 0x5A /* Annidale */
+#define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */
+#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Annidale */
#define INTEL_FAM6_ATOM_GOLDMONT 0x5C
#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */
diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
index 9d6b097aa73d..5b6753d1f7f4 100644
--- a/arch/x86/include/asm/intel-mid.h
+++ b/arch/x86/include/asm/intel-mid.h
@@ -18,6 +18,8 @@
extern int intel_mid_pci_init(void);
extern int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state);
+extern void intel_mid_pwr_power_off(void);
+
#define INTEL_MID_PWR_LSS_OFFSET 4
#define INTEL_MID_PWR_LSS_TYPE (1 << 7)
diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h
index 925b605eb5c6..4fb1d0abef95 100644
--- a/arch/x86/include/asm/intel_scu_ipc.h
+++ b/arch/x86/include/asm/intel_scu_ipc.h
@@ -3,6 +3,8 @@
#include <linux/notifier.h>
+#define IPCMSG_COLD_OFF 0x80 /* Only for Tangier */
+
#define IPCMSG_WARM_RESET 0xF0
#define IPCMSG_COLD_RESET 0xF1
#define IPCMSG_SOFT_RESET 0xF2
diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h
index 2674ee3de748..1052a797d71d 100644
--- a/arch/x86/include/asm/kaslr.h
+++ b/arch/x86/include/asm/kaslr.h
@@ -6,6 +6,7 @@ unsigned long kaslr_get_random_long(const char *purpose);
#ifdef CONFIG_RANDOMIZE_MEMORY
extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
+extern unsigned long vmemmap_base;
void kernel_randomize_memory(void);
#else
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 1ef9d581b5d9..d31881188431 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -24,8 +24,6 @@ enum die_val {
extern void printk_address(unsigned long address);
extern void die(const char *, struct pt_regs *,long);
extern int __must_check __die(const char *, struct pt_regs *, long);
-extern void show_trace(struct task_struct *t, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp);
extern void show_stack_regs(struct pt_regs *regs);
extern void __show_regs(struct pt_regs *regs, int all);
extern unsigned long oops_begin(void);
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 8bf766ef0e18..9bd7ff5ffbcc 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -40,9 +40,10 @@
#define MCI_STATUS_AR (1ULL<<55) /* Action required */
/* AMD-specific bits */
+#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
+#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. valid */
#define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */
#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
-#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
/*
* McaX field if set indicates a given bank supports MCA extensions:
@@ -110,6 +111,7 @@
#define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003
#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004
#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005
+#define MSR_AMD64_SMCA_MC0_SYND 0xc0002006
#define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008
#define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009
#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a
@@ -119,6 +121,7 @@
#define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_SYND(x) (MSR_AMD64_SMCA_MC0_SYND + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
@@ -334,44 +337,47 @@ extern void apei_mce_report_mem_error(int corrected,
* Scalable MCA.
*/
#ifdef CONFIG_X86_MCE_AMD
-enum amd_ip_types {
- SMCA_F17H_CORE = 0, /* Core errors */
- SMCA_DF, /* Data Fabric */
- SMCA_UMC, /* Unified Memory Controller */
- SMCA_PB, /* Parameter Block */
- SMCA_PSP, /* Platform Security Processor */
- SMCA_SMU, /* System Management Unit */
- N_AMD_IP_TYPES
-};
-
-struct amd_hwid {
- const char *name;
- unsigned int hwid;
-};
-
-extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES];
-enum amd_core_mca_blocks {
+/* These may be used by multiple smca_hwid_mcatypes */
+enum smca_bank_types {
SMCA_LS = 0, /* Load Store */
SMCA_IF, /* Instruction Fetch */
- SMCA_L2_CACHE, /* L2 cache */
- SMCA_DE, /* Decoder unit */
- RES, /* Reserved */
- SMCA_EX, /* Execution unit */
+ SMCA_L2_CACHE, /* L2 Cache */
+ SMCA_DE, /* Decoder Unit */
+ SMCA_EX, /* Execution Unit */
SMCA_FP, /* Floating Point */
- SMCA_L3_CACHE, /* L3 cache */
- N_CORE_MCA_BLOCKS
+ SMCA_L3_CACHE, /* L3 Cache */
+ SMCA_CS, /* Coherent Slave */
+ SMCA_PIE, /* Power, Interrupts, etc. */
+ SMCA_UMC, /* Unified Memory Controller */
+ SMCA_PB, /* Parameter Block */
+ SMCA_PSP, /* Platform Security Processor */
+ SMCA_SMU, /* System Management Unit */
+ N_SMCA_BANK_TYPES
};
-extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS];
+struct smca_bank_name {
+ const char *name; /* Short name for sysfs */
+ const char *long_name; /* Long name for pretty-printing */
+};
+
+extern struct smca_bank_name smca_bank_names[N_SMCA_BANK_TYPES];
+
+#define HWID_MCATYPE(hwid, mcatype) ((hwid << 16) | mcatype)
-enum amd_df_mca_blocks {
- SMCA_CS = 0, /* Coherent Slave */
- SMCA_PIE, /* Power management, Interrupts, etc */
- N_DF_BLOCKS
+struct smca_hwid_mcatype {
+ unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
+ u32 hwid_mcatype; /* (hwid,mcatype) tuple */
+ u32 xec_bitmap; /* Bitmap of valid ExtErrorCodes; current max is 21. */
};
-extern const char * const amd_df_mcablock_names[N_DF_BLOCKS];
+struct smca_bank_info {
+ struct smca_hwid_mcatype *type;
+ u32 type_instance;
+};
+
+extern struct smca_bank_info smca_banks[MAX_NR_BANKS];
+
#endif
#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index b07233b64578..32007041ef8c 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -6,7 +6,6 @@
#include <asm/x86_init.h>
#include <asm/apicdef.h>
-extern int apic_version[];
extern int pic_mode;
#ifdef CONFIG_X86_32
@@ -40,6 +39,7 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES];
extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
extern unsigned int boot_cpu_physical_apicid;
+extern u8 boot_cpu_apic_version;
extern unsigned long mp_lapic_addr;
#ifdef CONFIG_X86_LOCAL_APIC
@@ -86,6 +86,7 @@ static inline void early_reserve_e820_mpc_new(void) { }
#endif
int generic_processor_info(int apicid, int version);
+int __generic_processor_info(int apicid, int version, bool enabled);
#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 2970d22d7766..ce932812f142 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -80,10 +80,6 @@ static inline unsigned long __read_cr4(void)
{
return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
}
-static inline unsigned long __read_cr4_safe(void)
-{
- return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
-}
static inline void __write_cr4(unsigned long x)
{
@@ -661,8 +657,6 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
-#ifdef CONFIG_QUEUED_SPINLOCKS
-
static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
u32 val)
{
@@ -684,22 +678,6 @@ static __always_inline void pv_kick(int cpu)
PVOP_VCALL1(pv_lock_ops.kick, cpu);
}
-#else /* !CONFIG_QUEUED_SPINLOCKS */
-
-static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock,
- __ticket_t ticket)
-{
- PVOP_VCALLEE2(pv_lock_ops.lock_spinning, lock, ticket);
-}
-
-static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock,
- __ticket_t ticket)
-{
- PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket);
-}
-
-#endif /* CONFIG_QUEUED_SPINLOCKS */
-
#endif /* SMP && PARAVIRT_SPINLOCKS */
#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 7fa9e7740ba3..0f400c0e4979 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -108,7 +108,6 @@ struct pv_cpu_ops {
unsigned long (*read_cr0)(void);
void (*write_cr0)(unsigned long);
- unsigned long (*read_cr4_safe)(void);
unsigned long (*read_cr4)(void);
void (*write_cr4)(unsigned long);
@@ -301,23 +300,16 @@ struct pv_mmu_ops {
struct arch_spinlock;
#ifdef CONFIG_SMP
#include <asm/spinlock_types.h>
-#else
-typedef u16 __ticket_t;
#endif
struct qspinlock;
struct pv_lock_ops {
-#ifdef CONFIG_QUEUED_SPINLOCKS
void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val);
struct paravirt_callee_save queued_spin_unlock;
void (*wait)(u8 *ptr, u8 val);
void (*kick)(int cpu);
-#else /* !CONFIG_QUEUED_SPINLOCKS */
- struct paravirt_callee_save lock_spinning;
- void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket);
-#endif /* !CONFIG_QUEUED_SPINLOCKS */
};
/* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 7e8ec7ae10fa..1cc82ece9ac1 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -145,7 +145,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
*
* | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number
* | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names
- * | OFFSET (14->63) | TYPE (10-13) |0|X|X|X| X| X|X|X|0| <- swp entry
+ * | OFFSET (14->63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0| <- swp entry
*
* G (8) is aliased and used as a PROT_NONE indicator for
* !present ptes. We need to start storing swap entries above
@@ -156,7 +156,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
#define SWP_TYPE_BITS 5
/* Place the offset above the type: */
-#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS + 1)
+#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS)
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 6fdef9eef2d5..3a264200c62f 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -57,11 +57,13 @@ typedef struct { pteval_t pte; } pte_t;
#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
#define VMALLOC_SIZE_TB _AC(32, UL)
#define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
-#define VMEMMAP_START _AC(0xffffea0000000000, UL)
+#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
#ifdef CONFIG_RANDOMIZE_MEMORY
#define VMALLOC_START vmalloc_base
+#define VMEMMAP_START vmemmap_base
#else
#define VMALLOC_START __VMALLOC_BASE
+#define VMEMMAP_START __VMEMMAP_BASE
#endif /* CONFIG_RANDOMIZE_MEMORY */
#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index 643eba42d620..2c1ebeb4d737 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -46,10 +46,7 @@ static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n)
static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n)
{
- if (static_cpu_has(X86_FEATURE_MCE_RECOVERY))
- return memcpy_mcsafe(dst, src, n);
- memcpy(dst, src, n);
- return 0;
+ return memcpy_mcsafe(dst, src, n);
}
/**
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 63def9537a2d..984a7bf17f6a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -389,9 +389,9 @@ struct thread_struct {
unsigned short fsindex;
unsigned short gsindex;
#endif
-#ifdef CONFIG_X86_32
- unsigned long ip;
-#endif
+
+ u32 status; /* thread synchronous flags */
+
#ifdef CONFIG_X86_64
unsigned long fsbase;
unsigned long gsbase;
@@ -438,6 +438,15 @@ struct thread_struct {
};
/*
+ * Thread-synchronous status.
+ *
+ * This is different from the flags in that nobody else
+ * ever touches our thread-synchronous status, so we don't
+ * have to worry about atomic accesses.
+ */
+#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
+
+/*
* Set IOPL bits in EFLAGS from given mask
*/
static inline void native_set_iopl_mask(unsigned mask)
@@ -724,8 +733,6 @@ static inline void spin_lock_prefetch(const void *x)
.addr_limit = KERNEL_DS, \
}
-extern unsigned long thread_saved_pc(struct task_struct *tsk);
-
/*
* TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
* This is necessary to guarantee that the entire "struct pt_regs"
@@ -776,17 +783,13 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
.addr_limit = KERNEL_DS, \
}
-/*
- * Return saved PC of a blocked thread.
- * What is this good for? it will be always the scheduler or ret_from_fork.
- */
-#define thread_saved_pc(t) READ_ONCE_NOCHECK(*(unsigned long *)((t)->thread.sp - 8))
-
#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
extern unsigned long KSTK_ESP(struct task_struct *task);
#endif /* CONFIG_X86_64 */
+extern unsigned long thread_saved_pc(struct task_struct *tsk);
+
extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
unsigned long new_sp);
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 9c6b890d5e7a..230e1903acf0 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -44,9 +44,9 @@ struct trampoline_header {
extern struct real_mode_header *real_mode_header;
extern unsigned char real_mode_blob_end[];
-extern unsigned long init_rsp;
extern unsigned long initial_code;
extern unsigned long initial_gs;
+extern unsigned long initial_stack;
extern unsigned char real_mode_blob[];
extern unsigned char real_mode_relocs[];
@@ -58,7 +58,15 @@ extern unsigned char boot_gdt[];
extern unsigned char secondary_startup_64[];
#endif
+static inline size_t real_mode_size_needed(void)
+{
+ if (real_mode_header)
+ return 0; /* already allocated. */
+
+ return ALIGN(real_mode_blob_end - real_mode_blob, PAGE_SIZE);
+}
+
+void set_real_mode_mem(phys_addr_t mem, size_t size);
void reserve_real_mode(void);
-void setup_real_mode(void);
#endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index 8dbc762ad132..3d33a719f5c1 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -154,7 +154,7 @@ static inline bool __down_write_trylock(struct rw_semaphore *sem)
: "+m" (sem->count), "=&a" (tmp0), "=&r" (tmp1),
CC_OUT(e) (result)
: "er" (RWSEM_ACTIVE_WRITE_BIAS)
- : "memory", "cc");
+ : "memory");
return result;
}
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index dd1e7d6387ab..8af22be0fe61 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -23,6 +23,10 @@ typedef struct {
unsigned long sig[_NSIG_WORDS];
} sigset_t;
+/* non-uapi in-kernel SA_FLAGS for those indicates ABI for a signal frame */
+#define SA_IA32_ABI 0x02000000u
+#define SA_X32_ABI 0x01000000u
+
#ifndef CONFIG_COMPAT
typedef sigset_t compat_sigset_t;
#endif
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index ebd0c164cd4e..19980b36f394 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -39,9 +39,6 @@ DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid);
#endif
-/* Static state in head.S used to set up a CPU */
-extern unsigned long stack_start; /* Initial stack pointer address */
-
struct task_struct;
struct smp_ops {
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 587d7914ea4b..19a2224f9e16 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -59,22 +59,19 @@ static inline void native_write_cr3(unsigned long val)
static inline unsigned long native_read_cr4(void)
{
unsigned long val;
- asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
- return val;
-}
-
-static inline unsigned long native_read_cr4_safe(void)
-{
- unsigned long val;
- /* This could fault if %cr4 does not exist. In x86_64, a cr4 always
- * exists, so it will never fail. */
#ifdef CONFIG_X86_32
+ /*
+ * This could fault if CR4 does not exist. Non-existent CR4
+ * is functionally equivalent to CR4 == 0. Keep it simple and pretend
+ * that CR4 == 0 on CPUs that don't have CR4.
+ */
asm volatile("1: mov %%cr4, %0\n"
"2:\n"
_ASM_EXTABLE(1b, 2b)
: "=r" (val), "=m" (__force_order) : "0" (0));
#else
- val = native_read_cr4();
+ /* CR4 always exists on x86_64. */
+ asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
#endif
return val;
}
@@ -182,11 +179,6 @@ static inline unsigned long __read_cr4(void)
return native_read_cr4();
}
-static inline unsigned long __read_cr4_safe(void)
-{
- return native_read_cr4_safe();
-}
-
static inline void __write_cr4(unsigned long x)
{
native_write_cr4(x);
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index be0a05913b91..921bea7a2708 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -20,187 +20,13 @@
* (the type definitions are in asm/spinlock_types.h)
*/
-#ifdef CONFIG_X86_32
-# define LOCK_PTR_REG "a"
-#else
-# define LOCK_PTR_REG "D"
-#endif
-
-#if defined(CONFIG_X86_32) && (defined(CONFIG_X86_PPRO_FENCE))
-/*
- * On PPro SMP, we use a locked operation to unlock
- * (PPro errata 66, 92)
- */
-# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
-#else
-# define UNLOCK_LOCK_PREFIX
-#endif
-
/* How long a lock should spin before we consider blocking */
#define SPIN_THRESHOLD (1 << 15)
extern struct static_key paravirt_ticketlocks_enabled;
static __always_inline bool static_key_false(struct static_key *key);
-#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
-#else
-
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-
-static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
-{
- set_bit(0, (volatile unsigned long *)&lock->tickets.head);
-}
-
-#else /* !CONFIG_PARAVIRT_SPINLOCKS */
-static __always_inline void __ticket_lock_spinning(arch_spinlock_t *lock,
- __ticket_t ticket)
-{
-}
-static inline void __ticket_unlock_kick(arch_spinlock_t *lock,
- __ticket_t ticket)
-{
-}
-
-#endif /* CONFIG_PARAVIRT_SPINLOCKS */
-static inline int __tickets_equal(__ticket_t one, __ticket_t two)
-{
- return !((one ^ two) & ~TICKET_SLOWPATH_FLAG);
-}
-
-static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock,
- __ticket_t head)
-{
- if (head & TICKET_SLOWPATH_FLAG) {
- arch_spinlock_t old, new;
-
- old.tickets.head = head;
- new.tickets.head = head & ~TICKET_SLOWPATH_FLAG;
- old.tickets.tail = new.tickets.head + TICKET_LOCK_INC;
- new.tickets.tail = old.tickets.tail;
-
- /* try to clear slowpath flag when there are no contenders */
- cmpxchg(&lock->head_tail, old.head_tail, new.head_tail);
- }
-}
-
-static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
-{
- return __tickets_equal(lock.tickets.head, lock.tickets.tail);
-}
-
-/*
- * Ticket locks are conceptually two parts, one indicating the current head of
- * the queue, and the other indicating the current tail. The lock is acquired
- * by atomically noting the tail and incrementing it by one (thus adding
- * ourself to the queue and noting our position), then waiting until the head
- * becomes equal to the the initial value of the tail.
- *
- * We use an xadd covering *both* parts of the lock, to increment the tail and
- * also load the position of the head, which takes care of memory ordering
- * issues and should be optimal for the uncontended case. Note the tail must be
- * in the high part, because a wide xadd increment of the low part would carry
- * up and contaminate the high part.
- */
-static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
-{
- register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC };
-
- inc = xadd(&lock->tickets, inc);
- if (likely(inc.head == inc.tail))
- goto out;
-
- for (;;) {
- unsigned count = SPIN_THRESHOLD;
-
- do {
- inc.head = READ_ONCE(lock->tickets.head);
- if (__tickets_equal(inc.head, inc.tail))
- goto clear_slowpath;
- cpu_relax();
- } while (--count);
- __ticket_lock_spinning(lock, inc.tail);
- }
-clear_slowpath:
- __ticket_check_and_clear_slowpath(lock, inc.head);
-out:
- barrier(); /* make sure nothing creeps before the lock is taken */
-}
-
-static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
- arch_spinlock_t old, new;
-
- old.tickets = READ_ONCE(lock->tickets);
- if (!__tickets_equal(old.tickets.head, old.tickets.tail))
- return 0;
-
- new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT);
- new.head_tail &= ~TICKET_SLOWPATH_FLAG;
-
- /* cmpxchg is a full barrier, so nothing can move before it */
- return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
-}
-
-static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
- if (TICKET_SLOWPATH_FLAG &&
- static_key_false(&paravirt_ticketlocks_enabled)) {
- __ticket_t head;
-
- BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
-
- head = xadd(&lock->tickets.head, TICKET_LOCK_INC);
-
- if (unlikely(head & TICKET_SLOWPATH_FLAG)) {
- head &= ~TICKET_SLOWPATH_FLAG;
- __ticket_unlock_kick(lock, (head + TICKET_LOCK_INC));
- }
- } else
- __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX);
-}
-
-static inline int arch_spin_is_locked(arch_spinlock_t *lock)
-{
- struct __raw_tickets tmp = READ_ONCE(lock->tickets);
-
- return !__tickets_equal(tmp.tail, tmp.head);
-}
-
-static inline int arch_spin_is_contended(arch_spinlock_t *lock)
-{
- struct __raw_tickets tmp = READ_ONCE(lock->tickets);
-
- tmp.head &= ~TICKET_SLOWPATH_FLAG;
- return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
-}
-#define arch_spin_is_contended arch_spin_is_contended
-
-static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
- unsigned long flags)
-{
- arch_spin_lock(lock);
-}
-
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- __ticket_t head = READ_ONCE(lock->tickets.head);
-
- for (;;) {
- struct __raw_tickets tmp = READ_ONCE(lock->tickets);
- /*
- * We need to check "unlocked" in a loop, tmp.head == head
- * can be false positive because of overflow.
- */
- if (__tickets_equal(tmp.head, tmp.tail) ||
- !__tickets_equal(tmp.head, head))
- break;
-
- cpu_relax();
- }
-}
-#endif /* CONFIG_QUEUED_SPINLOCKS */
/*
* Read-write spinlocks, allowing multiple readers
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 65c3e37f879a..25311ebb446c 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -23,20 +23,7 @@ typedef u32 __ticketpair_t;
#define TICKET_SHIFT (sizeof(__ticket_t) * 8)
-#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm-generic/qspinlock_types.h>
-#else
-typedef struct arch_spinlock {
- union {
- __ticketpair_t head_tail;
- struct __raw_tickets {
- __ticket_t head, tail;
- } tickets;
- };
-} arch_spinlock_t;
-
-#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } }
-#endif /* CONFIG_QUEUED_SPINLOCKS */
#include <asm-generic/qrwlock_types.h>
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 0944218af9e2..37f2e0b377ad 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -8,86 +8,86 @@
#include <linux/uaccess.h>
#include <linux/ptrace.h>
+#include <asm/switch_to.h>
+
+enum stack_type {
+ STACK_TYPE_UNKNOWN,
+ STACK_TYPE_TASK,
+ STACK_TYPE_IRQ,
+ STACK_TYPE_SOFTIRQ,
+ STACK_TYPE_EXCEPTION,
+ STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
+};
-extern int kstack_depth_to_print;
-
-struct thread_info;
-struct stacktrace_ops;
-
-typedef unsigned long (*walk_stack_t)(struct task_struct *task,
- unsigned long *stack,
- unsigned long bp,
- const struct stacktrace_ops *ops,
- void *data,
- unsigned long *end,
- int *graph);
-
-extern unsigned long
-print_context_stack(struct task_struct *task,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end, int *graph);
-
-extern unsigned long
-print_context_stack_bp(struct task_struct *task,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end, int *graph);
-
-/* Generic stack tracer with callbacks */
-
-struct stacktrace_ops {
- int (*address)(void *data, unsigned long address, int reliable);
- /* On negative return stop dumping */
- int (*stack)(void *data, char *name);
- walk_stack_t walk_stack;
+struct stack_info {
+ enum stack_type type;
+ unsigned long *begin, *end, *next_sp;
};
-void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data);
+bool in_task_stack(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info);
+
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info, unsigned long *visit_mask);
+
+void stack_type_str(enum stack_type type, const char **begin,
+ const char **end);
+
+static inline bool on_stack(struct stack_info *info, void *addr, size_t len)
+{
+ void *begin = info->begin;
+ void *end = info->end;
+
+ return (info->type != STACK_TYPE_UNKNOWN &&
+ addr >= begin && addr < end &&
+ addr + len > begin && addr + len <= end);
+}
+
+extern int kstack_depth_to_print;
#ifdef CONFIG_X86_32
#define STACKSLOTS_PER_LINE 8
-#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
#else
#define STACKSLOTS_PER_LINE 4
-#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
#endif
#ifdef CONFIG_FRAME_POINTER
-static inline unsigned long
-stack_frame(struct task_struct *task, struct pt_regs *regs)
+static inline unsigned long *
+get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
{
- unsigned long bp;
-
if (regs)
- return regs->bp;
+ return (unsigned long *)regs->bp;
- if (task == current) {
- /* Grab bp right from our regs */
- get_bp(bp);
- return bp;
- }
+ if (task == current)
+ return __builtin_frame_address(0);
- /* bp is the last reg pushed by switch_to */
- return *(unsigned long *)task->thread.sp;
+ return (unsigned long *)((struct inactive_task_frame *)task->thread.sp)->bp;
}
#else
-static inline unsigned long
-stack_frame(struct task_struct *task, struct pt_regs *regs)
+static inline unsigned long *
+get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
{
- return 0;
+ return NULL;
+}
+#endif /* CONFIG_FRAME_POINTER */
+
+static inline unsigned long *
+get_stack_pointer(struct task_struct *task, struct pt_regs *regs)
+{
+ if (regs)
+ return (unsigned long *)kernel_stack_pointer(regs);
+
+ if (task == current)
+ return __builtin_frame_address(0);
+
+ return (unsigned long *)task->thread.sp;
}
-#endif
-extern void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl);
+void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, char *log_lvl);
-extern void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl);
+void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, char *log_lvl);
extern unsigned int code_bytes;
@@ -106,7 +106,7 @@ static inline unsigned long caller_frame_pointer(void)
{
struct stack_frame *frame;
- get_bp(frame);
+ frame = __builtin_frame_address(0);
#ifdef CONFIG_FRAME_POINTER
frame = frame->next_frame;
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 90dbbd9666d4..a164862d77e3 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -2,6 +2,7 @@
#define _ASM_X86_STRING_64_H
#ifdef __KERNEL__
+#include <linux/jump_label.h>
/* Written 2002 by Andi Kleen */
@@ -78,6 +79,9 @@ int strcmp(const char *cs, const char *ct);
#define memset(s, c, n) __memset(s, c, n)
#endif
+__must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt);
+DECLARE_STATIC_KEY_FALSE(mcsafe_key);
+
/**
* memcpy_mcsafe - copy memory with indication if a machine check happened
*
@@ -86,10 +90,23 @@ int strcmp(const char *cs, const char *ct);
* @cnt: number of bytes to copy
*
* Low level memory copy function that catches machine checks
+ * We only call into the "safe" function on systems that can
+ * actually do machine check recovery. Everyone else can just
+ * use memcpy().
*
* Return 0 for success, -EFAULT for fail
*/
-int memcpy_mcsafe(void *dst, const void *src, size_t cnt);
+static __always_inline __must_check int
+memcpy_mcsafe(void *dst, const void *src, size_t cnt)
+{
+#ifdef CONFIG_X86_MCE
+ if (static_branch_unlikely(&mcsafe_key))
+ return memcpy_mcsafe_unrolled(dst, src, cnt);
+ else
+#endif
+ memcpy(dst, src, cnt);
+ return 0;
+}
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 8f321a1b03a1..5cb436acd463 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -2,130 +2,66 @@
#define _ASM_X86_SWITCH_TO_H
struct task_struct; /* one of the stranger aspects of C forward declarations */
+
+struct task_struct *__switch_to_asm(struct task_struct *prev,
+ struct task_struct *next);
+
__visible struct task_struct *__switch_to(struct task_struct *prev,
- struct task_struct *next);
+ struct task_struct *next);
struct tss_struct;
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss);
-#ifdef CONFIG_X86_32
+/* This runs runs on the previous thread's stack. */
+static inline void prepare_switch_to(struct task_struct *prev,
+ struct task_struct *next)
+{
+#ifdef CONFIG_VMAP_STACK
+ /*
+ * If we switch to a stack that has a top-level paging entry
+ * that is not present in the current mm, the resulting #PF will
+ * will be promoted to a double-fault and we'll panic. Probe
+ * the new stack now so that vmalloc_fault can fix up the page
+ * tables if needed. This can only happen if we use a stack
+ * in vmap space.
+ *
+ * We assume that the stack is aligned so that it never spans
+ * more than one top-level paging entry.
+ *
+ * To minimize cache pollution, just follow the stack pointer.
+ */
+ READ_ONCE(*(unsigned char *)next->thread.sp);
+#endif
+}
+
+asmlinkage void ret_from_fork(void);
+
+/* data that is pointed to by thread.sp */
+struct inactive_task_frame {
+#ifdef CONFIG_X86_64
+ unsigned long r15;
+ unsigned long r14;
+ unsigned long r13;
+ unsigned long r12;
+#else
+ unsigned long si;
+ unsigned long di;
+#endif
+ unsigned long bx;
+ unsigned long bp;
+ unsigned long ret_addr;
+};
-#ifdef CONFIG_CC_STACKPROTECTOR
-#define __switch_canary \
- "movl %P[task_canary](%[next]), %%ebx\n\t" \
- "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
-#define __switch_canary_oparam \
- , [stack_canary] "=m" (stack_canary.canary)
-#define __switch_canary_iparam \
- , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
-#else /* CC_STACKPROTECTOR */
-#define __switch_canary
-#define __switch_canary_oparam
-#define __switch_canary_iparam
-#endif /* CC_STACKPROTECTOR */
+struct fork_frame {
+ struct inactive_task_frame frame;
+ struct pt_regs regs;
+};
-/*
- * Saving eflags is important. It switches not only IOPL between tasks,
- * it also protects other tasks from NT leaking through sysenter etc.
- */
#define switch_to(prev, next, last) \
do { \
- /* \
- * Context-switching clobbers all registers, so we clobber \
- * them explicitly, via unused output variables. \
- * (EAX and EBP is not listed because EBP is saved/restored \
- * explicitly for wchan access and EAX is the return value of \
- * __switch_to()) \
- */ \
- unsigned long ebx, ecx, edx, esi, edi; \
- \
- asm volatile("pushl %%ebp\n\t" /* save EBP */ \
- "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \
- "movl %[next_sp],%%esp\n\t" /* restore ESP */ \
- "movl $1f,%[prev_ip]\n\t" /* save EIP */ \
- "pushl %[next_ip]\n\t" /* restore EIP */ \
- __switch_canary \
- "jmp __switch_to\n" /* regparm call */ \
- "1:\t" \
- "popl %%ebp\n\t" /* restore EBP */ \
- \
- /* output parameters */ \
- : [prev_sp] "=m" (prev->thread.sp), \
- [prev_ip] "=m" (prev->thread.ip), \
- "=a" (last), \
- \
- /* clobbered output registers: */ \
- "=b" (ebx), "=c" (ecx), "=d" (edx), \
- "=S" (esi), "=D" (edi) \
- \
- __switch_canary_oparam \
- \
- /* input parameters: */ \
- : [next_sp] "m" (next->thread.sp), \
- [next_ip] "m" (next->thread.ip), \
- \
- /* regparm parameters for __switch_to(): */ \
- [prev] "a" (prev), \
- [next] "d" (next) \
+ prepare_switch_to(prev, next); \
\
- __switch_canary_iparam \
- \
- : /* reloaded segment registers */ \
- "memory"); \
+ ((last) = __switch_to_asm((prev), (next))); \
} while (0)
-#else /* CONFIG_X86_32 */
-
-/* frame pointer must be last for get_wchan */
-#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t"
-#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t"
-
-#define __EXTRA_CLOBBER \
- , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
- "r12", "r13", "r14", "r15", "flags"
-
-#ifdef CONFIG_CC_STACKPROTECTOR
-#define __switch_canary \
- "movq %P[task_canary](%%rsi),%%r8\n\t" \
- "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
-#define __switch_canary_oparam \
- , [gs_canary] "=m" (irq_stack_union.stack_canary)
-#define __switch_canary_iparam \
- , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
-#else /* CC_STACKPROTECTOR */
-#define __switch_canary
-#define __switch_canary_oparam
-#define __switch_canary_iparam
-#endif /* CC_STACKPROTECTOR */
-
-/*
- * There is no need to save or restore flags, because flags are always
- * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL
- * has no effect.
- */
-#define switch_to(prev, next, last) \
- asm volatile(SAVE_CONTEXT \
- "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
- "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
- "call __switch_to\n\t" \
- "movq "__percpu_arg([current_task])",%%rsi\n\t" \
- __switch_canary \
- "movq %P[thread_info](%%rsi),%%r8\n\t" \
- "movq %%rax,%%rdi\n\t" \
- "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
- "jnz ret_from_fork\n\t" \
- RESTORE_CONTEXT \
- : "=a" (last) \
- __switch_canary_oparam \
- : [next] "S" (next), [prev] "D" (prev), \
- [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
- [ti_flags] "i" (offsetof(struct thread_info, flags)), \
- [_tif_fork] "i" (_TIF_FORK), \
- [thread_info] "i" (offsetof(struct task_struct, stack)), \
- [current_task] "m" (current_task) \
- __switch_canary_iparam \
- : "memory", "cc" __EXTRA_CLOBBER)
-
-#endif /* CONFIG_X86_32 */
-
#endif /* _ASM_X86_SWITCH_TO_H */
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 4e23dd15c661..e3c95e8e61c5 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -60,7 +60,7 @@ static inline long syscall_get_error(struct task_struct *task,
* TS_COMPAT is set for 32-bit syscall entries and then
* remains set until we return to user mode.
*/
- if (task_thread_info(task)->status & (TS_COMPAT|TS_I386_REGS_POKED))
+ if (task->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
/*
* Sign-extend the value so (int)-EFOO becomes (long)-EFOO
* and will match correctly in comparisons.
@@ -116,7 +116,7 @@ static inline void syscall_get_arguments(struct task_struct *task,
unsigned long *args)
{
# ifdef CONFIG_IA32_EMULATION
- if (task_thread_info(task)->status & TS_COMPAT)
+ if (task->thread.status & TS_COMPAT)
switch (i) {
case 0:
if (!n--) break;
@@ -177,7 +177,7 @@ static inline void syscall_set_arguments(struct task_struct *task,
const unsigned long *args)
{
# ifdef CONFIG_IA32_EMULATION
- if (task_thread_info(task)->status & TS_COMPAT)
+ if (task->thread.status & TS_COMPAT)
switch (i) {
case 0:
if (!n--) break;
@@ -234,18 +234,8 @@ static inline void syscall_set_arguments(struct task_struct *task,
static inline int syscall_get_arch(void)
{
-#ifdef CONFIG_IA32_EMULATION
- /*
- * TS_COMPAT is set for 32-bit syscall entry and then
- * remains set until we return to user mode.
- *
- * x32 tasks should be considered AUDIT_ARCH_X86_64.
- */
- if (task_thread_info(current)->status & TS_COMPAT)
- return AUDIT_ARCH_I386;
-#endif
- /* Both x32 and x86_64 are considered "64-bit". */
- return AUDIT_ARCH_X86_64;
+ /* x32 tasks should be considered AUDIT_ARCH_X86_64. */
+ return in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
}
#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 84b59846154a..2aaca53c0974 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -52,21 +52,6 @@ struct task_struct;
#include <asm/cpufeature.h>
#include <linux/atomic.h>
-struct thread_info {
- struct task_struct *task; /* main task structure */
- __u32 flags; /* low level flags */
- __u32 status; /* thread synchronous flags */
- __u32 cpu; /* current CPU */
-};
-
-#define INIT_THREAD_INFO(tsk) \
-{ \
- .task = &tsk, \
- .flags = 0, \
- .cpu = 0, \
-}
-
-#define init_thread_info (init_thread_union.thread_info)
#define init_stack (init_thread_union.stack)
#else /* !__ASSEMBLY__ */
@@ -95,7 +80,6 @@ struct thread_info {
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_IA32 17 /* IA32 compatibility process */
-#define TIF_FORK 18 /* ret_from_fork */
#define TIF_NOHZ 19 /* in adaptive nohz mode */
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
@@ -119,7 +103,6 @@ struct thread_info {
#define _TIF_UPROBE (1 << TIF_UPROBE)
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_IA32 (1 << TIF_IA32)
-#define _TIF_FORK (1 << TIF_FORK)
#define _TIF_NOHZ (1 << TIF_NOHZ)
#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
@@ -160,11 +143,6 @@ struct thread_info {
*/
#ifndef __ASSEMBLY__
-static inline struct thread_info *current_thread_info(void)
-{
- return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE);
-}
-
static inline unsigned long current_stack_pointer(void)
{
unsigned long sp;
@@ -176,66 +154,69 @@ static inline unsigned long current_stack_pointer(void)
return sp;
}
+/*
+ * Walks up the stack frames to make sure that the specified object is
+ * entirely contained by a single stack frame.
+ *
+ * Returns:
+ * 1 if within a frame
+ * -1 if placed across a frame boundary (or outside stack)
+ * 0 unable to determine (no frame pointers, etc)
+ */
+static inline int arch_within_stack_frames(const void * const stack,
+ const void * const stackend,
+ const void *obj, unsigned long len)
+{
+#if defined(CONFIG_FRAME_POINTER)
+ const void *frame = NULL;
+ const void *oldframe;
+
+ oldframe = __builtin_frame_address(1);
+ if (oldframe)
+ frame = __builtin_frame_address(2);
+ /*
+ * low ----------------------------------------------> high
+ * [saved bp][saved ip][args][local vars][saved bp][saved ip]
+ * ^----------------^
+ * allow copies only within here
+ */
+ while (stack <= frame && frame < stackend) {
+ /*
+ * If obj + len extends past the last frame, this
+ * check won't pass and the next frame will be 0,
+ * causing us to bail out and correctly report
+ * the copy as invalid.
+ */
+ if (obj + len <= frame)
+ return obj >= oldframe + 2 * sizeof(void *) ? 1 : -1;
+ oldframe = frame;
+ frame = *(const void * const *)frame;
+ }
+ return -1;
+#else
+ return 0;
+#endif
+}
+
#else /* !__ASSEMBLY__ */
#ifdef CONFIG_X86_64
# define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
#endif
-/*
- * ASM operand which evaluates to a 'thread_info' address of
- * the current task, if it is known that "reg" is exactly "off"
- * bytes below the top of the stack currently.
- *
- * ( The kernel stack's size is known at build time, it is usually
- * 2 or 4 pages, and the bottom of the kernel stack contains
- * the thread_info structure. So to access the thread_info very
- * quickly from assembly code we can calculate down from the
- * top of the kernel stack to the bottom, using constant,
- * build-time calculations only. )
- *
- * For example, to fetch the current thread_info->flags value into %eax
- * on x86-64 defconfig kernels, in syscall entry code where RSP is
- * currently at exactly SIZEOF_PTREGS bytes away from the top of the
- * stack:
- *
- * mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax
- *
- * will translate to:
- *
- * 8b 84 24 b8 c0 ff ff mov -0x3f48(%rsp), %eax
- *
- * which is below the current RSP by almost 16K.
- */
-#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg)
-
#endif
-/*
- * Thread-synchronous status.
- *
- * This is different from the flags in that nobody else
- * ever touches our thread-synchronous status, so we don't
- * have to worry about atomic accesses.
- */
-#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
#ifdef CONFIG_COMPAT
#define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */
#endif
-
#ifndef __ASSEMBLY__
-static inline bool in_ia32_syscall(void)
-{
#ifdef CONFIG_X86_32
- return true;
-#endif
-#ifdef CONFIG_IA32_EMULATION
- if (current_thread_info()->status & TS_COMPAT)
- return true;
+#define in_ia32_syscall() true
+#else
+#define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \
+ current->thread.status & TS_COMPAT)
#endif
- return false;
-}
/*
* Force syscall return via IRET by making it look as if there was
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4e5be94e079a..6fa85944af83 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -135,7 +135,14 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
static inline void __native_flush_tlb(void)
{
+ /*
+ * If current->mm == NULL then we borrow a mm which may change during a
+ * task switch and therefore we must not be preempted while we write CR3
+ * back:
+ */
+ preempt_disable();
native_write_cr3(native_read_cr3());
+ preempt_enable();
}
static inline void __native_flush_tlb_global_irq_disabled(void)
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index c3496619740a..01fd0a7f48cd 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -117,6 +117,12 @@ extern void ist_exit(struct pt_regs *regs);
extern void ist_begin_non_atomic(struct pt_regs *regs);
extern void ist_end_non_atomic(void);
+#ifdef CONFIG_VMAP_STACK
+void __noreturn handle_stack_overflow(const char *message,
+ struct pt_regs *regs,
+ unsigned long fault_address);
+#endif
+
/* Interrupts/Exceptions */
enum {
X86_TRAP_DE = 0, /* 0, Divide-by-zero */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index c03bfb68c503..2131c4ce7d8a 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -433,7 +433,11 @@ do { \
#define __get_user_asm_ex(x, addr, itype, rtype, ltype) \
asm volatile("1: mov"itype" %1,%"rtype"0\n" \
"2:\n" \
- _ASM_EXTABLE_EX(1b, 2b) \
+ ".section .fixup,\"ax\"\n" \
+ "3:xor"itype" %"rtype"0,%"rtype"0\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE_EX(1b, 3b) \
: ltype(x) : "m" (__m(addr)))
#define __put_user_nocheck(x, ptr, size) \
@@ -697,44 +701,15 @@ unsigned long __must_check _copy_from_user(void *to, const void __user *from,
unsigned long __must_check _copy_to_user(void __user *to, const void *from,
unsigned n);
-#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS
-# define copy_user_diag __compiletime_error
-#else
-# define copy_user_diag __compiletime_warning
-#endif
-
-extern void copy_user_diag("copy_from_user() buffer size is too small")
-copy_from_user_overflow(void);
-extern void copy_user_diag("copy_to_user() buffer size is too small")
-copy_to_user_overflow(void) __asm__("copy_from_user_overflow");
-
-#undef copy_user_diag
-
-#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS
-
-extern void
-__compiletime_warning("copy_from_user() buffer size is not provably correct")
-__copy_from_user_overflow(void) __asm__("copy_from_user_overflow");
-#define __copy_from_user_overflow(size, count) __copy_from_user_overflow()
-
-extern void
-__compiletime_warning("copy_to_user() buffer size is not provably correct")
-__copy_to_user_overflow(void) __asm__("copy_from_user_overflow");
-#define __copy_to_user_overflow(size, count) __copy_to_user_overflow()
-
-#else
+extern void __compiletime_error("usercopy buffer size is too small")
+__bad_copy_user(void);
-static inline void
-__copy_from_user_overflow(int size, unsigned long count)
+static inline void copy_user_overflow(int size, unsigned long count)
{
WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
}
-#define __copy_to_user_overflow __copy_from_user_overflow
-
-#endif
-
-static inline unsigned long __must_check
+static __always_inline unsigned long __must_check
copy_from_user(void *to, const void __user *from, unsigned long n)
{
int sz = __compiletime_object_size(to);
@@ -743,35 +718,18 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
kasan_check_write(to, n);
- /*
- * While we would like to have the compiler do the checking for us
- * even in the non-constant size case, any false positives there are
- * a problem (especially when DEBUG_STRICT_USER_COPY_CHECKS, but even
- * without - the [hopefully] dangerous looking nature of the warning
- * would make people go look at the respecitive call sites over and
- * over again just to find that there's no problem).
- *
- * And there are cases where it's just not realistic for the compiler
- * to prove the count to be in range. For example when multiple call
- * sites of a helper function - perhaps in different source files -
- * all doing proper range checking, yet the helper function not doing
- * so again.
- *
- * Therefore limit the compile time checking to the constant size
- * case, and do only runtime checking for non-constant sizes.
- */
-
- if (likely(sz < 0 || sz >= n))
+ if (likely(sz < 0 || sz >= n)) {
+ check_object_size(to, n, false);
n = _copy_from_user(to, from, n);
- else if(__builtin_constant_p(n))
- copy_from_user_overflow();
+ } else if (!__builtin_constant_p(n))
+ copy_user_overflow(sz, n);
else
- __copy_from_user_overflow(sz, n);
+ __bad_copy_user();
return n;
}
-static inline unsigned long __must_check
+static __always_inline unsigned long __must_check
copy_to_user(void __user *to, const void *from, unsigned long n)
{
int sz = __compiletime_object_size(from);
@@ -780,20 +738,17 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
might_fault();
- /* See the comment in copy_from_user() above. */
- if (likely(sz < 0 || sz >= n))
+ if (likely(sz < 0 || sz >= n)) {
+ check_object_size(from, n, true);
n = _copy_to_user(to, from, n);
- else if(__builtin_constant_p(n))
- copy_to_user_overflow();
+ } else if (!__builtin_constant_p(n))
+ copy_user_overflow(sz, n);
else
- __copy_to_user_overflow(sz, n);
+ __bad_copy_user();
return n;
}
-#undef __copy_from_user_overflow
-#undef __copy_to_user_overflow
-
/*
* We rely on the nested NMI work to allow atomic faults from the NMI path; the
* nested NMI paths are careful to preserve CR2.
@@ -812,21 +767,21 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
#define user_access_begin() __uaccess_begin()
#define user_access_end() __uaccess_end()
-#define unsafe_put_user(x, ptr) \
-({ \
+#define unsafe_put_user(x, ptr, err_label) \
+do { \
int __pu_err; \
__put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \
- __builtin_expect(__pu_err, 0); \
-})
+ if (unlikely(__pu_err)) goto err_label; \
+} while (0)
-#define unsafe_get_user(x, ptr) \
-({ \
+#define unsafe_get_user(x, ptr, err_label) \
+do { \
int __gu_err; \
unsigned long __gu_val; \
__get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
- __builtin_expect(__gu_err, 0); \
-})
+ if (unlikely(__gu_err)) goto err_label; \
+} while (0)
#endif /* _ASM_X86_UACCESS_H */
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 4b32da24faaf..7d3bdd1ed697 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -37,6 +37,7 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero
static __always_inline unsigned long __must_check
__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
{
+ check_object_size(from, n, true);
return __copy_to_user_ll(to, from, n);
}
@@ -95,6 +96,7 @@ static __always_inline unsigned long
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
might_fault();
+ check_object_size(to, n, false);
if (__builtin_constant_p(n)) {
unsigned long ret;
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 2eac2aa3e37f..673059a109fe 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -54,6 +54,7 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size)
{
int ret = 0;
+ check_object_size(dst, size, false);
if (!__builtin_constant_p(size))
return copy_user_generic(dst, (__force void *)src, size);
switch (size) {
@@ -119,6 +120,7 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size)
{
int ret = 0;
+ check_object_size(src, size, true);
if (!__builtin_constant_p(size))
return copy_user_generic((__force void *)dst, src, size);
switch (size) {
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
new file mode 100644
index 000000000000..c4b6d1cafa46
--- /dev/null
+++ b/arch/x86/include/asm/unwind.h
@@ -0,0 +1,73 @@
+#ifndef _ASM_X86_UNWIND_H
+#define _ASM_X86_UNWIND_H
+
+#include <linux/sched.h>
+#include <linux/ftrace.h>
+#include <asm/ptrace.h>
+#include <asm/stacktrace.h>
+
+struct unwind_state {
+ struct stack_info stack_info;
+ unsigned long stack_mask;
+ struct task_struct *task;
+ int graph_idx;
+#ifdef CONFIG_FRAME_POINTER
+ unsigned long *bp;
+#else
+ unsigned long *sp;
+#endif
+};
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame);
+
+bool unwind_next_frame(struct unwind_state *state);
+
+static inline bool unwind_done(struct unwind_state *state)
+{
+ return state->stack_info.type == STACK_TYPE_UNKNOWN;
+}
+
+static inline
+void unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ first_frame = first_frame ? : get_stack_pointer(task, regs);
+
+ __unwind_start(state, task, regs, first_frame);
+}
+
+#ifdef CONFIG_FRAME_POINTER
+
+static inline
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return NULL;
+
+ return state->bp + 1;
+}
+
+unsigned long unwind_get_return_address(struct unwind_state *state);
+
+#else /* !CONFIG_FRAME_POINTER */
+
+static inline
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ return NULL;
+}
+
+static inline
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return 0;
+
+ return ftrace_graph_ret_addr(state->task, &state->graph_idx,
+ *state->sp, state->sp);
+}
+
+#endif /* CONFIG_FRAME_POINTER */
+
+#endif /* _ASM_X86_UNWIND_H */
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index c852590254d5..e652a7cc6186 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -79,7 +79,7 @@ struct uv_gam_range_entry {
u16 nasid; /* HNasid */
u16 sockid; /* Socket ID, high bits of APIC ID */
u16 pnode; /* Index to MMR and GRU spaces */
- u32 pxm; /* ACPI proximity domain number */
+ u32 unused2;
u32 limit; /* PA bits 56:26 (UV_GAM_RANGE_SHFT) */
};
@@ -88,7 +88,8 @@ struct uv_gam_range_entry {
#define UV_SYSTAB_VERSION_UV4 0x400 /* UV4 BIOS base version */
#define UV_SYSTAB_VERSION_UV4_1 0x401 /* + gpa_shift */
#define UV_SYSTAB_VERSION_UV4_2 0x402 /* + TYPE_NVRAM/WINDOW/MBOX */
-#define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_2
+#define UV_SYSTAB_VERSION_UV4_3 0x403 /* - GAM Range PXM Value */
+#define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_3
#define UV_SYSTAB_TYPE_UNUSED 0 /* End of table (offset == 0) */
#define UV_SYSTAB_TYPE_GAM_PARAMS 1 /* GAM PARAM conversions */
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index cc44d926c17e..57ab86d94d64 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -49,14 +49,12 @@
#define UV_NET_ENDPOINT_INTD (is_uv1_hub() ? \
UV1_NET_ENDPOINT_INTD : UV2_NET_ENDPOINT_INTD)
#define UV_DESC_PSHIFT 49
-#define UV_PAYLOADQ_PNODE_SHIFT 49
+#define UV_PAYLOADQ_GNODE_SHIFT 49
#define UV_PTC_BASENAME "sgi_uv/ptc_statistics"
#define UV_BAU_BASENAME "sgi_uv/bau_tunables"
#define UV_BAU_TUNABLES_DIR "sgi_uv"
#define UV_BAU_TUNABLES_FILE "bau_tunables"
#define WHITESPACE " \t\n"
-#define uv_mmask ((1UL << uv_hub_info->m_val) - 1)
-#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask))
#define cpubit_isset(cpu, bau_local_cpumask) \
test_bit((cpu), (bau_local_cpumask).bits)
@@ -387,6 +385,17 @@ struct uv2_3_bau_msg_header {
/* bits 127:120 */
};
+/* Abstracted BAU functions */
+struct bau_operations {
+ unsigned long (*read_l_sw_ack)(void);
+ unsigned long (*read_g_sw_ack)(int pnode);
+ unsigned long (*bau_gpa_to_offset)(unsigned long vaddr);
+ void (*write_l_sw_ack)(unsigned long mmr);
+ void (*write_g_sw_ack)(int pnode, unsigned long mmr);
+ void (*write_payload_first)(int pnode, unsigned long mmr);
+ void (*write_payload_last)(int pnode, unsigned long mmr);
+};
+
/*
* The activation descriptor:
* The format of the message to send, plus all accompanying control
@@ -655,6 +664,16 @@ static inline void write_gmmr_activation(int pnode, unsigned long mmr_image)
write_gmmr(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, mmr_image);
}
+static inline void write_mmr_proc_payload_first(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UV4H_LB_PROC_INTD_QUEUE_FIRST, mmr_image);
+}
+
+static inline void write_mmr_proc_payload_last(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UV4H_LB_PROC_INTD_QUEUE_LAST, mmr_image);
+}
+
static inline void write_mmr_payload_first(int pnode, unsigned long mmr_image)
{
write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, mmr_image);
@@ -700,6 +719,26 @@ static inline unsigned long read_gmmr_sw_ack(int pnode)
return read_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
}
+static inline void write_mmr_proc_sw_ack(unsigned long mr)
+{
+ uv_write_local_mmr(UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR, mr);
+}
+
+static inline void write_gmmr_proc_sw_ack(int pnode, unsigned long mr)
+{
+ write_gmmr(pnode, UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR, mr);
+}
+
+static inline unsigned long read_mmr_proc_sw_ack(void)
+{
+ return read_lmmr(UV4H_LB_PROC_INTD_SOFT_ACK_PENDING);
+}
+
+static inline unsigned long read_gmmr_proc_sw_ack(int pnode)
+{
+ return read_gmmr(pnode, UV4H_LB_PROC_INTD_SOFT_ACK_PENDING);
+}
+
static inline void write_mmr_data_config(int pnode, unsigned long mr)
{
uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, mr);
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 43dc55be524e..2444189cbe28 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -41,6 +41,8 @@ extern const struct vdso_image vdso_image_32;
extern void __init init_vdso_image(const struct vdso_image *image);
+extern int map_vdso_once(const struct vdso_image *image, unsigned long addr);
+
#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_VDSO_H */
diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h
index 2184943341bf..69a6e07e3149 100644
--- a/arch/x86/include/uapi/asm/mce.h
+++ b/arch/x86/include/uapi/asm/mce.h
@@ -26,6 +26,8 @@ struct mce {
__u32 socketid; /* CPU socket ID */
__u32 apicid; /* CPU initial apic ID */
__u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
+ __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */
+ __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */
};
#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index 3ac5032fae09..ae135de547f5 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -6,4 +6,10 @@
#define ARCH_GET_FS 0x1003
#define ARCH_GET_GS 0x1004
+#ifdef CONFIG_CHECKPOINT_RESTORE
+# define ARCH_MAP_VDSO_X32 0x2001
+# define ARCH_MAP_VDSO_32 0x2002
+# define ARCH_MAP_VDSO_64 0x2003
+#endif
+
#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0503f5bfb18d..45257cf84370 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -125,6 +125,12 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
+ifdef CONFIG_FRAME_POINTER
+obj-y += unwind_frame.o
+else
+obj-y += unwind_guess.o
+endif
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 3242e591fa82..26b78d86f25a 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,6 +1,7 @@
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o
+obj-$(CONFIG_ACPI_CPPC_LIB) += cppc_msr.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 90d84c3eee53..32a7d70913ac 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -176,15 +176,10 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled)
return -EINVAL;
}
- if (!enabled) {
- ++disabled_cpus;
- return -EINVAL;
- }
-
if (boot_cpu_physical_apicid != -1U)
- ver = apic_version[boot_cpu_physical_apicid];
+ ver = boot_cpu_apic_version;
- cpu = generic_processor_info(id, ver);
+ cpu = __generic_processor_info(id, ver, enabled);
if (cpu >= 0)
early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid;
@@ -282,6 +277,8 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
return -EINVAL;
+ acpi_table_print_madt_entry(header);
+
acpi_lapic_addr = lapic_addr_ovr->address;
return 0;
@@ -705,7 +702,7 @@ static void __init acpi_set_irq_model_ioapic(void)
#ifdef CONFIG_ACPI_HOTPLUG_CPU
#include <acpi/processor.h>
-static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
{
#ifdef CONFIG_ACPI_NUMA
int nid;
@@ -716,6 +713,7 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
numa_set_node(cpu, nid);
}
#endif
+ return 0;
}
int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu)
@@ -998,21 +996,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
if (!boot_cpu_has(X86_FEATURE_APIC))
return -ENODEV;
- /*
- * Note that the LAPIC address is obtained from the MADT (32-bit value)
- * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
- */
-
- count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
- acpi_parse_lapic_addr_ovr, 0);
- if (count < 0) {
- printk(KERN_ERR PREFIX
- "Error parsing LAPIC address override entry\n");
- return count;
- }
-
- register_lapic_address(acpi_lapic_addr);
-
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
acpi_parse_sapic, MAX_LOCAL_APIC);
@@ -1031,8 +1014,8 @@ static int __init acpi_parse_madt_lapic_entries(void)
return ret;
}
- x2count = madt_proc[0].count;
- count = madt_proc[1].count;
+ count = madt_proc[0].count;
+ x2count = madt_proc[1].count;
}
if (!count && !x2count) {
printk(KERN_ERR PREFIX "No LAPIC entries present\n");
@@ -1513,7 +1496,7 @@ void __init acpi_boot_table_init(void)
* If acpi_disabled, bail out
*/
if (acpi_disabled)
- return;
+ return;
/*
* Initialize the ACPI boot-time table parser.
diff --git a/arch/x86/kernel/acpi/cppc_msr.c b/arch/x86/kernel/acpi/cppc_msr.c
new file mode 100644
index 000000000000..6fb478bf82fd
--- /dev/null
+++ b/arch/x86/kernel/acpi/cppc_msr.c
@@ -0,0 +1,58 @@
+/*
+ * cppc_msr.c: MSR Interface for CPPC
+ * Copyright (c) 2016, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <acpi/cppc_acpi.h>
+#include <asm/msr.h>
+
+/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */
+
+bool cpc_ffh_supported(void)
+{
+ return true;
+}
+
+int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val)
+{
+ int err;
+
+ err = rdmsrl_safe_on_cpu(cpunum, reg->address, val);
+ if (!err) {
+ u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
+ reg->bit_offset);
+
+ *val &= mask;
+ *val >>= reg->bit_offset;
+ }
+ return err;
+}
+
+int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
+{
+ u64 rd_val;
+ int err;
+
+ err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val);
+ if (!err) {
+ u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
+ reg->bit_offset);
+
+ val <<= reg->bit_offset;
+ val &= mask;
+ rd_val &= ~mask;
+ rd_val |= val;
+ err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val);
+ }
+ return err;
+}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index adb3eaf8fe2a..48587335ede8 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -99,7 +99,7 @@ int x86_acpi_suspend_lowlevel(void)
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
#ifdef CONFIG_SMP
- stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
+ initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
early_gdt_descr.address =
(unsigned long)get_cpu_gdt_table(smp_processor_id());
initial_gs = per_cpu_offset(smp_processor_id());
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 20abd912f0e4..f266b8a92a9e 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -64,6 +64,8 @@ unsigned disabled_cpus;
unsigned int boot_cpu_physical_apicid = -1U;
EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
+u8 boot_cpu_apic_version;
+
/*
* The highest APIC ID seen during enumeration.
*/
@@ -313,7 +315,7 @@ int lapic_get_maxlvt(void)
/* Clock divisor */
#define APIC_DIVISOR 16
-#define TSC_DIVISOR 32
+#define TSC_DIVISOR 8
/*
* This function sets up the local APIC timer, with a timeout of
@@ -565,13 +567,37 @@ static void setup_APIC_timer(void)
CLOCK_EVT_FEAT_DUMMY);
levt->set_next_event = lapic_next_deadline;
clockevents_config_and_register(levt,
- (tsc_khz / TSC_DIVISOR) * 1000,
+ tsc_khz * (1000 / TSC_DIVISOR),
0xF, ~0UL);
} else
clockevents_register_device(levt);
}
/*
+ * Install the updated TSC frequency from recalibration at the TSC
+ * deadline clockevent devices.
+ */
+static void __lapic_update_tsc_freq(void *info)
+{
+ struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
+
+ if (!this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return;
+
+ clockevents_update_freq(levt, tsc_khz * (1000 / TSC_DIVISOR));
+}
+
+void lapic_update_tsc_freq(void)
+{
+ /*
+ * The clockevent device's ->mult and ->shift can both be
+ * changed. In order to avoid races, schedule the frequency
+ * update code on each CPU.
+ */
+ on_each_cpu(__lapic_update_tsc_freq, NULL, 0);
+}
+
+/*
* In this functions we calibrate APIC bus clocks to the external timer.
*
* We want to do the calibration only once since we want to have local timer
@@ -1350,7 +1376,6 @@ void setup_local_APIC(void)
* Actually disabling the focus CPU check just makes the hang less
* frequent as it makes the interrupt distributon model be more
* like LRU than MRU (the short-term load is more even across CPUs).
- * See also the comment in end_level_ioapic_irq(). --macro
*/
/*
@@ -1599,6 +1624,9 @@ void __init enable_IR_x2apic(void)
unsigned long flags;
int ret, ir_stat;
+ if (skip_ioapic_setup)
+ return;
+
ir_stat = irq_remapping_prepare();
if (ir_stat < 0 && !x2apic_supported())
return;
@@ -1789,8 +1817,7 @@ void __init init_apic_mappings(void)
* since smp_sanity_check is prepared for such a case
* and disable smp mode
*/
- apic_version[new_apicid] =
- GET_APIC_VERSION(apic_read(APIC_LVR));
+ boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
}
}
@@ -1801,17 +1828,14 @@ void __init register_lapic_address(unsigned long address)
if (!x2apic_mode) {
set_fixmap_nocache(FIX_APIC_BASE, address);
apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- APIC_BASE, mp_lapic_addr);
+ APIC_BASE, address);
}
if (boot_cpu_physical_apicid == -1U) {
boot_cpu_physical_apicid = read_apic_id();
- apic_version[boot_cpu_physical_apicid] =
- GET_APIC_VERSION(apic_read(APIC_LVR));
+ boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
}
}
-int apic_version[MAX_LOCAL_APIC];
-
/*
* Local APIC interrupts
*/
@@ -2000,7 +2024,53 @@ void disconnect_bsp_APIC(int virt_wire_setup)
apic_write(APIC_LVT1, value);
}
-int generic_processor_info(int apicid, int version)
+/*
+ * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated
+ * contiguously, it equals to current allocated max logical CPU ID plus 1.
+ * All allocated CPU ID should be in [0, nr_logical_cpuidi), so the maximum of
+ * nr_logical_cpuids is nr_cpu_ids.
+ *
+ * NOTE: Reserve 0 for BSP.
+ */
+static int nr_logical_cpuids = 1;
+
+/*
+ * Used to store mapping between logical CPU IDs and APIC IDs.
+ */
+static int cpuid_to_apicid[] = {
+ [0 ... NR_CPUS - 1] = -1,
+};
+
+/*
+ * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids
+ * and cpuid_to_apicid[] synchronized.
+ */
+static int allocate_logical_cpuid(int apicid)
+{
+ int i;
+
+ /*
+ * cpuid <-> apicid mapping is persistent, so when a cpu is up,
+ * check if the kernel has allocated a cpuid for it.
+ */
+ for (i = 0; i < nr_logical_cpuids; i++) {
+ if (cpuid_to_apicid[i] == apicid)
+ return i;
+ }
+
+ /* Allocate a new cpuid. */
+ if (nr_logical_cpuids >= nr_cpu_ids) {
+ WARN_ONCE(1, "Only %d processors supported."
+ "Processor %d/0x%x and the rest are ignored.\n",
+ nr_cpu_ids - 1, nr_logical_cpuids, apicid);
+ return -1;
+ }
+
+ cpuid_to_apicid[nr_logical_cpuids] = apicid;
+ return nr_logical_cpuids++;
+}
+
+int __generic_processor_info(int apicid, int version, bool enabled)
{
int cpu, max = nr_cpu_ids;
bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
@@ -2066,7 +2136,6 @@ int generic_processor_info(int apicid, int version)
return -EINVAL;
}
- num_processors++;
if (apicid == boot_cpu_physical_apicid) {
/*
* x86_bios_cpu_apicid is required to have processors listed
@@ -2076,8 +2145,16 @@ int generic_processor_info(int apicid, int version)
* for BSP.
*/
cpu = 0;
- } else
- cpu = cpumask_next_zero(-1, cpu_present_mask);
+
+ /* Logical cpuid 0 is reserved for BSP. */
+ cpuid_to_apicid[0] = apicid;
+ } else {
+ cpu = allocate_logical_cpuid(apicid);
+ if (cpu < 0) {
+ disabled_cpus++;
+ return -EINVAL;
+ }
+ }
/*
* This can happen on physical hotplug. The sanity check at boot time
@@ -2089,6 +2166,7 @@ int generic_processor_info(int apicid, int version)
pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n",
thiscpu, apicid);
+
disabled_cpus++;
return -ENOSPC;
}
@@ -2101,14 +2179,12 @@ int generic_processor_info(int apicid, int version)
cpu, apicid);
version = 0x10;
}
- apic_version[apicid] = version;
- if (version != apic_version[boot_cpu_physical_apicid]) {
+ if (version != boot_cpu_apic_version) {
pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
- apic_version[boot_cpu_physical_apicid], cpu, version);
+ boot_cpu_apic_version, cpu, version);
}
- physid_set(apicid, phys_cpu_present_map);
if (apicid > max_physical_apicid)
max_physical_apicid = apicid;
@@ -2121,11 +2197,23 @@ int generic_processor_info(int apicid, int version)
apic->x86_32_early_logical_apicid(cpu);
#endif
set_cpu_possible(cpu, true);
- set_cpu_present(cpu, true);
+
+ if (enabled) {
+ num_processors++;
+ physid_set(apicid, phys_cpu_present_map);
+ set_cpu_present(cpu, true);
+ } else {
+ disabled_cpus++;
+ }
return cpu;
}
+int generic_processor_info(int apicid, int version)
+{
+ return __generic_processor_info(apicid, version, true);
+}
+
int hard_smp_processor_id(void)
{
return read_apic_id();
@@ -2248,7 +2336,7 @@ int __init APIC_init_uniprocessor(void)
* Complain if the BIOS pretends there is one.
*/
if (!boot_cpu_has(X86_FEATURE_APIC) &&
- APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ APIC_INTEGRATED(boot_cpu_apic_version)) {
pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
boot_cpu_physical_apicid);
return -1;
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 5b2ae106bd4a..a4d7ff20ed22 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -25,7 +25,7 @@
static struct apic apic_physflat;
static struct apic apic_flat;
-struct apic __read_mostly *apic = &apic_flat;
+struct apic *apic __ro_after_init = &apic_flat;
EXPORT_SYMBOL_GPL(apic);
static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
@@ -116,27 +116,17 @@ static void flat_send_IPI_all(int vector)
static unsigned int flat_get_apic_id(unsigned long x)
{
- unsigned int id;
-
- id = (((x)>>24) & 0xFFu);
-
- return id;
+ return (x >> 24) & 0xFF;
}
static unsigned long set_apic_id(unsigned int id)
{
- unsigned long x;
-
- x = ((id & 0xFFu)<<24);
- return x;
+ return (id & 0xFF) << 24;
}
static unsigned int read_xapic_id(void)
{
- unsigned int id;
-
- id = flat_get_apic_id(apic_read(APIC_ID));
- return id;
+ return flat_get_apic_id(apic_read(APIC_ID));
}
static int flat_apic_id_registered(void)
@@ -154,7 +144,7 @@ static int flat_probe(void)
return 1;
}
-static struct apic apic_flat = {
+static struct apic apic_flat __ro_after_init = {
.name = "flat",
.probe = flat_probe,
.acpi_madt_oem_check = flat_acpi_madt_oem_check,
@@ -248,7 +238,7 @@ static int physflat_probe(void)
return 0;
}
-static struct apic apic_physflat = {
+static struct apic apic_physflat __ro_after_init = {
.name = "physical flat",
.probe = physflat_probe,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index c05688b2deff..b109e4389c92 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -108,7 +108,7 @@ static void noop_apic_write(u32 reg, u32 v)
WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic);
}
-struct apic apic_noop = {
+struct apic apic_noop __ro_after_init = {
.name = "noop",
.probe = noop_probe,
.acpi_madt_oem_check = NULL,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 714d4fda0d52..e08fe2c8dd8c 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -40,10 +40,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x)
static unsigned long numachip1_set_apic_id(unsigned int id)
{
- unsigned long x;
-
- x = ((id & 0xffU) << 24);
- return x;
+ return (id & 0xff) << 24;
}
static unsigned int numachip2_get_apic_id(unsigned long x)
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 06dbaa458bfe..56012010332c 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -142,7 +142,7 @@ static int probe_bigsmp(void)
return dmi_bigsmp;
}
-static struct apic apic_bigsmp = {
+static struct apic apic_bigsmp __ro_after_init = {
.name = "bigsmp",
.probe = probe_bigsmp,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7491f417a8e4..48e6d84f173e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1593,7 +1593,7 @@ void __init setup_ioapic_ids_from_mpc(void)
* no meaning without the serial APIC bus.
*/
if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ || APIC_XAPIC(boot_cpu_apic_version))
return;
setup_ioapic_ids_from_mpc_nocheck();
}
@@ -2423,7 +2423,7 @@ static int io_apic_get_unique_id(int ioapic, int apic_id)
static u8 io_apic_unique_id(int idx, u8 id)
{
if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
- !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ !APIC_XAPIC(boot_cpu_apic_version))
return io_apic_get_unique_id(idx, id);
else
return id;
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index ade25320df96..015bbf30e3e3 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -269,7 +269,7 @@ static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
hpet_msi_write(irq_data_get_irq_handler_data(data), msg);
}
-static struct irq_chip hpet_msi_controller = {
+static struct irq_chip hpet_msi_controller __ro_after_init = {
.name = "HPET-MSI",
.irq_unmask = hpet_msi_unmask,
.irq_mask = hpet_msi_mask,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 7c43e716c158..c48264e202fd 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -72,7 +72,7 @@ static int probe_default(void)
return 1;
}
-static struct apic apic_default = {
+static struct apic apic_default __ro_after_init = {
.name = "default",
.probe = probe_default,
@@ -126,7 +126,7 @@ static struct apic apic_default = {
apic_driver(apic_default);
-struct apic *apic = &apic_default;
+struct apic *apic __ro_after_init = &apic_default;
EXPORT_SYMBOL_GPL(apic);
static int cmdline_apic __initdata;
@@ -152,7 +152,7 @@ early_param("apic", parse_apic);
void __init default_setup_apic_routing(void)
{
- int version = apic_version[boot_cpu_physical_apicid];
+ int version = boot_cpu_apic_version;
if (num_possible_cpus() > 8) {
switch (boot_cpu_data.x86_vendor) {
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 6368fa69d2af..200af5ae9662 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -155,7 +155,7 @@ static void init_x2apic_ldr(void)
/*
* At CPU state changes, update the x2apic cluster sibling info.
*/
-int x2apic_prepare_cpu(unsigned int cpu)
+static int x2apic_prepare_cpu(unsigned int cpu)
{
if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL))
return -ENOMEM;
@@ -168,7 +168,7 @@ int x2apic_prepare_cpu(unsigned int cpu)
return 0;
}
-int x2apic_dead_cpu(unsigned int this_cpu)
+static int x2apic_dead_cpu(unsigned int this_cpu)
{
int cpu;
@@ -186,13 +186,18 @@ int x2apic_dead_cpu(unsigned int this_cpu)
static int x2apic_cluster_probe(void)
{
int cpu = smp_processor_id();
+ int ret;
if (!x2apic_mode)
return 0;
+ ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE",
+ x2apic_prepare_cpu, x2apic_dead_cpu);
+ if (ret < 0) {
+ pr_err("Failed to register X2APIC_PREPARE\n");
+ return 0;
+ }
cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu));
- cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE",
- x2apic_prepare_cpu, x2apic_dead_cpu);
return 1;
}
@@ -222,7 +227,7 @@ static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask,
cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
}
-static struct apic apic_x2apic_cluster = {
+static struct apic apic_x2apic_cluster __ro_after_init = {
.name = "cluster x2apic",
.probe = x2apic_cluster_probe,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 4f13f54f1b1f..ff111f05a314 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -98,7 +98,7 @@ static int x2apic_phys_probe(void)
return apic == &apic_x2apic_phys;
}
-static struct apic apic_x2apic_phys = {
+static struct apic apic_x2apic_phys __ro_after_init = {
.name = "physical x2apic",
.probe = x2apic_phys_probe,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 09b59adaea3f..aeef53ce93e1 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -223,6 +223,11 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
if (strncmp(oem_id, "SGI", 3) != 0)
return 0;
+ if (numa_off) {
+ pr_err("UV: NUMA is off, disabling UV support\n");
+ return 0;
+ }
+
/* Setup early hub type field in uv_hub_info for Node 0 */
uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0;
@@ -325,7 +330,7 @@ static __init void build_uv_gr_table(void)
struct uv_gam_range_entry *gre = uv_gre_table;
struct uv_gam_range_s *grt;
unsigned long last_limit = 0, ram_limit = 0;
- int bytes, i, sid, lsid = -1;
+ int bytes, i, sid, lsid = -1, indx = 0, lindx = -1;
if (!gre)
return;
@@ -356,11 +361,12 @@ static __init void build_uv_gr_table(void)
}
sid = gre->sockid - _min_socket;
if (lsid < sid) { /* new range */
- grt = &_gr_table[sid];
- grt->base = lsid;
+ grt = &_gr_table[indx];
+ grt->base = lindx;
grt->nasid = gre->nasid;
grt->limit = last_limit = gre->limit;
lsid = sid;
+ lindx = indx++;
continue;
}
if (lsid == sid && !ram_limit) { /* update range */
@@ -371,7 +377,7 @@ static __init void build_uv_gr_table(void)
}
if (!ram_limit) { /* non-contiguous ram range */
grt++;
- grt->base = sid - 1;
+ grt->base = lindx;
grt->nasid = gre->nasid;
grt->limit = last_limit = gre->limit;
continue;
@@ -527,11 +533,8 @@ static unsigned int x2apic_get_apic_id(unsigned long x)
static unsigned long set_apic_id(unsigned int id)
{
- unsigned long x;
-
- /* maskout x2apic_extra_bits ? */
- x = id;
- return x;
+ /* CHECKME: Do we need to mask out the xapic extra bits? */
+ return id;
}
static unsigned int uv_read_apic_id(void)
@@ -554,7 +557,7 @@ static int uv_probe(void)
return apic == &apic_x2apic_uv_x;
}
-static struct apic __refdata apic_x2apic_uv_x = {
+static struct apic apic_x2apic_uv_x __ro_after_init = {
.name = "UV large system",
.probe = uv_probe,
@@ -921,7 +924,7 @@ static void uv_heartbeat(unsigned long ignored)
mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
}
-static void uv_heartbeat_enable(int cpu)
+static int uv_heartbeat_enable(unsigned int cpu)
{
while (!uv_cpu_scir_info(cpu)->enabled) {
struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer;
@@ -935,43 +938,24 @@ static void uv_heartbeat_enable(int cpu)
/* also ensure that boot cpu is enabled */
cpu = 0;
}
+ return 0;
}
#ifdef CONFIG_HOTPLUG_CPU
-static void uv_heartbeat_disable(int cpu)
+static int uv_heartbeat_disable(unsigned int cpu)
{
if (uv_cpu_scir_info(cpu)->enabled) {
uv_cpu_scir_info(cpu)->enabled = 0;
del_timer(&uv_cpu_scir_info(cpu)->timer);
}
uv_set_cpu_scir_bits(cpu, 0xff);
-}
-
-/*
- * cpu hotplug notifier
- */
-static int uv_scir_cpu_notify(struct notifier_block *self, unsigned long action,
- void *hcpu)
-{
- long cpu = (long)hcpu;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DOWN_FAILED:
- case CPU_ONLINE:
- uv_heartbeat_enable(cpu);
- break;
- case CPU_DOWN_PREPARE:
- uv_heartbeat_disable(cpu);
- break;
- default:
- break;
- }
- return NOTIFY_OK;
+ return 0;
}
static __init void uv_scir_register_cpu_notifier(void)
{
- hotcpu_notifier(uv_scir_cpu_notify, 0);
+ cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/x2apic-uvx:online",
+ uv_heartbeat_enable, uv_heartbeat_disable);
}
#else /* !CONFIG_HOTPLUG_CPU */
@@ -1155,19 +1139,18 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)
for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
if (!index) {
pr_info("UV: GAM Range Table...\n");
- pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s %3s\n",
+ pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n",
"Range", "", "Size", "Type", "NASID",
- "SID", "PN", "PXM");
+ "SID", "PN");
}
pr_info(
- "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x %3d\n",
+ "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n",
index++,
(unsigned long)lgre << UV_GAM_RANGE_SHFT,
(unsigned long)gre->limit << UV_GAM_RANGE_SHFT,
((unsigned long)(gre->limit - lgre)) >>
(30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */
- gre->type, gre->nasid, gre->sockid,
- gre->pnode, gre->pxm);
+ gre->type, gre->nasid, gre->sockid, gre->pnode);
lgre = gre->limit;
if (sock_min > gre->sockid)
@@ -1286,7 +1269,7 @@ static void __init build_socket_tables(void)
_pnode_to_socket[i] = SOCK_EMPTY;
/* fill in pnode/node/addr conversion list values */
- pr_info("UV: GAM Building socket/pnode/pxm conversion tables\n");
+ pr_info("UV: GAM Building socket/pnode conversion tables\n");
for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
if (gre->type == UV_GAM_RANGE_TYPE_HOLE)
continue;
@@ -1294,20 +1277,18 @@ static void __init build_socket_tables(void)
if (_socket_to_pnode[i] != SOCK_EMPTY)
continue; /* duplicate */
_socket_to_pnode[i] = gre->pnode;
- _socket_to_node[i] = gre->pxm;
i = gre->pnode - minpnode;
_pnode_to_socket[i] = gre->sockid;
pr_info(
- "UV: sid:%02x type:%d nasid:%04x pn:%02x pxm:%2d pn2s:%2x\n",
+ "UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n",
gre->sockid, gre->type, gre->nasid,
_socket_to_pnode[gre->sockid - minsock],
- _socket_to_node[gre->sockid - minsock],
_pnode_to_socket[gre->pnode - minpnode]);
}
- /* check socket -> node values */
+ /* Set socket -> node values */
lnid = -1;
for_each_present_cpu(cpu) {
int nid = cpu_to_node(cpu);
@@ -1318,14 +1299,9 @@ static void __init build_socket_tables(void)
lnid = nid;
apicid = per_cpu(x86_cpu_to_apicid, cpu);
sockid = apicid >> uv_cpuid.socketid_shift;
- i = sockid - minsock;
-
- if (nid != _socket_to_node[i]) {
- pr_warn(
- "UV: %02x: type:%d socket:%02x PXM:%02x != node:%2d\n",
- i, sockid, gre->type, _socket_to_node[i], nid);
- _socket_to_node[i] = nid;
- }
+ _socket_to_node[sockid - minsock] = nid;
+ pr_info("UV: sid:%02x: apicid:%04x node:%2d\n",
+ sockid, apicid, nid);
}
/* Setup physical blade to pnode translation from GAM Range Table */
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 2bd5c6ff7ee7..c62e015b126c 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -29,10 +29,13 @@
void common(void) {
BLANK();
- OFFSET(TI_flags, thread_info, flags);
- OFFSET(TI_status, thread_info, status);
+ OFFSET(TASK_threadsp, task_struct, thread.sp);
+#ifdef CONFIG_CC_STACKPROTECTOR
+ OFFSET(TASK_stack_canary, task_struct, stack_canary);
+#endif
BLANK();
+ OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
BLANK();
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index ecdc1d217dc0..880aa093268d 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -57,6 +57,11 @@ void foo(void)
/* Size of SYSENTER_stack */
DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
+#ifdef CONFIG_CC_STACKPROTECTOR
+ BLANK();
+ OFFSET(stack_canary_offset, stack_canary, canary);
+#endif
+
#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
BLANK();
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index d875f97d4e0b..210927ee2e74 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -56,6 +56,11 @@ int main(void)
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
BLANK();
+#ifdef CONFIG_CC_STACKPROTECTOR
+ DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary));
+ BLANK();
+#endif
+
DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
DEFINE(NR_syscalls, sizeof(syscalls_64));
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f5c69d8974e1..b81fe2d63e15 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -669,6 +669,17 @@ static void init_amd_gh(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
}
+#define MSR_AMD64_DE_CFG 0xC0011029
+
+static void init_amd_ln(struct cpuinfo_x86 *c)
+{
+ /*
+ * Apply erratum 665 fix unconditionally so machines without a BIOS
+ * fix work.
+ */
+ msr_set_bit(MSR_AMD64_DE_CFG, 31);
+}
+
static void init_amd_bd(struct cpuinfo_x86 *c)
{
u64 value;
@@ -726,6 +737,7 @@ static void init_amd(struct cpuinfo_x86 *c)
case 6: init_amd_k7(c); break;
case 0xf: init_amd_k8(c); break;
case 0x10: init_amd_gh(c); break;
+ case 0x12: init_amd_ln(c); break;
case 0x15: init_amd_bd(c); break;
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 809eda03c527..9bd910a7dd0a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -804,21 +804,20 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
identify_cpu_without_cpuid(c);
/* cyrix could have cpuid enabled via c_identify()*/
- if (!have_cpuid_p())
- return;
-
- cpu_detect(c);
- get_cpu_vendor(c);
- get_cpu_cap(c);
+ if (have_cpuid_p()) {
+ cpu_detect(c);
+ get_cpu_vendor(c);
+ get_cpu_cap(c);
- if (this_cpu->c_early_init)
- this_cpu->c_early_init(c);
+ if (this_cpu->c_early_init)
+ this_cpu->c_early_init(c);
- c->cpu_index = 0;
- filter_cpuid_features(c, false);
+ c->cpu_index = 0;
+ filter_cpuid_features(c, false);
- if (this_cpu->c_bsp_init)
- this_cpu->c_bsp_init(c);
+ if (this_cpu->c_bsp_init)
+ this_cpu->c_bsp_init(c);
+ }
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
fpu__init_system(c);
@@ -1265,9 +1264,14 @@ static __init int setup_disablecpuid(char *arg)
__setup("clearcpuid=", setup_disablecpuid);
#ifdef CONFIG_X86_64
-struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
-struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1,
- (unsigned long) debug_idt_table };
+struct desc_ptr idt_descr __ro_after_init = {
+ .size = NR_VECTORS * 16 - 1,
+ .address = (unsigned long) idt_table,
+};
+const struct desc_ptr debug_idt_descr = {
+ .size = NR_VECTORS * 16 - 1,
+ .address = (unsigned long) debug_idt_table,
+};
DEFINE_PER_CPU_FIRST(union irq_stack_union,
irq_stack_union) __aligned(PAGE_SIZE) __visible;
@@ -1281,7 +1285,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
EXPORT_PER_CPU_SYMBOL(current_task);
DEFINE_PER_CPU(char *, irq_stack_ptr) =
- init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
+ init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE;
DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
@@ -1305,11 +1309,6 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
- /*
- * LSTAR and STAR live in a bit strange symbiosis.
- * They both write to the same internal register. STAR allows to
- * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
- */
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 27e46658ebe3..35691a6b0d32 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -86,3 +86,14 @@ bool __init hypervisor_x2apic_available(void)
x86_hyper->x2apic_available &&
x86_hyper->x2apic_available();
}
+
+void hypervisor_pin_vcpu(int cpu)
+{
+ if (!x86_hyper)
+ return;
+
+ if (x86_hyper->pin_vcpu)
+ x86_hyper->pin_vcpu(cpu);
+ else
+ WARN_ONCE(1, "vcpu pinning requested but not supported!\n");
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 79d8ec849468..a7fdf453d895 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -41,6 +41,7 @@
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>
+#include <linux/jump_label.h>
#include <asm/processor.h>
#include <asm/traps.h>
@@ -292,6 +293,13 @@ static void print_mce(struct mce *m)
if (m->misc)
pr_cont("MISC %llx ", m->misc);
+ if (mce_flags.smca) {
+ if (m->synd)
+ pr_cont("SYND %llx ", m->synd);
+ if (m->ipid)
+ pr_cont("IPID %llx ", m->ipid);
+ }
+
pr_cont("\n");
/*
* Note this output is parsed by external tools and old fields
@@ -568,6 +576,7 @@ static void mce_read_aux(struct mce *m, int i)
{
if (m->status & MCI_STATUS_MISCV)
m->misc = mce_rdmsrl(msr_ops.misc(i));
+
if (m->status & MCI_STATUS_ADDRV) {
m->addr = mce_rdmsrl(msr_ops.addr(i));
@@ -579,6 +588,23 @@ static void mce_read_aux(struct mce *m, int i)
m->addr >>= shift;
m->addr <<= shift;
}
+
+ /*
+ * Extract [55:<lsb>] where lsb is the least significant
+ * *valid* bit of the address bits.
+ */
+ if (mce_flags.smca) {
+ u8 lsb = (m->addr >> 56) & 0x3f;
+
+ m->addr &= GENMASK_ULL(55, lsb);
+ }
+ }
+
+ if (mce_flags.smca) {
+ m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
+
+ if (m->status & MCI_STATUS_SYNDV)
+ m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
}
}
@@ -1633,17 +1659,6 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 6 && c->x86_model == 45)
quirk_no_way_out = quirk_sandybridge_ifu;
- /*
- * MCG_CAP.MCG_SER_P is necessary but not sufficient to know
- * whether this processor will actually generate recoverable
- * machine checks. Check to see if this is an E7 model Xeon.
- * We can't do a model number check because E5 and E7 use the
- * same model number. E5 doesn't support recovery, E7 does.
- */
- if (mca_cfg.recovery || (mca_cfg.ser &&
- !strncmp(c->x86_model_id,
- "Intel(R) Xeon(R) CPU E7-", 24)))
- set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY);
}
if (cfg->monarch_timeout < 0)
cfg->monarch_timeout = 0;
@@ -2080,6 +2095,7 @@ void mce_disable_bank(int bank)
* mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
* mce=nobootlog Don't log MCEs from before booting.
* mce=bios_cmci_threshold Don't program the CMCI threshold
+ * mce=recovery force enable memcpy_mcsafe()
*/
static int __init mcheck_enable(char *str)
{
@@ -2676,8 +2692,14 @@ static int __init mcheck_debugfs_init(void)
static int __init mcheck_debugfs_init(void) { return -EINVAL; }
#endif
+DEFINE_STATIC_KEY_FALSE(mcsafe_key);
+EXPORT_SYMBOL_GPL(mcsafe_key);
+
static int __init mcheck_late_init(void)
{
+ if (mca_cfg.recovery)
+ static_branch_inc(&mcsafe_key);
+
mcheck_debugfs_init();
/*
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 7b7f3be783d4..9b5403462936 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -20,6 +20,7 @@
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/smp.h>
+#include <linux/string.h>
#include <asm/amd_nb.h>
#include <asm/apic.h>
@@ -63,34 +64,71 @@ static const char * const th_names[] = {
"execution_unit",
};
-/* Define HWID to IP type mappings for Scalable MCA */
-struct amd_hwid amd_hwids[] = {
- [SMCA_F17H_CORE] = { "f17h_core", 0xB0 },
- [SMCA_DF] = { "data_fabric", 0x2E },
- [SMCA_UMC] = { "umc", 0x96 },
- [SMCA_PB] = { "param_block", 0x5 },
- [SMCA_PSP] = { "psp", 0xFF },
- [SMCA_SMU] = { "smu", 0x1 },
+static const char * const smca_umc_block_names[] = {
+ "dram_ecc",
+ "misc_umc"
};
-EXPORT_SYMBOL_GPL(amd_hwids);
-
-const char * const amd_core_mcablock_names[] = {
- [SMCA_LS] = "load_store",
- [SMCA_IF] = "insn_fetch",
- [SMCA_L2_CACHE] = "l2_cache",
- [SMCA_DE] = "decode_unit",
- [RES] = "",
- [SMCA_EX] = "execution_unit",
- [SMCA_FP] = "floating_point",
- [SMCA_L3_CACHE] = "l3_cache",
+
+struct smca_bank_name smca_bank_names[] = {
+ [SMCA_LS] = { "load_store", "Load Store Unit" },
+ [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
+ [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
+ [SMCA_DE] = { "decode_unit", "Decode Unit" },
+ [SMCA_EX] = { "execution_unit", "Execution Unit" },
+ [SMCA_FP] = { "floating_point", "Floating Point Unit" },
+ [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
+ [SMCA_CS] = { "coherent_slave", "Coherent Slave" },
+ [SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
+ [SMCA_UMC] = { "umc", "Unified Memory Controller" },
+ [SMCA_PB] = { "param_block", "Parameter Block" },
+ [SMCA_PSP] = { "psp", "Platform Security Processor" },
+ [SMCA_SMU] = { "smu", "System Management Unit" },
};
-EXPORT_SYMBOL_GPL(amd_core_mcablock_names);
+EXPORT_SYMBOL_GPL(smca_bank_names);
+
+static struct smca_hwid_mcatype smca_hwid_mcatypes[] = {
+ /* { bank_type, hwid_mcatype, xec_bitmap } */
+
+ /* ZN Core (HWID=0xB0) MCA types */
+ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF },
+ { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF },
+ { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF },
+ { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF },
+ /* HWID 0xB0 MCATYPE 0x4 is Reserved */
+ { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0x7FF },
+ { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F },
+ { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF },
+
+ /* Data Fabric MCA types */
+ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF },
+ { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF },
+
+ /* Unified Memory Controller MCA type */
+ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F },
+
+ /* Parameter Block MCA type */
+ { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 },
+
+ /* Platform Security Processor MCA type */
+ { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 },
-const char * const amd_df_mcablock_names[] = {
- [SMCA_CS] = "coherent_slave",
- [SMCA_PIE] = "pie",
+ /* System Management Unit MCA type */
+ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 },
};
-EXPORT_SYMBOL_GPL(amd_df_mcablock_names);
+
+struct smca_bank_info smca_banks[MAX_NR_BANKS];
+EXPORT_SYMBOL_GPL(smca_banks);
+
+/*
+ * In SMCA enabled processors, we can have multiple banks for a given IP type.
+ * So to define a unique name for each bank, we use a temp c-string to append
+ * the MCA_IPID[InstanceId] to type's name in get_name().
+ *
+ * InstanceId is 32 bits which is 8 characters. Make sure MAX_MCATYPE_NAME_LEN
+ * is greater than 8 plus 1 (for underscore) plus length of longest type name.
+ */
+#define MAX_MCATYPE_NAME_LEN 30
+static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */
@@ -108,6 +146,36 @@ void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
* CPU Initialization
*/
+static void get_smca_bank_info(unsigned int bank)
+{
+ unsigned int i, hwid_mcatype, cpu = smp_processor_id();
+ struct smca_hwid_mcatype *type;
+ u32 high, instanceId;
+ u16 hwid, mcatype;
+
+ /* Collect bank_info using CPU 0 for now. */
+ if (cpu)
+ return;
+
+ if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instanceId, &high)) {
+ pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
+ return;
+ }
+
+ hwid = high & MCI_IPID_HWID;
+ mcatype = (high & MCI_IPID_MCATYPE) >> 16;
+ hwid_mcatype = HWID_MCATYPE(hwid, mcatype);
+
+ for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
+ type = &smca_hwid_mcatypes[i];
+ if (hwid_mcatype == type->hwid_mcatype) {
+ smca_banks[bank].type = type;
+ smca_banks[bank].type_instance = instanceId;
+ break;
+ }
+ }
+}
+
struct thresh_restart {
struct threshold_block *b;
int reset;
@@ -293,7 +361,7 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
wrmsr(MSR_CU_DEF_ERR, low, high);
}
-static u32 get_block_address(u32 current_addr, u32 low, u32 high,
+static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 high,
unsigned int bank, unsigned int block)
{
u32 addr = 0, offset = 0;
@@ -309,13 +377,13 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high,
*/
u32 low, high;
- if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
+ if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
return addr;
if (!(low & MCI_CONFIG_MCAX))
return addr;
- if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
+ if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
(low & MASK_BLKPTR_LO))
addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
}
@@ -395,6 +463,20 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
*/
smca_high &= ~BIT(2);
+ /*
+ * SMCA sets the Deferred Error Interrupt type per bank.
+ *
+ * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
+ * if the DeferredIntType bit field is available.
+ *
+ * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
+ * high portion of the MSR). OS should set this to 0x1 to enable
+ * APIC based interrupt. First, check that no interrupt has been
+ * set.
+ */
+ if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3))
+ smca_high |= BIT(5);
+
wrmsr(smca_addr, smca_low, smca_high);
}
@@ -421,12 +503,15 @@ out:
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
u32 low = 0, high = 0, address = 0;
- unsigned int bank, block;
+ unsigned int bank, block, cpu = smp_processor_id();
int offset = -1;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
+ if (mce_flags.smca)
+ get_smca_bank_info(bank);
+
for (block = 0; block < NR_BLOCKS; ++block) {
- address = get_block_address(address, low, high, bank, block);
+ address = get_block_address(cpu, address, low, high, bank, block);
if (!address)
break;
@@ -476,9 +561,27 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
if (threshold_err)
m.misc = misc;
- if (m.status & MCI_STATUS_ADDRV)
+ if (m.status & MCI_STATUS_ADDRV) {
rdmsrl(msr_addr, m.addr);
+ /*
+ * Extract [55:<lsb>] where lsb is the least significant
+ * *valid* bit of the address bits.
+ */
+ if (mce_flags.smca) {
+ u8 lsb = (m.addr >> 56) & 0x3f;
+
+ m.addr &= GENMASK_ULL(55, lsb);
+ }
+ }
+
+ if (mce_flags.smca) {
+ rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid);
+
+ if (m.status & MCI_STATUS_SYNDV)
+ rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd);
+ }
+
mce_log(&m);
wrmsrl(msr_status, 0);
@@ -541,15 +644,14 @@ static void amd_deferred_error_interrupt(void)
static void amd_threshold_interrupt(void)
{
u32 low = 0, high = 0, address = 0;
- int cpu = smp_processor_id();
- unsigned int bank, block;
+ unsigned int bank, block, cpu = smp_processor_id();
/* assume first bank caused it */
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
for (block = 0; block < NR_BLOCKS; ++block) {
- address = get_block_address(address, low, high, bank, block);
+ address = get_block_address(cpu, address, low, high, bank, block);
if (!address)
break;
@@ -713,6 +815,34 @@ static struct kobj_type threshold_ktype = {
.default_attrs = default_attrs,
};
+static const char *get_name(unsigned int bank, struct threshold_block *b)
+{
+ unsigned int bank_type;
+
+ if (!mce_flags.smca) {
+ if (b && bank == 4)
+ return bank4_names(b);
+
+ return th_names[bank];
+ }
+
+ if (!smca_banks[bank].type)
+ return NULL;
+
+ bank_type = smca_banks[bank].type->bank_type;
+
+ if (b && bank_type == SMCA_UMC) {
+ if (b->block < ARRAY_SIZE(smca_umc_block_names))
+ return smca_umc_block_names[b->block];
+ return NULL;
+ }
+
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
+ "%s_%x", smca_bank_names[bank_type].name,
+ smca_banks[bank].type_instance);
+ return buf_mcatype;
+}
+
static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
unsigned int block, u32 address)
{
@@ -767,11 +897,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
err = kobject_init_and_add(&b->kobj, &threshold_ktype,
per_cpu(threshold_banks, cpu)[bank]->kobj,
- (bank == 4 ? bank4_names(b) : th_names[bank]));
+ get_name(bank, b));
if (err)
goto out_free;
recurse:
- address = get_block_address(address, low, high, bank, ++block);
+ address = get_block_address(cpu, address, low, high, bank, ++block);
if (!address)
return 0;
@@ -822,7 +952,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
struct device *dev = per_cpu(mce_device, cpu);
struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
- const char *name = th_names[bank];
+ const char *name = get_name(bank, NULL);
int err = 0;
if (is_shared_bank(bank)) {
@@ -869,7 +999,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
}
}
- err = allocate_threshold_blocks(cpu, bank, 0, MSR_IA32_MCx_MISC(bank));
+ err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank));
if (!err)
goto out;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 27a0228c9cae..620ab06bcf45 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -54,6 +54,7 @@ static LIST_HEAD(pcache);
*/
static u8 *container;
static size_t container_size;
+static bool ucode_builtin;
static u32 ucode_new_rev;
static u8 amd_ucode_patch[PATCH_MAX_SIZE];
@@ -281,18 +282,22 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp,
void __init load_ucode_amd_bsp(unsigned int family)
{
struct cpio_data cp;
+ bool *builtin;
void **data;
size_t *size;
#ifdef CONFIG_X86_32
data = (void **)__pa_nodebug(&ucode_cpio.data);
size = (size_t *)__pa_nodebug(&ucode_cpio.size);
+ builtin = (bool *)__pa_nodebug(&ucode_builtin);
#else
data = &ucode_cpio.data;
size = &ucode_cpio.size;
+ builtin = &ucode_builtin;
#endif
- if (!load_builtin_amd_microcode(&cp, family))
+ *builtin = load_builtin_amd_microcode(&cp, family);
+ if (!*builtin)
cp = find_ucode_in_initrd();
if (!(cp.data && cp.size))
@@ -355,6 +360,7 @@ void load_ucode_amd_ap(void)
unsigned int cpu = smp_processor_id();
struct equiv_cpu_entry *eq;
struct microcode_amd *mc;
+ u8 *cont = container;
u32 rev, eax;
u16 eq_id;
@@ -371,8 +377,12 @@ void load_ucode_amd_ap(void)
if (check_current_patch_level(&rev, false))
return;
+ /* Add CONFIG_RANDOMIZE_MEMORY offset. */
+ if (!ucode_builtin)
+ cont += PAGE_OFFSET - __PAGE_OFFSET_BASE;
+
eax = cpuid_eax(0x00000001);
- eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ);
+ eq = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ);
eq_id = find_equiv_id(eq, eax);
if (!eq_id)
@@ -434,6 +444,10 @@ int __init save_microcode_in_initrd_amd(void)
else
container = cont_va;
+ /* Add CONFIG_RANDOMIZE_MEMORY offset. */
+ if (!ucode_builtin)
+ container += PAGE_OFFSET - __PAGE_OFFSET_BASE;
+
eax = cpuid_eax(0x00000001);
eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index df04b2d033f6..5ce5155f0695 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -558,55 +558,36 @@ static struct syscore_ops mc_syscore_ops = {
.resume = mc_bp_resume,
};
-static int
-mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
+static int mc_cpu_online(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
struct device *dev;
dev = get_cpu_device(cpu);
+ microcode_update_cpu(cpu);
+ pr_debug("CPU%d added\n", cpu);
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
- microcode_update_cpu(cpu);
- pr_debug("CPU%d added\n", cpu);
- /*
- * "break" is missing on purpose here because we want to fall
- * through in order to create the sysfs group.
- */
-
- case CPU_DOWN_FAILED:
- if (sysfs_create_group(&dev->kobj, &mc_attr_group))
- pr_err("Failed to create group for CPU%d\n", cpu);
- break;
+ if (sysfs_create_group(&dev->kobj, &mc_attr_group))
+ pr_err("Failed to create group for CPU%d\n", cpu);
+ return 0;
+}
- case CPU_DOWN_PREPARE:
- /* Suspend is in progress, only remove the interface */
- sysfs_remove_group(&dev->kobj, &mc_attr_group);
- pr_debug("CPU%d removed\n", cpu);
- break;
+static int mc_cpu_down_prep(unsigned int cpu)
+{
+ struct device *dev;
+ dev = get_cpu_device(cpu);
+ /* Suspend is in progress, only remove the interface */
+ sysfs_remove_group(&dev->kobj, &mc_attr_group);
+ pr_debug("CPU%d removed\n", cpu);
/*
- * case CPU_DEAD:
- *
* When a CPU goes offline, don't free up or invalidate the copy of
* the microcode in kernel memory, so that we can reuse it when the
* CPU comes back online without unnecessarily requesting the userspace
* for it again.
*/
- }
-
- /* The CPU refused to come up during a system resume */
- if (action == CPU_UP_CANCELED_FROZEN)
- microcode_fini_cpu(cpu);
-
- return NOTIFY_OK;
+ return 0;
}
-static struct notifier_block mc_cpu_notifier = {
- .notifier_call = mc_cpu_callback,
-};
-
static struct attribute *cpu_root_microcode_attrs[] = {
&dev_attr_reload.attr,
NULL
@@ -665,7 +646,8 @@ int __init microcode_init(void)
goto out_ucode_group;
register_syscore_ops(&mc_syscore_ops);
- register_hotcpu_notifier(&mc_cpu_notifier);
+ cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
+ mc_cpu_online, mc_cpu_down_prep);
pr_info("Microcode Update Driver: v" MICROCODE_VERSION
" <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 28f1b54b7fad..24e87e74990d 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -72,14 +72,14 @@ static DEFINE_MUTEX(mtrr_mutex);
u64 size_or_mask, size_and_mask;
static bool mtrr_aps_delayed_init;
-static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
+static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init;
const struct mtrr_ops *mtrr_if;
static void set_mtrr(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type);
-void set_mtrr_ops(const struct mtrr_ops *ops)
+void __init set_mtrr_ops(const struct mtrr_ops *ops)
{
if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
mtrr_ops[ops->vendor] = ops;
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 6c7ced07d16d..ad8bd763efa5 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -54,7 +54,7 @@ void fill_mtrr_var_range(unsigned int index,
bool get_mtrr_state(void);
void mtrr_bp_pat_init(void);
-extern void set_mtrr_ops(const struct mtrr_ops *ops);
+extern void __init set_mtrr_ops(const struct mtrr_ops *ops);
extern u64 size_or_mask, size_and_mask;
extern const struct mtrr_ops *mtrr_if;
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 92e8f0a7159c..9b7cf5c28f5f 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -17,7 +17,7 @@
#include <linux/sysfs.h>
#include <asm/stacktrace.h>
-
+#include <asm/unwind.h>
int panic_on_unrecovered_nmi;
int panic_on_io_nmi;
@@ -25,11 +25,29 @@ unsigned int code_bytes = 64;
int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
static int die_counter;
+bool in_task_stack(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info)
+{
+ unsigned long *begin = task_stack_page(task);
+ unsigned long *end = task_stack_page(task) + THREAD_SIZE;
+
+ if (stack < begin || stack >= end)
+ return false;
+
+ info->type = STACK_TYPE_TASK;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = NULL;
+
+ return true;
+}
+
static void printk_stack_address(unsigned long address, int reliable,
- void *data)
+ char *log_lvl)
{
+ touch_nmi_watchdog();
printk("%s [<%p>] %s%pB\n",
- (char *)data, (void *)address, reliable ? "" : "? ",
+ log_lvl, (void *)address, reliable ? "" : "? ",
(void *)address);
}
@@ -38,176 +56,120 @@ void printk_address(unsigned long address)
pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address);
}
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static void
-print_ftrace_graph_addr(unsigned long addr, void *data,
- const struct stacktrace_ops *ops,
- struct task_struct *task, int *graph)
+void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, char *log_lvl)
{
- unsigned long ret_addr;
- int index;
-
- if (addr != (unsigned long)return_to_handler)
- return;
-
- index = task->curr_ret_stack;
-
- if (!task->ret_stack || index < *graph)
- return;
-
- index -= *graph;
- ret_addr = task->ret_stack[index].ret;
-
- ops->address(data, ret_addr, 1);
+ struct unwind_state state;
+ struct stack_info stack_info = {0};
+ unsigned long visit_mask = 0;
+ int graph_idx = 0;
- (*graph)++;
-}
-#else
-static inline void
-print_ftrace_graph_addr(unsigned long addr, void *data,
- const struct stacktrace_ops *ops,
- struct task_struct *task, int *graph)
-{ }
-#endif
-
-/*
- * x86-64 can have up to three kernel stacks:
- * process stack
- * interrupt stack
- * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
- */
-
-static inline int valid_stack_ptr(struct task_struct *task,
- void *p, unsigned int size, void *end)
-{
- void *t = task_stack_page(task);
- if (end) {
- if (p < end && p >= (end-THREAD_SIZE))
- return 1;
- else
- return 0;
- }
- return p >= t && p < t + THREAD_SIZE - size;
-}
+ printk("%sCall Trace:\n", log_lvl);
-unsigned long
-print_context_stack(struct task_struct *task,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end, int *graph)
-{
- struct stack_frame *frame = (struct stack_frame *)bp;
+ unwind_start(&state, task, regs, stack);
/*
- * If we overflowed the stack into a guard page, jump back to the
- * bottom of the usable stack.
+ * Iterate through the stacks, starting with the current stack pointer.
+ * Each stack has a pointer to the next one.
+ *
+ * x86-64 can have several stacks:
+ * - task stack
+ * - interrupt stack
+ * - HW exception stacks (double fault, nmi, debug, mce)
+ *
+ * x86-32 can have up to three stacks:
+ * - task stack
+ * - softirq stack
+ * - hardirq stack
*/
- if ((unsigned long)task_stack_page(task) - (unsigned long)stack <
- PAGE_SIZE)
- stack = (unsigned long *)task_stack_page(task);
-
- while (valid_stack_ptr(task, stack, sizeof(*stack), end)) {
- unsigned long addr;
-
- addr = *stack;
- if (__kernel_text_address(addr)) {
- if ((unsigned long) stack == bp + sizeof(long)) {
- ops->address(data, addr, 1);
- frame = frame->next_frame;
- bp = (unsigned long) frame;
- } else {
- ops->address(data, addr, 0);
- }
- print_ftrace_graph_addr(addr, data, ops, task, graph);
- }
- stack++;
- }
- return bp;
-}
-EXPORT_SYMBOL_GPL(print_context_stack);
-
-unsigned long
-print_context_stack_bp(struct task_struct *task,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end, int *graph)
-{
- struct stack_frame *frame = (struct stack_frame *)bp;
- unsigned long *ret_addr = &frame->return_address;
+ for (; stack; stack = stack_info.next_sp) {
+ const char *str_begin, *str_end;
- while (valid_stack_ptr(task, ret_addr, sizeof(*ret_addr), end)) {
- unsigned long addr = *ret_addr;
+ /*
+ * If we overflowed the task stack into a guard page, jump back
+ * to the bottom of the usable stack.
+ */
+ if (task_stack_page(task) - (void *)stack < PAGE_SIZE)
+ stack = task_stack_page(task);
- if (!__kernel_text_address(addr))
+ if (get_stack_info(stack, task, &stack_info, &visit_mask))
break;
- if (ops->address(data, addr, 1))
- break;
- frame = frame->next_frame;
- ret_addr = &frame->return_address;
- print_ftrace_graph_addr(addr, data, ops, task, graph);
- }
-
- return (unsigned long)frame;
-}
-EXPORT_SYMBOL_GPL(print_context_stack_bp);
-
-static int print_trace_stack(void *data, char *name)
-{
- printk("%s <%s> ", (char *)data, name);
- return 0;
-}
-
-/*
- * Print one address/symbol entries per line.
- */
-static int print_trace_address(void *data, unsigned long addr, int reliable)
-{
- touch_nmi_watchdog();
- printk_stack_address(addr, reliable, data);
- return 0;
-}
-
-static const struct stacktrace_ops print_trace_ops = {
- .stack = print_trace_stack,
- .address = print_trace_address,
- .walk_stack = print_context_stack,
-};
-
-void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
-{
- printk("%sCall Trace:\n", log_lvl);
- dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
-}
+ stack_type_str(stack_info.type, &str_begin, &str_end);
+ if (str_begin)
+ printk("%s <%s> ", log_lvl, str_begin);
+
+ /*
+ * Scan the stack, printing any text addresses we find. At the
+ * same time, follow proper stack frames with the unwinder.
+ *
+ * Addresses found during the scan which are not reported by
+ * the unwinder are considered to be additional clues which are
+ * sometimes useful for debugging and are prefixed with '?'.
+ * This also serves as a failsafe option in case the unwinder
+ * goes off in the weeds.
+ */
+ for (; stack < stack_info.end; stack++) {
+ unsigned long real_addr;
+ int reliable = 0;
+ unsigned long addr = *stack;
+ unsigned long *ret_addr_p =
+ unwind_get_return_address_ptr(&state);
+
+ if (!__kernel_text_address(addr))
+ continue;
+
+ if (stack == ret_addr_p)
+ reliable = 1;
+
+ /*
+ * When function graph tracing is enabled for a
+ * function, its return address on the stack is
+ * replaced with the address of an ftrace handler
+ * (return_to_handler). In that case, before printing
+ * the "real" address, we want to print the handler
+ * address as an "unreliable" hint that function graph
+ * tracing was involved.
+ */
+ real_addr = ftrace_graph_ret_addr(task, &graph_idx,
+ addr, stack);
+ if (real_addr != addr)
+ printk_stack_address(addr, 0, log_lvl);
+ printk_stack_address(real_addr, reliable, log_lvl);
+
+ if (!reliable)
+ continue;
+
+ /*
+ * Get the next frame from the unwinder. No need to
+ * check for an error: if anything goes wrong, the rest
+ * of the addresses will just be printed as unreliable.
+ */
+ unwind_next_frame(&state);
+ }
-void show_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp)
-{
- show_trace_log_lvl(task, regs, stack, bp, "");
+ if (str_end)
+ printk("%s <%s> ", log_lvl, str_end);
+ }
}
void show_stack(struct task_struct *task, unsigned long *sp)
{
- unsigned long bp = 0;
- unsigned long stack;
+ task = task ? : current;
/*
* Stack frames below this one aren't interesting. Don't show them
* if we're printing for %current.
*/
- if (!sp && (!task || task == current)) {
- sp = &stack;
- bp = stack_frame(current, NULL);
- }
+ if (!sp && task == current)
+ sp = get_stack_pointer(current, NULL);
- show_stack_log_lvl(task, NULL, sp, bp, "");
+ show_stack_log_lvl(task, NULL, sp, "");
}
void show_stack_regs(struct pt_regs *regs)
{
- show_stack_log_lvl(current, regs, (unsigned long *)regs->sp, regs->bp, "");
+ show_stack_log_lvl(current, regs, NULL, "");
}
static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 09675712eba8..06eb322b5f9f 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -16,93 +16,121 @@
#include <asm/stacktrace.h>
-static void *is_irq_stack(void *p, void *irq)
+void stack_type_str(enum stack_type type, const char **begin, const char **end)
{
- if (p < irq || p >= (irq + THREAD_SIZE))
- return NULL;
- return irq + THREAD_SIZE;
+ switch (type) {
+ case STACK_TYPE_IRQ:
+ case STACK_TYPE_SOFTIRQ:
+ *begin = "IRQ";
+ *end = "EOI";
+ break;
+ default:
+ *begin = NULL;
+ *end = NULL;
+ }
}
-
-static void *is_hardirq_stack(unsigned long *stack, int cpu)
+static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
{
- void *irq = per_cpu(hardirq_stack, cpu);
+ unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack);
+ unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
- return is_irq_stack(stack, irq);
-}
+ /*
+ * This is a software stack, so 'end' can be a valid stack pointer.
+ * It just means the stack is empty.
+ */
+ if (stack < begin || stack > end)
+ return false;
-static void *is_softirq_stack(unsigned long *stack, int cpu)
-{
- void *irq = per_cpu(softirq_stack, cpu);
+ info->type = STACK_TYPE_IRQ;
+ info->begin = begin;
+ info->end = end;
- return is_irq_stack(stack, irq);
+ /*
+ * See irq_32.c -- the next stack pointer is stored at the beginning of
+ * the stack.
+ */
+ info->next_sp = (unsigned long *)*begin;
+
+ return true;
}
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data)
+static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
{
- const unsigned cpu = get_cpu();
- int graph = 0;
- u32 *prev_esp;
+ unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack);
+ unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
- if (!task)
- task = current;
+ /*
+ * This is a software stack, so 'end' can be a valid stack pointer.
+ * It just means the stack is empty.
+ */
+ if (stack < begin || stack > end)
+ return false;
- if (!stack) {
- unsigned long dummy;
+ info->type = STACK_TYPE_SOFTIRQ;
+ info->begin = begin;
+ info->end = end;
- stack = &dummy;
- if (task != current)
- stack = (unsigned long *)task->thread.sp;
- }
+ /*
+ * The next stack pointer is stored at the beginning of the stack.
+ * See irq_32.c.
+ */
+ info->next_sp = (unsigned long *)*begin;
- if (!bp)
- bp = stack_frame(task, regs);
+ return true;
+}
- for (;;) {
- void *end_stack;
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info, unsigned long *visit_mask)
+{
+ if (!stack)
+ goto unknown;
- end_stack = is_hardirq_stack(stack, cpu);
- if (!end_stack)
- end_stack = is_softirq_stack(stack, cpu);
+ task = task ? : current;
- bp = ops->walk_stack(task, stack, bp, ops, data,
- end_stack, &graph);
+ if (in_task_stack(stack, task, info))
+ goto recursion_check;
- /* Stop if not on irq stack */
- if (!end_stack)
- break;
+ if (task != current)
+ goto unknown;
- /* The previous esp is saved on the bottom of the stack */
- prev_esp = (u32 *)(end_stack - THREAD_SIZE);
- stack = (unsigned long *)*prev_esp;
- if (!stack)
- break;
+ if (in_hardirq_stack(stack, info))
+ goto recursion_check;
- if (ops->stack(data, "IRQ") < 0)
- break;
- touch_nmi_watchdog();
+ if (in_softirq_stack(stack, info))
+ goto recursion_check;
+
+ goto unknown;
+
+recursion_check:
+ /*
+ * Make sure we don't iterate through any given stack more than once.
+ * If it comes up a second time then there's something wrong going on:
+ * just break out and report an unknown stack type.
+ */
+ if (visit_mask) {
+ if (*visit_mask & (1UL << info->type))
+ goto unknown;
+ *visit_mask |= 1UL << info->type;
}
- put_cpu();
+
+ return 0;
+
+unknown:
+ info->type = STACK_TYPE_UNKNOWN;
+ return -EINVAL;
}
-EXPORT_SYMBOL(dump_trace);
-void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, char *log_lvl)
{
unsigned long *stack;
int i;
- if (sp == NULL) {
- if (regs)
- sp = (unsigned long *)regs->sp;
- else if (task)
- sp = (unsigned long *)task->thread.sp;
- else
- sp = (unsigned long *)&sp;
- }
+ if (!try_get_task_stack(task))
+ return;
+
+ sp = sp ? : get_stack_pointer(task, regs);
stack = sp;
for (i = 0; i < kstack_depth_to_print; i++) {
@@ -117,7 +145,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
touch_nmi_watchdog();
}
pr_cont("\n");
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ show_trace_log_lvl(task, regs, sp, log_lvl);
+
+ put_task_stack(task);
}
@@ -139,7 +169,7 @@ void show_regs(struct pt_regs *regs)
u8 *ip;
pr_emerg("Stack:\n");
- show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
+ show_stack_log_lvl(current, regs, NULL, KERN_EMERG);
pr_emerg("Code:");
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 9ee4520ce83c..36cf1a498227 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -16,261 +16,145 @@
#include <asm/stacktrace.h>
+static char *exception_stack_names[N_EXCEPTION_STACKS] = {
+ [ DOUBLEFAULT_STACK-1 ] = "#DF",
+ [ NMI_STACK-1 ] = "NMI",
+ [ DEBUG_STACK-1 ] = "#DB",
+ [ MCE_STACK-1 ] = "#MC",
+};
-#define N_EXCEPTION_STACKS_END \
- (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
-
-static char x86_stack_ids[][8] = {
- [ DEBUG_STACK-1 ] = "#DB",
- [ NMI_STACK-1 ] = "NMI",
- [ DOUBLEFAULT_STACK-1 ] = "#DF",
- [ MCE_STACK-1 ] = "#MC",
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
- [ N_EXCEPTION_STACKS ...
- N_EXCEPTION_STACKS_END ] = "#DB[?]"
-#endif
+static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+ [DEBUG_STACK - 1] = DEBUG_STKSZ
};
-static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
- unsigned *usedp, char **idp)
+void stack_type_str(enum stack_type type, const char **begin, const char **end)
{
- unsigned k;
-
- /*
- * Iterate over all exception stacks, and figure out whether
- * 'stack' is in one of them:
- */
- for (k = 0; k < N_EXCEPTION_STACKS; k++) {
- unsigned long end = per_cpu(orig_ist, cpu).ist[k];
- /*
- * Is 'stack' above this exception frame's end?
- * If yes then skip to the next frame.
- */
- if (stack >= end)
- continue;
- /*
- * Is 'stack' above this exception frame's start address?
- * If yes then we found the right frame.
- */
- if (stack >= end - EXCEPTION_STKSZ) {
- /*
- * Make sure we only iterate through an exception
- * stack once. If it comes up for the second time
- * then there's something wrong going on - just
- * break out and return NULL:
- */
- if (*usedp & (1U << k))
- break;
- *usedp |= 1U << k;
- *idp = x86_stack_ids[k];
- return (unsigned long *)end;
- }
- /*
- * If this is a debug stack, and if it has a larger size than
- * the usual exception stacks, then 'stack' might still
- * be within the lower portion of the debug stack:
- */
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
- if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
- unsigned j = N_EXCEPTION_STACKS - 1;
-
- /*
- * Black magic. A large debug stack is composed of
- * multiple exception stack entries, which we
- * iterate through now. Dont look:
- */
- do {
- ++j;
- end -= EXCEPTION_STKSZ;
- x86_stack_ids[j][4] = '1' +
- (j - N_EXCEPTION_STACKS);
- } while (stack < end - EXCEPTION_STKSZ);
- if (*usedp & (1U << j))
- break;
- *usedp |= 1U << j;
- *idp = x86_stack_ids[j];
- return (unsigned long *)end;
- }
-#endif
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
+
+ switch (type) {
+ case STACK_TYPE_IRQ:
+ *begin = "IRQ";
+ *end = "EOI";
+ break;
+ case STACK_TYPE_EXCEPTION ... STACK_TYPE_EXCEPTION_LAST:
+ *begin = exception_stack_names[type - STACK_TYPE_EXCEPTION];
+ *end = "EOE";
+ break;
+ default:
+ *begin = NULL;
+ *end = NULL;
}
- return NULL;
}
-static inline int
-in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
- unsigned long *irq_stack_end)
+static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
{
- return (stack >= irq_stack && stack < irq_stack_end);
-}
-
-static const unsigned long irq_stack_size =
- (IRQ_STACK_SIZE - 64) / sizeof(unsigned long);
-
-enum stack_type {
- STACK_IS_UNKNOWN,
- STACK_IS_NORMAL,
- STACK_IS_EXCEPTION,
- STACK_IS_IRQ,
-};
-
-static enum stack_type
-analyze_stack(int cpu, struct task_struct *task, unsigned long *stack,
- unsigned long **stack_end, unsigned long *irq_stack,
- unsigned *used, char **id)
-{
- unsigned long addr;
+ unsigned long *begin, *end;
+ struct pt_regs *regs;
+ unsigned k;
- addr = ((unsigned long)stack & (~(THREAD_SIZE - 1)));
- if ((unsigned long)task_stack_page(task) == addr)
- return STACK_IS_NORMAL;
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
- *stack_end = in_exception_stack(cpu, (unsigned long)stack,
- used, id);
- if (*stack_end)
- return STACK_IS_EXCEPTION;
+ for (k = 0; k < N_EXCEPTION_STACKS; k++) {
+ end = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k];
+ begin = end - (exception_stack_sizes[k] / sizeof(long));
+ regs = (struct pt_regs *)end - 1;
- if (!irq_stack)
- return STACK_IS_NORMAL;
+ if (stack < begin || stack >= end)
+ continue;
- *stack_end = irq_stack;
- irq_stack = irq_stack - irq_stack_size;
+ info->type = STACK_TYPE_EXCEPTION + k;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = (unsigned long *)regs->sp;
- if (in_irq_stack(stack, irq_stack, *stack_end))
- return STACK_IS_IRQ;
+ return true;
+ }
- return STACK_IS_UNKNOWN;
+ return false;
}
-/*
- * x86-64 can have up to three kernel stacks:
- * process stack
- * interrupt stack
- * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
- */
-
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data)
+static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
{
- const unsigned cpu = get_cpu();
- unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu);
- unsigned long dummy;
- unsigned used = 0;
- int graph = 0;
- int done = 0;
-
- if (!task)
- task = current;
-
- if (!stack) {
- if (regs)
- stack = (unsigned long *)regs->sp;
- else if (task != current)
- stack = (unsigned long *)task->thread.sp;
- else
- stack = &dummy;
- }
+ unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr);
+ unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long));
- if (!bp)
- bp = stack_frame(task, regs);
/*
- * Print function call entries in all stacks, starting at the
- * current stack address. If the stacks consist of nested
- * exceptions
+ * This is a software stack, so 'end' can be a valid stack pointer.
+ * It just means the stack is empty.
*/
- while (!done) {
- unsigned long *stack_end;
- enum stack_type stype;
- char *id;
+ if (stack < begin || stack > end)
+ return false;
- stype = analyze_stack(cpu, task, stack, &stack_end,
- irq_stack, &used, &id);
+ info->type = STACK_TYPE_IRQ;
+ info->begin = begin;
+ info->end = end;
- /* Default finish unless specified to continue */
- done = 1;
+ /*
+ * The next stack pointer is the first thing pushed by the entry code
+ * after switching to the irq stack.
+ */
+ info->next_sp = (unsigned long *)*(end - 1);
- switch (stype) {
+ return true;
+}
- /* Break out early if we are on the thread stack */
- case STACK_IS_NORMAL:
- break;
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info, unsigned long *visit_mask)
+{
+ if (!stack)
+ goto unknown;
- case STACK_IS_EXCEPTION:
+ task = task ? : current;
- if (ops->stack(data, id) < 0)
- break;
+ if (in_task_stack(stack, task, info))
+ goto recursion_check;
- bp = ops->walk_stack(task, stack, bp, ops,
- data, stack_end, &graph);
- ops->stack(data, "<EOE>");
- /*
- * We link to the next stack via the
- * second-to-last pointer (index -2 to end) in the
- * exception stack:
- */
- stack = (unsigned long *) stack_end[-2];
- done = 0;
- break;
+ if (task != current)
+ goto unknown;
- case STACK_IS_IRQ:
+ if (in_exception_stack(stack, info))
+ goto recursion_check;
- if (ops->stack(data, "IRQ") < 0)
- break;
- bp = ops->walk_stack(task, stack, bp,
- ops, data, stack_end, &graph);
- /*
- * We link to the next stack (which would be
- * the process stack normally) the last
- * pointer (index -1 to end) in the IRQ stack:
- */
- stack = (unsigned long *) (stack_end[-1]);
- irq_stack = NULL;
- ops->stack(data, "EOI");
- done = 0;
- break;
+ if (in_irq_stack(stack, info))
+ goto recursion_check;
- case STACK_IS_UNKNOWN:
- ops->stack(data, "UNK");
- break;
- }
- }
+ goto unknown;
+recursion_check:
/*
- * This handles the process stack:
+ * Make sure we don't iterate through any given stack more than once.
+ * If it comes up a second time then there's something wrong going on:
+ * just break out and report an unknown stack type.
*/
- bp = ops->walk_stack(task, stack, bp, ops, data, NULL, &graph);
- put_cpu();
+ if (visit_mask) {
+ if (*visit_mask & (1UL << info->type))
+ goto unknown;
+ *visit_mask |= 1UL << info->type;
+ }
+
+ return 0;
+
+unknown:
+ info->type = STACK_TYPE_UNKNOWN;
+ return -EINVAL;
}
-EXPORT_SYMBOL(dump_trace);
-void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, char *log_lvl)
{
unsigned long *irq_stack_end;
unsigned long *irq_stack;
unsigned long *stack;
- int cpu;
int i;
- preempt_disable();
- cpu = smp_processor_id();
+ if (!try_get_task_stack(task))
+ return;
- irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
- irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
+ irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr);
+ irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long));
- /*
- * Debugging aid: "show_stack(NULL, NULL);" prints the
- * back trace for this cpu:
- */
- if (sp == NULL) {
- if (regs)
- sp = (unsigned long *)regs->sp;
- else if (task)
- sp = (unsigned long *)task->thread.sp;
- else
- sp = (unsigned long *)&sp;
- }
+ sp = sp ? : get_stack_pointer(task, regs);
stack = sp;
for (i = 0; i < kstack_depth_to_print; i++) {
@@ -299,18 +183,17 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
stack++;
touch_nmi_watchdog();
}
- preempt_enable();
pr_cont("\n");
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ show_trace_log_lvl(task, regs, sp, log_lvl);
+
+ put_task_stack(task);
}
void show_regs(struct pt_regs *regs)
{
int i;
- unsigned long sp;
- sp = regs->sp;
show_regs_print_info(KERN_DEFAULT);
__show_regs(regs, 1);
@@ -325,8 +208,7 @@ void show_regs(struct pt_regs *regs)
u8 *ip;
printk(KERN_DEFAULT "Stack:\n");
- show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
- 0, KERN_DEFAULT);
+ show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT);
printk(KERN_DEFAULT "Code: ");
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 621b501f8935..b85fe5f91c3f 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -40,8 +40,10 @@
* user can e.g. boot the original kernel with mem=1G while still booting the
* next kernel with full memory.
*/
-struct e820map e820;
-struct e820map e820_saved;
+static struct e820map initial_e820 __initdata;
+static struct e820map initial_e820_saved __initdata;
+struct e820map *e820 __refdata = &initial_e820;
+struct e820map *e820_saved __refdata = &initial_e820_saved;
/* For PCI or other memory-mapped resources */
unsigned long pci_mem_start = 0xaeedbabe;
@@ -58,8 +60,8 @@ e820_any_mapped(u64 start, u64 end, unsigned type)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
if (type && ei->type != type)
continue;
@@ -81,8 +83,8 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
if (type && ei->type != type)
continue;
@@ -128,7 +130,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
void __init e820_add_region(u64 start, u64 size, int type)
{
- __e820_add_region(&e820, start, size, type);
+ __e820_add_region(e820, start, size, type);
}
static void __init e820_print_type(u32 type)
@@ -164,12 +166,12 @@ void __init e820_print_map(char *who)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
+ for (i = 0; i < e820->nr_map; i++) {
printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who,
- (unsigned long long) e820.map[i].addr,
+ (unsigned long long) e820->map[i].addr,
(unsigned long long)
- (e820.map[i].addr + e820.map[i].size - 1));
- e820_print_type(e820.map[i].type);
+ (e820->map[i].addr + e820->map[i].size - 1));
+ e820_print_type(e820->map[i].type);
printk(KERN_CONT "\n");
}
}
@@ -348,7 +350,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
* continue building up new bios map based on this
* information
*/
- if (current_type != last_type || current_type == E820_PRAM) {
+ if (current_type != last_type) {
if (last_type != 0) {
new_bios[new_bios_entry].size =
change_point[chgidx]->addr - last_addr;
@@ -388,11 +390,11 @@ static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
while (nr_map) {
u64 start = biosmap->addr;
u64 size = biosmap->size;
- u64 end = start + size;
+ u64 end = start + size - 1;
u32 type = biosmap->type;
/* Overflow in 64 bits? Ignore the memory map. */
- if (start > end)
+ if (start > end && likely(size))
return -1;
e820_add_region(start, size, type);
@@ -493,13 +495,13 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
unsigned new_type)
{
- return __e820_update_range(&e820, start, size, old_type, new_type);
+ return __e820_update_range(e820, start, size, old_type, new_type);
}
static u64 __init e820_update_range_saved(u64 start, u64 size,
unsigned old_type, unsigned new_type)
{
- return __e820_update_range(&e820_saved, start, size, old_type,
+ return __e820_update_range(e820_saved, start, size, old_type,
new_type);
}
@@ -521,8 +523,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
e820_print_type(old_type);
printk(KERN_CONT "\n");
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
u64 final_start, final_end;
u64 ei_end;
@@ -566,15 +568,15 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
void __init update_e820(void)
{
- if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map))
+ if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map))
return;
printk(KERN_INFO "e820: modified physical RAM map:\n");
e820_print_map("modified");
}
static void __init update_e820_saved(void)
{
- sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map),
- &e820_saved.nr_map);
+ sanitize_e820_map(e820_saved->map, ARRAY_SIZE(e820_saved->map),
+ &e820_saved->nr_map);
}
#define MAX_GAP_END 0x100000000ull
/*
@@ -584,14 +586,14 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
unsigned long start_addr, unsigned long long end_addr)
{
unsigned long long last;
- int i = e820.nr_map;
+ int i = e820->nr_map;
int found = 0;
last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
while (--i >= 0) {
- unsigned long long start = e820.map[i].addr;
- unsigned long long end = start + e820.map[i].size;
+ unsigned long long start = e820->map[i].addr;
+ unsigned long long end = start + e820->map[i].size;
if (end < start_addr)
continue;
@@ -649,6 +651,33 @@ __init void e820_setup_gap(void)
gapstart, gapstart + gapsize - 1);
}
+/*
+ * Called late during init, in free_initmem().
+ *
+ * Initial e820 and e820_saved are largish __initdata arrays.
+ * Copy them to (usually much smaller) dynamically allocated area.
+ * This is done after all tweaks we ever do to them:
+ * all functions which modify them are __init functions,
+ * they won't exist after this point.
+ */
+__init void e820_reallocate_tables(void)
+{
+ struct e820map *n;
+ int size;
+
+ size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820->nr_map;
+ n = kmalloc(size, GFP_KERNEL);
+ BUG_ON(!n);
+ memcpy(n, e820, size);
+ e820 = n;
+
+ size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820_saved->nr_map;
+ n = kmalloc(size, GFP_KERNEL);
+ BUG_ON(!n);
+ memcpy(n, e820_saved, size);
+ e820_saved = n;
+}
+
/**
* Because of the size limitation of struct boot_params, only first
* 128 E820 memory entries are passed to kernel via
@@ -665,7 +694,7 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len)
entries = sdata->len / sizeof(struct e820entry);
extmap = (struct e820entry *)(sdata->data);
__append_e820_map(extmap, entries);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
early_memunmap(sdata, data_len);
printk(KERN_INFO "e820: extended physical RAM map:\n");
e820_print_map("extended");
@@ -686,8 +715,8 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
int i;
unsigned long pfn = 0;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
if (pfn < PFN_UP(ei->addr))
register_nosave_region(pfn, PFN_UP(ei->addr));
@@ -712,8 +741,8 @@ static int __init e820_mark_nvs_memory(void)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
if (ei->type == E820_NVS)
acpi_nvs_register(ei->addr, ei->size);
@@ -754,22 +783,18 @@ u64 __init early_reserve_e820(u64 size, u64 align)
/*
* Find the highest page frame number we have available
*/
-static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
{
int i;
unsigned long last_pfn = 0;
unsigned long max_arch_pfn = MAX_ARCH_PFN;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
unsigned long start_pfn;
unsigned long end_pfn;
- /*
- * Persistent memory is accounted as ram for purposes of
- * establishing max_pfn and mem_map.
- */
- if (ei->type != E820_RAM && ei->type != E820_PRAM)
+ if (ei->type != type)
continue;
start_pfn = ei->addr >> PAGE_SHIFT;
@@ -794,15 +819,15 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
}
unsigned long __init e820_end_of_ram_pfn(void)
{
- return e820_end_pfn(MAX_ARCH_PFN);
+ return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
}
unsigned long __init e820_end_of_low_ram_pfn(void)
{
- return e820_end_pfn(1UL << (32-PAGE_SHIFT));
+ return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_RAM);
}
-static void early_panic(char *msg)
+static void __init early_panic(char *msg)
{
early_printk(msg);
panic(msg);
@@ -856,7 +881,7 @@ static int __init parse_memmap_one(char *p)
*/
saved_max_pfn = e820_end_of_ram_pfn();
#endif
- e820.nr_map = 0;
+ e820->nr_map = 0;
userdef = 1;
return 0;
}
@@ -903,8 +928,8 @@ early_param("memmap", parse_memmap_opt);
void __init finish_e820_parsing(void)
{
if (userdef) {
- if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map),
- &e820.nr_map) < 0)
+ if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map),
+ &e820->nr_map) < 0)
early_panic("Invalid user supplied memory map");
printk(KERN_INFO "e820: user-defined physical RAM map:\n");
@@ -912,7 +937,7 @@ void __init finish_e820_parsing(void)
}
}
-static const char *e820_type_to_string(int e820_type)
+static const char *__init e820_type_to_string(int e820_type)
{
switch (e820_type) {
case E820_RESERVED_KERN:
@@ -926,7 +951,7 @@ static const char *e820_type_to_string(int e820_type)
}
}
-static unsigned long e820_type_to_iomem_type(int e820_type)
+static unsigned long __init e820_type_to_iomem_type(int e820_type)
{
switch (e820_type) {
case E820_RESERVED_KERN:
@@ -942,7 +967,7 @@ static unsigned long e820_type_to_iomem_type(int e820_type)
}
}
-static unsigned long e820_type_to_iores_desc(int e820_type)
+static unsigned long __init e820_type_to_iores_desc(int e820_type)
{
switch (e820_type) {
case E820_ACPI:
@@ -961,7 +986,7 @@ static unsigned long e820_type_to_iores_desc(int e820_type)
}
}
-static bool do_mark_busy(u32 type, struct resource *res)
+static bool __init do_mark_busy(u32 type, struct resource *res)
{
/* this is the legacy bios/dos rom-shadow + mmio region */
if (res->start < (1ULL<<20))
@@ -991,35 +1016,35 @@ void __init e820_reserve_resources(void)
struct resource *res;
u64 end;
- res = alloc_bootmem(sizeof(struct resource) * e820.nr_map);
+ res = alloc_bootmem(sizeof(struct resource) * e820->nr_map);
e820_res = res;
- for (i = 0; i < e820.nr_map; i++) {
- end = e820.map[i].addr + e820.map[i].size - 1;
+ for (i = 0; i < e820->nr_map; i++) {
+ end = e820->map[i].addr + e820->map[i].size - 1;
if (end != (resource_size_t)end) {
res++;
continue;
}
- res->name = e820_type_to_string(e820.map[i].type);
- res->start = e820.map[i].addr;
+ res->name = e820_type_to_string(e820->map[i].type);
+ res->start = e820->map[i].addr;
res->end = end;
- res->flags = e820_type_to_iomem_type(e820.map[i].type);
- res->desc = e820_type_to_iores_desc(e820.map[i].type);
+ res->flags = e820_type_to_iomem_type(e820->map[i].type);
+ res->desc = e820_type_to_iores_desc(e820->map[i].type);
/*
* don't register the region that could be conflicted with
* pci device BAR resource and insert them later in
* pcibios_resource_survey()
*/
- if (do_mark_busy(e820.map[i].type, res)) {
+ if (do_mark_busy(e820->map[i].type, res)) {
res->flags |= IORESOURCE_BUSY;
insert_resource(&iomem_resource, res);
}
res++;
}
- for (i = 0; i < e820_saved.nr_map; i++) {
- struct e820entry *entry = &e820_saved.map[i];
+ for (i = 0; i < e820_saved->nr_map; i++) {
+ struct e820entry *entry = &e820_saved->map[i];
firmware_map_add_early(entry->addr,
entry->addr + entry->size,
e820_type_to_string(entry->type));
@@ -1027,7 +1052,7 @@ void __init e820_reserve_resources(void)
}
/* How much should we pad RAM ending depending on where it is? */
-static unsigned long ram_alignment(resource_size_t pos)
+static unsigned long __init ram_alignment(resource_size_t pos)
{
unsigned long mb = pos >> 20;
@@ -1051,7 +1076,7 @@ void __init e820_reserve_resources_late(void)
struct resource *res;
res = e820_res;
- for (i = 0; i < e820.nr_map; i++) {
+ for (i = 0; i < e820->nr_map; i++) {
if (!res->parent && res->end)
insert_resource_expand_to_fit(&iomem_resource, res);
res++;
@@ -1061,8 +1086,8 @@ void __init e820_reserve_resources_late(void)
* Try to bump up RAM regions to reasonable boundaries to
* avoid stolen RAM:
*/
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *entry = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *entry = &e820->map[i];
u64 start, end;
if (entry->type != E820_RAM)
@@ -1110,7 +1135,7 @@ char *__init default_machine_specific_memory_setup(void)
who = "BIOS-e801";
}
- e820.nr_map = 0;
+ e820->nr_map = 0;
e820_add_region(0, LOWMEMSIZE(), E820_RAM);
e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
}
@@ -1124,7 +1149,7 @@ void __init setup_memory_map(void)
char *who;
who = x86_init.resources.memory_setup();
- memcpy(&e820_saved, &e820, sizeof(struct e820map));
+ memcpy(e820_saved, e820, sizeof(struct e820map));
printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
e820_print_map(who);
}
@@ -1141,8 +1166,8 @@ void __init memblock_x86_fill(void)
*/
memblock_allow_resize();
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
end = ei->addr + ei->size;
if (end != (resource_size_t)end)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index de7501edb21c..18bb3a639197 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -555,7 +555,7 @@ intel_graphics_stolen(int num, int slot, int func,
/* Mark this space as reserved */
e820_add_region(base, size, E820_RESERVED);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
}
static void __init intel_graphics_quirks(int num, int slot, int func)
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 93982aebb398..2f2b8c7ccb85 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -317,7 +317,6 @@ static void __init fpu__init_system_ctx_switch(void)
on_boot_cpu = 0;
WARN_ON_FPU(current->thread.fpu.fpstate_active);
- current_thread_info()->status = 0;
if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE)
eagerfpu = ENABLE;
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 680049aa4593..01567aa87503 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -866,105 +866,17 @@ const void *get_xsave_field_ptr(int xsave_state)
return get_xsave_addr(&fpu->state.xsave, xsave_state);
}
-
-/*
- * Set xfeatures (aka XSTATE_BV) bit for a feature that we want
- * to take out of its "init state". This will ensure that an
- * XRSTOR actually restores the state.
- */
-static void fpu__xfeature_set_non_init(struct xregs_state *xsave,
- int xstate_feature_mask)
-{
- xsave->header.xfeatures |= xstate_feature_mask;
-}
-
-/*
- * This function is safe to call whether the FPU is in use or not.
- *
- * Note that this only works on the current task.
- *
- * Inputs:
- * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP,
- * XFEATURE_MASK_SSE, etc...)
- * @xsave_state_ptr: a pointer to a copy of the state that you would
- * like written in to the current task's FPU xsave state. This pointer
- * must not be located in the current tasks's xsave area.
- * Output:
- * address of the state in the xsave area or NULL if the state
- * is not present or is in its 'init state'.
- */
-static void fpu__xfeature_set_state(int xstate_feature_mask,
- void *xstate_feature_src, size_t len)
-{
- struct xregs_state *xsave = &current->thread.fpu.state.xsave;
- struct fpu *fpu = &current->thread.fpu;
- void *dst;
-
- if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
- WARN_ONCE(1, "%s() attempted with no xsave support", __func__);
- return;
- }
-
- /*
- * Tell the FPU code that we need the FPU state to be in
- * 'fpu' (not in the registers), and that we need it to
- * be stable while we write to it.
- */
- fpu__current_fpstate_write_begin();
-
- /*
- * This method *WILL* *NOT* work for compact-format
- * buffers. If the 'xstate_feature_mask' is unset in
- * xcomp_bv then we may need to move other feature state
- * "up" in the buffer.
- */
- if (xsave->header.xcomp_bv & xstate_feature_mask) {
- WARN_ON_ONCE(1);
- goto out;
- }
-
- /* find the location in the xsave buffer of the desired state */
- dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask);
-
- /*
- * Make sure that the pointer being passed in did not
- * come from the xsave buffer itself.
- */
- WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself");
-
- /* put the caller-provided data in the location */
- memcpy(dst, xstate_feature_src, len);
-
- /*
- * Mark the xfeature so that the CPU knows there is state
- * in the buffer now.
- */
- fpu__xfeature_set_non_init(xsave, xstate_feature_mask);
-out:
- /*
- * We are done writing to the 'fpu'. Reenable preeption
- * and (possibly) move the fpstate back in to the fpregs.
- */
- fpu__current_fpstate_write_end();
-}
-
#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
/*
- * This will go out and modify the XSAVE buffer so that PKRU is
- * set to a particular state for access to 'pkey'.
- *
- * PKRU state does affect kernel access to user memory. We do
- * not modfiy PKRU *itself* here, only the XSAVE state that will
- * be restored in to PKRU when we return back to userspace.
+ * This will go out and modify PKRU register to set the access
+ * rights for @pkey to @init_val.
*/
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val)
{
- struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
- struct pkru_state *old_pkru_state;
- struct pkru_state new_pkru_state;
+ u32 old_pkru;
int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
u32 new_pkru_bits = 0;
@@ -974,6 +886,15 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
*/
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return -EINVAL;
+ /*
+ * For most XSAVE components, this would be an arduous task:
+ * brining fpstate up to date with fpregs, updating fpstate,
+ * then re-populating fpregs. But, for components that are
+ * never lazily managed, we can just access the fpregs
+ * directly. PKRU is never managed lazily, so we can just
+ * manipulate it directly. Make sure it stays that way.
+ */
+ WARN_ON_ONCE(!use_eager_fpu());
/* Set the bits we need in PKRU: */
if (init_val & PKEY_DISABLE_ACCESS)
@@ -984,37 +905,12 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
/* Shift the bits in to the correct place in PKRU for pkey: */
new_pkru_bits <<= pkey_shift;
- /* Locate old copy of the state in the xsave buffer: */
- old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU);
-
- /*
- * When state is not in the buffer, it is in the init
- * state, set it manually. Otherwise, copy out the old
- * state.
- */
- if (!old_pkru_state)
- new_pkru_state.pkru = 0;
- else
- new_pkru_state.pkru = old_pkru_state->pkru;
-
- /* Mask off any old bits in place: */
- new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
-
- /* Set the newly-requested bits: */
- new_pkru_state.pkru |= new_pkru_bits;
-
- /*
- * We could theoretically live without zeroing pkru.pad.
- * The current XSAVE feature state definition says that
- * only bytes 0->3 are used. But we do not want to
- * chance leaking kernel stack out to userspace in case a
- * memcpy() of the whole xsave buffer was done.
- *
- * They're in the same cacheline anyway.
- */
- new_pkru_state.pad = 0;
+ /* Get old PKRU and mask off any old bits in place: */
+ old_pkru = read_pkru();
+ old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
- fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state, sizeof(new_pkru_state));
+ /* Write old part along with new part: */
+ write_pkru(old_pkru | new_pkru_bits);
return 0;
}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index d036cfb4495d..8639bb2ae058 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -1029,7 +1029,7 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
}
if (ftrace_push_return_trace(old, self_addr, &trace.depth,
- frame_pointer) == -EBUSY) {
+ frame_pointer, parent) == -EBUSY) {
*parent = old;
return;
}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 2dda0bc4576e..f16c55bfc090 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -25,8 +25,6 @@ static void __init i386_default_early_setup(void)
/* Initialize 32bit specific setup functions */
x86_init.resources.reserve_resources = i386_reserve_resources;
x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
-
- reserve_bios_regions();
}
asmlinkage __visible void __init i386_start_kernel(void)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 99d48e7d2974..54a2372f5dbb 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -183,7 +183,6 @@ void __init x86_64_start_reservations(char *real_mode_data)
copy_bootdata(__va(real_mode_data));
x86_early_init_platform_quirks();
- reserve_bios_regions();
switch (boot_params.hdr.hardware_subarch) {
case X86_SUBARCH_INTEL_MID:
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 6f8902b0d151..5f401262f12d 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -94,7 +94,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
*/
__HEAD
ENTRY(startup_32)
- movl pa(stack_start),%ecx
+ movl pa(initial_stack),%ecx
/* test KEEP_SEGMENTS flag to see if the bootloader is asking
us to not reload segments */
@@ -286,7 +286,7 @@ num_subarch_entries = (. - subarch_entries) / 4
* start_secondary().
*/
ENTRY(start_cpu0)
- movl stack_start, %ecx
+ movl initial_stack, %ecx
movl %ecx, %esp
jmp *(initial_code)
ENDPROC(start_cpu0)
@@ -307,7 +307,7 @@ ENTRY(startup_32_smp)
movl %eax,%es
movl %eax,%fs
movl %eax,%gs
- movl pa(stack_start),%ecx
+ movl pa(initial_stack),%ecx
movl %eax,%ss
leal -__PAGE_OFFSET(%ecx),%esp
@@ -703,7 +703,7 @@ ENTRY(initial_page_table)
.data
.balign 4
-ENTRY(stack_start)
+ENTRY(initial_stack)
.long init_thread_union+THREAD_SIZE
__INITRODATA
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 9f8efc9f0075..c98a559c346e 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -66,7 +66,7 @@ startup_64:
*/
/*
- * Setup stack for verify_cpu(). "-8" because stack_start is defined
+ * Setup stack for verify_cpu(). "-8" because initial_stack is defined
* this way, see below. Our best guess is a NULL ptr for stack
* termination heuristics and we don't want to break anything which
* might depend on it (kgdb, ...).
@@ -226,7 +226,7 @@ ENTRY(secondary_startup_64)
movq %rax, %cr0
/* Setup a boot time stack */
- movq stack_start(%rip), %rsp
+ movq initial_stack(%rip), %rsp
/* zero EFLAGS after setting rsp */
pushq $0
@@ -310,7 +310,7 @@ ENDPROC(secondary_startup_64)
* start_secondary().
*/
ENTRY(start_cpu0)
- movq stack_start(%rip),%rsp
+ movq initial_stack(%rip),%rsp
movq initial_code(%rip),%rax
pushq $0 # fake return address to stop unwinder
pushq $__KERNEL_CS # set correct cs
@@ -319,17 +319,15 @@ ENTRY(start_cpu0)
ENDPROC(start_cpu0)
#endif
- /* SMP bootup changes these two */
+ /* Both SMP bootup and ACPI suspend change these variables */
__REFDATA
.balign 8
GLOBAL(initial_code)
.quad x86_64_start_kernel
GLOBAL(initial_gs)
.quad INIT_PER_CPU_VAR(irq_stack_union)
-
- GLOBAL(stack_start)
+ GLOBAL(initial_stack)
.quad init_thread_union+THREAD_SIZE-8
- .word 0
__FINITDATA
bad_address:
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ed16e58658a4..274fab99169d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -756,10 +756,104 @@ static void hpet_reserve_msi_timers(struct hpet_data *hd)
/*
* Clock source related code
*/
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
+/*
+ * Reading the HPET counter is a very slow operation. If a large number of
+ * CPUs are trying to access the HPET counter simultaneously, it can cause
+ * massive delay and slow down system performance dramatically. This may
+ * happen when HPET is the default clock source instead of TSC. For a
+ * really large system with hundreds of CPUs, the slowdown may be so
+ * severe that it may actually crash the system because of a NMI watchdog
+ * soft lockup, for example.
+ *
+ * If multiple CPUs are trying to access the HPET counter at the same time,
+ * we don't actually need to read the counter multiple times. Instead, the
+ * other CPUs can use the counter value read by the first CPU in the group.
+ *
+ * This special feature is only enabled on x86-64 systems. It is unlikely
+ * that 32-bit x86 systems will have enough CPUs to require this feature
+ * with its associated locking overhead. And we also need 64-bit atomic
+ * read.
+ *
+ * The lock and the hpet value are stored together and can be read in a
+ * single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t
+ * is 32 bits in size.
+ */
+union hpet_lock {
+ struct {
+ arch_spinlock_t lock;
+ u32 value;
+ };
+ u64 lockval;
+};
+
+static union hpet_lock hpet __cacheline_aligned = {
+ { .lock = __ARCH_SPIN_LOCK_UNLOCKED, },
+};
+
+static cycle_t read_hpet(struct clocksource *cs)
+{
+ unsigned long flags;
+ union hpet_lock old, new;
+
+ BUILD_BUG_ON(sizeof(union hpet_lock) != 8);
+
+ /*
+ * Read HPET directly if in NMI.
+ */
+ if (in_nmi())
+ return (cycle_t)hpet_readl(HPET_COUNTER);
+
+ /*
+ * Read the current state of the lock and HPET value atomically.
+ */
+ old.lockval = READ_ONCE(hpet.lockval);
+
+ if (arch_spin_is_locked(&old.lock))
+ goto contended;
+
+ local_irq_save(flags);
+ if (arch_spin_trylock(&hpet.lock)) {
+ new.value = hpet_readl(HPET_COUNTER);
+ /*
+ * Use WRITE_ONCE() to prevent store tearing.
+ */
+ WRITE_ONCE(hpet.value, new.value);
+ arch_spin_unlock(&hpet.lock);
+ local_irq_restore(flags);
+ return (cycle_t)new.value;
+ }
+ local_irq_restore(flags);
+
+contended:
+ /*
+ * Contended case
+ * --------------
+ * Wait until the HPET value change or the lock is free to indicate
+ * its value is up-to-date.
+ *
+ * It is possible that old.value has already contained the latest
+ * HPET value while the lock holder was in the process of releasing
+ * the lock. Checking for lock state change will enable us to return
+ * the value immediately instead of waiting for the next HPET reader
+ * to come along.
+ */
+ do {
+ cpu_relax();
+ new.lockval = READ_ONCE(hpet.lockval);
+ } while ((new.value == old.value) && arch_spin_is_locked(&new.lock));
+
+ return (cycle_t)new.value;
+}
+#else
+/*
+ * For UP or 32-bit.
+ */
static cycle_t read_hpet(struct clocksource *cs)
{
return (cycle_t)hpet_readl(HPET_COUNTER);
}
+#endif
static struct clocksource clocksource_hpet = {
.name = "hpet",
@@ -1242,7 +1336,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
memset(&curr_time, 0, sizeof(struct rtc_time));
if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
- mc146818_set_time(&curr_time);
+ mc146818_get_time(&curr_time);
if (hpet_rtc_flags & RTC_UIE &&
curr_time.tm_sec != hpet_prev_update_sec) {
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 61521dc19c10..9f669fdd2010 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -102,8 +102,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_puts(p, " Rescheduling interrupts\n");
seq_printf(p, "%*s: ", prec, "CAL");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->irq_call_count -
- irq_stats(j)->irq_tlb_count);
+ seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
seq_puts(p, " Function call interrupts\n");
seq_printf(p, "%*s: ", prec, "TLB");
for_each_online_cpu(j)
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 4a7903714065..9ebd0b0e73d9 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -40,8 +40,7 @@ static inline void stack_overflow_check(struct pt_regs *regs)
if (user_mode(regs))
return;
- if (regs->sp >= curbase + sizeof(struct thread_info) +
- sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
+ if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
regs->sp <= curbase + THREAD_SIZE)
return;
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index f2356bda2b05..3407b148c240 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -99,14 +99,14 @@ static int setup_e820_entries(struct boot_params *params)
{
unsigned int nr_e820_entries;
- nr_e820_entries = e820_saved.nr_map;
+ nr_e820_entries = e820_saved->nr_map;
/* TODO: Pass entries more than E820MAX in bootparams setup data */
if (nr_e820_entries > E820MAX)
nr_e820_entries = E820MAX;
params->e820_entries = nr_e820_entries;
- memcpy(&params->e820_map, &e820_saved.map,
+ memcpy(&params->e820_map, &e820_saved->map,
nr_e820_entries * sizeof(struct e820entry));
return 0;
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 04cde527d728..8e36f249646e 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -50,6 +50,7 @@
#include <asm/apicdef.h>
#include <asm/apic.h>
#include <asm/nmi.h>
+#include <asm/switch_to.h>
struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
{
@@ -166,21 +167,19 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
gdb_regs[GDB_DX] = 0;
gdb_regs[GDB_SI] = 0;
gdb_regs[GDB_DI] = 0;
- gdb_regs[GDB_BP] = *(unsigned long *)p->thread.sp;
+ gdb_regs[GDB_BP] = ((struct inactive_task_frame *)p->thread.sp)->bp;
#ifdef CONFIG_X86_32
gdb_regs[GDB_DS] = __KERNEL_DS;
gdb_regs[GDB_ES] = __KERNEL_DS;
gdb_regs[GDB_PS] = 0;
gdb_regs[GDB_CS] = __KERNEL_CS;
- gdb_regs[GDB_PC] = p->thread.ip;
gdb_regs[GDB_SS] = __KERNEL_DS;
gdb_regs[GDB_FS] = 0xFFFF;
gdb_regs[GDB_GS] = 0xFFFF;
#else
- gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
+ gdb_regs32[GDB_PS] = 0;
gdb_regs32[GDB_CS] = __KERNEL_CS;
gdb_regs32[GDB_SS] = __KERNEL_DS;
- gdb_regs[GDB_PC] = 0;
gdb_regs[GDB_R8] = 0;
gdb_regs[GDB_R9] = 0;
gdb_regs[GDB_R10] = 0;
@@ -190,6 +189,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
gdb_regs[GDB_R14] = 0;
gdb_regs[GDB_R15] = 0;
#endif
+ gdb_regs[GDB_PC] = 0;
gdb_regs[GDB_SP] = p->thread.sp;
}
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 7847e5c0e0b5..28cee019209c 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -45,7 +45,7 @@
#include <linux/slab.h>
#include <linux/hardirq.h>
#include <linux/preempt.h>
-#include <linux/module.h>
+#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 4425f593f0ec..3bb4c5f021f6 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -24,7 +24,7 @@
#include <linux/slab.h>
#include <linux/hardirq.h>
#include <linux/preempt.h>
-#include <linux/module.h>
+#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index c2bedaea11f7..4afc67f5facc 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -184,7 +184,7 @@ out:
static struct kobj_attribute type_attr = __ATTR_RO(type);
-static struct bin_attribute data_attr = {
+static struct bin_attribute data_attr __ro_after_init = {
.attr = {
.name = "data",
.mode = S_IRUGO,
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 1726c4c12336..edbbfc854e39 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -423,12 +423,7 @@ static void __init kvm_smp_prepare_boot_cpu(void)
kvm_spinlock_init();
}
-static void kvm_guest_cpu_online(void *dummy)
-{
- kvm_guest_cpu_init();
-}
-
-static void kvm_guest_cpu_offline(void *dummy)
+static void kvm_guest_cpu_offline(void)
{
kvm_disable_steal_time();
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@@ -437,29 +432,21 @@ static void kvm_guest_cpu_offline(void *dummy)
apf_task_wake_all();
}
-static int kvm_cpu_notify(struct notifier_block *self, unsigned long action,
- void *hcpu)
+static int kvm_cpu_online(unsigned int cpu)
{
- int cpu = (unsigned long)hcpu;
- switch (action) {
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- case CPU_ONLINE_FROZEN:
- smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
- break;
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
- break;
- default:
- break;
- }
- return NOTIFY_OK;
+ local_irq_disable();
+ kvm_guest_cpu_init();
+ local_irq_enable();
+ return 0;
}
-static struct notifier_block kvm_cpu_notifier = {
- .notifier_call = kvm_cpu_notify,
-};
+static int kvm_cpu_down_prepare(unsigned int cpu)
+{
+ local_irq_disable();
+ kvm_guest_cpu_offline();
+ local_irq_enable();
+ return 0;
+}
#endif
static void __init kvm_apf_trap_init(void)
@@ -494,7 +481,9 @@ void __init kvm_guest_init(void)
#ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
- register_cpu_notifier(&kvm_cpu_notifier);
+ if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
+ kvm_cpu_online, kvm_cpu_down_prepare) < 0)
+ pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n");
#else
kvm_guest_cpu_init();
#endif
@@ -575,9 +564,6 @@ static void kvm_kick_cpu(int cpu)
kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
}
-
-#ifdef CONFIG_QUEUED_SPINLOCKS
-
#include <asm/qspinlock.h>
static void kvm_wait(u8 *ptr, u8 val)
@@ -606,243 +592,6 @@ out:
local_irq_restore(flags);
}
-#else /* !CONFIG_QUEUED_SPINLOCKS */
-
-enum kvm_contention_stat {
- TAKEN_SLOW,
- TAKEN_SLOW_PICKUP,
- RELEASED_SLOW,
- RELEASED_SLOW_KICKED,
- NR_CONTENTION_STATS
-};
-
-#ifdef CONFIG_KVM_DEBUG_FS
-#define HISTO_BUCKETS 30
-
-static struct kvm_spinlock_stats
-{
- u32 contention_stats[NR_CONTENTION_STATS];
- u32 histo_spin_blocked[HISTO_BUCKETS+1];
- u64 time_blocked;
-} spinlock_stats;
-
-static u8 zero_stats;
-
-static inline void check_zero(void)
-{
- u8 ret;
- u8 old;
-
- old = READ_ONCE(zero_stats);
- if (unlikely(old)) {
- ret = cmpxchg(&zero_stats, old, 0);
- /* This ensures only one fellow resets the stat */
- if (ret == old)
- memset(&spinlock_stats, 0, sizeof(spinlock_stats));
- }
-}
-
-static inline void add_stats(enum kvm_contention_stat var, u32 val)
-{
- check_zero();
- spinlock_stats.contention_stats[var] += val;
-}
-
-
-static inline u64 spin_time_start(void)
-{
- return sched_clock();
-}
-
-static void __spin_time_accum(u64 delta, u32 *array)
-{
- unsigned index;
-
- index = ilog2(delta);
- check_zero();
-
- if (index < HISTO_BUCKETS)
- array[index]++;
- else
- array[HISTO_BUCKETS]++;
-}
-
-static inline void spin_time_accum_blocked(u64 start)
-{
- u32 delta;
-
- delta = sched_clock() - start;
- __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
- spinlock_stats.time_blocked += delta;
-}
-
-static struct dentry *d_spin_debug;
-static struct dentry *d_kvm_debug;
-
-static struct dentry *kvm_init_debugfs(void)
-{
- d_kvm_debug = debugfs_create_dir("kvm-guest", NULL);
- if (!d_kvm_debug)
- printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n");
-
- return d_kvm_debug;
-}
-
-static int __init kvm_spinlock_debugfs(void)
-{
- struct dentry *d_kvm;
-
- d_kvm = kvm_init_debugfs();
- if (d_kvm == NULL)
- return -ENOMEM;
-
- d_spin_debug = debugfs_create_dir("spinlocks", d_kvm);
-
- debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
-
- debugfs_create_u32("taken_slow", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[TAKEN_SLOW]);
- debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
-
- debugfs_create_u32("released_slow", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[RELEASED_SLOW]);
- debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
-
- debugfs_create_u64("time_blocked", 0444, d_spin_debug,
- &spinlock_stats.time_blocked);
-
- debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
- spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
-
- return 0;
-}
-fs_initcall(kvm_spinlock_debugfs);
-#else /* !CONFIG_KVM_DEBUG_FS */
-static inline void add_stats(enum kvm_contention_stat var, u32 val)
-{
-}
-
-static inline u64 spin_time_start(void)
-{
- return 0;
-}
-
-static inline void spin_time_accum_blocked(u64 start)
-{
-}
-#endif /* CONFIG_KVM_DEBUG_FS */
-
-struct kvm_lock_waiting {
- struct arch_spinlock *lock;
- __ticket_t want;
-};
-
-/* cpus 'waiting' on a spinlock to become available */
-static cpumask_t waiting_cpus;
-
-/* Track spinlock on which a cpu is waiting */
-static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting);
-
-__visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
-{
- struct kvm_lock_waiting *w;
- int cpu;
- u64 start;
- unsigned long flags;
- __ticket_t head;
-
- if (in_nmi())
- return;
-
- w = this_cpu_ptr(&klock_waiting);
- cpu = smp_processor_id();
- start = spin_time_start();
-
- /*
- * Make sure an interrupt handler can't upset things in a
- * partially setup state.
- */
- local_irq_save(flags);
-
- /*
- * The ordering protocol on this is that the "lock" pointer
- * may only be set non-NULL if the "want" ticket is correct.
- * If we're updating "want", we must first clear "lock".
- */
- w->lock = NULL;
- smp_wmb();
- w->want = want;
- smp_wmb();
- w->lock = lock;
-
- add_stats(TAKEN_SLOW, 1);
-
- /*
- * This uses set_bit, which is atomic but we should not rely on its
- * reordering gurantees. So barrier is needed after this call.
- */
- cpumask_set_cpu(cpu, &waiting_cpus);
-
- barrier();
-
- /*
- * Mark entry to slowpath before doing the pickup test to make
- * sure we don't deadlock with an unlocker.
- */
- __ticket_enter_slowpath(lock);
-
- /* make sure enter_slowpath, which is atomic does not cross the read */
- smp_mb__after_atomic();
-
- /*
- * check again make sure it didn't become free while
- * we weren't looking.
- */
- head = READ_ONCE(lock->tickets.head);
- if (__tickets_equal(head, want)) {
- add_stats(TAKEN_SLOW_PICKUP, 1);
- goto out;
- }
-
- /*
- * halt until it's our turn and kicked. Note that we do safe halt
- * for irq enabled case to avoid hang when lock info is overwritten
- * in irq spinlock slowpath and no spurious interrupt occur to save us.
- */
- if (arch_irqs_disabled_flags(flags))
- halt();
- else
- safe_halt();
-
-out:
- cpumask_clear_cpu(cpu, &waiting_cpus);
- w->lock = NULL;
- local_irq_restore(flags);
- spin_time_accum_blocked(start);
-}
-PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
-
-/* Kick vcpu waiting on @lock->head to reach value @ticket */
-static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
-{
- int cpu;
-
- add_stats(RELEASED_SLOW, 1);
- for_each_cpu(cpu, &waiting_cpus) {
- const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu);
- if (READ_ONCE(w->lock) == lock &&
- READ_ONCE(w->want) == ticket) {
- add_stats(RELEASED_SLOW_KICKED, 1);
- kvm_kick_cpu(cpu);
- break;
- }
- }
-}
-
-#endif /* !CONFIG_QUEUED_SPINLOCKS */
-
/*
* Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
*/
@@ -854,16 +603,11 @@ void __init kvm_spinlock_init(void)
if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
return;
-#ifdef CONFIG_QUEUED_SPINLOCKS
__pv_init_lock_hash();
pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
pv_lock_ops.wait = kvm_wait;
pv_lock_ops.kick = kvm_kick_cpu;
-#else /* !CONFIG_QUEUED_SPINLOCKS */
- pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
- pv_lock_ops.unlock_kick = kvm_unlock_kick;
-#endif
}
static __init int kvm_spinlock_init_jump(void)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1d39bfbd26bb..60b9949f1e65 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -29,7 +29,7 @@
#include <asm/x86_init.h>
#include <asm/reboot.h>
-static int kvmclock = 1;
+static int kvmclock __ro_after_init = 1;
static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
static cycle_t kvm_sched_clock_offset;
@@ -289,6 +289,7 @@ void __init kvmclock_init(void)
put_cpu();
x86_platform.calibrate_tsc = kvm_get_tsc_khz;
+ x86_platform.calibrate_cpu = kvm_get_tsc_khz;
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 068c4a929de6..0f8d20497383 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -499,6 +499,9 @@ void __init default_get_smp_config(unsigned int early)
{
struct mpf_intel *mpf = mpf_found;
+ if (!smp_found_config)
+ return;
+
if (!mpf)
return;
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 1939a0269377..2c55a003b793 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -8,7 +8,6 @@
#include <asm/paravirt.h>
-#ifdef CONFIG_QUEUED_SPINLOCKS
__visible void __native_queued_spin_unlock(struct qspinlock *lock)
{
native_queued_spin_unlock(lock);
@@ -21,19 +20,13 @@ bool pv_is_native_spin_unlock(void)
return pv_lock_ops.queued_spin_unlock.func ==
__raw_callee_save___native_queued_spin_unlock;
}
-#endif
struct pv_lock_ops pv_lock_ops = {
#ifdef CONFIG_SMP
-#ifdef CONFIG_QUEUED_SPINLOCKS
.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
.wait = paravirt_nop,
.kick = paravirt_nop,
-#else /* !CONFIG_QUEUED_SPINLOCKS */
- .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop),
- .unlock_kick = paravirt_nop,
-#endif /* !CONFIG_QUEUED_SPINLOCKS */
#endif /* SMP */
};
EXPORT_SYMBOL(pv_lock_ops);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index ad5bc9578a73..bbf3d5933eaa 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -56,12 +56,12 @@ asm (".pushsection .entry.text, \"ax\"\n"
".popsection");
/* identity function, which can be inlined */
-u32 _paravirt_ident_32(u32 x)
+u32 notrace _paravirt_ident_32(u32 x)
{
return x;
}
-u64 _paravirt_ident_64(u64 x)
+u64 notrace _paravirt_ident_64(u64 x)
{
return x;
}
@@ -332,7 +332,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.read_cr0 = native_read_cr0,
.write_cr0 = native_write_cr0,
.read_cr4 = native_read_cr4,
- .read_cr4_safe = native_read_cr4_safe,
.write_cr4 = native_write_cr4,
#ifdef CONFIG_X86_64
.read_cr8 = native_read_cr8,
@@ -389,7 +388,7 @@ NOKPROBE_SYMBOL(native_load_idt);
#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
#endif
-struct pv_mmu_ops pv_mmu_ops = {
+struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.read_cr2 = native_read_cr2,
.write_cr2 = native_write_cr2,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 158dc0650d5d..920c6ae08592 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -10,7 +10,7 @@ DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
DEF_NATIVE(pv_cpu_ops, clts, "clts");
-#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
#endif
@@ -49,7 +49,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
PATCH_SITE(pv_cpu_ops, clts);
-#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
if (pv_is_native_spin_unlock()) {
start = start_pv_lock_ops_queued_spin_unlock;
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index e70087a04cc8..bb3840cedb4f 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -19,7 +19,7 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
DEF_NATIVE(, mov32, "mov %edi, %eax");
DEF_NATIVE(, mov64, "mov %rdi, %rax");
-#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
#endif
@@ -61,7 +61,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_cpu_ops, clts);
PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd);
-#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
if (pv_is_native_spin_unlock()) {
start = start_pv_lock_ops_queued_spin_unlock;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 62c0b0ea2ce4..4002b475171c 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -32,6 +32,7 @@
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/vm86.h>
+#include <asm/switch_to.h>
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -513,6 +514,17 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
}
/*
+ * Return saved PC of a blocked thread.
+ * What is this good for? it will be always the scheduler or ret_from_fork.
+ */
+unsigned long thread_saved_pc(struct task_struct *tsk)
+{
+ struct inactive_task_frame *frame =
+ (struct inactive_task_frame *) READ_ONCE(tsk->thread.sp);
+ return READ_ONCE_NOCHECK(frame->ret_addr);
+}
+
+/*
* Called from fs/proc with a reference on @p to find the function
* which called into schedule(). This needs to be done carefully
* because the task might wake up and we might look at a stack
@@ -520,15 +532,18 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
*/
unsigned long get_wchan(struct task_struct *p)
{
- unsigned long start, bottom, top, sp, fp, ip;
+ unsigned long start, bottom, top, sp, fp, ip, ret = 0;
int count = 0;
if (!p || p == current || p->state == TASK_RUNNING)
return 0;
+ if (!try_get_task_stack(p))
+ return 0;
+
start = (unsigned long)task_stack_page(p);
if (!start)
- return 0;
+ goto out;
/*
* Layout of the stack page:
@@ -537,9 +552,7 @@ unsigned long get_wchan(struct task_struct *p)
* PADDING
* ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
* stack
- * ----------- bottom = start + sizeof(thread_info)
- * thread_info
- * ----------- start
+ * ----------- bottom = start
*
* The tasks stack pointer points at the location where the
* framepointer is stored. The data on the stack is:
@@ -550,20 +563,25 @@ unsigned long get_wchan(struct task_struct *p)
*/
top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
top -= 2 * sizeof(unsigned long);
- bottom = start + sizeof(struct thread_info);
+ bottom = start;
sp = READ_ONCE(p->thread.sp);
if (sp < bottom || sp > top)
- return 0;
+ goto out;
- fp = READ_ONCE_NOCHECK(*(unsigned long *)sp);
+ fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp);
do {
if (fp < bottom || fp > top)
- return 0;
+ goto out;
ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
- if (!in_sched_functions(ip))
- return ip;
+ if (!in_sched_functions(ip)) {
+ ret = ip;
+ goto out;
+ }
fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
} while (count++ < 16 && p->state != TASK_RUNNING);
- return 0;
+
+out:
+ put_task_stack(p);
+ return ret;
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index d86be29c38c7..bd7be8efdc4c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -55,17 +55,6 @@
#include <asm/switch_to.h>
#include <asm/vm86.h>
-asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
-asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
-
-/*
- * Return saved PC of a blocked thread.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- return ((unsigned long *)tsk->thread.sp)[3];
-}
-
void __show_regs(struct pt_regs *regs, int all)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
@@ -101,7 +90,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
- cr4 = __read_cr4_safe();
+ cr4 = __read_cr4();
printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
cr0, cr2, cr3, cr4);
@@ -133,35 +122,31 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p, unsigned long tls)
{
struct pt_regs *childregs = task_pt_regs(p);
+ struct fork_frame *fork_frame = container_of(childregs, struct fork_frame, regs);
+ struct inactive_task_frame *frame = &fork_frame->frame;
struct task_struct *tsk;
int err;
- p->thread.sp = (unsigned long) childregs;
+ frame->bp = 0;
+ frame->ret_addr = (unsigned long) ret_from_fork;
+ p->thread.sp = (unsigned long) fork_frame;
p->thread.sp0 = (unsigned long) (childregs+1);
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
if (unlikely(p->flags & PF_KTHREAD)) {
/* kernel thread */
memset(childregs, 0, sizeof(struct pt_regs));
- p->thread.ip = (unsigned long) ret_from_kernel_thread;
- task_user_gs(p) = __KERNEL_STACK_CANARY;
- childregs->ds = __USER_DS;
- childregs->es = __USER_DS;
- childregs->fs = __KERNEL_PERCPU;
- childregs->bx = sp; /* function */
- childregs->bp = arg;
- childregs->orig_ax = -1;
- childregs->cs = __KERNEL_CS | get_kernel_rpl();
- childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
+ frame->bx = sp; /* function */
+ frame->di = arg;
p->thread.io_bitmap_ptr = NULL;
return 0;
}
+ frame->bx = 0;
*childregs = *current_pt_regs();
childregs->ax = 0;
if (sp)
childregs->sp = sp;
- p->thread.ip = (unsigned long) ret_from_fork;
task_user_gs(p) = get_user_gs(current_pt_regs());
p->thread.io_bitmap_ptr = NULL;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 63236d8f84bf..ee944bd2310d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -49,8 +49,7 @@
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
-
-asmlinkage extern void ret_from_fork(void);
+#include <asm/vdso.h>
__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
@@ -141,12 +140,17 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
{
int err;
struct pt_regs *childregs;
+ struct fork_frame *fork_frame;
+ struct inactive_task_frame *frame;
struct task_struct *me = current;
p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
childregs = task_pt_regs(p);
- p->thread.sp = (unsigned long) childregs;
- set_tsk_thread_flag(p, TIF_FORK);
+ fork_frame = container_of(childregs, struct fork_frame, regs);
+ frame = &fork_frame->frame;
+ frame->bp = 0;
+ frame->ret_addr = (unsigned long) ret_from_fork;
+ p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap_ptr = NULL;
savesegment(gs, p->thread.gsindex);
@@ -160,15 +164,11 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
if (unlikely(p->flags & PF_KTHREAD)) {
/* kernel thread */
memset(childregs, 0, sizeof(struct pt_regs));
- childregs->sp = (unsigned long)childregs;
- childregs->ss = __KERNEL_DS;
- childregs->bx = sp; /* function */
- childregs->bp = arg;
- childregs->orig_ax = -1;
- childregs->cs = __KERNEL_CS | get_kernel_rpl();
- childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
+ frame->bx = sp; /* function */
+ frame->r12 = arg;
return 0;
}
+ frame->bx = 0;
*childregs = *current_pt_regs();
childregs->ax = 0;
@@ -511,7 +511,7 @@ void set_personality_ia32(bool x32)
current->personality &= ~READ_IMPLIES_EXEC;
/* in_compat_syscall() uses the presence of the x32
syscall bit flag to determine compat status */
- current_thread_info()->status &= ~TS_COMPAT;
+ current->thread.status &= ~TS_COMPAT;
} else {
set_thread_flag(TIF_IA32);
clear_thread_flag(TIF_X32);
@@ -519,11 +519,24 @@ void set_personality_ia32(bool x32)
current->mm->context.ia32_compat = TIF_IA32;
current->personality |= force_personality32;
/* Prepare the first "return" to user space */
- current_thread_info()->status |= TS_COMPAT;
+ current->thread.status |= TS_COMPAT;
}
}
EXPORT_SYMBOL_GPL(set_personality_ia32);
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
+{
+ int ret;
+
+ ret = map_vdso_once(image, addr);
+ if (ret)
+ return ret;
+
+ return (long)image->size;
+}
+#endif
+
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
int ret = 0;
@@ -577,6 +590,19 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
break;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
+# ifdef CONFIG_X86_X32_ABI
+ case ARCH_MAP_VDSO_X32:
+ return prctl_map_vdso(&vdso_image_x32, addr);
+# endif
+# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ case ARCH_MAP_VDSO_32:
+ return prctl_map_vdso(&vdso_image_32, addr);
+# endif
+ case ARCH_MAP_VDSO_64:
+ return prctl_map_vdso(&vdso_image_64, addr);
+#endif
+
default:
ret = -EINVAL;
break;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index f79576a541ff..0e63c0267f99 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -173,8 +173,8 @@ unsigned long kernel_stack_pointer(struct pt_regs *regs)
return sp;
prev_esp = (u32 *)(context);
- if (prev_esp)
- return (unsigned long)prev_esp;
+ if (*prev_esp)
+ return (unsigned long)*prev_esp;
return (unsigned long)regs;
}
@@ -934,7 +934,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
*/
regs->orig_ax = value;
if (syscall_get_nr(child, regs) >= 0)
- task_thread_info(child)->status |= TS_I386_REGS_POKED;
+ child->thread.status |= TS_I386_REGS_POKED;
break;
case offsetof(struct user32, regs.eflags):
@@ -1250,7 +1250,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
#ifdef CONFIG_X86_64
-static struct user_regset x86_64_regsets[] __read_mostly = {
+static struct user_regset x86_64_regsets[] __ro_after_init = {
[REGSET_GENERAL] = {
.core_note_type = NT_PRSTATUS,
.n = sizeof(struct user_regs_struct) / sizeof(long),
@@ -1291,7 +1291,7 @@ static const struct user_regset_view user_x86_64_view = {
#endif /* CONFIG_X86_64 */
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
-static struct user_regset x86_32_regsets[] __read_mostly = {
+static struct user_regset x86_32_regsets[] __ro_after_init = {
[REGSET_GENERAL] = {
.core_note_type = NT_PRSTATUS,
.n = sizeof(struct user_regs_struct32) / sizeof(u32),
@@ -1344,7 +1344,7 @@ static const struct user_regset_view user_x86_32_view = {
*/
u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
-void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
+void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask)
{
#ifdef CONFIG_X86_64
x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64);
@@ -1358,7 +1358,7 @@ void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
const struct user_regset_view *task_user_regset_view(struct task_struct *task)
{
#ifdef CONFIG_IA32_EMULATION
- if (test_tsk_thread_flag(task, TIF_IA32))
+ if (!user_64bit_mode(task_pt_regs(task)))
#endif
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
return &user_x86_32_view;
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index cc457ff818ad..51402a7e4ca6 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -626,3 +626,34 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3,
amd_disable_seq_and_redirect_scrub);
#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+#include <linux/jump_label.h>
+#include <asm/string_64.h>
+
+/* Ivy Bridge, Haswell, Broadwell */
+static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev)
+{
+ u32 capid0;
+
+ pci_read_config_dword(pdev, 0x84, &capid0);
+
+ if (capid0 & 0x10)
+ static_branch_inc(&mcsafe_key);
+}
+
+/* Skylake */
+static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev)
+{
+ u32 capid0;
+
+ pci_read_config_dword(pdev, 0x84, &capid0);
+
+ if ((capid0 & 0xc0) == 0xc0)
+ static_branch_inc(&mcsafe_key);
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap);
+#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 63bf27d972b7..e244c19a2451 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -705,7 +705,7 @@ static void native_machine_power_off(void)
tboot_shutdown(TB_SHUTDOWN_HALT);
}
-struct machine_ops machine_ops = {
+struct machine_ops machine_ops __ro_after_init = {
.power_off = native_machine_power_off,
.shutdown = native_machine_shutdown,
.emergency_restart = native_machine_emergency_restart,
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
index 80eab01c1a68..2408c1603438 100644
--- a/arch/x86/kernel/resource.c
+++ b/arch/x86/kernel/resource.c
@@ -27,8 +27,8 @@ static void remove_e820_regions(struct resource *avail)
int i;
struct e820entry *entry;
- for (i = 0; i < e820.nr_map; i++) {
- entry = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ entry = &e820->map[i];
resource_clip(avail, entry->addr,
entry->addr + entry->size - 1);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 991b77986d57..bbfbca5fea0c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -210,9 +210,9 @@ EXPORT_SYMBOL(boot_cpu_data);
#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
-__visible unsigned long mmu_cr4_features;
+__visible unsigned long mmu_cr4_features __ro_after_init;
#else
-__visible unsigned long mmu_cr4_features = X86_CR4_PAE;
+__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE;
#endif
/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
@@ -458,8 +458,8 @@ static void __init e820_reserve_setup_data(void)
early_memunmap(data, sizeof(*data));
}
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
- memcpy(&e820_saved, &e820, sizeof(struct e820map));
+ sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
+ memcpy(e820_saved, e820, sizeof(struct e820map));
printk(KERN_INFO "extended physical RAM map:\n");
e820_print_map("reserve setup_data");
}
@@ -763,7 +763,7 @@ static void __init trim_bios_range(void)
*/
e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
}
/* called before trim_bios_range() to spare extra sanitize */
@@ -936,8 +936,6 @@ void __init setup_arch(char **cmdline_p)
x86_init.oem.arch_setup();
- kernel_randomize_memory();
-
iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
setup_memory_map();
parse_setup_data();
@@ -1034,7 +1032,7 @@ void __init setup_arch(char **cmdline_p)
if (ppro_with_ram_bug()) {
e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
E820_RESERVED);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
printk(KERN_INFO "fixed physical RAM map:\n");
e820_print_map("bad_ppro");
}
@@ -1055,6 +1053,12 @@ void __init setup_arch(char **cmdline_p)
max_possible_pfn = max_pfn;
+ /*
+ * Define random base addresses for memory sections after max_pfn is
+ * defined and before each memory section base is used.
+ */
+ kernel_randomize_memory();
+
#ifdef CONFIG_X86_32
/* max_low_pfn get updated here */
find_low_pfn_range();
@@ -1092,17 +1096,19 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();
- if (efi_enabled(EFI_BOOT)) {
+ reserve_bios_regions();
+
+ if (efi_enabled(EFI_MEMMAP)) {
efi_fake_memmap();
efi_find_mirror();
- }
+ efi_esrt_init();
- /*
- * The EFI specification says that boot service code won't be called
- * after ExitBootServices(). This is, in fact, a lie.
- */
- if (efi_enabled(EFI_MEMMAP))
+ /*
+ * The EFI specification says that boot service code won't be
+ * called after ExitBootServices(). This is, in fact, a lie.
+ */
efi_reserve_boot_services();
+ }
/* preallocate 4k for mptable mpc */
early_reserve_e820_mpc_new();
@@ -1125,7 +1131,13 @@ void __init setup_arch(char **cmdline_p)
early_trap_pf_init();
- setup_real_mode();
+ /*
+ * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
+ * with the current CR4 value. This may not be necessary, but
+ * auditing all the early-boot CR4 manipulation would be needed to
+ * rule it out.
+ */
+ mmu_cr4_features = __read_cr4();
memblock_set_current_limit(get_max_mapped());
@@ -1174,13 +1186,6 @@ void __init setup_arch(char **cmdline_p)
kasan_init();
- if (boot_cpu_data.cpuid_level >= 0) {
- /* A CPU has %cr4 if and only if it has CPUID */
- mmu_cr4_features = __read_cr4();
- if (trampoline_cr4_features)
- *trampoline_cr4_features = mmu_cr4_features;
- }
-
#ifdef CONFIG_X86_32
/* sync back kernel address range */
clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
@@ -1214,8 +1219,7 @@ void __init setup_arch(char **cmdline_p)
/*
* get boot-time SMP configuration:
*/
- if (smp_found_config)
- get_smp_config();
+ get_smp_config();
prefill_possible_map();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 7a40e068302d..2bbd27f89802 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -33,7 +33,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number);
DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
EXPORT_PER_CPU_SYMBOL(this_cpu_off);
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
+unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = {
[0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
};
EXPORT_SYMBOL(__per_cpu_offset);
@@ -246,7 +246,7 @@ void __init setup_per_cpu_areas(void)
#ifdef CONFIG_X86_64
per_cpu(irq_stack_ptr, cpu) =
per_cpu(irq_stack_union.irq_stack, cpu) +
- IRQ_STACK_SIZE - 64;
+ IRQ_STACK_SIZE;
#endif
#ifdef CONFIG_NUMA
per_cpu(x86_cpu_to_node_map, cpu) =
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 04cb3212db2d..763af1d0de64 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -42,6 +42,7 @@
#include <asm/syscalls.h>
#include <asm/sigframe.h>
+#include <asm/signal.h>
#define COPY(x) do { \
get_user_ex(regs->x, &sc->x); \
@@ -547,7 +548,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
return -EFAULT;
if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
- if (copy_siginfo_to_user32(&frame->info, &ksig->info))
+ if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true))
return -EFAULT;
}
@@ -660,20 +661,21 @@ badframe:
return 0;
}
-static inline int is_ia32_compat_frame(void)
+static inline int is_ia32_compat_frame(struct ksignal *ksig)
{
return IS_ENABLED(CONFIG_IA32_EMULATION) &&
- test_thread_flag(TIF_IA32);
+ ksig->ka.sa.sa_flags & SA_IA32_ABI;
}
-static inline int is_ia32_frame(void)
+static inline int is_ia32_frame(struct ksignal *ksig)
{
- return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame();
+ return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig);
}
-static inline int is_x32_frame(void)
+static inline int is_x32_frame(struct ksignal *ksig)
{
- return IS_ENABLED(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32);
+ return IS_ENABLED(CONFIG_X86_X32_ABI) &&
+ ksig->ka.sa.sa_flags & SA_X32_ABI;
}
static int
@@ -684,12 +686,12 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
compat_sigset_t *cset = (compat_sigset_t *) set;
/* Set up the stack frame */
- if (is_ia32_frame()) {
+ if (is_ia32_frame(ksig)) {
if (ksig->ka.sa.sa_flags & SA_SIGINFO)
return ia32_setup_rt_frame(usig, ksig, cset, regs);
else
return ia32_setup_frame(usig, ksig, cset, regs);
- } else if (is_x32_frame()) {
+ } else if (is_x32_frame(ksig)) {
return x32_setup_rt_frame(ksig, cset, regs);
} else {
return __setup_rt_frame(ksig->sig, ksig, set, regs);
@@ -783,7 +785,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
* than the tracee.
*/
#ifdef CONFIG_IA32_EMULATION
- if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED))
+ if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
return __NR_ia32_restart_syscall;
#endif
#ifdef CONFIG_X86_X32_ABI
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index b44564bf86a8..40df33753bae 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -1,5 +1,6 @@
#include <linux/compat.h>
#include <linux/uaccess.h>
+#include <linux/ptrace.h>
/*
* The compat_siginfo_t structure and handing code is very easy
@@ -92,10 +93,31 @@ static inline void signal_compat_build_tests(void)
/* any new si_fields should be added here */
}
-int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
+void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
+{
+ /* Don't leak in-kernel non-uapi flags to user-space */
+ if (oact)
+ oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI);
+
+ if (!act)
+ return;
+
+ /* Don't let flags to be set from userspace */
+ act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI);
+
+ if (user_64bit_mode(current_pt_regs()))
+ return;
+
+ if (in_ia32_syscall())
+ act->sa.sa_flags |= SA_IA32_ABI;
+ if (in_x32_syscall())
+ act->sa.sa_flags |= SA_X32_ABI;
+}
+
+int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from,
+ bool x32_ABI)
{
int err = 0;
- bool ia32 = test_thread_flag(TIF_IA32);
signal_compat_build_tests();
@@ -146,7 +168,7 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
put_user_ex(from->si_arch, &to->si_arch);
break;
case __SI_CHLD >> 16:
- if (ia32) {
+ if (!x32_ABI) {
put_user_ex(from->si_utime, &to->si_utime);
put_user_ex(from->si_stime, &to->si_stime);
} else {
@@ -180,6 +202,12 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
return err;
}
+/* from syscall's path, where we know the ABI */
+int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
+{
+ return __copy_siginfo_to_user32(to, from, in_x32_syscall());
+}
+
int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
{
int err = 0;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2a6e84a30a54..42a93621f5b0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -100,10 +100,11 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
/* Logical package management. We might want to allocate that dynamically */
static int *physical_to_logical_pkg __read_mostly;
static unsigned long *physical_package_map __read_mostly;;
-static unsigned long *logical_package_map __read_mostly;
static unsigned int max_physical_pkg_id __read_mostly;
unsigned int __max_logical_packages __read_mostly;
EXPORT_SYMBOL(__max_logical_packages);
+static unsigned int logical_packages __read_mostly;
+static bool logical_packages_frozen __read_mostly;
/* Maximum number of SMT threads on any online core */
int __max_smt_threads __read_mostly;
@@ -277,14 +278,14 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu)
if (test_and_set_bit(pkg, physical_package_map))
goto found;
- new = find_first_zero_bit(logical_package_map, __max_logical_packages);
- if (new >= __max_logical_packages) {
+ if (logical_packages_frozen) {
physical_to_logical_pkg[pkg] = -1;
- pr_warn("APIC(%x) Package %u exceeds logical package map\n",
+ pr_warn("APIC(%x) Package %u exceeds logical package max\n",
apicid, pkg);
return -ENOSPC;
}
- set_bit(new, logical_package_map);
+
+ new = logical_packages++;
pr_info("APIC(%x) Converting physical %u to logical package %u\n",
apicid, pkg, new);
physical_to_logical_pkg[pkg] = new;
@@ -341,6 +342,7 @@ static void __init smp_init_package_map(void)
}
__max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus);
+ logical_packages = 0;
/*
* Possibly larger than what we need as the number of apic ids per
@@ -352,10 +354,6 @@ static void __init smp_init_package_map(void)
memset(physical_to_logical_pkg, 0xff, size);
size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long);
physical_package_map = kzalloc(size, GFP_KERNEL);
- size = BITS_TO_LONGS(__max_logical_packages) * sizeof(unsigned long);
- logical_package_map = kzalloc(size, GFP_KERNEL);
-
- pr_info("Max logical packages: %u\n", __max_logical_packages);
for_each_present_cpu(cpu) {
unsigned int apicid = apic->cpu_present_to_apicid(cpu);
@@ -369,6 +367,15 @@ static void __init smp_init_package_map(void)
set_cpu_possible(cpu, false);
set_cpu_present(cpu, false);
}
+
+ if (logical_packages > __max_logical_packages) {
+ pr_warn("Detected more packages (%u), then computed by BIOS data (%u).\n",
+ logical_packages, __max_logical_packages);
+ logical_packages_frozen = true;
+ __max_logical_packages = logical_packages;
+ }
+
+ pr_info("Max logical packages: %u\n", __max_logical_packages);
}
void __init smp_store_boot_cpu_info(void)
@@ -464,7 +471,7 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
return false;
}
-static struct sched_domain_topology_level numa_inside_package_topology[] = {
+static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
@@ -473,22 +480,23 @@ static struct sched_domain_topology_level numa_inside_package_topology[] = {
#endif
{ NULL, },
};
+
+static struct sched_domain_topology_level x86_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
/*
- * set_sched_topology() sets the topology internal to a CPU. The
- * NUMA topologies are layered on top of it to build the full
- * system topology.
- *
- * If NUMA nodes are observed to occur within a CPU package, this
- * function should be called. It forces the sched domain code to
- * only use the SMT level for the CPU portion of the topology.
- * This essentially falls back to relying on NUMA information
- * from the SRAT table to describe the entire system topology
- * (except for hyperthreads).
+ * Set if a package/die has multiple NUMA nodes inside.
+ * AMD Magny-Cours and Intel Cluster-on-Die have this.
*/
-static void primarily_use_numa_for_topology(void)
-{
- set_sched_topology(numa_inside_package_topology);
-}
+static bool x86_has_numa_in_package;
void set_cpu_sibling_map(int cpu)
{
@@ -551,7 +559,7 @@ void set_cpu_sibling_map(int cpu)
c->booted_cores = cpu_data(i).booted_cores;
}
if (match_die(c, o) && !topology_same_node(c, o))
- primarily_use_numa_for_topology();
+ x86_has_numa_in_package = true;
}
threads = cpumask_weight(topology_sibling_cpumask(cpu));
@@ -683,7 +691,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
* Give the other CPU some time to accept the IPI.
*/
udelay(200);
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
maxlvt = lapic_get_maxlvt();
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
@@ -710,7 +718,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
/*
* Be paranoid about clearing APIC errors.
*/
- if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
@@ -749,7 +757,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
* Determine this based on the APIC version.
* If we don't have an integrated APIC, don't send the STARTUP IPIs.
*/
- if (APIC_INTEGRATED(apic_version[phys_apicid]))
+ if (APIC_INTEGRATED(boot_cpu_apic_version))
num_starts = 2;
else
num_starts = 0;
@@ -935,7 +943,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
per_cpu(cpu_current_top_of_stack, cpu) =
(unsigned long)task_stack_page(idle) + THREAD_SIZE;
#else
- clear_tsk_thread_flag(idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
#endif
}
@@ -962,7 +969,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
- stack_start = idle->thread.sp;
+ initial_stack = idle->thread.sp;
/*
* Enable the espfix hack for this CPU
@@ -987,7 +994,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
/*
* Be paranoid about clearing APIC errors.
*/
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}
@@ -1108,17 +1115,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
common_cpu_up(cpu, tidle);
- /*
- * We have to walk the irq descriptors to setup the vector
- * space for the cpu which comes online. Prevent irq
- * alloc/free across the bringup.
- */
- irq_lock_sparse();
-
err = do_boot_cpu(apicid, cpu, tidle);
-
if (err) {
- irq_unlock_sparse();
pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
return -EIO;
}
@@ -1136,8 +1134,6 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
touch_nmi_watchdog();
}
- irq_unlock_sparse();
-
return 0;
}
@@ -1242,7 +1238,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
/*
* If we couldn't find a local APIC, then get out of here now!
*/
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
+ if (APIC_INTEGRATED(boot_cpu_apic_version) &&
!boot_cpu_has(X86_FEATURE_APIC)) {
if (!disable_apic) {
pr_err("BIOS bug, local APIC #%d not detected!...\n",
@@ -1297,6 +1293,16 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
}
+
+ /*
+ * Set 'default' x86 topology, this matches default_topology() in that
+ * it has NUMA nodes as a topology level. See also
+ * native_smp_cpus_done().
+ *
+ * Must be done before set_cpus_sibling_map() is ran.
+ */
+ set_sched_topology(x86_topology);
+
set_cpu_sibling_map(0);
switch (smp_sanity_check(max_cpus)) {
@@ -1316,14 +1322,13 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
break;
}
- default_setup_apic_routing();
-
if (read_apic_id() != boot_cpu_physical_apicid) {
panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
read_apic_id(), boot_cpu_physical_apicid);
/* Or can we switch back to PIC here? */
}
+ default_setup_apic_routing();
cpu0_logical_apicid = apic_bsp_setup(false);
pr_info("CPU%d: ", 0);
@@ -1363,6 +1368,9 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
{
pr_debug("Boot done\n");
+ if (x86_has_numa_in_package)
+ set_sched_topology(x86_numa_in_package_topology);
+
nmi_selftest();
impress_friends();
setup_ioapic_dest();
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 4738f5e0f2ab..0653788026e2 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -8,80 +8,69 @@
#include <linux/export.h>
#include <linux/uaccess.h>
#include <asm/stacktrace.h>
+#include <asm/unwind.h>
-static int save_stack_stack(void *data, char *name)
+static int save_stack_address(struct stack_trace *trace, unsigned long addr,
+ bool nosched)
{
- return 0;
-}
-
-static int
-__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched)
-{
- struct stack_trace *trace = data;
-#ifdef CONFIG_FRAME_POINTER
- if (!reliable)
- return 0;
-#endif
if (nosched && in_sched_functions(addr))
return 0;
+
if (trace->skip > 0) {
trace->skip--;
return 0;
}
- if (trace->nr_entries < trace->max_entries) {
- trace->entries[trace->nr_entries++] = addr;
- return 0;
- } else {
- return -1; /* no more room, stop walking the stack */
- }
-}
-static int save_stack_address(void *data, unsigned long addr, int reliable)
-{
- return __save_stack_address(data, addr, reliable, false);
+ if (trace->nr_entries >= trace->max_entries)
+ return -1;
+
+ trace->entries[trace->nr_entries++] = addr;
+ return 0;
}
-static int
-save_stack_address_nosched(void *data, unsigned long addr, int reliable)
+static void __save_stack_trace(struct stack_trace *trace,
+ struct task_struct *task, struct pt_regs *regs,
+ bool nosched)
{
- return __save_stack_address(data, addr, reliable, true);
-}
+ struct unwind_state state;
+ unsigned long addr;
-static const struct stacktrace_ops save_stack_ops = {
- .stack = save_stack_stack,
- .address = save_stack_address,
- .walk_stack = print_context_stack,
-};
+ if (regs)
+ save_stack_address(trace, regs->ip, nosched);
-static const struct stacktrace_ops save_stack_ops_nosched = {
- .stack = save_stack_stack,
- .address = save_stack_address_nosched,
- .walk_stack = print_context_stack,
-};
+ for (unwind_start(&state, task, regs, NULL); !unwind_done(&state);
+ unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr || save_stack_address(trace, addr, nosched))
+ break;
+ }
+
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
/*
* Save stack-backtrace addresses into a stack_trace buffer.
*/
void save_stack_trace(struct stack_trace *trace)
{
- dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = ULONG_MAX;
+ __save_stack_trace(trace, current, NULL, false);
}
EXPORT_SYMBOL_GPL(save_stack_trace);
void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
{
- dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = ULONG_MAX;
+ __save_stack_trace(trace, current, regs, false);
}
void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
{
- dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = ULONG_MAX;
+ if (!try_get_task_stack(tsk))
+ return;
+
+ __save_stack_trace(trace, tsk, NULL, true);
+
+ put_task_stack(tsk);
}
EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 654f6c66fe45..8402907825b0 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -188,12 +188,12 @@ static int tboot_setup_sleep(void)
tboot->num_mac_regions = 0;
- for (i = 0; i < e820.nr_map; i++) {
- if ((e820.map[i].type != E820_RAM)
- && (e820.map[i].type != E820_RESERVED_KERN))
+ for (i = 0; i < e820->nr_map; i++) {
+ if ((e820->map[i].type != E820_RAM)
+ && (e820->map[i].type != E820_RESERVED_KERN))
continue;
- add_mac_region(e820.map[i].addr, e820.map[i].size);
+ add_mac_region(e820->map[i].addr, e820->map[i].size);
}
tboot->acpi_sinfo.kernel_s3_resume_vector =
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b70ca12dd389..bd4e3d4d3625 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check)
+#ifdef CONFIG_VMAP_STACK
+__visible void __noreturn handle_stack_overflow(const char *message,
+ struct pt_regs *regs,
+ unsigned long fault_address)
+{
+ printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
+ (void *)fault_address, current->stack,
+ (char *)current->stack + THREAD_SIZE - 1);
+ die(message, regs, 0);
+
+ /* Be absolutely certain we don't return. */
+ panic(message);
+}
+#endif
+
#ifdef CONFIG_X86_64
/* Runs on IST stack */
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
{
static const char str[] = "double fault";
struct task_struct *tsk = current;
+#ifdef CONFIG_VMAP_STACK
+ unsigned long cr2;
+#endif
#ifdef CONFIG_X86_ESPFIX64
extern unsigned char native_irq_return_iret[];
@@ -332,6 +350,49 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_DF;
+#ifdef CONFIG_VMAP_STACK
+ /*
+ * If we overflow the stack into a guard page, the CPU will fail
+ * to deliver #PF and will send #DF instead. Similarly, if we
+ * take any non-IST exception while too close to the bottom of
+ * the stack, the processor will get a page fault while
+ * delivering the exception and will generate a double fault.
+ *
+ * According to the SDM (footnote in 6.15 under "Interrupt 14 -
+ * Page-Fault Exception (#PF):
+ *
+ * Processors update CR2 whenever a page fault is detected. If a
+ * second page fault occurs while an earlier page fault is being
+ * deliv- ered, the faulting linear address of the second fault will
+ * overwrite the contents of CR2 (replacing the previous
+ * address). These updates to CR2 occur even if the page fault
+ * results in a double fault or occurs during the delivery of a
+ * double fault.
+ *
+ * The logic below has a small possibility of incorrectly diagnosing
+ * some errors as stack overflows. For example, if the IDT or GDT
+ * gets corrupted such that #GP delivery fails due to a bad descriptor
+ * causing #GP and we hit this condition while CR2 coincidentally
+ * points to the stack guard page, we'll think we overflowed the
+ * stack. Given that we're going to panic one way or another
+ * if this happens, this isn't necessarily worth fixing.
+ *
+ * If necessary, we could improve the test by only diagnosing
+ * a stack overflow if the saved RSP points within 47 bytes of
+ * the bottom of the stack: if RSP == tsk_stack + 48 and we
+ * take an exception, the stack is already aligned and there
+ * will be enough room SS, RSP, RFLAGS, CS, RIP, and a
+ * possible error code, so a stack overflow would *not* double
+ * fault. With any less space left, exception delivery could
+ * fail, and, as a practical matter, we've overflowed the
+ * stack even if the actual trigger for the double fault was
+ * something else.
+ */
+ cr2 = read_cr2();
+ if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
+ handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
+#endif
+
#ifdef CONFIG_DOUBLEFAULT
df_debug(regs, error_code);
#endif
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 1ef87e887051..46b2f41f8b05 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -22,6 +22,8 @@
#include <asm/nmi.h>
#include <asm/x86_init.h>
#include <asm/geode.h>
+#include <asm/apic.h>
+#include <asm/intel-family.h>
unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -685,11 +687,16 @@ unsigned long native_calibrate_tsc(void)
if (crystal_khz == 0) {
switch (boot_cpu_data.x86_model) {
- case 0x4E: /* SKL */
- case 0x5E: /* SKL */
+ case INTEL_FAM6_SKYLAKE_MOBILE:
+ case INTEL_FAM6_SKYLAKE_DESKTOP:
+ case INTEL_FAM6_KABYLAKE_MOBILE:
+ case INTEL_FAM6_KABYLAKE_DESKTOP:
crystal_khz = 24000; /* 24.0 MHz */
break;
- case 0x5C: /* BXT */
+ case INTEL_FAM6_SKYLAKE_X:
+ crystal_khz = 25000; /* 25.0 MHz */
+ break;
+ case INTEL_FAM6_ATOM_GOLDMONT:
crystal_khz = 19200; /* 19.2 MHz */
break;
}
@@ -1249,6 +1256,9 @@ static void tsc_refine_calibration_work(struct work_struct *work)
(unsigned long)tsc_khz / 1000,
(unsigned long)tsc_khz % 1000);
+ /* Inform the TSC deadline clockevent devices about the recalibration */
+ lapic_update_tsc_freq();
+
out:
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
new file mode 100644
index 000000000000..a2456d4d286a
--- /dev/null
+++ b/arch/x86/kernel/unwind_frame.c
@@ -0,0 +1,93 @@
+#include <linux/sched.h>
+#include <asm/ptrace.h>
+#include <asm/bitops.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+#define FRAME_HEADER_SIZE (sizeof(long) * 2)
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ unsigned long addr;
+ unsigned long *addr_p = unwind_get_return_address_ptr(state);
+
+ if (unwind_done(state))
+ return 0;
+
+ addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p,
+ addr_p);
+
+ return __kernel_text_address(addr) ? addr : 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+static bool update_stack_state(struct unwind_state *state, void *addr,
+ size_t len)
+{
+ struct stack_info *info = &state->stack_info;
+
+ /*
+ * If addr isn't on the current stack, switch to the next one.
+ *
+ * We may have to traverse multiple stacks to deal with the possibility
+ * that 'info->next_sp' could point to an empty stack and 'addr' could
+ * be on a subsequent stack.
+ */
+ while (!on_stack(info, addr, len))
+ if (get_stack_info(info->next_sp, state->task, info,
+ &state->stack_mask))
+ return false;
+
+ return true;
+}
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+ unsigned long *next_bp;
+
+ if (unwind_done(state))
+ return false;
+
+ next_bp = (unsigned long *)*state->bp;
+
+ /* make sure the next frame's data is accessible */
+ if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE))
+ return false;
+
+ /* move to the next frame */
+ state->bp = next_bp;
+ return true;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ memset(state, 0, sizeof(*state));
+ state->task = task;
+
+ /* don't even attempt to start from user mode regs */
+ if (regs && user_mode(regs)) {
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return;
+ }
+
+ /* set up the starting stack frame */
+ state->bp = get_frame_pointer(task, regs);
+
+ /* initialize stack info and make sure the frame data is accessible */
+ get_stack_info(state->bp, state->task, &state->stack_info,
+ &state->stack_mask);
+ update_stack_state(state, state->bp, FRAME_HEADER_SIZE);
+
+ /*
+ * The caller can provide the address of the first frame directly
+ * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+ * to start unwinding at. Skip ahead until we reach it.
+ */
+ while (!unwind_done(state) &&
+ (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+ state->bp < first_frame))
+ unwind_next_frame(state);
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c
new file mode 100644
index 000000000000..b5a834c93065
--- /dev/null
+++ b/arch/x86/kernel/unwind_guess.c
@@ -0,0 +1,43 @@
+#include <linux/sched.h>
+#include <linux/ftrace.h>
+#include <asm/ptrace.h>
+#include <asm/bitops.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+ struct stack_info *info = &state->stack_info;
+
+ if (unwind_done(state))
+ return false;
+
+ do {
+ for (state->sp++; state->sp < info->end; state->sp++)
+ if (__kernel_text_address(*state->sp))
+ return true;
+
+ state->sp = info->next_sp;
+
+ } while (!get_stack_info(state->sp, state->task, info,
+ &state->stack_mask));
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ memset(state, 0, sizeof(*state));
+
+ state->task = task;
+ state->sp = first_frame;
+
+ get_stack_info(first_frame, state->task, &state->stack_info,
+ &state->stack_mask);
+
+ if (!__kernel_text_address(*first_frame))
+ unwind_next_frame(state);
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6c1ff31d99ff..495c776de4b4 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -357,20 +357,22 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
*cursor &= 0xfe;
}
/*
- * Similar treatment for VEX3 prefix.
- * TODO: add XOP/EVEX treatment when insn decoder supports them
+ * Similar treatment for VEX3/EVEX prefix.
+ * TODO: add XOP treatment when insn decoder supports them
*/
- if (insn->vex_prefix.nbytes == 3) {
+ if (insn->vex_prefix.nbytes >= 3) {
/*
* vex2: c5 rvvvvLpp (has no b bit)
* vex3/xop: c4/8f rxbmmmmm wvvvvLpp
* evex: 62 rxbR00mm wvvvv1pp zllBVaaa
- * (evex will need setting of both b and x since
- * in non-sib encoding evex.x is 4th bit of MODRM.rm)
- * Setting VEX3.b (setting because it has inverted meaning):
+ * Setting VEX3.b (setting because it has inverted meaning).
+ * Setting EVEX.x since (in non-SIB encoding) EVEX.x
+ * is the 4th bit of MODRM.rm, and needs the same treatment.
+ * For VEX3-encoded insns, VEX3.x value has no effect in
+ * non-SIB encoding, the change is superfluous but harmless.
*/
cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1;
- *cursor |= 0x20;
+ *cursor |= 0x60;
}
/*
@@ -415,12 +417,10 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
reg = MODRM_REG(insn); /* Fetch modrm.reg */
reg2 = 0xff; /* Fetch vex.vvvv */
- if (insn->vex_prefix.nbytes == 2)
- reg2 = insn->vex_prefix.bytes[1];
- else if (insn->vex_prefix.nbytes == 3)
+ if (insn->vex_prefix.nbytes)
reg2 = insn->vex_prefix.bytes[2];
/*
- * TODO: add XOP, EXEV vvvv reading.
+ * TODO: add XOP vvvv reading.
*
* vex.vvvv field is in bits 6-3, bits are inverted.
* But in 32-bit mode, high-order bit may be ignored.
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 95e49f6e4fc3..b2cee3d19477 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -38,7 +38,7 @@ EXPORT_SYMBOL(__copy_user_nocache);
EXPORT_SYMBOL(_copy_from_user);
EXPORT_SYMBOL(_copy_to_user);
-EXPORT_SYMBOL_GPL(memcpy_mcsafe);
+EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled);
EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(clear_page);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 76c5e52436c4..0bd9f1287f39 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -91,7 +91,7 @@ struct x86_cpuinit_ops x86_cpuinit = {
static void default_nmi_init(void) { };
static int default_i8042_detect(void) { return 1; };
-struct x86_platform_ops x86_platform = {
+struct x86_platform_ops x86_platform __ro_after_init = {
.calibrate_cpu = native_calibrate_cpu,
.calibrate_tsc = native_calibrate_tsc,
.get_wallclock = mach_get_cmos_time,
@@ -108,7 +108,7 @@ struct x86_platform_ops x86_platform = {
EXPORT_SYMBOL_GPL(x86_platform);
#if defined(CONFIG_PCI_MSI)
-struct x86_msi_ops x86_msi = {
+struct x86_msi_ops x86_msi __ro_after_init = {
.setup_msi_irqs = native_setup_msi_irqs,
.teardown_msi_irq = native_teardown_msi_irq,
.teardown_msi_irqs = default_teardown_msi_irqs,
@@ -137,7 +137,7 @@ void arch_restore_msi_irqs(struct pci_dev *dev)
}
#endif
-struct x86_io_apic_ops x86_io_apic_ops = {
+struct x86_io_apic_ops x86_io_apic_ops __ro_after_init = {
.read = native_io_apic_read,
.disable = native_disable_io_apic,
};
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 5f42d038fcb4..c7220ba94aa7 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -109,6 +109,7 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
{
bool new_val, old_val;
struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
union kvm_ioapic_redirect_entry *e;
e = &ioapic->redirtbl[RTC_GSI];
@@ -117,16 +118,17 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
return;
new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector);
- old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map);
+ old_val = test_bit(vcpu->vcpu_id, dest_map->map);
if (new_val == old_val)
return;
if (new_val) {
- __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map);
+ __set_bit(vcpu->vcpu_id, dest_map->map);
+ dest_map->vectors[vcpu->vcpu_id] = e->fields.vector;
ioapic->rtc_status.pending_eoi++;
} else {
- __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map);
+ __clear_bit(vcpu->vcpu_id, dest_map->map);
ioapic->rtc_status.pending_eoi--;
rtc_status_pending_eoi_check_valid(ioapic);
}
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
index 39b91127ef07..cd944435dfbd 100644
--- a/arch/x86/kvm/pmu_amd.c
+++ b/arch/x86/kvm/pmu_amd.c
@@ -23,8 +23,8 @@
static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
[0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
[1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
- [2] = { 0x80, 0x00, PERF_COUNT_HW_CACHE_REFERENCES },
- [3] = { 0x81, 0x00, PERF_COUNT_HW_CACHE_MISSES },
+ [2] = { 0x7d, 0x07, PERF_COUNT_HW_CACHE_REFERENCES },
+ [3] = { 0x7e, 0x07, PERF_COUNT_HW_CACHE_MISSES },
[4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
[5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
[6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index af523d84d102..1e6b84b96ea6 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4961,7 +4961,7 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
avic_handle_ldr_update(vcpu);
}
-static struct kvm_x86_ops svm_x86_ops = {
+static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
.hardware_setup = svm_hardware_setup,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a45d8580f91e..121fdf6e9ed0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -422,6 +422,7 @@ struct nested_vmx {
struct list_head vmcs02_pool;
int vmcs02_num;
u64 vmcs01_tsc_offset;
+ bool change_vmcs01_virtual_x2apic_mode;
/* L2 must run next, and mustn't decide to exit to L1. */
bool nested_run_pending;
/*
@@ -435,6 +436,8 @@ struct nested_vmx {
bool pi_pending;
u16 posted_intr_nv;
+ unsigned long *msr_bitmap;
+
struct hrtimer preemption_timer;
bool preemption_timer_expired;
@@ -924,7 +927,6 @@ static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
static unsigned long *vmx_msr_bitmap_legacy_x2apic;
static unsigned long *vmx_msr_bitmap_longmode_x2apic;
-static unsigned long *vmx_msr_bitmap_nested;
static unsigned long *vmx_vmread_bitmap;
static unsigned long *vmx_vmwrite_bitmap;
@@ -2198,6 +2200,12 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
new.control) != old.control);
}
+static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
+{
+ vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
+ vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
+}
+
/*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
@@ -2256,10 +2264,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
/* Setup TSC multiplier */
if (kvm_has_tsc_control &&
- vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) {
- vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio;
- vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
- }
+ vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
+ decache_tsc_multiplier(vmx);
vmx_vcpu_pi_load(vcpu, cpu);
vmx->host_pkru = read_pkru();
@@ -2508,7 +2514,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
unsigned long *msr_bitmap;
if (is_guest_mode(vcpu))
- msr_bitmap = vmx_msr_bitmap_nested;
+ msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
else if (cpu_has_secondary_exec_ctrls() &&
(vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
@@ -6363,13 +6369,6 @@ static __init int hardware_setup(void)
if (!vmx_msr_bitmap_longmode_x2apic)
goto out4;
- if (nested) {
- vmx_msr_bitmap_nested =
- (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_nested)
- goto out5;
- }
-
vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_vmread_bitmap)
goto out6;
@@ -6392,8 +6391,6 @@ static __init int hardware_setup(void)
memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
- if (nested)
- memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
if (setup_vmcs_config(&vmcs_config) < 0) {
r = -EIO;
@@ -6529,9 +6526,6 @@ out8:
out7:
free_page((unsigned long)vmx_vmread_bitmap);
out6:
- if (nested)
- free_page((unsigned long)vmx_msr_bitmap_nested);
-out5:
free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
out4:
free_page((unsigned long)vmx_msr_bitmap_longmode);
@@ -6557,8 +6551,6 @@ static __exit void hardware_unsetup(void)
free_page((unsigned long)vmx_io_bitmap_a);
free_page((unsigned long)vmx_vmwrite_bitmap);
free_page((unsigned long)vmx_vmread_bitmap);
- if (nested)
- free_page((unsigned long)vmx_msr_bitmap_nested);
free_kvm_area();
}
@@ -6995,16 +6987,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
return 1;
}
+ if (cpu_has_vmx_msr_bitmap()) {
+ vmx->nested.msr_bitmap =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx->nested.msr_bitmap)
+ goto out_msr_bitmap;
+ }
+
vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
if (!vmx->nested.cached_vmcs12)
- return -ENOMEM;
+ goto out_cached_vmcs12;
if (enable_shadow_vmcs) {
shadow_vmcs = alloc_vmcs();
- if (!shadow_vmcs) {
- kfree(vmx->nested.cached_vmcs12);
- return -ENOMEM;
- }
+ if (!shadow_vmcs)
+ goto out_shadow_vmcs;
/* mark vmcs as shadow */
shadow_vmcs->revision_id |= (1u << 31);
/* init shadow vmcs */
@@ -7024,6 +7021,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
skip_emulated_instruction(vcpu);
nested_vmx_succeed(vcpu);
return 1;
+
+out_shadow_vmcs:
+ kfree(vmx->nested.cached_vmcs12);
+
+out_cached_vmcs12:
+ free_page((unsigned long)vmx->nested.msr_bitmap);
+
+out_msr_bitmap:
+ return -ENOMEM;
}
/*
@@ -7098,6 +7104,10 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->nested.vmxon = false;
free_vpid(vmx->nested.vpid02);
nested_release_vmcs12(vmx);
+ if (vmx->nested.msr_bitmap) {
+ free_page((unsigned long)vmx->nested.msr_bitmap);
+ vmx->nested.msr_bitmap = NULL;
+ }
if (enable_shadow_vmcs)
free_vmcs(vmx->nested.current_shadow_vmcs);
kfree(vmx->nested.cached_vmcs12);
@@ -8419,6 +8429,12 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
{
u32 sec_exec_control;
+ /* Postpone execution until vmcs01 is the current VMCS. */
+ if (is_guest_mode(vcpu)) {
+ to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true;
+ return;
+ }
+
/*
* There is not point to enable virtualize x2apic without enable
* apicv
@@ -9472,8 +9488,10 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
{
int msr;
struct page *page;
- unsigned long *msr_bitmap;
+ unsigned long *msr_bitmap_l1;
+ unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
+ /* This shortcut is ok because we support only x2APIC MSRs so far. */
if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
return false;
@@ -9482,63 +9500,37 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
WARN_ON(1);
return false;
}
- msr_bitmap = (unsigned long *)kmap(page);
- if (!msr_bitmap) {
+ msr_bitmap_l1 = (unsigned long *)kmap(page);
+ if (!msr_bitmap_l1) {
nested_release_page_clean(page);
WARN_ON(1);
return false;
}
+ memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
+
if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
if (nested_cpu_has_apic_reg_virt(vmcs12))
for (msr = 0x800; msr <= 0x8ff; msr++)
nested_vmx_disable_intercept_for_msr(
- msr_bitmap,
- vmx_msr_bitmap_nested,
+ msr_bitmap_l1, msr_bitmap_l0,
msr, MSR_TYPE_R);
- /* TPR is allowed */
- nested_vmx_disable_intercept_for_msr(msr_bitmap,
- vmx_msr_bitmap_nested,
+
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
APIC_BASE_MSR + (APIC_TASKPRI >> 4),
MSR_TYPE_R | MSR_TYPE_W);
+
if (nested_cpu_has_vid(vmcs12)) {
- /* EOI and self-IPI are allowed */
nested_vmx_disable_intercept_for_msr(
- msr_bitmap,
- vmx_msr_bitmap_nested,
+ msr_bitmap_l1, msr_bitmap_l0,
APIC_BASE_MSR + (APIC_EOI >> 4),
MSR_TYPE_W);
nested_vmx_disable_intercept_for_msr(
- msr_bitmap,
- vmx_msr_bitmap_nested,
+ msr_bitmap_l1, msr_bitmap_l0,
APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
MSR_TYPE_W);
}
- } else {
- /*
- * Enable reading intercept of all the x2apic
- * MSRs. We should not rely on vmcs12 to do any
- * optimizations here, it may have been modified
- * by L1.
- */
- for (msr = 0x800; msr <= 0x8ff; msr++)
- __vmx_enable_intercept_for_msr(
- vmx_msr_bitmap_nested,
- msr,
- MSR_TYPE_R);
-
- __vmx_enable_intercept_for_msr(
- vmx_msr_bitmap_nested,
- APIC_BASE_MSR + (APIC_TASKPRI >> 4),
- MSR_TYPE_W);
- __vmx_enable_intercept_for_msr(
- vmx_msr_bitmap_nested,
- APIC_BASE_MSR + (APIC_EOI >> 4),
- MSR_TYPE_W);
- __vmx_enable_intercept_for_msr(
- vmx_msr_bitmap_nested,
- APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
- MSR_TYPE_W);
}
kunmap(page);
nested_release_page_clean(page);
@@ -9957,10 +9949,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
}
if (cpu_has_vmx_msr_bitmap() &&
- exec_control & CPU_BASED_USE_MSR_BITMAPS) {
- nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
- /* MSR_BITMAP will be set by following vmx_set_efer. */
- } else
+ exec_control & CPU_BASED_USE_MSR_BITMAPS &&
+ nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
+ ; /* MSR_BITMAP will be set by following vmx_set_efer. */
+ else
exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
/*
@@ -10011,6 +10003,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
else
vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+ if (kvm_has_tsc_control)
+ decache_tsc_multiplier(vmx);
if (enable_vpid) {
/*
@@ -10767,6 +10761,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
else
vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
PIN_BASED_VMX_PREEMPTION_TIMER);
+ if (kvm_has_tsc_control)
+ decache_tsc_multiplier(vmx);
+
+ if (vmx->nested.change_vmcs01_virtual_x2apic_mode) {
+ vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
+ vmx_set_virtual_x2apic_mode(vcpu,
+ vcpu->arch.apic_base & X2APIC_ENABLE);
+ }
/* This is needed for same reason as it was needed in prepare_vmcs02 */
vmx->host_rsp = 0;
@@ -11175,7 +11177,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
~FEATURE_CONTROL_LMCE;
}
-static struct kvm_x86_ops vmx_x86_ops = {
+static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
.hardware_setup = hardware_setup,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 19f9f9e05c2a..699f8726539a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2743,16 +2743,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
- if (kvm_lapic_hv_timer_in_use(vcpu) &&
- kvm_x86_ops->set_hv_timer(vcpu,
- kvm_get_lapic_tscdeadline_msr(vcpu)))
- kvm_lapic_switch_to_sw_timer(vcpu);
if (check_tsc_unstable()) {
u64 offset = kvm_compute_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
kvm_x86_ops->write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
}
+ if (kvm_lapic_hv_timer_in_use(vcpu) &&
+ kvm_x86_ops->set_hv_timer(vcpu,
+ kvm_get_lapic_tscdeadline_msr(vcpu)))
+ kvm_lapic_switch_to_sw_timer(vcpu);
/*
* On a host with synchronized TSC, there is no need to update
* kvmclock on vcpu->cpu migration
diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
index 02de3d74d2c5..8a602a1e404a 100644
--- a/arch/x86/lib/hweight.S
+++ b/arch/x86/lib/hweight.S
@@ -35,6 +35,7 @@ ENDPROC(__sw_hweight32)
ENTRY(__sw_hweight64)
#ifdef CONFIG_X86_64
+ pushq %rdi
pushq %rdx
movq %rdi, %rdx # w -> t
@@ -60,6 +61,7 @@ ENTRY(__sw_hweight64)
shrq $56, %rax # w = w_tmp >> 56
popq %rdx
+ popq %rdi
ret
#else /* CONFIG_X86_32 */
/* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */
diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c
index f7dfeda83e5c..121f59c6ee54 100644
--- a/arch/x86/lib/kaslr.c
+++ b/arch/x86/lib/kaslr.c
@@ -19,7 +19,7 @@
#include <asm/cpufeature.h>
#include <asm/setup.h>
-#define debug_putstr(v) early_printk(v)
+#define debug_putstr(v) early_printk("%s", v)
#define has_cpuflag(f) boot_cpu_has(f)
#define get_boot_seed() kaslr_offset()
#endif
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 2ec0b0abbfaa..49e6ebac7e73 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -181,11 +181,11 @@ ENDPROC(memcpy_orig)
#ifndef CONFIG_UML
/*
- * memcpy_mcsafe - memory copy with machine check exception handling
+ * memcpy_mcsafe_unrolled - memory copy with machine check exception handling
* Note that we only catch machine checks when reading the source addresses.
* Writes to target are posted and don't generate machine checks.
*/
-ENTRY(memcpy_mcsafe)
+ENTRY(memcpy_mcsafe_unrolled)
cmpl $8, %edx
/* Less than 8 bytes? Go to byte copy loop */
jb .L_no_whole_words
@@ -273,7 +273,7 @@ ENTRY(memcpy_mcsafe)
.L_done_memcpy_trap:
xorq %rax, %rax
ret
-ENDPROC(memcpy_mcsafe)
+ENDPROC(memcpy_mcsafe_unrolled)
.section .fixup, "ax"
/* Return -EFAULT for any failure */
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index ba47524f56e8..d1c7de095808 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -52,21 +52,6 @@ static __init int find_northbridge(void)
return -ENOENT;
}
-static __init void early_get_boot_cpu_id(void)
-{
- /*
- * need to get the APIC ID of the BSP so can use that to
- * create apicid_to_node in amd_scan_nodes()
- */
-#ifdef CONFIG_X86_MPPARSE
- /*
- * get boot-time SMP configuration:
- */
- if (smp_found_config)
- early_get_smp_config();
-#endif
-}
-
int __init amd_numa_init(void)
{
u64 start = PFN_PHYS(0);
@@ -180,8 +165,11 @@ int __init amd_numa_init(void)
cores = 1 << bits;
apicid_base = 0;
- /* get the APIC ID of the BSP early for systems with apicid lifting */
- early_get_boot_cpu_id();
+ /*
+ * get boot-time SMP configuration:
+ */
+ early_get_smp_config();
+
if (boot_cpu_physical_apicid > 0) {
pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
apicid_base = boot_cpu_physical_apicid;
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 832b98f822be..79ae939970d3 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -1,4 +1,4 @@
-#include <linux/module.h>
+#include <linux/extable.h>
#include <asm/uaccess.h>
#include <asm/traps.h>
#include <asm/kdebug.h>
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index dc8023060456..1e525122cbe4 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -5,7 +5,7 @@
*/
#include <linux/sched.h> /* test_thread_flag(), ... */
#include <linux/kdebug.h> /* oops_begin/end, ... */
-#include <linux/module.h> /* search_exception_table */
+#include <linux/extable.h> /* search_exception_table */
#include <linux/bootmem.h> /* max_low_pfn */
#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
@@ -753,6 +753,38 @@ no_context(struct pt_regs *regs, unsigned long error_code,
return;
}
+#ifdef CONFIG_VMAP_STACK
+ /*
+ * Stack overflow? During boot, we can fault near the initial
+ * stack in the direct map, but that's not an overflow -- check
+ * that we're in vmalloc space to avoid this.
+ */
+ if (is_vmalloc_addr((void *)address) &&
+ (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
+ address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
+ register void *__sp asm("rsp");
+ unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
+ /*
+ * We're likely to be running with very little stack space
+ * left. It's plausible that we'd hit this condition but
+ * double-fault even before we get this far, in which case
+ * we're fine: the double-fault handler will deal with it.
+ *
+ * We don't want to make it all the way into the oops code
+ * and then double-fault, though, because we're likely to
+ * break the console driver and lose most of the stack dump.
+ */
+ asm volatile ("movq %[stack], %%rsp\n\t"
+ "call handle_stack_overflow\n\t"
+ "1: jmp 1b"
+ : "+r" (__sp)
+ : "D" ("kernel stack overflow (page fault)"),
+ "S" (regs), "d" (address),
+ [stack] "rm" (stack));
+ unreachable();
+ }
+#endif
+
/*
* 32-bit:
*
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index ec21796ac5fd..4473cb4f8b90 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -3,15 +3,17 @@
* included by both the compressed kernel and the regular kernel.
*/
-static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
+static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page,
unsigned long addr, unsigned long end)
{
addr &= PMD_MASK;
for (; addr < end; addr += PMD_SIZE) {
pmd_t *pmd = pmd_page + pmd_index(addr);
- if (!pmd_present(*pmd))
- set_pmd(pmd, __pmd(addr | pmd_flag));
+ if (pmd_present(*pmd))
+ continue;
+
+ set_pmd(pmd, __pmd((addr - info->offset) | info->pmd_flag));
}
}
@@ -30,13 +32,13 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
if (pud_present(*pud)) {
pmd = pmd_offset(pud, 0);
- ident_pmd_init(info->pmd_flag, pmd, addr, next);
+ ident_pmd_init(info, pmd, addr, next);
continue;
}
pmd = (pmd_t *)info->alloc_pgt_page(info->context);
if (!pmd)
return -ENOMEM;
- ident_pmd_init(info->pmd_flag, pmd, addr, next);
+ ident_pmd_init(info, pmd, addr, next);
set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
}
@@ -44,14 +46,15 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
}
int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
- unsigned long addr, unsigned long end)
+ unsigned long pstart, unsigned long pend)
{
+ unsigned long addr = pstart + info->offset;
+ unsigned long end = pend + info->offset;
unsigned long next;
int result;
- int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
for (; addr < end; addr = next) {
- pgd_t *pgd = pgd_page + pgd_index(addr) + off;
+ pgd_t *pgd = pgd_page + pgd_index(addr);
pud_t *pud;
next = (addr & PGDIR_MASK) + PGDIR_SIZE;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 620928903be3..22af912d66d2 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -122,8 +122,18 @@ __ref void *alloc_low_pages(unsigned int num)
return __va(pfn << PAGE_SHIFT);
}
-/* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */
-#define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE)
+/*
+ * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS.
+ * With KASLR memory randomization, depending on the machine e820 memory
+ * and the PUD alignment. We may need twice more pages when KASLR memory
+ * randomization is enabled.
+ */
+#ifndef CONFIG_RANDOMIZE_MEMORY
+#define INIT_PGD_PAGE_COUNT 6
+#else
+#define INIT_PGD_PAGE_COUNT 12
+#endif
+#define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE)
RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
void __init early_alloc_pgt_buf(void)
{
@@ -689,8 +699,10 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
}
}
-void free_initmem(void)
+void __ref free_initmem(void)
{
+ e820_reallocate_tables();
+
free_init_pages("unused kernel",
(unsigned long)(&__init_begin),
(unsigned long)(&__init_end));
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 26dccd6c0df1..ddd2661c4502 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -40,17 +40,26 @@
* You need to add an if/def entry if you introduce a new memory region
* compatible with KASLR. Your entry must be in logical order with memory
* layout. For example, ESPFIX is before EFI because its virtual address is
- * before. You also need to add a BUILD_BUG_ON in kernel_randomize_memory to
+ * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to
* ensure that this order is correct and won't be changed.
*/
static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
-static const unsigned long vaddr_end = VMEMMAP_START;
+
+#if defined(CONFIG_X86_ESPFIX64)
+static const unsigned long vaddr_end = ESPFIX_BASE_ADDR;
+#elif defined(CONFIG_EFI)
+static const unsigned long vaddr_end = EFI_VA_START;
+#else
+static const unsigned long vaddr_end = __START_KERNEL_map;
+#endif
/* Default values */
unsigned long page_offset_base = __PAGE_OFFSET_BASE;
EXPORT_SYMBOL(page_offset_base);
unsigned long vmalloc_base = __VMALLOC_BASE;
EXPORT_SYMBOL(vmalloc_base);
+unsigned long vmemmap_base = __VMEMMAP_BASE;
+EXPORT_SYMBOL(vmemmap_base);
/*
* Memory regions randomized by KASLR (except modules that use a separate logic
@@ -63,6 +72,7 @@ static __initdata struct kaslr_memory_region {
} kaslr_regions[] = {
{ &page_offset_base, 64/* Maximum */ },
{ &vmalloc_base, VMALLOC_SIZE_TB },
+ { &vmemmap_base, 1 },
};
/* Get size in bytes used by the memory region */
@@ -77,7 +87,7 @@ static inline unsigned long get_padding(struct kaslr_memory_region *region)
*/
static inline bool kaslr_memory_enabled(void)
{
- return kaslr_enabled() && !config_enabled(CONFIG_KASAN);
+ return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
}
/* Initialize base and padding for each memory region randomized with KASLR */
@@ -89,6 +99,18 @@ void __init kernel_randomize_memory(void)
struct rnd_state rand_state;
unsigned long remain_entropy;
+ /*
+ * All these BUILD_BUG_ON checks ensures the memory layout is
+ * consistent with the vaddr_start/vaddr_end variables.
+ */
+ BUILD_BUG_ON(vaddr_start >= vaddr_end);
+ BUILD_BUG_ON(config_enabled(CONFIG_X86_ESPFIX64) &&
+ vaddr_end >= EFI_VA_START);
+ BUILD_BUG_ON((config_enabled(CONFIG_X86_ESPFIX64) ||
+ config_enabled(CONFIG_EFI)) &&
+ vaddr_end >= __START_KERNEL_map);
+ BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
+
if (!kaslr_memory_enabled())
return;
@@ -97,7 +119,7 @@ void __init kernel_randomize_memory(void)
* add padding if needed (especially for memory hotplug support).
*/
BUG_ON(kaslr_regions[0].base != &page_offset_base);
- memory_tb = ((max_pfn << PAGE_SHIFT) >> TB_SHIFT) +
+ memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
/* Adapt phyiscal memory region size based on available memory */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index fb682108f4dc..3f35b48d1d9d 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -722,22 +722,19 @@ void __init x86_numa_init(void)
numa_init(dummy_numa_init);
}
-static __init int find_near_online_node(int node)
+static void __init init_memory_less_node(int nid)
{
- int n, val;
- int min_val = INT_MAX;
- int best_node = -1;
+ unsigned long zones_size[MAX_NR_ZONES] = {0};
+ unsigned long zholes_size[MAX_NR_ZONES] = {0};
- for_each_online_node(n) {
- val = node_distance(node, n);
+ /* Allocate and initialize node data. Memory-less node is now online.*/
+ alloc_node_data(nid);
+ free_area_init_node(nid, zones_size, 0, zholes_size);
- if (val < min_val) {
- min_val = val;
- best_node = n;
- }
- }
-
- return best_node;
+ /*
+ * All zonelists will be built later in start_kernel() after per cpu
+ * areas are initialized.
+ */
}
/*
@@ -766,8 +763,10 @@ void __init init_cpu_to_node(void)
if (node == NUMA_NO_NODE)
continue;
+
if (!node_online(node))
- node = find_near_online_node(node);
+ init_memory_less_node(node);
+
numa_set_node(cpu, node);
}
}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 849dc09fa4f0..e3353c97d086 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -917,11 +917,11 @@ static void populate_pte(struct cpa_data *cpa,
}
}
-static int populate_pmd(struct cpa_data *cpa,
- unsigned long start, unsigned long end,
- unsigned num_pages, pud_t *pud, pgprot_t pgprot)
+static long populate_pmd(struct cpa_data *cpa,
+ unsigned long start, unsigned long end,
+ unsigned num_pages, pud_t *pud, pgprot_t pgprot)
{
- unsigned int cur_pages = 0;
+ long cur_pages = 0;
pmd_t *pmd;
pgprot_t pmd_pgprot;
@@ -991,12 +991,12 @@ static int populate_pmd(struct cpa_data *cpa,
return num_pages;
}
-static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
- pgprot_t pgprot)
+static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
+ pgprot_t pgprot)
{
pud_t *pud;
unsigned long end;
- int cur_pages = 0;
+ long cur_pages = 0;
pgprot_t pud_pgprot;
end = start + (cpa->numpages << PAGE_SHIFT);
@@ -1052,7 +1052,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
/* Map trailing leftover */
if (start < end) {
- int tmp;
+ long tmp;
pud = pud_offset(pgd, start);
if (pud_none(*pud))
@@ -1078,7 +1078,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
pud_t *pud = NULL; /* shut up gcc */
pgd_t *pgd_entry;
- int ret;
+ long ret;
pgd_entry = cpa->pgd + pgd_index(addr);
@@ -1327,7 +1327,8 @@ static int cpa_process_alias(struct cpa_data *cpa)
static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
{
- int ret, numpages = cpa->numpages;
+ unsigned long numpages = cpa->numpages;
+ int ret;
while (numpages) {
/*
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index ecb1b69c1651..170cc4ff057b 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -927,9 +927,10 @@ int track_pfn_copy(struct vm_area_struct *vma)
}
/*
- * prot is passed in as a parameter for the new mapping. If the vma has a
- * linear pfn mapping for the entire range reserve the entire vma range with
- * single reserve_pfn_range call.
+ * prot is passed in as a parameter for the new mapping. If the vma has
+ * a linear pfn mapping for the entire range, or no vma is provided,
+ * reserve the entire pfn + size range with single reserve_pfn_range
+ * call.
*/
int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
unsigned long pfn, unsigned long addr, unsigned long size)
@@ -938,11 +939,12 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
enum page_cache_mode pcm;
/* reserve the whole chunk starting from paddr */
- if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) {
+ if (!vma || (addr == vma->vm_start
+ && size == (vma->vm_end - vma->vm_start))) {
int ret;
ret = reserve_pfn_range(paddr, size, prot, 0);
- if (!ret)
+ if (ret == 0 && vma)
vma->vm_flags |= VM_PAT;
return ret;
}
@@ -997,7 +999,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
resource_size_t paddr;
unsigned long prot;
- if (!(vma->vm_flags & VM_PAT))
+ if (vma && !(vma->vm_flags & VM_PAT))
return;
/* free the chunk starting from pfn or the whole chunk */
@@ -1011,7 +1013,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
size = vma->vm_end - vma->vm_start;
}
free_pfn_range(paddr, size);
- vma->vm_flags &= ~VM_PAT;
+ if (vma)
+ vma->vm_flags &= ~VM_PAT;
}
/*
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index de391b7bc19a..159b52ccd600 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -254,9 +254,7 @@ struct memtype *rbt_memtype_erase(u64 start, u64 end)
struct memtype *rbt_memtype_lookup(u64 addr)
{
- struct memtype *data;
- data = memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE);
- return data;
+ return memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE);
}
#if defined(CONFIG_DEBUG_FS)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 4dbe65622810..a7655f6caf7d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -77,10 +77,25 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
unsigned cpu = smp_processor_id();
if (likely(prev != next)) {
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ /*
+ * If our current stack is in vmalloc space and isn't
+ * mapped in the new pgd, we'll double-fault. Forcibly
+ * map it.
+ */
+ unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
+
+ pgd_t *pgd = next->pgd + stack_pgd_index;
+
+ if (unlikely(pgd_none(*pgd)))
+ set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
+ }
+
#ifdef CONFIG_SMP
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
this_cpu_write(cpu_tlbstate.active_mm, next);
#endif
+
cpumask_set_cpu(cpu, mm_cpumask(next));
/*
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index cb31a4440e58..a2488b6e27d6 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -16,27 +16,7 @@
#include <asm/ptrace.h>
#include <asm/stacktrace.h>
-
-static int backtrace_stack(void *data, char *name)
-{
- /* Yes, we want all stacks */
- return 0;
-}
-
-static int backtrace_address(void *data, unsigned long addr, int reliable)
-{
- unsigned int *depth = data;
-
- if ((*depth)--)
- oprofile_add_trace(addr);
- return 0;
-}
-
-static struct stacktrace_ops backtrace_ops = {
- .stack = backtrace_stack,
- .address = backtrace_address,
- .walk_stack = print_context_stack,
-};
+#include <asm/unwind.h>
#ifdef CONFIG_COMPAT
static struct stack_frame_ia32 *
@@ -113,10 +93,29 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
if (!user_mode(regs)) {
- unsigned long stack = kernel_stack_pointer(regs);
- if (depth)
- dump_trace(NULL, regs, (unsigned long *)stack, 0,
- &backtrace_ops, &depth);
+ struct unwind_state state;
+ unsigned long addr;
+
+ if (!depth)
+ return;
+
+ oprofile_add_trace(regs->ip);
+
+ if (!--depth)
+ return;
+
+ for (unwind_start(&state, current, regs, NULL);
+ !unwind_done(&state); unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr)
+ break;
+
+ oprofile_add_trace(addr);
+
+ if (!--depth)
+ break;
+ }
+
return;
}
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 837ea36a837d..6d52b94f4bb9 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -553,15 +553,21 @@ static void twinhead_reserve_killing_zone(struct pci_dev *dev)
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone);
/*
- * Broadwell EP Home Agent BARs erroneously return non-zero values when read.
+ * Device [8086:2fc0]
+ * Erratum HSE43
+ * CONFIG_TDP_NOMINAL CSR Implemented at Incorrect Offset
+ * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-spec-update.html
*
- * See http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html
- * entry BDF2.
+ * Devices [8086:6f60,6fa0,6fc0]
+ * Erratum BDF2
+ * PCI BARs in the Home Agent Will Return Non-Zero Values During Enumeration
+ * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html
*/
-static void pci_bdwep_bar(struct pci_dev *dev)
+static void pci_invalid_bar(struct pci_dev *dev)
{
dev->non_compliant_bars = 1;
}
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_bdwep_bar);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_bdwep_bar);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_bdwep_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, pci_invalid_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_invalid_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_invalid_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_invalid_bar);
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 9770e55e768f..1d97cea3b3a4 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -120,9 +120,12 @@ static unsigned long __init bios32_service(unsigned long service)
static struct {
unsigned long address;
unsigned short segment;
-} pci_indirect = { 0, __KERNEL_CS };
+} pci_indirect __ro_after_init = {
+ .address = 0,
+ .segment = __KERNEL_CS,
+};
-static int pci_bios_present;
+static int pci_bios_present __ro_after_init;
static int __init check_pcibios(void)
{
diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c
index b814ca675131..7948be342ee9 100644
--- a/arch/x86/pci/vmd.c
+++ b/arch/x86/pci/vmd.c
@@ -41,6 +41,7 @@ static DEFINE_RAW_SPINLOCK(list_lock);
* @node: list item for parent traversal.
* @rcu: RCU callback item for freeing.
* @irq: back pointer to parent.
+ * @enabled: true if driver enabled IRQ
* @virq: the virtual IRQ value provided to the requesting driver.
*
* Every MSI/MSI-X IRQ requested for a device in a VMD domain will be mapped to
@@ -50,6 +51,7 @@ struct vmd_irq {
struct list_head node;
struct rcu_head rcu;
struct vmd_irq_list *irq;
+ bool enabled;
unsigned int virq;
};
@@ -122,7 +124,9 @@ static void vmd_irq_enable(struct irq_data *data)
unsigned long flags;
raw_spin_lock_irqsave(&list_lock, flags);
+ WARN_ON(vmdirq->enabled);
list_add_tail_rcu(&vmdirq->node, &vmdirq->irq->irq_list);
+ vmdirq->enabled = true;
raw_spin_unlock_irqrestore(&list_lock, flags);
data->chip->irq_unmask(data);
@@ -136,8 +140,10 @@ static void vmd_irq_disable(struct irq_data *data)
data->chip->irq_mask(data);
raw_spin_lock_irqsave(&list_lock, flags);
- list_del_rcu(&vmdirq->node);
- INIT_LIST_HEAD_RCU(&vmdirq->node);
+ if (vmdirq->enabled) {
+ list_del_rcu(&vmdirq->node);
+ vmdirq->enabled = false;
+ }
raw_spin_unlock_irqrestore(&list_lock, flags);
}
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index 184842ef332e..3c3c19ea94df 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -8,6 +8,7 @@ obj-y += iris/
obj-y += intel/
obj-y += intel-mid/
obj-y += intel-quark/
+obj-y += mellanox/
obj-y += olpc/
obj-y += scx200/
obj-y += sfi/
diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c
index 8ff7b9355416..d49d3be81953 100644
--- a/arch/x86/platform/atom/punit_atom_debug.c
+++ b/arch/x86/platform/atom/punit_atom_debug.c
@@ -155,7 +155,7 @@ static void punit_dbgfs_unregister(void)
static const struct x86_cpu_id intel_punit_cpu_ids[] = {
ICPU(INTEL_FAM6_ATOM_SILVERMONT1, punit_device_byt),
- ICPU(INTEL_FAM6_ATOM_MERRIFIELD1, punit_device_tng),
+ ICPU(INTEL_FAM6_ATOM_MERRIFIELD, punit_device_tng),
ICPU(INTEL_FAM6_ATOM_AIRMONT, punit_device_cht),
{}
};
diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c
index 6a2f5691b1ab..6aad870e8962 100644
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ b/arch/x86/platform/efi/efi-bgrt.c
@@ -82,21 +82,12 @@ void __init efi_bgrt_init(void)
}
bgrt_image_size = bmp_header.size;
- bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL | __GFP_NOWARN);
+ bgrt_image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB);
if (!bgrt_image) {
- pr_notice("Ignoring BGRT: failed to allocate memory for image (wanted %zu bytes)\n",
- bgrt_image_size);
- return;
- }
-
- image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB);
- if (!image) {
pr_notice("Ignoring BGRT: failed to map image memory\n");
- kfree(bgrt_image);
bgrt_image = NULL;
return;
}
- memcpy(bgrt_image, image, bgrt_image_size);
- memunmap(image);
+ efi_mem_reserve(bgrt_tab->image_address, bgrt_image_size);
}
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 1fbb408e2e72..bf99aa7005eb 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -166,13 +166,15 @@ static void __init do_add_efi_memmap(void)
}
e820_add_region(start, size, e820_type);
}
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
}
int __init efi_memblock_x86_reserve_range(void)
{
struct efi_info *e = &boot_params.efi_info;
+ struct efi_memory_map_data data;
phys_addr_t pmap;
+ int rv;
if (efi_enabled(EFI_PARAVIRT))
return 0;
@@ -187,11 +189,17 @@ int __init efi_memblock_x86_reserve_range(void)
#else
pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32));
#endif
- efi.memmap.phys_map = pmap;
- efi.memmap.nr_map = e->efi_memmap_size /
- e->efi_memdesc_size;
- efi.memmap.desc_size = e->efi_memdesc_size;
- efi.memmap.desc_version = e->efi_memdesc_version;
+ data.phys_map = pmap;
+ data.size = e->efi_memmap_size;
+ data.desc_size = e->efi_memdesc_size;
+ data.desc_version = e->efi_memdesc_version;
+
+ rv = efi_memmap_init_early(&data);
+ if (rv)
+ return rv;
+
+ if (add_efi_memmap)
+ do_add_efi_memmap();
WARN(efi.memmap.desc_version != 1,
"Unexpected EFI_MEMORY_DESCRIPTOR version %ld",
@@ -218,19 +226,6 @@ void __init efi_print_memmap(void)
}
}
-void __init efi_unmap_memmap(void)
-{
- unsigned long size;
-
- clear_bit(EFI_MEMMAP, &efi.flags);
-
- size = efi.memmap.nr_map * efi.memmap.desc_size;
- if (efi.memmap.map) {
- early_memunmap(efi.memmap.map, size);
- efi.memmap.map = NULL;
- }
-}
-
static int __init efi_systab_init(void *phys)
{
if (efi_enabled(EFI_64BIT)) {
@@ -414,33 +409,6 @@ static int __init efi_runtime_init(void)
return 0;
}
-static int __init efi_memmap_init(void)
-{
- unsigned long addr, size;
-
- if (efi_enabled(EFI_PARAVIRT))
- return 0;
-
- /* Map the EFI memory map */
- size = efi.memmap.nr_map * efi.memmap.desc_size;
- addr = (unsigned long)efi.memmap.phys_map;
-
- efi.memmap.map = early_memremap(addr, size);
- if (efi.memmap.map == NULL) {
- pr_err("Could not map the memory map!\n");
- return -ENOMEM;
- }
-
- efi.memmap.map_end = efi.memmap.map + size;
-
- if (add_efi_memmap)
- do_add_efi_memmap();
-
- set_bit(EFI_MEMMAP, &efi.flags);
-
- return 0;
-}
-
void __init efi_init(void)
{
efi_char16_t *c16;
@@ -498,16 +466,14 @@ void __init efi_init(void)
if (!efi_runtime_supported())
pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n");
else {
- if (efi_runtime_disabled() || efi_runtime_init())
+ if (efi_runtime_disabled() || efi_runtime_init()) {
+ efi_memmap_unmap();
return;
+ }
}
- if (efi_memmap_init())
- return;
if (efi_enabled(EFI_DBG))
efi_print_memmap();
-
- efi_esrt_init();
}
void __init efi_late_init(void)
@@ -624,42 +590,6 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md)
}
}
-static void __init save_runtime_map(void)
-{
-#ifdef CONFIG_KEXEC_CORE
- unsigned long desc_size;
- efi_memory_desc_t *md;
- void *tmp, *q = NULL;
- int count = 0;
-
- if (efi_enabled(EFI_OLD_MEMMAP))
- return;
-
- desc_size = efi.memmap.desc_size;
-
- for_each_efi_memory_desc(md) {
- if (!(md->attribute & EFI_MEMORY_RUNTIME) ||
- (md->type == EFI_BOOT_SERVICES_CODE) ||
- (md->type == EFI_BOOT_SERVICES_DATA))
- continue;
- tmp = krealloc(q, (count + 1) * desc_size, GFP_KERNEL);
- if (!tmp)
- goto out;
- q = tmp;
-
- memcpy(q + count * desc_size, md, desc_size);
- count++;
- }
-
- efi_runtime_map_setup(q, count, desc_size);
- return;
-
-out:
- kfree(q);
- pr_err("Error saving runtime map, efi runtime on kexec non-functional!!\n");
-#endif
-}
-
static void *realloc_pages(void *old_memmap, int old_shift)
{
void *ret;
@@ -745,6 +675,46 @@ static void *efi_map_next_entry(void *entry)
return entry;
}
+static bool should_map_region(efi_memory_desc_t *md)
+{
+ /*
+ * Runtime regions always require runtime mappings (obviously).
+ */
+ if (md->attribute & EFI_MEMORY_RUNTIME)
+ return true;
+
+ /*
+ * 32-bit EFI doesn't suffer from the bug that requires us to
+ * reserve boot services regions, and mixed mode support
+ * doesn't exist for 32-bit kernels.
+ */
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
+
+ /*
+ * Map all of RAM so that we can access arguments in the 1:1
+ * mapping when making EFI runtime calls.
+ */
+ if (IS_ENABLED(CONFIG_EFI_MIXED) && !efi_is_native()) {
+ if (md->type == EFI_CONVENTIONAL_MEMORY ||
+ md->type == EFI_LOADER_DATA ||
+ md->type == EFI_LOADER_CODE)
+ return true;
+ }
+
+ /*
+ * Map boot services regions as a workaround for buggy
+ * firmware that accesses them even when they shouldn't.
+ *
+ * See efi_{reserve,free}_boot_services().
+ */
+ if (md->type == EFI_BOOT_SERVICES_CODE ||
+ md->type == EFI_BOOT_SERVICES_DATA)
+ return true;
+
+ return false;
+}
+
/*
* Map the efi memory ranges of the runtime services and update new_mmap with
* virtual addresses.
@@ -761,13 +731,9 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
p = NULL;
while ((p = efi_map_next_entry(p))) {
md = p;
- if (!(md->attribute & EFI_MEMORY_RUNTIME)) {
-#ifdef CONFIG_X86_64
- if (md->type != EFI_BOOT_SERVICES_CODE &&
- md->type != EFI_BOOT_SERVICES_DATA)
-#endif
- continue;
- }
+
+ if (!should_map_region(md))
+ continue;
efi_map_region(md);
get_systab_virt_addr(md);
@@ -803,7 +769,7 @@ static void __init kexec_enter_virtual_mode(void)
* non-native EFI
*/
if (!efi_is_native()) {
- efi_unmap_memmap();
+ efi_memmap_unmap();
clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
return;
}
@@ -823,7 +789,18 @@ static void __init kexec_enter_virtual_mode(void)
get_systab_virt_addr(md);
}
- save_runtime_map();
+ /*
+ * Unregister the early EFI memmap from efi_init() and install
+ * the new EFI memory map.
+ */
+ efi_memmap_unmap();
+
+ if (efi_memmap_init_late(efi.memmap.phys_map,
+ efi.memmap.desc_size * efi.memmap.nr_map)) {
+ pr_err("Failed to remap late EFI memory map\n");
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+ return;
+ }
BUG_ON(!efi.systab);
@@ -884,6 +861,7 @@ static void __init __efi_enter_virtual_mode(void)
int count = 0, pg_shift = 0;
void *new_memmap = NULL;
efi_status_t status;
+ phys_addr_t pa;
efi.systab = NULL;
@@ -901,11 +879,24 @@ static void __init __efi_enter_virtual_mode(void)
return;
}
- save_runtime_map();
+ pa = __pa(new_memmap);
+
+ /*
+ * Unregister the early EFI memmap from efi_init() and install
+ * the new EFI memory map that we are about to pass to the
+ * firmware via SetVirtualAddressMap().
+ */
+ efi_memmap_unmap();
+
+ if (efi_memmap_init_late(pa, efi.memmap.desc_size * count)) {
+ pr_err("Failed to remap late EFI memory map\n");
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+ return;
+ }
BUG_ON(!efi.systab);
- if (efi_setup_page_tables(__pa(new_memmap), 1 << pg_shift)) {
+ if (efi_setup_page_tables(pa, 1 << pg_shift)) {
clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
return;
}
@@ -917,14 +908,14 @@ static void __init __efi_enter_virtual_mode(void)
efi.memmap.desc_size * count,
efi.memmap.desc_size,
efi.memmap.desc_version,
- (efi_memory_desc_t *)__pa(new_memmap));
+ (efi_memory_desc_t *)pa);
} else {
status = efi_thunk_set_virtual_address_map(
efi_phys.set_virtual_address_map,
efi.memmap.desc_size * count,
efi.memmap.desc_size,
efi.memmap.desc_version,
- (efi_memory_desc_t *)__pa(new_memmap));
+ (efi_memory_desc_t *)pa);
}
if (status != EFI_SUCCESS) {
@@ -956,15 +947,6 @@ static void __init __efi_enter_virtual_mode(void)
efi_runtime_update_mappings();
efi_dump_pagetable();
- /*
- * We mapped the descriptor array into the EFI pagetable above
- * but we're not unmapping it here because if we're running in
- * EFI mixed mode we need all of memory to be accessible when
- * we pass parameters to the EFI runtime services in the
- * thunking code.
- */
- free_pages((unsigned long)new_memmap, pg_shift);
-
/* clean DUMMY object */
efi_delete_dummy_variable();
}
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 677e29e29473..58b0f801f66f 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -85,7 +85,7 @@ pgd_t * __init efi_call_phys_prolog(void)
early_code_mapping_set_exec(1);
n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
- save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL);
+ save_pgd = kmalloc_array(n_pgds, sizeof(*save_pgd), GFP_KERNEL);
for (pgd = 0; pgd < n_pgds; pgd++) {
save_pgd[pgd] = *pgd_offset_k(pgd * PGDIR_SIZE);
@@ -214,7 +214,6 @@ void efi_sync_low_kernel_mappings(void)
int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
{
unsigned long pfn, text;
- efi_memory_desc_t *md;
struct page *page;
unsigned npages;
pgd_t *pgd;
@@ -245,28 +244,9 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
* text and allocate a new stack because we can't rely on the
* stack pointer being < 4GB.
*/
- if (!IS_ENABLED(CONFIG_EFI_MIXED))
+ if (!IS_ENABLED(CONFIG_EFI_MIXED) || efi_is_native())
return 0;
- /*
- * Map all of RAM so that we can access arguments in the 1:1
- * mapping when making EFI runtime calls.
- */
- for_each_efi_memory_desc(md) {
- if (md->type != EFI_CONVENTIONAL_MEMORY &&
- md->type != EFI_LOADER_DATA &&
- md->type != EFI_LOADER_CODE)
- continue;
-
- pfn = md->phys_addr >> PAGE_SHIFT;
- npages = md->num_pages;
-
- if (kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, npages, _PAGE_RW)) {
- pr_err("Failed to map 1:1 memory\n");
- return 1;
- }
- }
-
page = alloc_page(GFP_KERNEL|__GFP_DMA32);
if (!page)
panic("Unable to allocate EFI runtime stack < 4GB\n");
@@ -359,6 +339,7 @@ void __init efi_map_region(efi_memory_desc_t *md)
*/
void __init efi_map_region_fixed(efi_memory_desc_t *md)
{
+ __map_region(md, md->phys_addr);
__map_region(md, md->virt_addr);
}
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 4480c06cade7..10aca63a50d7 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -164,6 +164,75 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size,
EXPORT_SYMBOL_GPL(efi_query_variable_store);
/*
+ * The UEFI specification makes it clear that the operating system is
+ * free to do whatever it wants with boot services code after
+ * ExitBootServices() has been called. Ignoring this recommendation a
+ * significant bunch of EFI implementations continue calling into boot
+ * services code (SetVirtualAddressMap). In order to work around such
+ * buggy implementations we reserve boot services region during EFI
+ * init and make sure it stays executable. Then, after
+ * SetVirtualAddressMap(), it is discarded.
+ *
+ * However, some boot services regions contain data that is required
+ * by drivers, so we need to track which memory ranges can never be
+ * freed. This is done by tagging those regions with the
+ * EFI_MEMORY_RUNTIME attribute.
+ *
+ * Any driver that wants to mark a region as reserved must use
+ * efi_mem_reserve() which will insert a new EFI memory descriptor
+ * into efi.memmap (splitting existing regions if necessary) and tag
+ * it with EFI_MEMORY_RUNTIME.
+ */
+void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
+{
+ phys_addr_t new_phys, new_size;
+ struct efi_mem_range mr;
+ efi_memory_desc_t md;
+ int num_entries;
+ void *new;
+
+ if (efi_mem_desc_lookup(addr, &md)) {
+ pr_err("Failed to lookup EFI memory descriptor for %pa\n", &addr);
+ return;
+ }
+
+ if (addr + size > md.phys_addr + (md.num_pages << EFI_PAGE_SHIFT)) {
+ pr_err("Region spans EFI memory descriptors, %pa\n", &addr);
+ return;
+ }
+
+ size += addr % EFI_PAGE_SIZE;
+ size = round_up(size, EFI_PAGE_SIZE);
+ addr = round_down(addr, EFI_PAGE_SIZE);
+
+ mr.range.start = addr;
+ mr.range.end = addr + size - 1;
+ mr.attribute = md.attribute | EFI_MEMORY_RUNTIME;
+
+ num_entries = efi_memmap_split_count(&md, &mr.range);
+ num_entries += efi.memmap.nr_map;
+
+ new_size = efi.memmap.desc_size * num_entries;
+
+ new_phys = memblock_alloc(new_size, 0);
+ if (!new_phys) {
+ pr_err("Could not allocate boot services memmap\n");
+ return;
+ }
+
+ new = early_memremap(new_phys, new_size);
+ if (!new) {
+ pr_err("Failed to map new boot services memmap\n");
+ return;
+ }
+
+ efi_memmap_insert(&efi.memmap, new, &mr);
+ early_memunmap(new, new_size);
+
+ efi_memmap_install(new_phys, num_entries);
+}
+
+/*
* Helper function for efi_reserve_boot_services() to figure out if we
* can free regions in efi_free_boot_services().
*
@@ -184,15 +253,6 @@ static bool can_free_region(u64 start, u64 size)
return true;
}
-/*
- * The UEFI specification makes it clear that the operating system is free to do
- * whatever it wants with boot services code after ExitBootServices() has been
- * called. Ignoring this recommendation a significant bunch of EFI implementations
- * continue calling into boot services code (SetVirtualAddressMap). In order to
- * work around such buggy implementations we reserve boot services region during
- * EFI init and make sure it stays executable. Then, after SetVirtualAddressMap(), it
-* is discarded.
-*/
void __init efi_reserve_boot_services(void)
{
efi_memory_desc_t *md;
@@ -249,24 +309,86 @@ void __init efi_reserve_boot_services(void)
void __init efi_free_boot_services(void)
{
+ phys_addr_t new_phys, new_size;
efi_memory_desc_t *md;
+ int num_entries = 0;
+ void *new, *new_md;
for_each_efi_memory_desc(md) {
unsigned long long start = md->phys_addr;
unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+ size_t rm_size;
if (md->type != EFI_BOOT_SERVICES_CODE &&
- md->type != EFI_BOOT_SERVICES_DATA)
+ md->type != EFI_BOOT_SERVICES_DATA) {
+ num_entries++;
continue;
+ }
/* Do not free, someone else owns it: */
- if (md->attribute & EFI_MEMORY_RUNTIME)
+ if (md->attribute & EFI_MEMORY_RUNTIME) {
+ num_entries++;
continue;
+ }
+
+ /*
+ * Nasty quirk: if all sub-1MB memory is used for boot
+ * services, we can get here without having allocated the
+ * real mode trampoline. It's too late to hand boot services
+ * memory back to the memblock allocator, so instead
+ * try to manually allocate the trampoline if needed.
+ *
+ * I've seen this on a Dell XPS 13 9350 with firmware
+ * 1.4.4 with SGX enabled booting Linux via Fedora 24's
+ * grub2-efi on a hard disk. (And no, I don't know why
+ * this happened, but Linux should still try to boot rather
+ * panicing early.)
+ */
+ rm_size = real_mode_size_needed();
+ if (rm_size && (start + rm_size) < (1<<20) && size >= rm_size) {
+ set_real_mode_mem(start, rm_size);
+ start += rm_size;
+ size -= rm_size;
+ }
free_bootmem_late(start, size);
}
- efi_unmap_memmap();
+ new_size = efi.memmap.desc_size * num_entries;
+ new_phys = memblock_alloc(new_size, 0);
+ if (!new_phys) {
+ pr_err("Failed to allocate new EFI memmap\n");
+ return;
+ }
+
+ new = memremap(new_phys, new_size, MEMREMAP_WB);
+ if (!new) {
+ pr_err("Failed to map new EFI memmap\n");
+ return;
+ }
+
+ /*
+ * Build a new EFI memmap that excludes any boot services
+ * regions that are not tagged EFI_MEMORY_RUNTIME, since those
+ * regions have now been freed.
+ */
+ new_md = new;
+ for_each_efi_memory_desc(md) {
+ if (!(md->attribute & EFI_MEMORY_RUNTIME) &&
+ (md->type == EFI_BOOT_SERVICES_CODE ||
+ md->type == EFI_BOOT_SERVICES_DATA))
+ continue;
+
+ memcpy(new_md, md, efi.memmap.desc_size);
+ new_md += efi.memmap.desc_size;
+ }
+
+ memunmap(new);
+
+ if (efi_memmap_install(new_phys, num_entries)) {
+ pr_err("Could not install new EFI memmap\n");
+ return;
+ }
}
/*
@@ -344,7 +466,7 @@ void __init efi_apply_memmap_quirks(void)
*/
if (!efi_runtime_supported()) {
pr_info("Setup done, disabling due to 32/64-bit mismatch\n");
- efi_unmap_memmap();
+ efi_memmap_unmap();
}
/* UV2+ BIOS has a fix for this issue. UV1 still needs the quirk. */
diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile
index fc135bf70511..429d08be7848 100644
--- a/arch/x86/platform/intel-mid/device_libs/Makefile
+++ b/arch/x86/platform/intel-mid/device_libs/Makefile
@@ -1,5 +1,9 @@
# Family-Level Interface Shim (FLIS)
obj-$(subst m,y,$(CONFIG_PINCTRL_MERRIFIELD)) += platform_mrfld_pinctrl.o
+# SDHCI Devices
+obj-$(subst m,y,$(CONFIG_MMC_SDHCI_PCI)) += platform_mrfld_sd.o
+# WiFi
+obj-$(subst m,y,$(CONFIG_BRCMFMAC_SDIO)) += platform_bcm43xx.o
# IPC Devices
obj-y += platform_ipc.o
obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic.o
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
new file mode 100644
index 000000000000..4392c15ed9e0
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
@@ -0,0 +1,95 @@
+/*
+ * platform_bcm43xx.c: bcm43xx platform data initilization file
+ *
+ * (C) Copyright 2016 Intel Corporation
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/gpio.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/machine.h>
+#include <linux/regulator/fixed.h>
+#include <linux/sfi.h>
+
+#include <asm/intel-mid.h>
+
+#define WLAN_SFI_GPIO_IRQ_NAME "WLAN-interrupt"
+#define WLAN_SFI_GPIO_ENABLE_NAME "WLAN-enable"
+
+#define WLAN_DEV_NAME "0000:00:01.3"
+
+static struct regulator_consumer_supply bcm43xx_vmmc_supply = {
+ .dev_name = WLAN_DEV_NAME,
+ .supply = "vmmc",
+};
+
+static struct regulator_init_data bcm43xx_vmmc_data = {
+ .constraints = {
+ .valid_ops_mask = REGULATOR_CHANGE_STATUS,
+ },
+ .num_consumer_supplies = 1,
+ .consumer_supplies = &bcm43xx_vmmc_supply,
+};
+
+static struct fixed_voltage_config bcm43xx_vmmc = {
+ .supply_name = "bcm43xx-vmmc-regulator",
+ /*
+ * Announce 2.0V here to be compatible with SDIO specification. The
+ * real voltage and signaling are still 1.8V.
+ */
+ .microvolts = 2000000, /* 1.8V */
+ .gpio = -EINVAL,
+ .startup_delay = 250 * 1000, /* 250ms */
+ .enable_high = 1, /* active high */
+ .enabled_at_boot = 0, /* disabled at boot */
+ .init_data = &bcm43xx_vmmc_data,
+};
+
+static struct platform_device bcm43xx_vmmc_regulator = {
+ .name = "reg-fixed-voltage",
+ .id = PLATFORM_DEVID_AUTO,
+ .dev = {
+ .platform_data = &bcm43xx_vmmc,
+ },
+};
+
+static int __init bcm43xx_regulator_register(void)
+{
+ int ret;
+
+ bcm43xx_vmmc.gpio = get_gpio_by_name(WLAN_SFI_GPIO_ENABLE_NAME);
+ ret = platform_device_register(&bcm43xx_vmmc_regulator);
+ if (ret) {
+ pr_err("%s: vmmc regulator register failed\n", __func__);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void __init *bcm43xx_platform_data(void *info)
+{
+ int ret;
+
+ ret = bcm43xx_regulator_register();
+ if (ret)
+ return NULL;
+
+ pr_info("Using generic wifi platform data\n");
+
+ /* For now it's empty */
+ return NULL;
+}
+
+static const struct devs_id bcm43xx_clk_vmmc_dev_id __initconst = {
+ .name = "bcm43xx_clk_vmmc",
+ .type = SFI_DEV_TYPE_SD,
+ .get_platform_data = &bcm43xx_platform_data,
+};
+
+sfi_device(bcm43xx_clk_vmmc_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c
new file mode 100644
index 000000000000..00c4a034ad93
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c
@@ -0,0 +1,47 @@
+/*
+ * SDHCI platform data initilisation file
+ *
+ * (C) Copyright 2016 Intel Corporation
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+#include <linux/pci.h>
+
+#include <linux/mmc/sdhci-pci-data.h>
+
+#include <asm/intel-mid.h>
+
+#define INTEL_MRFLD_SD 2
+#define INTEL_MRFLD_SD_CD_GPIO 77
+
+static struct sdhci_pci_data mrfld_sdhci_pci_data = {
+ .rst_n_gpio = -EINVAL,
+ .cd_gpio = INTEL_MRFLD_SD_CD_GPIO,
+};
+
+static struct sdhci_pci_data *
+mrfld_sdhci_pci_get_data(struct pci_dev *pdev, int slotno)
+{
+ unsigned int func = PCI_FUNC(pdev->devfn);
+
+ if (func == INTEL_MRFLD_SD)
+ return &mrfld_sdhci_pci_data;
+
+ return NULL;
+}
+
+static int __init mrfld_sd_init(void)
+{
+ if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER)
+ return -ENODEV;
+
+ sdhci_pci_get_data = mrfld_sdhci_pci_get_data;
+ return 0;
+}
+arch_initcall(mrfld_sd_init);
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
index ce119d2ba0d0..7850128f0026 100644
--- a/arch/x86/platform/intel-mid/intel-mid.c
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -70,6 +70,11 @@ EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip);
static void intel_mid_power_off(void)
{
+ /* Shut down South Complex via PWRMU */
+ intel_mid_pwr_power_off();
+
+ /* Only for Tangier, the rest will ignore this command */
+ intel_scu_ipc_simple_command(IPCMSG_COLD_OFF, 1);
};
static void intel_mid_reboot(void)
diff --git a/arch/x86/platform/intel-mid/pwr.c b/arch/x86/platform/intel-mid/pwr.c
index c901a3423772..5d3b45ad1c03 100644
--- a/arch/x86/platform/intel-mid/pwr.c
+++ b/arch/x86/platform/intel-mid/pwr.c
@@ -44,7 +44,19 @@
/* Bits in PM_CMD */
#define PM_CMD_CMD(x) ((x) << 0)
#define PM_CMD_IOC (1 << 8)
-#define PM_CMD_D3cold (1 << 21)
+#define PM_CMD_CM_NOP (0 << 9)
+#define PM_CMD_CM_IMMEDIATE (1 << 9)
+#define PM_CMD_CM_DELAY (2 << 9)
+#define PM_CMD_CM_TRIGGER (3 << 9)
+
+/* System states */
+#define PM_CMD_SYS_STATE_S5 (5 << 16)
+
+/* Trigger variants */
+#define PM_CMD_CFG_TRIGGER_NC (3 << 19)
+
+/* Message to wait for TRIGGER_NC case */
+#define TRIGGER_NC_MSG_2 (2 << 22)
/* List of commands */
#define CMD_SET_CFG 0x01
@@ -137,7 +149,7 @@ static int mid_pwr_wait(struct mid_pwr *pwr)
static int mid_pwr_wait_for_cmd(struct mid_pwr *pwr, u8 cmd)
{
- writel(PM_CMD_CMD(cmd), pwr->regs + PM_CMD);
+ writel(PM_CMD_CMD(cmd) | PM_CMD_CM_IMMEDIATE, pwr->regs + PM_CMD);
return mid_pwr_wait(pwr);
}
@@ -260,6 +272,20 @@ int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state)
}
EXPORT_SYMBOL_GPL(intel_mid_pci_set_power_state);
+void intel_mid_pwr_power_off(void)
+{
+ struct mid_pwr *pwr = midpwr;
+ u32 cmd = PM_CMD_SYS_STATE_S5 |
+ PM_CMD_CMD(CMD_SET_CFG) |
+ PM_CMD_CM_TRIGGER |
+ PM_CMD_CFG_TRIGGER_NC |
+ TRIGGER_NC_MSG_2;
+
+ /* Send command to SCU */
+ writel(cmd, pwr->regs + PM_CMD);
+ mid_pwr_wait(pwr);
+}
+
int intel_mid_pwr_get_lss_id(struct pci_dev *pdev)
{
int vndr;
@@ -354,7 +380,7 @@ static int mid_pwr_probe(struct pci_dev *pdev, const struct pci_device_id *id)
return 0;
}
-static int mid_set_initial_state(struct mid_pwr *pwr)
+static int mid_set_initial_state(struct mid_pwr *pwr, const u32 *states)
{
unsigned int i, j;
int ret;
@@ -379,10 +405,10 @@ static int mid_set_initial_state(struct mid_pwr *pwr)
* NOTE: The actual device mapping is provided by a platform at run
* time using vendor capability of PCI configuration space.
*/
- mid_pwr_set_state(pwr, 0, 0xffffffff);
- mid_pwr_set_state(pwr, 1, 0xffffffff);
- mid_pwr_set_state(pwr, 2, 0xffffffff);
- mid_pwr_set_state(pwr, 3, 0xffffffff);
+ mid_pwr_set_state(pwr, 0, states[0]);
+ mid_pwr_set_state(pwr, 1, states[1]);
+ mid_pwr_set_state(pwr, 2, states[2]);
+ mid_pwr_set_state(pwr, 3, states[3]);
/* Send command to SCU */
ret = mid_pwr_wait_for_cmd(pwr, CMD_SET_CFG);
@@ -397,13 +423,41 @@ static int mid_set_initial_state(struct mid_pwr *pwr)
return 0;
}
-static const struct mid_pwr_device_info mid_info = {
- .set_initial_state = mid_set_initial_state,
+static int pnw_set_initial_state(struct mid_pwr *pwr)
+{
+ /* On Penwell SRAM must stay powered on */
+ const u32 states[] = {
+ 0xf00fffff, /* PM_SSC(0) */
+ 0xffffffff, /* PM_SSC(1) */
+ 0xffffffff, /* PM_SSC(2) */
+ 0xffffffff, /* PM_SSC(3) */
+ };
+ return mid_set_initial_state(pwr, states);
+}
+
+static int tng_set_initial_state(struct mid_pwr *pwr)
+{
+ const u32 states[] = {
+ 0xffffffff, /* PM_SSC(0) */
+ 0xffffffff, /* PM_SSC(1) */
+ 0xffffffff, /* PM_SSC(2) */
+ 0xffffffff, /* PM_SSC(3) */
+ };
+ return mid_set_initial_state(pwr, states);
+}
+
+static const struct mid_pwr_device_info pnw_info = {
+ .set_initial_state = pnw_set_initial_state,
+};
+
+static const struct mid_pwr_device_info tng_info = {
+ .set_initial_state = tng_set_initial_state,
};
+/* This table should be in sync with the one in drivers/pci/pci-mid.c */
static const struct pci_device_id mid_pwr_pci_ids[] = {
- { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_PENWELL), (kernel_ulong_t)&mid_info },
- { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_TANGIER), (kernel_ulong_t)&mid_info },
+ { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_PENWELL), (kernel_ulong_t)&pnw_info },
+ { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_TANGIER), (kernel_ulong_t)&tng_info },
{}
};
diff --git a/arch/x86/platform/mellanox/Makefile b/arch/x86/platform/mellanox/Makefile
new file mode 100644
index 000000000000..f43c93188a1d
--- /dev/null
+++ b/arch/x86/platform/mellanox/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_MLX_PLATFORM) += mlx-platform.o
diff --git a/arch/x86/platform/mellanox/mlx-platform.c b/arch/x86/platform/mellanox/mlx-platform.c
new file mode 100644
index 000000000000..7dcfcca97399
--- /dev/null
+++ b/arch/x86/platform/mellanox/mlx-platform.c
@@ -0,0 +1,266 @@
+/*
+ * arch/x86/platform/mellanox/mlx-platform.c
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Vadim Pasternak <vadimp@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/device.h>
+#include <linux/dmi.h>
+#include <linux/i2c.h>
+#include <linux/i2c-mux.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/platform_data/i2c-mux-reg.h>
+
+#define MLX_PLAT_DEVICE_NAME "mlxplat"
+
+/* LPC bus IO offsets */
+#define MLXPLAT_CPLD_LPC_I2C_BASE_ADRR 0x2000
+#define MLXPLAT_CPLD_LPC_REG_BASE_ADRR 0x2500
+#define MLXPLAT_CPLD_LPC_IO_RANGE 0x100
+#define MLXPLAT_CPLD_LPC_I2C_CH1_OFF 0xdb
+#define MLXPLAT_CPLD_LPC_I2C_CH2_OFF 0xda
+#define MLXPLAT_CPLD_LPC_PIO_OFFSET 0x10000UL
+#define MLXPLAT_CPLD_LPC_REG1 ((MLXPLAT_CPLD_LPC_REG_BASE_ADRR + \
+ MLXPLAT_CPLD_LPC_I2C_CH1_OFF) | \
+ MLXPLAT_CPLD_LPC_PIO_OFFSET)
+#define MLXPLAT_CPLD_LPC_REG2 ((MLXPLAT_CPLD_LPC_REG_BASE_ADRR + \
+ MLXPLAT_CPLD_LPC_I2C_CH2_OFF) | \
+ MLXPLAT_CPLD_LPC_PIO_OFFSET)
+
+/* Start channel numbers */
+#define MLXPLAT_CPLD_CH1 2
+#define MLXPLAT_CPLD_CH2 10
+
+/* Number of LPC attached MUX platform devices */
+#define MLXPLAT_CPLD_LPC_MUX_DEVS 2
+
+/* mlxplat_priv - platform private data
+ * @pdev_i2c - i2c controller platform device
+ * @pdev_mux - array of mux platform devices
+ */
+struct mlxplat_priv {
+ struct platform_device *pdev_i2c;
+ struct platform_device *pdev_mux[MLXPLAT_CPLD_LPC_MUX_DEVS];
+};
+
+/* Regions for LPC I2C controller and LPC base register space */
+static const struct resource mlxplat_lpc_resources[] = {
+ [0] = DEFINE_RES_NAMED(MLXPLAT_CPLD_LPC_I2C_BASE_ADRR,
+ MLXPLAT_CPLD_LPC_IO_RANGE,
+ "mlxplat_cpld_lpc_i2c_ctrl", IORESOURCE_IO),
+ [1] = DEFINE_RES_NAMED(MLXPLAT_CPLD_LPC_REG_BASE_ADRR,
+ MLXPLAT_CPLD_LPC_IO_RANGE,
+ "mlxplat_cpld_lpc_regs",
+ IORESOURCE_IO),
+};
+
+/* Platform default channels */
+static const int mlxplat_default_channels[][8] = {
+ {
+ MLXPLAT_CPLD_CH1, MLXPLAT_CPLD_CH1 + 1, MLXPLAT_CPLD_CH1 + 2,
+ MLXPLAT_CPLD_CH1 + 3, MLXPLAT_CPLD_CH1 + 4, MLXPLAT_CPLD_CH1 +
+ 5, MLXPLAT_CPLD_CH1 + 6, MLXPLAT_CPLD_CH1 + 7
+ },
+ {
+ MLXPLAT_CPLD_CH2, MLXPLAT_CPLD_CH2 + 1, MLXPLAT_CPLD_CH2 + 2,
+ MLXPLAT_CPLD_CH2 + 3, MLXPLAT_CPLD_CH2 + 4, MLXPLAT_CPLD_CH2 +
+ 5, MLXPLAT_CPLD_CH2 + 6, MLXPLAT_CPLD_CH2 + 7
+ },
+};
+
+/* Platform channels for MSN21xx system family */
+static const int mlxplat_msn21xx_channels[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
+
+/* Platform mux data */
+static struct i2c_mux_reg_platform_data mlxplat_mux_data[] = {
+ {
+ .parent = 1,
+ .base_nr = MLXPLAT_CPLD_CH1,
+ .write_only = 1,
+ .reg = (void __iomem *)MLXPLAT_CPLD_LPC_REG1,
+ .reg_size = 1,
+ .idle_in_use = 1,
+ },
+ {
+ .parent = 1,
+ .base_nr = MLXPLAT_CPLD_CH2,
+ .write_only = 1,
+ .reg = (void __iomem *)MLXPLAT_CPLD_LPC_REG2,
+ .reg_size = 1,
+ .idle_in_use = 1,
+ },
+
+};
+
+static struct platform_device *mlxplat_dev;
+
+static int __init mlxplat_dmi_default_matched(const struct dmi_system_id *dmi)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) {
+ mlxplat_mux_data[i].values = mlxplat_default_channels[i];
+ mlxplat_mux_data[i].n_values =
+ ARRAY_SIZE(mlxplat_default_channels[i]);
+ }
+
+ return 1;
+};
+
+static int __init mlxplat_dmi_msn21xx_matched(const struct dmi_system_id *dmi)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) {
+ mlxplat_mux_data[i].values = mlxplat_msn21xx_channels;
+ mlxplat_mux_data[i].n_values =
+ ARRAY_SIZE(mlxplat_msn21xx_channels);
+ }
+
+ return 1;
+};
+
+static struct dmi_system_id mlxplat_dmi_table[] __initdata = {
+ {
+ .callback = mlxplat_dmi_default_matched,
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MSN24"),
+ },
+ },
+ {
+ .callback = mlxplat_dmi_default_matched,
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MSN27"),
+ },
+ },
+ {
+ .callback = mlxplat_dmi_default_matched,
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MSB"),
+ },
+ },
+ {
+ .callback = mlxplat_dmi_default_matched,
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MSX"),
+ },
+ },
+ {
+ .callback = mlxplat_dmi_msn21xx_matched,
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MSN21"),
+ },
+ },
+ { }
+};
+
+static int __init mlxplat_init(void)
+{
+ struct mlxplat_priv *priv;
+ int i, err;
+
+ if (!dmi_check_system(mlxplat_dmi_table))
+ return -ENODEV;
+
+ mlxplat_dev = platform_device_register_simple(MLX_PLAT_DEVICE_NAME, -1,
+ mlxplat_lpc_resources,
+ ARRAY_SIZE(mlxplat_lpc_resources));
+
+ if (IS_ERR(mlxplat_dev))
+ return PTR_ERR(mlxplat_dev);
+
+ priv = devm_kzalloc(&mlxplat_dev->dev, sizeof(struct mlxplat_priv),
+ GFP_KERNEL);
+ if (!priv) {
+ err = -ENOMEM;
+ goto fail_alloc;
+ }
+ platform_set_drvdata(mlxplat_dev, priv);
+
+ priv->pdev_i2c = platform_device_register_simple("i2c_mlxcpld", -1,
+ NULL, 0);
+ if (IS_ERR(priv->pdev_i2c)) {
+ err = PTR_ERR(priv->pdev_i2c);
+ goto fail_alloc;
+ };
+
+ for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) {
+ priv->pdev_mux[i] = platform_device_register_resndata(
+ &mlxplat_dev->dev,
+ "i2c-mux-reg", i, NULL,
+ 0, &mlxplat_mux_data[i],
+ sizeof(mlxplat_mux_data[i]));
+ if (IS_ERR(priv->pdev_mux[i])) {
+ err = PTR_ERR(priv->pdev_mux[i]);
+ goto fail_platform_mux_register;
+ }
+ }
+
+ return 0;
+
+fail_platform_mux_register:
+ for (i--; i > 0 ; i--)
+ platform_device_unregister(priv->pdev_mux[i]);
+ platform_device_unregister(priv->pdev_i2c);
+fail_alloc:
+ platform_device_unregister(mlxplat_dev);
+
+ return err;
+}
+module_init(mlxplat_init);
+
+static void __exit mlxplat_exit(void)
+{
+ struct mlxplat_priv *priv = platform_get_drvdata(mlxplat_dev);
+ int i;
+
+ for (i = ARRAY_SIZE(mlxplat_mux_data) - 1; i >= 0 ; i--)
+ platform_device_unregister(priv->pdev_mux[i]);
+
+ platform_device_unregister(priv->pdev_i2c);
+ platform_device_unregister(mlxplat_dev);
+}
+module_exit(mlxplat_exit);
+
+MODULE_AUTHOR("Vadim Pasternak (vadimp@mellanox.com)");
+MODULE_DESCRIPTION("Mellanox platform driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("dmi:*:*Mellanox*:MSN24*:");
+MODULE_ALIAS("dmi:*:*Mellanox*:MSN27*:");
+MODULE_ALIAS("dmi:*:*Mellanox*:MSB*:");
+MODULE_ALIAS("dmi:*:*Mellanox*:MSX*:");
+MODULE_ALIAS("dmi:*:*Mellanox*:MSN21*:");
diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index 66b2166ea4a1..b4d5e95fe4df 100644
--- a/arch/x86/platform/uv/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
@@ -149,11 +149,8 @@ EXPORT_SYMBOL_GPL(uv_bios_change_memprotect);
s64
uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len)
{
- s64 ret;
-
- ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie,
- (u64)addr, buf, (u64)len, 0);
- return ret;
+ return uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie,
+ (u64)addr, buf, (u64)len, 0);
}
EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa);
@@ -187,7 +184,8 @@ EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target);
void uv_bios_init(void)
{
uv_systab = NULL;
- if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || !efi.uv_systab) {
+ if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) ||
+ !efi.uv_systab || efi_runtime_disabled()) {
pr_crit("UV: UVsystab: missing\n");
return;
}
@@ -199,12 +197,14 @@ void uv_bios_init(void)
return;
}
+ /* Starting with UV4 the UV systab size is variable */
if (uv_systab->revision >= UV_SYSTAB_VERSION_UV4) {
+ int size = uv_systab->size;
+
iounmap(uv_systab);
- uv_systab = ioremap(efi.uv_systab, uv_systab->size);
+ uv_systab = ioremap(efi.uv_systab, size);
if (!uv_systab) {
- pr_err("UV: UVsystab: ioremap(%d) failed!\n",
- uv_systab->size);
+ pr_err("UV: UVsystab: ioremap(%d) failed!\n", size);
return;
}
}
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index fdb4d42b4ce5..9e42842e924a 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -24,6 +24,29 @@
#include <asm/irq_vectors.h>
#include <asm/timer.h>
+static struct bau_operations ops;
+
+static struct bau_operations uv123_bau_ops = {
+ .bau_gpa_to_offset = uv_gpa_to_offset,
+ .read_l_sw_ack = read_mmr_sw_ack,
+ .read_g_sw_ack = read_gmmr_sw_ack,
+ .write_l_sw_ack = write_mmr_sw_ack,
+ .write_g_sw_ack = write_gmmr_sw_ack,
+ .write_payload_first = write_mmr_payload_first,
+ .write_payload_last = write_mmr_payload_last,
+};
+
+static struct bau_operations uv4_bau_ops = {
+ .bau_gpa_to_offset = uv_gpa_to_soc_phys_ram,
+ .read_l_sw_ack = read_mmr_proc_sw_ack,
+ .read_g_sw_ack = read_gmmr_proc_sw_ack,
+ .write_l_sw_ack = write_mmr_proc_sw_ack,
+ .write_g_sw_ack = write_gmmr_proc_sw_ack,
+ .write_payload_first = write_mmr_proc_payload_first,
+ .write_payload_last = write_mmr_proc_payload_last,
+};
+
+
/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
static int timeout_base_ns[] = {
20,
@@ -55,16 +78,16 @@ static int congested_reps = CONGESTED_REPS;
static int disabled_period = DISABLED_PERIOD;
static struct tunables tunables[] = {
- {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
- {&plugged_delay, PLUGGED_DELAY},
- {&plugsb4reset, PLUGSB4RESET},
- {&timeoutsb4reset, TIMEOUTSB4RESET},
- {&ipi_reset_limit, IPI_RESET_LIMIT},
- {&complete_threshold, COMPLETE_THRESHOLD},
- {&congested_respns_us, CONGESTED_RESPONSE_US},
- {&congested_reps, CONGESTED_REPS},
- {&disabled_period, DISABLED_PERIOD},
- {&giveup_limit, GIVEUP_LIMIT}
+ {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
+ {&plugged_delay, PLUGGED_DELAY},
+ {&plugsb4reset, PLUGSB4RESET},
+ {&timeoutsb4reset, TIMEOUTSB4RESET},
+ {&ipi_reset_limit, IPI_RESET_LIMIT},
+ {&complete_threshold, COMPLETE_THRESHOLD},
+ {&congested_respns_us, CONGESTED_RESPONSE_US},
+ {&congested_reps, CONGESTED_REPS},
+ {&disabled_period, DISABLED_PERIOD},
+ {&giveup_limit, GIVEUP_LIMIT}
};
static struct dentry *tunables_dir;
@@ -216,7 +239,7 @@ static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp,
msg = mdp->msg;
if (!msg->canceled && do_acknowledge) {
dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec;
- write_mmr_sw_ack(dw);
+ ops.write_l_sw_ack(dw);
}
msg->replied_to = 1;
msg->swack_vec = 0;
@@ -252,7 +275,7 @@ static void bau_process_retry_msg(struct msg_desc *mdp,
msg->swack_vec) == 0) &&
(msg2->sending_cpu == msg->sending_cpu) &&
(msg2->msg_type != MSG_NOOP)) {
- mmr = read_mmr_sw_ack();
+ mmr = ops.read_l_sw_ack();
msg_res = msg2->swack_vec;
/*
* This is a message retry; clear the resources held
@@ -270,7 +293,7 @@ static void bau_process_retry_msg(struct msg_desc *mdp,
stat->d_canceled++;
cancel_count++;
mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
- write_mmr_sw_ack(mr);
+ ops.write_l_sw_ack(mr);
}
}
}
@@ -403,12 +426,12 @@ static void do_reset(void *ptr)
/*
* only reset the resource if it is still pending
*/
- mmr = read_mmr_sw_ack();
+ mmr = ops.read_l_sw_ack();
msg_res = msg->swack_vec;
mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
if (mmr & msg_res) {
stat->d_rcanceled++;
- write_mmr_sw_ack(mr);
+ ops.write_l_sw_ack(mr);
}
}
}
@@ -580,11 +603,7 @@ static int uv1_wait_completion(struct bau_desc *bau_desc,
*/
static unsigned long uv2_3_read_status(unsigned long offset, int rshft, int desc)
{
- unsigned long descriptor_status;
-
- descriptor_status =
- ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1;
- return descriptor_status;
+ return ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1;
}
/*
@@ -1202,7 +1221,7 @@ void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
struct bau_pq_entry *msg = mdp->msg;
struct bau_pq_entry *other_msg;
- mmr_image = read_mmr_sw_ack();
+ mmr_image = ops.read_l_sw_ack();
swack_vec = msg->swack_vec;
if ((swack_vec & mmr_image) == 0) {
@@ -1431,7 +1450,7 @@ static int ptc_seq_show(struct seq_file *file, void *data)
/* destination side statistics */
seq_printf(file,
"%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
- read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)),
+ ops.read_g_sw_ack(uv_cpu_to_pnode(cpu)),
stat->d_requestee, cycles_2_us(stat->d_time),
stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
stat->d_nomsg, stat->d_retries, stat->d_canceled,
@@ -1497,16 +1516,16 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user,
}
if (kstrtol(optstr, 10, &input_arg) < 0) {
- printk(KERN_DEBUG "%s is invalid\n", optstr);
+ pr_debug("%s is invalid\n", optstr);
return -EINVAL;
}
if (input_arg == 0) {
elements = ARRAY_SIZE(stat_description);
- printk(KERN_DEBUG "# cpu: cpu number\n");
- printk(KERN_DEBUG "Sender statistics:\n");
+ pr_debug("# cpu: cpu number\n");
+ pr_debug("Sender statistics:\n");
for (i = 0; i < elements; i++)
- printk(KERN_DEBUG "%s\n", stat_description[i]);
+ pr_debug("%s\n", stat_description[i]);
} else if (input_arg == -1) {
for_each_present_cpu(cpu) {
stat = &per_cpu(ptcstats, cpu);
@@ -1554,7 +1573,7 @@ static int parse_tunables_write(struct bau_control *bcp, char *instr,
break;
}
if (cnt != e) {
- printk(KERN_INFO "bau tunable error: should be %d values\n", e);
+ pr_info("bau tunable error: should be %d values\n", e);
return -EINVAL;
}
@@ -1571,7 +1590,7 @@ static int parse_tunables_write(struct bau_control *bcp, char *instr,
continue;
}
if (val < 1 || val > bcp->cpus_in_uvhub) {
- printk(KERN_DEBUG
+ pr_debug(
"Error: BAU max concurrent %d is invalid\n",
val);
return -EINVAL;
@@ -1619,17 +1638,17 @@ static ssize_t tunables_write(struct file *file, const char __user *user,
for_each_present_cpu(cpu) {
bcp = &per_cpu(bau_control, cpu);
- bcp->max_concurr = max_concurr;
- bcp->max_concurr_const = max_concurr;
- bcp->plugged_delay = plugged_delay;
- bcp->plugsb4reset = plugsb4reset;
- bcp->timeoutsb4reset = timeoutsb4reset;
- bcp->ipi_reset_limit = ipi_reset_limit;
- bcp->complete_threshold = complete_threshold;
- bcp->cong_response_us = congested_respns_us;
- bcp->cong_reps = congested_reps;
- bcp->disabled_period = sec_2_cycles(disabled_period);
- bcp->giveup_limit = giveup_limit;
+ bcp->max_concurr = max_concurr;
+ bcp->max_concurr_const = max_concurr;
+ bcp->plugged_delay = plugged_delay;
+ bcp->plugsb4reset = plugsb4reset;
+ bcp->timeoutsb4reset = timeoutsb4reset;
+ bcp->ipi_reset_limit = ipi_reset_limit;
+ bcp->complete_threshold = complete_threshold;
+ bcp->cong_response_us = congested_respns_us;
+ bcp->cong_reps = congested_reps;
+ bcp->disabled_period = sec_2_cycles(disabled_period);
+ bcp->giveup_limit = giveup_limit;
}
return count;
}
@@ -1676,21 +1695,21 @@ static int __init uv_ptc_init(void)
proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL,
&proc_uv_ptc_operations);
if (!proc_uv_ptc) {
- printk(KERN_ERR "unable to create %s proc entry\n",
+ pr_err("unable to create %s proc entry\n",
UV_PTC_BASENAME);
return -EINVAL;
}
tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
if (!tunables_dir) {
- printk(KERN_ERR "unable to create debugfs directory %s\n",
+ pr_err("unable to create debugfs directory %s\n",
UV_BAU_TUNABLES_DIR);
return -EINVAL;
}
tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
tunables_dir, NULL, &tunables_fops);
if (!tunables_file) {
- printk(KERN_ERR "unable to create debugfs file %s\n",
+ pr_err("unable to create debugfs file %s\n",
UV_BAU_TUNABLES_FILE);
return -EINVAL;
}
@@ -1725,7 +1744,7 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
gpa = uv_gpa(bau_desc);
n = uv_gpa_to_gnode(gpa);
- m = uv_gpa_to_offset(gpa);
+ m = ops.bau_gpa_to_offset(gpa);
if (is_uv1_hub())
uv1 = 1;
@@ -1740,7 +1759,7 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
memset(bd2, 0, sizeof(struct bau_desc));
if (uv1) {
uv1_hdr = &bd2->header.uv1_hdr;
- uv1_hdr->swack_flag = 1;
+ uv1_hdr->swack_flag = 1;
/*
* The base_dest_nasid set in the message header
* is the nasid of the first uvhub in the partition.
@@ -1749,10 +1768,10 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
* if nasid striding is being used.
*/
uv1_hdr->base_dest_nasid =
- UV_PNODE_TO_NASID(base_pnode);
- uv1_hdr->dest_subnodeid = UV_LB_SUBNODEID;
- uv1_hdr->command = UV_NET_ENDPOINT_INTD;
- uv1_hdr->int_both = 1;
+ UV_PNODE_TO_NASID(base_pnode);
+ uv1_hdr->dest_subnodeid = UV_LB_SUBNODEID;
+ uv1_hdr->command = UV_NET_ENDPOINT_INTD;
+ uv1_hdr->int_both = 1;
/*
* all others need to be set to zero:
* fairness chaining multilevel count replied_to
@@ -1763,11 +1782,11 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
* uses native mode for selective broadcasts.
*/
uv2_3_hdr = &bd2->header.uv2_3_hdr;
- uv2_3_hdr->swack_flag = 1;
+ uv2_3_hdr->swack_flag = 1;
uv2_3_hdr->base_dest_nasid =
- UV_PNODE_TO_NASID(base_pnode);
- uv2_3_hdr->dest_subnodeid = UV_LB_SUBNODEID;
- uv2_3_hdr->command = UV_NET_ENDPOINT_INTD;
+ UV_PNODE_TO_NASID(base_pnode);
+ uv2_3_hdr->dest_subnodeid = UV_LB_SUBNODEID;
+ uv2_3_hdr->command = UV_NET_ENDPOINT_INTD;
}
}
for_each_present_cpu(cpu) {
@@ -1790,10 +1809,7 @@ static void pq_init(int node, int pnode)
size_t plsize;
char *cp;
void *vp;
- unsigned long pn;
- unsigned long first;
- unsigned long pn_first;
- unsigned long last;
+ unsigned long gnode, first, last, tail;
struct bau_pq_entry *pqp;
struct bau_control *bcp;
@@ -1814,17 +1830,25 @@ static void pq_init(int node, int pnode)
bcp->bau_msg_head = pqp;
bcp->queue_last = pqp + (DEST_Q_SIZE - 1);
}
+
+ first = ops.bau_gpa_to_offset(uv_gpa(pqp));
+ last = ops.bau_gpa_to_offset(uv_gpa(pqp + (DEST_Q_SIZE - 1)));
+
/*
- * need the gnode of where the memory was really allocated
+ * Pre UV4, the gnode is required to locate the payload queue
+ * and the payload queue tail must be maintained by the kernel.
*/
- pn = uv_gpa_to_gnode(uv_gpa(pqp));
- first = uv_physnodeaddr(pqp);
- pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first;
- last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1));
- write_mmr_payload_first(pnode, pn_first);
- write_mmr_payload_tail(pnode, first);
- write_mmr_payload_last(pnode, last);
- write_gmmr_sw_ack(pnode, 0xffffUL);
+ bcp = &per_cpu(bau_control, smp_processor_id());
+ if (bcp->uvhub_version <= 3) {
+ tail = first;
+ gnode = uv_gpa_to_gnode(uv_gpa(pqp));
+ first = (gnode << UV_PAYLOADQ_GNODE_SHIFT) | tail;
+ write_mmr_payload_tail(pnode, tail);
+ }
+
+ ops.write_payload_first(pnode, first);
+ ops.write_payload_last(pnode, last);
+ ops.write_g_sw_ack(pnode, 0xffffUL);
/* in effect, all msg_type's are set to MSG_NOOP */
memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE);
@@ -1914,8 +1938,8 @@ static void __init init_per_cpu_tunables(void)
bcp->complete_threshold = complete_threshold;
bcp->cong_response_us = congested_respns_us;
bcp->cong_reps = congested_reps;
- bcp->disabled_period = sec_2_cycles(disabled_period);
- bcp->giveup_limit = giveup_limit;
+ bcp->disabled_period = sec_2_cycles(disabled_period);
+ bcp->giveup_limit = giveup_limit;
spin_lock_init(&bcp->queue_lock);
spin_lock_init(&bcp->uvhub_lock);
spin_lock_init(&bcp->disable_lock);
@@ -1944,7 +1968,7 @@ static int __init get_cpu_topology(int base_pnode,
pnode = uv_cpu_hub_info(cpu)->pnode;
if ((pnode - base_pnode) >= UV_DISTRIBUTION_SIZE) {
- printk(KERN_EMERG
+ pr_emerg(
"cpu %d pnode %d-%d beyond %d; BAU disabled\n",
cpu, pnode, base_pnode, UV_DISTRIBUTION_SIZE);
return 1;
@@ -1969,7 +1993,7 @@ static int __init get_cpu_topology(int base_pnode,
sdp->cpu_number[sdp->num_cpus] = cpu;
sdp->num_cpus++;
if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
- printk(KERN_EMERG "%d cpus per socket invalid\n",
+ pr_emerg("%d cpus per socket invalid\n",
sdp->num_cpus);
return 1;
}
@@ -2035,15 +2059,17 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
bcp->uvhub_version = 2;
else if (is_uv3_hub())
bcp->uvhub_version = 3;
+ else if (is_uv4_hub())
+ bcp->uvhub_version = 4;
else {
- printk(KERN_EMERG "uvhub version not 1, 2 or 3\n");
+ pr_emerg("uvhub version not 1, 2, 3, or 4\n");
return 1;
}
bcp->uvhub_master = *hmasterp;
bcp->uvhub_cpu = uv_cpu_blade_processor_id(cpu);
if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
- printk(KERN_EMERG "%d cpus per uvhub invalid\n",
+ pr_emerg("%d cpus per uvhub invalid\n",
bcp->uvhub_cpu);
return 1;
}
@@ -2098,7 +2124,8 @@ static int __init init_per_cpu(int nuvhubs, int base_part_pnode)
void *vp;
struct uvhub_desc *uvhub_descs;
- timeout_us = calculate_destination_timeout();
+ if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub())
+ timeout_us = calculate_destination_timeout();
vp = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
uvhub_descs = (struct uvhub_desc *)vp;
@@ -2138,6 +2165,15 @@ static int __init uv_bau_init(void)
if (!is_uv_system())
return 0;
+ if (is_uv4_hub())
+ ops = uv4_bau_ops;
+ else if (is_uv3_hub())
+ ops = uv123_bau_ops;
+ else if (is_uv2_hub())
+ ops = uv123_bau_ops;
+ else if (is_uv1_hub())
+ ops = uv123_bau_ops;
+
for_each_possible_cpu(cur_cpu) {
mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
@@ -2153,7 +2189,9 @@ static int __init uv_bau_init(void)
uv_base_pnode = uv_blade_to_pnode(uvhub);
}
- enable_timeouts();
+ /* software timeouts are not supported on UV4 */
+ if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub())
+ enable_timeouts();
if (init_per_cpu(nuvhubs, uv_base_pnode)) {
set_bau_off();
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index b12c26e2e309..53cace2ec0e2 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -130,7 +130,7 @@ static void __save_processor_state(struct saved_context *ctxt)
ctxt->cr0 = read_cr0();
ctxt->cr2 = read_cr2();
ctxt->cr3 = read_cr3();
- ctxt->cr4 = __read_cr4_safe();
+ ctxt->cr4 = __read_cr4();
#ifdef CONFIG_X86_64
ctxt->cr8 = read_cr8();
#endif
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index f0b5f2d402af..9634557a5444 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -87,7 +87,7 @@ static int set_up_temporary_mappings(void)
struct x86_mapping_info info = {
.alloc_pgt_page = alloc_pgt_page,
.pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
- .kernel_mapping = true,
+ .offset = __PAGE_OFFSET,
};
unsigned long mstart, mend;
pgd_t *pgd;
@@ -113,7 +113,7 @@ static int set_up_temporary_mappings(void)
return result;
}
- temp_level4_pgt = (unsigned long)pgd - __PAGE_OFFSET;
+ temp_level4_pgt = __pa(pgd);
return 0;
}
diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c
index 1104515d5ad2..1ac76479c266 100644
--- a/arch/x86/ras/mce_amd_inj.c
+++ b/arch/x86/ras/mce_amd_inj.c
@@ -68,6 +68,7 @@ static int inj_##reg##_set(void *data, u64 val) \
MCE_INJECT_SET(status);
MCE_INJECT_SET(misc);
MCE_INJECT_SET(addr);
+MCE_INJECT_SET(synd);
#define MCE_INJECT_GET(reg) \
static int inj_##reg##_get(void *data, u64 *val) \
@@ -81,10 +82,12 @@ static int inj_##reg##_get(void *data, u64 *val) \
MCE_INJECT_GET(status);
MCE_INJECT_GET(misc);
MCE_INJECT_GET(addr);
+MCE_INJECT_GET(synd);
DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
/*
* Caller needs to be make sure this cpu doesn't disappear
@@ -243,27 +246,27 @@ static void toggle_nb_mca_mst_cpu(u16 nid)
static void prepare_msrs(void *info)
{
- struct mce i_mce = *(struct mce *)info;
- u8 b = i_mce.bank;
+ struct mce m = *(struct mce *)info;
+ u8 b = m.bank;
- wrmsrl(MSR_IA32_MCG_STATUS, i_mce.mcgstatus);
+ wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
if (boot_cpu_has(X86_FEATURE_SMCA)) {
- if (i_mce.inject_flags == DFR_INT_INJ) {
- wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), i_mce.status);
- wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), i_mce.addr);
+ if (m.inject_flags == DFR_INT_INJ) {
+ wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
+ wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
} else {
- wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), i_mce.status);
- wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), i_mce.addr);
+ wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
+ wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
}
- wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), i_mce.misc);
+ wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
+ wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
} else {
- wrmsrl(MSR_IA32_MCx_STATUS(b), i_mce.status);
- wrmsrl(MSR_IA32_MCx_ADDR(b), i_mce.addr);
- wrmsrl(MSR_IA32_MCx_MISC(b), i_mce.misc);
+ wrmsrl(MSR_IA32_MCx_STATUS(b), m.status);
+ wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr);
+ wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
}
-
}
static void do_inject(void)
@@ -275,6 +278,9 @@ static void do_inject(void)
if (i_mce.misc)
i_mce.status |= MCI_STATUS_MISCV;
+ if (i_mce.synd)
+ i_mce.status |= MCI_STATUS_SYNDV;
+
if (inj_type == SW_INJ) {
mce_inject_log(&i_mce);
return;
@@ -301,7 +307,9 @@ static void do_inject(void)
* only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for
* Fam10h and later BKDGs.
*/
- if (static_cpu_has(X86_FEATURE_AMD_DCM) && b == 4) {
+ if (static_cpu_has(X86_FEATURE_AMD_DCM) &&
+ b == 4 &&
+ boot_cpu_data.x86 < 0x17) {
toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu));
cpu = get_nbc_for_node(amd_get_nb_id(cpu));
}
@@ -371,6 +379,9 @@ static const char readme_msg[] =
"\t used for error thresholding purposes and its validity is indicated by\n"
"\t MCi_STATUS[MiscV].\n"
"\n"
+"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n"
+"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n"
+"\n"
"addr:\t Error address value to be written to MCi_ADDR. Log address information\n"
"\t associated with the error.\n"
"\n"
@@ -420,6 +431,7 @@ static struct dfs_node {
{ .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
@@ -428,7 +440,7 @@ static struct dfs_node {
static int __init init_mce_inject(void)
{
- int i;
+ unsigned int i;
u64 cap;
rdmsrl(MSR_IA32_MCG_CAP, cap);
@@ -452,26 +464,22 @@ static int __init init_mce_inject(void)
return 0;
err_dfs_add:
- while (--i >= 0)
+ while (i-- > 0)
debugfs_remove(dfs_fls[i].d);
debugfs_remove(dfs_inj);
dfs_inj = NULL;
- return -ENOMEM;
+ return -ENODEV;
}
static void __exit exit_mce_inject(void)
{
- int i;
- for (i = 0; i < ARRAY_SIZE(dfs_fls); i++)
- debugfs_remove(dfs_fls[i].d);
+ debugfs_remove_recursive(dfs_inj);
+ dfs_inj = NULL;
memset(&dfs_fls, 0, sizeof(dfs_fls));
-
- debugfs_remove(dfs_inj);
- dfs_inj = NULL;
}
module_init(init_mce_inject);
module_exit(exit_mce_inject);
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 705e3fffb4a1..5db706f14111 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -1,9 +1,11 @@
#include <linux/io.h>
+#include <linux/slab.h>
#include <linux/memblock.h>
#include <asm/cacheflush.h>
#include <asm/pgtable.h>
#include <asm/realmode.h>
+#include <asm/tlbflush.h>
struct real_mode_header *real_mode_header;
u32 *trampoline_cr4_features;
@@ -11,25 +13,37 @@ u32 *trampoline_cr4_features;
/* Hold the pgd entry used on booting additional CPUs */
pgd_t trampoline_pgd_entry;
+void __init set_real_mode_mem(phys_addr_t mem, size_t size)
+{
+ void *base = __va(mem);
+
+ real_mode_header = (struct real_mode_header *) base;
+ printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
+ base, (unsigned long long)mem, size);
+}
+
void __init reserve_real_mode(void)
{
phys_addr_t mem;
- unsigned char *base;
- size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
+ size_t size = real_mode_size_needed();
+
+ if (!size)
+ return;
+
+ WARN_ON(slab_is_available());
/* Has to be under 1M so we can execute real-mode AP code. */
mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
- if (!mem)
- panic("Cannot allocate trampoline\n");
+ if (!mem) {
+ pr_info("No sub-1M memory is available for the trampoline\n");
+ return;
+ }
- base = __va(mem);
memblock_reserve(mem, size);
- real_mode_header = (struct real_mode_header *) base;
- printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
- base, (unsigned long long)mem, size);
+ set_real_mode_mem(mem, size);
}
-void __init setup_real_mode(void)
+static void __init setup_real_mode(void)
{
u16 real_mode_seg;
const u32 *rel;
@@ -84,7 +98,7 @@ void __init setup_real_mode(void)
trampoline_header->start = (u64) secondary_startup_64;
trampoline_cr4_features = &trampoline_header->cr4;
- *trampoline_cr4_features = __read_cr4();
+ *trampoline_cr4_features = mmu_cr4_features;
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
trampoline_pgd[0] = trampoline_pgd_entry.pgd;
@@ -100,7 +114,7 @@ void __init setup_real_mode(void)
* need to mark it executable at do_pre_smp_initcalls() at least,
* thus run it as a early_initcall().
*/
-static int __init set_real_mode_permissions(void)
+static void __init set_real_mode_permissions(void)
{
unsigned char *base = (unsigned char *) real_mode_header;
size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
@@ -119,7 +133,16 @@ static int __init set_real_mode_permissions(void)
set_memory_nx((unsigned long) base, size >> PAGE_SHIFT);
set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT);
set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT);
+}
+
+static int __init init_real_mode(void)
+{
+ if (!real_mode_header)
+ panic("Real mode trampoline was not allocated");
+
+ setup_real_mode();
+ set_real_mode_permissions();
return 0;
}
-early_initcall(set_real_mode_permissions);
+early_initcall(init_real_mode);
diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c
index ebd4dd6ef73b..5766ead6fdb9 100644
--- a/arch/x86/um/ptrace_32.c
+++ b/arch/x86/um/ptrace_32.c
@@ -84,7 +84,10 @@ int putreg(struct task_struct *child, int regno, unsigned long value)
case EAX:
case EIP:
case UESP:
+ break;
case ORIG_EAX:
+ /* Update the syscall number. */
+ UPT_SYSCALL_NR(&child->thread.regs.regs) = value;
break;
case FS:
if (value && (value & 3) != 3)
@@ -191,7 +194,7 @@ int peek_user(struct task_struct *child, long addr, long data)
static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child)
{
- int err, n, cpu = ((struct thread_info *) child->stack)->cpu;
+ int err, n, cpu = task_cpu(child);
struct user_i387_struct fpregs;
err = save_i387_registers(userspace_pid[cpu],
@@ -208,7 +211,7 @@ static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *c
static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child)
{
- int n, cpu = ((struct thread_info *) child->stack)->cpu;
+ int n, cpu = task_cpu(child);
struct user_i387_struct fpregs;
n = copy_from_user(&fpregs, buf, sizeof(fpregs));
@@ -221,7 +224,7 @@ static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *c
static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child)
{
- int err, n, cpu = ((struct thread_info *) child->stack)->cpu;
+ int err, n, cpu = task_cpu(child);
struct user_fxsr_struct fpregs;
err = save_fpx_registers(userspace_pid[cpu], (unsigned long *) &fpregs);
@@ -237,7 +240,7 @@ static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *
static int set_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child)
{
- int n, cpu = ((struct thread_info *) child->stack)->cpu;
+ int n, cpu = task_cpu(child);
struct user_fxsr_struct fpregs;
n = copy_from_user(&fpregs, buf, sizeof(fpregs));
diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c
index faab418876ce..0b5c184dd5b3 100644
--- a/arch/x86/um/ptrace_64.c
+++ b/arch/x86/um/ptrace_64.c
@@ -78,7 +78,11 @@ int putreg(struct task_struct *child, int regno, unsigned long value)
case RSI:
case RDI:
case RBP:
+ break;
+
case ORIG_RAX:
+ /* Update the syscall number. */
+ UPT_SYSCALL_NR(&child->thread.regs.regs) = value;
break;
case FS:
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 8ffb089b19a5..f1d2182e071f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -118,7 +118,7 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
/* Linux <-> Xen vCPU id mapping */
-DEFINE_PER_CPU(int, xen_vcpu_id) = -1;
+DEFINE_PER_CPU(uint32_t, xen_vcpu_id);
EXPORT_PER_CPU_SYMBOL(xen_vcpu_id);
enum xen_domain_type xen_domain_type = XEN_NATIVE;
@@ -1237,7 +1237,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.write_cr0 = xen_write_cr0,
.read_cr4 = native_read_cr4,
- .read_cr4_safe = native_read_cr4_safe,
.write_cr4 = xen_write_cr4,
#ifdef CONFIG_X86_64
@@ -1925,6 +1924,45 @@ static void xen_set_cpu_features(struct cpuinfo_x86 *c)
}
}
+static void xen_pin_vcpu(int cpu)
+{
+ static bool disable_pinning;
+ struct sched_pin_override pin_override;
+ int ret;
+
+ if (disable_pinning)
+ return;
+
+ pin_override.pcpu = cpu;
+ ret = HYPERVISOR_sched_op(SCHEDOP_pin_override, &pin_override);
+
+ /* Ignore errors when removing override. */
+ if (cpu < 0)
+ return;
+
+ switch (ret) {
+ case -ENOSYS:
+ pr_warn("Unable to pin on physical cpu %d. In case of problems consider vcpu pinning.\n",
+ cpu);
+ disable_pinning = true;
+ break;
+ case -EPERM:
+ WARN(1, "Trying to pin vcpu without having privilege to do so\n");
+ disable_pinning = true;
+ break;
+ case -EINVAL:
+ case -EBUSY:
+ pr_warn("Physical cpu %d not available for pinning. Check Xen cpu configuration.\n",
+ cpu);
+ break;
+ case 0:
+ break;
+ default:
+ WARN(1, "rc %d while trying to pin vcpu\n", ret);
+ disable_pinning = true;
+ }
+}
+
const struct hypervisor_x86 x86_hyper_xen = {
.name = "Xen",
.detect = xen_platform,
@@ -1933,6 +1971,7 @@ const struct hypervisor_x86 x86_hyper_xen = {
#endif
.x2apic_available = xen_x2apic_para_available,
.set_cpu_features = xen_set_cpu_features,
+ .pin_vcpu = xen_pin_vcpu,
};
EXPORT_SYMBOL(x86_hyper_xen);
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 176425233e4d..f8960fca0827 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -861,7 +861,7 @@ char * __init xen_memory_setup(void)
e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
E820_RESERVED);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
/*
* Check whether the kernel itself conflicts with the target E820 map.
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index f42e78de1e10..3d6e0064cbfc 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -21,8 +21,6 @@ static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
static DEFINE_PER_CPU(char *, irq_name);
static bool xen_pvspin = true;
-#ifdef CONFIG_QUEUED_SPINLOCKS
-
#include <asm/qspinlock.h>
static void xen_qlock_kick(int cpu)
@@ -71,207 +69,6 @@ static void xen_qlock_wait(u8 *byte, u8 val)
xen_poll_irq(irq);
}
-#else /* CONFIG_QUEUED_SPINLOCKS */
-
-enum xen_contention_stat {
- TAKEN_SLOW,
- TAKEN_SLOW_PICKUP,
- TAKEN_SLOW_SPURIOUS,
- RELEASED_SLOW,
- RELEASED_SLOW_KICKED,
- NR_CONTENTION_STATS
-};
-
-
-#ifdef CONFIG_XEN_DEBUG_FS
-#define HISTO_BUCKETS 30
-static struct xen_spinlock_stats
-{
- u32 contention_stats[NR_CONTENTION_STATS];
- u32 histo_spin_blocked[HISTO_BUCKETS+1];
- u64 time_blocked;
-} spinlock_stats;
-
-static u8 zero_stats;
-
-static inline void check_zero(void)
-{
- u8 ret;
- u8 old = READ_ONCE(zero_stats);
- if (unlikely(old)) {
- ret = cmpxchg(&zero_stats, old, 0);
- /* This ensures only one fellow resets the stat */
- if (ret == old)
- memset(&spinlock_stats, 0, sizeof(spinlock_stats));
- }
-}
-
-static inline void add_stats(enum xen_contention_stat var, u32 val)
-{
- check_zero();
- spinlock_stats.contention_stats[var] += val;
-}
-
-static inline u64 spin_time_start(void)
-{
- return xen_clocksource_read();
-}
-
-static void __spin_time_accum(u64 delta, u32 *array)
-{
- unsigned index = ilog2(delta);
-
- check_zero();
-
- if (index < HISTO_BUCKETS)
- array[index]++;
- else
- array[HISTO_BUCKETS]++;
-}
-
-static inline void spin_time_accum_blocked(u64 start)
-{
- u32 delta = xen_clocksource_read() - start;
-
- __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
- spinlock_stats.time_blocked += delta;
-}
-#else /* !CONFIG_XEN_DEBUG_FS */
-static inline void add_stats(enum xen_contention_stat var, u32 val)
-{
-}
-
-static inline u64 spin_time_start(void)
-{
- return 0;
-}
-
-static inline void spin_time_accum_blocked(u64 start)
-{
-}
-#endif /* CONFIG_XEN_DEBUG_FS */
-
-struct xen_lock_waiting {
- struct arch_spinlock *lock;
- __ticket_t want;
-};
-
-static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
-static cpumask_t waiting_cpus;
-
-__visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
-{
- int irq = __this_cpu_read(lock_kicker_irq);
- struct xen_lock_waiting *w = this_cpu_ptr(&lock_waiting);
- int cpu = smp_processor_id();
- u64 start;
- __ticket_t head;
- unsigned long flags;
-
- /* If kicker interrupts not initialized yet, just spin */
- if (irq == -1)
- return;
-
- start = spin_time_start();
-
- /*
- * Make sure an interrupt handler can't upset things in a
- * partially setup state.
- */
- local_irq_save(flags);
- /*
- * We don't really care if we're overwriting some other
- * (lock,want) pair, as that would mean that we're currently
- * in an interrupt context, and the outer context had
- * interrupts enabled. That has already kicked the VCPU out
- * of xen_poll_irq(), so it will just return spuriously and
- * retry with newly setup (lock,want).
- *
- * The ordering protocol on this is that the "lock" pointer
- * may only be set non-NULL if the "want" ticket is correct.
- * If we're updating "want", we must first clear "lock".
- */
- w->lock = NULL;
- smp_wmb();
- w->want = want;
- smp_wmb();
- w->lock = lock;
-
- /* This uses set_bit, which atomic and therefore a barrier */
- cpumask_set_cpu(cpu, &waiting_cpus);
- add_stats(TAKEN_SLOW, 1);
-
- /* clear pending */
- xen_clear_irq_pending(irq);
-
- /* Only check lock once pending cleared */
- barrier();
-
- /*
- * Mark entry to slowpath before doing the pickup test to make
- * sure we don't deadlock with an unlocker.
- */
- __ticket_enter_slowpath(lock);
-
- /* make sure enter_slowpath, which is atomic does not cross the read */
- smp_mb__after_atomic();
-
- /*
- * check again make sure it didn't become free while
- * we weren't looking
- */
- head = READ_ONCE(lock->tickets.head);
- if (__tickets_equal(head, want)) {
- add_stats(TAKEN_SLOW_PICKUP, 1);
- goto out;
- }
-
- /* Allow interrupts while blocked */
- local_irq_restore(flags);
-
- /*
- * If an interrupt happens here, it will leave the wakeup irq
- * pending, which will cause xen_poll_irq() to return
- * immediately.
- */
-
- /* Block until irq becomes pending (or perhaps a spurious wakeup) */
- xen_poll_irq(irq);
- add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq));
-
- local_irq_save(flags);
-
- kstat_incr_irq_this_cpu(irq);
-out:
- cpumask_clear_cpu(cpu, &waiting_cpus);
- w->lock = NULL;
-
- local_irq_restore(flags);
-
- spin_time_accum_blocked(start);
-}
-PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning);
-
-static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
-{
- int cpu;
-
- add_stats(RELEASED_SLOW, 1);
-
- for_each_cpu(cpu, &waiting_cpus) {
- const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu);
-
- /* Make sure we read lock before want */
- if (READ_ONCE(w->lock) == lock &&
- READ_ONCE(w->want) == next) {
- add_stats(RELEASED_SLOW_KICKED, 1);
- xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
- break;
- }
- }
-}
-#endif /* CONFIG_QUEUED_SPINLOCKS */
-
static irqreturn_t dummy_handler(int irq, void *dev_id)
{
BUG();
@@ -334,16 +131,12 @@ void __init xen_init_spinlocks(void)
return;
}
printk(KERN_DEBUG "xen: PV spinlocks enabled\n");
-#ifdef CONFIG_QUEUED_SPINLOCKS
+
__pv_init_lock_hash();
pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
pv_lock_ops.wait = xen_qlock_wait;
pv_lock_ops.kick = xen_qlock_kick;
-#else
- pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning);
- pv_lock_ops.unlock_kick = xen_unlock_kick;
-#endif
}
/*
@@ -372,44 +165,3 @@ static __init int xen_parse_nopvspin(char *arg)
}
early_param("xen_nopvspin", xen_parse_nopvspin);
-#if defined(CONFIG_XEN_DEBUG_FS) && !defined(CONFIG_QUEUED_SPINLOCKS)
-
-static struct dentry *d_spin_debug;
-
-static int __init xen_spinlock_debugfs(void)
-{
- struct dentry *d_xen = xen_init_debugfs();
-
- if (d_xen == NULL)
- return -ENOMEM;
-
- if (!xen_pvspin)
- return 0;
-
- d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
-
- debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
-
- debugfs_create_u32("taken_slow", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[TAKEN_SLOW]);
- debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
- debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[TAKEN_SLOW_SPURIOUS]);
-
- debugfs_create_u32("released_slow", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[RELEASED_SLOW]);
- debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
-
- debugfs_create_u64("time_blocked", 0444, d_spin_debug,
- &spinlock_stats.time_blocked);
-
- debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
- spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
-
- return 0;
-}
-fs_initcall(xen_spinlock_debugfs);
-
-#endif /* CONFIG_XEN_DEBUG_FS */