aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/boot/compressed/Makefile4
-rw-r--r--arch/x86/boot/compressed/head_32.S5
-rw-r--r--arch/x86/boot/compressed/head_64.S5
-rw-r--r--arch/x86/boot/compressed/misc.c13
-rw-r--r--arch/x86/boot/compressed/mkpiggy.c9
-rw-r--r--arch/x86/include/asm/page_32_types.h1
-rw-r--r--arch/x86/include/asm/page_64_types.h11
-rw-r--r--arch/x86/include/asm/smp.h1
-rw-r--r--arch/x86/include/asm/thread_info.h2
-rw-r--r--arch/x86/include/asm/traps.h1
-rw-r--r--arch/x86/kernel/cpu/common.c2
-rw-r--r--arch/x86/kernel/cpu/microcode/amd_early.c33
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c10
-rw-r--r--arch/x86/kernel/cpu/microcode/core_early.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c49
-rw-r--r--arch/x86/kernel/dumpstack_64.c1
-rw-r--r--arch/x86/kernel/entry_64.S81
-rw-r--r--arch/x86/kernel/ptrace.c2
-rw-r--r--arch/x86/kernel/smpboot.c15
-rw-r--r--arch/x86/kernel/traps.c71
-rw-r--r--arch/x86/kvm/emulate.c8
-rw-r--r--arch/x86/kvm/mmu.c6
-rw-r--r--arch/x86/lib/csum-wrappers_64.c5
-rw-r--r--arch/x86/mm/init_64.c11
-rw-r--r--arch/x86/tools/calc_run_size.pl39
-rw-r--r--arch/x86/xen/smp.c3
27 files changed, 268 insertions, 124 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ded8a6774ac9..41a503c15862 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -144,7 +144,7 @@ config INSTRUCTION_DECODER
config PERF_EVENTS_INTEL_UNCORE
def_bool y
- depends on PERF_EVENTS && SUP_SUP_INTEL && PCI
+ depends on PERF_EVENTS && CPU_SUP_INTEL && PCI
config OUTPUT_FORMAT
string
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 704f58aa79cd..45abc363dd3e 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -76,8 +76,10 @@ suffix-$(CONFIG_KERNEL_XZ) := xz
suffix-$(CONFIG_KERNEL_LZO) := lzo
suffix-$(CONFIG_KERNEL_LZ4) := lz4
+RUN_SIZE = $(shell $(OBJDUMP) -h vmlinux | \
+ perl $(srctree)/arch/x86/tools/calc_run_size.pl)
quiet_cmd_mkpiggy = MKPIGGY $@
- cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
+ cmd_mkpiggy = $(obj)/mkpiggy $< $(RUN_SIZE) > $@ || ( rm -f $@ ; false )
targets += piggy.S
$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index cbed1407a5cd..1d7fbbcc196d 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -207,7 +207,8 @@ relocated:
* Do the decompression, and jump to the new kernel..
*/
/* push arguments for decompress_kernel: */
- pushl $z_output_len /* decompressed length */
+ pushl $z_run_size /* size of kernel with .bss and .brk */
+ pushl $z_output_len /* decompressed length, end of relocs */
leal z_extract_offset_negative(%ebx), %ebp
pushl %ebp /* output address */
pushl $z_input_len /* input_len */
@@ -217,7 +218,7 @@ relocated:
pushl %eax /* heap area */
pushl %esi /* real mode pointer */
call decompress_kernel /* returns kernel location in %eax */
- addl $24, %esp
+ addl $28, %esp
/*
* Jump to the decompressed kernel.
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 2884e0c3e8a5..6b1766c6c082 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -402,13 +402,16 @@ relocated:
* Do the decompression, and jump to the new kernel..
*/
pushq %rsi /* Save the real mode argument */
+ movq $z_run_size, %r9 /* size of kernel with .bss and .brk */
+ pushq %r9
movq %rsi, %rdi /* real mode address */
leaq boot_heap(%rip), %rsi /* malloc area for uncompression */
leaq input_data(%rip), %rdx /* input_data */
movl $z_input_len, %ecx /* input_len */
movq %rbp, %r8 /* output target address */
- movq $z_output_len, %r9 /* decompressed length */
+ movq $z_output_len, %r9 /* decompressed length, end of relocs */
call decompress_kernel /* returns kernel location in %rax */
+ popq %r9
popq %rsi
/*
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 57ab74df7eea..30dd59a9f0b4 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -358,7 +358,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
unsigned char *input_data,
unsigned long input_len,
unsigned char *output,
- unsigned long output_len)
+ unsigned long output_len,
+ unsigned long run_size)
{
real_mode = rmode;
@@ -381,8 +382,14 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
free_mem_ptr = heap; /* Heap */
free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
- output = choose_kernel_location(input_data, input_len,
- output, output_len);
+ /*
+ * The memory hole needed for the kernel is the larger of either
+ * the entire decompressed kernel plus relocation table, or the
+ * entire decompressed kernel plus .bss and .brk sections.
+ */
+ output = choose_kernel_location(input_data, input_len, output,
+ output_len > run_size ? output_len
+ : run_size);
/* Validate memory location choices. */
if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
index b669ab65bf6c..d8222f213182 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -36,11 +36,13 @@ int main(int argc, char *argv[])
uint32_t olen;
long ilen;
unsigned long offs;
+ unsigned long run_size;
FILE *f = NULL;
int retval = 1;
- if (argc < 2) {
- fprintf(stderr, "Usage: %s compressed_file\n", argv[0]);
+ if (argc < 3) {
+ fprintf(stderr, "Usage: %s compressed_file run_size\n",
+ argv[0]);
goto bail;
}
@@ -74,6 +76,7 @@ int main(int argc, char *argv[])
offs += olen >> 12; /* Add 8 bytes for each 32K block */
offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */
offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
+ run_size = atoi(argv[2]);
printf(".section \".rodata..compressed\",\"a\",@progbits\n");
printf(".globl z_input_len\n");
@@ -85,6 +88,8 @@ int main(int argc, char *argv[])
/* z_extract_offset_negative allows simplification of head_32.S */
printf(".globl z_extract_offset_negative\n");
printf("z_extract_offset_negative = -0x%lx\n", offs);
+ printf(".globl z_run_size\n");
+ printf("z_run_size = %lu\n", run_size);
printf(".globl input_data, input_data_end\n");
printf("input_data:\n");
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index f48b17df4224..3a52ee0e726d 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -20,7 +20,6 @@
#define THREAD_SIZE_ORDER 1
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
-#define STACKFAULT_STACK 0
#define DOUBLEFAULT_STACK 1
#define NMI_STACK 0
#define DEBUG_STACK 0
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 678205195ae1..75450b2c7be4 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -14,12 +14,11 @@
#define IRQ_STACK_ORDER 2
#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
-#define STACKFAULT_STACK 1
-#define DOUBLEFAULT_STACK 2
-#define NMI_STACK 3
-#define DEBUG_STACK 4
-#define MCE_STACK 5
-#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
+#define DOUBLEFAULT_STACK 1
+#define NMI_STACK 2
+#define DEBUG_STACK 3
+#define MCE_STACK 4
+#define N_EXCEPTION_STACKS 4 /* hw limit: 7 */
#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 8cd27e08e23c..8cd1cc3bc835 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -150,6 +150,7 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
}
void cpu_disable_common(void);
+void cpu_die_common(unsigned int cpu);
void native_smp_prepare_boot_cpu(void);
void native_smp_prepare_cpus(unsigned int max_cpus);
void native_smp_cpus_done(unsigned int max_cpus);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 854053889d4d..547e344a6dc6 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -141,7 +141,7 @@ struct thread_info {
/* Only used for 64 bit */
#define _TIF_DO_NOTIFY_MASK \
(_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \
- _TIF_USER_RETURN_NOTIFY)
+ _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index bc8352e7010a..707adc6549d8 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -39,6 +39,7 @@ asmlinkage void simd_coprocessor_error(void);
#ifdef CONFIG_TRACING
asmlinkage void trace_page_fault(void);
+#define trace_stack_segment stack_segment
#define trace_divide_error divide_error
#define trace_bounds bounds
#define trace_invalid_op invalid_op
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4b4f78c9ba19..cfa9b5b2c27a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -146,6 +146,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
static int __init x86_xsave_setup(char *s)
{
+ if (strlen(s))
+ return 0;
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c
index 7aa1acc79789..06674473b0e6 100644
--- a/arch/x86/kernel/cpu/microcode/amd_early.c
+++ b/arch/x86/kernel/cpu/microcode/amd_early.c
@@ -108,12 +108,13 @@ static size_t compute_container_size(u8 *data, u32 total_size)
* load_microcode_amd() to save equivalent cpu table and microcode patches in
* kernel heap memory.
*/
-static void apply_ucode_in_initrd(void *ucode, size_t size)
+static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
{
struct equiv_cpu_entry *eq;
size_t *cont_sz;
u32 *header;
u8 *data, **cont;
+ u8 (*patch)[PATCH_MAX_SIZE];
u16 eq_id = 0;
int offset, left;
u32 rev, eax, ebx, ecx, edx;
@@ -123,10 +124,12 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
cont_sz = (size_t *)__pa_nodebug(&container_size);
cont = (u8 **)__pa_nodebug(&container);
+ patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
#else
new_rev = &ucode_new_rev;
cont_sz = &container_size;
cont = &container;
+ patch = &amd_ucode_patch;
#endif
data = ucode;
@@ -213,9 +216,9 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
rev = mc->hdr.patch_id;
*new_rev = rev;
- /* save ucode patch */
- memcpy(amd_ucode_patch, mc,
- min_t(u32, header[1], PATCH_MAX_SIZE));
+ if (save_patch)
+ memcpy(patch, mc,
+ min_t(u32, header[1], PATCH_MAX_SIZE));
}
}
@@ -246,7 +249,7 @@ void __init load_ucode_amd_bsp(void)
*data = cp.data;
*size = cp.size;
- apply_ucode_in_initrd(cp.data, cp.size);
+ apply_ucode_in_initrd(cp.data, cp.size, true);
}
#ifdef CONFIG_X86_32
@@ -263,7 +266,7 @@ void load_ucode_amd_ap(void)
size_t *usize;
void **ucode;
- mc = (struct microcode_amd *)__pa(amd_ucode_patch);
+ mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
__apply_microcode_amd(mc);
return;
@@ -275,7 +278,7 @@ void load_ucode_amd_ap(void)
if (!*ucode || !*usize)
return;
- apply_ucode_in_initrd(*ucode, *usize);
+ apply_ucode_in_initrd(*ucode, *usize, false);
}
static void __init collect_cpu_sig_on_bsp(void *arg)
@@ -339,7 +342,7 @@ void load_ucode_amd_ap(void)
* AP has a different equivalence ID than BSP, looks like
* mixed-steppings silicon so go through the ucode blob anew.
*/
- apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size);
+ apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false);
}
}
#endif
@@ -347,7 +350,9 @@ void load_ucode_amd_ap(void)
int __init save_microcode_in_initrd_amd(void)
{
unsigned long cont;
+ int retval = 0;
enum ucode_state ret;
+ u8 *cont_va;
u32 eax;
if (!container)
@@ -355,13 +360,15 @@ int __init save_microcode_in_initrd_amd(void)
#ifdef CONFIG_X86_32
get_bsp_sig();
- cont = (unsigned long)container;
+ cont = (unsigned long)container;
+ cont_va = __va(container);
#else
/*
* We need the physical address of the container for both bitness since
* boot_params.hdr.ramdisk_image is a physical address.
*/
- cont = __pa(container);
+ cont = __pa(container);
+ cont_va = container;
#endif
/*
@@ -372,6 +379,8 @@ int __init save_microcode_in_initrd_amd(void)
if (relocated_ramdisk)
container = (u8 *)(__va(relocated_ramdisk) +
(cont - boot_params.hdr.ramdisk_image));
+ else
+ container = cont_va;
if (ucode_new_rev)
pr_info("microcode: updated early to new patch_level=0x%08x\n",
@@ -382,7 +391,7 @@ int __init save_microcode_in_initrd_amd(void)
ret = load_microcode_amd(eax, container, container_size);
if (ret != UCODE_OK)
- return -EINVAL;
+ retval = -EINVAL;
/*
* This will be freed any msec now, stash patches for the current
@@ -391,5 +400,5 @@ int __init save_microcode_in_initrd_amd(void)
container = NULL;
container_size = 0;
- return 0;
+ return retval;
}
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index dd9d6190b08d..08fe6e8a726e 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -465,6 +465,16 @@ static void mc_bp_resume(void)
if (uci->valid && uci->mc)
microcode_ops->apply_microcode(cpu);
+#ifdef CONFIG_X86_64
+ else if (!uci->mc)
+ /*
+ * We might resume and not have applied late microcode but still
+ * have a newer patch stashed from the early loader. We don't
+ * have it in uci->mc so we have to load it the same way we're
+ * applying patches early on the APs.
+ */
+ load_ucode_ap();
+#endif
}
static struct syscore_ops mc_syscore_ops = {
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index 5f28a64e71ea..2c017f242a78 100644
--- a/arch/x86/kernel/cpu/microcode/core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
@@ -124,7 +124,7 @@ void __init load_ucode_bsp(void)
static bool check_loader_disabled_ap(void)
{
#ifdef CONFIG_X86_32
- return __pa_nodebug(dis_ucode_ldr);
+ return *((bool *)__pa_nodebug(&dis_ucode_ldr));
#else
return dis_ucode_ldr;
#endif
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index adf138eac85c..f9ed429d6e4f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -486,14 +486,17 @@ static struct attribute_group snbep_uncore_qpi_format_group = {
.attrs = snbep_uncore_qpi_formats_attr,
};
-#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \
- .init_box = snbep_uncore_msr_init_box, \
+#define __SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \
.disable_box = snbep_uncore_msr_disable_box, \
.enable_box = snbep_uncore_msr_enable_box, \
.disable_event = snbep_uncore_msr_disable_event, \
.enable_event = snbep_uncore_msr_enable_event, \
.read_counter = uncore_msr_read_counter
+#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \
+ __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), \
+ .init_box = snbep_uncore_msr_init_box \
+
static struct intel_uncore_ops snbep_uncore_msr_ops = {
SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
};
@@ -1919,6 +1922,30 @@ static struct intel_uncore_type hswep_uncore_cbox = {
.format_group = &hswep_uncore_cbox_format_group,
};
+/*
+ * Write SBOX Initialization register bit by bit to avoid spurious #GPs
+ */
+static void hswep_uncore_sbox_msr_init_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+
+ if (msr) {
+ u64 init = SNBEP_PMON_BOX_CTL_INT;
+ u64 flags = 0;
+ int i;
+
+ for_each_set_bit(i, (unsigned long *)&init, 64) {
+ flags |= (1ULL << i);
+ wrmsrl(msr, flags);
+ }
+ }
+}
+
+static struct intel_uncore_ops hswep_uncore_sbox_msr_ops = {
+ __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .init_box = hswep_uncore_sbox_msr_init_box
+};
+
static struct attribute *hswep_uncore_sbox_formats_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
@@ -1944,7 +1971,7 @@ static struct intel_uncore_type hswep_uncore_sbox = {
.event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
.box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL,
.msr_offset = HSWEP_SBOX_MSR_OFFSET,
- .ops = &snbep_uncore_msr_ops,
+ .ops = &hswep_uncore_sbox_msr_ops,
.format_group = &hswep_uncore_sbox_format_group,
};
@@ -2025,13 +2052,27 @@ static struct intel_uncore_type hswep_uncore_imc = {
SNBEP_UNCORE_PCI_COMMON_INIT(),
};
+static unsigned hswep_uncore_irp_ctrs[] = {0xa0, 0xa8, 0xb0, 0xb8};
+
+static u64 hswep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 count = 0;
+
+ pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
+ pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
+
+ return count;
+}
+
static struct intel_uncore_ops hswep_uncore_irp_ops = {
.init_box = snbep_uncore_pci_init_box,
.disable_box = snbep_uncore_pci_disable_box,
.enable_box = snbep_uncore_pci_enable_box,
.disable_event = ivbep_uncore_irp_disable_event,
.enable_event = ivbep_uncore_irp_enable_event,
- .read_counter = ivbep_uncore_irp_read_counter,
+ .read_counter = hswep_uncore_irp_read_counter,
};
static struct intel_uncore_type hswep_uncore_irp = {
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 1abcb50b48ae..ff86f19b5758 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -24,7 +24,6 @@ static char x86_stack_ids[][8] = {
[ DEBUG_STACK-1 ] = "#DB",
[ NMI_STACK-1 ] = "NMI",
[ DOUBLEFAULT_STACK-1 ] = "#DF",
- [ STACKFAULT_STACK-1 ] = "#SS",
[ MCE_STACK-1 ] = "#MC",
#if DEBUG_STKSZ > EXCEPTION_STKSZ
[ N_EXCEPTION_STACKS ...
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index df088bb03fb3..c0226ab54106 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -828,9 +828,15 @@ ENTRY(native_iret)
jnz native_irq_return_ldt
#endif
+.global native_irq_return_iret
native_irq_return_iret:
+ /*
+ * This may fault. Non-paranoid faults on return to userspace are
+ * handled by fixup_bad_iret. These include #SS, #GP, and #NP.
+ * Double-faults due to espfix64 are handled in do_double_fault.
+ * Other faults here are fatal.
+ */
iretq
- _ASM_EXTABLE(native_irq_return_iret, bad_iret)
#ifdef CONFIG_X86_ESPFIX64
native_irq_return_ldt:
@@ -858,25 +864,6 @@ native_irq_return_ldt:
jmp native_irq_return_iret
#endif
- .section .fixup,"ax"
-bad_iret:
- /*
- * The iret traps when the %cs or %ss being restored is bogus.
- * We've lost the original trap vector and error code.
- * #GPF is the most likely one to get for an invalid selector.
- * So pretend we completed the iret and took the #GPF in user mode.
- *
- * We are now running with the kernel GS after exception recovery.
- * But error_entry expects us to have user GS to match the user %cs,
- * so swap back.
- */
- pushq $0
-
- SWAPGS
- jmp general_protection
-
- .previous
-
/* edi: workmask, edx: work */
retint_careful:
CFI_RESTORE_STATE
@@ -922,37 +909,6 @@ ENTRY(retint_kernel)
CFI_ENDPROC
END(common_interrupt)
- /*
- * If IRET takes a fault on the espfix stack, then we
- * end up promoting it to a doublefault. In that case,
- * modify the stack to make it look like we just entered
- * the #GP handler from user space, similar to bad_iret.
- */
-#ifdef CONFIG_X86_ESPFIX64
- ALIGN
-__do_double_fault:
- XCPT_FRAME 1 RDI+8
- movq RSP(%rdi),%rax /* Trap on the espfix stack? */
- sarq $PGDIR_SHIFT,%rax
- cmpl $ESPFIX_PGD_ENTRY,%eax
- jne do_double_fault /* No, just deliver the fault */
- cmpl $__KERNEL_CS,CS(%rdi)
- jne do_double_fault
- movq RIP(%rdi),%rax
- cmpq $native_irq_return_iret,%rax
- jne do_double_fault /* This shouldn't happen... */
- movq PER_CPU_VAR(kernel_stack),%rax
- subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */
- movq %rax,RSP(%rdi)
- movq $0,(%rax) /* Missing (lost) #GP error code */
- movq $general_protection,RIP(%rdi)
- retq
- CFI_ENDPROC
-END(__do_double_fault)
-#else
-# define __do_double_fault do_double_fault
-#endif
-
/*
* APIC interrupts.
*/
@@ -1124,7 +1080,7 @@ idtentry overflow do_overflow has_error_code=0
idtentry bounds do_bounds has_error_code=0
idtentry invalid_op do_invalid_op has_error_code=0
idtentry device_not_available do_device_not_available has_error_code=0
-idtentry double_fault __do_double_fault has_error_code=1 paranoid=1
+idtentry double_fault do_double_fault has_error_code=1 paranoid=1
idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
idtentry invalid_TSS do_invalid_TSS has_error_code=1
idtentry segment_not_present do_segment_not_present has_error_code=1
@@ -1289,7 +1245,7 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
-idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1
+idtentry stack_segment do_stack_segment has_error_code=1
#ifdef CONFIG_XEN
idtentry xen_debug do_debug has_error_code=0
idtentry xen_int3 do_int3 has_error_code=0
@@ -1399,17 +1355,16 @@ error_sti:
/*
* There are two places in the kernel that can potentially fault with
- * usergs. Handle them here. The exception handlers after iret run with
- * kernel gs again, so don't set the user space flag. B stepping K8s
- * sometimes report an truncated RIP for IRET exceptions returning to
- * compat mode. Check for these here too.
+ * usergs. Handle them here. B stepping K8s sometimes report a
+ * truncated RIP for IRET exceptions returning to compat mode. Check
+ * for these here too.
*/
error_kernelspace:
CFI_REL_OFFSET rcx, RCX+8
incl %ebx
leaq native_irq_return_iret(%rip),%rcx
cmpq %rcx,RIP+8(%rsp)
- je error_swapgs
+ je error_bad_iret
movl %ecx,%eax /* zero extend */
cmpq %rax,RIP+8(%rsp)
je bstep_iret
@@ -1420,7 +1375,15 @@ error_kernelspace:
bstep_iret:
/* Fix truncated RIP */
movq %rcx,RIP+8(%rsp)
- jmp error_swapgs
+ /* fall through */
+
+error_bad_iret:
+ SWAPGS
+ mov %rsp,%rdi
+ call fixup_bad_iret
+ mov %rax,%rsp
+ decl %ebx /* Return to usergs */
+ jmp error_sti
CFI_ENDPROC
END(error_entry)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 749b0e423419..e510618b2e91 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1484,7 +1484,7 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
*/
if (work & _TIF_NOHZ) {
user_exit();
- work &= ~TIF_NOHZ;
+ work &= ~_TIF_NOHZ;
}
#ifdef CONFIG_SECCOMP
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 4d2128ac70bd..668d8f2a8781 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1303,10 +1303,14 @@ static void __ref remove_cpu_from_maps(int cpu)
numa_remove_cpu(cpu);
}
+static DEFINE_PER_CPU(struct completion, die_complete);
+
void cpu_disable_common(void)
{
int cpu = smp_processor_id();
+ init_completion(&per_cpu(die_complete, smp_processor_id()));
+
remove_siblinginfo(cpu);
/* It's now safe to remove this processor from the online map */
@@ -1316,8 +1320,6 @@ void cpu_disable_common(void)
fixup_irqs();
}
-static DEFINE_PER_CPU(struct completion, die_complete);
-
int native_cpu_disable(void)
{
int ret;
@@ -1327,16 +1329,21 @@ int native_cpu_disable(void)
return ret;
clear_local_APIC();
- init_completion(&per_cpu(die_complete, smp_processor_id()));
cpu_disable_common();
return 0;
}
+void cpu_die_common(unsigned int cpu)
+{
+ wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ);
+}
+
void native_cpu_die(unsigned int cpu)
{
/* We don't do anything here: idle task is faking death itself. */
- wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ);
+
+ cpu_die_common(cpu);
/* They ack this in play_dead() by setting CPU_DEAD */
if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 0d0e922fafc1..de801f22128a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -233,32 +233,40 @@ DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op)
DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun)
DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
-#ifdef CONFIG_X86_32
DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
-#endif
DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check)
#ifdef CONFIG_X86_64
/* Runs on IST stack */
-dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
-{
- enum ctx_state prev_state;
-
- prev_state = exception_enter();
- if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
- X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) {
- preempt_conditional_sti(regs);
- do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL);
- preempt_conditional_cli(regs);
- }
- exception_exit(prev_state);
-}
-
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
{
static const char str[] = "double fault";
struct task_struct *tsk = current;
+#ifdef CONFIG_X86_ESPFIX64
+ extern unsigned char native_irq_return_iret[];
+
+ /*
+ * If IRET takes a non-IST fault on the espfix64 stack, then we
+ * end up promoting it to a doublefault. In that case, modify
+ * the stack to make it look like we just entered the #GP
+ * handler from user space, similar to bad_iret.
+ */
+ if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
+ regs->cs == __KERNEL_CS &&
+ regs->ip == (unsigned long)native_irq_return_iret)
+ {
+ struct pt_regs *normal_regs = task_pt_regs(current);
+
+ /* Fake a #GP(0) from userspace. */
+ memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
+ normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
+ regs->ip = (unsigned long)general_protection;
+ regs->sp = (unsigned long)&normal_regs->orig_ax;
+ return;
+ }
+#endif
+
exception_enter();
/* Return not checked because double check cannot be ignored */
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
@@ -399,6 +407,35 @@ asmlinkage __visible struct pt_regs *sync_regs(struct pt_regs *eregs)
return regs;
}
NOKPROBE_SYMBOL(sync_regs);
+
+struct bad_iret_stack {
+ void *error_entry_ret;
+ struct pt_regs regs;
+};
+
+asmlinkage __visible
+struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
+{
+ /*
+ * This is called from entry_64.S early in handling a fault
+ * caused by a bad iret to user mode. To handle the fault
+ * correctly, we want move our stack frame to task_pt_regs
+ * and we want to pretend that the exception came from the
+ * iret target.
+ */
+ struct bad_iret_stack *new_stack =
+ container_of(task_pt_regs(current),
+ struct bad_iret_stack, regs);
+
+ /* Copy the IRET target to the new stack. */
+ memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
+
+ /* Copy the remainder of the stack from the current stack. */
+ memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
+
+ BUG_ON(!user_mode_vm(&new_stack->regs));
+ return new_stack;
+}
#endif
/*
@@ -778,7 +815,7 @@ void __init trap_init(void)
set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
set_intr_gate(X86_TRAP_TS, invalid_TSS);
set_intr_gate(X86_TRAP_NP, segment_not_present);
- set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK);
+ set_intr_gate(X86_TRAP_SS, stack_segment);
set_intr_gate(X86_TRAP_GP, general_protection);
set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
set_intr_gate(X86_TRAP_MF, coprocessor_error);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5edf088ca51e..9f8a2faf5040 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4287,6 +4287,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
fetch_register_operand(op);
break;
case OpCL:
+ op->type = OP_IMM;
op->bytes = 1;
op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff;
break;
@@ -4294,6 +4295,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
rc = decode_imm(ctxt, op, 1, true);
break;
case OpOne:
+ op->type = OP_IMM;
op->bytes = 1;
op->val = 1;
break;
@@ -4352,21 +4354,27 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
ctxt->memop.bytes = ctxt->op_bytes + 2;
goto mem_common;
case OpES:
+ op->type = OP_IMM;
op->val = VCPU_SREG_ES;
break;
case OpCS:
+ op->type = OP_IMM;
op->val = VCPU_SREG_CS;
break;
case OpSS:
+ op->type = OP_IMM;
op->val = VCPU_SREG_SS;
break;
case OpDS:
+ op->type = OP_IMM;
op->val = VCPU_SREG_DS;
break;
case OpFS:
+ op->type = OP_IMM;
op->val = VCPU_SREG_FS;
break;
case OpGS:
+ op->type = OP_IMM;
op->val = VCPU_SREG_GS;
break;
case OpImplicit:
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ac1c4de3a484..978f402006ee 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -630,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
* kvm mmu, before reclaiming the page, we should
* unmap it from mmu first.
*/
- WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
+ WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
kvm_set_pfn_accessed(pfn);
@@ -2461,7 +2461,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= PT_PAGE_SIZE_MASK;
if (tdp_enabled)
spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
- kvm_is_mmio_pfn(pfn));
+ kvm_is_reserved_pfn(pfn));
if (host_writable)
spte |= SPTE_HOST_WRITEABLE;
@@ -2737,7 +2737,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
* PT_PAGE_TABLE_LEVEL and there would be no adjustment done
* here.
*/
- if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
level == PT_PAGE_TABLE_LEVEL &&
PageTransCompound(pfn_to_page(pfn)) &&
!has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index 7609e0e421ec..1318f75d56e4 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -41,9 +41,8 @@ csum_partial_copy_from_user(const void __user *src, void *dst,
while (((unsigned long)src & 6) && len >= 2) {
__u16 val16;
- *errp = __get_user(val16, (const __u16 __user *)src);
- if (*errp)
- return isum;
+ if (__get_user(val16, (const __u16 __user *)src))
+ goto out_err;
*(__u16 *)dst = val16;
isum = (__force __wsum)add32_with_carry(
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 4cb8763868fc..4e5dfec750fc 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1123,7 +1123,7 @@ void mark_rodata_ro(void)
unsigned long end = (unsigned long) &__end_rodata_hpage_align;
unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
- unsigned long all_end = PFN_ALIGN(&_end);
+ unsigned long all_end;
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
@@ -1134,7 +1134,16 @@ void mark_rodata_ro(void)
/*
* The rodata/data/bss/brk section (but not the kernel text!)
* should also be not-executable.
+ *
+ * We align all_end to PMD_SIZE because the existing mapping
+ * is a full PMD. If we would align _brk_end to PAGE_SIZE we
+ * split the PMD and the reminder between _brk_end and the end
+ * of the PMD will remain mapped executable.
+ *
+ * Any PMD which was setup after the one which covers _brk_end
+ * has been zapped already via cleanup_highmem().
*/
+ all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
rodata_test();
diff --git a/arch/x86/tools/calc_run_size.pl b/arch/x86/tools/calc_run_size.pl
new file mode 100644
index 000000000000..23210baade2d
--- /dev/null
+++ b/arch/x86/tools/calc_run_size.pl
@@ -0,0 +1,39 @@
+#!/usr/bin/perl
+#
+# Calculate the amount of space needed to run the kernel, including room for
+# the .bss and .brk sections.
+#
+# Usage:
+# objdump -h a.out | perl calc_run_size.pl
+use strict;
+
+my $mem_size = 0;
+my $file_offset = 0;
+
+my $sections=" *[0-9]+ \.(?:bss|brk) +";
+while (<>) {
+ if (/^$sections([0-9a-f]+) +(?:[0-9a-f]+ +){2}([0-9a-f]+)/) {
+ my $size = hex($1);
+ my $offset = hex($2);
+ $mem_size += $size;
+ if ($file_offset == 0) {
+ $file_offset = $offset;
+ } elsif ($file_offset != $offset) {
+ # BFD linker shows the same file offset in ELF.
+ # Gold linker shows them as consecutive.
+ next if ($file_offset + $mem_size == $offset + $size);
+
+ printf STDERR "file_offset: 0x%lx\n", $file_offset;
+ printf STDERR "mem_size: 0x%lx\n", $mem_size;
+ printf STDERR "offset: 0x%lx\n", $offset;
+ printf STDERR "size: 0x%lx\n", $size;
+
+ die ".bss and .brk are non-contiguous\n";
+ }
+ }
+}
+
+if ($file_offset == 0) {
+ die "Never found .bss or .brk file offset\n";
+}
+printf("%d\n", $mem_size + $file_offset);
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 8650cdb53209..4c071aeb8417 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -510,6 +510,9 @@ static void xen_cpu_die(unsigned int cpu)
current->state = TASK_UNINTERRUPTIBLE;
schedule_timeout(HZ/10);
}
+
+ cpu_die_common(cpu);
+
xen_smp_intr_free(cpu);
xen_uninit_lock_cpu(cpu);
xen_teardown_timer(cpu);