aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile11
-rw-r--r--arch/x86/kernel/acpi/apei.c5
-rw-r--r--arch/x86/kernel/acpi/boot.c5
-rw-r--r--arch/x86/kernel/alternative.c26
-rw-r--r--arch/x86/kernel/apic/Makefile2
-rw-r--r--arch/x86/kernel/apic/apic.c243
-rw-r--r--arch/x86/kernel/apic/apic_common.c46
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c10
-rw-r--r--arch/x86/kernel/apic/apic_noop.c25
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c12
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c8
-rw-r--r--arch/x86/kernel/apic/htirq.c5
-rw-r--r--arch/x86/kernel/apic/io_apic.c139
-rw-r--r--arch/x86/kernel/apic/probe_32.c29
-rw-r--r--arch/x86/kernel/apic/vector.c1099
-rw-r--r--arch/x86/kernel/apic/x2apic.h9
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c196
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c44
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c65
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c11
-rw-r--r--arch/x86/kernel/cpu/common.c58
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c121
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c64
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.c1
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.h7
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c50
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_monitor.c2
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_rdtgroup.c131
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c9
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c13
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c6
-rw-r--r--arch/x86/kernel/cpu/vmware.c8
-rw-r--r--arch/x86/kernel/crash.c18
-rw-r--r--arch/x86/kernel/espfix_64.c6
-rw-r--r--arch/x86/kernel/fpu/init.c11
-rw-r--r--arch/x86/kernel/fpu/xstate.c43
-rw-r--r--arch/x86/kernel/head_32.S5
-rw-r--r--arch/x86/kernel/head_64.S45
-rw-r--r--arch/x86/kernel/i8259.c1
-rw-r--r--arch/x86/kernel/idt.c14
-rw-r--r--arch/x86/kernel/irq.c101
-rw-r--r--arch/x86/kernel/irqinit.c4
-rw-r--r--arch/x86/kernel/kprobes/common.h6
-rw-r--r--arch/x86/kernel/kprobes/core.c61
-rw-r--r--arch/x86/kernel/kprobes/ftrace.c32
-rw-r--r--arch/x86/kernel/kprobes/opt.c79
-rw-r--r--arch/x86/kernel/kvm.c49
-rw-r--r--arch/x86/kernel/kvmclock.c65
-rw-r--r--arch/x86/kernel/ldt.c18
-rw-r--r--arch/x86/kernel/nmi.c2
-rw-r--r--arch/x86/kernel/paravirt.c14
-rw-r--r--arch/x86/kernel/pci-calgary_64.c8
-rw-r--r--arch/x86/kernel/pmem.c2
-rw-r--r--arch/x86/kernel/process.c8
-rw-r--r--arch/x86/kernel/process_32.c6
-rw-r--r--arch/x86/kernel/process_64.c5
-rw-r--r--arch/x86/kernel/setup.c51
-rw-r--r--arch/x86/kernel/smpboot.c120
-rw-r--r--arch/x86/kernel/stacktrace.c10
-rw-r--r--arch/x86/kernel/time.c5
-rw-r--r--arch/x86/kernel/traps.c21
-rw-r--r--arch/x86/kernel/tsc.c47
-rw-r--r--arch/x86/kernel/tsc_sync.c56
-rw-r--r--arch/x86/kernel/umip.c366
-rw-r--r--arch/x86/kernel/unwind_orc.c2
-rw-r--r--arch/x86/kernel/uprobes.c15
-rw-r--r--arch/x86/kernel/verify_cpu.S3
-rw-r--r--arch/x86/kernel/vm86_32.c20
-rw-r--r--arch/x86/kernel/vsmp_64.c19
-rw-r--r--arch/x86/kernel/x86_init.c11
71 files changed, 2380 insertions, 1430 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5f70044340ff..81bb565f4497 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,9 +25,9 @@ endif
KASAN_SANITIZE_head$(BITS).o := n
KASAN_SANITIZE_dumpstack.o := n
KASAN_SANITIZE_dumpstack_$(BITS).o := n
-KASAN_SANITIZE_stacktrace.o := n
+KASAN_SANITIZE_stacktrace.o := n
+KASAN_SANITIZE_paravirt.o := n
-OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_test_nx.o := y
@@ -127,10 +127,11 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o
+obj-$(CONFIG_X86_INTEL_UMIP) += umip.o
-obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o
-obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o
-obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o
+obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
+obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
+obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
###
# 64 bit specific files
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c
index ea3046e0b0cf..bb8d300fecbd 100644
--- a/arch/x86/kernel/acpi/apei.c
+++ b/arch/x86/kernel/acpi/apei.c
@@ -52,8 +52,3 @@ void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
apei_mce_report_mem_error(sev, mem_err);
#endif
}
-
-void arch_apei_flush_tlb_one(unsigned long addr)
-{
- __flush_tlb_one(addr);
-}
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 079535e53e2a..ef9e02e614d0 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -961,6 +961,11 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
x86_platform.legacy.rtc = 0;
}
+ if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_VGA) {
+ pr_debug("ACPI: probing for VGA not safe\n");
+ x86_platform.legacy.no_vga = 1;
+ }
+
#ifdef CONFIG_X86_PM_TIMER
/* detect the location of the ACPI PM Timer */
if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) {
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 3344d3382e91..dbaf14d69ebd 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -442,7 +442,6 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end,
{
const s32 *poff;
- mutex_lock(&text_mutex);
for (poff = start; poff < end; poff++) {
u8 *ptr = (u8 *)poff + *poff;
@@ -452,7 +451,6 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end,
if (*ptr == 0x3e)
text_poke(ptr, ((unsigned char []){0xf0}), 1);
}
- mutex_unlock(&text_mutex);
}
static void alternatives_smp_unlock(const s32 *start, const s32 *end,
@@ -460,7 +458,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
{
const s32 *poff;
- mutex_lock(&text_mutex);
for (poff = start; poff < end; poff++) {
u8 *ptr = (u8 *)poff + *poff;
@@ -470,7 +467,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
if (*ptr == 0xf0)
text_poke(ptr, ((unsigned char []){0x3E}), 1);
}
- mutex_unlock(&text_mutex);
}
struct smp_alt_module {
@@ -489,8 +485,7 @@ struct smp_alt_module {
struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
-static DEFINE_MUTEX(smp_alt);
-static bool uniproc_patched = false; /* protected by smp_alt */
+static bool uniproc_patched = false; /* protected by text_mutex */
void __init_or_module alternatives_smp_module_add(struct module *mod,
char *name,
@@ -499,7 +494,7 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
{
struct smp_alt_module *smp;
- mutex_lock(&smp_alt);
+ mutex_lock(&text_mutex);
if (!uniproc_patched)
goto unlock;
@@ -526,14 +521,14 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
smp_unlock:
alternatives_smp_unlock(locks, locks_end, text, text_end);
unlock:
- mutex_unlock(&smp_alt);
+ mutex_unlock(&text_mutex);
}
void __init_or_module alternatives_smp_module_del(struct module *mod)
{
struct smp_alt_module *item;
- mutex_lock(&smp_alt);
+ mutex_lock(&text_mutex);
list_for_each_entry(item, &smp_alt_modules, next) {
if (mod != item->mod)
continue;
@@ -541,7 +536,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
kfree(item);
break;
}
- mutex_unlock(&smp_alt);
+ mutex_unlock(&text_mutex);
}
void alternatives_enable_smp(void)
@@ -551,7 +546,7 @@ void alternatives_enable_smp(void)
/* Why bother if there are no other CPUs? */
BUG_ON(num_possible_cpus() == 1);
- mutex_lock(&smp_alt);
+ mutex_lock(&text_mutex);
if (uniproc_patched) {
pr_info("switching to SMP code\n");
@@ -563,10 +558,13 @@ void alternatives_enable_smp(void)
mod->text, mod->text_end);
uniproc_patched = false;
}
- mutex_unlock(&smp_alt);
+ mutex_unlock(&text_mutex);
}
-/* Return 1 if the address range is reserved for smp-alternatives */
+/*
+ * Return 1 if the address range is reserved for SMP-alternatives.
+ * Must hold text_mutex.
+ */
int alternatives_text_reserved(void *start, void *end)
{
struct smp_alt_module *mod;
@@ -574,6 +572,8 @@ int alternatives_text_reserved(void *start, void *end)
u8 *text_start = start;
u8 *text_end = end;
+ lockdep_assert_held(&text_mutex);
+
list_for_each_entry(mod, &smp_alt_modules, next) {
if (mod->text > text_end || mod->text_end < text_start)
continue;
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 2fb7309c6900..a9e08924927e 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -7,7 +7,7 @@
# In particualr, smp_apic_timer_interrupt() is called in random places.
KCOV_INSTRUMENT := n
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o vector.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o
obj-y += hw_nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ff891772c9f8..6e272f3ea984 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -211,11 +211,7 @@ static inline int lapic_get_version(void)
*/
static inline int lapic_is_integrated(void)
{
-#ifdef CONFIG_X86_64
- return 1;
-#else
return APIC_INTEGRATED(lapic_get_version());
-#endif
}
/*
@@ -298,14 +294,11 @@ int get_physical_broadcast(void)
*/
int lapic_get_maxlvt(void)
{
- unsigned int v;
-
- v = apic_read(APIC_LVR);
/*
* - we always have APIC integrated on 64bit mode
* - 82489DXs do not report # of LVT entries
*/
- return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
+ return lapic_is_integrated() ? GET_APIC_MAXLVT(apic_read(APIC_LVR)) : 2;
}
/*
@@ -1229,53 +1222,100 @@ void __init sync_Arb_IDs(void)
APIC_INT_LEVELTRIG | APIC_DM_INIT);
}
-/*
- * An initial setup of the virtual wire mode.
- */
-void __init init_bsp_APIC(void)
+enum apic_intr_mode_id apic_intr_mode;
+
+static int __init apic_intr_mode_select(void)
{
- unsigned int value;
+ /* Check kernel option */
+ if (disable_apic) {
+ pr_info("APIC disabled via kernel command line\n");
+ return APIC_PIC;
+ }
- /*
- * Don't do the setup now if we have a SMP BIOS as the
- * through-I/O-APIC virtual wire mode might be active.
- */
- if (smp_found_config || !boot_cpu_has(X86_FEATURE_APIC))
- return;
+ /* Check BIOS */
+#ifdef CONFIG_X86_64
+ /* On 64-bit, the APIC must be integrated, Check local APIC only */
+ if (!boot_cpu_has(X86_FEATURE_APIC)) {
+ disable_apic = 1;
+ pr_info("APIC disabled by BIOS\n");
+ return APIC_PIC;
+ }
+#else
+ /* On 32-bit, the APIC may be integrated APIC or 82489DX */
- /*
- * Do not trust the local APIC being empty at bootup.
- */
- clear_local_APIC();
+ /* Neither 82489DX nor integrated APIC ? */
+ if (!boot_cpu_has(X86_FEATURE_APIC) && !smp_found_config) {
+ disable_apic = 1;
+ return APIC_PIC;
+ }
- /*
- * Enable APIC.
- */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- value |= APIC_SPIV_APIC_ENABLED;
+ /* If the BIOS pretends there is an integrated APIC ? */
+ if (!boot_cpu_has(X86_FEATURE_APIC) &&
+ APIC_INTEGRATED(boot_cpu_apic_version)) {
+ disable_apic = 1;
+ pr_err(FW_BUG "Local APIC %d not detected, force emulation\n",
+ boot_cpu_physical_apicid);
+ return APIC_PIC;
+ }
+#endif
-#ifdef CONFIG_X86_32
- /* This bit is reserved on P4/Xeon and should be cleared */
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
- (boot_cpu_data.x86 == 15))
- value &= ~APIC_SPIV_FOCUS_DISABLED;
- else
+ /* Check MP table or ACPI MADT configuration */
+ if (!smp_found_config) {
+ disable_ioapic_support();
+ if (!acpi_lapic) {
+ pr_info("APIC: ACPI MADT or MP tables are not detected\n");
+ return APIC_VIRTUAL_WIRE_NO_CONFIG;
+ }
+ return APIC_VIRTUAL_WIRE;
+ }
+
+#ifdef CONFIG_SMP
+ /* If SMP should be disabled, then really disable it! */
+ if (!setup_max_cpus) {
+ pr_info("APIC: SMP mode deactivated\n");
+ return APIC_SYMMETRIC_IO_NO_ROUTING;
+ }
+
+ if (read_apic_id() != boot_cpu_physical_apicid) {
+ panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
+ read_apic_id(), boot_cpu_physical_apicid);
+ /* Or can we switch back to PIC here? */
+ }
#endif
- value |= APIC_SPIV_FOCUS_DISABLED;
- value |= SPURIOUS_APIC_VECTOR;
- apic_write(APIC_SPIV, value);
- /*
- * Set up the virtual wire mode.
- */
- apic_write(APIC_LVT0, APIC_DM_EXTINT);
- value = APIC_DM_NMI;
- if (!lapic_is_integrated()) /* 82489DX */
- value |= APIC_LVT_LEVEL_TRIGGER;
- if (apic_extnmi == APIC_EXTNMI_NONE)
- value |= APIC_LVT_MASKED;
- apic_write(APIC_LVT1, value);
+ return APIC_SYMMETRIC_IO;
+}
+
+/* Init the interrupt delivery mode for the BSP */
+void __init apic_intr_mode_init(void)
+{
+ bool upmode = IS_ENABLED(CONFIG_UP_LATE_INIT);
+
+ apic_intr_mode = apic_intr_mode_select();
+
+ switch (apic_intr_mode) {
+ case APIC_PIC:
+ pr_info("APIC: Keep in PIC mode(8259)\n");
+ return;
+ case APIC_VIRTUAL_WIRE:
+ pr_info("APIC: Switch to virtual wire mode setup\n");
+ default_setup_apic_routing();
+ break;
+ case APIC_VIRTUAL_WIRE_NO_CONFIG:
+ pr_info("APIC: Switch to virtual wire mode setup with no configuration\n");
+ upmode = true;
+ default_setup_apic_routing();
+ break;
+ case APIC_SYMMETRIC_IO:
+ pr_info("APIC: Switch to symmetric I/O mode setup\n");
+ default_setup_apic_routing();
+ break;
+ case APIC_SYMMETRIC_IO_NO_ROUTING:
+ pr_info("APIC: Switch to symmetric I/O mode setup in no SMP routine\n");
+ break;
+ }
+
+ apic_bsp_setup(upmode);
}
static void lapic_setup_esr(void)
@@ -1473,7 +1513,7 @@ void setup_local_APIC(void)
/*
* Set up LVT0, LVT1:
*
- * set up through-local-APIC on the BP's LINT0. This is not
+ * set up through-local-APIC on the boot CPU's LINT0. This is not
* strictly necessary in pure symmetric-IO mode, but sometimes
* we delegate interrupts to the 8259A.
*/
@@ -1499,7 +1539,9 @@ void setup_local_APIC(void)
value = APIC_DM_NMI;
else
value = APIC_DM_NMI | APIC_LVT_MASKED;
- if (!lapic_is_integrated()) /* 82489DX */
+
+ /* Is 82489DX ? */
+ if (!lapic_is_integrated())
value |= APIC_LVT_LEVEL_TRIGGER;
apic_write(APIC_LVT1, value);
@@ -1645,7 +1687,7 @@ static __init void try_to_enable_x2apic(int remap_mode)
* under KVM
*/
if (max_physical_apicid > 255 ||
- !hypervisor_x2apic_available()) {
+ !x86_init.hyper.x2apic_available()) {
pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n");
x2apic_disable();
return;
@@ -1885,8 +1927,8 @@ void __init init_apic_mappings(void)
* yeah -- we lie about apic_version
* in case if apic was disabled via boot option
* but it's not a problem for SMP compiled kernel
- * since smp_sanity_check is prepared for such a case
- * and disable smp mode
+ * since apic_intr_mode_select is prepared for such
+ * a case and disable smp mode
*/
boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
}
@@ -2242,44 +2284,6 @@ int hard_smp_processor_id(void)
return read_apic_id();
}
-void default_init_apic_ldr(void)
-{
- unsigned long val;
-
- apic_write(APIC_DFR, APIC_DFR_VALUE);
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
- apic_write(APIC_LDR, val);
-}
-
-int default_cpu_mask_to_apicid(const struct cpumask *mask,
- struct irq_data *irqdata,
- unsigned int *apicid)
-{
- unsigned int cpu = cpumask_first(mask);
-
- if (cpu >= nr_cpu_ids)
- return -EINVAL;
- *apicid = per_cpu(x86_cpu_to_apicid, cpu);
- irq_data_update_effective_affinity(irqdata, cpumask_of(cpu));
- return 0;
-}
-
-int flat_cpu_mask_to_apicid(const struct cpumask *mask,
- struct irq_data *irqdata,
- unsigned int *apicid)
-
-{
- struct cpumask *effmsk = irq_data_get_effective_affinity_mask(irqdata);
- unsigned long cpu_mask = cpumask_bits(mask)[0] & APIC_ALL_CPUS;
-
- if (!cpu_mask)
- return -EINVAL;
- *apicid = (unsigned int)cpu_mask;
- cpumask_bits(effmsk)[0] = cpu_mask;
- return 0;
-}
-
/*
* Override the generic EOI implementation with an optimized version.
* Only called during early boot when only one CPU is active and with
@@ -2322,72 +2326,27 @@ static void __init apic_bsp_up_setup(void)
* Returns:
* apic_id of BSP APIC
*/
-int __init apic_bsp_setup(bool upmode)
+void __init apic_bsp_setup(bool upmode)
{
- int id;
-
connect_bsp_APIC();
if (upmode)
apic_bsp_up_setup();
setup_local_APIC();
- if (x2apic_mode)
- id = apic_read(APIC_LDR);
- else
- id = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
-
enable_IO_APIC();
end_local_APIC_setup();
irq_remap_enable_fault_handling();
setup_IO_APIC();
- /* Setup local timer */
- x86_init.timers.setup_percpu_clockev();
- return id;
-}
-
-/*
- * This initializes the IO-APIC and APIC hardware if this is
- * a UP kernel.
- */
-int __init APIC_init_uniprocessor(void)
-{
- if (disable_apic) {
- pr_info("Apic disabled\n");
- return -1;
- }
-#ifdef CONFIG_X86_64
- if (!boot_cpu_has(X86_FEATURE_APIC)) {
- disable_apic = 1;
- pr_info("Apic disabled by BIOS\n");
- return -1;
- }
-#else
- if (!smp_found_config && !boot_cpu_has(X86_FEATURE_APIC))
- return -1;
-
- /*
- * Complain if the BIOS pretends there is one.
- */
- if (!boot_cpu_has(X86_FEATURE_APIC) &&
- APIC_INTEGRATED(boot_cpu_apic_version)) {
- pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
- boot_cpu_physical_apicid);
- return -1;
- }
-#endif
-
- if (!smp_found_config)
- disable_ioapic_support();
-
- default_setup_apic_routing();
- apic_bsp_setup(true);
- return 0;
}
#ifdef CONFIG_UP_LATE_INIT
void __init up_late_init(void)
{
- APIC_init_uniprocessor();
+ if (apic_intr_mode == APIC_PIC)
+ return;
+
+ /* Setup local timer */
+ x86_init.timers.setup_percpu_clockev();
}
#endif
diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c
new file mode 100644
index 000000000000..a360801779ae
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -0,0 +1,46 @@
+/*
+ * Common functions shared between the various APIC flavours
+ *
+ * SPDX-License-Identifier: GPL-2.0
+ */
+#include <linux/irq.h>
+#include <asm/apic.h>
+
+u32 apic_default_calc_apicid(unsigned int cpu)
+{
+ return per_cpu(x86_cpu_to_apicid, cpu);
+}
+
+u32 apic_flat_calc_apicid(unsigned int cpu)
+{
+ return 1U << cpu;
+}
+
+bool default_check_apicid_used(physid_mask_t *map, int apicid)
+{
+ return physid_isset(apicid, *map);
+}
+
+void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
+{
+ *retmap = *phys_map;
+}
+
+int default_cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
+ return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
+ else
+ return BAD_APICID;
+}
+EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid);
+
+int default_check_phys_apicid_present(int phys_apicid)
+{
+ return physid_isset(phys_apicid, phys_cpu_present_map);
+}
+
+int default_apic_id_valid(int apicid)
+{
+ return (apicid < 255);
+}
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index dedd5a41ba48..aa85690e9b64 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -119,7 +119,7 @@ static unsigned int flat_get_apic_id(unsigned long x)
return (x >> 24) & 0xFF;
}
-static unsigned long set_apic_id(unsigned int id)
+static u32 set_apic_id(unsigned int id)
{
return (id & 0xFF) << 24;
}
@@ -154,12 +154,10 @@ static struct apic apic_flat __ro_after_init = {
.irq_delivery_mode = dest_LowestPrio,
.irq_dest_mode = 1, /* logical */
- .target_cpus = online_target_cpus,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = NULL,
- .vector_allocation_domain = flat_vector_allocation_domain,
.init_apic_ldr = flat_init_apic_ldr,
.ioapic_phys_id_map = NULL,
@@ -172,7 +170,7 @@ static struct apic apic_flat __ro_after_init = {
.get_apic_id = flat_get_apic_id,
.set_apic_id = set_apic_id,
- .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
+ .calc_dest_apicid = apic_flat_calc_apicid,
.send_IPI = default_send_IPI_single,
.send_IPI_mask = flat_send_IPI_mask,
@@ -249,12 +247,10 @@ static struct apic apic_physflat __ro_after_init = {
.irq_delivery_mode = dest_Fixed,
.irq_dest_mode = 0, /* physical */
- .target_cpus = online_target_cpus,
.disable_esr = 0,
.dest_logical = 0,
.check_apicid_used = NULL,
- .vector_allocation_domain = default_vector_allocation_domain,
/* not needed, but shouldn't hurt: */
.init_apic_ldr = flat_init_apic_ldr,
@@ -268,7 +264,7 @@ static struct apic apic_physflat __ro_after_init = {
.get_apic_id = flat_get_apic_id,
.set_apic_id = set_apic_id,
- .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
+ .calc_dest_apicid = apic_default_calc_apicid,
.send_IPI = default_send_IPI_single_phys,
.send_IPI_mask = default_send_IPI_mask_sequence_phys,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index c8d211277315..7b659c4480c9 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -84,20 +84,6 @@ static int noop_apic_id_registered(void)
return physid_isset(0, phys_cpu_present_map);
}
-static const struct cpumask *noop_target_cpus(void)
-{
- /* only BSP here */
- return cpumask_of(0);
-}
-
-static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask,
- const struct cpumask *mask)
-{
- if (cpu != 0)
- pr_warning("APIC: Vector allocated for non-BSP cpu\n");
- cpumask_copy(retmask, cpumask_of(cpu));
-}
-
static u32 noop_apic_read(u32 reg)
{
WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic);
@@ -109,6 +95,13 @@ static void noop_apic_write(u32 reg, u32 v)
WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic);
}
+#ifdef CONFIG_X86_32
+static int noop_x86_32_early_logical_apicid(int cpu)
+{
+ return BAD_APICID;
+}
+#endif
+
struct apic apic_noop __ro_after_init = {
.name = "noop",
.probe = noop_probe,
@@ -121,12 +114,10 @@ struct apic apic_noop __ro_after_init = {
/* logical delivery broadcast to all CPUs: */
.irq_dest_mode = 1,
- .target_cpus = noop_target_cpus,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = default_check_apicid_used,
- .vector_allocation_domain = noop_vector_allocation_domain,
.init_apic_ldr = noop_init_apic_ldr,
.ioapic_phys_id_map = default_ioapic_phys_id_map,
@@ -142,7 +133,7 @@ struct apic apic_noop __ro_after_init = {
.get_apic_id = noop_get_apic_id,
.set_apic_id = NULL,
- .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
+ .calc_dest_apicid = apic_flat_calc_apicid,
.send_IPI = noop_send_IPI,
.send_IPI_mask = noop_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 2fda912219a6..134e04506ab4 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -38,7 +38,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x)
return id;
}
-static unsigned long numachip1_set_apic_id(unsigned int id)
+static u32 numachip1_set_apic_id(unsigned int id)
{
return (id & 0xff) << 24;
}
@@ -51,7 +51,7 @@ static unsigned int numachip2_get_apic_id(unsigned long x)
return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24);
}
-static unsigned long numachip2_set_apic_id(unsigned int id)
+static u32 numachip2_set_apic_id(unsigned int id)
{
return id << 24;
}
@@ -249,12 +249,10 @@ static const struct apic apic_numachip1 __refconst = {
.irq_delivery_mode = dest_Fixed,
.irq_dest_mode = 0, /* physical */
- .target_cpus = online_target_cpus,
.disable_esr = 0,
.dest_logical = 0,
.check_apicid_used = NULL,
- .vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = flat_init_apic_ldr,
.ioapic_phys_id_map = NULL,
@@ -267,7 +265,7 @@ static const struct apic apic_numachip1 __refconst = {
.get_apic_id = numachip1_get_apic_id,
.set_apic_id = numachip1_set_apic_id,
- .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
+ .calc_dest_apicid = apic_default_calc_apicid,
.send_IPI = numachip_send_IPI_one,
.send_IPI_mask = numachip_send_IPI_mask,
@@ -300,12 +298,10 @@ static const struct apic apic_numachip2 __refconst = {
.irq_delivery_mode = dest_Fixed,
.irq_dest_mode = 0, /* physical */
- .target_cpus = online_target_cpus,
.disable_esr = 0,
.dest_logical = 0,
.check_apicid_used = NULL,
- .vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = flat_init_apic_ldr,
.ioapic_phys_id_map = NULL,
@@ -318,7 +314,7 @@ static const struct apic apic_numachip2 __refconst = {
.get_apic_id = numachip2_get_apic_id,
.set_apic_id = numachip2_set_apic_id,
- .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
+ .calc_dest_apicid = apic_default_calc_apicid,
.send_IPI = numachip_send_IPI_one,
.send_IPI_mask = numachip_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index e12fbcfc9571..afee386ff711 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -27,9 +27,9 @@ static int bigsmp_apic_id_registered(void)
return 1;
}
-static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
+static bool bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
{
- return 0;
+ return false;
}
static int bigsmp_early_logical_apicid(int cpu)
@@ -155,12 +155,10 @@ static struct apic apic_bigsmp __ro_after_init = {
/* phys delivery to target CPU: */
.irq_dest_mode = 0,
- .target_cpus = default_target_cpus,
.disable_esr = 1,
.dest_logical = 0,
.check_apicid_used = bigsmp_check_apicid_used,
- .vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = bigsmp_init_apic_ldr,
.ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
@@ -173,7 +171,7 @@ static struct apic apic_bigsmp __ro_after_init = {
.get_apic_id = bigsmp_get_apic_id,
.set_apic_id = NULL,
- .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
+ .calc_dest_apicid = apic_default_calc_apicid,
.send_IPI = default_send_IPI_single_phys,
.send_IPI_mask = default_send_IPI_mask_sequence_phys,
diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c
index 56ccf9346b08..b07075dce8b7 100644
--- a/arch/x86/kernel/apic/htirq.c
+++ b/arch/x86/kernel/apic/htirq.c
@@ -112,8 +112,8 @@ static void htirq_domain_free(struct irq_domain *domain, unsigned int virq,
irq_domain_free_irqs_top(domain, virq, nr_irqs);
}
-static void htirq_domain_activate(struct irq_domain *domain,
- struct irq_data *irq_data)
+static int htirq_domain_activate(struct irq_domain *domain,
+ struct irq_data *irq_data, bool early)
{
struct ht_irq_msg msg;
struct irq_cfg *cfg = irqd_cfg(irq_data);
@@ -132,6 +132,7 @@ static void htirq_domain_activate(struct irq_domain *domain,
HT_IRQ_LOW_MT_ARBITRATED) |
HT_IRQ_LOW_IRQ_MASKED;
write_ht_irq_msg(irq_data->irq, &msg);
+ return 0;
}
static void htirq_domain_deactivate(struct irq_domain *domain,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 3b89b27945ff..201579dc5242 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1014,6 +1014,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain,
info->ioapic_pin))
return -ENOMEM;
} else {
+ info->flags |= X86_IRQ_ALLOC_LEGACY;
irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true,
NULL);
if (irq >= 0) {
@@ -1586,6 +1587,43 @@ static int __init notimercheck(char *s)
}
__setup("no_timer_check", notimercheck);
+static void __init delay_with_tsc(void)
+{
+ unsigned long long start, now;
+ unsigned long end = jiffies + 4;
+
+ start = rdtsc();
+
+ /*
+ * We don't know the TSC frequency yet, but waiting for
+ * 40000000000/HZ TSC cycles is safe:
+ * 4 GHz == 10 jiffies
+ * 1 GHz == 40 jiffies
+ */
+ do {
+ rep_nop();
+ now = rdtsc();
+ } while ((now - start) < 40000000000UL / HZ &&
+ time_before_eq(jiffies, end));
+}
+
+static void __init delay_without_tsc(void)
+{
+ unsigned long end = jiffies + 4;
+ int band = 1;
+
+ /*
+ * We don't know any frequency yet, but waiting for
+ * 40940000000/HZ cycles is safe:
+ * 4 GHz == 10 jiffies
+ * 1 GHz == 40 jiffies
+ * 1 << 1 + 1 << 2 +...+ 1 << 11 = 4094
+ */
+ do {
+ __delay(((1U << band++) * 10000000UL) / HZ);
+ } while (band < 12 && time_before_eq(jiffies, end));
+}
+
/*
* There is a nasty bug in some older SMP boards, their mptable lies
* about the timer IRQ. We do the following to work around the situation:
@@ -1604,8 +1642,12 @@ static int __init timer_irq_works(void)
local_save_flags(flags);
local_irq_enable();
- /* Let ten ticks pass... */
- mdelay((10 * 1000) / HZ);
+
+ if (boot_cpu_has(X86_FEATURE_TSC))
+ delay_with_tsc();
+ else
+ delay_without_tsc();
+
local_irq_restore(flags);
/*
@@ -1821,26 +1863,36 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data)
eoi_ioapic_pin(data->entry.vector, data);
}
+static void ioapic_configure_entry(struct irq_data *irqd)
+{
+ struct mp_chip_data *mpd = irqd->chip_data;
+ struct irq_cfg *cfg = irqd_cfg(irqd);
+ struct irq_pin_list *entry;
+
+ /*
+ * Only update when the parent is the vector domain, don't touch it
+ * if the parent is the remapping domain. Check the installed
+ * ioapic chip to verify that.
+ */
+ if (irqd->chip == &ioapic_chip) {
+ mpd->entry.dest = cfg->dest_apicid;
+ mpd->entry.vector = cfg->vector;
+ }
+ for_each_irq_pin(entry, mpd->irq_2_pin)
+ __ioapic_write_entry(entry->apic, entry->pin, mpd->entry);
+}
+
static int ioapic_set_affinity(struct irq_data *irq_data,
const struct cpumask *mask, bool force)
{
struct irq_data *parent = irq_data->parent_data;
- struct mp_chip_data *data = irq_data->chip_data;
- struct irq_pin_list *entry;
- struct irq_cfg *cfg;
unsigned long flags;
int ret;
ret = parent->chip->irq_set_affinity(parent, mask, force);
raw_spin_lock_irqsave(&ioapic_lock, flags);
- if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
- cfg = irqd_cfg(irq_data);
- data->entry.dest = cfg->dest_apicid;
- data->entry.vector = cfg->vector;
- for_each_irq_pin(entry, data->irq_2_pin)
- __ioapic_write_entry(entry->apic, entry->pin,
- data->entry);
- }
+ if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE)
+ ioapic_configure_entry(irq_data);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return ret;
@@ -2097,7 +2149,7 @@ static inline void __init check_timer(void)
unmask_ioapic_irq(irq_get_irq_data(0));
}
irq_domain_deactivate_irq(irq_data);
- irq_domain_activate_irq(irq_data);
+ irq_domain_activate_irq(irq_data, false);
if (timer_irq_works()) {
if (disable_timer_pin_1 > 0)
clear_IO_APIC_pin(0, pin1);
@@ -2119,7 +2171,7 @@ static inline void __init check_timer(void)
*/
replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2);
irq_domain_deactivate_irq(irq_data);
- irq_domain_activate_irq(irq_data);
+ irq_domain_activate_irq(irq_data, false);
legacy_pic->unmask(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2513,52 +2565,9 @@ int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
}
/*
- * This function currently is only a helper for the i386 smp boot process where
- * we need to reprogram the ioredtbls to cater for the cpus which have come online
- * so mask in all cases should simply be apic->target_cpus()
+ * This function updates target affinity of IOAPIC interrupts to include
+ * the CPUs which came online during SMP bringup.
*/
-#ifdef CONFIG_SMP
-void __init setup_ioapic_dest(void)
-{
- int pin, ioapic, irq, irq_entry;
- const struct cpumask *mask;
- struct irq_desc *desc;
- struct irq_data *idata;
- struct irq_chip *chip;
-
- if (skip_ioapic_setup == 1)
- return;
-
- for_each_ioapic_pin(ioapic, pin) {
- irq_entry = find_irq_entry(ioapic, pin, mp_INT);
- if (irq_entry == -1)
- continue;
-
- irq = pin_2_irq(irq_entry, ioapic, pin, 0);
- if (irq < 0 || !mp_init_irq_at_boot(ioapic, irq))
- continue;
-
- desc = irq_to_desc(irq);
- raw_spin_lock_irq(&desc->lock);
- idata = irq_desc_get_irq_data(desc);
-
- /*
- * Honour affinities which have been set in early boot
- */
- if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata))
- mask = irq_data_get_affinity_mask(idata);
- else
- mask = apic->target_cpus();
-
- chip = irq_data_get_irq_chip(idata);
- /* Might be lapic_chip for irq 0 */
- if (chip->irq_set_affinity)
- chip->irq_set_affinity(idata, mask, false);
- raw_spin_unlock_irq(&desc->lock);
- }
-}
-#endif
-
#define IOAPIC_RESOURCE_NAME_SIZE 11
static struct resource *ioapic_resources;
@@ -2978,17 +2987,15 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
irq_domain_free_irqs_top(domain, virq, nr_irqs);
}
-void mp_irqdomain_activate(struct irq_domain *domain,
- struct irq_data *irq_data)
+int mp_irqdomain_activate(struct irq_domain *domain,
+ struct irq_data *irq_data, bool early)
{
unsigned long flags;
- struct irq_pin_list *entry;
- struct mp_chip_data *data = irq_data->chip_data;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- for_each_irq_pin(entry, data->irq_2_pin)
- __ioapic_write_entry(entry->apic, entry->pin, data->entry);
+ ioapic_configure_entry(irq_data);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ return 0;
}
void mp_irqdomain_deactivate(struct irq_domain *domain,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 63287659adb6..fa22017de806 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -66,6 +66,31 @@ static void setup_apic_flat_routing(void)
#endif
}
+static int default_apic_id_registered(void)
+{
+ return physid_isset(read_apic_id(), phys_cpu_present_map);
+}
+
+/*
+ * Set up the logical destination ID. Intel recommends to set DFR, LDR and
+ * TPR before enabling an APIC. See e.g. "AP-388 82489DX User's Manual"
+ * (Intel document number 292116).
+ */
+static void default_init_apic_ldr(void)
+{
+ unsigned long val;
+
+ apic_write(APIC_DFR, APIC_DFR_VALUE);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
+ apic_write(APIC_LDR, val);
+}
+
+static int default_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
/* should be called last. */
static int probe_default(void)
{
@@ -84,12 +109,10 @@ static struct apic apic_default __ro_after_init = {
/* logical delivery broadcast to all CPUs: */
.irq_dest_mode = 1,
- .target_cpus = default_target_cpus,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = default_check_apicid_used,
- .vector_allocation_domain = flat_vector_allocation_domain,
.init_apic_ldr = default_init_apic_ldr,
.ioapic_phys_id_map = default_ioapic_phys_id_map,
@@ -102,7 +125,7 @@ static struct apic apic_default __ro_after_init = {
.get_apic_id = default_get_apic_id,
.set_apic_id = NULL,
- .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
+ .calc_dest_apicid = apic_flat_calc_apicid,
.send_IPI = default_send_IPI_single,
.send_IPI_mask = default_send_IPI_mask_logical,
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 88c214e75a6b..05c85e693a5d 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -11,6 +11,7 @@
* published by the Free Software Foundation.
*/
#include <linux/interrupt.h>
+#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/slab.h>
@@ -21,20 +22,30 @@
#include <asm/desc.h>
#include <asm/irq_remapping.h>
+#include <asm/trace/irq_vectors.h>
+
struct apic_chip_data {
- struct irq_cfg cfg;
- cpumask_var_t domain;
- cpumask_var_t old_domain;
- u8 move_in_progress : 1;
+ struct irq_cfg hw_irq_cfg;
+ unsigned int vector;
+ unsigned int prev_vector;
+ unsigned int cpu;
+ unsigned int prev_cpu;
+ unsigned int irq;
+ struct hlist_node clist;
+ unsigned int move_in_progress : 1,
+ is_managed : 1,
+ can_reserve : 1,
+ has_reserved : 1;
};
struct irq_domain *x86_vector_domain;
EXPORT_SYMBOL_GPL(x86_vector_domain);
static DEFINE_RAW_SPINLOCK(vector_lock);
-static cpumask_var_t vector_cpumask, vector_searchmask, searched_cpumask;
+static cpumask_var_t vector_searchmask;
static struct irq_chip lapic_controller;
-#ifdef CONFIG_X86_IO_APIC
-static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY];
+static struct irq_matrix *vector_matrix;
+#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct hlist_head, cleanup_list);
#endif
void lock_vector_lock(void)
@@ -50,22 +61,37 @@ void unlock_vector_lock(void)
raw_spin_unlock(&vector_lock);
}
-static struct apic_chip_data *apic_chip_data(struct irq_data *irq_data)
+void init_irq_alloc_info(struct irq_alloc_info *info,
+ const struct cpumask *mask)
+{
+ memset(info, 0, sizeof(*info));
+ info->mask = mask;
+}
+
+void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src)
{
- if (!irq_data)
+ if (src)
+ *dst = *src;
+ else
+ memset(dst, 0, sizeof(*dst));
+}
+
+static struct apic_chip_data *apic_chip_data(struct irq_data *irqd)
+{
+ if (!irqd)
return NULL;
- while (irq_data->parent_data)
- irq_data = irq_data->parent_data;
+ while (irqd->parent_data)
+ irqd = irqd->parent_data;
- return irq_data->chip_data;
+ return irqd->chip_data;
}
-struct irq_cfg *irqd_cfg(struct irq_data *irq_data)
+struct irq_cfg *irqd_cfg(struct irq_data *irqd)
{
- struct apic_chip_data *data = apic_chip_data(irq_data);
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
- return data ? &data->cfg : NULL;
+ return apicd ? &apicd->hw_irq_cfg : NULL;
}
EXPORT_SYMBOL_GPL(irqd_cfg);
@@ -76,270 +102,395 @@ struct irq_cfg *irq_cfg(unsigned int irq)
static struct apic_chip_data *alloc_apic_chip_data(int node)
{
- struct apic_chip_data *data;
+ struct apic_chip_data *apicd;
- data = kzalloc_node(sizeof(*data), GFP_KERNEL, node);
- if (!data)
- return NULL;
- if (!zalloc_cpumask_var_node(&data->domain, GFP_KERNEL, node))
- goto out_data;
- if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node))
- goto out_domain;
- return data;
-out_domain:
- free_cpumask_var(data->domain);
-out_data:
- kfree(data);
- return NULL;
-}
-
-static void free_apic_chip_data(struct apic_chip_data *data)
-{
- if (data) {
- free_cpumask_var(data->domain);
- free_cpumask_var(data->old_domain);
- kfree(data);
+ apicd = kzalloc_node(sizeof(*apicd), GFP_KERNEL, node);
+ if (apicd)
+ INIT_HLIST_NODE(&apicd->clist);
+ return apicd;
+}
+
+static void free_apic_chip_data(struct apic_chip_data *apicd)
+{
+ kfree(apicd);
+}
+
+static void apic_update_irq_cfg(struct irq_data *irqd, unsigned int vector,
+ unsigned int cpu)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+
+ lockdep_assert_held(&vector_lock);
+
+ apicd->hw_irq_cfg.vector = vector;
+ apicd->hw_irq_cfg.dest_apicid = apic->calc_dest_apicid(cpu);
+ irq_data_update_effective_affinity(irqd, cpumask_of(cpu));
+ trace_vector_config(irqd->irq, vector, cpu,
+ apicd->hw_irq_cfg.dest_apicid);
+}
+
+static void apic_update_vector(struct irq_data *irqd, unsigned int newvec,
+ unsigned int newcpu)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ struct irq_desc *desc = irq_data_to_desc(irqd);
+
+ lockdep_assert_held(&vector_lock);
+
+ trace_vector_update(irqd->irq, newvec, newcpu, apicd->vector,
+ apicd->cpu);
+
+ /* Setup the vector move, if required */
+ if (apicd->vector && cpu_online(apicd->cpu)) {
+ apicd->move_in_progress = true;
+ apicd->prev_vector = apicd->vector;
+ apicd->prev_cpu = apicd->cpu;
+ } else {
+ apicd->prev_vector = 0;
}
+
+ apicd->vector = newvec;
+ apicd->cpu = newcpu;
+ BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec]));
+ per_cpu(vector_irq, newcpu)[newvec] = desc;
}
-static int __assign_irq_vector(int irq, struct apic_chip_data *d,
- const struct cpumask *mask,
- struct irq_data *irqdata)
+static void vector_assign_managed_shutdown(struct irq_data *irqd)
{
- /*
- * NOTE! The local APIC isn't very good at handling
- * multiple interrupts at the same interrupt level.
- * As the interrupt level is determined by taking the
- * vector number and shifting that right by 4, we
- * want to spread these out a bit so that they don't
- * all fall in the same interrupt level.
- *
- * Also, we've got to be careful not to trash gate
- * 0x80, because int 0x80 is hm, kind of importantish. ;)
- */
- static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
- static int current_offset = VECTOR_OFFSET_START % 16;
- int cpu, vector;
+ unsigned int cpu = cpumask_first(cpu_online_mask);
- /*
- * If there is still a move in progress or the previous move has not
- * been cleaned up completely, tell the caller to come back later.
- */
- if (d->move_in_progress ||
- cpumask_intersects(d->old_domain, cpu_online_mask))
- return -EBUSY;
+ apic_update_irq_cfg(irqd, MANAGED_IRQ_SHUTDOWN_VECTOR, cpu);
+}
- /* Only try and allocate irqs on cpus that are present */
- cpumask_clear(d->old_domain);
- cpumask_clear(searched_cpumask);
- cpu = cpumask_first_and(mask, cpu_online_mask);
- while (cpu < nr_cpu_ids) {
- int new_cpu, offset;
+static int reserve_managed_vector(struct irq_data *irqd)
+{
+ const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ unsigned long flags;
+ int ret;
- /* Get the possible target cpus for @mask/@cpu from the apic */
- apic->vector_allocation_domain(cpu, vector_cpumask, mask);
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ apicd->is_managed = true;
+ ret = irq_matrix_reserve_managed(vector_matrix, affmsk);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ trace_vector_reserve_managed(irqd->irq, ret);
+ return ret;
+}
- /*
- * Clear the offline cpus from @vector_cpumask for searching
- * and verify whether the result overlaps with @mask. If true,
- * then the call to apic->cpu_mask_to_apicid() will
- * succeed as well. If not, no point in trying to find a
- * vector in this mask.
- */
- cpumask_and(vector_searchmask, vector_cpumask, cpu_online_mask);
- if (!cpumask_intersects(vector_searchmask, mask))
- goto next_cpu;
-
- if (cpumask_subset(vector_cpumask, d->domain)) {
- if (cpumask_equal(vector_cpumask, d->domain))
- goto success;
- /*
- * Mark the cpus which are not longer in the mask for
- * cleanup.
- */
- cpumask_andnot(d->old_domain, d->domain, vector_cpumask);
- vector = d->cfg.vector;
- goto update;
- }
+static void reserve_irq_vector_locked(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
- vector = current_vector;
- offset = current_offset;
-next:
- vector += 16;
- if (vector >= FIRST_SYSTEM_VECTOR) {
- offset = (offset + 1) % 16;
- vector = FIRST_EXTERNAL_VECTOR + offset;
- }
+ irq_matrix_reserve(vector_matrix);
+ apicd->can_reserve = true;
+ apicd->has_reserved = true;
+ trace_vector_reserve(irqd->irq, 0);
+ vector_assign_managed_shutdown(irqd);
+}
- /* If the search wrapped around, try the next cpu */
- if (unlikely(current_vector == vector))
- goto next_cpu;
+static int reserve_irq_vector(struct irq_data *irqd)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ reserve_irq_vector_locked(irqd);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return 0;
+}
- if (test_bit(vector, used_vectors))
- goto next;
+static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ bool resvd = apicd->has_reserved;
+ unsigned int cpu = apicd->cpu;
+ int vector = apicd->vector;
- for_each_cpu(new_cpu, vector_searchmask) {
- if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector]))
- goto next;
- }
- /* Found one! */
- current_vector = vector;
- current_offset = offset;
- /* Schedule the old vector for cleanup on all cpus */
- if (d->cfg.vector)
- cpumask_copy(d->old_domain, d->domain);
- for_each_cpu(new_cpu, vector_searchmask)
- per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq);
- goto update;
-
-next_cpu:
- /*
- * We exclude the current @vector_cpumask from the requested
- * @mask and try again with the next online cpu in the
- * result. We cannot modify @mask, so we use @vector_cpumask
- * as a temporary buffer here as it will be reassigned when
- * calling apic->vector_allocation_domain() above.
- */
- cpumask_or(searched_cpumask, searched_cpumask, vector_cpumask);
- cpumask_andnot(vector_cpumask, mask, searched_cpumask);
- cpu = cpumask_first_and(vector_cpumask, cpu_online_mask);
- continue;
- }
- return -ENOSPC;
+ lockdep_assert_held(&vector_lock);
-update:
/*
- * Exclude offline cpus from the cleanup mask and set the
- * move_in_progress flag when the result is not empty.
+ * If the current target CPU is online and in the new requested
+ * affinity mask, there is no point in moving the interrupt from
+ * one CPU to another.
*/
- cpumask_and(d->old_domain, d->old_domain, cpu_online_mask);
- d->move_in_progress = !cpumask_empty(d->old_domain);
- d->cfg.old_vector = d->move_in_progress ? d->cfg.vector : 0;
- d->cfg.vector = vector;
- cpumask_copy(d->domain, vector_cpumask);
-success:
- /*
- * Cache destination APIC IDs into cfg->dest_apicid. This cannot fail
- * as we already established, that mask & d->domain & cpu_online_mask
- * is not empty.
- *
- * vector_searchmask is a subset of d->domain and has the offline
- * cpus masked out.
- */
- cpumask_and(vector_searchmask, vector_searchmask, mask);
- BUG_ON(apic->cpu_mask_to_apicid(vector_searchmask, irqdata,
- &d->cfg.dest_apicid));
+ if (vector && cpu_online(cpu) && cpumask_test_cpu(cpu, dest))
+ return 0;
+
+ vector = irq_matrix_alloc(vector_matrix, dest, resvd, &cpu);
+ if (vector > 0)
+ apic_update_vector(irqd, vector, cpu);
+ trace_vector_alloc(irqd->irq, vector, resvd, vector);
+ return vector;
+}
+
+static int assign_vector_locked(struct irq_data *irqd,
+ const struct cpumask *dest)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ int vector = allocate_vector(irqd, dest);
+
+ if (vector < 0)
+ return vector;
+
+ apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu);
return 0;
}
-static int assign_irq_vector(int irq, struct apic_chip_data *data,
- const struct cpumask *mask,
- struct irq_data *irqdata)
+static int assign_irq_vector(struct irq_data *irqd, const struct cpumask *dest)
{
- int err;
unsigned long flags;
+ int ret;
raw_spin_lock_irqsave(&vector_lock, flags);
- err = __assign_irq_vector(irq, data, mask, irqdata);
+ cpumask_and(vector_searchmask, dest, cpu_online_mask);
+ ret = assign_vector_locked(irqd, vector_searchmask);
raw_spin_unlock_irqrestore(&vector_lock, flags);
- return err;
+ return ret;
}
-static int assign_irq_vector_policy(int irq, int node,
- struct apic_chip_data *data,
- struct irq_alloc_info *info,
- struct irq_data *irqdata)
+static int assign_irq_vector_any_locked(struct irq_data *irqd)
{
- if (info && info->mask)
- return assign_irq_vector(irq, data, info->mask, irqdata);
- if (node != NUMA_NO_NODE &&
- assign_irq_vector(irq, data, cpumask_of_node(node), irqdata) == 0)
+ /* Get the affinity mask - either irq_default_affinity or (user) set */
+ const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
+ int node = irq_data_get_node(irqd);
+
+ if (node == NUMA_NO_NODE)
+ goto all;
+ /* Try the intersection of @affmsk and node mask */
+ cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
+ if (!assign_vector_locked(irqd, vector_searchmask))
+ return 0;
+ /* Try the node mask */
+ if (!assign_vector_locked(irqd, cpumask_of_node(node)))
return 0;
- return assign_irq_vector(irq, data, apic->target_cpus(), irqdata);
+all:
+ /* Try the full affinity mask */
+ cpumask_and(vector_searchmask, affmsk, cpu_online_mask);
+ if (!assign_vector_locked(irqd, vector_searchmask))
+ return 0;
+ /* Try the full online mask */
+ return assign_vector_locked(irqd, cpu_online_mask);
+}
+
+static int
+assign_irq_vector_policy(struct irq_data *irqd, struct irq_alloc_info *info)
+{
+ if (irqd_affinity_is_managed(irqd))
+ return reserve_managed_vector(irqd);
+ if (info->mask)
+ return assign_irq_vector(irqd, info->mask);
+ /*
+ * Make only a global reservation with no guarantee. A real vector
+ * is associated at activation time.
+ */
+ return reserve_irq_vector(irqd);
}
-static void clear_irq_vector(int irq, struct apic_chip_data *data)
+static int
+assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest)
{
- struct irq_desc *desc;
- int cpu, vector;
+ const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ int vector, cpu;
- if (!data->cfg.vector)
+ cpumask_and(vector_searchmask, vector_searchmask, affmsk);
+ cpu = cpumask_first(vector_searchmask);
+ if (cpu >= nr_cpu_ids)
+ return -EINVAL;
+ /* set_affinity might call here for nothing */
+ if (apicd->vector && cpumask_test_cpu(apicd->cpu, vector_searchmask))
+ return 0;
+ vector = irq_matrix_alloc_managed(vector_matrix, cpu);
+ trace_vector_alloc_managed(irqd->irq, vector, vector);
+ if (vector < 0)
+ return vector;
+ apic_update_vector(irqd, vector, cpu);
+ apic_update_irq_cfg(irqd, vector, cpu);
+ return 0;
+}
+
+static void clear_irq_vector(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ bool managed = irqd_affinity_is_managed(irqd);
+ unsigned int vector = apicd->vector;
+
+ lockdep_assert_held(&vector_lock);
+
+ if (!vector)
return;
- vector = data->cfg.vector;
- for_each_cpu_and(cpu, data->domain, cpu_online_mask)
- per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
+ trace_vector_clear(irqd->irq, vector, apicd->cpu, apicd->prev_vector,
+ apicd->prev_cpu);
- data->cfg.vector = 0;
- cpumask_clear(data->domain);
+ per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_UNUSED;
+ irq_matrix_free(vector_matrix, apicd->cpu, vector, managed);
+ apicd->vector = 0;
- /*
- * If move is in progress or the old_domain mask is not empty,
- * i.e. the cleanup IPI has not been processed yet, we need to remove
- * the old references to desc from all cpus vector tables.
- */
- if (!data->move_in_progress && cpumask_empty(data->old_domain))
+ /* Clean up move in progress */
+ vector = apicd->prev_vector;
+ if (!vector)
return;
- desc = irq_to_desc(irq);
- for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
- for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
- vector++) {
- if (per_cpu(vector_irq, cpu)[vector] != desc)
- continue;
- per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
- break;
- }
+ per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_UNUSED;
+ irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed);
+ apicd->prev_vector = 0;
+ apicd->move_in_progress = 0;
+ hlist_del_init(&apicd->clist);
+}
+
+static void x86_vector_deactivate(struct irq_domain *dom, struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ unsigned long flags;
+
+ trace_vector_deactivate(irqd->irq, apicd->is_managed,
+ apicd->can_reserve, false);
+
+ /* Regular fixed assigned interrupt */
+ if (!apicd->is_managed && !apicd->can_reserve)
+ return;
+ /* If the interrupt has a global reservation, nothing to do */
+ if (apicd->has_reserved)
+ return;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ clear_irq_vector(irqd);
+ if (apicd->can_reserve)
+ reserve_irq_vector_locked(irqd);
+ else
+ vector_assign_managed_shutdown(irqd);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+}
+
+static int activate_reserved(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ int ret;
+
+ ret = assign_irq_vector_any_locked(irqd);
+ if (!ret)
+ apicd->has_reserved = false;
+ return ret;
+}
+
+static int activate_managed(struct irq_data *irqd)
+{
+ const struct cpumask *dest = irq_data_get_affinity_mask(irqd);
+ int ret;
+
+ cpumask_and(vector_searchmask, dest, cpu_online_mask);
+ if (WARN_ON_ONCE(cpumask_empty(vector_searchmask))) {
+ /* Something in the core code broke! Survive gracefully */
+ pr_err("Managed startup for irq %u, but no CPU\n", irqd->irq);
+ return EINVAL;
+ }
+
+ ret = assign_managed_vector(irqd, vector_searchmask);
+ /*
+ * This should not happen. The vector reservation got buggered. Handle
+ * it gracefully.
+ */
+ if (WARN_ON_ONCE(ret < 0)) {
+ pr_err("Managed startup irq %u, no vector available\n",
+ irqd->irq);
}
- data->move_in_progress = 0;
+ return ret;
}
-void init_irq_alloc_info(struct irq_alloc_info *info,
- const struct cpumask *mask)
+static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd,
+ bool early)
{
- memset(info, 0, sizeof(*info));
- info->mask = mask;
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ unsigned long flags;
+ int ret = 0;
+
+ trace_vector_activate(irqd->irq, apicd->is_managed,
+ apicd->can_reserve, early);
+
+ /* Nothing to do for fixed assigned vectors */
+ if (!apicd->can_reserve && !apicd->is_managed)
+ return 0;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ if (early || irqd_is_managed_and_shutdown(irqd))
+ vector_assign_managed_shutdown(irqd);
+ else if (apicd->is_managed)
+ ret = activate_managed(irqd);
+ else if (apicd->has_reserved)
+ ret = activate_reserved(irqd);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return ret;
}
-void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src)
+static void vector_free_reserved_and_managed(struct irq_data *irqd)
{
- if (src)
- *dst = *src;
- else
- memset(dst, 0, sizeof(*dst));
+ const struct cpumask *dest = irq_data_get_affinity_mask(irqd);
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+
+ trace_vector_teardown(irqd->irq, apicd->is_managed,
+ apicd->has_reserved);
+
+ if (apicd->has_reserved)
+ irq_matrix_remove_reserved(vector_matrix);
+ if (apicd->is_managed)
+ irq_matrix_remove_managed(vector_matrix, dest);
}
static void x86_vector_free_irqs(struct irq_domain *domain,
unsigned int virq, unsigned int nr_irqs)
{
- struct apic_chip_data *apic_data;
- struct irq_data *irq_data;
+ struct apic_chip_data *apicd;
+ struct irq_data *irqd;
unsigned long flags;
int i;
for (i = 0; i < nr_irqs; i++) {
- irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i);
- if (irq_data && irq_data->chip_data) {
+ irqd = irq_domain_get_irq_data(x86_vector_domain, virq + i);
+ if (irqd && irqd->chip_data) {
raw_spin_lock_irqsave(&vector_lock, flags);
- clear_irq_vector(virq + i, irq_data->chip_data);
- apic_data = irq_data->chip_data;
- irq_domain_reset_irq_data(irq_data);
+ clear_irq_vector(irqd);
+ vector_free_reserved_and_managed(irqd);
+ apicd = irqd->chip_data;
+ irq_domain_reset_irq_data(irqd);
raw_spin_unlock_irqrestore(&vector_lock, flags);
- free_apic_chip_data(apic_data);
-#ifdef CONFIG_X86_IO_APIC
- if (virq + i < nr_legacy_irqs())
- legacy_irq_data[virq + i] = NULL;
-#endif
+ free_apic_chip_data(apicd);
}
}
}
+static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd,
+ struct apic_chip_data *apicd)
+{
+ unsigned long flags;
+ bool realloc = false;
+
+ apicd->vector = ISA_IRQ_VECTOR(virq);
+ apicd->cpu = 0;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ /*
+ * If the interrupt is activated, then it must stay at this vector
+ * position. That's usually the timer interrupt (0).
+ */
+ if (irqd_is_activated(irqd)) {
+ trace_vector_setup(virq, true, 0);
+ apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu);
+ } else {
+ /* Release the vector */
+ apicd->can_reserve = true;
+ clear_irq_vector(irqd);
+ realloc = true;
+ }
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return realloc;
+}
+
static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
unsigned int nr_irqs, void *arg)
{
struct irq_alloc_info *info = arg;
- struct apic_chip_data *data;
- struct irq_data *irq_data;
+ struct apic_chip_data *apicd;
+ struct irq_data *irqd;
int i, err, node;
if (disable_apic)
@@ -350,34 +501,37 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
return -ENOSYS;
for (i = 0; i < nr_irqs; i++) {
- irq_data = irq_domain_get_irq_data(domain, virq + i);
- BUG_ON(!irq_data);
- node = irq_data_get_node(irq_data);
-#ifdef CONFIG_X86_IO_APIC
- if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i])
- data = legacy_irq_data[virq + i];
- else
-#endif
- data = alloc_apic_chip_data(node);
- if (!data) {
+ irqd = irq_domain_get_irq_data(domain, virq + i);
+ BUG_ON(!irqd);
+ node = irq_data_get_node(irqd);
+ WARN_ON_ONCE(irqd->chip_data);
+ apicd = alloc_apic_chip_data(node);
+ if (!apicd) {
err = -ENOMEM;
goto error;
}
- irq_data->chip = &lapic_controller;
- irq_data->chip_data = data;
- irq_data->hwirq = virq + i;
- err = assign_irq_vector_policy(virq + i, node, data, info,
- irq_data);
- if (err)
- goto error;
+ apicd->irq = virq + i;
+ irqd->chip = &lapic_controller;
+ irqd->chip_data = apicd;
+ irqd->hwirq = virq + i;
+ irqd_set_single_target(irqd);
/*
- * If the apic destination mode is physical, then the
- * effective affinity is restricted to a single target
- * CPU. Mark the interrupt accordingly.
+ * Legacy vectors are already assigned when the IOAPIC
+ * takes them over. They stay on the same vector. This is
+ * required for check_timer() to work correctly as it might
+ * switch back to legacy mode. Only update the hardware
+ * config.
*/
- if (!apic->irq_dest_mode)
- irqd_set_single_target(irq_data);
+ if (info->flags & X86_IRQ_ALLOC_LEGACY) {
+ if (!vector_configure_legacy(virq + i, irqd, apicd))
+ continue;
+ }
+
+ err = assign_irq_vector_policy(irqd, info);
+ trace_vector_setup(virq + i, false, err);
+ if (err)
+ goto error;
}
return 0;
@@ -387,9 +541,56 @@ error:
return err;
}
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d,
+ struct irq_data *irqd, int ind)
+{
+ unsigned int cpu, vector, prev_cpu, prev_vector;
+ struct apic_chip_data *apicd;
+ unsigned long flags;
+ int irq;
+
+ if (!irqd) {
+ irq_matrix_debug_show(m, vector_matrix, ind);
+ return;
+ }
+
+ irq = irqd->irq;
+ if (irq < nr_legacy_irqs() && !test_bit(irq, &io_apic_irqs)) {
+ seq_printf(m, "%*sVector: %5d\n", ind, "", ISA_IRQ_VECTOR(irq));
+ seq_printf(m, "%*sTarget: Legacy PIC all CPUs\n", ind, "");
+ return;
+ }
+
+ apicd = irqd->chip_data;
+ if (!apicd) {
+ seq_printf(m, "%*sVector: Not assigned\n", ind, "");
+ return;
+ }
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ cpu = apicd->cpu;
+ vector = apicd->vector;
+ prev_cpu = apicd->prev_cpu;
+ prev_vector = apicd->prev_vector;
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ seq_printf(m, "%*sVector: %5u\n", ind, "", vector);
+ seq_printf(m, "%*sTarget: %5u\n", ind, "", cpu);
+ if (prev_vector) {
+ seq_printf(m, "%*sPrevious vector: %5u\n", ind, "", prev_vector);
+ seq_printf(m, "%*sPrevious target: %5u\n", ind, "", prev_cpu);
+ }
+}
+#endif
+
static const struct irq_domain_ops x86_vector_domain_ops = {
- .alloc = x86_vector_alloc_irqs,
- .free = x86_vector_free_irqs,
+ .alloc = x86_vector_alloc_irqs,
+ .free = x86_vector_free_irqs,
+ .activate = x86_vector_activate,
+ .deactivate = x86_vector_deactivate,
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+ .debug_show = x86_vector_debug_show,
+#endif
};
int __init arch_probe_nr_irqs(void)
@@ -419,35 +620,40 @@ int __init arch_probe_nr_irqs(void)
return legacy_pic->probe();
}
-#ifdef CONFIG_X86_IO_APIC
-static void __init init_legacy_irqs(void)
+void lapic_assign_legacy_vector(unsigned int irq, bool replace)
{
- int i, node = cpu_to_node(0);
- struct apic_chip_data *data;
-
/*
- * For legacy IRQ's, start with assigning irq0 to irq15 to
- * ISA_IRQ_VECTOR(i) for all cpu's.
+ * Use assign system here so it wont get accounted as allocated
+ * and moveable in the cpu hotplug check and it prevents managed
+ * irq reservation from touching it.
*/
- for (i = 0; i < nr_legacy_irqs(); i++) {
- data = legacy_irq_data[i] = alloc_apic_chip_data(node);
- BUG_ON(!data);
+ irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace);
+}
+
+void __init lapic_assign_system_vectors(void)
+{
+ unsigned int i, vector = 0;
- data->cfg.vector = ISA_IRQ_VECTOR(i);
- cpumask_setall(data->domain);
- irq_set_chip_data(i, data);
+ for_each_set_bit_from(vector, system_vectors, NR_VECTORS)
+ irq_matrix_assign_system(vector_matrix, vector, false);
+
+ if (nr_legacy_irqs() > 1)
+ lapic_assign_legacy_vector(PIC_CASCADE_IR, false);
+
+ /* System vectors are reserved, online it */
+ irq_matrix_online(vector_matrix);
+
+ /* Mark the preallocated legacy interrupts */
+ for (i = 0; i < nr_legacy_irqs(); i++) {
+ if (i != PIC_CASCADE_IR)
+ irq_matrix_assign(vector_matrix, ISA_IRQ_VECTOR(i));
}
}
-#else
-static inline void init_legacy_irqs(void) { }
-#endif
int __init arch_early_irq_init(void)
{
struct fwnode_handle *fn;
- init_legacy_irqs();
-
fn = irq_domain_alloc_named_fwnode("VECTOR");
BUG_ON(!fn);
x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops,
@@ -459,100 +665,115 @@ int __init arch_early_irq_init(void)
arch_init_msi_domain(x86_vector_domain);
arch_init_htirq_domain(x86_vector_domain);
- BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL));
- BUG_ON(!alloc_cpumask_var(&searched_cpumask, GFP_KERNEL));
+
+ /*
+ * Allocate the vector matrix allocator data structure and limit the
+ * search area.
+ */
+ vector_matrix = irq_alloc_matrix(NR_VECTORS, FIRST_EXTERNAL_VECTOR,
+ FIRST_SYSTEM_VECTOR);
+ BUG_ON(!vector_matrix);
return arch_early_ioapic_init();
}
-/* Initialize vector_irq on a new cpu */
-static void __setup_vector_irq(int cpu)
+#ifdef CONFIG_SMP
+
+static struct irq_desc *__setup_vector_irq(int vector)
{
- struct apic_chip_data *data;
- struct irq_desc *desc;
- int irq, vector;
+ int isairq = vector - ISA_IRQ_VECTOR(0);
+
+ /* Check whether the irq is in the legacy space */
+ if (isairq < 0 || isairq >= nr_legacy_irqs())
+ return VECTOR_UNUSED;
+ /* Check whether the irq is handled by the IOAPIC */
+ if (test_bit(isairq, &io_apic_irqs))
+ return VECTOR_UNUSED;
+ return irq_to_desc(isairq);
+}
- /* Mark the inuse vectors */
- for_each_irq_desc(irq, desc) {
- struct irq_data *idata = irq_desc_get_irq_data(desc);
+/* Online the local APIC infrastructure and initialize the vectors */
+void lapic_online(void)
+{
+ unsigned int vector;
- data = apic_chip_data(idata);
- if (!data || !cpumask_test_cpu(cpu, data->domain))
- continue;
- vector = data->cfg.vector;
- per_cpu(vector_irq, cpu)[vector] = desc;
- }
- /* Mark the free vectors */
- for (vector = 0; vector < NR_VECTORS; ++vector) {
- desc = per_cpu(vector_irq, cpu)[vector];
- if (IS_ERR_OR_NULL(desc))
- continue;
+ lockdep_assert_held(&vector_lock);
- data = apic_chip_data(irq_desc_get_irq_data(desc));
- if (!cpumask_test_cpu(cpu, data->domain))
- per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
- }
+ /* Online the vector matrix array for this CPU */
+ irq_matrix_online(vector_matrix);
+
+ /*
+ * The interrupt affinity logic never targets interrupts to offline
+ * CPUs. The exception are the legacy PIC interrupts. In general
+ * they are only targeted to CPU0, but depending on the platform
+ * they can be distributed to any online CPU in hardware. The
+ * kernel has no influence on that. So all active legacy vectors
+ * must be installed on all CPUs. All non legacy interrupts can be
+ * cleared.
+ */
+ for (vector = 0; vector < NR_VECTORS; vector++)
+ this_cpu_write(vector_irq[vector], __setup_vector_irq(vector));
}
-/*
- * Setup the vector to irq mappings. Must be called with vector_lock held.
- */
-void setup_vector_irq(int cpu)
+void lapic_offline(void)
{
- int irq;
+ lock_vector_lock();
+ irq_matrix_offline(vector_matrix);
+ unlock_vector_lock();
+}
+
+static int apic_set_affinity(struct irq_data *irqd,
+ const struct cpumask *dest, bool force)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ int err;
- lockdep_assert_held(&vector_lock);
/*
- * On most of the platforms, legacy PIC delivers the interrupts on the
- * boot cpu. But there are certain platforms where PIC interrupts are
- * delivered to multiple cpu's. If the legacy IRQ is handled by the
- * legacy PIC, for the new cpu that is coming online, setup the static
- * legacy vector to irq mapping:
+ * Core code can call here for inactive interrupts. For inactive
+ * interrupts which use managed or reservation mode there is no
+ * point in going through the vector assignment right now as the
+ * activation will assign a vector which fits the destination
+ * cpumask. Let the core code store the destination mask and be
+ * done with it.
*/
- for (irq = 0; irq < nr_legacy_irqs(); irq++)
- per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq_to_desc(irq);
+ if (!irqd_is_activated(irqd) &&
+ (apicd->is_managed || apicd->can_reserve))
+ return IRQ_SET_MASK_OK;
- __setup_vector_irq(cpu);
+ raw_spin_lock(&vector_lock);
+ cpumask_and(vector_searchmask, dest, cpu_online_mask);
+ if (irqd_affinity_is_managed(irqd))
+ err = assign_managed_vector(irqd, vector_searchmask);
+ else
+ err = assign_vector_locked(irqd, vector_searchmask);
+ raw_spin_unlock(&vector_lock);
+ return err ? err : IRQ_SET_MASK_OK;
}
-static int apic_retrigger_irq(struct irq_data *irq_data)
+#else
+# define apic_set_affinity NULL
+#endif
+
+static int apic_retrigger_irq(struct irq_data *irqd)
{
- struct apic_chip_data *data = apic_chip_data(irq_data);
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
unsigned long flags;
- int cpu;
raw_spin_lock_irqsave(&vector_lock, flags);
- cpu = cpumask_first_and(data->domain, cpu_online_mask);
- apic->send_IPI_mask(cpumask_of(cpu), data->cfg.vector);
+ apic->send_IPI(apicd->cpu, apicd->vector);
raw_spin_unlock_irqrestore(&vector_lock, flags);
return 1;
}
-void apic_ack_edge(struct irq_data *data)
+void apic_ack_edge(struct irq_data *irqd)
{
- irq_complete_move(irqd_cfg(data));
- irq_move_irq(data);
+ irq_complete_move(irqd_cfg(irqd));
+ irq_move_irq(irqd);
ack_APIC_irq();
}
-static int apic_set_affinity(struct irq_data *irq_data,
- const struct cpumask *dest, bool force)
-{
- struct apic_chip_data *data = irq_data->chip_data;
- int err, irq = irq_data->irq;
-
- if (!IS_ENABLED(CONFIG_SMP))
- return -EPERM;
-
- if (!cpumask_intersects(dest, cpu_online_mask))
- return -EINVAL;
-
- err = assign_irq_vector(irq, data, dest, irq_data);
- return err ? err : IRQ_SET_MASK_OK;
-}
-
static struct irq_chip lapic_controller = {
.name = "APIC",
.irq_ack = apic_ack_edge,
@@ -561,115 +782,98 @@ static struct irq_chip lapic_controller = {
};
#ifdef CONFIG_SMP
-static void __send_cleanup_vector(struct apic_chip_data *data)
-{
- raw_spin_lock(&vector_lock);
- cpumask_and(data->old_domain, data->old_domain, cpu_online_mask);
- data->move_in_progress = 0;
- if (!cpumask_empty(data->old_domain))
- apic->send_IPI_mask(data->old_domain, IRQ_MOVE_CLEANUP_VECTOR);
- raw_spin_unlock(&vector_lock);
-}
-void send_cleanup_vector(struct irq_cfg *cfg)
+static void free_moved_vector(struct apic_chip_data *apicd)
{
- struct apic_chip_data *data;
+ unsigned int vector = apicd->prev_vector;
+ unsigned int cpu = apicd->prev_cpu;
+ bool managed = apicd->is_managed;
- data = container_of(cfg, struct apic_chip_data, cfg);
- if (data->move_in_progress)
- __send_cleanup_vector(data);
+ /*
+ * This should never happen. Managed interrupts are not
+ * migrated except on CPU down, which does not involve the
+ * cleanup vector. But try to keep the accounting correct
+ * nevertheless.
+ */
+ WARN_ON_ONCE(managed);
+
+ trace_vector_free_moved(apicd->irq, cpu, vector, managed);
+ irq_matrix_free(vector_matrix, cpu, vector, managed);
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
+ hlist_del_init(&apicd->clist);
+ apicd->prev_vector = 0;
+ apicd->move_in_progress = 0;
}
asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void)
{
- unsigned vector, me;
+ struct hlist_head *clhead = this_cpu_ptr(&cleanup_list);
+ struct apic_chip_data *apicd;
+ struct hlist_node *tmp;
entering_ack_irq();
-
/* Prevent vectors vanishing under us */
raw_spin_lock(&vector_lock);
- me = smp_processor_id();
- for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
- struct apic_chip_data *data;
- struct irq_desc *desc;
- unsigned int irr;
-
- retry:
- desc = __this_cpu_read(vector_irq[vector]);
- if (IS_ERR_OR_NULL(desc))
- continue;
-
- if (!raw_spin_trylock(&desc->lock)) {
- raw_spin_unlock(&vector_lock);
- cpu_relax();
- raw_spin_lock(&vector_lock);
- goto retry;
- }
-
- data = apic_chip_data(irq_desc_get_irq_data(desc));
- if (!data)
- goto unlock;
+ hlist_for_each_entry_safe(apicd, tmp, clhead, clist) {
+ unsigned int irr, vector = apicd->prev_vector;
/*
- * Nothing to cleanup if irq migration is in progress
- * or this cpu is not set in the cleanup mask.
- */
- if (data->move_in_progress ||
- !cpumask_test_cpu(me, data->old_domain))
- goto unlock;
-
- /*
- * We have two cases to handle here:
- * 1) vector is unchanged but the target mask got reduced
- * 2) vector and the target mask has changed
- *
- * #1 is obvious, but in #2 we have two vectors with the same
- * irq descriptor: the old and the new vector. So we need to
- * make sure that we only cleanup the old vector. The new
- * vector has the current @vector number in the config and
- * this cpu is part of the target mask. We better leave that
- * one alone.
- */
- if (vector == data->cfg.vector &&
- cpumask_test_cpu(me, data->domain))
- goto unlock;
-
- irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
- /*
- * Check if the vector that needs to be cleanedup is
- * registered at the cpu's IRR. If so, then this is not
- * the best time to clean it up. Lets clean it up in the
+ * Paranoia: Check if the vector that needs to be cleaned
+ * up is registered at the APICs IRR. If so, then this is
+ * not the best time to clean it up. Clean it up in the
* next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
- * to myself.
+ * to this CPU. IRQ_MOVE_CLEANUP_VECTOR is the lowest
+ * priority external vector, so on return from this
+ * interrupt the device interrupt will happen first.
*/
- if (irr & (1 << (vector % 32))) {
+ irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+ if (irr & (1U << (vector % 32))) {
apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
- goto unlock;
+ continue;
}
- __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
- cpumask_clear_cpu(me, data->old_domain);
-unlock:
- raw_spin_unlock(&desc->lock);
+ free_moved_vector(apicd);
}
raw_spin_unlock(&vector_lock);
-
exiting_irq();
}
+static void __send_cleanup_vector(struct apic_chip_data *apicd)
+{
+ unsigned int cpu;
+
+ raw_spin_lock(&vector_lock);
+ apicd->move_in_progress = 0;
+ cpu = apicd->prev_cpu;
+ if (cpu_online(cpu)) {
+ hlist_add_head(&apicd->clist, per_cpu_ptr(&cleanup_list, cpu));
+ apic->send_IPI(cpu, IRQ_MOVE_CLEANUP_VECTOR);
+ } else {
+ apicd->prev_vector = 0;
+ }
+ raw_spin_unlock(&vector_lock);
+}
+
+void send_cleanup_vector(struct irq_cfg *cfg)
+{
+ struct apic_chip_data *apicd;
+
+ apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg);
+ if (apicd->move_in_progress)
+ __send_cleanup_vector(apicd);
+}
+
static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
{
- unsigned me;
- struct apic_chip_data *data;
+ struct apic_chip_data *apicd;
- data = container_of(cfg, struct apic_chip_data, cfg);
- if (likely(!data->move_in_progress))
+ apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg);
+ if (likely(!apicd->move_in_progress))
return;
- me = smp_processor_id();
- if (vector == data->cfg.vector && cpumask_test_cpu(me, data->domain))
- __send_cleanup_vector(data);
+ if (vector == apicd->vector && apicd->cpu == smp_processor_id())
+ __send_cleanup_vector(apicd);
}
void irq_complete_move(struct irq_cfg *cfg)
@@ -682,10 +886,9 @@ void irq_complete_move(struct irq_cfg *cfg)
*/
void irq_force_complete_move(struct irq_desc *desc)
{
- struct irq_data *irqdata;
- struct apic_chip_data *data;
- struct irq_cfg *cfg;
- unsigned int cpu;
+ struct apic_chip_data *apicd;
+ struct irq_data *irqd;
+ unsigned int vector;
/*
* The function is called for all descriptors regardless of which
@@ -696,43 +899,31 @@ void irq_force_complete_move(struct irq_desc *desc)
* Check first that the chip_data is what we expect
* (apic_chip_data) before touching it any further.
*/
- irqdata = irq_domain_get_irq_data(x86_vector_domain,
- irq_desc_get_irq(desc));
- if (!irqdata)
+ irqd = irq_domain_get_irq_data(x86_vector_domain,
+ irq_desc_get_irq(desc));
+ if (!irqd)
return;
- data = apic_chip_data(irqdata);
- cfg = data ? &data->cfg : NULL;
+ raw_spin_lock(&vector_lock);
+ apicd = apic_chip_data(irqd);
+ if (!apicd)
+ goto unlock;
- if (!cfg)
- return;
+ /*
+ * If prev_vector is empty, no action required.
+ */
+ vector = apicd->prev_vector;
+ if (!vector)
+ goto unlock;
/*
- * This is tricky. If the cleanup of @data->old_domain has not been
+ * This is tricky. If the cleanup of the old vector has not been
* done yet, then the following setaffinity call will fail with
* -EBUSY. This can leave the interrupt in a stale state.
*
* All CPUs are stuck in stop machine with interrupts disabled so
* calling __irq_complete_move() would be completely pointless.
- */
- raw_spin_lock(&vector_lock);
- /*
- * Clean out all offline cpus (including the outgoing one) from the
- * old_domain mask.
- */
- cpumask_and(data->old_domain, data->old_domain, cpu_online_mask);
-
- /*
- * If move_in_progress is cleared and the old_domain mask is empty,
- * then there is nothing to cleanup. fixup_irqs() will take care of
- * the stale vectors on the outgoing cpu.
- */
- if (!data->move_in_progress && cpumask_empty(data->old_domain)) {
- raw_spin_unlock(&vector_lock);
- return;
- }
-
- /*
+ *
* 1) The interrupt is in move_in_progress state. That means that we
* have not seen an interrupt since the io_apic was reprogrammed to
* the new vector.
@@ -740,7 +931,7 @@ void irq_force_complete_move(struct irq_desc *desc)
* 2) The interrupt has fired on the new vector, but the cleanup IPIs
* have not been processed yet.
*/
- if (data->move_in_progress) {
+ if (apicd->move_in_progress) {
/*
* In theory there is a race:
*
@@ -774,21 +965,43 @@ void irq_force_complete_move(struct irq_desc *desc)
* area arises.
*/
pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
- irqdata->irq, cfg->old_vector);
+ irqd->irq, vector);
}
- /*
- * If old_domain is not empty, then other cpus still have the irq
- * descriptor set in their vector array. Clean it up.
- */
- for_each_cpu(cpu, data->old_domain)
- per_cpu(vector_irq, cpu)[cfg->old_vector] = VECTOR_UNUSED;
+ free_moved_vector(apicd);
+unlock:
+ raw_spin_unlock(&vector_lock);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Note, this is not accurate accounting, but at least good enough to
+ * prevent that the actual interrupt move will run out of vectors.
+ */
+int lapic_can_unplug_cpu(void)
+{
+ unsigned int rsvd, avl, tomove, cpu = smp_processor_id();
+ int ret = 0;
- /* Cleanup the left overs of the (half finished) move */
- cpumask_clear(data->old_domain);
- data->move_in_progress = 0;
+ raw_spin_lock(&vector_lock);
+ tomove = irq_matrix_allocated(vector_matrix);
+ avl = irq_matrix_available(vector_matrix, true);
+ if (avl < tomove) {
+ pr_warn("CPU %u has %u vectors, %u available. Cannot disable CPU\n",
+ cpu, tomove, avl);
+ ret = -ENOSPC;
+ goto out;
+ }
+ rsvd = irq_matrix_reserved(vector_matrix);
+ if (avl < rsvd) {
+ pr_warn("Reserved vectors %u > available %u. IRQ request may fail\n",
+ rsvd, avl);
+ }
+out:
raw_spin_unlock(&vector_lock);
+ return ret;
}
-#endif
+#endif /* HOTPLUG_CPU */
+#endif /* SMP */
static void __init print_APIC_field(int base)
{
diff --git a/arch/x86/kernel/apic/x2apic.h b/arch/x86/kernel/apic/x2apic.h
new file mode 100644
index 000000000000..b107de381cb5
--- /dev/null
+++ b/arch/x86/kernel/apic/x2apic.h
@@ -0,0 +1,9 @@
+/* Common bits for X2APIC cluster/physical modes. */
+
+int x2apic_apic_id_valid(int apicid);
+int x2apic_apic_id_registered(void);
+void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest);
+unsigned int x2apic_get_apic_id(unsigned long id);
+u32 x2apic_set_apic_id(unsigned int id);
+int x2apic_phys_pkg_id(int initial_apicid, int index_msb);
+void x2apic_send_IPI_self(int vector);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index e216cf3d64d2..622f13ca8a94 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -9,22 +9,24 @@
#include <linux/cpu.h>
#include <asm/smp.h>
-#include <asm/x2apic.h>
+#include "x2apic.h"
+
+struct cluster_mask {
+ unsigned int clusterid;
+ int node;
+ struct cpumask mask;
+};
static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
-static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster);
static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
+static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks);
+static struct cluster_mask *cluster_hotplug_mask;
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
return x2apic_enabled();
}
-static inline u32 x2apic_cluster(int cpu)
-{
- return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
-}
-
static void x2apic_send_IPI(int cpu, int vector)
{
u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
@@ -36,49 +38,34 @@ static void x2apic_send_IPI(int cpu, int vector)
static void
__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
{
- struct cpumask *cpus_in_cluster_ptr;
- struct cpumask *ipi_mask_ptr;
- unsigned int cpu, this_cpu;
+ unsigned int cpu, clustercpu;
+ struct cpumask *tmpmsk;
unsigned long flags;
u32 dest;
x2apic_wrmsr_fence();
-
local_irq_save(flags);
- this_cpu = smp_processor_id();
+ tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask);
+ cpumask_copy(tmpmsk, mask);
+ /* If IPI should not be sent to self, clear current CPU */
+ if (apic_dest != APIC_DEST_ALLINC)
+ cpumask_clear_cpu(smp_processor_id(), tmpmsk);
- /*
- * We are to modify mask, so we need an own copy
- * and be sure it's manipulated with irq off.
- */
- ipi_mask_ptr = this_cpu_cpumask_var_ptr(ipi_mask);
- cpumask_copy(ipi_mask_ptr, mask);
-
- /*
- * The idea is to send one IPI per cluster.
- */
- for_each_cpu(cpu, ipi_mask_ptr) {
- unsigned long i;
+ /* Collapse cpus in a cluster so a single IPI per cluster is sent */
+ for_each_cpu(cpu, tmpmsk) {
+ struct cluster_mask *cmsk = per_cpu(cluster_masks, cpu);
- cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu);
dest = 0;
-
- /* Collect cpus in cluster. */
- for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) {
- if (apic_dest == APIC_DEST_ALLINC || i != this_cpu)
- dest |= per_cpu(x86_cpu_to_logical_apicid, i);
- }
+ for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask)
+ dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu);
if (!dest)
continue;
__x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
- /*
- * Cluster sibling cpus should be discared now so
- * we would not send IPI them second time.
- */
- cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr);
+ /* Remove cluster CPUs from tmpmask */
+ cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask);
}
local_irq_restore(flags);
@@ -105,125 +92,90 @@ static void x2apic_send_IPI_all(int vector)
__x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
}
-static int
-x2apic_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata,
- unsigned int *apicid)
+static u32 x2apic_calc_apicid(unsigned int cpu)
{
- struct cpumask *effmsk = irq_data_get_effective_affinity_mask(irqdata);
- unsigned int cpu;
- u32 dest = 0;
- u16 cluster;
-
- cpu = cpumask_first(mask);
- if (cpu >= nr_cpu_ids)
- return -EINVAL;
-
- dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
- cluster = x2apic_cluster(cpu);
-
- cpumask_clear(effmsk);
- for_each_cpu(cpu, mask) {
- if (cluster != x2apic_cluster(cpu))
- continue;
- dest |= per_cpu(x86_cpu_to_logical_apicid, cpu);
- cpumask_set_cpu(cpu, effmsk);
- }
-
- *apicid = dest;
- return 0;
+ return per_cpu(x86_cpu_to_logical_apicid, cpu);
}
static void init_x2apic_ldr(void)
{
- unsigned int this_cpu = smp_processor_id();
+ struct cluster_mask *cmsk = this_cpu_read(cluster_masks);
+ u32 cluster, apicid = apic_read(APIC_LDR);
unsigned int cpu;
- per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
+ this_cpu_write(x86_cpu_to_logical_apicid, apicid);
+
+ if (cmsk)
+ goto update;
- cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
+ cluster = apicid >> 16;
for_each_online_cpu(cpu) {
- if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
- continue;
- cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu));
- cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu));
+ cmsk = per_cpu(cluster_masks, cpu);
+ /* Matching cluster found. Link and update it. */
+ if (cmsk && cmsk->clusterid == cluster)
+ goto update;
}
+ cmsk = cluster_hotplug_mask;
+ cluster_hotplug_mask = NULL;
+update:
+ this_cpu_write(cluster_masks, cmsk);
+ cpumask_set_cpu(smp_processor_id(), &cmsk->mask);
}
-/*
- * At CPU state changes, update the x2apic cluster sibling info.
- */
-static int x2apic_prepare_cpu(unsigned int cpu)
+static int alloc_clustermask(unsigned int cpu, int node)
{
- if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL))
- return -ENOMEM;
+ if (per_cpu(cluster_masks, cpu))
+ return 0;
+ /*
+ * If a hotplug spare mask exists, check whether it's on the right
+ * node. If not, free it and allocate a new one.
+ */
+ if (cluster_hotplug_mask) {
+ if (cluster_hotplug_mask->node == node)
+ return 0;
+ kfree(cluster_hotplug_mask);
+ }
- if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) {
- free_cpumask_var(per_cpu(cpus_in_cluster, cpu));
+ cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask),
+ GFP_KERNEL, node);
+ if (!cluster_hotplug_mask)
return -ENOMEM;
- }
+ cluster_hotplug_mask->node = node;
+ return 0;
+}
+static int x2apic_prepare_cpu(unsigned int cpu)
+{
+ if (alloc_clustermask(cpu, cpu_to_node(cpu)) < 0)
+ return -ENOMEM;
+ if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL))
+ return -ENOMEM;
return 0;
}
-static int x2apic_dead_cpu(unsigned int this_cpu)
+static int x2apic_dead_cpu(unsigned int dead_cpu)
{
- int cpu;
+ struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu);
- for_each_online_cpu(cpu) {
- if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
- continue;
- cpumask_clear_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu));
- cpumask_clear_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu));
- }
- free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
- free_cpumask_var(per_cpu(ipi_mask, this_cpu));
+ cpumask_clear_cpu(dead_cpu, &cmsk->mask);
+ free_cpumask_var(per_cpu(ipi_mask, dead_cpu));
return 0;
}
static int x2apic_cluster_probe(void)
{
- int cpu = smp_processor_id();
- int ret;
-
if (!x2apic_mode)
return 0;
- ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare",
- x2apic_prepare_cpu, x2apic_dead_cpu);
- if (ret < 0) {
+ if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare",
+ x2apic_prepare_cpu, x2apic_dead_cpu) < 0) {
pr_err("Failed to register X2APIC_PREPARE\n");
return 0;
}
- cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu));
+ init_x2apic_ldr();
return 1;
}
-static const struct cpumask *x2apic_cluster_target_cpus(void)
-{
- return cpu_all_mask;
-}
-
-/*
- * Each x2apic cluster is an allocation domain.
- */
-static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask,
- const struct cpumask *mask)
-{
- /*
- * To minimize vector pressure, default case of boot, device bringup
- * etc will use a single cpu for the interrupt destination.
- *
- * On explicit migration requests coming from irqbalance etc,
- * interrupts will be routed to the x2apic cluster (cluster-id
- * derived from the first cpu in the mask) members specified
- * in the mask.
- */
- if (mask == x2apic_cluster_target_cpus())
- cpumask_copy(retmask, cpumask_of(cpu));
- else
- cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
-}
-
static struct apic apic_x2apic_cluster __ro_after_init = {
.name = "cluster x2apic",
@@ -235,12 +187,10 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.irq_delivery_mode = dest_LowestPrio,
.irq_dest_mode = 1, /* logical */
- .target_cpus = x2apic_cluster_target_cpus,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = NULL,
- .vector_allocation_domain = cluster_vector_allocation_domain,
.init_apic_ldr = init_x2apic_ldr,
.ioapic_phys_id_map = NULL,
@@ -253,7 +203,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = x2apic_set_apic_id,
- .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+ .calc_dest_apicid = x2apic_calc_apicid,
.send_IPI = x2apic_send_IPI,
.send_IPI_mask = x2apic_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index b94d35320f85..f8d9d69994e6 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -7,7 +7,8 @@
#include <linux/dmar.h>
#include <asm/smp.h>
-#include <asm/x2apic.h>
+#include <asm/ipi.h>
+#include "x2apic.h"
int x2apic_phys;
@@ -99,6 +100,43 @@ static int x2apic_phys_probe(void)
return apic == &apic_x2apic_phys;
}
+/* Common x2apic functions, also used by x2apic_cluster */
+int x2apic_apic_id_valid(int apicid)
+{
+ return 1;
+}
+
+int x2apic_apic_id_registered(void)
+{
+ return 1;
+}
+
+void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
+{
+ unsigned long cfg = __prepare_ICR(0, vector, dest);
+ native_x2apic_icr_write(cfg, apicid);
+}
+
+unsigned int x2apic_get_apic_id(unsigned long id)
+{
+ return id;
+}
+
+u32 x2apic_set_apic_id(unsigned int id)
+{
+ return id;
+}
+
+int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
+{
+ return initial_apicid >> index_msb;
+}
+
+void x2apic_send_IPI_self(int vector)
+{
+ apic_write(APIC_SELF_IPI, vector);
+}
+
static struct apic apic_x2apic_phys __ro_after_init = {
.name = "physical x2apic",
@@ -110,12 +148,10 @@ static struct apic apic_x2apic_phys __ro_after_init = {
.irq_delivery_mode = dest_Fixed,
.irq_dest_mode = 0, /* physical */
- .target_cpus = online_target_cpus,
.disable_esr = 0,
.dest_logical = 0,
.check_apicid_used = NULL,
- .vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = init_x2apic_ldr,
.ioapic_phys_id_map = NULL,
@@ -128,7 +164,7 @@ static struct apic apic_x2apic_phys __ro_after_init = {
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = x2apic_set_apic_id,
- .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
+ .calc_dest_apicid = apic_default_calc_apicid,
.send_IPI = x2apic_send_IPI,
.send_IPI_mask = x2apic_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 0d57bb9079c9..e1b8e8bf6b3c 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -154,6 +154,48 @@ static int __init early_get_pnodeid(void)
return pnode;
}
+static void __init uv_tsc_check_sync(void)
+{
+ u64 mmr;
+ int sync_state;
+ int mmr_shift;
+ char *state;
+ bool valid;
+
+ /* Accommodate different UV arch BIOSes */
+ mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR);
+ mmr_shift =
+ is_uv1_hub() ? 0 :
+ is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT;
+ if (mmr_shift)
+ sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK;
+ else
+ sync_state = 0;
+
+ switch (sync_state) {
+ case UVH_TSC_SYNC_VALID:
+ state = "in sync";
+ valid = true;
+ break;
+
+ case UVH_TSC_SYNC_INVALID:
+ state = "unstable";
+ valid = false;
+ break;
+ default:
+ state = "unknown: assuming valid";
+ valid = true;
+ break;
+ }
+ pr_info("UV: TSC sync state from BIOS:0%d(%s)\n", sync_state, state);
+
+ /* Mark flag that says TSC != 0 is valid for socket 0 */
+ if (valid)
+ mark_tsc_async_resets("UV BIOS");
+ else
+ mark_tsc_unstable("UV BIOS");
+}
+
/* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */
#define SMT_LEVEL 0 /* Leaf 0xb SMT level */
@@ -288,6 +330,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
}
pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", oem_id, oem_table_id, uv_system_type, uv_min_hub_revision_id, uv_apic);
+ uv_tsc_check_sync();
return uv_apic;
@@ -525,16 +568,9 @@ static void uv_init_apic_ldr(void)
{
}
-static int
-uv_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata,
- unsigned int *apicid)
+static u32 apic_uv_calc_apicid(unsigned int cpu)
{
- int ret = default_cpu_mask_to_apicid(mask, irqdata, apicid);
-
- if (!ret)
- *apicid |= uv_apicid_hibits;
-
- return ret;
+ return apic_default_calc_apicid(cpu) | uv_apicid_hibits;
}
static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -547,7 +583,7 @@ static unsigned int x2apic_get_apic_id(unsigned long x)
return id;
}
-static unsigned long set_apic_id(unsigned int id)
+static u32 set_apic_id(unsigned int id)
{
/* CHECKME: Do we need to mask out the xapic extra bits? */
return id;
@@ -584,12 +620,10 @@ static struct apic apic_x2apic_uv_x __ro_after_init = {
.irq_delivery_mode = dest_Fixed,
.irq_dest_mode = 0, /* Physical */
- .target_cpus = online_target_cpus,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = NULL,
- .vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = uv_init_apic_ldr,
.ioapic_phys_id_map = NULL,
@@ -602,7 +636,7 @@ static struct apic apic_x2apic_uv_x __ro_after_init = {
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = set_apic_id,
- .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
+ .calc_dest_apicid = apic_uv_calc_apicid,
.send_IPI = uv_send_IPI_one,
.send_IPI_mask = uv_send_IPI_mask,
@@ -920,9 +954,8 @@ static __init void uv_rtc_init(void)
/*
* percpu heartbeat timer
*/
-static void uv_heartbeat(unsigned long ignored)
+static void uv_heartbeat(struct timer_list *timer)
{
- struct timer_list *timer = &uv_scir_info->timer;
unsigned char bits = uv_scir_info->state;
/* Flip heartbeat bit: */
@@ -947,7 +980,7 @@ static int uv_heartbeat_enable(unsigned int cpu)
struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer;
uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
- setup_pinned_timer(timer, uv_heartbeat, cpu);
+ timer_setup(timer, uv_heartbeat, TIMER_PINNED);
timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
add_timer_on(timer, cpu);
uv_cpu_scir_info(cpu)->enabled = 1;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c60922a66385..90cb82dbba57 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -23,6 +23,7 @@ obj-y += rdrand.o
obj-y += match.o
obj-y += bugs.o
obj-$(CONFIG_CPU_FREQ) += aperfmperf.o
+obj-y += cpuid-deps.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index 0ee83321a313..957813e0180d 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -42,10 +42,6 @@ static void aperfmperf_snapshot_khz(void *dummy)
s64 time_delta = ktime_ms_delta(now, s->time);
unsigned long flags;
- /* Don't bother re-computing within the cache threshold time. */
- if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
- return;
-
local_irq_save(flags);
rdmsrl(MSR_IA32_APERF, aperf);
rdmsrl(MSR_IA32_MPERF, mperf);
@@ -74,6 +70,7 @@ static void aperfmperf_snapshot_khz(void *dummy)
unsigned int arch_freq_get_on_cpu(int cpu)
{
+ s64 time_delta;
unsigned int khz;
if (!cpu_khz)
@@ -82,6 +79,12 @@ unsigned int arch_freq_get_on_cpu(int cpu)
if (!static_cpu_has(X86_FEATURE_APERFMPERF))
return 0;
+ /* Don't bother re-computing within the cache threshold time. */
+ time_delta = ktime_ms_delta(ktime_get(), per_cpu(samples.time, cpu));
+ khz = per_cpu(samples.khz, cpu);
+ if (khz && time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
+ return khz;
+
smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
khz = per_cpu(samples.khz, cpu);
if (khz)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c9176bae7fd8..13ae9e5eec2f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -329,6 +329,28 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
}
}
+static __always_inline void setup_umip(struct cpuinfo_x86 *c)
+{
+ /* Check the boot processor, plus build option for UMIP. */
+ if (!cpu_feature_enabled(X86_FEATURE_UMIP))
+ goto out;
+
+ /* Check the current processor's cpuid bits. */
+ if (!cpu_has(c, X86_FEATURE_UMIP))
+ goto out;
+
+ cr4_set_bits(X86_CR4_UMIP);
+
+ return;
+
+out:
+ /*
+ * Make sure UMIP is disabled in case it was enabled in a
+ * previous boot (e.g., via kexec).
+ */
+ cr4_clear_bits(X86_CR4_UMIP);
+}
+
/*
* Protection Keys are not available in 32-bit mode.
*/
@@ -863,8 +885,8 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
* cache alignment.
* The others are not touched to avoid unwanted side effects.
*
- * WARNING: this function is only called on the BP. Don't add code here
- * that is supposed to run on all CPUs.
+ * WARNING: this function is only called on the boot CPU. Don't add code
+ * here that is supposed to run on all CPUs.
*/
static void __init early_identify_cpu(struct cpuinfo_x86 *c)
{
@@ -1147,9 +1169,10 @@ static void identify_cpu(struct cpuinfo_x86 *c)
/* Disable the PN if appropriate */
squash_the_stupid_serial_number(c);
- /* Set up SMEP/SMAP */
+ /* Set up SMEP/SMAP/UMIP */
setup_smep(c);
setup_smap(c);
+ setup_umip(c);
/*
* The vendor-specific functions might have changed features.
@@ -1301,18 +1324,16 @@ void print_cpu_info(struct cpuinfo_x86 *c)
pr_cont(")\n");
}
-static __init int setup_disablecpuid(char *arg)
+/*
+ * clearcpuid= was already parsed in fpu__init_parse_early_param.
+ * But we need to keep a dummy __setup around otherwise it would
+ * show up as an environment variable for init.
+ */
+static __init int setup_clearcpuid(char *arg)
{
- int bit;
-
- if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32)
- setup_clear_cpu_cap(bit);
- else
- return 0;
-
return 1;
}
-__setup("clearcpuid=", setup_disablecpuid);
+__setup("clearcpuid=", setup_clearcpuid);
#ifdef CONFIG_X86_64
DEFINE_PER_CPU_FIRST(union irq_stack_union,
@@ -1572,9 +1593,13 @@ void cpu_init(void)
initialize_tlbstate_and_flush();
enter_lazy_tlb(&init_mm, me);
- load_sp0(t, &current->thread);
+ /*
+ * Initialize the TSS. Don't bother initializing sp0, as the initial
+ * task never enters user mode.
+ */
set_tss_desc(cpu, t);
load_TR_desc();
+
load_mm_ldt(&init_mm);
clear_all_debug_regs();
@@ -1596,7 +1621,6 @@ void cpu_init(void)
int cpu = smp_processor_id();
struct task_struct *curr = current;
struct tss_struct *t = &per_cpu(cpu_tss, cpu);
- struct thread_struct *thread = &curr->thread;
wait_for_master_cpu(cpu);
@@ -1627,9 +1651,13 @@ void cpu_init(void)
initialize_tlbstate_and_flush();
enter_lazy_tlb(&init_mm, curr);
- load_sp0(t, thread);
+ /*
+ * Initialize the TSS. Don't bother initializing sp0, as the initial
+ * task never enters user mode.
+ */
set_tss_desc(cpu, t);
load_TR_desc();
+
load_mm_ldt(&init_mm);
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
new file mode 100644
index 000000000000..904b0a3c4e53
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -0,0 +1,121 @@
+/* Declare dependencies between CPUIDs */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/cpufeature.h>
+
+struct cpuid_dep {
+ unsigned int feature;
+ unsigned int depends;
+};
+
+/*
+ * Table of CPUID features that depend on others.
+ *
+ * This only includes dependencies that can be usefully disabled, not
+ * features part of the base set (like FPU).
+ *
+ * Note this all is not __init / __initdata because it can be
+ * called from cpu hotplug. It shouldn't do anything in this case,
+ * but it's difficult to tell that to the init reference checker.
+ */
+const static struct cpuid_dep cpuid_deps[] = {
+ { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE },
+ { X86_FEATURE_AVX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_PKU, X86_FEATURE_XSAVE },
+ { X86_FEATURE_MPX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE },
+ { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM2, X86_FEATURE_XMM },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, },
+ { X86_FEATURE_F16C, X86_FEATURE_XMM2, },
+ { X86_FEATURE_AES, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_FMA, X86_FEATURE_AVX },
+ { X86_FEATURE_AVX2, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512F, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VAES, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F },
+ {}
+};
+
+static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ /*
+ * Note: This could use the non atomic __*_bit() variants, but the
+ * rest of the cpufeature code uses atomics as well, so keep it for
+ * consistency. Cleanup all of it separately.
+ */
+ if (!c) {
+ clear_cpu_cap(&boot_cpu_data, feature);
+ set_bit(feature, (unsigned long *)cpu_caps_cleared);
+ } else {
+ clear_bit(feature, (unsigned long *)c->x86_capability);
+ }
+}
+
+/* Take the capabilities and the BUG bits into account */
+#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8)
+
+static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ DECLARE_BITMAP(disable, MAX_FEATURE_BITS);
+ const struct cpuid_dep *d;
+ bool changed;
+
+ if (WARN_ON(feature >= MAX_FEATURE_BITS))
+ return;
+
+ clear_feature(c, feature);
+
+ /* Collect all features to disable, handling dependencies */
+ memset(disable, 0, sizeof(disable));
+ __set_bit(feature, disable);
+
+ /* Loop until we get a stable state. */
+ do {
+ changed = false;
+ for (d = cpuid_deps; d->feature; d++) {
+ if (!test_bit(d->depends, disable))
+ continue;
+ if (__test_and_set_bit(d->feature, disable))
+ continue;
+
+ changed = true;
+ clear_feature(c, d->feature);
+ }
+ } while (changed);
+}
+
+void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ do_clear_cpu_cap(c, feature);
+}
+
+void setup_clear_cpu_cap(unsigned int feature)
+{
+ do_clear_cpu_cap(NULL, feature);
+}
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 4fa90006ac68..bea8d3e24f50 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -26,6 +26,12 @@
#include <asm/processor.h>
#include <asm/hypervisor.h>
+extern const struct hypervisor_x86 x86_hyper_vmware;
+extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
+extern const struct hypervisor_x86 x86_hyper_xen_pv;
+extern const struct hypervisor_x86 x86_hyper_xen_hvm;
+extern const struct hypervisor_x86 x86_hyper_kvm;
+
static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
#ifdef CONFIG_XEN_PV
@@ -41,54 +47,52 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
#endif
};
-const struct hypervisor_x86 *x86_hyper;
-EXPORT_SYMBOL(x86_hyper);
+enum x86_hypervisor_type x86_hyper_type;
+EXPORT_SYMBOL(x86_hyper_type);
-static inline void __init
+static inline const struct hypervisor_x86 * __init
detect_hypervisor_vendor(void)
{
- const struct hypervisor_x86 *h, * const *p;
+ const struct hypervisor_x86 *h = NULL, * const *p;
uint32_t pri, max_pri = 0;
for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
- h = *p;
- pri = h->detect();
- if (pri != 0 && pri > max_pri) {
+ pri = (*p)->detect();
+ if (pri > max_pri) {
max_pri = pri;
- x86_hyper = h;
+ h = *p;
}
}
- if (max_pri)
- pr_info("Hypervisor detected: %s\n", x86_hyper->name);
+ if (h)
+ pr_info("Hypervisor detected: %s\n", h->name);
+
+ return h;
}
-void __init init_hypervisor_platform(void)
+static void __init copy_array(const void *src, void *target, unsigned int size)
{
+ unsigned int i, n = size / sizeof(void *);
+ const void * const *from = (const void * const *)src;
+ const void **to = (const void **)target;
- detect_hypervisor_vendor();
-
- if (!x86_hyper)
- return;
-
- if (x86_hyper->init_platform)
- x86_hyper->init_platform();
+ for (i = 0; i < n; i++)
+ if (from[i])
+ to[i] = from[i];
}
-bool __init hypervisor_x2apic_available(void)
+void __init init_hypervisor_platform(void)
{
- return x86_hyper &&
- x86_hyper->x2apic_available &&
- x86_hyper->x2apic_available();
-}
+ const struct hypervisor_x86 *h;
-void hypervisor_pin_vcpu(int cpu)
-{
- if (!x86_hyper)
+ h = detect_hypervisor_vendor();
+
+ if (!h)
return;
- if (x86_hyper->pin_vcpu)
- x86_hyper->pin_vcpu(cpu);
- else
- WARN_ONCE(1, "vcpu pinning requested but not supported!\n");
+ copy_array(&h->init, &x86_init.hyper, sizeof(h->init));
+ copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime));
+
+ x86_hyper_type = h->type;
+ x86_init.hyper.init_platform();
}
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index cd5fc61ba450..88dcf8479013 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -267,6 +267,7 @@ static void rdt_get_cdp_l3_config(int type)
r->num_closid = r_l3->num_closid / 2;
r->cache.cbm_len = r_l3->cache.cbm_len;
r->default_ctrl = r_l3->default_ctrl;
+ r->cache.shareable_bits = r_l3->cache.shareable_bits;
r->data_width = (r->cache.cbm_len + 3) / 4;
r->alloc_capable = true;
/*
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index a43a72d8e88e..3397244984f5 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -127,12 +127,15 @@ struct rdtgroup {
#define RFTYPE_BASE BIT(1)
#define RF_CTRLSHIFT 4
#define RF_MONSHIFT 5
+#define RF_TOPSHIFT 6
#define RFTYPE_CTRL BIT(RF_CTRLSHIFT)
#define RFTYPE_MON BIT(RF_MONSHIFT)
+#define RFTYPE_TOP BIT(RF_TOPSHIFT)
#define RFTYPE_RES_CACHE BIT(8)
#define RFTYPE_RES_MB BIT(9)
#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL)
#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON)
+#define RF_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP)
#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL)
/* List of all resource groups */
@@ -409,6 +412,10 @@ union cpuid_0x10_x_edx {
unsigned int full;
};
+void rdt_last_cmd_clear(void);
+void rdt_last_cmd_puts(const char *s);
+void rdt_last_cmd_printf(const char *fmt, ...);
+
void rdt_ctrl_update(void *arg);
struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
void rdtgroup_kn_unlock(struct kernfs_node *kn);
diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
index f6ea94f8954a..23e1d5c249c6 100644
--- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -42,15 +42,22 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
/*
* Only linear delay values is supported for current Intel SKUs.
*/
- if (!r->membw.delay_linear)
+ if (!r->membw.delay_linear) {
+ rdt_last_cmd_puts("No support for non-linear MB domains\n");
return false;
+ }
ret = kstrtoul(buf, 10, &bw);
- if (ret)
+ if (ret) {
+ rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf);
return false;
+ }
- if (bw < r->membw.min_bw || bw > r->default_ctrl)
+ if (bw < r->membw.min_bw || bw > r->default_ctrl) {
+ rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw,
+ r->membw.min_bw, r->default_ctrl);
return false;
+ }
*data = roundup(bw, (unsigned long)r->membw.bw_gran);
return true;
@@ -60,8 +67,10 @@ int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d)
{
unsigned long data;
- if (d->have_new_ctrl)
+ if (d->have_new_ctrl) {
+ rdt_last_cmd_printf("duplicate domain %d\n", d->id);
return -EINVAL;
+ }
if (!bw_validate(buf, &data, r))
return -EINVAL;
@@ -84,20 +93,29 @@ static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r)
int ret;
ret = kstrtoul(buf, 16, &val);
- if (ret)
+ if (ret) {
+ rdt_last_cmd_printf("non-hex character in mask %s\n", buf);
return false;
+ }
- if (val == 0 || val > r->default_ctrl)
+ if (val == 0 || val > r->default_ctrl) {
+ rdt_last_cmd_puts("mask out of range\n");
return false;
+ }
first_bit = find_first_bit(&val, cbm_len);
zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
- if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)
+ if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) {
+ rdt_last_cmd_printf("mask %lx has non-consecutive 1-bits\n", val);
return false;
+ }
- if ((zero_bit - first_bit) < r->cache.min_cbm_bits)
+ if ((zero_bit - first_bit) < r->cache.min_cbm_bits) {
+ rdt_last_cmd_printf("Need at least %d bits in mask\n",
+ r->cache.min_cbm_bits);
return false;
+ }
*data = val;
return true;
@@ -111,8 +129,10 @@ int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d)
{
unsigned long data;
- if (d->have_new_ctrl)
+ if (d->have_new_ctrl) {
+ rdt_last_cmd_printf("duplicate domain %d\n", d->id);
return -EINVAL;
+ }
if(!cbm_validate(buf, &data, r))
return -EINVAL;
@@ -139,8 +159,10 @@ next:
return 0;
dom = strsep(&line, ";");
id = strsep(&dom, "=");
- if (!dom || kstrtoul(id, 10, &dom_id))
+ if (!dom || kstrtoul(id, 10, &dom_id)) {
+ rdt_last_cmd_puts("Missing '=' or non-numeric domain\n");
return -EINVAL;
+ }
dom = strim(dom);
list_for_each_entry(d, &r->domains, list) {
if (d->id == dom_id) {
@@ -196,6 +218,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, int closid)
if (!strcmp(resname, r->name) && closid < r->num_closid)
return parse_line(tok, r);
}
+ rdt_last_cmd_printf("unknown/unsupported resource name '%s'\n", resname);
return -EINVAL;
}
@@ -218,6 +241,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
rdtgroup_kn_unlock(of->kn);
return -ENOENT;
}
+ rdt_last_cmd_clear();
closid = rdtgrp->closid;
@@ -229,6 +253,12 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
while ((tok = strsep(&buf, "\n")) != NULL) {
resname = strim(strsep(&tok, ":"));
if (!tok) {
+ rdt_last_cmd_puts("Missing ':'\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (tok[0] == '\0') {
+ rdt_last_cmd_printf("Missing '%s' value\n", resname);
ret = -EINVAL;
goto out;
}
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
index 30827510094b..681450eee428 100644
--- a/arch/x86/kernel/cpu/intel_rdt_monitor.c
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -51,7 +51,7 @@ static LIST_HEAD(rmid_free_lru);
* may have a occupancy value > intel_cqm_threshold. User can change
* the threshold occupancy value.
*/
-unsigned int rmid_limbo_count;
+static unsigned int rmid_limbo_count;
/**
* @rmid_entry - The entry in the limbo and free lists.
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index a869d4a073c5..64c5ff97ee0d 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -24,6 +24,7 @@
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kernfs.h>
+#include <linux/seq_buf.h>
#include <linux/seq_file.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
@@ -51,6 +52,31 @@ static struct kernfs_node *kn_mongrp;
/* Kernel fs node for "mon_data" directory under root */
static struct kernfs_node *kn_mondata;
+static struct seq_buf last_cmd_status;
+static char last_cmd_status_buf[512];
+
+void rdt_last_cmd_clear(void)
+{
+ lockdep_assert_held(&rdtgroup_mutex);
+ seq_buf_clear(&last_cmd_status);
+}
+
+void rdt_last_cmd_puts(const char *s)
+{
+ lockdep_assert_held(&rdtgroup_mutex);
+ seq_buf_puts(&last_cmd_status, s);
+}
+
+void rdt_last_cmd_printf(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ lockdep_assert_held(&rdtgroup_mutex);
+ seq_buf_vprintf(&last_cmd_status, fmt, ap);
+ va_end(ap);
+}
+
/*
* Trivial allocator for CLOSIDs. Since h/w only supports a small number,
* we can keep a bitmap of free CLOSIDs in a single integer.
@@ -238,8 +264,10 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
/* Check whether cpus belong to parent ctrl group */
cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
- if (cpumask_weight(tmpmask))
+ if (cpumask_weight(tmpmask)) {
+ rdt_last_cmd_puts("can only add CPUs to mongroup that belong to parent\n");
return -EINVAL;
+ }
/* Check whether cpus are dropped from this group */
cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
@@ -291,8 +319,10 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
if (cpumask_weight(tmpmask)) {
/* Can't drop from default group */
- if (rdtgrp == &rdtgroup_default)
+ if (rdtgrp == &rdtgroup_default) {
+ rdt_last_cmd_puts("Can't drop CPUs from default group\n");
return -EINVAL;
+ }
/* Give any dropped cpus to rdtgroup_default */
cpumask_or(&rdtgroup_default.cpu_mask,
@@ -357,8 +387,10 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
}
rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ rdt_last_cmd_clear();
if (!rdtgrp) {
ret = -ENOENT;
+ rdt_last_cmd_puts("directory was removed\n");
goto unlock;
}
@@ -367,13 +399,16 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
else
ret = cpumask_parse(buf, newmask);
- if (ret)
+ if (ret) {
+ rdt_last_cmd_puts("bad cpu list/mask\n");
goto unlock;
+ }
/* check that user didn't specify any offline cpus */
cpumask_andnot(tmpmask, newmask, cpu_online_mask);
if (cpumask_weight(tmpmask)) {
ret = -EINVAL;
+ rdt_last_cmd_puts("can only assign online cpus\n");
goto unlock;
}
@@ -452,6 +487,7 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
*/
atomic_dec(&rdtgrp->waitcount);
kfree(callback);
+ rdt_last_cmd_puts("task exited\n");
} else {
/*
* For ctrl_mon groups move both closid and rmid.
@@ -462,10 +498,12 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
tsk->closid = rdtgrp->closid;
tsk->rmid = rdtgrp->mon.rmid;
} else if (rdtgrp->type == RDTMON_GROUP) {
- if (rdtgrp->mon.parent->closid == tsk->closid)
+ if (rdtgrp->mon.parent->closid == tsk->closid) {
tsk->rmid = rdtgrp->mon.rmid;
- else
+ } else {
+ rdt_last_cmd_puts("Can't move task to different control group\n");
ret = -EINVAL;
+ }
}
}
return ret;
@@ -484,8 +522,10 @@ static int rdtgroup_task_write_permission(struct task_struct *task,
*/
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
!uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid))
+ !uid_eq(cred->euid, tcred->suid)) {
+ rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
ret = -EPERM;
+ }
put_cred(tcred);
return ret;
@@ -502,6 +542,7 @@ static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
tsk = find_task_by_vpid(pid);
if (!tsk) {
rcu_read_unlock();
+ rdt_last_cmd_printf("No task %d\n", pid);
return -ESRCH;
}
} else {
@@ -529,6 +570,7 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return -EINVAL;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ rdt_last_cmd_clear();
if (rdtgrp)
ret = rdtgroup_move_task(pid, rdtgrp, of);
@@ -569,6 +611,21 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of,
return ret;
}
+static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ int len;
+
+ mutex_lock(&rdtgroup_mutex);
+ len = seq_buf_used(&last_cmd_status);
+ if (len)
+ seq_printf(seq, "%.*s", len, last_cmd_status_buf);
+ else
+ seq_puts(seq, "ok\n");
+ mutex_unlock(&rdtgroup_mutex);
+ return 0;
+}
+
static int rdt_num_closids_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
@@ -686,6 +743,13 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
/* rdtgroup information files for one cache resource. */
static struct rftype res_common_files[] = {
{
+ .name = "last_cmd_status",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_last_cmd_status_show,
+ .fflags = RF_TOP_INFO,
+ },
+ {
.name = "num_closids",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
@@ -855,6 +919,10 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
return PTR_ERR(kn_info);
kernfs_get(kn_info);
+ ret = rdtgroup_add_files(kn_info, RF_TOP_INFO);
+ if (ret)
+ goto out_destroy;
+
for_each_alloc_enabled_rdt_resource(r) {
fflags = r->fflags | RF_CTRL_INFO;
ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags);
@@ -1081,6 +1149,7 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
struct dentry *dentry;
int ret;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
/*
* resctrl file system can only be mounted once.
@@ -1130,12 +1199,12 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
goto out_mondata;
if (rdt_alloc_capable)
- static_branch_enable(&rdt_alloc_enable_key);
+ static_branch_enable_cpuslocked(&rdt_alloc_enable_key);
if (rdt_mon_capable)
- static_branch_enable(&rdt_mon_enable_key);
+ static_branch_enable_cpuslocked(&rdt_mon_enable_key);
if (rdt_alloc_capable || rdt_mon_capable)
- static_branch_enable(&rdt_enable_key);
+ static_branch_enable_cpuslocked(&rdt_enable_key);
if (is_mbm_enabled()) {
r = &rdt_resources_all[RDT_RESOURCE_L3];
@@ -1156,7 +1225,9 @@ out_info:
out_cdp:
cdp_disable();
out:
+ rdt_last_cmd_clear();
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
return dentry;
}
@@ -1295,9 +1366,7 @@ static void rmdir_all_sub(void)
kfree(rdtgrp);
}
/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
- get_online_cpus();
update_closid_rmid(cpu_online_mask, &rdtgroup_default);
- put_online_cpus();
kernfs_remove(kn_info);
kernfs_remove(kn_mongrp);
@@ -1308,6 +1377,7 @@ static void rdt_kill_sb(struct super_block *sb)
{
struct rdt_resource *r;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
/*Put everything back to default values. */
@@ -1315,11 +1385,12 @@ static void rdt_kill_sb(struct super_block *sb)
reset_all_ctrls(r);
cdp_disable();
rmdir_all_sub();
- static_branch_disable(&rdt_alloc_enable_key);
- static_branch_disable(&rdt_mon_enable_key);
- static_branch_disable(&rdt_enable_key);
+ static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
+ static_branch_disable_cpuslocked(&rdt_mon_enable_key);
+ static_branch_disable_cpuslocked(&rdt_enable_key);
kernfs_kill_sb(sb);
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
}
static struct file_system_type rdt_fs_type = {
@@ -1524,8 +1595,10 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
int ret;
prdtgrp = rdtgroup_kn_lock_live(prgrp_kn);
+ rdt_last_cmd_clear();
if (!prdtgrp) {
ret = -ENODEV;
+ rdt_last_cmd_puts("directory was removed\n");
goto out_unlock;
}
@@ -1533,6 +1606,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
if (!rdtgrp) {
ret = -ENOSPC;
+ rdt_last_cmd_puts("kernel out of memory\n");
goto out_unlock;
}
*r = rdtgrp;
@@ -1544,6 +1618,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
if (IS_ERR(kn)) {
ret = PTR_ERR(kn);
+ rdt_last_cmd_puts("kernfs create error\n");
goto out_free_rgrp;
}
rdtgrp->kn = kn;
@@ -1557,24 +1632,31 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
kernfs_get(kn);
ret = rdtgroup_kn_set_ugid(kn);
- if (ret)
+ if (ret) {
+ rdt_last_cmd_puts("kernfs perm error\n");
goto out_destroy;
+ }
- files = RFTYPE_BASE | RFTYPE_CTRL;
files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype);
ret = rdtgroup_add_files(kn, files);
- if (ret)
+ if (ret) {
+ rdt_last_cmd_puts("kernfs fill error\n");
goto out_destroy;
+ }
if (rdt_mon_capable) {
ret = alloc_rmid();
- if (ret < 0)
+ if (ret < 0) {
+ rdt_last_cmd_puts("out of RMIDs\n");
goto out_destroy;
+ }
rdtgrp->mon.rmid = ret;
ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
- if (ret)
+ if (ret) {
+ rdt_last_cmd_puts("kernfs subdir error\n");
goto out_idfree;
+ }
}
kernfs_activate(kn);
@@ -1652,8 +1734,10 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
kn = rdtgrp->kn;
ret = closid_alloc();
- if (ret < 0)
+ if (ret < 0) {
+ rdt_last_cmd_puts("out of CLOSIDs\n");
goto out_common_fail;
+ }
closid = ret;
rdtgrp->closid = closid;
@@ -1665,8 +1749,10 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
* of tasks and cpus to monitor.
*/
ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL);
- if (ret)
+ if (ret) {
+ rdt_last_cmd_puts("kernfs subdir error\n");
goto out_id_free;
+ }
}
goto out_unlock;
@@ -1902,6 +1988,9 @@ int __init rdtgroup_init(void)
{
int ret = 0;
+ seq_buf_init(&last_cmd_status, last_cmd_status_buf,
+ sizeof(last_cmd_status_buf));
+
ret = rdtgroup_setup_root();
if (ret)
return ret;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 87cc9ab7a13c..4ca632a06e0b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -204,7 +204,7 @@ static int error_context(struct mce *m)
return IN_KERNEL;
}
-static int mce_severity_amd_smca(struct mce *m, int err_ctx)
+static int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
{
u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
u32 low, high;
@@ -245,6 +245,9 @@ static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_exc
if (m->status & MCI_STATUS_UC) {
+ if (ctx == IN_KERNEL)
+ return MCE_PANIC_SEVERITY;
+
/*
* On older systems where overflow_recov flag is not present, we
* should simply panic if an error overflow occurs. If
@@ -255,10 +258,6 @@ static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_exc
if (mce_flags.smca)
return mce_severity_amd_smca(m, ctx);
- /* software can try to contain */
- if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL))
- return MCE_PANIC_SEVERITY;
-
/* kill current process */
return MCE_AR_SEVERITY;
} else {
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3b413065c613..b1d616d08eee 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1367,13 +1367,12 @@ static void __start_timer(struct timer_list *t, unsigned long interval)
local_irq_restore(flags);
}
-static void mce_timer_fn(unsigned long data)
+static void mce_timer_fn(struct timer_list *t)
{
- struct timer_list *t = this_cpu_ptr(&mce_timer);
- int cpu = smp_processor_id();
+ struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
unsigned long iv;
- WARN_ON(cpu != data);
+ WARN_ON(cpu_t != t);
iv = __this_cpu_read(mce_next_interval);
@@ -1763,17 +1762,15 @@ static void mce_start_timer(struct timer_list *t)
static void __mcheck_cpu_setup_timer(void)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
- unsigned int cpu = smp_processor_id();
- setup_pinned_timer(t, mce_timer_fn, cpu);
+ timer_setup(t, mce_timer_fn, TIMER_PINNED);
}
static void __mcheck_cpu_init_timer(void)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
- unsigned int cpu = smp_processor_id();
- setup_pinned_timer(t, mce_timer_fn, cpu);
+ timer_setup(t, mce_timer_fn, TIMER_PINNED);
mce_start_timer(t);
}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 236324e83a3a..85eb5fc180c8 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -254,9 +254,9 @@ static void __init ms_hyperv_init_platform(void)
#endif
}
-const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
+const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.name = "Microsoft Hyper-V",
.detect = ms_hyperv_platform,
- .init_platform = ms_hyperv_init_platform,
+ .type = X86_HYPER_MS_HYPERV,
+ .init.init_platform = ms_hyperv_init_platform,
};
-EXPORT_SYMBOL(x86_hyper_ms_hyperv);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 40ed26852ebd..8e005329648b 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -205,10 +205,10 @@ static bool __init vmware_legacy_x2apic_available(void)
(eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0;
}
-const __refconst struct hypervisor_x86 x86_hyper_vmware = {
+const __initconst struct hypervisor_x86 x86_hyper_vmware = {
.name = "VMware",
.detect = vmware_platform,
- .init_platform = vmware_platform_setup,
- .x2apic_available = vmware_legacy_x2apic_available,
+ .type = X86_HYPER_VMWARE,
+ .init.init_platform = vmware_platform_setup,
+ .init.x2apic_available = vmware_legacy_x2apic_available,
};
-EXPORT_SYMBOL(x86_hyper_vmware);
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 44404e2307bb..10e74d4778a1 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -209,7 +209,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
}
#ifdef CONFIG_KEXEC_FILE
-static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg)
+static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
{
unsigned int *nr_ranges = arg;
@@ -342,7 +342,7 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced,
return ret;
}
-static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg)
+static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
{
struct crash_elf_data *ced = arg;
Elf64_Ehdr *ehdr;
@@ -355,7 +355,7 @@ static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg)
ehdr = ced->ehdr;
/* Exclude unwanted mem ranges */
- ret = elf_header_exclude_ranges(ced, start, end);
+ ret = elf_header_exclude_ranges(ced, res->start, res->end);
if (ret)
return ret;
@@ -518,14 +518,14 @@ static int add_e820_entry(struct boot_params *params, struct e820_entry *entry)
return 0;
}
-static int memmap_entry_callback(u64 start, u64 end, void *arg)
+static int memmap_entry_callback(struct resource *res, void *arg)
{
struct crash_memmap_data *cmd = arg;
struct boot_params *params = cmd->params;
struct e820_entry ei;
- ei.addr = start;
- ei.size = end - start + 1;
+ ei.addr = res->start;
+ ei.size = resource_size(res);
ei.type = cmd->type;
add_e820_entry(params, &ei);
@@ -619,12 +619,12 @@ out:
return ret;
}
-static int determine_backup_region(u64 start, u64 end, void *arg)
+static int determine_backup_region(struct resource *res, void *arg)
{
struct kimage *image = arg;
- image->arch.backup_src_start = start;
- image->arch.backup_src_sz = end - start + 1;
+ image->arch.backup_src_start = res->start;
+ image->arch.backup_src_sz = resource_size(res);
/* Expecting only one range for backup region */
return 1;
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 9c4e7ba6870c..7d7715dde901 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -155,14 +155,14 @@ void init_espfix_ap(int cpu)
page = cpu/ESPFIX_STACKS_PER_PAGE;
/* Did another CPU already set this up? */
- stack_page = ACCESS_ONCE(espfix_pages[page]);
+ stack_page = READ_ONCE(espfix_pages[page]);
if (likely(stack_page))
goto done;
mutex_lock(&espfix_init_mutex);
/* Did we race on the lock? */
- stack_page = ACCESS_ONCE(espfix_pages[page]);
+ stack_page = READ_ONCE(espfix_pages[page]);
if (stack_page)
goto unlock_done;
@@ -200,7 +200,7 @@ void init_espfix_ap(int cpu)
set_pte(&pte_p[n*PTE_STRIDE], pte);
/* Job is done for this CPU and any CPU which shares this page */
- ACCESS_ONCE(espfix_pages[page]) = stack_page;
+ WRITE_ONCE(espfix_pages[page], stack_page);
unlock_done:
mutex_unlock(&espfix_init_mutex);
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 7affb7e3d9a5..6abd83572b01 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -249,6 +249,10 @@ static void __init fpu__init_system_ctx_switch(void)
*/
static void __init fpu__init_parse_early_param(void)
{
+ char arg[32];
+ char *argptr = arg;
+ int bit;
+
if (cmdline_find_option_bool(boot_command_line, "no387"))
setup_clear_cpu_cap(X86_FEATURE_FPU);
@@ -266,6 +270,13 @@ static void __init fpu__init_parse_early_param(void)
if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+
+ if (cmdline_find_option(boot_command_line, "clearcpuid", arg,
+ sizeof(arg)) &&
+ get_option(&argptr, &bit) &&
+ bit >= 0 &&
+ bit < NCAPINTS * 32)
+ setup_clear_cpu_cap(bit);
}
/*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index f1d5476c9022..87a57b7642d3 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -15,6 +15,7 @@
#include <asm/fpu/xstate.h>
#include <asm/tlbflush.h>
+#include <asm/cpufeature.h>
/*
* Although we spell it out in here, the Processor Trace
@@ -36,6 +37,19 @@ static const char *xfeature_names[] =
"unknown xstate feature" ,
};
+static short xsave_cpuid_features[] __initdata = {
+ X86_FEATURE_FPU,
+ X86_FEATURE_XMM,
+ X86_FEATURE_AVX,
+ X86_FEATURE_MPX,
+ X86_FEATURE_MPX,
+ X86_FEATURE_AVX512F,
+ X86_FEATURE_AVX512F,
+ X86_FEATURE_AVX512F,
+ X86_FEATURE_INTEL_PT,
+ X86_FEATURE_PKU,
+};
+
/*
* Mask of xstate features supported by the CPU and the kernel:
*/
@@ -59,26 +73,6 @@ unsigned int fpu_user_xstate_size;
void fpu__xstate_clear_all_cpu_caps(void)
{
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
- setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
- setup_clear_cpu_cap(X86_FEATURE_XSAVES);
- setup_clear_cpu_cap(X86_FEATURE_AVX);
- setup_clear_cpu_cap(X86_FEATURE_AVX2);
- setup_clear_cpu_cap(X86_FEATURE_AVX512F);
- setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA);
- setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
- setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
- setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
- setup_clear_cpu_cap(X86_FEATURE_AVX512DQ);
- setup_clear_cpu_cap(X86_FEATURE_AVX512BW);
- setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
- setup_clear_cpu_cap(X86_FEATURE_MPX);
- setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
- setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI);
- setup_clear_cpu_cap(X86_FEATURE_PKU);
- setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW);
- setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS);
- setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ);
}
/*
@@ -726,6 +720,7 @@ void __init fpu__init_system_xstate(void)
unsigned int eax, ebx, ecx, edx;
static int on_boot_cpu __initdata = 1;
int err;
+ int i;
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
@@ -759,6 +754,14 @@ void __init fpu__init_system_xstate(void)
goto out_disable;
}
+ /*
+ * Clear XSAVE features that are disabled in the normal CPUID.
+ */
+ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
+ if (!boot_cpu_has(xsave_cpuid_features[i]))
+ xfeatures_mask &= ~BIT(i);
+ }
+
xfeatures_mask &= fpu__get_supported_xfeatures_mask();
/* Enable xstate instructions to be able to continue with initialization: */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f1d528bb66a6..c29020907886 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -212,9 +212,6 @@ ENTRY(startup_32_smp)
#endif
.Ldefault_entry:
-#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
- X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
- X86_CR0_PG)
movl $(CR0_STATE & ~X86_CR0_PG),%eax
movl %eax,%cr0
@@ -402,7 +399,7 @@ ENTRY(early_idt_handler_array)
# 24(%rsp) error code
i = 0
.rept NUM_EXCEPTION_VECTORS
- .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
+ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
pushl $0 # Dummy error code, to make stack frame uniform
.endif
pushl $i # 20(%esp) Vector number
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6dde3f3fc1f8..7dca675fe78d 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -38,11 +38,12 @@
*
*/
-#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
+#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
+#endif
L3_START_KERNEL = pud_index(__START_KERNEL_map)
.text
@@ -50,6 +51,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
.code64
.globl startup_64
startup_64:
+ UNWIND_HINT_EMPTY
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded an identity mapped page table
@@ -89,6 +91,7 @@ startup_64:
addq $(early_top_pgt - __START_KERNEL_map), %rax
jmp 1f
ENTRY(secondary_startup_64)
+ UNWIND_HINT_EMPTY
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded a mapped page table.
@@ -133,6 +136,7 @@ ENTRY(secondary_startup_64)
movq $1f, %rax
jmp *%rax
1:
+ UNWIND_HINT_EMPTY
/* Check if nx is implemented */
movl $0x80000001, %eax
@@ -150,9 +154,6 @@ ENTRY(secondary_startup_64)
1: wrmsr /* Make changes effective */
/* Setup cr0 */
-#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
- X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
- X86_CR0_PG)
movl $CR0_STATE, %eax
/* Make changes effective */
movq %rax, %cr0
@@ -235,7 +236,7 @@ ENTRY(secondary_startup_64)
pushq %rax # target address in negative space
lretq
.Lafter_lret:
-ENDPROC(secondary_startup_64)
+END(secondary_startup_64)
#include "verify_cpu.S"
@@ -247,6 +248,7 @@ ENDPROC(secondary_startup_64)
*/
ENTRY(start_cpu0)
movq initial_stack(%rip), %rsp
+ UNWIND_HINT_EMPTY
jmp .Ljump_to_C_code
ENDPROC(start_cpu0)
#endif
@@ -266,26 +268,24 @@ ENDPROC(start_cpu0)
.quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
__FINITDATA
-bad_address:
- jmp bad_address
-
__INIT
ENTRY(early_idt_handler_array)
- # 104(%rsp) %rflags
- # 96(%rsp) %cs
- # 88(%rsp) %rip
- # 80(%rsp) error code
i = 0
.rept NUM_EXCEPTION_VECTORS
- .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
- pushq $0 # Dummy error code, to make stack frame uniform
+ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
+ UNWIND_HINT_IRET_REGS
+ pushq $0 # Dummy error code, to make stack frame uniform
+ .else
+ UNWIND_HINT_IRET_REGS offset=8
.endif
pushq $i # 72(%rsp) Vector number
jmp early_idt_handler_common
+ UNWIND_HINT_IRET_REGS
i = i + 1
.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
.endr
-ENDPROC(early_idt_handler_array)
+ UNWIND_HINT_IRET_REGS offset=16
+END(early_idt_handler_array)
early_idt_handler_common:
/*
@@ -313,6 +313,7 @@ early_idt_handler_common:
pushq %r13 /* pt_regs->r13 */
pushq %r14 /* pt_regs->r14 */
pushq %r15 /* pt_regs->r15 */
+ UNWIND_HINT_REGS
cmpq $14,%rsi /* Page fault? */
jnz 10f
@@ -327,8 +328,8 @@ early_idt_handler_common:
20:
decl early_recursion_flag(%rip)
- jmp restore_regs_and_iret
-ENDPROC(early_idt_handler_common)
+ jmp restore_regs_and_return_to_kernel
+END(early_idt_handler_common)
__INITDATA
@@ -362,10 +363,7 @@ NEXT_PAGE(early_dynamic_pgts)
.data
-#ifndef CONFIG_XEN
-NEXT_PAGE(init_top_pgt)
- .fill 512,8,0
-#else
+#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
NEXT_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_PAGE_OFFSET*8, 0
@@ -382,6 +380,9 @@ NEXT_PAGE(level2_ident_pgt)
* Don't set NX because code runs from these pages.
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+#else
+NEXT_PAGE(init_top_pgt)
+ .fill 512,8,0
#endif
#ifdef CONFIG_X86_5LEVEL
@@ -435,7 +436,7 @@ ENTRY(phys_base)
EXPORT_SYMBOL(phys_base)
#include "../../x86/xen/xen-head.S"
-
+
__PAGE_ALIGNED_BSS
NEXT_PAGE(empty_zero_page)
.skip PAGE_SIZE
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 8f5cb2c7060c..86c4439f9d74 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -114,6 +114,7 @@ static void make_8259A_irq(unsigned int irq)
io_apic_irqs &= ~(1<<irq);
irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq);
enable_irq(irq);
+ lapic_assign_legacy_vector(irq, true);
}
/*
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 6107ee1cb8d5..d985cef3984f 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -92,8 +92,6 @@ static const __initdata struct idt_data def_idts[] = {
INTG(X86_TRAP_DF, double_fault),
#endif
INTG(X86_TRAP_DB, debug),
- INTG(X86_TRAP_NMI, nmi),
- INTG(X86_TRAP_BP, int3),
#ifdef CONFIG_X86_MCE
INTG(X86_TRAP_MC, &machine_check),
@@ -225,7 +223,7 @@ idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sy
idt_init_desc(&desc, t);
write_idt_entry(idt, t->vector, &desc);
if (sys)
- set_bit(t->vector, used_vectors);
+ set_bit(t->vector, system_vectors);
}
}
@@ -313,14 +311,14 @@ void __init idt_setup_apic_and_irq_gates(void)
idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true);
- for_each_clear_bit_from(i, used_vectors, FIRST_SYSTEM_VECTOR) {
+ for_each_clear_bit_from(i, system_vectors, FIRST_SYSTEM_VECTOR) {
entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR);
set_intr_gate(i, entry);
}
- for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
+ for_each_clear_bit_from(i, system_vectors, NR_VECTORS) {
#ifdef CONFIG_X86_LOCAL_APIC
- set_bit(i, used_vectors);
+ set_bit(i, system_vectors);
set_intr_gate(i, spurious_interrupt);
#else
entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR);
@@ -358,7 +356,7 @@ void idt_invalidate(void *addr)
void __init update_intr_gate(unsigned int n, const void *addr)
{
- if (WARN_ON_ONCE(!test_bit(n, used_vectors)))
+ if (WARN_ON_ONCE(!test_bit(n, system_vectors)))
return;
set_intr_gate(n, addr);
}
@@ -366,6 +364,6 @@ void __init update_intr_gate(unsigned int n, const void *addr)
void alloc_intr_gate(unsigned int n, const void *addr)
{
BUG_ON(n < FIRST_SYSTEM_VECTOR);
- if (!test_and_set_bit(n, used_vectors))
+ if (!test_and_set_bit(n, system_vectors))
set_intr_gate(n, addr);
}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 52089c043160..49cfd9fe7589 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -134,7 +134,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_puts(p, " Machine check polls\n");
#endif
#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
- if (test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) {
+ if (test_bit(HYPERVISOR_CALLBACK_VECTOR, system_vectors)) {
seq_printf(p, "%*s: ", prec, "HYP");
for_each_online_cpu(j)
seq_printf(p, "%10u ",
@@ -333,105 +333,6 @@ __visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs)
#ifdef CONFIG_HOTPLUG_CPU
-
-/* These two declarations are only used in check_irq_vectors_for_cpu_disable()
- * below, which is protected by stop_machine(). Putting them on the stack
- * results in a stack frame overflow. Dynamically allocating could result in a
- * failure so declare these two cpumasks as global.
- */
-static struct cpumask affinity_new, online_new;
-
-/*
- * This cpu is going to be removed and its vectors migrated to the remaining
- * online cpus. Check to see if there are enough vectors in the remaining cpus.
- * This function is protected by stop_machine().
- */
-int check_irq_vectors_for_cpu_disable(void)
-{
- unsigned int this_cpu, vector, this_count, count;
- struct irq_desc *desc;
- struct irq_data *data;
- int cpu;
-
- this_cpu = smp_processor_id();
- cpumask_copy(&online_new, cpu_online_mask);
- cpumask_clear_cpu(this_cpu, &online_new);
-
- this_count = 0;
- for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
- desc = __this_cpu_read(vector_irq[vector]);
- if (IS_ERR_OR_NULL(desc))
- continue;
- /*
- * Protect against concurrent action removal, affinity
- * changes etc.
- */
- raw_spin_lock(&desc->lock);
- data = irq_desc_get_irq_data(desc);
- cpumask_copy(&affinity_new,
- irq_data_get_affinity_mask(data));
- cpumask_clear_cpu(this_cpu, &affinity_new);
-
- /* Do not count inactive or per-cpu irqs. */
- if (!irq_desc_has_action(desc) || irqd_is_per_cpu(data)) {
- raw_spin_unlock(&desc->lock);
- continue;
- }
-
- raw_spin_unlock(&desc->lock);
- /*
- * A single irq may be mapped to multiple cpu's
- * vector_irq[] (for example IOAPIC cluster mode). In
- * this case we have two possibilities:
- *
- * 1) the resulting affinity mask is empty; that is
- * this the down'd cpu is the last cpu in the irq's
- * affinity mask, or
- *
- * 2) the resulting affinity mask is no longer a
- * subset of the online cpus but the affinity mask is
- * not zero; that is the down'd cpu is the last online
- * cpu in a user set affinity mask.
- */
- if (cpumask_empty(&affinity_new) ||
- !cpumask_subset(&affinity_new, &online_new))
- this_count++;
- }
- /* No need to check any further. */
- if (!this_count)
- return 0;
-
- count = 0;
- for_each_online_cpu(cpu) {
- if (cpu == this_cpu)
- continue;
- /*
- * We scan from FIRST_EXTERNAL_VECTOR to first system
- * vector. If the vector is marked in the used vectors
- * bitmap or an irq is assigned to it, we don't count
- * it as available.
- *
- * As this is an inaccurate snapshot anyway, we can do
- * this w/o holding vector_lock.
- */
- for (vector = FIRST_EXTERNAL_VECTOR;
- vector < FIRST_SYSTEM_VECTOR; vector++) {
- if (!test_bit(vector, used_vectors) &&
- IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) {
- if (++count == this_count)
- return 0;
- }
- }
- }
-
- if (count < this_count) {
- pr_warn("CPU %d disable failed: CPU has %u vectors assigned and there are only %u available.\n",
- this_cpu, this_count, count);
- return -ERANGE;
- }
- return 0;
-}
-
/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
void fixup_irqs(void)
{
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 1e4094eba15e..8da3e909e967 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -61,9 +61,6 @@ void __init init_ISA_irqs(void)
struct irq_chip *chip = legacy_pic->chip;
int i;
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
- init_bsp_APIC();
-#endif
legacy_pic->init(0);
for (i = 0; i < nr_legacy_irqs(); i++)
@@ -94,6 +91,7 @@ void __init native_init_IRQ(void)
x86_init.irqs.pre_vector_init();
idt_setup_apic_and_irq_gates();
+ lapic_assign_system_vectors();
if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs())
setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
index 615105cf7d58..ae38dccf0c8f 100644
--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -85,11 +85,11 @@ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
* Copy an instruction and adjust the displacement if the instruction
* uses the %rip-relative addressing mode.
*/
-extern int __copy_instruction(u8 *dest, u8 *src, struct insn *insn);
+extern int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn);
/* Generate a relative-jump/call instruction */
-extern void synthesize_reljump(void *from, void *to);
-extern void synthesize_relcall(void *from, void *to);
+extern void synthesize_reljump(void *dest, void *from, void *to);
+extern void synthesize_relcall(void *dest, void *from, void *to);
#ifdef CONFIG_OPTPROBES
extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter);
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 0742491cbb73..bd36f3c33cd0 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -119,29 +119,29 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
static nokprobe_inline void
-__synthesize_relative_insn(void *from, void *to, u8 op)
+__synthesize_relative_insn(void *dest, void *from, void *to, u8 op)
{
struct __arch_relative_insn {
u8 op;
s32 raddr;
} __packed *insn;
- insn = (struct __arch_relative_insn *)from;
+ insn = (struct __arch_relative_insn *)dest;
insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
insn->op = op;
}
/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
-void synthesize_reljump(void *from, void *to)
+void synthesize_reljump(void *dest, void *from, void *to)
{
- __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
+ __synthesize_relative_insn(dest, from, to, RELATIVEJUMP_OPCODE);
}
NOKPROBE_SYMBOL(synthesize_reljump);
/* Insert a call instruction at address 'from', which calls address 'to'.*/
-void synthesize_relcall(void *from, void *to)
+void synthesize_relcall(void *dest, void *from, void *to)
{
- __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
+ __synthesize_relative_insn(dest, from, to, RELATIVECALL_OPCODE);
}
NOKPROBE_SYMBOL(synthesize_relcall);
@@ -346,10 +346,11 @@ static int is_IF_modifier(kprobe_opcode_t *insn)
/*
* Copy an instruction with recovering modified instruction by kprobes
* and adjust the displacement if the instruction uses the %rip-relative
- * addressing mode.
+ * addressing mode. Note that since @real will be the final place of copied
+ * instruction, displacement must be adjust by @real, not @dest.
* This returns the length of copied instruction, or 0 if it has an error.
*/
-int __copy_instruction(u8 *dest, u8 *src, struct insn *insn)
+int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
{
kprobe_opcode_t buf[MAX_INSN_SIZE];
unsigned long recovered_insn =
@@ -387,11 +388,11 @@ int __copy_instruction(u8 *dest, u8 *src, struct insn *insn)
* have given.
*/
newdisp = (u8 *) src + (s64) insn->displacement.value
- - (u8 *) dest;
+ - (u8 *) real;
if ((s64) (s32) newdisp != newdisp) {
pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
pr_err("\tSrc: %p, Dest: %p, old disp: %x\n",
- src, dest, insn->displacement.value);
+ src, real, insn->displacement.value);
return 0;
}
disp = (u8 *) dest + insn_offset_displacement(insn);
@@ -402,20 +403,38 @@ int __copy_instruction(u8 *dest, u8 *src, struct insn *insn)
}
/* Prepare reljump right after instruction to boost */
-static void prepare_boost(struct kprobe *p, struct insn *insn)
+static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p,
+ struct insn *insn)
{
+ int len = insn->length;
+
if (can_boost(insn, p->addr) &&
- MAX_INSN_SIZE - insn->length >= RELATIVEJUMP_SIZE) {
+ MAX_INSN_SIZE - len >= RELATIVEJUMP_SIZE) {
/*
* These instructions can be executed directly if it
* jumps back to correct address.
*/
- synthesize_reljump(p->ainsn.insn + insn->length,
+ synthesize_reljump(buf + len, p->ainsn.insn + len,
p->addr + insn->length);
+ len += RELATIVEJUMP_SIZE;
p->ainsn.boostable = true;
} else {
p->ainsn.boostable = false;
}
+
+ return len;
+}
+
+/* Make page to RO mode when allocate it */
+void *alloc_insn_page(void)
+{
+ void *page;
+
+ page = module_alloc(PAGE_SIZE);
+ if (page)
+ set_memory_ro((unsigned long)page & PAGE_MASK, 1);
+
+ return page;
}
/* Recover page to RW mode before releasing it */
@@ -429,12 +448,11 @@ void free_insn_page(void *page)
static int arch_copy_kprobe(struct kprobe *p)
{
struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
int len;
- set_memory_rw((unsigned long)p->ainsn.insn & PAGE_MASK, 1);
-
/* Copy an instruction with recovering if other optprobe modifies it.*/
- len = __copy_instruction(p->ainsn.insn, p->addr, &insn);
+ len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn);
if (!len)
return -EINVAL;
@@ -442,15 +460,16 @@ static int arch_copy_kprobe(struct kprobe *p)
* __copy_instruction can modify the displacement of the instruction,
* but it doesn't affect boostable check.
*/
- prepare_boost(p, &insn);
-
- set_memory_ro((unsigned long)p->ainsn.insn & PAGE_MASK, 1);
+ len = prepare_boost(buf, p, &insn);
/* Check whether the instruction modifies Interrupt Flag or not */
- p->ainsn.if_modifier = is_IF_modifier(p->ainsn.insn);
+ p->ainsn.if_modifier = is_IF_modifier(buf);
/* Also, displacement change doesn't affect the first byte */
- p->opcode = p->ainsn.insn[0];
+ p->opcode = buf[0];
+
+ /* OK, write back the instruction(s) into ROX insn buffer */
+ text_poke(p->ainsn.insn, buf, len);
return 0;
}
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 041f7b6dfa0f..8dc0161cec8f 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -26,7 +26,7 @@
#include "common.h"
static nokprobe_inline
-int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+void __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
struct kprobe_ctlblk *kcb, unsigned long orig_ip)
{
/*
@@ -41,33 +41,31 @@ int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
__this_cpu_write(current_kprobe, NULL);
if (orig_ip)
regs->ip = orig_ip;
- return 1;
}
int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
struct kprobe_ctlblk *kcb)
{
- if (kprobe_ftrace(p))
- return __skip_singlestep(p, regs, kcb, 0);
- else
- return 0;
+ if (kprobe_ftrace(p)) {
+ __skip_singlestep(p, regs, kcb, 0);
+ preempt_enable_no_resched();
+ return 1;
+ }
+ return 0;
}
NOKPROBE_SYMBOL(skip_singlestep);
-/* Ftrace callback handler for kprobes */
+/* Ftrace callback handler for kprobes -- called under preepmt disabed */
void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct pt_regs *regs)
{
struct kprobe *p;
struct kprobe_ctlblk *kcb;
- unsigned long flags;
-
- /* Disable irq for emulating a breakpoint and avoiding preempt */
- local_irq_save(flags);
+ /* Preempt is disabled by ftrace */
p = get_kprobe((kprobe_opcode_t *)ip);
if (unlikely(!p) || kprobe_disabled(p))
- goto end;
+ return;
kcb = get_kprobe_ctlblk();
if (kprobe_running()) {
@@ -77,17 +75,19 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
/* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */
regs->ip = ip + sizeof(kprobe_opcode_t);
+ /* To emulate trap based kprobes, preempt_disable here */
+ preempt_disable();
__this_cpu_write(current_kprobe, p);
kcb->kprobe_status = KPROBE_HIT_ACTIVE;
- if (!p->pre_handler || !p->pre_handler(p, regs))
+ if (!p->pre_handler || !p->pre_handler(p, regs)) {
__skip_singlestep(p, regs, kcb, orig_ip);
+ preempt_enable_no_resched();
+ }
/*
* If pre_handler returns !0, it sets regs->ip and
- * resets current kprobe.
+ * resets current kprobe, and keep preempt count +1.
*/
}
-end:
- local_irq_restore(flags);
}
NOKPROBE_SYMBOL(kprobe_ftrace_handler);
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 4f98aad38237..e941136e24d8 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -142,11 +142,11 @@ void optprobe_template_func(void);
STACK_FRAME_NON_STANDARD(optprobe_template_func);
#define TMPL_MOVE_IDX \
- ((long)&optprobe_template_val - (long)&optprobe_template_entry)
+ ((long)optprobe_template_val - (long)optprobe_template_entry)
#define TMPL_CALL_IDX \
- ((long)&optprobe_template_call - (long)&optprobe_template_entry)
+ ((long)optprobe_template_call - (long)optprobe_template_entry)
#define TMPL_END_IDX \
- ((long)&optprobe_template_end - (long)&optprobe_template_entry)
+ ((long)optprobe_template_end - (long)optprobe_template_entry)
#define INT3_SIZE sizeof(kprobe_opcode_t)
@@ -154,17 +154,15 @@ STACK_FRAME_NON_STANDARD(optprobe_template_func);
static void
optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
{
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
- unsigned long flags;
-
/* This is possible if op is under delayed unoptimizing */
if (kprobe_disabled(&op->kp))
return;
- local_irq_save(flags);
+ preempt_disable();
if (kprobe_running()) {
kprobes_inc_nmissed_count(&op->kp);
} else {
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
/* Save skipped registers */
#ifdef CONFIG_X86_64
regs->cs = __KERNEL_CS;
@@ -180,17 +178,17 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
opt_pre_handler(&op->kp, regs);
__this_cpu_write(current_kprobe, NULL);
}
- local_irq_restore(flags);
+ preempt_enable_no_resched();
}
NOKPROBE_SYMBOL(optimized_callback);
-static int copy_optimized_instructions(u8 *dest, u8 *src)
+static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)
{
struct insn insn;
int len = 0, ret;
while (len < RELATIVEJUMP_SIZE) {
- ret = __copy_instruction(dest + len, src + len, &insn);
+ ret = __copy_instruction(dest + len, src + len, real, &insn);
if (!ret || !can_boost(&insn, src + len))
return -EINVAL;
len += ret;
@@ -343,57 +341,66 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
struct kprobe *__unused)
{
- u8 *buf;
- int ret;
+ u8 *buf = NULL, *slot;
+ int ret, len;
long rel;
if (!can_optimize((unsigned long)op->kp.addr))
return -EILSEQ;
- op->optinsn.insn = get_optinsn_slot();
- if (!op->optinsn.insn)
+ buf = kzalloc(MAX_OPTINSN_SIZE, GFP_KERNEL);
+ if (!buf)
return -ENOMEM;
+ op->optinsn.insn = slot = get_optinsn_slot();
+ if (!slot) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
/*
* Verify if the address gap is in 2GB range, because this uses
* a relative jump.
*/
- rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
+ rel = (long)slot - (long)op->kp.addr + RELATIVEJUMP_SIZE;
if (abs(rel) > 0x7fffffff) {
- __arch_remove_optimized_kprobe(op, 0);
- return -ERANGE;
+ ret = -ERANGE;
+ goto err;
}
- buf = (u8 *)op->optinsn.insn;
- set_memory_rw((unsigned long)buf & PAGE_MASK, 1);
+ /* Copy arch-dep-instance from template */
+ memcpy(buf, optprobe_template_entry, TMPL_END_IDX);
/* Copy instructions into the out-of-line buffer */
- ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
- if (ret < 0) {
- __arch_remove_optimized_kprobe(op, 0);
- return ret;
- }
+ ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr,
+ slot + TMPL_END_IDX);
+ if (ret < 0)
+ goto err;
op->optinsn.size = ret;
-
- /* Copy arch-dep-instance from template */
- memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
+ len = TMPL_END_IDX + op->optinsn.size;
/* Set probe information */
synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
/* Set probe function call */
- synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
+ synthesize_relcall(buf + TMPL_CALL_IDX,
+ slot + TMPL_CALL_IDX, optimized_callback);
/* Set returning jmp instruction at the tail of out-of-line buffer */
- synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
+ synthesize_reljump(buf + len, slot + len,
(u8 *)op->kp.addr + op->optinsn.size);
-
- set_memory_ro((unsigned long)buf & PAGE_MASK, 1);
-
- flush_icache_range((unsigned long) buf,
- (unsigned long) buf + TMPL_END_IDX +
- op->optinsn.size + RELATIVEJUMP_SIZE);
- return 0;
+ len += RELATIVEJUMP_SIZE;
+
+ /* We have to use text_poke for instuction buffer because it is RO */
+ text_poke(slot, buf, len);
+ ret = 0;
+out:
+ kfree(buf);
+ return ret;
+
+err:
+ __arch_remove_optimized_kprobe(op, 0);
+ goto out;
}
/*
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8bb9594d0761..b40ffbf156c1 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -75,8 +75,8 @@ static int parse_no_kvmclock_vsyscall(char *arg)
early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
-static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
-static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
+static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
+static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64);
static int has_steal_clock = 0;
/*
@@ -312,7 +312,7 @@ static void kvm_register_steal_time(void)
cpu, (unsigned long long) slow_virt_to_phys(st));
}
-static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
+static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
{
@@ -426,9 +426,42 @@ void kvm_disable_steal_time(void)
wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
}
+static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
+{
+ early_set_memory_decrypted((unsigned long) ptr, size);
+}
+
+/*
+ * Iterate through all possible CPUs and map the memory region pointed
+ * by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
+ *
+ * Note: we iterate through all possible CPUs to ensure that CPUs
+ * hotplugged will have their per-cpu variable already mapped as
+ * decrypted.
+ */
+static void __init sev_map_percpu_data(void)
+{
+ int cpu;
+
+ if (!sev_active())
+ return;
+
+ for_each_possible_cpu(cpu) {
+ __set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason));
+ __set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time));
+ __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi));
+ }
+}
+
#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
+ /*
+ * Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
+ * shares the guest physical address with the hypervisor.
+ */
+ sev_map_percpu_data();
+
kvm_guest_cpu_init();
native_smp_prepare_boot_cpu();
kvm_spinlock_init();
@@ -465,7 +498,7 @@ static void __init kvm_apf_trap_init(void)
update_intr_gate(X86_TRAP_PF, async_page_fault);
}
-void __init kvm_guest_init(void)
+static void __init kvm_guest_init(void)
{
int i;
@@ -496,6 +529,7 @@ void __init kvm_guest_init(void)
kvm_cpu_online, kvm_cpu_down_prepare) < 0)
pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n");
#else
+ sev_map_percpu_data();
kvm_guest_cpu_init();
#endif
@@ -544,12 +578,13 @@ static uint32_t __init kvm_detect(void)
return kvm_cpuid_base();
}
-const struct hypervisor_x86 x86_hyper_kvm __refconst = {
+const __initconst struct hypervisor_x86 x86_hyper_kvm = {
.name = "KVM",
.detect = kvm_detect,
- .x2apic_available = kvm_para_available,
+ .type = X86_HYPER_KVM,
+ .init.guest_late_init = kvm_guest_init,
+ .init.x2apic_available = kvm_para_available,
};
-EXPORT_SYMBOL_GPL(x86_hyper_kvm);
static __init int activate_jump_labels(void)
{
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 5b609e28ce3f..77b492c2d658 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -27,6 +27,7 @@
#include <linux/sched.h>
#include <linux/sched/clock.h>
+#include <asm/mem_encrypt.h>
#include <asm/x86_init.h>
#include <asm/reboot.h>
#include <asm/kvmclock.h>
@@ -45,7 +46,7 @@ early_param("no-kvmclock", parse_no_kvmclock);
/* The hypervisor will put information about time periodically here */
static struct pvclock_vsyscall_time_info *hv_clock;
-static struct pvclock_wall_clock wall_clock;
+static struct pvclock_wall_clock *wall_clock;
struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
{
@@ -64,15 +65,15 @@ static void kvm_get_wallclock(struct timespec *now)
int low, high;
int cpu;
- low = (int)__pa_symbol(&wall_clock);
- high = ((u64)__pa_symbol(&wall_clock) >> 32);
+ low = (int)slow_virt_to_phys(wall_clock);
+ high = ((u64)slow_virt_to_phys(wall_clock) >> 32);
native_write_msr(msr_kvm_wall_clock, low, high);
cpu = get_cpu();
vcpu_time = &hv_clock[cpu].pvti;
- pvclock_read_wallclock(&wall_clock, vcpu_time, now);
+ pvclock_read_wallclock(wall_clock, vcpu_time, now);
put_cpu();
}
@@ -249,11 +250,39 @@ static void kvm_shutdown(void)
native_machine_shutdown();
}
+static phys_addr_t __init kvm_memblock_alloc(phys_addr_t size,
+ phys_addr_t align)
+{
+ phys_addr_t mem;
+
+ mem = memblock_alloc(size, align);
+ if (!mem)
+ return 0;
+
+ if (sev_active()) {
+ if (early_set_memory_decrypted((unsigned long)__va(mem), size))
+ goto e_free;
+ }
+
+ return mem;
+e_free:
+ memblock_free(mem, size);
+ return 0;
+}
+
+static void __init kvm_memblock_free(phys_addr_t addr, phys_addr_t size)
+{
+ if (sev_active())
+ early_set_memory_encrypted((unsigned long)__va(addr), size);
+
+ memblock_free(addr, size);
+}
+
void __init kvmclock_init(void)
{
struct pvclock_vcpu_time_info *vcpu_time;
- unsigned long mem;
- int size, cpu;
+ unsigned long mem, mem_wall_clock;
+ int size, cpu, wall_clock_size;
u8 flags;
size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
@@ -267,21 +296,35 @@ void __init kvmclock_init(void)
} else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
return;
- printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
- msr_kvm_system_time, msr_kvm_wall_clock);
+ wall_clock_size = PAGE_ALIGN(sizeof(struct pvclock_wall_clock));
+ mem_wall_clock = kvm_memblock_alloc(wall_clock_size, PAGE_SIZE);
+ if (!mem_wall_clock)
+ return;
- mem = memblock_alloc(size, PAGE_SIZE);
- if (!mem)
+ wall_clock = __va(mem_wall_clock);
+ memset(wall_clock, 0, wall_clock_size);
+
+ mem = kvm_memblock_alloc(size, PAGE_SIZE);
+ if (!mem) {
+ kvm_memblock_free(mem_wall_clock, wall_clock_size);
+ wall_clock = NULL;
return;
+ }
+
hv_clock = __va(mem);
memset(hv_clock, 0, size);
if (kvm_register_clock("primary cpu clock")) {
hv_clock = NULL;
- memblock_free(mem, size);
+ kvm_memblock_free(mem, size);
+ kvm_memblock_free(mem_wall_clock, wall_clock_size);
+ wall_clock = NULL;
return;
}
+ printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
+ msr_kvm_system_time, msr_kvm_wall_clock);
+
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 4d17bacf4030..1c1eae961340 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -13,6 +13,7 @@
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
@@ -102,7 +103,7 @@ static void finalize_ldt_struct(struct ldt_struct *ldt)
static void install_ldt(struct mm_struct *current_mm,
struct ldt_struct *ldt)
{
- /* Synchronizes with lockless_dereference in load_mm_ldt. */
+ /* Synchronizes with READ_ONCE in load_mm_ldt. */
smp_store_release(&current_mm->context.ldt, ldt);
/* Activate the LDT for all CPUs using current_mm. */
@@ -295,8 +296,8 @@ out:
return error;
}
-asmlinkage int sys_modify_ldt(int func, void __user *ptr,
- unsigned long bytecount)
+SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
+ unsigned long , bytecount)
{
int ret = -ENOSYS;
@@ -314,5 +315,14 @@ asmlinkage int sys_modify_ldt(int func, void __user *ptr,
ret = write_ldt(ptr, bytecount, 0);
break;
}
- return ret;
+ /*
+ * The SYSCALL_DEFINE() macros give us an 'unsigned long'
+ * return type, but tht ABI for sys_modify_ldt() expects
+ * 'int'. This cast gives us an int-sized value in %rax
+ * for the return code. The 'unsigned' is necessary so
+ * the compiler does not try to sign-extend the negative
+ * return codes into the high half of the register when
+ * taking the value from int->long.
+ */
+ return (unsigned int)ret;
}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 35aafc95e4b8..18bc9b51ac9b 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -105,7 +105,7 @@ static void nmi_max_handler(struct irq_work *w)
{
struct nmiaction *a = container_of(w, struct nmiaction, irq_work);
int remainder_ns, decimal_msecs;
- u64 whole_msecs = ACCESS_ONCE(a->max_duration);
+ u64 whole_msecs = READ_ONCE(a->max_duration);
remainder_ns = do_div(whole_msecs, (1000 * 1000));
decimal_msecs = remainder_ns / 1000;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 19a3e8f961c7..041096bdef86 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -115,8 +115,18 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
return 5;
}
-/* Neat trick to map patch type back to the call within the
- * corresponding structure. */
+DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
+
+void __init native_pv_lock_init(void)
+{
+ if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
+ static_branch_disable(&virt_spin_lock_key);
+}
+
+/*
+ * Neat trick to map patch type back to the call within the
+ * corresponding structure.
+ */
static void *get_call_destination(u8 type)
{
struct paravirt_patch_template tmpl = {
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 5286a4a92cf7..35c461f21815 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -898,10 +898,9 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl)
PHB_ROOT_COMPLEX_STATUS);
}
-static void calgary_watchdog(unsigned long data)
+static void calgary_watchdog(struct timer_list *t)
{
- struct pci_dev *dev = (struct pci_dev *)data;
- struct iommu_table *tbl = pci_iommu(dev->bus);
+ struct iommu_table *tbl = from_timer(tbl, t, watchdog_timer);
void __iomem *bbar = tbl->bbar;
u32 val32;
void __iomem *target;
@@ -1016,8 +1015,7 @@ static void __init calgary_enable_translation(struct pci_dev *dev)
writel(cpu_to_be32(val32), target);
readl(target); /* flush */
- setup_timer(&tbl->watchdog_timer, &calgary_watchdog,
- (unsigned long)dev);
+ timer_setup(&tbl->watchdog_timer, calgary_watchdog, 0);
mod_timer(&tbl->watchdog_timer, jiffies);
}
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
index 3fe690067802..6b07faaa1579 100644
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -7,7 +7,7 @@
#include <linux/init.h>
#include <linux/ioport.h>
-static int found(u64 start, u64 end, void *data)
+static int found(struct resource *res, void *data)
{
return 1;
}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c67685337c5a..97fb3e5737f5 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -49,7 +49,13 @@
*/
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
.x86_tss = {
- .sp0 = TOP_OF_INIT_STACK,
+ /*
+ * .sp0 is only used when entering ring 0 from a lower
+ * privilege level. Since the init task never runs anything
+ * but ring 0 code, there is no need for a valid value here.
+ * Poison it.
+ */
+ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
#ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 11966251cd42..45bf0c5f93e1 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -284,9 +284,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Reload esp0 and cpu_current_top_of_stack. This changes
- * current_thread_info().
+ * current_thread_info(). Refresh the SYSENTER configuration in
+ * case prev or next is vm86.
*/
- load_sp0(tss, next);
+ update_sp0(next_p);
+ refresh_sysenter_cs(next);
this_cpu_write(cpu_current_top_of_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 302e7b2572d1..eeeb34f85c25 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -274,7 +274,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
struct inactive_task_frame *frame;
struct task_struct *me = current;
- p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
childregs = task_pt_regs(p);
fork_frame = container_of(childregs, struct fork_frame, regs);
frame = &fork_frame->frame;
@@ -464,8 +463,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
this_cpu_write(current_task, next_p);
- /* Reload esp0 and ss1. This changes current_thread_info(). */
- load_sp0(tss, next);
+ /* Reload sp0. */
+ update_sp0(next_p);
/*
* Now maybe reload the debug registers and handle I/O bitmaps
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0957dd73d127..8af2e8d0c0a1 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -136,18 +136,6 @@ RESERVE_BRK(dmi_alloc, 65536);
static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
unsigned long _brk_end = (unsigned long)__brk_base;
-#ifdef CONFIG_X86_64
-int default_cpu_present_to_apicid(int mps_cpu)
-{
- return __default_cpu_present_to_apicid(mps_cpu);
-}
-
-int default_check_phys_apicid_present(int phys_apicid)
-{
- return __default_check_phys_apicid_present(phys_apicid);
-}
-#endif
-
struct boot_params boot_params;
/*
@@ -380,9 +368,11 @@ static void __init reserve_initrd(void)
* If SME is active, this memory will be marked encrypted by the
* kernel when it is accessed (including relocation). However, the
* ramdisk image was loaded decrypted by the bootloader, so make
- * sure that it is encrypted before accessing it.
+ * sure that it is encrypted before accessing it. For SEV the
+ * ramdisk will already be encrypted, so only do this for SME.
*/
- sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image);
+ if (sme_active())
+ sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image);
initrd_start = 0;
@@ -822,26 +812,6 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
return 0;
}
-static void __init simple_udelay_calibration(void)
-{
- unsigned int tsc_khz, cpu_khz;
- unsigned long lpj;
-
- if (!boot_cpu_has(X86_FEATURE_TSC))
- return;
-
- cpu_khz = x86_platform.calibrate_cpu();
- tsc_khz = x86_platform.calibrate_tsc();
-
- tsc_khz = tsc_khz ? : cpu_khz;
- if (!tsc_khz)
- return;
-
- lpj = tsc_khz * 1000;
- do_div(lpj, HZ);
- loops_per_jiffy = lpj;
-}
-
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -1045,12 +1015,10 @@ void __init setup_arch(char **cmdline_p)
/*
* VMware detection requires dmi to be available, so this
- * needs to be done after dmi_scan_machine, for the BP.
+ * needs to be done after dmi_scan_machine(), for the boot CPU.
*/
init_hypervisor_platform();
- simple_udelay_calibration();
-
x86_init.resources.probe_roms();
/* after parse_early_param, so could debug it */
@@ -1135,9 +1103,6 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
e820__memblock_setup();
- if (!early_xdbc_setup_hardware())
- early_xdbc_register_console();
-
reserve_bios_regions();
if (efi_enabled(EFI_MEMMAP)) {
@@ -1243,6 +1208,10 @@ void __init setup_arch(char **cmdline_p)
kvmclock_init();
#endif
+ tsc_early_delay_calibrate();
+ if (!early_xdbc_setup_hardware())
+ early_xdbc_register_console();
+
x86_init.paging.pagetable_init();
kasan_init();
@@ -1294,7 +1263,7 @@ void __init setup_arch(char **cmdline_p)
io_apic_init_mappings();
- kvm_guest_init();
+ x86_init.hyper.guest_late_init();
e820__reserve_resources();
e820__register_nosave_regions(max_low_pfn);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ad59edd84de7..5f59e6bee123 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -77,6 +77,7 @@
#include <asm/i8259.h>
#include <asm/realmode.h>
#include <asm/misc.h>
+#include <asm/qspinlock.h>
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
@@ -194,6 +195,12 @@ static void smp_callin(void)
smp_store_cpu_info(cpuid);
/*
+ * The topology information must be up to date before
+ * calibrate_delay() and notify_cpu_starting().
+ */
+ set_cpu_sibling_map(raw_smp_processor_id());
+
+ /*
* Get our bogomips.
* Update loops_per_jiffy in cpu_data. Previous call to
* smp_store_cpu_info() stored a value that is close but not as
@@ -203,11 +210,6 @@ static void smp_callin(void)
cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
pr_debug("Stack at about %p\n", &cpuid);
- /*
- * This must be done before setting cpu_online_mask
- * or calling notify_cpu_starting.
- */
- set_cpu_sibling_map(raw_smp_processor_id());
wmb();
notify_cpu_starting(cpuid);
@@ -249,19 +251,19 @@ static void notrace start_secondary(void *unused)
/* otherwise gcc will move up smp_processor_id before the cpu_init */
barrier();
/*
- * Check TSC synchronization with the BP:
+ * Check TSC synchronization with the boot CPU:
*/
check_tsc_sync_target();
/*
- * Lock vector_lock and initialize the vectors on this cpu
- * before setting the cpu online. We must set it online with
- * vector_lock held to prevent a concurrent setup/teardown
- * from seeing a half valid vector space.
+ * Lock vector_lock, set CPU online and bring the vector
+ * allocator online. Online must be set with vector_lock held
+ * to prevent a concurrent irq setup/teardown from seeing a
+ * half valid vector space.
*/
lock_vector_lock();
- setup_vector_irq(smp_processor_id());
set_cpu_online(smp_processor_id(), true);
+ lapic_online();
unlock_vector_lock();
cpu_set_state_online(smp_processor_id());
x86_platform.nmi_init();
@@ -961,8 +963,7 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
- per_cpu(cpu_current_top_of_stack, cpu) =
- (unsigned long)task_stack_page(idle) + THREAD_SIZE;
+ per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
#else
initial_gs = per_cpu_offset(cpu);
#endif
@@ -1094,7 +1095,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
unsigned long flags;
int err, ret = 0;
- WARN_ON(irqs_disabled());
+ lockdep_assert_irqs_enabled();
pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
@@ -1190,17 +1191,10 @@ static __init void disable_smp(void)
cpumask_set_cpu(0, topology_core_cpumask(0));
}
-enum {
- SMP_OK,
- SMP_NO_CONFIG,
- SMP_NO_APIC,
- SMP_FORCE_UP,
-};
-
/*
* Various sanity checks.
*/
-static int __init smp_sanity_check(unsigned max_cpus)
+static void __init smp_sanity_check(void)
{
preempt_disable();
@@ -1238,16 +1232,6 @@ static int __init smp_sanity_check(unsigned max_cpus)
}
/*
- * If we couldn't find an SMP configuration at boot time,
- * get out of here now!
- */
- if (!smp_found_config && !acpi_lapic) {
- preempt_enable();
- pr_notice("SMP motherboard not detected\n");
- return SMP_NO_CONFIG;
- }
-
- /*
* Should not be necessary because the MP table should list the boot
* CPU too, but we do it for the sake of robustness anyway.
*/
@@ -1257,29 +1241,6 @@ static int __init smp_sanity_check(unsigned max_cpus)
physid_set(hard_smp_processor_id(), phys_cpu_present_map);
}
preempt_enable();
-
- /*
- * If we couldn't find a local APIC, then get out of here now!
- */
- if (APIC_INTEGRATED(boot_cpu_apic_version) &&
- !boot_cpu_has(X86_FEATURE_APIC)) {
- if (!disable_apic) {
- pr_err("BIOS bug, local APIC #%d not detected!...\n",
- boot_cpu_physical_apicid);
- pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n");
- }
- return SMP_NO_APIC;
- }
-
- /*
- * If SMP should be disabled, then really disable it!
- */
- if (!max_cpus) {
- pr_info("SMP mode deactivated\n");
- return SMP_FORCE_UP;
- }
-
- return SMP_OK;
}
static void __init smp_cpu_index_default(void)
@@ -1294,9 +1255,18 @@ static void __init smp_cpu_index_default(void)
}
}
+static void __init smp_get_logical_apicid(void)
+{
+ if (x2apic_mode)
+ cpu0_logical_apicid = apic_read(APIC_LDR);
+ else
+ cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
+}
+
/*
- * Prepare for SMP bootup. The MP table or ACPI has been read
- * earlier. Just do some sanity checking here and enable APIC mode.
+ * Prepare for SMP bootup.
+ * @max_cpus: configured maximum number of CPUs, It is a legacy parameter
+ * for common interface support.
*/
void __init native_smp_prepare_cpus(unsigned int max_cpus)
{
@@ -1328,35 +1298,33 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
set_cpu_sibling_map(0);
- switch (smp_sanity_check(max_cpus)) {
- case SMP_NO_CONFIG:
- disable_smp();
- if (APIC_init_uniprocessor())
- pr_notice("Local APIC not detected. Using dummy APIC emulation.\n");
- return;
- case SMP_NO_APIC:
+ smp_sanity_check();
+
+ switch (apic_intr_mode) {
+ case APIC_PIC:
+ case APIC_VIRTUAL_WIRE_NO_CONFIG:
disable_smp();
return;
- case SMP_FORCE_UP:
+ case APIC_SYMMETRIC_IO_NO_ROUTING:
disable_smp();
- apic_bsp_setup(false);
+ /* Setup local timer */
+ x86_init.timers.setup_percpu_clockev();
return;
- case SMP_OK:
+ case APIC_VIRTUAL_WIRE:
+ case APIC_SYMMETRIC_IO:
break;
}
- if (read_apic_id() != boot_cpu_physical_apicid) {
- panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
- read_apic_id(), boot_cpu_physical_apicid);
- /* Or can we switch back to PIC here? */
- }
+ /* Setup local timer */
+ x86_init.timers.setup_percpu_clockev();
- default_setup_apic_routing();
- cpu0_logical_apicid = apic_bsp_setup(false);
+ smp_get_logical_apicid();
pr_info("CPU0: ");
print_cpu_info(&cpu_data(0));
+ native_pv_lock_init();
+
uv_system_init();
set_mtrr_aps_delayed_init();
@@ -1395,7 +1363,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
nmi_selftest();
impress_friends();
- setup_ioapic_dest();
mtrr_aps_init();
}
@@ -1554,13 +1521,14 @@ void cpu_disable_common(void)
remove_cpu_from_maps(cpu);
unlock_vector_lock();
fixup_irqs();
+ lapic_offline();
}
int native_cpu_disable(void)
{
int ret;
- ret = check_irq_vectors_for_cpu_disable();
+ ret = lapic_can_unplug_cpu();
if (ret)
return ret;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 8dabd7bf1673..77835bc021c7 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -30,7 +30,7 @@ static int save_stack_address(struct stack_trace *trace, unsigned long addr,
return 0;
}
-static void __save_stack_trace(struct stack_trace *trace,
+static void noinline __save_stack_trace(struct stack_trace *trace,
struct task_struct *task, struct pt_regs *regs,
bool nosched)
{
@@ -56,6 +56,7 @@ static void __save_stack_trace(struct stack_trace *trace,
*/
void save_stack_trace(struct stack_trace *trace)
{
+ trace->skip++;
__save_stack_trace(trace, current, NULL, false);
}
EXPORT_SYMBOL_GPL(save_stack_trace);
@@ -70,6 +71,8 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
if (!try_get_task_stack(tsk))
return;
+ if (tsk == current)
+ trace->skip++;
__save_stack_trace(trace, tsk, NULL, true);
put_task_stack(tsk);
@@ -88,8 +91,9 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
} \
})
-static int __save_stack_trace_reliable(struct stack_trace *trace,
- struct task_struct *task)
+static int __always_inline
+__save_stack_trace_reliable(struct stack_trace *trace,
+ struct task_struct *task)
{
struct unwind_state state;
struct pt_regs *regs;
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 879af864d99a..749d189f8cd4 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -85,6 +85,11 @@ void __init hpet_time_init(void)
static __init void x86_late_time_init(void)
{
x86_init.timers.timer_init();
+ /*
+ * After PIT/HPET timers init, select and setup
+ * the final interrupt mode for delivering IRQs.
+ */
+ x86_init.irqs.intr_mode_init();
tsc_init();
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 67db4f43309e..b7b0f74a2150 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -60,6 +60,7 @@
#include <asm/trace/mpx.h>
#include <asm/mpx.h>
#include <asm/vm86.h>
+#include <asm/umip.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -71,7 +72,7 @@
#include <asm/proto.h>
#endif
-DECLARE_BITMAP(used_vectors, NR_VECTORS);
+DECLARE_BITMAP(system_vectors, NR_VECTORS);
static inline void cond_local_irq_enable(struct pt_regs *regs)
{
@@ -141,8 +142,7 @@ void ist_begin_non_atomic(struct pt_regs *regs)
* will catch asm bugs and any attempt to use ist_preempt_enable
* from double_fault.
*/
- BUG_ON((unsigned long)(current_top_of_stack() -
- current_stack_pointer) >= THREAD_SIZE);
+ BUG_ON(!on_thread_stack());
preempt_enable_no_resched();
}
@@ -209,9 +209,6 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
if (fixup_exception(regs, trapnr))
return 0;
- if (fixup_bug(regs, trapnr))
- return 0;
-
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = trapnr;
die(str, regs, error_code);
@@ -292,6 +289,13 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+ /*
+ * WARN*()s end up here; fix them up before we call the
+ * notifier chain.
+ */
+ if (!user_mode(regs) && fixup_bug(regs, trapnr))
+ return;
+
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
NOTIFY_STOP) {
cond_local_irq_enable(regs);
@@ -514,6 +518,11 @@ do_general_protection(struct pt_regs *regs, long error_code)
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
cond_local_irq_enable(regs);
+ if (static_cpu_has(X86_FEATURE_UMIP)) {
+ if (user_mode(regs) && fixup_umip_exception(regs))
+ return;
+ }
+
if (v8086_mode(regs)) {
local_irq_enable();
handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 796d96bb0821..8ea117f8142e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -112,7 +112,7 @@ static void cyc2ns_data_init(struct cyc2ns_data *data)
data->cyc2ns_offset = 0;
}
-static void cyc2ns_init(int cpu)
+static void __init cyc2ns_init(int cpu)
{
struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
@@ -812,13 +812,13 @@ unsigned long native_calibrate_cpu(void)
return tsc_pit_min;
}
-int recalibrate_cpu_khz(void)
+void recalibrate_cpu_khz(void)
{
#ifndef CONFIG_SMP
unsigned long cpu_khz_old = cpu_khz;
if (!boot_cpu_has(X86_FEATURE_TSC))
- return -ENODEV;
+ return;
cpu_khz = x86_platform.calibrate_cpu();
tsc_khz = x86_platform.calibrate_tsc();
@@ -828,10 +828,6 @@ int recalibrate_cpu_khz(void)
cpu_khz = tsc_khz;
cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy,
cpu_khz_old, cpu_khz);
-
- return 0;
-#else
- return -ENODEV;
#endif
}
@@ -959,17 +955,21 @@ core_initcall(cpufreq_register_tsc_scaling);
/*
* If ART is present detect the numerator:denominator to convert to TSC
*/
-static void detect_art(void)
+static void __init detect_art(void)
{
unsigned int unused[2];
if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
return;
- /* Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required */
+ /*
+ * Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required,
+ * and the TSC counter resets must not occur asynchronously.
+ */
if (boot_cpu_has(X86_FEATURE_HYPERVISOR) ||
!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
- !boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ !boot_cpu_has(X86_FEATURE_TSC_ADJUST) ||
+ tsc_async_resets)
return;
cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
@@ -1263,6 +1263,25 @@ static int __init init_tsc_clocksource(void)
*/
device_initcall(init_tsc_clocksource);
+void __init tsc_early_delay_calibrate(void)
+{
+ unsigned long lpj;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC))
+ return;
+
+ cpu_khz = x86_platform.calibrate_cpu();
+ tsc_khz = x86_platform.calibrate_tsc();
+
+ tsc_khz = tsc_khz ? : cpu_khz;
+ if (!tsc_khz)
+ return;
+
+ lpj = tsc_khz * 1000;
+ do_div(lpj, HZ);
+ loops_per_jiffy = lpj;
+}
+
void __init tsc_init(void)
{
u64 lpj, cyc;
@@ -1346,12 +1365,10 @@ void __init tsc_init(void)
unsigned long calibrate_delay_is_known(void)
{
int sibling, cpu = smp_processor_id();
- struct cpumask *mask = topology_core_cpumask(cpu);
-
- if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC))
- return 0;
+ int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC);
+ const struct cpumask *mask = topology_core_cpumask(cpu);
- if (!mask)
+ if (tsc_disabled || !constant_tsc || !mask)
return 0;
sibling = cpumask_any_but(mask, cpu);
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index e76a9881306b..ec534f978867 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -31,6 +31,20 @@ struct tsc_adjust {
static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+/*
+ * TSC's on different sockets may be reset asynchronously.
+ * This may cause the TSC ADJUST value on socket 0 to be NOT 0.
+ */
+bool __read_mostly tsc_async_resets;
+
+void mark_tsc_async_resets(char *reason)
+{
+ if (tsc_async_resets)
+ return;
+ tsc_async_resets = true;
+ pr_info("tsc: Marking TSC async resets true due to %s\n", reason);
+}
+
void tsc_verify_tsc_adjust(bool resume)
{
struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust);
@@ -39,6 +53,10 @@ void tsc_verify_tsc_adjust(bool resume)
if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
return;
+ /* Skip unnecessary error messages if TSC already unstable */
+ if (check_tsc_unstable())
+ return;
+
/* Rate limit the MSR check */
if (!resume && time_before(jiffies, adj->nextcheck))
return;
@@ -72,12 +90,22 @@ static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
* non zero. We don't do that on non boot cpus because physical
* hotplug should have set the ADJUST register to a value > 0 so
* the TSC is in sync with the already running cpus.
+ *
+ * Also don't force the ADJUST value to zero if that is a valid value
+ * for socket 0 as determined by the system arch. This is required
+ * when multiple sockets are reset asynchronously with each other
+ * and socket 0 may not have an TSC ADJUST value of 0.
*/
if (bootcpu && bootval != 0) {
- pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu,
- bootval);
- wrmsrl(MSR_IA32_TSC_ADJUST, 0);
- bootval = 0;
+ if (likely(!tsc_async_resets)) {
+ pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n",
+ cpu, bootval);
+ wrmsrl(MSR_IA32_TSC_ADJUST, 0);
+ bootval = 0;
+ } else {
+ pr_info("TSC ADJUST: CPU%u: %lld NOT forced to 0\n",
+ cpu, bootval);
+ }
}
cur->adjusted = bootval;
}
@@ -91,6 +119,10 @@ bool __init tsc_store_and_check_tsc_adjust(bool bootcpu)
if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
return false;
+ /* Skip unnecessary error messages if TSC already unstable */
+ if (check_tsc_unstable())
+ return false;
+
rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
cur->bootval = bootval;
cur->nextcheck = jiffies + HZ;
@@ -119,6 +151,13 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu)
cur->warned = false;
/*
+ * If a non-zero TSC value for socket 0 may be valid then the default
+ * adjusted value cannot assumed to be zero either.
+ */
+ if (tsc_async_resets)
+ cur->adjusted = bootval;
+
+ /*
* Check whether this CPU is the first in a package to come up. In
* this case do not check the boot value against another package
* because the new package might have been physically hotplugged,
@@ -139,10 +178,9 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu)
* Compare the boot value and complain if it differs in the
* package.
*/
- if (bootval != ref->bootval) {
- pr_warn(FW_BUG "TSC ADJUST differs: Reference CPU%u: %lld CPU%u: %lld\n",
- refcpu, ref->bootval, cpu, bootval);
- }
+ if (bootval != ref->bootval)
+ printk_once(FW_BUG "TSC ADJUST differs within socket(s), fixing all errors\n");
+
/*
* The TSC_ADJUST values in a package must be the same. If the boot
* value on this newly upcoming CPU differs from the adjustment
@@ -150,8 +188,6 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu)
* adjusted value.
*/
if (bootval != ref->adjusted) {
- pr_warn("TSC ADJUST synchronize: Reference CPU%u: %lld CPU%u: %lld\n",
- refcpu, ref->adjusted, cpu, bootval);
cur->adjusted = ref->adjusted;
wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted);
}
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
new file mode 100644
index 000000000000..6ba82be68cff
--- /dev/null
+++ b/arch/x86/kernel/umip.c
@@ -0,0 +1,366 @@
+/*
+ * umip.c Emulation for instruction protected by the Intel User-Mode
+ * Instruction Prevention feature
+ *
+ * Copyright (c) 2017, Intel Corporation.
+ * Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+ */
+
+#include <linux/uaccess.h>
+#include <asm/umip.h>
+#include <asm/traps.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <linux/ratelimit.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "umip: " fmt
+
+/** DOC: Emulation for User-Mode Instruction Prevention (UMIP)
+ *
+ * The feature User-Mode Instruction Prevention present in recent Intel
+ * processor prevents a group of instructions (sgdt, sidt, sldt, smsw, and str)
+ * from being executed with CPL > 0. Otherwise, a general protection fault is
+ * issued.
+ *
+ * Rather than relaying to the user space the general protection fault caused by
+ * the UMIP-protected instructions (in the form of a SIGSEGV signal), it can be
+ * trapped and emulate the result of such instructions to provide dummy values.
+ * This allows to both conserve the current kernel behavior and not reveal the
+ * system resources that UMIP intends to protect (i.e., the locations of the
+ * global descriptor and interrupt descriptor tables, the segment selectors of
+ * the local descriptor table, the value of the task state register and the
+ * contents of the CR0 register).
+ *
+ * This emulation is needed because certain applications (e.g., WineHQ and
+ * DOSEMU2) rely on this subset of instructions to function.
+ *
+ * The instructions protected by UMIP can be split in two groups. Those which
+ * return a kernel memory address (sgdt and sidt) and those which return a
+ * value (sldt, str and smsw).
+ *
+ * For the instructions that return a kernel memory address, applications
+ * such as WineHQ rely on the result being located in the kernel memory space,
+ * not the actual location of the table. The result is emulated as a hard-coded
+ * value that, lies close to the top of the kernel memory. The limit for the GDT
+ * and the IDT are set to zero.
+ *
+ * Given that sldt and str are not commonly used in programs that run on WineHQ
+ * or DOSEMU2, they are not emulated.
+ *
+ * The instruction smsw is emulated to return the value that the register CR0
+ * has at boot time as set in the head_32.
+ *
+ * Also, emulation is provided only for 32-bit processes; 64-bit processes
+ * that attempt to use the instructions that UMIP protects will receive the
+ * SIGSEGV signal issued as a consequence of the general protection fault.
+ *
+ * Care is taken to appropriately emulate the results when segmentation is
+ * used. That is, rather than relying on USER_DS and USER_CS, the function
+ * insn_get_addr_ref() inspects the segment descriptor pointed by the
+ * registers in pt_regs. This ensures that we correctly obtain the segment
+ * base address and the address and operand sizes even if the user space
+ * application uses a local descriptor table.
+ */
+
+#define UMIP_DUMMY_GDT_BASE 0xfffe0000
+#define UMIP_DUMMY_IDT_BASE 0xffff0000
+
+/*
+ * The SGDT and SIDT instructions store the contents of the global descriptor
+ * table and interrupt table registers, respectively. The destination is a
+ * memory operand of X+2 bytes. X bytes are used to store the base address of
+ * the table and 2 bytes are used to store the limit. In 32-bit processes, the
+ * only processes for which emulation is provided, X has a value of 4.
+ */
+#define UMIP_GDT_IDT_BASE_SIZE 4
+#define UMIP_GDT_IDT_LIMIT_SIZE 2
+
+#define UMIP_INST_SGDT 0 /* 0F 01 /0 */
+#define UMIP_INST_SIDT 1 /* 0F 01 /1 */
+#define UMIP_INST_SMSW 3 /* 0F 01 /4 */
+
+/**
+ * identify_insn() - Identify a UMIP-protected instruction
+ * @insn: Instruction structure with opcode and ModRM byte.
+ *
+ * From the opcode and ModRM.reg in @insn identify, if any, a UMIP-protected
+ * instruction that can be emulated.
+ *
+ * Returns:
+ *
+ * On success, a constant identifying a specific UMIP-protected instruction that
+ * can be emulated.
+ *
+ * -EINVAL on error or when not an UMIP-protected instruction that can be
+ * emulated.
+ */
+static int identify_insn(struct insn *insn)
+{
+ /* By getting modrm we also get the opcode. */
+ insn_get_modrm(insn);
+
+ if (!insn->modrm.nbytes)
+ return -EINVAL;
+
+ /* All the instructions of interest start with 0x0f. */
+ if (insn->opcode.bytes[0] != 0xf)
+ return -EINVAL;
+
+ if (insn->opcode.bytes[1] == 0x1) {
+ switch (X86_MODRM_REG(insn->modrm.value)) {
+ case 0:
+ return UMIP_INST_SGDT;
+ case 1:
+ return UMIP_INST_SIDT;
+ case 4:
+ return UMIP_INST_SMSW;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ /* SLDT AND STR are not emulated */
+ return -EINVAL;
+}
+
+/**
+ * emulate_umip_insn() - Emulate UMIP instructions and return dummy values
+ * @insn: Instruction structure with operands
+ * @umip_inst: A constant indicating the instruction to emulate
+ * @data: Buffer into which the dummy result is stored
+ * @data_size: Size of the emulated result
+ *
+ * Emulate an instruction protected by UMIP and provide a dummy result. The
+ * result of the emulation is saved in @data. The size of the results depends
+ * on both the instruction and type of operand (register vs memory address).
+ * The size of the result is updated in @data_size. Caller is responsible
+ * of providing a @data buffer of at least UMIP_GDT_IDT_BASE_SIZE +
+ * UMIP_GDT_IDT_LIMIT_SIZE bytes.
+ *
+ * Returns:
+ *
+ * 0 on success, -EINVAL on error while emulating.
+ */
+static int emulate_umip_insn(struct insn *insn, int umip_inst,
+ unsigned char *data, int *data_size)
+{
+ unsigned long dummy_base_addr, dummy_value;
+ unsigned short dummy_limit = 0;
+
+ if (!data || !data_size || !insn)
+ return -EINVAL;
+ /*
+ * These two instructions return the base address and limit of the
+ * global and interrupt descriptor table, respectively. According to the
+ * Intel Software Development manual, the base address can be 24-bit,
+ * 32-bit or 64-bit. Limit is always 16-bit. If the operand size is
+ * 16-bit, the returned value of the base address is supposed to be a
+ * zero-extended 24-byte number. However, it seems that a 32-byte number
+ * is always returned irrespective of the operand size.
+ */
+ if (umip_inst == UMIP_INST_SGDT || umip_inst == UMIP_INST_SIDT) {
+ /* SGDT and SIDT do not use registers operands. */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3)
+ return -EINVAL;
+
+ if (umip_inst == UMIP_INST_SGDT)
+ dummy_base_addr = UMIP_DUMMY_GDT_BASE;
+ else
+ dummy_base_addr = UMIP_DUMMY_IDT_BASE;
+
+ *data_size = UMIP_GDT_IDT_LIMIT_SIZE + UMIP_GDT_IDT_BASE_SIZE;
+
+ memcpy(data + 2, &dummy_base_addr, UMIP_GDT_IDT_BASE_SIZE);
+ memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE);
+
+ } else if (umip_inst == UMIP_INST_SMSW) {
+ dummy_value = CR0_STATE;
+
+ /*
+ * Even though the CR0 register has 4 bytes, the number
+ * of bytes to be copied in the result buffer is determined
+ * by whether the operand is a register or a memory location.
+ * If operand is a register, return as many bytes as the operand
+ * size. If operand is memory, return only the two least
+ * siginificant bytes of CR0.
+ */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3)
+ *data_size = insn->opnd_bytes;
+ else
+ *data_size = 2;
+
+ memcpy(data, &dummy_value, *data_size);
+ /* STR and SLDT are not emulated */
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * force_sig_info_umip_fault() - Force a SIGSEGV with SEGV_MAPERR
+ * @addr: Address that caused the signal
+ * @regs: Register set containing the instruction pointer
+ *
+ * Force a SIGSEGV signal with SEGV_MAPERR as the error code. This function is
+ * intended to be used to provide a segmentation fault when the result of the
+ * UMIP emulation could not be copied to the user space memory.
+ *
+ * Returns: none
+ */
+static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)
+{
+ siginfo_t info;
+ struct task_struct *tsk = current;
+
+ tsk->thread.cr2 = (unsigned long)addr;
+ tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE;
+ tsk->thread.trap_nr = X86_TRAP_PF;
+
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ info.si_code = SEGV_MAPERR;
+ info.si_addr = addr;
+ force_sig_info(SIGSEGV, &info, tsk);
+
+ if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV)))
+ return;
+
+ pr_err_ratelimited("%s[%d] umip emulation segfault ip:%lx sp:%lx error:%x in %lx\n",
+ tsk->comm, task_pid_nr(tsk), regs->ip,
+ regs->sp, X86_PF_USER | X86_PF_WRITE,
+ regs->ip);
+}
+
+/**
+ * fixup_umip_exception() - Fixup a general protection fault caused by UMIP
+ * @regs: Registers as saved when entering the #GP handler
+ *
+ * The instructions sgdt, sidt, str, smsw, sldt cause a general protection
+ * fault if executed with CPL > 0 (i.e., from user space). If the offending
+ * user-space process is not in long mode, this function fixes the exception
+ * up and provides dummy results for sgdt, sidt and smsw; str and sldt are not
+ * fixed up. Also long mode user-space processes are not fixed up.
+ *
+ * If operands are memory addresses, results are copied to user-space memory as
+ * indicated by the instruction pointed by eIP using the registers indicated in
+ * the instruction operands. If operands are registers, results are copied into
+ * the context that was saved when entering kernel mode.
+ *
+ * Returns:
+ *
+ * True if emulation was successful; false if not.
+ */
+bool fixup_umip_exception(struct pt_regs *regs)
+{
+ int not_copied, nr_copied, reg_offset, dummy_data_size, umip_inst;
+ unsigned long seg_base = 0, *reg_addr;
+ /* 10 bytes is the maximum size of the result of UMIP instructions */
+ unsigned char dummy_data[10] = { 0 };
+ unsigned char buf[MAX_INSN_SIZE];
+ void __user *uaddr;
+ struct insn insn;
+ char seg_defs;
+
+ if (!regs)
+ return false;
+
+ /* Do not emulate 64-bit processes. */
+ if (user_64bit_mode(regs))
+ return false;
+
+ /*
+ * If not in user-space long mode, a custom code segment could be in
+ * use. This is true in protected mode (if the process defined a local
+ * descriptor table), or virtual-8086 mode. In most of the cases
+ * seg_base will be zero as in USER_CS.
+ */
+ if (!user_64bit_mode(regs))
+ seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS);
+
+ if (seg_base == -1L)
+ return false;
+
+ not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip),
+ sizeof(buf));
+ nr_copied = sizeof(buf) - not_copied;
+
+ /*
+ * The copy_from_user above could have failed if user code is protected
+ * by a memory protection key. Give up on emulation in such a case.
+ * Should we issue a page fault?
+ */
+ if (!nr_copied)
+ return false;
+
+ insn_init(&insn, buf, nr_copied, user_64bit_mode(regs));
+
+ /*
+ * Override the default operand and address sizes with what is specified
+ * in the code segment descriptor. The instruction decoder only sets
+ * the address size it to either 4 or 8 address bytes and does nothing
+ * for the operand bytes. This OK for most of the cases, but we could
+ * have special cases where, for instance, a 16-bit code segment
+ * descriptor is used.
+ * If there is an address override prefix, the instruction decoder
+ * correctly updates these values, even for 16-bit defaults.
+ */
+ seg_defs = insn_get_code_seg_params(regs);
+ if (seg_defs == -EINVAL)
+ return false;
+
+ insn.addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs);
+ insn.opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs);
+
+ insn_get_length(&insn);
+ if (nr_copied < insn.length)
+ return false;
+
+ umip_inst = identify_insn(&insn);
+ if (umip_inst < 0)
+ return false;
+
+ if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size))
+ return false;
+
+ /*
+ * If operand is a register, write result to the copy of the register
+ * value that was pushed to the stack when entering into kernel mode.
+ * Upon exit, the value we write will be restored to the actual hardware
+ * register.
+ */
+ if (X86_MODRM_MOD(insn.modrm.value) == 3) {
+ reg_offset = insn_get_modrm_rm_off(&insn, regs);
+
+ /*
+ * Negative values are usually errors. In memory addressing,
+ * the exception is -EDOM. Since we expect a register operand,
+ * all negative values are errors.
+ */
+ if (reg_offset < 0)
+ return false;
+
+ reg_addr = (unsigned long *)((unsigned long)regs + reg_offset);
+ memcpy(reg_addr, dummy_data, dummy_data_size);
+ } else {
+ uaddr = insn_get_addr_ref(&insn, regs);
+ if ((unsigned long)uaddr == -1L)
+ return false;
+
+ nr_copied = copy_to_user(uaddr, dummy_data, dummy_data_size);
+ if (nr_copied > 0) {
+ /*
+ * If copy fails, send a signal and tell caller that
+ * fault was fixed up.
+ */
+ force_sig_info_umip_fault(uaddr, regs);
+ return true;
+ }
+ }
+
+ /* increase IP to let the program keep going */
+ regs->ip += insn.length;
+ return true;
+}
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index b95007e7c1b3..a3f973b2c97a 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -279,7 +279,7 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
if (!stack_access_ok(state, addr, sizeof(long)))
return false;
- *val = READ_ONCE_TASK_STACK(state->task, *(unsigned long *)addr);
+ *val = READ_ONCE_NOCHECK(*(unsigned long *)addr);
return true;
}
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 495c776de4b4..a3755d293a48 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -271,12 +271,15 @@ static bool is_prefix_bad(struct insn *insn)
int i;
for (i = 0; i < insn->prefixes.nbytes; i++) {
- switch (insn->prefixes.bytes[i]) {
- case 0x26: /* INAT_PFX_ES */
- case 0x2E: /* INAT_PFX_CS */
- case 0x36: /* INAT_PFX_DS */
- case 0x3E: /* INAT_PFX_SS */
- case 0xF0: /* INAT_PFX_LOCK */
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute(insn->prefixes.bytes[i]);
+ switch (attr) {
+ case INAT_MAKE_PREFIX(INAT_PFX_ES):
+ case INAT_MAKE_PREFIX(INAT_PFX_CS):
+ case INAT_MAKE_PREFIX(INAT_PFX_DS):
+ case INAT_MAKE_PREFIX(INAT_PFX_SS):
+ case INAT_MAKE_PREFIX(INAT_PFX_LOCK):
return true;
}
}
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 014ea59aa153..3d3c2f71f617 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -33,7 +33,7 @@
#include <asm/cpufeatures.h>
#include <asm/msr-index.h>
-verify_cpu:
+ENTRY(verify_cpu)
pushf # Save caller passed flags
push $0 # Kill any dangerous flags
popf
@@ -139,3 +139,4 @@ verify_cpu:
popf # Restore caller passed flags
xorl %eax, %eax
ret
+ENDPROC(verify_cpu)
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 68244742ecb0..5edb27f1a2c4 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -55,6 +55,7 @@
#include <asm/irq.h>
#include <asm/traps.h>
#include <asm/vm86.h>
+#include <asm/switch_to.h>
/*
* Known problems:
@@ -94,7 +95,6 @@
void save_v86_state(struct kernel_vm86_regs *regs, int retval)
{
- struct tss_struct *tss;
struct task_struct *tsk = current;
struct vm86plus_struct __user *user;
struct vm86 *vm86 = current->thread.vm86;
@@ -146,12 +146,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
do_exit(SIGSEGV);
}
- tss = &per_cpu(cpu_tss, get_cpu());
+ preempt_disable();
tsk->thread.sp0 = vm86->saved_sp0;
tsk->thread.sysenter_cs = __KERNEL_CS;
- load_sp0(tss, &tsk->thread);
+ update_sp0(tsk);
+ refresh_sysenter_cs(&tsk->thread);
vm86->saved_sp0 = 0;
- put_cpu();
+ preempt_enable();
memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
@@ -237,7 +238,6 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
{
- struct tss_struct *tss;
struct task_struct *tsk = current;
struct vm86 *vm86 = tsk->thread.vm86;
struct kernel_vm86_regs vm86regs;
@@ -365,15 +365,17 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
vm86->saved_sp0 = tsk->thread.sp0;
lazy_save_gs(vm86->regs32.gs);
- tss = &per_cpu(cpu_tss, get_cpu());
/* make room for real-mode segments */
+ preempt_disable();
tsk->thread.sp0 += 16;
- if (static_cpu_has(X86_FEATURE_SEP))
+ if (static_cpu_has(X86_FEATURE_SEP)) {
tsk->thread.sysenter_cs = 0;
+ refresh_sysenter_cs(&tsk->thread);
+ }
- load_sp0(tss, &tsk->thread);
- put_cpu();
+ update_sp0(tsk);
+ preempt_enable();
if (vm86->flags & VM86_SCREEN_BITMAP)
mark_screen_rdonly(tsk->mm);
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index b034b1b14b9c..44685fb2a192 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -26,9 +26,6 @@
#define TOPOLOGY_REGISTER_OFFSET 0x10
-/* Flag below is initialized once during vSMP PCI initialization. */
-static int irq_routing_comply = 1;
-
#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
/*
* Interrupt control on vSMPowered systems:
@@ -105,9 +102,6 @@ static void __init set_vsmp_pv_ops(void)
if (cap & ctl & BIT(8)) {
ctl &= ~BIT(8);
- /* Interrupt routing set to ignore */
- irq_routing_comply = 0;
-
#ifdef CONFIG_PROC_FS
/* Don't let users change irq affinity via procfs */
no_irq_affinity = 1;
@@ -211,23 +205,10 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
return hard_smp_processor_id() >> index_msb;
}
-/*
- * In vSMP, all cpus should be capable of handling interrupts, regardless of
- * the APIC used.
- */
-static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask,
- const struct cpumask *mask)
-{
- cpumask_setall(retmask);
-}
-
static void vsmp_apic_post_init(void)
{
/* need to update phys_pkg_id */
apic->phys_pkg_id = apicid_phys_pkg_id;
-
- if (!irq_routing_comply)
- apic->vector_allocation_domain = fill_vector_allocation_domain;
}
void __init vsmp_init(void)
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index a088b2c47f73..1151ccd72ce9 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -28,6 +28,8 @@ void x86_init_noop(void) { }
void __init x86_init_uint_noop(unsigned int unused) { }
int __init iommu_init_noop(void) { return 0; }
void iommu_shutdown_noop(void) { }
+bool __init bool_x86_init_noop(void) { return false; }
+void x86_op_int_noop(int cpu) { }
/*
* The platform setup functions are preset with the default functions
@@ -55,6 +57,7 @@ struct x86_init_ops x86_init __initdata = {
.pre_vector_init = init_ISA_irqs,
.intr_init = native_init_IRQ,
.trap_init = x86_init_noop,
+ .intr_mode_init = apic_intr_mode_init
},
.oem = {
@@ -81,6 +84,13 @@ struct x86_init_ops x86_init __initdata = {
.init_irq = x86_default_pci_init_irq,
.fixup_irqs = x86_default_pci_fixup_irqs,
},
+
+ .hyper = {
+ .init_platform = x86_init_noop,
+ .guest_late_init = x86_init_noop,
+ .x2apic_available = bool_x86_init_noop,
+ .init_mem_mapping = x86_init_noop,
+ },
};
struct x86_cpuinit_ops x86_cpuinit = {
@@ -101,6 +111,7 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.get_nmi_reason = default_get_nmi_reason,
.save_sched_clock_state = tsc_save_sched_clock_state,
.restore_sched_clock_state = tsc_restore_sched_clock_state,
+ .hyper.pin_vcpu = x86_op_int_noop,
};
EXPORT_SYMBOL_GPL(x86_platform);