aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile11
-rw-r--r--arch/x86/kernel/acpi/boot.c6
-rw-r--r--arch/x86/kernel/alternative.c22
-rw-r--r--arch/x86/kernel/apic/apic.c76
-rw-r--r--arch/x86/kernel/apic/io_apic.c2
-rw-r--r--arch/x86/kernel/apic/vector.c2
-rw-r--r--arch/x86/kernel/asm-offsets_32.c20
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/amd.c55
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c43
-rw-r--r--arch/x86/kernel/cpu/bugs.c8
-rw-r--r--arch/x86/kernel/cpu/common.c49
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c32
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.c375
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.h440
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c (renamed from arch/x86/kernel/cpu/intel_rdt_schemata.c)67
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_monitor.c499
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_rdtgroup.c1117
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c43
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c25
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c22
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c16
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c5
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c4
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c27
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c9
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c18
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
-rw-r--r--arch/x86/kernel/dumpstack.c14
-rw-r--r--arch/x86/kernel/dumpstack_32.c4
-rw-r--r--arch/x86/kernel/dumpstack_64.c4
-rw-r--r--arch/x86/kernel/e820.c26
-rw-r--r--arch/x86/kernel/early-quirks.c1
-rw-r--r--arch/x86/kernel/eisa.c19
-rw-r--r--arch/x86/kernel/espfix_64.c2
-rw-r--r--arch/x86/kernel/head32.c4
-rw-r--r--arch/x86/kernel/head64.c106
-rw-r--r--arch/x86/kernel/head_32.S66
-rw-r--r--arch/x86/kernel/head_64.S40
-rw-r--r--arch/x86/kernel/hpet.c27
-rw-r--r--arch/x86/kernel/idt.c371
-rw-r--r--arch/x86/kernel/irq.c45
-rw-r--r--arch/x86/kernel/irq_work.c20
-rw-r--r--arch/x86/kernel/irqinit.c100
-rw-r--r--arch/x86/kernel/kdebugfs.c34
-rw-r--r--arch/x86/kernel/kprobes/core.c10
-rw-r--r--arch/x86/kernel/kprobes/opt.c9
-rw-r--r--arch/x86/kernel/ksysfs.c32
-rw-r--r--arch/x86/kernel/kvm.c10
-rw-r--r--arch/x86/kernel/ldt.c21
-rw-r--r--arch/x86/kernel/machine_kexec_32.c14
-rw-r--r--arch/x86/kernel/machine_kexec_64.c25
-rw-r--r--arch/x86/kernel/module.c11
-rw-r--r--arch/x86/kernel/mpparse.c108
-rw-r--r--arch/x86/kernel/nmi.c18
-rw-r--r--arch/x86/kernel/paravirt.c3
-rw-r--r--arch/x86/kernel/pci-dma.c11
-rw-r--r--arch/x86/kernel/pci-nommu.c2
-rw-r--r--arch/x86/kernel/pci-swiotlb.c15
-rw-r--r--arch/x86/kernel/platform-quirks.c1
-rw-r--r--arch/x86/kernel/process.c17
-rw-r--r--arch/x86/kernel/process_32.c4
-rw-r--r--arch/x86/kernel/process_64.c244
-rw-r--r--arch/x86/kernel/reboot.c10
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S14
-rw-r--r--arch/x86/kernel/setup.c16
-rw-r--r--arch/x86/kernel/setup_percpu.c9
-rw-r--r--arch/x86/kernel/signal.c2
-rw-r--r--arch/x86/kernel/smp.c81
-rw-r--r--arch/x86/kernel/smpboot.c30
-rw-r--r--arch/x86/kernel/step.c2
-rw-r--r--arch/x86/kernel/sys_x86_64.c30
-rw-r--r--arch/x86/kernel/tls.c2
-rw-r--r--arch/x86/kernel/tracepoint.c57
-rw-r--r--arch/x86/kernel/traps.c107
-rw-r--r--arch/x86/kernel/unwind_frame.c41
-rw-r--r--arch/x86/kernel/unwind_guess.c5
-rw-r--r--arch/x86/kernel/unwind_orc.c582
-rw-r--r--arch/x86/kernel/vmlinux.lds.S3
80 files changed, 4190 insertions, 1236 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index a01892bdd61a..fd0a7895b63f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -42,7 +42,7 @@ CFLAGS_irq.o := -I$(src)/../include/asm/trace
obj-y := process_$(BITS).o signal.o
obj-$(CONFIG_COMPAT) += signal_compat.o
-obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
+obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o dumpstack.o nmi.o
obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
@@ -111,6 +111,7 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
+obj-$(CONFIG_EISA) += eisa.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
@@ -126,11 +127,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o
-ifdef CONFIG_FRAME_POINTER
-obj-y += unwind_frame.o
-else
-obj-y += unwind_guess.o
-endif
+obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o
+obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o
+obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o
###
# 64 bit specific files
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7491e73d9253..97bb2caf3428 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -115,7 +115,7 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
#define ACPI_INVALID_GSI INT_MIN
/*
- * This is just a simple wrapper around early_ioremap(),
+ * This is just a simple wrapper around early_memremap(),
* with sanity checks for phys == 0 and size == 0.
*/
char *__init __acpi_map_table(unsigned long phys, unsigned long size)
@@ -124,7 +124,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
if (!phys || !size)
return NULL;
- return early_ioremap(phys, size);
+ return early_memremap(phys, size);
}
void __init __acpi_unmap_table(char *map, unsigned long size)
@@ -132,7 +132,7 @@ void __init __acpi_unmap_table(char *map, unsigned long size)
if (!map || !size)
return;
- early_iounmap(map, size);
+ early_memunmap(map, size);
}
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 32e14d137416..3344d3382e91 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -742,7 +742,16 @@ static void *bp_int3_handler, *bp_int3_addr;
int poke_int3_handler(struct pt_regs *regs)
{
- /* bp_patching_in_progress */
+ /*
+ * Having observed our INT3 instruction, we now must observe
+ * bp_patching_in_progress.
+ *
+ * in_progress = TRUE INT3
+ * WMB RMB
+ * write INT3 if (in_progress)
+ *
+ * Idem for bp_int3_handler.
+ */
smp_rmb();
if (likely(!bp_patching_in_progress))
@@ -788,9 +797,8 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
bp_int3_addr = (u8 *)addr + sizeof(int3);
bp_patching_in_progress = true;
/*
- * Corresponding read barrier in int3 notifier for
- * making sure the in_progress flags is correctly ordered wrt.
- * patching
+ * Corresponding read barrier in int3 notifier for making sure the
+ * in_progress and handler are correctly ordered wrt. patching.
*/
smp_wmb();
@@ -815,9 +823,11 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
text_poke(addr, opcode, sizeof(int3));
on_each_cpu(do_sync_core, NULL, 1);
-
+ /*
+ * sync_core() implies an smp_mb() and orders this store against
+ * the writing of the new instruction.
+ */
bp_patching_in_progress = false;
- smp_wmb();
return addr;
}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 98b3dd8cf2bf..7834f73efbf1 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -177,8 +177,6 @@ static int disable_apic_timer __initdata;
int local_apic_timer_c2_ok;
EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
-int first_system_vector = FIRST_SYSTEM_VECTOR;
-
/*
* Debug level, exported for io_apic.c
*/
@@ -599,9 +597,13 @@ static const struct x86_cpu_id deadline_match[] = {
static void apic_check_deadline_errata(void)
{
- const struct x86_cpu_id *m = x86_match_cpu(deadline_match);
+ const struct x86_cpu_id *m;
u32 rev;
+ if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return;
+
+ m = x86_match_cpu(deadline_match);
if (!m)
return;
@@ -990,8 +992,7 @@ void setup_secondary_APIC_clock(void)
*/
static void local_apic_timer_interrupt(void)
{
- int cpu = smp_processor_id();
- struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
+ struct clock_event_device *evt = this_cpu_ptr(&lapic_events);
/*
* Normally we should not be here till LAPIC has been initialized but
@@ -1005,7 +1006,8 @@ static void local_apic_timer_interrupt(void)
* spurious.
*/
if (!evt->event_handler) {
- pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu);
+ pr_warning("Spurious LAPIC timer interrupt on cpu %d\n",
+ smp_processor_id());
/* Switch it off */
lapic_timer_shutdown(evt);
return;
@@ -1040,25 +1042,6 @@ __visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
* interrupt lock, which is the WrongThing (tm) to do.
*/
entering_ack_irq();
- local_apic_timer_interrupt();
- exiting_irq();
-
- set_irq_regs(old_regs);
-}
-
-__visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs)
-{
- struct pt_regs *old_regs = set_irq_regs(regs);
-
- /*
- * NOTE! We'd better ACK the irq immediately,
- * because timer handling can be slow.
- *
- * update_process_times() expects us to have done irq_enter().
- * Besides, if we don't timer interrupts ignore the global
- * interrupt lock, which is the WrongThing (tm) to do.
- */
- entering_ack_irq();
trace_local_timer_entry(LOCAL_TIMER_VECTOR);
local_apic_timer_interrupt();
trace_local_timer_exit(LOCAL_TIMER_VECTOR);
@@ -1920,10 +1903,14 @@ void __init register_lapic_address(unsigned long address)
/*
* This interrupt should _never_ happen with our APIC/SMP architecture
*/
-static void __smp_spurious_interrupt(u8 vector)
+__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
{
+ u8 vector = ~regs->orig_ax;
u32 v;
+ entering_irq();
+ trace_spurious_apic_entry(vector);
+
/*
* Check if this really is a spurious interrupt and ACK it
* if it is a vectored one. Just in case...
@@ -1938,22 +1925,7 @@ static void __smp_spurious_interrupt(u8 vector)
/* see sw-dev-man vol 3, chapter 7.4.13.5 */
pr_info("spurious APIC interrupt through vector %02x on CPU#%d, "
"should never happen.\n", vector, smp_processor_id());
-}
-__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
-{
- entering_irq();
- __smp_spurious_interrupt(~regs->orig_ax);
- exiting_irq();
-}
-
-__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs)
-{
- u8 vector = ~regs->orig_ax;
-
- entering_irq();
- trace_spurious_apic_entry(vector);
- __smp_spurious_interrupt(vector);
trace_spurious_apic_exit(vector);
exiting_irq();
}
@@ -1961,10 +1933,8 @@ __visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs)
/*
* This interrupt should never happen with our APIC/SMP architecture
*/
-static void __smp_error_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
{
- u32 v;
- u32 i = 0;
static const char * const error_interrupt_reason[] = {
"Send CS error", /* APIC Error Bit 0 */
"Receive CS error", /* APIC Error Bit 1 */
@@ -1975,6 +1945,10 @@ static void __smp_error_interrupt(struct pt_regs *regs)
"Received illegal vector", /* APIC Error Bit 6 */
"Illegal register address", /* APIC Error Bit 7 */
};
+ u32 v, i = 0;
+
+ entering_irq();
+ trace_error_apic_entry(ERROR_APIC_VECTOR);
/* First tickle the hardware, only then report what went on. -- REW */
if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */
@@ -1996,20 +1970,6 @@ static void __smp_error_interrupt(struct pt_regs *regs)
apic_printk(APIC_DEBUG, KERN_CONT "\n");
-}
-
-__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
-{
- entering_irq();
- __smp_error_interrupt(regs);
- exiting_irq();
-}
-
-__visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs)
-{
- entering_irq();
- trace_error_apic_entry(ERROR_APIC_VECTOR);
- __smp_error_interrupt(regs);
trace_error_apic_exit(ERROR_APIC_VECTOR);
exiting_irq();
}
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 237e9c2341c7..70e48aa6af98 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1243,7 +1243,7 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
entry.vector, entry.irr, entry.delivery_status);
if (ir_entry->format)
printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n",
- buf, (ir_entry->index << 15) | ir_entry->index,
+ buf, (ir_entry->index2 << 15) | ir_entry->index,
ir_entry->zero);
else
printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n",
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index b3af457ed667..88c214e75a6b 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -166,7 +166,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
offset = current_offset;
next:
vector += 16;
- if (vector >= first_system_vector) {
+ if (vector >= FIRST_SYSTEM_VECTOR) {
offset = (offset + 1) % 16;
vector = FIRST_EXTERNAL_VECTOR + offset;
}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 880aa093268d..710edab9e644 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -4,9 +4,6 @@
#include <asm/ucontext.h>
-#include <linux/lguest.h>
-#include "../../../drivers/lguest/lg.h"
-
#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
static char syscalls[] = {
#include <asm/syscalls_32.h>
@@ -62,23 +59,6 @@ void foo(void)
OFFSET(stack_canary_offset, stack_canary, canary);
#endif
-#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
- BLANK();
- OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
- OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
-
- BLANK();
- OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
- OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
- OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
- OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
- OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
- OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
- OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
- OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
- OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
- OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
-#endif
BLANK();
DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
DEFINE(NR_syscalls, sizeof(syscalls));
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 99332f550c48..cf42206926af 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -20,7 +20,6 @@ static char syscalls_ia32[] = {
int main(void)
{
#ifdef CONFIG_PARAVIRT
- OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
BLANK();
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index cdf82492b770..e17942c131c8 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
-obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o
+obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 3b9e220621f8..9862e2cd6d93 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -297,13 +297,29 @@ static int nearby_node(int apicid)
}
#endif
+#ifdef CONFIG_SMP
+/*
+ * Fix up cpu_core_id for pre-F17h systems to be in the
+ * [0 .. cores_per_node - 1] range. Not really needed but
+ * kept so as not to break existing setups.
+ */
+static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
+{
+ u32 cus_per_node;
+
+ if (c->x86 >= 0x17)
+ return;
+
+ cus_per_node = c->x86_max_cores / nodes_per_socket;
+ c->cpu_core_id %= cus_per_node;
+}
+
/*
* Fixup core topology information for
* (1) AMD multi-node processors
* Assumption: Number of cores in each internal node is the same.
* (2) AMD processors supporting compute units
*/
-#ifdef CONFIG_SMP
static void amd_get_topology(struct cpuinfo_x86 *c)
{
u8 node_id;
@@ -354,15 +370,9 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
} else
return;
- /* fixup multi-node processor information */
if (nodes_per_socket > 1) {
- u32 cus_per_node;
-
set_cpu_cap(c, X86_FEATURE_AMD_DCM);
- cus_per_node = c->x86_max_cores / nodes_per_socket;
-
- /* core id has to be in the [0 .. cores_per_node - 1] range */
- c->cpu_core_id %= cus_per_node;
+ legacy_fixup_core_id(c);
}
}
#endif
@@ -548,8 +558,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
static void early_init_amd(struct cpuinfo_x86 *c)
{
+ u32 dummy;
+
early_init_amd_mc(c);
+ rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
/*
* c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
* with P/T states and does not stop in deep C-states
@@ -612,6 +626,27 @@ static void early_init_amd(struct cpuinfo_x86 *c)
*/
if (cpu_has_amd_erratum(c, amd_erratum_400))
set_cpu_bug(c, X86_BUG_AMD_E400);
+
+ /*
+ * BIOS support is required for SME. If BIOS has enabled SME then
+ * adjust x86_phys_bits by the SME physical address space reduction
+ * value. If BIOS has not enabled SME then don't advertise the
+ * feature (set in scattered.c). Also, since the SME support requires
+ * long mode, don't advertise the feature under CONFIG_X86_32.
+ */
+ if (cpu_has(c, X86_FEATURE_SME)) {
+ u64 msr;
+
+ /* Check if SME is enabled */
+ rdmsrl(MSR_K8_SYSCFG, msr);
+ if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) {
+ c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
+ if (IS_ENABLED(CONFIG_X86_32))
+ clear_cpu_cap(c, X86_FEATURE_SME);
+ } else {
+ clear_cpu_cap(c, X86_FEATURE_SME);
+ }
+ }
}
static void init_amd_k8(struct cpuinfo_x86 *c)
@@ -730,8 +765,6 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
static void init_amd(struct cpuinfo_x86 *c)
{
- u32 dummy;
-
early_init_amd(c);
/*
@@ -793,8 +826,6 @@ static void init_amd(struct cpuinfo_x86 *c)
if (c->x86 > 0x11)
set_cpu_cap(c, X86_FEATURE_ARAT);
- rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
-
/* 3DNow or LM implies PREFETCHW */
if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index d869c8671e36..0ee83321a313 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -8,20 +8,25 @@
* This file is licensed under GPLv2.
*/
-#include <linux/jiffies.h>
+#include <linux/delay.h>
+#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/smp.h>
struct aperfmperf_sample {
unsigned int khz;
- unsigned long jiffies;
+ ktime_t time;
u64 aperf;
u64 mperf;
};
static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
+#define APERFMPERF_CACHE_THRESHOLD_MS 10
+#define APERFMPERF_REFRESH_DELAY_MS 20
+#define APERFMPERF_STALE_THRESHOLD_MS 1000
+
/*
* aperfmperf_snapshot_khz()
* On the current CPU, snapshot APERF, MPERF, and jiffies
@@ -33,13 +38,18 @@ static void aperfmperf_snapshot_khz(void *dummy)
u64 aperf, aperf_delta;
u64 mperf, mperf_delta;
struct aperfmperf_sample *s = this_cpu_ptr(&samples);
+ ktime_t now = ktime_get();
+ s64 time_delta = ktime_ms_delta(now, s->time);
+ unsigned long flags;
- /* Don't bother re-computing within 10 ms */
- if (time_before(jiffies, s->jiffies + HZ/100))
+ /* Don't bother re-computing within the cache threshold time. */
+ if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
return;
+ local_irq_save(flags);
rdmsrl(MSR_IA32_APERF, aperf);
rdmsrl(MSR_IA32_MPERF, mperf);
+ local_irq_restore(flags);
aperf_delta = aperf - s->aperf;
mperf_delta = mperf - s->mperf;
@@ -51,22 +61,21 @@ static void aperfmperf_snapshot_khz(void *dummy)
if (mperf_delta == 0)
return;
- /*
- * if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then
- * khz = (cpu_khz * aperf_delta) / mperf_delta
- */
- if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta)
- s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
- else /* khz = aperf_delta / (mperf_delta / cpu_khz) */
- s->khz = div64_u64(aperf_delta,
- div64_u64(mperf_delta, cpu_khz));
- s->jiffies = jiffies;
+ s->time = now;
s->aperf = aperf;
s->mperf = mperf;
+
+ /* If the previous iteration was too long ago, discard it. */
+ if (time_delta > APERFMPERF_STALE_THRESHOLD_MS)
+ s->khz = 0;
+ else
+ s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
}
unsigned int arch_freq_get_on_cpu(int cpu)
{
+ unsigned int khz;
+
if (!cpu_khz)
return 0;
@@ -74,6 +83,12 @@ unsigned int arch_freq_get_on_cpu(int cpu)
return 0;
smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
+ khz = per_cpu(samples.khz, cpu);
+ if (khz)
+ return khz;
+
+ msleep(APERFMPERF_REFRESH_DELAY_MS);
+ smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
return per_cpu(samples.khz, cpu);
}
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 0af86d9242da..db684880d74a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -21,6 +21,14 @@
void __init check_bugs(void)
{
+#ifdef CONFIG_X86_32
+ /*
+ * Regardless of whether PCID is enumerated, the SDM says
+ * that it can't be enabled in 32-bit mode.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+#endif
+
identify_boot_cpu();
if (!IS_ENABLED(CONFIG_SMP)) {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c8b39870f33e..efba8e3da3e2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -168,6 +168,24 @@ static int __init x86_mpx_setup(char *s)
}
__setup("nompx", x86_mpx_setup);
+#ifdef CONFIG_X86_64
+static int __init x86_pcid_setup(char *s)
+{
+ /* require an exact match without trailing characters */
+ if (strlen(s))
+ return 0;
+
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_PCID))
+ return 1;
+
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+ pr_info("nopcid: PCID feature disabled\n");
+ return 1;
+}
+__setup("nopcid", x86_pcid_setup);
+#endif
+
static int __init x86_noinvpcid_setup(char *s)
{
/* noinvpcid doesn't accept parameters */
@@ -311,6 +329,25 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
}
}
+static void setup_pcid(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_PCID)) {
+ if (cpu_has(c, X86_FEATURE_PGE)) {
+ cr4_set_bits(X86_CR4_PCIDE);
+ } else {
+ /*
+ * flush_tlb_all(), as currently implemented, won't
+ * work if PCID is on but PGE is not. Since that
+ * combination doesn't exist on real hardware, there's
+ * no reason to try to fully support it, but it's
+ * polite to avoid corrupting data if we're on
+ * an improperly configured VM.
+ */
+ clear_cpu_cap(c, X86_FEATURE_PCID);
+ }
+ }
+}
+
/*
* Protection Keys are not available in 32-bit mode.
*/
@@ -1125,6 +1162,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
setup_smep(c);
setup_smap(c);
+ /* Set up PCID */
+ setup_pcid(c);
+
/*
* The vendor-specific functions might have changed features.
* Now we do "generic changes."
@@ -1289,15 +1329,6 @@ static __init int setup_disablecpuid(char *arg)
__setup("clearcpuid=", setup_disablecpuid);
#ifdef CONFIG_X86_64
-struct desc_ptr idt_descr __ro_after_init = {
- .size = NR_VECTORS * 16 - 1,
- .address = (unsigned long) idt_table,
-};
-const struct desc_ptr debug_idt_descr = {
- .size = NR_VECTORS * 16 - 1,
- .address = (unsigned long) debug_idt_table,
-};
-
DEFINE_PER_CPU_FIRST(union irq_stack_union,
irq_stack_union) __aligned(PAGE_SIZE) __visible;
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index c55fb2cb2acc..24f749324c0f 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -811,7 +811,24 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
struct cacheinfo *this_leaf;
int i, sibling;
- if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ /*
+ * For L3, always use the pre-calculated cpu_llc_shared_mask
+ * to derive shared_cpu_map.
+ */
+ if (index == 3) {
+ for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
+ continue;
+ this_leaf = this_cpu_ci->info_list + index;
+ for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
+ if (!cpu_online(sibling))
+ continue;
+ cpumask_set_cpu(sibling,
+ &this_leaf->shared_cpu_map);
+ }
+ }
+ } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
unsigned int apicid, nshared, first, last;
this_leaf = this_cpu_ci->info_list + index;
@@ -839,19 +856,6 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
&this_leaf->shared_cpu_map);
}
}
- } else if (index == 3) {
- for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
- this_cpu_ci = get_cpu_cacheinfo(i);
- if (!this_cpu_ci->info_list)
- continue;
- this_leaf = this_cpu_ci->info_list + index;
- for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
- if (!cpu_online(sibling))
- continue;
- cpumask_set_cpu(sibling,
- &this_leaf->shared_cpu_map);
- }
- }
} else
return 0;
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 5b366462f579..cd5fc61ba450 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -30,7 +30,8 @@
#include <linux/cpuhotplug.h>
#include <asm/intel-family.h>
-#include <asm/intel_rdt.h>
+#include <asm/intel_rdt_sched.h>
+#include "intel_rdt.h"
#define MAX_MBA_BW 100u
#define MBA_IS_LINEAR 0x4
@@ -38,7 +39,13 @@
/* Mutex to protect rdtgroup access. */
DEFINE_MUTEX(rdtgroup_mutex);
-DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid);
+/*
+ * The cached intel_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Functions which modify the state
+ * are called with interrupts disabled and no preemption, which
+ * is sufficient for the protection.
+ */
+DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
/*
* Used to store the max resource name width and max resource data width
@@ -46,6 +53,12 @@ DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid);
*/
int max_name_width, max_data_width;
+/*
+ * Global boolean for rdt_alloc which is true if any
+ * resource allocation is enabled.
+ */
+bool rdt_alloc_capable;
+
static void
mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
static void
@@ -54,7 +67,9 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains)
struct rdt_resource rdt_resources_all[] = {
+ [RDT_RESOURCE_L3] =
{
+ .rid = RDT_RESOURCE_L3,
.name = "L3",
.domains = domain_init(RDT_RESOURCE_L3),
.msr_base = IA32_L3_CBM_BASE,
@@ -67,8 +82,11 @@ struct rdt_resource rdt_resources_all[] = {
},
.parse_ctrlval = parse_cbm,
.format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
},
+ [RDT_RESOURCE_L3DATA] =
{
+ .rid = RDT_RESOURCE_L3DATA,
.name = "L3DATA",
.domains = domain_init(RDT_RESOURCE_L3DATA),
.msr_base = IA32_L3_CBM_BASE,
@@ -81,8 +99,11 @@ struct rdt_resource rdt_resources_all[] = {
},
.parse_ctrlval = parse_cbm,
.format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
},
+ [RDT_RESOURCE_L3CODE] =
{
+ .rid = RDT_RESOURCE_L3CODE,
.name = "L3CODE",
.domains = domain_init(RDT_RESOURCE_L3CODE),
.msr_base = IA32_L3_CBM_BASE,
@@ -95,8 +116,11 @@ struct rdt_resource rdt_resources_all[] = {
},
.parse_ctrlval = parse_cbm,
.format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
},
+ [RDT_RESOURCE_L2] =
{
+ .rid = RDT_RESOURCE_L2,
.name = "L2",
.domains = domain_init(RDT_RESOURCE_L2),
.msr_base = IA32_L2_CBM_BASE,
@@ -109,8 +133,11 @@ struct rdt_resource rdt_resources_all[] = {
},
.parse_ctrlval = parse_cbm,
.format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
},
+ [RDT_RESOURCE_MBA] =
{
+ .rid = RDT_RESOURCE_MBA,
.name = "MB",
.domains = domain_init(RDT_RESOURCE_MBA),
.msr_base = IA32_MBA_THRTL_BASE,
@@ -118,6 +145,7 @@ struct rdt_resource rdt_resources_all[] = {
.cache_level = 3,
.parse_ctrlval = parse_bw,
.format_str = "%d=%*d",
+ .fflags = RFTYPE_RES_MB,
},
};
@@ -144,33 +172,28 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid)
* is always 20 on hsw server parts. The minimum cache bitmask length
* allowed for HSW server is always 2 bits. Hardcode all of them.
*/
-static inline bool cache_alloc_hsw_probe(void)
+static inline void cache_alloc_hsw_probe(void)
{
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) {
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
- u32 l, h, max_cbm = BIT_MASK(20) - 1;
-
- if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0))
- return false;
- rdmsr(IA32_L3_CBM_BASE, l, h);
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+ u32 l, h, max_cbm = BIT_MASK(20) - 1;
- /* If all the bits were set in MSR, return success */
- if (l != max_cbm)
- return false;
+ if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0))
+ return;
+ rdmsr(IA32_L3_CBM_BASE, l, h);
- r->num_closid = 4;
- r->default_ctrl = max_cbm;
- r->cache.cbm_len = 20;
- r->cache.min_cbm_bits = 2;
- r->capable = true;
- r->enabled = true;
+ /* If all the bits were set in MSR, return success */
+ if (l != max_cbm)
+ return;
- return true;
- }
+ r->num_closid = 4;
+ r->default_ctrl = max_cbm;
+ r->cache.cbm_len = 20;
+ r->cache.shareable_bits = 0xc0000;
+ r->cache.min_cbm_bits = 2;
+ r->alloc_capable = true;
+ r->alloc_enabled = true;
- return false;
+ rdt_alloc_capable = true;
}
/*
@@ -213,15 +236,14 @@ static bool rdt_get_mem_config(struct rdt_resource *r)
return false;
}
r->data_width = 3;
- rdt_get_mba_infofile(r);
- r->capable = true;
- r->enabled = true;
+ r->alloc_capable = true;
+ r->alloc_enabled = true;
return true;
}
-static void rdt_get_cache_config(int idx, struct rdt_resource *r)
+static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
{
union cpuid_0x10_1_eax eax;
union cpuid_0x10_x_edx edx;
@@ -231,10 +253,10 @@ static void rdt_get_cache_config(int idx, struct rdt_resource *r)
r->num_closid = edx.split.cos_max + 1;
r->cache.cbm_len = eax.split.cbm_len + 1;
r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
+ r->cache.shareable_bits = ebx & r->default_ctrl;
r->data_width = (r->cache.cbm_len + 3) / 4;
- rdt_get_cache_infofile(r);
- r->capable = true;
- r->enabled = true;
+ r->alloc_capable = true;
+ r->alloc_enabled = true;
}
static void rdt_get_cdp_l3_config(int type)
@@ -246,12 +268,12 @@ static void rdt_get_cdp_l3_config(int type)
r->cache.cbm_len = r_l3->cache.cbm_len;
r->default_ctrl = r_l3->default_ctrl;
r->data_width = (r->cache.cbm_len + 3) / 4;
- r->capable = true;
+ r->alloc_capable = true;
/*
* By default, CDP is disabled. CDP can be enabled by mount parameter
* "cdp" during resctrl file system mount time.
*/
- r->enabled = false;
+ r->alloc_enabled = false;
}
static int get_cache_id(int cpu, int level)
@@ -300,6 +322,19 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]);
}
+struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r)
+{
+ struct rdt_domain *d;
+
+ list_for_each_entry(d, &r->domains, list) {
+ /* Find the domain that contains this CPU */
+ if (cpumask_test_cpu(cpu, &d->cpu_mask))
+ return d;
+ }
+
+ return NULL;
+}
+
void rdt_ctrl_update(void *arg)
{
struct msr_param *m = arg;
@@ -307,12 +342,10 @@ void rdt_ctrl_update(void *arg)
int cpu = smp_processor_id();
struct rdt_domain *d;
- list_for_each_entry(d, &r->domains, list) {
- /* Find the domain that contains this CPU */
- if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
- r->msr_update(d, m, r);
- return;
- }
+ d = get_domain_from_cpu(cpu, r);
+ if (d) {
+ r->msr_update(d, m, r);
+ return;
}
pr_warn_once("cpu %d not found in any domain for resource %s\n",
cpu, r->name);
@@ -326,8 +359,8 @@ void rdt_ctrl_update(void *arg)
* caller, return the first domain whose id is bigger than the input id.
* The domain list is sorted by id in ascending order.
*/
-static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
- struct list_head **pos)
+struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+ struct list_head **pos)
{
struct rdt_domain *d;
struct list_head *l;
@@ -377,6 +410,44 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
return 0;
}
+static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
+{
+ size_t tsize;
+
+ if (is_llc_occupancy_enabled()) {
+ d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid),
+ sizeof(unsigned long),
+ GFP_KERNEL);
+ if (!d->rmid_busy_llc)
+ return -ENOMEM;
+ INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
+ }
+ if (is_mbm_total_enabled()) {
+ tsize = sizeof(*d->mbm_total);
+ d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
+ if (!d->mbm_total) {
+ kfree(d->rmid_busy_llc);
+ return -ENOMEM;
+ }
+ }
+ if (is_mbm_local_enabled()) {
+ tsize = sizeof(*d->mbm_local);
+ d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
+ if (!d->mbm_local) {
+ kfree(d->rmid_busy_llc);
+ kfree(d->mbm_total);
+ return -ENOMEM;
+ }
+ }
+
+ if (is_mbm_enabled()) {
+ INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
+ mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL);
+ }
+
+ return 0;
+}
+
/*
* domain_add_cpu - Add a cpu to a resource's domain list.
*
@@ -412,14 +483,26 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
return;
d->id = id;
+ cpumask_set_cpu(cpu, &d->cpu_mask);
- if (domain_setup_ctrlval(r, d)) {
+ if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
+ kfree(d);
+ return;
+ }
+
+ if (r->mon_capable && domain_setup_mon_state(r, d)) {
kfree(d);
return;
}
- cpumask_set_cpu(cpu, &d->cpu_mask);
list_add_tail(&d->list, add_pos);
+
+ /*
+ * If resctrl is mounted, add
+ * per domain monitor data directories.
+ */
+ if (static_branch_unlikely(&rdt_mon_enable_key))
+ mkdir_mondata_subdir_allrdtgrp(r, d);
}
static void domain_remove_cpu(int cpu, struct rdt_resource *r)
@@ -435,19 +518,58 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
cpumask_clear_cpu(cpu, &d->cpu_mask);
if (cpumask_empty(&d->cpu_mask)) {
+ /*
+ * If resctrl is mounted, remove all the
+ * per domain monitor data directories.
+ */
+ if (static_branch_unlikely(&rdt_mon_enable_key))
+ rmdir_mondata_subdir_allrdtgrp(r, d->id);
kfree(d->ctrl_val);
+ kfree(d->rmid_busy_llc);
+ kfree(d->mbm_total);
+ kfree(d->mbm_local);
list_del(&d->list);
+ if (is_mbm_enabled())
+ cancel_delayed_work(&d->mbm_over);
+ if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) {
+ /*
+ * When a package is going down, forcefully
+ * decrement rmid->ebusy. There is no way to know
+ * that the L3 was flushed and hence may lead to
+ * incorrect counts in rare scenarios, but leaving
+ * the RMID as busy creates RMID leaks if the
+ * package never comes back.
+ */
+ __check_limbo(d, true);
+ cancel_delayed_work(&d->cqm_limbo);
+ }
+
kfree(d);
+ return;
+ }
+
+ if (r == &rdt_resources_all[RDT_RESOURCE_L3]) {
+ if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
+ cancel_delayed_work(&d->mbm_over);
+ mbm_setup_overflow_handler(d, 0);
+ }
+ if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
+ has_busy_rmid(r, d)) {
+ cancel_delayed_work(&d->cqm_limbo);
+ cqm_setup_limbo_handler(d, 0);
+ }
}
}
-static void clear_closid(int cpu)
+static void clear_closid_rmid(int cpu)
{
struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
- per_cpu(cpu_closid, cpu) = 0;
- state->closid = 0;
- wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0);
+ state->default_closid = 0;
+ state->default_rmid = 0;
+ state->cur_closid = 0;
+ state->cur_rmid = 0;
+ wrmsr(IA32_PQR_ASSOC, 0, 0);
}
static int intel_rdt_online_cpu(unsigned int cpu)
@@ -459,12 +581,23 @@ static int intel_rdt_online_cpu(unsigned int cpu)
domain_add_cpu(cpu, r);
/* The cpu is set in default rdtgroup after online. */
cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
- clear_closid(cpu);
+ clear_closid_rmid(cpu);
mutex_unlock(&rdtgroup_mutex);
return 0;
}
+static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
+{
+ struct rdtgroup *cr;
+
+ list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
+ if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) {
+ break;
+ }
+ }
+}
+
static int intel_rdt_offline_cpu(unsigned int cpu)
{
struct rdtgroup *rdtgrp;
@@ -474,10 +607,12 @@ static int intel_rdt_offline_cpu(unsigned int cpu)
for_each_capable_rdt_resource(r)
domain_remove_cpu(cpu, r);
list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
- if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask))
+ if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
+ clear_childcpus(rdtgrp, cpu);
break;
+ }
}
- clear_closid(cpu);
+ clear_closid_rmid(cpu);
mutex_unlock(&rdtgroup_mutex);
return 0;
@@ -492,7 +627,7 @@ static __init void rdt_init_padding(void)
struct rdt_resource *r;
int cl;
- for_each_capable_rdt_resource(r) {
+ for_each_alloc_capable_rdt_resource(r) {
cl = strlen(r->name);
if (cl > max_name_width)
max_name_width = cl;
@@ -502,38 +637,153 @@ static __init void rdt_init_padding(void)
}
}
-static __init bool get_rdt_resources(void)
+enum {
+ RDT_FLAG_CMT,
+ RDT_FLAG_MBM_TOTAL,
+ RDT_FLAG_MBM_LOCAL,
+ RDT_FLAG_L3_CAT,
+ RDT_FLAG_L3_CDP,
+ RDT_FLAG_L2_CAT,
+ RDT_FLAG_MBA,
+};
+
+#define RDT_OPT(idx, n, f) \
+[idx] = { \
+ .name = n, \
+ .flag = f \
+}
+
+struct rdt_options {
+ char *name;
+ int flag;
+ bool force_off, force_on;
+};
+
+static struct rdt_options rdt_options[] __initdata = {
+ RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC),
+ RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL),
+ RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL),
+ RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3),
+ RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3),
+ RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2),
+ RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA),
+};
+#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
+
+static int __init set_rdt_options(char *str)
+{
+ struct rdt_options *o;
+ bool force_off;
+ char *tok;
+
+ if (*str == '=')
+ str++;
+ while ((tok = strsep(&str, ",")) != NULL) {
+ force_off = *tok == '!';
+ if (force_off)
+ tok++;
+ for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
+ if (strcmp(tok, o->name) == 0) {
+ if (force_off)
+ o->force_off = true;
+ else
+ o->force_on = true;
+ break;
+ }
+ }
+ }
+ return 1;
+}
+__setup("rdt", set_rdt_options);
+
+static bool __init rdt_cpu_has(int flag)
+{
+ bool ret = boot_cpu_has(flag);
+ struct rdt_options *o;
+
+ if (!ret)
+ return ret;
+
+ for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
+ if (flag == o->flag) {
+ if (o->force_off)
+ ret = false;
+ if (o->force_on)
+ ret = true;
+ break;
+ }
+ }
+ return ret;
+}
+
+static __init bool get_rdt_alloc_resources(void)
{
bool ret = false;
- if (cache_alloc_hsw_probe())
+ if (rdt_alloc_capable)
return true;
if (!boot_cpu_has(X86_FEATURE_RDT_A))
return false;
- if (boot_cpu_has(X86_FEATURE_CAT_L3)) {
- rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]);
- if (boot_cpu_has(X86_FEATURE_CDP_L3)) {
+ if (rdt_cpu_has(X86_FEATURE_CAT_L3)) {
+ rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]);
+ if (rdt_cpu_has(X86_FEATURE_CDP_L3)) {
rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA);
rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE);
}
ret = true;
}
- if (boot_cpu_has(X86_FEATURE_CAT_L2)) {
+ if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
/* CPUID 0x10.2 fields are same format at 0x10.1 */
- rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]);
+ rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]);
ret = true;
}
- if (boot_cpu_has(X86_FEATURE_MBA)) {
+ if (rdt_cpu_has(X86_FEATURE_MBA)) {
if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA]))
ret = true;
}
-
return ret;
}
+static __init bool get_rdt_mon_resources(void)
+{
+ if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC))
+ rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID);
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL))
+ rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID);
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))
+ rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID);
+
+ if (!rdt_mon_features)
+ return false;
+
+ return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]);
+}
+
+static __init void rdt_quirks(void)
+{
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_HASWELL_X:
+ if (!rdt_options[RDT_FLAG_L3_CAT].force_off)
+ cache_alloc_hsw_probe();
+ break;
+ case INTEL_FAM6_SKYLAKE_X:
+ if (boot_cpu_data.x86_mask <= 4)
+ set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
+ }
+}
+
+static __init bool get_rdt_resources(void)
+{
+ rdt_quirks();
+ rdt_alloc_capable = get_rdt_alloc_resources();
+ rdt_mon_capable = get_rdt_mon_resources();
+
+ return (rdt_mon_capable || rdt_alloc_capable);
+}
+
static int __init intel_rdt_late_init(void)
{
struct rdt_resource *r;
@@ -556,9 +806,12 @@ static int __init intel_rdt_late_init(void)
return ret;
}
- for_each_capable_rdt_resource(r)
+ for_each_alloc_capable_rdt_resource(r)
pr_info("Intel RDT %s allocation detected\n", r->name);
+ for_each_mon_capable_rdt_resource(r)
+ pr_info("Intel RDT %s monitoring detected\n", r->name);
+
return 0;
}
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
new file mode 100644
index 000000000000..ebaddaeef023
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -0,0 +1,440 @@
+#ifndef _ASM_X86_INTEL_RDT_H
+#define _ASM_X86_INTEL_RDT_H
+
+#include <linux/sched.h>
+#include <linux/kernfs.h>
+#include <linux/jump_label.h>
+
+#define IA32_L3_QOS_CFG 0xc81
+#define IA32_L3_CBM_BASE 0xc90
+#define IA32_L2_CBM_BASE 0xd10
+#define IA32_MBA_THRTL_BASE 0xd50
+
+#define L3_QOS_CDP_ENABLE 0x01ULL
+
+/*
+ * Event IDs are used to program IA32_QM_EVTSEL before reading event
+ * counter from IA32_QM_CTR
+ */
+#define QOS_L3_OCCUP_EVENT_ID 0x01
+#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02
+#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03
+
+#define CQM_LIMBOCHECK_INTERVAL 1000
+
+#define MBM_CNTR_WIDTH 24
+#define MBM_OVERFLOW_INTERVAL 1000
+
+#define RMID_VAL_ERROR BIT_ULL(63)
+#define RMID_VAL_UNAVAIL BIT_ULL(62)
+
+DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+
+/**
+ * struct mon_evt - Entry in the event list of a resource
+ * @evtid: event id
+ * @name: name of the event
+ */
+struct mon_evt {
+ u32 evtid;
+ char *name;
+ struct list_head list;
+};
+
+/**
+ * struct mon_data_bits - Monitoring details for each event file
+ * @rid: Resource id associated with the event file.
+ * @evtid: Event id associated with the event file
+ * @domid: The domain to which the event file belongs
+ */
+union mon_data_bits {
+ void *priv;
+ struct {
+ unsigned int rid : 10;
+ unsigned int evtid : 8;
+ unsigned int domid : 14;
+ } u;
+};
+
+struct rmid_read {
+ struct rdtgroup *rgrp;
+ struct rdt_domain *d;
+ int evtid;
+ bool first;
+ u64 val;
+};
+
+extern unsigned int intel_cqm_threshold;
+extern bool rdt_alloc_capable;
+extern bool rdt_mon_capable;
+extern unsigned int rdt_mon_features;
+
+enum rdt_group_type {
+ RDTCTRL_GROUP = 0,
+ RDTMON_GROUP,
+ RDT_NUM_GROUP,
+};
+
+/**
+ * struct mongroup - store mon group's data in resctrl fs.
+ * @mon_data_kn kernlfs node for the mon_data directory
+ * @parent: parent rdtgrp
+ * @crdtgrp_list: child rdtgroup node list
+ * @rmid: rmid for this rdtgroup
+ */
+struct mongroup {
+ struct kernfs_node *mon_data_kn;
+ struct rdtgroup *parent;
+ struct list_head crdtgrp_list;
+ u32 rmid;
+};
+
+/**
+ * struct rdtgroup - store rdtgroup's data in resctrl file system.
+ * @kn: kernfs node
+ * @rdtgroup_list: linked list for all rdtgroups
+ * @closid: closid for this rdtgroup
+ * @cpu_mask: CPUs assigned to this rdtgroup
+ * @flags: status bits
+ * @waitcount: how many cpus expect to find this
+ * group when they acquire rdtgroup_mutex
+ * @type: indicates type of this rdtgroup - either
+ * monitor only or ctrl_mon group
+ * @mon: mongroup related data
+ */
+struct rdtgroup {
+ struct kernfs_node *kn;
+ struct list_head rdtgroup_list;
+ u32 closid;
+ struct cpumask cpu_mask;
+ int flags;
+ atomic_t waitcount;
+ enum rdt_group_type type;
+ struct mongroup mon;
+};
+
+/* rdtgroup.flags */
+#define RDT_DELETED 1
+
+/* rftype.flags */
+#define RFTYPE_FLAGS_CPUS_LIST 1
+
+/*
+ * Define the file type flags for base and info directories.
+ */
+#define RFTYPE_INFO BIT(0)
+#define RFTYPE_BASE BIT(1)
+#define RF_CTRLSHIFT 4
+#define RF_MONSHIFT 5
+#define RFTYPE_CTRL BIT(RF_CTRLSHIFT)
+#define RFTYPE_MON BIT(RF_MONSHIFT)
+#define RFTYPE_RES_CACHE BIT(8)
+#define RFTYPE_RES_MB BIT(9)
+#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL)
+#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON)
+#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL)
+
+/* List of all resource groups */
+extern struct list_head rdt_all_groups;
+
+extern int max_name_width, max_data_width;
+
+int __init rdtgroup_init(void);
+
+/**
+ * struct rftype - describe each file in the resctrl file system
+ * @name: File name
+ * @mode: Access mode
+ * @kf_ops: File operations
+ * @flags: File specific RFTYPE_FLAGS_* flags
+ * @fflags: File specific RF_* or RFTYPE_* flags
+ * @seq_show: Show content of the file
+ * @write: Write to the file
+ */
+struct rftype {
+ char *name;
+ umode_t mode;
+ struct kernfs_ops *kf_ops;
+ unsigned long flags;
+ unsigned long fflags;
+
+ int (*seq_show)(struct kernfs_open_file *of,
+ struct seq_file *sf, void *v);
+ /*
+ * write() is the generic write callback which maps directly to
+ * kernfs write operation and overrides all other operations.
+ * Maximum write size is determined by ->max_write_len.
+ */
+ ssize_t (*write)(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+};
+
+/**
+ * struct mbm_state - status for each MBM counter in each domain
+ * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes)
+ * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it
+ */
+struct mbm_state {
+ u64 chunks;
+ u64 prev_msr;
+};
+
+/**
+ * struct rdt_domain - group of cpus sharing an RDT resource
+ * @list: all instances of this resource
+ * @id: unique id for this instance
+ * @cpu_mask: which cpus share this resource
+ * @rmid_busy_llc:
+ * bitmap of which limbo RMIDs are above threshold
+ * @mbm_total: saved state for MBM total bandwidth
+ * @mbm_local: saved state for MBM local bandwidth
+ * @mbm_over: worker to periodically read MBM h/w counters
+ * @cqm_limbo: worker to periodically read CQM h/w counters
+ * @mbm_work_cpu:
+ * worker cpu for MBM h/w counters
+ * @cqm_work_cpu:
+ * worker cpu for CQM h/w counters
+ * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
+ * @new_ctrl: new ctrl value to be loaded
+ * @have_new_ctrl: did user provide new_ctrl for this domain
+ */
+struct rdt_domain {
+ struct list_head list;
+ int id;
+ struct cpumask cpu_mask;
+ unsigned long *rmid_busy_llc;
+ struct mbm_state *mbm_total;
+ struct mbm_state *mbm_local;
+ struct delayed_work mbm_over;
+ struct delayed_work cqm_limbo;
+ int mbm_work_cpu;
+ int cqm_work_cpu;
+ u32 *ctrl_val;
+ u32 new_ctrl;
+ bool have_new_ctrl;
+};
+
+/**
+ * struct msr_param - set a range of MSRs from a domain
+ * @res: The resource to use
+ * @low: Beginning index from base MSR
+ * @high: End index
+ */
+struct msr_param {
+ struct rdt_resource *res;
+ int low;
+ int high;
+};
+
+/**
+ * struct rdt_cache - Cache allocation related data
+ * @cbm_len: Length of the cache bit mask
+ * @min_cbm_bits: Minimum number of consecutive bits to be set
+ * @cbm_idx_mult: Multiplier of CBM index
+ * @cbm_idx_offset: Offset of CBM index. CBM index is computed by:
+ * closid * cbm_idx_multi + cbm_idx_offset
+ * in a cache bit mask
+ * @shareable_bits: Bitmask of shareable resource with other
+ * executing entities
+ */
+struct rdt_cache {
+ unsigned int cbm_len;
+ unsigned int min_cbm_bits;
+ unsigned int cbm_idx_mult;
+ unsigned int cbm_idx_offset;
+ unsigned int shareable_bits;
+};
+
+/**
+ * struct rdt_membw - Memory bandwidth allocation related data
+ * @max_delay: Max throttle delay. Delay is the hardware
+ * representation for memory bandwidth.
+ * @min_bw: Minimum memory bandwidth percentage user can request
+ * @bw_gran: Granularity at which the memory bandwidth is allocated
+ * @delay_linear: True if memory B/W delay is in linear scale
+ * @mb_map: Mapping of memory B/W percentage to memory B/W delay
+ */
+struct rdt_membw {
+ u32 max_delay;
+ u32 min_bw;
+ u32 bw_gran;
+ u32 delay_linear;
+ u32 *mb_map;
+};
+
+static inline bool is_llc_occupancy_enabled(void)
+{
+ return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID));
+}
+
+static inline bool is_mbm_total_enabled(void)
+{
+ return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID));
+}
+
+static inline bool is_mbm_local_enabled(void)
+{
+ return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID));
+}
+
+static inline bool is_mbm_enabled(void)
+{
+ return (is_mbm_total_enabled() || is_mbm_local_enabled());
+}
+
+static inline bool is_mbm_event(int e)
+{
+ return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
+ e <= QOS_L3_MBM_LOCAL_EVENT_ID);
+}
+
+/**
+ * struct rdt_resource - attributes of an RDT resource
+ * @rid: The index of the resource
+ * @alloc_enabled: Is allocation enabled on this machine
+ * @mon_enabled: Is monitoring enabled for this feature
+ * @alloc_capable: Is allocation available on this machine
+ * @mon_capable: Is monitor feature available on this machine
+ * @name: Name to use in "schemata" file
+ * @num_closid: Number of CLOSIDs available
+ * @cache_level: Which cache level defines scope of this resource
+ * @default_ctrl: Specifies default cache cbm or memory B/W percent.
+ * @msr_base: Base MSR address for CBMs
+ * @msr_update: Function pointer to update QOS MSRs
+ * @data_width: Character width of data when displaying
+ * @domains: All domains for this resource
+ * @cache: Cache allocation related data
+ * @format_str: Per resource format string to show domain value
+ * @parse_ctrlval: Per resource function pointer to parse control values
+ * @evt_list: List of monitoring events
+ * @num_rmid: Number of RMIDs available
+ * @mon_scale: cqm counter * mon_scale = occupancy in bytes
+ * @fflags: flags to choose base and info files
+ */
+struct rdt_resource {
+ int rid;
+ bool alloc_enabled;
+ bool mon_enabled;
+ bool alloc_capable;
+ bool mon_capable;
+ char *name;
+ int num_closid;
+ int cache_level;
+ u32 default_ctrl;
+ unsigned int msr_base;
+ void (*msr_update) (struct rdt_domain *d, struct msr_param *m,
+ struct rdt_resource *r);
+ int data_width;
+ struct list_head domains;
+ struct rdt_cache cache;
+ struct rdt_membw membw;
+ const char *format_str;
+ int (*parse_ctrlval) (char *buf, struct rdt_resource *r,
+ struct rdt_domain *d);
+ struct list_head evt_list;
+ int num_rmid;
+ unsigned int mon_scale;
+ unsigned long fflags;
+};
+
+int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d);
+int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d);
+
+extern struct mutex rdtgroup_mutex;
+
+extern struct rdt_resource rdt_resources_all[];
+extern struct rdtgroup rdtgroup_default;
+DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+
+int __init rdtgroup_init(void);
+
+enum {
+ RDT_RESOURCE_L3,
+ RDT_RESOURCE_L3DATA,
+ RDT_RESOURCE_L3CODE,
+ RDT_RESOURCE_L2,
+ RDT_RESOURCE_MBA,
+
+ /* Must be the last */
+ RDT_NUM_RESOURCES,
+};
+
+#define for_each_capable_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->alloc_capable || r->mon_capable)
+
+#define for_each_alloc_capable_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->alloc_capable)
+
+#define for_each_mon_capable_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->mon_capable)
+
+#define for_each_alloc_enabled_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->alloc_enabled)
+
+#define for_each_mon_enabled_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->mon_enabled)
+
+/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
+union cpuid_0x10_1_eax {
+ struct {
+ unsigned int cbm_len:5;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID=3).EAX */
+union cpuid_0x10_3_eax {
+ struct {
+ unsigned int max_delay:12;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID).EDX */
+union cpuid_0x10_x_edx {
+ struct {
+ unsigned int cos_max:16;
+ } split;
+ unsigned int full;
+};
+
+void rdt_ctrl_update(void *arg);
+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
+void rdtgroup_kn_unlock(struct kernfs_node *kn);
+struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+ struct list_head **pos);
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v);
+struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
+int alloc_rmid(void);
+void free_rmid(u32 rmid);
+int rdt_get_mon_l3_config(struct rdt_resource *r);
+void mon_event_count(void *info);
+int rdtgroup_mondata_show(struct seq_file *m, void *arg);
+void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+ unsigned int dom_id);
+void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+ struct rdt_domain *d);
+void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
+ struct rdtgroup *rdtgrp, int evtid, int first);
+void mbm_setup_overflow_handler(struct rdt_domain *dom,
+ unsigned long delay_ms);
+void mbm_handle_overflow(struct work_struct *work);
+void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
+void cqm_handle_limbo(struct work_struct *work);
+bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
+void __check_limbo(struct rdt_domain *d, bool force_free);
+
+#endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
index 406d7a6532f9..f6ea94f8954a 100644
--- a/arch/x86/kernel/cpu/intel_rdt_schemata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -26,7 +26,7 @@
#include <linux/kernfs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
-#include <asm/intel_rdt.h>
+#include "intel_rdt.h"
/*
* Check whether MBA bandwidth percentage value is correct. The value is
@@ -192,7 +192,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, int closid)
{
struct rdt_resource *r;
- for_each_enabled_rdt_resource(r) {
+ for_each_alloc_enabled_rdt_resource(r) {
if (!strcmp(resname, r->name) && closid < r->num_closid)
return parse_line(tok, r);
}
@@ -221,7 +221,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
closid = rdtgrp->closid;
- for_each_enabled_rdt_resource(r) {
+ for_each_alloc_enabled_rdt_resource(r) {
list_for_each_entry(dom, &r->domains, list)
dom->have_new_ctrl = false;
}
@@ -237,7 +237,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
goto out;
}
- for_each_enabled_rdt_resource(r) {
+ for_each_alloc_enabled_rdt_resource(r) {
ret = update_domains(r, closid);
if (ret)
goto out;
@@ -269,12 +269,13 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
{
struct rdtgroup *rdtgrp;
struct rdt_resource *r;
- int closid, ret = 0;
+ int ret = 0;
+ u32 closid;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (rdtgrp) {
closid = rdtgrp->closid;
- for_each_enabled_rdt_resource(r) {
+ for_each_alloc_enabled_rdt_resource(r) {
if (closid < r->num_closid)
show_doms(s, r, closid);
}
@@ -284,3 +285,57 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
rdtgroup_kn_unlock(of->kn);
return ret;
}
+
+void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
+ struct rdtgroup *rdtgrp, int evtid, int first)
+{
+ /*
+ * setup the parameters to send to the IPI to read the data.
+ */
+ rr->rgrp = rdtgrp;
+ rr->evtid = evtid;
+ rr->d = d;
+ rr->val = 0;
+ rr->first = first;
+
+ smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1);
+}
+
+int rdtgroup_mondata_show(struct seq_file *m, void *arg)
+{
+ struct kernfs_open_file *of = m->private;
+ u32 resid, evtid, domid;
+ struct rdtgroup *rdtgrp;
+ struct rdt_resource *r;
+ union mon_data_bits md;
+ struct rdt_domain *d;
+ struct rmid_read rr;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+ md.priv = of->kn->priv;
+ resid = md.u.rid;
+ domid = md.u.domid;
+ evtid = md.u.evtid;
+
+ r = &rdt_resources_all[resid];
+ d = rdt_find_domain(r, domid, NULL);
+ if (!d) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ mon_event_read(&rr, d, rdtgrp, evtid, false);
+
+ if (rr.val & RMID_VAL_ERROR)
+ seq_puts(m, "Error\n");
+ else if (rr.val & RMID_VAL_UNAVAIL)
+ seq_puts(m, "Unavailable\n");
+ else
+ seq_printf(m, "%llu\n", rr.val * r->mon_scale);
+
+out:
+ rdtgroup_kn_unlock(of->kn);
+ return ret;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
new file mode 100644
index 000000000000..30827510094b
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -0,0 +1,499 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Monitoring code
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author:
+ * Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * This replaces the cqm.c based on perf but we reuse a lot of
+ * code and datastructures originally from Peter Zijlstra and Matt Fleming.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/cpu_device_id.h>
+#include "intel_rdt.h"
+
+#define MSR_IA32_QM_CTR 0x0c8e
+#define MSR_IA32_QM_EVTSEL 0x0c8d
+
+struct rmid_entry {
+ u32 rmid;
+ int busy;
+ struct list_head list;
+};
+
+/**
+ * @rmid_free_lru A least recently used list of free RMIDs
+ * These RMIDs are guaranteed to have an occupancy less than the
+ * threshold occupancy
+ */
+static LIST_HEAD(rmid_free_lru);
+
+/**
+ * @rmid_limbo_count count of currently unused but (potentially)
+ * dirty RMIDs.
+ * This counts RMIDs that no one is currently using but that
+ * may have a occupancy value > intel_cqm_threshold. User can change
+ * the threshold occupancy value.
+ */
+unsigned int rmid_limbo_count;
+
+/**
+ * @rmid_entry - The entry in the limbo and free lists.
+ */
+static struct rmid_entry *rmid_ptrs;
+
+/*
+ * Global boolean for rdt_monitor which is true if any
+ * resource monitoring is enabled.
+ */
+bool rdt_mon_capable;
+
+/*
+ * Global to indicate which monitoring events are enabled.
+ */
+unsigned int rdt_mon_features;
+
+/*
+ * This is the threshold cache occupancy at which we will consider an
+ * RMID available for re-allocation.
+ */
+unsigned int intel_cqm_threshold;
+
+static inline struct rmid_entry *__rmid_entry(u32 rmid)
+{
+ struct rmid_entry *entry;
+
+ entry = &rmid_ptrs[rmid];
+ WARN_ON(entry->rmid != rmid);
+
+ return entry;
+}
+
+static u64 __rmid_read(u32 rmid, u32 eventid)
+{
+ u64 val;
+
+ /*
+ * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
+ * with a valid event code for supported resource type and the bits
+ * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
+ * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
+ * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
+ * are error bits.
+ */
+ wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+ rdmsrl(MSR_IA32_QM_CTR, val);
+
+ return val;
+}
+
+static bool rmid_dirty(struct rmid_entry *entry)
+{
+ u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
+
+ return val >= intel_cqm_threshold;
+}
+
+/*
+ * Check the RMIDs that are marked as busy for this domain. If the
+ * reported LLC occupancy is below the threshold clear the busy bit and
+ * decrement the count. If the busy count gets to zero on an RMID, we
+ * free the RMID
+ */
+void __check_limbo(struct rdt_domain *d, bool force_free)
+{
+ struct rmid_entry *entry;
+ struct rdt_resource *r;
+ u32 crmid = 1, nrmid;
+
+ r = &rdt_resources_all[RDT_RESOURCE_L3];
+
+ /*
+ * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
+ * are marked as busy for occupancy < threshold. If the occupancy
+ * is less than the threshold decrement the busy counter of the
+ * RMID and move it to the free list when the counter reaches 0.
+ */
+ for (;;) {
+ nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid);
+ if (nrmid >= r->num_rmid)
+ break;
+
+ entry = __rmid_entry(nrmid);
+ if (force_free || !rmid_dirty(entry)) {
+ clear_bit(entry->rmid, d->rmid_busy_llc);
+ if (!--entry->busy) {
+ rmid_limbo_count--;
+ list_add_tail(&entry->list, &rmid_free_lru);
+ }
+ }
+ crmid = nrmid + 1;
+ }
+}
+
+bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d)
+{
+ return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid;
+}
+
+/*
+ * As of now the RMIDs allocation is global.
+ * However we keep track of which packages the RMIDs
+ * are used to optimize the limbo list management.
+ */
+int alloc_rmid(void)
+{
+ struct rmid_entry *entry;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ if (list_empty(&rmid_free_lru))
+ return rmid_limbo_count ? -EBUSY : -ENOSPC;
+
+ entry = list_first_entry(&rmid_free_lru,
+ struct rmid_entry, list);
+ list_del(&entry->list);
+
+ return entry->rmid;
+}
+
+static void add_rmid_to_limbo(struct rmid_entry *entry)
+{
+ struct rdt_resource *r;
+ struct rdt_domain *d;
+ int cpu;
+ u64 val;
+
+ r = &rdt_resources_all[RDT_RESOURCE_L3];
+
+ entry->busy = 0;
+ cpu = get_cpu();
+ list_for_each_entry(d, &r->domains, list) {
+ if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
+ val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
+ if (val <= intel_cqm_threshold)
+ continue;
+ }
+
+ /*
+ * For the first limbo RMID in the domain,
+ * setup up the limbo worker.
+ */
+ if (!has_busy_rmid(r, d))
+ cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL);
+ set_bit(entry->rmid, d->rmid_busy_llc);
+ entry->busy++;
+ }
+ put_cpu();
+
+ if (entry->busy)
+ rmid_limbo_count++;
+ else
+ list_add_tail(&entry->list, &rmid_free_lru);
+}
+
+void free_rmid(u32 rmid)
+{
+ struct rmid_entry *entry;
+
+ if (!rmid)
+ return;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ entry = __rmid_entry(rmid);
+
+ if (is_llc_occupancy_enabled())
+ add_rmid_to_limbo(entry);
+ else
+ list_add_tail(&entry->list, &rmid_free_lru);
+}
+
+static int __mon_event_count(u32 rmid, struct rmid_read *rr)
+{
+ u64 chunks, shift, tval;
+ struct mbm_state *m;
+
+ tval = __rmid_read(rmid, rr->evtid);
+ if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
+ rr->val = tval;
+ return -EINVAL;
+ }
+ switch (rr->evtid) {
+ case QOS_L3_OCCUP_EVENT_ID:
+ rr->val += tval;
+ return 0;
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ m = &rr->d->mbm_total[rmid];
+ break;
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ m = &rr->d->mbm_local[rmid];
+ break;
+ default:
+ /*
+ * Code would never reach here because
+ * an invalid event id would fail the __rmid_read.
+ */
+ return -EINVAL;
+ }
+
+ if (rr->first) {
+ m->prev_msr = tval;
+ m->chunks = 0;
+ return 0;
+ }
+
+ shift = 64 - MBM_CNTR_WIDTH;
+ chunks = (tval << shift) - (m->prev_msr << shift);
+ chunks >>= shift;
+ m->chunks += chunks;
+ m->prev_msr = tval;
+
+ rr->val += m->chunks;
+ return 0;
+}
+
+/*
+ * This is called via IPI to read the CQM/MBM counters
+ * on a domain.
+ */
+void mon_event_count(void *info)
+{
+ struct rdtgroup *rdtgrp, *entry;
+ struct rmid_read *rr = info;
+ struct list_head *head;
+
+ rdtgrp = rr->rgrp;
+
+ if (__mon_event_count(rdtgrp->mon.rmid, rr))
+ return;
+
+ /*
+ * For Ctrl groups read data from child monitor groups.
+ */
+ head = &rdtgrp->mon.crdtgrp_list;
+
+ if (rdtgrp->type == RDTCTRL_GROUP) {
+ list_for_each_entry(entry, head, mon.crdtgrp_list) {
+ if (__mon_event_count(entry->mon.rmid, rr))
+ return;
+ }
+ }
+}
+
+static void mbm_update(struct rdt_domain *d, int rmid)
+{
+ struct rmid_read rr;
+
+ rr.first = false;
+ rr.d = d;
+
+ /*
+ * This is protected from concurrent reads from user
+ * as both the user and we hold the global mutex.
+ */
+ if (is_mbm_total_enabled()) {
+ rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
+ __mon_event_count(rmid, &rr);
+ }
+ if (is_mbm_local_enabled()) {
+ rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
+ __mon_event_count(rmid, &rr);
+ }
+}
+
+/*
+ * Handler to scan the limbo list and move the RMIDs
+ * to free list whose occupancy < threshold_occupancy.
+ */
+void cqm_handle_limbo(struct work_struct *work)
+{
+ unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
+ int cpu = smp_processor_id();
+ struct rdt_resource *r;
+ struct rdt_domain *d;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ r = &rdt_resources_all[RDT_RESOURCE_L3];
+ d = get_domain_from_cpu(cpu, r);
+
+ if (!d) {
+ pr_warn_once("Failure to get domain for limbo worker\n");
+ goto out_unlock;
+ }
+
+ __check_limbo(d, false);
+
+ if (has_busy_rmid(r, d))
+ schedule_delayed_work_on(cpu, &d->cqm_limbo, delay);
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms)
+{
+ unsigned long delay = msecs_to_jiffies(delay_ms);
+ struct rdt_resource *r;
+ int cpu;
+
+ r = &rdt_resources_all[RDT_RESOURCE_L3];
+
+ cpu = cpumask_any(&dom->cpu_mask);
+ dom->cqm_work_cpu = cpu;
+
+ schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
+}
+
+void mbm_handle_overflow(struct work_struct *work)
+{
+ unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
+ struct rdtgroup *prgrp, *crgrp;
+ int cpu = smp_processor_id();
+ struct list_head *head;
+ struct rdt_domain *d;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ if (!static_branch_likely(&rdt_enable_key))
+ goto out_unlock;
+
+ d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]);
+ if (!d)
+ goto out_unlock;
+
+ list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+ mbm_update(d, prgrp->mon.rmid);
+
+ head = &prgrp->mon.crdtgrp_list;
+ list_for_each_entry(crgrp, head, mon.crdtgrp_list)
+ mbm_update(d, crgrp->mon.rmid);
+ }
+
+ schedule_delayed_work_on(cpu, &d->mbm_over, delay);
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
+{
+ unsigned long delay = msecs_to_jiffies(delay_ms);
+ int cpu;
+
+ if (!static_branch_likely(&rdt_enable_key))
+ return;
+ cpu = cpumask_any(&dom->cpu_mask);
+ dom->mbm_work_cpu = cpu;
+ schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
+}
+
+static int dom_data_init(struct rdt_resource *r)
+{
+ struct rmid_entry *entry = NULL;
+ int i, nr_rmids;
+
+ nr_rmids = r->num_rmid;
+ rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL);
+ if (!rmid_ptrs)
+ return -ENOMEM;
+
+ for (i = 0; i < nr_rmids; i++) {
+ entry = &rmid_ptrs[i];
+ INIT_LIST_HEAD(&entry->list);
+
+ entry->rmid = i;
+ list_add_tail(&entry->list, &rmid_free_lru);
+ }
+
+ /*
+ * RMID 0 is special and is always allocated. It's used for all
+ * tasks that are not monitored.
+ */
+ entry = __rmid_entry(0);
+ list_del(&entry->list);
+
+ return 0;
+}
+
+static struct mon_evt llc_occupancy_event = {
+ .name = "llc_occupancy",
+ .evtid = QOS_L3_OCCUP_EVENT_ID,
+};
+
+static struct mon_evt mbm_total_event = {
+ .name = "mbm_total_bytes",
+ .evtid = QOS_L3_MBM_TOTAL_EVENT_ID,
+};
+
+static struct mon_evt mbm_local_event = {
+ .name = "mbm_local_bytes",
+ .evtid = QOS_L3_MBM_LOCAL_EVENT_ID,
+};
+
+/*
+ * Initialize the event list for the resource.
+ *
+ * Note that MBM events are also part of RDT_RESOURCE_L3 resource
+ * because as per the SDM the total and local memory bandwidth
+ * are enumerated as part of L3 monitoring.
+ */
+static void l3_mon_evt_init(struct rdt_resource *r)
+{
+ INIT_LIST_HEAD(&r->evt_list);
+
+ if (is_llc_occupancy_enabled())
+ list_add_tail(&llc_occupancy_event.list, &r->evt_list);
+ if (is_mbm_total_enabled())
+ list_add_tail(&mbm_total_event.list, &r->evt_list);
+ if (is_mbm_local_enabled())
+ list_add_tail(&mbm_local_event.list, &r->evt_list);
+}
+
+int rdt_get_mon_l3_config(struct rdt_resource *r)
+{
+ int ret;
+
+ r->mon_scale = boot_cpu_data.x86_cache_occ_scale;
+ r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
+
+ /*
+ * A reasonable upper limit on the max threshold is the number
+ * of lines tagged per RMID if all RMIDs have the same number of
+ * lines tagged in the LLC.
+ *
+ * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+ */
+ intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid;
+
+ /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
+ intel_cqm_threshold /= r->mon_scale;
+
+ ret = dom_data_init(r);
+ if (ret)
+ return ret;
+
+ l3_mon_evt_init(r);
+
+ r->mon_capable = true;
+ r->mon_enabled = true;
+
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index 9257bd9dc664..a869d4a073c5 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -32,17 +32,25 @@
#include <uapi/linux/magic.h>
-#include <asm/intel_rdt.h>
-#include <asm/intel_rdt_common.h>
+#include <asm/intel_rdt_sched.h>
+#include "intel_rdt.h"
DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
-struct kernfs_root *rdt_root;
+DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
+DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+static struct kernfs_root *rdt_root;
struct rdtgroup rdtgroup_default;
LIST_HEAD(rdt_all_groups);
/* Kernel fs node for "info" directory under root */
static struct kernfs_node *kn_info;
+/* Kernel fs node for "mon_groups" directory under root */
+static struct kernfs_node *kn_mongrp;
+
+/* Kernel fs node for "mon_data" directory under root */
+static struct kernfs_node *kn_mondata;
+
/*
* Trivial allocator for CLOSIDs. Since h/w only supports a small number,
* we can keep a bitmap of free CLOSIDs in a single integer.
@@ -66,7 +74,7 @@ static void closid_init(void)
int rdt_min_closid = 32;
/* Compute rdt_min_closid across all resources */
- for_each_enabled_rdt_resource(r)
+ for_each_alloc_enabled_rdt_resource(r)
rdt_min_closid = min(rdt_min_closid, r->num_closid);
closid_free_map = BIT_MASK(rdt_min_closid) - 1;
@@ -75,9 +83,9 @@ static void closid_init(void)
closid_free_map &= ~1;
}
-int closid_alloc(void)
+static int closid_alloc(void)
{
- int closid = ffs(closid_free_map);
+ u32 closid = ffs(closid_free_map);
if (closid == 0)
return -ENOSPC;
@@ -125,28 +133,6 @@ static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
return 0;
}
-static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts,
- int len)
-{
- struct rftype *rft;
- int ret;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- for (rft = rfts; rft < rfts + len; rft++) {
- ret = rdtgroup_add_file(kn, rft);
- if (ret)
- goto error;
- }
-
- return 0;
-error:
- pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
- while (--rft >= rfts)
- kernfs_remove_by_name(kn, rft->name);
- return ret;
-}
-
static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
{
struct kernfs_open_file *of = m->private;
@@ -174,6 +160,11 @@ static struct kernfs_ops rdtgroup_kf_single_ops = {
.seq_show = rdtgroup_seqfile_show,
};
+static struct kernfs_ops kf_mondata_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .seq_show = rdtgroup_mondata_show,
+};
+
static bool is_cpu_list(struct kernfs_open_file *of)
{
struct rftype *rft = of->kn->priv;
@@ -203,13 +194,18 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
/*
* This is safe against intel_rdt_sched_in() called from __switch_to()
* because __switch_to() is executed with interrupts disabled. A local call
- * from rdt_update_closid() is proteced against __switch_to() because
+ * from update_closid_rmid() is proteced against __switch_to() because
* preemption is disabled.
*/
-static void rdt_update_cpu_closid(void *closid)
+static void update_cpu_closid_rmid(void *info)
{
- if (closid)
- this_cpu_write(cpu_closid, *(int *)closid);
+ struct rdtgroup *r = info;
+
+ if (r) {
+ this_cpu_write(pqr_state.default_closid, r->closid);
+ this_cpu_write(pqr_state.default_rmid, r->mon.rmid);
+ }
+
/*
* We cannot unconditionally write the MSR because the current
* executing task might have its own closid selected. Just reuse
@@ -221,28 +217,128 @@ static void rdt_update_cpu_closid(void *closid)
/*
* Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
*
- * Per task closids must have been set up before calling this function.
- *
- * The per cpu closids are updated with the smp function call, when @closid
- * is not NULL. If @closid is NULL then all affected percpu closids must
- * have been set up before calling this function.
+ * Per task closids/rmids must have been set up before calling this function.
*/
static void
-rdt_update_closid(const struct cpumask *cpu_mask, int *closid)
+update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
{
int cpu = get_cpu();
if (cpumask_test_cpu(cpu, cpu_mask))
- rdt_update_cpu_closid(closid);
- smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1);
+ update_cpu_closid_rmid(r);
+ smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1);
put_cpu();
}
+static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
+ cpumask_var_t tmpmask)
+{
+ struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
+ struct list_head *head;
+
+ /* Check whether cpus belong to parent ctrl group */
+ cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
+ if (cpumask_weight(tmpmask))
+ return -EINVAL;
+
+ /* Check whether cpus are dropped from this group */
+ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
+ if (cpumask_weight(tmpmask)) {
+ /* Give any dropped cpus to parent rdtgroup */
+ cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
+ update_closid_rmid(tmpmask, prgrp);
+ }
+
+ /*
+ * If we added cpus, remove them from previous group that owned them
+ * and update per-cpu rmid
+ */
+ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
+ if (cpumask_weight(tmpmask)) {
+ head = &prgrp->mon.crdtgrp_list;
+ list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
+ if (crgrp == rdtgrp)
+ continue;
+ cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
+ tmpmask);
+ }
+ update_closid_rmid(tmpmask, rdtgrp);
+ }
+
+ /* Done pushing/pulling - update this group with new mask */
+ cpumask_copy(&rdtgrp->cpu_mask, newmask);
+
+ return 0;
+}
+
+static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
+{
+ struct rdtgroup *crgrp;
+
+ cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
+ /* update the child mon group masks as well*/
+ list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
+ cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
+}
+
+static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
+ cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
+{
+ struct rdtgroup *r, *crgrp;
+ struct list_head *head;
+
+ /* Check whether cpus are dropped from this group */
+ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
+ if (cpumask_weight(tmpmask)) {
+ /* Can't drop from default group */
+ if (rdtgrp == &rdtgroup_default)
+ return -EINVAL;
+
+ /* Give any dropped cpus to rdtgroup_default */
+ cpumask_or(&rdtgroup_default.cpu_mask,
+ &rdtgroup_default.cpu_mask, tmpmask);
+ update_closid_rmid(tmpmask, &rdtgroup_default);
+ }
+
+ /*
+ * If we added cpus, remove them from previous group and
+ * the prev group's child groups that owned them
+ * and update per-cpu closid/rmid.
+ */
+ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
+ if (cpumask_weight(tmpmask)) {
+ list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
+ if (r == rdtgrp)
+ continue;
+ cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
+ if (cpumask_weight(tmpmask1))
+ cpumask_rdtgrp_clear(r, tmpmask1);
+ }
+ update_closid_rmid(tmpmask, rdtgrp);
+ }
+
+ /* Done pushing/pulling - update this group with new mask */
+ cpumask_copy(&rdtgrp->cpu_mask, newmask);
+
+ /*
+ * Clear child mon group masks since there is a new parent mask
+ * now and update the rmid for the cpus the child lost.
+ */
+ head = &rdtgrp->mon.crdtgrp_list;
+ list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
+ cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
+ update_closid_rmid(tmpmask, rdtgrp);
+ cpumask_clear(&crgrp->cpu_mask);
+ }
+
+ return 0;
+}
+
static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
- cpumask_var_t tmpmask, newmask;
- struct rdtgroup *rdtgrp, *r;
+ cpumask_var_t tmpmask, newmask, tmpmask1;
+ struct rdtgroup *rdtgrp;
int ret;
if (!buf)
@@ -254,6 +350,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
free_cpumask_var(tmpmask);
return -ENOMEM;
}
+ if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
+ free_cpumask_var(tmpmask);
+ free_cpumask_var(newmask);
+ return -ENOMEM;
+ }
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (!rdtgrp) {
@@ -276,41 +377,18 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
goto unlock;
}
- /* Check whether cpus are dropped from this group */
- cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
- if (cpumask_weight(tmpmask)) {
- /* Can't drop from default group */
- if (rdtgrp == &rdtgroup_default) {
- ret = -EINVAL;
- goto unlock;
- }
- /* Give any dropped cpus to rdtgroup_default */
- cpumask_or(&rdtgroup_default.cpu_mask,
- &rdtgroup_default.cpu_mask, tmpmask);
- rdt_update_closid(tmpmask, &rdtgroup_default.closid);
- }
-
- /*
- * If we added cpus, remove them from previous group that owned them
- * and update per-cpu closid
- */
- cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
- if (cpumask_weight(tmpmask)) {
- list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
- if (r == rdtgrp)
- continue;
- cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask);
- }
- rdt_update_closid(tmpmask, &rdtgrp->closid);
- }
-
- /* Done pushing/pulling - update this group with new mask */
- cpumask_copy(&rdtgrp->cpu_mask, newmask);
+ if (rdtgrp->type == RDTCTRL_GROUP)
+ ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
+ else if (rdtgrp->type == RDTMON_GROUP)
+ ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
+ else
+ ret = -EINVAL;
unlock:
rdtgroup_kn_unlock(of->kn);
free_cpumask_var(tmpmask);
free_cpumask_var(newmask);
+ free_cpumask_var(tmpmask1);
return ret ?: nbytes;
}
@@ -336,6 +414,7 @@ static void move_myself(struct callback_head *head)
if (atomic_dec_and_test(&rdtgrp->waitcount) &&
(rdtgrp->flags & RDT_DELETED)) {
current->closid = 0;
+ current->rmid = 0;
kfree(rdtgrp);
}
@@ -374,7 +453,20 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
atomic_dec(&rdtgrp->waitcount);
kfree(callback);
} else {
- tsk->closid = rdtgrp->closid;
+ /*
+ * For ctrl_mon groups move both closid and rmid.
+ * For monitor groups, can move the tasks only from
+ * their parent CTRL group.
+ */
+ if (rdtgrp->type == RDTCTRL_GROUP) {
+ tsk->closid = rdtgrp->closid;
+ tsk->rmid = rdtgrp->mon.rmid;
+ } else if (rdtgrp->type == RDTMON_GROUP) {
+ if (rdtgrp->mon.parent->closid == tsk->closid)
+ tsk->rmid = rdtgrp->mon.rmid;
+ else
+ ret = -EINVAL;
+ }
}
return ret;
}
@@ -454,7 +546,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
rcu_read_lock();
for_each_process_thread(p, t) {
- if (t->closid == r->closid)
+ if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
+ (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid))
seq_printf(s, "%d\n", t->pid);
}
rcu_read_unlock();
@@ -476,39 +569,6 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of,
return ret;
}
-/* Files in each rdtgroup */
-static struct rftype rdtgroup_base_files[] = {
- {
- .name = "cpus",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_cpus_write,
- .seq_show = rdtgroup_cpus_show,
- },
- {
- .name = "cpus_list",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_cpus_write,
- .seq_show = rdtgroup_cpus_show,
- .flags = RFTYPE_FLAGS_CPUS_LIST,
- },
- {
- .name = "tasks",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_tasks_write,
- .seq_show = rdtgroup_tasks_show,
- },
- {
- .name = "schemata",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_schemata_write,
- .seq_show = rdtgroup_schemata_show,
- },
-};
-
static int rdt_num_closids_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
@@ -536,6 +596,15 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
return 0;
}
+static int rdt_shareable_bits_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%x\n", r->cache.shareable_bits);
+ return 0;
+}
+
static int rdt_min_bw_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
@@ -545,6 +614,28 @@ static int rdt_min_bw_show(struct kernfs_open_file *of,
return 0;
}
+static int rdt_num_rmids_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%d\n", r->num_rmid);
+
+ return 0;
+}
+
+static int rdt_mon_features_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ struct mon_evt *mevt;
+
+ list_for_each_entry(mevt, &r->evt_list, list)
+ seq_printf(seq, "%s\n", mevt->name);
+
+ return 0;
+}
+
static int rdt_bw_gran_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
@@ -563,74 +654,200 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of,
return 0;
}
+static int max_threshold_occ_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale);
+
+ return 0;
+}
+
+static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ unsigned int bytes;
+ int ret;
+
+ ret = kstrtouint(buf, 0, &bytes);
+ if (ret)
+ return ret;
+
+ if (bytes > (boot_cpu_data.x86_cache_size * 1024))
+ return -EINVAL;
+
+ intel_cqm_threshold = bytes / r->mon_scale;
+
+ return nbytes;
+}
+
/* rdtgroup information files for one cache resource. */
-static struct rftype res_cache_info_files[] = {
+static struct rftype res_common_files[] = {
{
.name = "num_closids",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_num_closids_show,
+ .fflags = RF_CTRL_INFO,
+ },
+ {
+ .name = "mon_features",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_mon_features_show,
+ .fflags = RF_MON_INFO,
+ },
+ {
+ .name = "num_rmids",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_num_rmids_show,
+ .fflags = RF_MON_INFO,
},
{
.name = "cbm_mask",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_default_ctrl_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
},
{
.name = "min_cbm_bits",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_min_cbm_bits_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
},
-};
-
-/* rdtgroup information files for memory bandwidth. */
-static struct rftype res_mba_info_files[] = {
{
- .name = "num_closids",
+ .name = "shareable_bits",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_num_closids_show,
+ .seq_show = rdt_shareable_bits_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
},
{
.name = "min_bandwidth",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_min_bw_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
},
{
.name = "bandwidth_gran",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_bw_gran_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
},
{
.name = "delay_linear",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_delay_linear_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
+ },
+ {
+ .name = "max_threshold_occupancy",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = max_threshold_occ_write,
+ .seq_show = max_threshold_occ_show,
+ .fflags = RF_MON_INFO | RFTYPE_RES_CACHE,
+ },
+ {
+ .name = "cpus",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_cpus_write,
+ .seq_show = rdtgroup_cpus_show,
+ .fflags = RFTYPE_BASE,
+ },
+ {
+ .name = "cpus_list",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_cpus_write,
+ .seq_show = rdtgroup_cpus_show,
+ .flags = RFTYPE_FLAGS_CPUS_LIST,
+ .fflags = RFTYPE_BASE,
+ },
+ {
+ .name = "tasks",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_tasks_write,
+ .seq_show = rdtgroup_tasks_show,
+ .fflags = RFTYPE_BASE,
+ },
+ {
+ .name = "schemata",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_schemata_write,
+ .seq_show = rdtgroup_schemata_show,
+ .fflags = RF_CTRL_BASE,
},
};
-void rdt_get_mba_infofile(struct rdt_resource *r)
+static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
{
- r->info_files = res_mba_info_files;
- r->nr_info_files = ARRAY_SIZE(res_mba_info_files);
+ struct rftype *rfts, *rft;
+ int ret, len;
+
+ rfts = res_common_files;
+ len = ARRAY_SIZE(res_common_files);
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ for (rft = rfts; rft < rfts + len; rft++) {
+ if ((fflags & rft->fflags) == rft->fflags) {
+ ret = rdtgroup_add_file(kn, rft);
+ if (ret)
+ goto error;
+ }
+ }
+
+ return 0;
+error:
+ pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
+ while (--rft >= rfts) {
+ if ((fflags & rft->fflags) == rft->fflags)
+ kernfs_remove_by_name(kn, rft->name);
+ }
+ return ret;
}
-void rdt_get_cache_infofile(struct rdt_resource *r)
+static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
+ unsigned long fflags)
{
- r->info_files = res_cache_info_files;
- r->nr_info_files = ARRAY_SIZE(res_cache_info_files);
+ struct kernfs_node *kn_subdir;
+ int ret;
+
+ kn_subdir = kernfs_create_dir(kn_info, name,
+ kn_info->mode, r);
+ if (IS_ERR(kn_subdir))
+ return PTR_ERR(kn_subdir);
+
+ kernfs_get(kn_subdir);
+ ret = rdtgroup_kn_set_ugid(kn_subdir);
+ if (ret)
+ return ret;
+
+ ret = rdtgroup_add_files(kn_subdir, fflags);
+ if (!ret)
+ kernfs_activate(kn_subdir);
+
+ return ret;
}
static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
{
- struct kernfs_node *kn_subdir;
- struct rftype *res_info_files;
struct rdt_resource *r;
- int ret, len;
+ unsigned long fflags;
+ char name[32];
+ int ret;
/* create the directory */
kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
@@ -638,25 +855,19 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
return PTR_ERR(kn_info);
kernfs_get(kn_info);
- for_each_enabled_rdt_resource(r) {
- kn_subdir = kernfs_create_dir(kn_info, r->name,
- kn_info->mode, r);
- if (IS_ERR(kn_subdir)) {
- ret = PTR_ERR(kn_subdir);
- goto out_destroy;
- }
- kernfs_get(kn_subdir);
- ret = rdtgroup_kn_set_ugid(kn_subdir);
+ for_each_alloc_enabled_rdt_resource(r) {
+ fflags = r->fflags | RF_CTRL_INFO;
+ ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags);
if (ret)
goto out_destroy;
+ }
- res_info_files = r->info_files;
- len = r->nr_info_files;
-
- ret = rdtgroup_add_files(kn_subdir, res_info_files, len);
+ for_each_mon_enabled_rdt_resource(r) {
+ fflags = r->fflags | RF_MON_INFO;
+ sprintf(name, "%s_MON", r->name);
+ ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
if (ret)
goto out_destroy;
- kernfs_activate(kn_subdir);
}
/*
@@ -678,6 +889,39 @@ out_destroy:
return ret;
}
+static int
+mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
+ char *name, struct kernfs_node **dest_kn)
+{
+ struct kernfs_node *kn;
+ int ret;
+
+ /* create the directory */
+ kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ if (dest_kn)
+ *dest_kn = kn;
+
+ /*
+ * This extra ref will be put in kernfs_remove() and guarantees
+ * that @rdtgrp->kn is always accessible.
+ */
+ kernfs_get(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
+
+ kernfs_activate(kn);
+
+ return 0;
+
+out_destroy:
+ kernfs_remove(kn);
+ return ret;
+}
static void l3_qos_cfg_update(void *arg)
{
bool *enable = arg;
@@ -718,14 +962,15 @@ static int cdp_enable(void)
struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
int ret;
- if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable)
+ if (!r_l3->alloc_capable || !r_l3data->alloc_capable ||
+ !r_l3code->alloc_capable)
return -EINVAL;
ret = set_l3_qos_cfg(r_l3, true);
if (!ret) {
- r_l3->enabled = false;
- r_l3data->enabled = true;
- r_l3code->enabled = true;
+ r_l3->alloc_enabled = false;
+ r_l3data->alloc_enabled = true;
+ r_l3code->alloc_enabled = true;
}
return ret;
}
@@ -734,11 +979,11 @@ static void cdp_disable(void)
{
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
- r->enabled = r->capable;
+ r->alloc_enabled = r->alloc_capable;
- if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) {
- rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false;
- rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false;
+ if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) {
+ rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false;
+ rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false;
set_l3_qos_cfg(r, false);
}
}
@@ -823,10 +1068,16 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn)
}
}
+static int mkdir_mondata_all(struct kernfs_node *parent_kn,
+ struct rdtgroup *prgrp,
+ struct kernfs_node **mon_data_kn);
+
static struct dentry *rdt_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data)
{
+ struct rdt_domain *dom;
+ struct rdt_resource *r;
struct dentry *dentry;
int ret;
@@ -853,15 +1104,54 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
goto out_cdp;
}
+ if (rdt_mon_capable) {
+ ret = mongroup_create_dir(rdtgroup_default.kn,
+ NULL, "mon_groups",
+ &kn_mongrp);
+ if (ret) {
+ dentry = ERR_PTR(ret);
+ goto out_info;
+ }
+ kernfs_get(kn_mongrp);
+
+ ret = mkdir_mondata_all(rdtgroup_default.kn,
+ &rdtgroup_default, &kn_mondata);
+ if (ret) {
+ dentry = ERR_PTR(ret);
+ goto out_mongrp;
+ }
+ kernfs_get(kn_mondata);
+ rdtgroup_default.mon.mon_data_kn = kn_mondata;
+ }
+
dentry = kernfs_mount(fs_type, flags, rdt_root,
RDTGROUP_SUPER_MAGIC, NULL);
if (IS_ERR(dentry))
- goto out_destroy;
+ goto out_mondata;
+
+ if (rdt_alloc_capable)
+ static_branch_enable(&rdt_alloc_enable_key);
+ if (rdt_mon_capable)
+ static_branch_enable(&rdt_mon_enable_key);
+
+ if (rdt_alloc_capable || rdt_mon_capable)
+ static_branch_enable(&rdt_enable_key);
+
+ if (is_mbm_enabled()) {
+ r = &rdt_resources_all[RDT_RESOURCE_L3];
+ list_for_each_entry(dom, &r->domains, list)
+ mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL);
+ }
- static_branch_enable(&rdt_enable_key);
goto out;
-out_destroy:
+out_mondata:
+ if (rdt_mon_capable)
+ kernfs_remove(kn_mondata);
+out_mongrp:
+ if (rdt_mon_capable)
+ kernfs_remove(kn_mongrp);
+out_info:
kernfs_remove(kn_info);
out_cdp:
cdp_disable();
@@ -909,6 +1199,18 @@ static int reset_all_ctrls(struct rdt_resource *r)
return 0;
}
+static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
+{
+ return (rdt_alloc_capable &&
+ (r->type == RDTCTRL_GROUP) && (t->closid == r->closid));
+}
+
+static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
+{
+ return (rdt_mon_capable &&
+ (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid));
+}
+
/*
* Move tasks from one to the other group. If @from is NULL, then all tasks
* in the systems are moved unconditionally (used for teardown).
@@ -924,8 +1226,11 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
read_lock(&tasklist_lock);
for_each_process_thread(p, t) {
- if (!from || t->closid == from->closid) {
+ if (!from || is_closid_match(t, from) ||
+ is_rmid_match(t, from)) {
t->closid = to->closid;
+ t->rmid = to->mon.rmid;
+
#ifdef CONFIG_SMP
/*
* This is safe on x86 w/o barriers as the ordering
@@ -944,6 +1249,19 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
read_unlock(&tasklist_lock);
}
+static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
+{
+ struct rdtgroup *sentry, *stmp;
+ struct list_head *head;
+
+ head = &rdtgrp->mon.crdtgrp_list;
+ list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
+ free_rmid(sentry->mon.rmid);
+ list_del(&sentry->mon.crdtgrp_list);
+ kfree(sentry);
+ }
+}
+
/*
* Forcibly remove all of subdirectories under root.
*/
@@ -955,6 +1273,9 @@ static void rmdir_all_sub(void)
rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
+ /* Free any child rmids */
+ free_all_child_rdtgrp(rdtgrp);
+
/* Remove each rdtgroup other than root */
if (rdtgrp == &rdtgroup_default)
continue;
@@ -967,16 +1288,20 @@ static void rmdir_all_sub(void)
cpumask_or(&rdtgroup_default.cpu_mask,
&rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+ free_rmid(rdtgrp->mon.rmid);
+
kernfs_remove(rdtgrp->kn);
list_del(&rdtgrp->rdtgroup_list);
kfree(rdtgrp);
}
/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
get_online_cpus();
- rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid);
+ update_closid_rmid(cpu_online_mask, &rdtgroup_default);
put_online_cpus();
kernfs_remove(kn_info);
+ kernfs_remove(kn_mongrp);
+ kernfs_remove(kn_mondata);
}
static void rdt_kill_sb(struct super_block *sb)
@@ -986,10 +1311,12 @@ static void rdt_kill_sb(struct super_block *sb)
mutex_lock(&rdtgroup_mutex);
/*Put everything back to default values. */
- for_each_enabled_rdt_resource(r)
+ for_each_alloc_enabled_rdt_resource(r)
reset_all_ctrls(r);
cdp_disable();
rmdir_all_sub();
+ static_branch_disable(&rdt_alloc_enable_key);
+ static_branch_disable(&rdt_mon_enable_key);
static_branch_disable(&rdt_enable_key);
kernfs_kill_sb(sb);
mutex_unlock(&rdtgroup_mutex);
@@ -1001,46 +1328,223 @@ static struct file_system_type rdt_fs_type = {
.kill_sb = rdt_kill_sb,
};
-static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
- umode_t mode)
+static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
+ void *priv)
{
- struct rdtgroup *parent, *rdtgrp;
struct kernfs_node *kn;
- int ret, closid;
+ int ret = 0;
- /* Only allow mkdir in the root directory */
- if (parent_kn != rdtgroup_default.kn)
- return -EPERM;
+ kn = __kernfs_create_file(parent_kn, name, 0444, 0,
+ &kf_mondata_ops, priv, NULL, NULL);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
- /* Do not accept '\n' to avoid unparsable situation. */
- if (strchr(name, '\n'))
- return -EINVAL;
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret) {
+ kernfs_remove(kn);
+ return ret;
+ }
- parent = rdtgroup_kn_lock_live(parent_kn);
- if (!parent) {
- ret = -ENODEV;
- goto out_unlock;
+ return ret;
+}
+
+/*
+ * Remove all subdirectories of mon_data of ctrl_mon groups
+ * and monitor groups with given domain id.
+ */
+void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id)
+{
+ struct rdtgroup *prgrp, *crgrp;
+ char name[32];
+
+ if (!r->mon_enabled)
+ return;
+
+ list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+ sprintf(name, "mon_%s_%02d", r->name, dom_id);
+ kernfs_remove_by_name(prgrp->mon.mon_data_kn, name);
+
+ list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
+ kernfs_remove_by_name(crgrp->mon.mon_data_kn, name);
}
+}
- ret = closid_alloc();
- if (ret < 0)
+static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
+ struct rdt_domain *d,
+ struct rdt_resource *r, struct rdtgroup *prgrp)
+{
+ union mon_data_bits priv;
+ struct kernfs_node *kn;
+ struct mon_evt *mevt;
+ struct rmid_read rr;
+ char name[32];
+ int ret;
+
+ sprintf(name, "mon_%s_%02d", r->name, d->id);
+ /* create the directory */
+ kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ /*
+ * This extra ref will be put in kernfs_remove() and guarantees
+ * that kn is always accessible.
+ */
+ kernfs_get(kn);
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
+
+ if (WARN_ON(list_empty(&r->evt_list))) {
+ ret = -EPERM;
+ goto out_destroy;
+ }
+
+ priv.u.rid = r->rid;
+ priv.u.domid = d->id;
+ list_for_each_entry(mevt, &r->evt_list, list) {
+ priv.u.evtid = mevt->evtid;
+ ret = mon_addfile(kn, mevt->name, priv.priv);
+ if (ret)
+ goto out_destroy;
+
+ if (is_mbm_event(mevt->evtid))
+ mon_event_read(&rr, d, prgrp, mevt->evtid, true);
+ }
+ kernfs_activate(kn);
+ return 0;
+
+out_destroy:
+ kernfs_remove(kn);
+ return ret;
+}
+
+/*
+ * Add all subdirectories of mon_data for "ctrl_mon" groups
+ * and "monitor" groups with given domain id.
+ */
+void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+ struct rdt_domain *d)
+{
+ struct kernfs_node *parent_kn;
+ struct rdtgroup *prgrp, *crgrp;
+ struct list_head *head;
+
+ if (!r->mon_enabled)
+ return;
+
+ list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+ parent_kn = prgrp->mon.mon_data_kn;
+ mkdir_mondata_subdir(parent_kn, d, r, prgrp);
+
+ head = &prgrp->mon.crdtgrp_list;
+ list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
+ parent_kn = crgrp->mon.mon_data_kn;
+ mkdir_mondata_subdir(parent_kn, d, r, crgrp);
+ }
+ }
+}
+
+static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
+ struct rdt_resource *r,
+ struct rdtgroup *prgrp)
+{
+ struct rdt_domain *dom;
+ int ret;
+
+ list_for_each_entry(dom, &r->domains, list) {
+ ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * This creates a directory mon_data which contains the monitored data.
+ *
+ * mon_data has one directory for each domain whic are named
+ * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data
+ * with L3 domain looks as below:
+ * ./mon_data:
+ * mon_L3_00
+ * mon_L3_01
+ * mon_L3_02
+ * ...
+ *
+ * Each domain directory has one file per event:
+ * ./mon_L3_00/:
+ * llc_occupancy
+ *
+ */
+static int mkdir_mondata_all(struct kernfs_node *parent_kn,
+ struct rdtgroup *prgrp,
+ struct kernfs_node **dest_kn)
+{
+ struct rdt_resource *r;
+ struct kernfs_node *kn;
+ int ret;
+
+ /*
+ * Create the mon_data directory first.
+ */
+ ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn);
+ if (ret)
+ return ret;
+
+ if (dest_kn)
+ *dest_kn = kn;
+
+ /*
+ * Create the subdirectories for each domain. Note that all events
+ * in a domain like L3 are grouped into a resource whose domain is L3
+ */
+ for_each_mon_enabled_rdt_resource(r) {
+ ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
+ if (ret)
+ goto out_destroy;
+ }
+
+ return 0;
+
+out_destroy:
+ kernfs_remove(kn);
+ return ret;
+}
+
+static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
+ struct kernfs_node *prgrp_kn,
+ const char *name, umode_t mode,
+ enum rdt_group_type rtype, struct rdtgroup **r)
+{
+ struct rdtgroup *prdtgrp, *rdtgrp;
+ struct kernfs_node *kn;
+ uint files = 0;
+ int ret;
+
+ prdtgrp = rdtgroup_kn_lock_live(prgrp_kn);
+ if (!prdtgrp) {
+ ret = -ENODEV;
goto out_unlock;
- closid = ret;
+ }
/* allocate the rdtgroup. */
rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
if (!rdtgrp) {
ret = -ENOSPC;
- goto out_closid_free;
+ goto out_unlock;
}
- rdtgrp->closid = closid;
- list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
+ *r = rdtgrp;
+ rdtgrp->mon.parent = prdtgrp;
+ rdtgrp->type = rtype;
+ INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
/* kernfs creates the directory for rdtgrp */
- kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp);
+ kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
if (IS_ERR(kn)) {
ret = PTR_ERR(kn);
- goto out_cancel_ref;
+ goto out_free_rgrp;
}
rdtgrp->kn = kn;
@@ -1056,43 +1560,211 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
if (ret)
goto out_destroy;
- ret = rdtgroup_add_files(kn, rdtgroup_base_files,
- ARRAY_SIZE(rdtgroup_base_files));
+ files = RFTYPE_BASE | RFTYPE_CTRL;
+ files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype);
+ ret = rdtgroup_add_files(kn, files);
if (ret)
goto out_destroy;
+ if (rdt_mon_capable) {
+ ret = alloc_rmid();
+ if (ret < 0)
+ goto out_destroy;
+ rdtgrp->mon.rmid = ret;
+
+ ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
+ if (ret)
+ goto out_idfree;
+ }
kernfs_activate(kn);
- ret = 0;
- goto out_unlock;
+ /*
+ * The caller unlocks the prgrp_kn upon success.
+ */
+ return 0;
+out_idfree:
+ free_rmid(rdtgrp->mon.rmid);
out_destroy:
kernfs_remove(rdtgrp->kn);
-out_cancel_ref:
- list_del(&rdtgrp->rdtgroup_list);
+out_free_rgrp:
kfree(rdtgrp);
-out_closid_free:
- closid_free(closid);
out_unlock:
- rdtgroup_kn_unlock(parent_kn);
+ rdtgroup_kn_unlock(prgrp_kn);
return ret;
}
-static int rdtgroup_rmdir(struct kernfs_node *kn)
+static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
+{
+ kernfs_remove(rgrp->kn);
+ free_rmid(rgrp->mon.rmid);
+ kfree(rgrp);
+}
+
+/*
+ * Create a monitor group under "mon_groups" directory of a control
+ * and monitor group(ctrl_mon). This is a resource group
+ * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
+ */
+static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
+ struct kernfs_node *prgrp_kn,
+ const char *name,
+ umode_t mode)
+{
+ struct rdtgroup *rdtgrp, *prgrp;
+ int ret;
+
+ ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP,
+ &rdtgrp);
+ if (ret)
+ return ret;
+
+ prgrp = rdtgrp->mon.parent;
+ rdtgrp->closid = prgrp->closid;
+
+ /*
+ * Add the rdtgrp to the list of rdtgrps the parent
+ * ctrl_mon group has to track.
+ */
+ list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
+
+ rdtgroup_kn_unlock(prgrp_kn);
+ return ret;
+}
+
+/*
+ * These are rdtgroups created under the root directory. Can be used
+ * to allocate and monitor resources.
+ */
+static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
+ struct kernfs_node *prgrp_kn,
+ const char *name, umode_t mode)
{
- int ret, cpu, closid = rdtgroup_default.closid;
struct rdtgroup *rdtgrp;
- cpumask_var_t tmpmask;
+ struct kernfs_node *kn;
+ u32 closid;
+ int ret;
- if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
- return -ENOMEM;
+ ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP,
+ &rdtgrp);
+ if (ret)
+ return ret;
- rdtgrp = rdtgroup_kn_lock_live(kn);
- if (!rdtgrp) {
- ret = -EPERM;
- goto out;
+ kn = rdtgrp->kn;
+ ret = closid_alloc();
+ if (ret < 0)
+ goto out_common_fail;
+ closid = ret;
+
+ rdtgrp->closid = closid;
+ list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
+
+ if (rdt_mon_capable) {
+ /*
+ * Create an empty mon_groups directory to hold the subset
+ * of tasks and cpus to monitor.
+ */
+ ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL);
+ if (ret)
+ goto out_id_free;
}
+ goto out_unlock;
+
+out_id_free:
+ closid_free(closid);
+ list_del(&rdtgrp->rdtgroup_list);
+out_common_fail:
+ mkdir_rdt_prepare_clean(rdtgrp);
+out_unlock:
+ rdtgroup_kn_unlock(prgrp_kn);
+ return ret;
+}
+
+/*
+ * We allow creating mon groups only with in a directory called "mon_groups"
+ * which is present in every ctrl_mon group. Check if this is a valid
+ * "mon_groups" directory.
+ *
+ * 1. The directory should be named "mon_groups".
+ * 2. The mon group itself should "not" be named "mon_groups".
+ * This makes sure "mon_groups" directory always has a ctrl_mon group
+ * as parent.
+ */
+static bool is_mon_groups(struct kernfs_node *kn, const char *name)
+{
+ return (!strcmp(kn->name, "mon_groups") &&
+ strcmp(name, "mon_groups"));
+}
+
+static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+ umode_t mode)
+{
+ /* Do not accept '\n' to avoid unparsable situation. */
+ if (strchr(name, '\n'))
+ return -EINVAL;
+
+ /*
+ * If the parent directory is the root directory and RDT
+ * allocation is supported, add a control and monitoring
+ * subdirectory
+ */
+ if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
+ return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode);
+
+ /*
+ * If RDT monitoring is supported and the parent directory is a valid
+ * "mon_groups" directory, add a monitoring subdirectory.
+ */
+ if (rdt_mon_capable && is_mon_groups(parent_kn, name))
+ return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode);
+
+ return -EPERM;
+}
+
+static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
+ cpumask_var_t tmpmask)
+{
+ struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
+ int cpu;
+
+ /* Give any tasks back to the parent group */
+ rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
+
+ /* Update per cpu rmid of the moved CPUs first */
+ for_each_cpu(cpu, &rdtgrp->cpu_mask)
+ per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid;
+ /*
+ * Update the MSR on moved CPUs and CPUs which have moved
+ * task running on them.
+ */
+ cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
+ update_closid_rmid(tmpmask, NULL);
+
+ rdtgrp->flags = RDT_DELETED;
+ free_rmid(rdtgrp->mon.rmid);
+
+ /*
+ * Remove the rdtgrp from the parent ctrl_mon group's list
+ */
+ WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
+ list_del(&rdtgrp->mon.crdtgrp_list);
+
+ /*
+ * one extra hold on this, will drop when we kfree(rdtgrp)
+ * in rdtgroup_kn_unlock()
+ */
+ kernfs_get(kn);
+ kernfs_remove(rdtgrp->kn);
+
+ return 0;
+}
+
+static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
+ cpumask_var_t tmpmask)
+{
+ int cpu;
+
/* Give any tasks back to the default group */
rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
@@ -1100,18 +1772,28 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
cpumask_or(&rdtgroup_default.cpu_mask,
&rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
- /* Update per cpu closid of the moved CPUs first */
- for_each_cpu(cpu, &rdtgrp->cpu_mask)
- per_cpu(cpu_closid, cpu) = closid;
+ /* Update per cpu closid and rmid of the moved CPUs first */
+ for_each_cpu(cpu, &rdtgrp->cpu_mask) {
+ per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid;
+ per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid;
+ }
+
/*
* Update the MSR on moved CPUs and CPUs which have moved
* task running on them.
*/
cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
- rdt_update_closid(tmpmask, NULL);
+ update_closid_rmid(tmpmask, NULL);
rdtgrp->flags = RDT_DELETED;
closid_free(rdtgrp->closid);
+ free_rmid(rdtgrp->mon.rmid);
+
+ /*
+ * Free all the child monitor group rmids.
+ */
+ free_all_child_rdtgrp(rdtgrp);
+
list_del(&rdtgrp->rdtgroup_list);
/*
@@ -1120,7 +1802,41 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
*/
kernfs_get(kn);
kernfs_remove(rdtgrp->kn);
- ret = 0;
+
+ return 0;
+}
+
+static int rdtgroup_rmdir(struct kernfs_node *kn)
+{
+ struct kernfs_node *parent_kn = kn->parent;
+ struct rdtgroup *rdtgrp;
+ cpumask_var_t tmpmask;
+ int ret = 0;
+
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ rdtgrp = rdtgroup_kn_lock_live(kn);
+ if (!rdtgrp) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ /*
+ * If the rdtgroup is a ctrl_mon group and parent directory
+ * is the root directory, remove the ctrl_mon group.
+ *
+ * If the rdtgroup is a mon group and parent directory
+ * is a valid "mon_groups" directory, remove the mon group.
+ */
+ if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn)
+ ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask);
+ else if (rdtgrp->type == RDTMON_GROUP &&
+ is_mon_groups(parent_kn, kn->name))
+ ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask);
+ else
+ ret = -EPERM;
+
out:
rdtgroup_kn_unlock(kn);
free_cpumask_var(tmpmask);
@@ -1129,7 +1845,7 @@ out:
static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
{
- if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled)
+ if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
seq_puts(seq, ",cdp");
return 0;
}
@@ -1153,10 +1869,13 @@ static int __init rdtgroup_setup_root(void)
mutex_lock(&rdtgroup_mutex);
rdtgroup_default.closid = 0;
+ rdtgroup_default.mon.rmid = 0;
+ rdtgroup_default.type = RDTCTRL_GROUP;
+ INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
+
list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
- ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files,
- ARRAY_SIZE(rdtgroup_base_files));
+ ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE);
if (ret) {
kernfs_destroy_root(rdt_root);
goto out;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 6dde0497efc7..3b413065c613 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -51,6 +51,7 @@
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/reboot.h>
+#include <asm/set_memory.h>
#include "mce-internal.h"
@@ -1051,6 +1052,48 @@ static int do_memory_failure(struct mce *m)
return ret;
}
+#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE)
+
+void arch_unmap_kpfn(unsigned long pfn)
+{
+ unsigned long decoy_addr;
+
+ /*
+ * Unmap this page from the kernel 1:1 mappings to make sure
+ * we don't log more errors because of speculative access to
+ * the page.
+ * We would like to just call:
+ * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
+ * but doing that would radically increase the odds of a
+ * speculative access to the posion page because we'd have
+ * the virtual address of the kernel 1:1 mapping sitting
+ * around in registers.
+ * Instead we get tricky. We create a non-canonical address
+ * that looks just like the one we want, but has bit 63 flipped.
+ * This relies on set_memory_np() not checking whether we passed
+ * a legal address.
+ */
+
+/*
+ * Build time check to see if we have a spare virtual bit. Don't want
+ * to leave this until run time because most developers don't have a
+ * system that can exercise this code path. This will only become a
+ * problem if/when we move beyond 5-level page tables.
+ *
+ * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD)
+ */
+#if PGDIR_SHIFT + 9 < 63
+ decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
+#else
+#error "no unused virtual bit available"
+#endif
+
+ if (set_memory_np(decoy_addr, 1))
+ pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
+
+}
+#endif
+
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 9e314bcf67cc..40e28ed77fbf 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -201,8 +201,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
wrmsr(smca_config, low, high);
}
- /* Collect bank_info using CPU 0 for now. */
- if (cpu)
+ /* Return early if this bank was already initialized. */
+ if (smca_banks[bank].hwid)
return;
if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
@@ -216,11 +216,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
s_hwid = &smca_hwid_mcatypes[i];
if (hwid_mcatype == s_hwid->hwid_mcatype) {
-
- WARN(smca_banks[bank].hwid,
- "Bank %s already initialized!\n",
- smca_get_name(s_hwid->bank_type));
-
smca_banks[bank].hwid = s_hwid;
smca_banks[bank].id = low;
smca_banks[bank].sysfs_id = s_hwid->count++;
@@ -776,24 +771,12 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
mce_log(&m);
}
-static inline void __smp_deferred_error_interrupt(void)
-{
- inc_irq_stat(irq_deferred_error_count);
- deferred_error_int_vector();
-}
-
asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void)
{
entering_irq();
- __smp_deferred_error_interrupt();
- exiting_ack_irq();
-}
-
-asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void)
-{
- entering_irq();
trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
- __smp_deferred_error_interrupt();
+ inc_irq_stat(irq_deferred_error_count);
+ deferred_error_int_vector();
trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
exiting_ack_irq();
}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index d7cc190ae457..2da67b70ba98 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -122,7 +122,7 @@ static struct attribute *thermal_throttle_attrs[] = {
NULL
};
-static struct attribute_group thermal_attr_group = {
+static const struct attribute_group thermal_attr_group = {
.attrs = thermal_throttle_attrs,
.name = "thermal_throttle"
};
@@ -390,26 +390,12 @@ static void unexpected_thermal_interrupt(void)
static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
-static inline void __smp_thermal_interrupt(void)
-{
- inc_irq_stat(irq_thermal_count);
- smp_thermal_vector();
-}
-
-asmlinkage __visible void __irq_entry
-smp_thermal_interrupt(struct pt_regs *regs)
-{
- entering_irq();
- __smp_thermal_interrupt();
- exiting_ack_irq();
-}
-
-asmlinkage __visible void __irq_entry
-smp_trace_thermal_interrupt(struct pt_regs *regs)
+asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *r)
{
entering_irq();
trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
- __smp_thermal_interrupt();
+ inc_irq_stat(irq_thermal_count);
+ smp_thermal_vector();
trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
exiting_ack_irq();
}
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index bb0e75eed10a..5e7249e42f8f 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -17,24 +17,12 @@ static void default_threshold_interrupt(void)
void (*mce_threshold_vector)(void) = default_threshold_interrupt;
-static inline void __smp_threshold_interrupt(void)
-{
- inc_irq_stat(irq_threshold_count);
- mce_threshold_vector();
-}
-
asmlinkage __visible void __irq_entry smp_threshold_interrupt(void)
{
entering_irq();
- __smp_threshold_interrupt();
- exiting_ack_irq();
-}
-
-asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void)
-{
- entering_irq();
trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
- __smp_threshold_interrupt();
+ inc_irq_stat(irq_threshold_count);
+ mce_threshold_vector();
trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
exiting_ack_irq();
}
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 21b185793c80..c6daec4bdba5 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -400,9 +400,12 @@ static void update_cache(struct ucode_patch *new_patch)
list_for_each_entry(p, &microcode_cache, plist) {
if (p->equiv_cpu == new_patch->equiv_cpu) {
- if (p->patch_id >= new_patch->patch_id)
+ if (p->patch_id >= new_patch->patch_id) {
/* we already have the latest patch */
+ kfree(new_patch->data);
+ kfree(new_patch);
return;
+ }
list_replace(&p->plist, &new_patch->plist);
kfree(p->data);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 9cb98ee103db..86e8f0b2537b 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -561,7 +561,7 @@ static struct attribute *mc_default_attrs[] = {
NULL
};
-static struct attribute_group mc_attr_group = {
+static const struct attribute_group mc_attr_group = {
.attrs = mc_default_attrs,
.name = "microcode",
};
@@ -707,7 +707,7 @@ static struct attribute *cpu_root_microcode_attrs[] = {
NULL
};
-static struct attribute_group cpu_root_microcode_group = {
+static const struct attribute_group cpu_root_microcode_group = {
.name = "microcode",
.attrs = cpu_root_microcode_attrs,
};
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 59edbe9d4ccb..8f7a9bbad514 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -146,18 +146,18 @@ static bool microcode_matches(struct microcode_header_intel *mc_header,
return false;
}
-static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size)
+static struct ucode_patch *memdup_patch(void *data, unsigned int size)
{
struct ucode_patch *p;
p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL);
if (!p)
- return ERR_PTR(-ENOMEM);
+ return NULL;
p->data = kmemdup(data, size, GFP_KERNEL);
if (!p->data) {
kfree(p);
- return ERR_PTR(-ENOMEM);
+ return NULL;
}
return p;
@@ -183,8 +183,8 @@ static void save_microcode_patch(void *data, unsigned int size)
if (mc_hdr->rev <= mc_saved_hdr->rev)
continue;
- p = __alloc_microcode_buf(data, size);
- if (IS_ERR(p))
+ p = memdup_patch(data, size);
+ if (!p)
pr_err("Error allocating buffer %p\n", data);
else
list_replace(&iter->plist, &p->plist);
@@ -196,24 +196,25 @@ static void save_microcode_patch(void *data, unsigned int size)
* newly found.
*/
if (!prev_found) {
- p = __alloc_microcode_buf(data, size);
- if (IS_ERR(p))
+ p = memdup_patch(data, size);
+ if (!p)
pr_err("Error allocating buffer for %p\n", data);
else
list_add_tail(&p->plist, &microcode_cache);
}
+ if (!p)
+ return;
+
/*
* Save for early loading. On 32-bit, that needs to be a physical
* address as the APs are running from physical addresses, before
* paging has been enabled.
*/
- if (p) {
- if (IS_ENABLED(CONFIG_X86_32))
- intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data);
- else
- intel_ucode_patch = p->data;
- }
+ if (IS_ENABLED(CONFIG_X86_32))
+ intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data);
+ else
+ intel_ucode_patch = p->data;
}
static int microcode_sanity_check(void *mc, int print_err)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 70e717fccdd6..9fc32651c911 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -59,13 +59,8 @@ void hyperv_vector_handler(struct pt_regs *regs)
void hv_setup_vmbus_irq(void (*handler)(void))
{
vmbus_handler = handler;
- /*
- * Setup the IDT for hypervisor callback. Prevent reallocation
- * at module reload.
- */
- if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors))
- alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
- hyperv_callback_vector);
+ /* Setup the IDT for hypervisor callback */
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
}
void hv_remove_vmbus_irq(void)
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index c5bb63be4ba1..40d5a8a75212 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -237,6 +237,18 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
}
+static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type)
+{
+ struct set_mtrr_data data = { .smp_reg = reg,
+ .smp_base = base,
+ .smp_size = size,
+ .smp_type = type
+ };
+
+ stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask);
+}
+
static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type)
{
@@ -370,7 +382,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
/* Search for an empty MTRR */
i = mtrr_if->get_free_region(base, size, replace);
if (i >= 0) {
- set_mtrr(i, base, size, type);
+ set_mtrr_cpuslocked(i, base, size, type);
if (likely(replace < 0)) {
mtrr_usage_table[i] = 1;
} else {
@@ -378,7 +390,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
if (increment)
mtrr_usage_table[i]++;
if (unlikely(replace != i)) {
- set_mtrr(replace, 0, 0, 0);
+ set_mtrr_cpuslocked(replace, 0, 0, 0);
mtrr_usage_table[replace] = 0;
}
}
@@ -506,7 +518,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
goto out;
}
if (--mtrr_usage_table[reg] < 1)
- set_mtrr(reg, 0, 0, 0);
+ set_mtrr_cpuslocked(reg, 0, 0, 0);
error = reg;
out:
mutex_unlock(&mtrr_mutex);
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 23c23508c012..05459ad3db46 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -31,6 +31,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
+ { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index dbce3cca94cb..f13b4c00a5de 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -94,6 +94,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (stack_name)
printk("%s <%s>\n", log_lvl, stack_name);
+ if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
+ __show_regs(regs, 0);
+
/*
* Scan the stack, printing any text addresses we find. At the
* same time, follow proper stack frames with the unwinder.
@@ -118,10 +121,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* Don't print regs->ip again if it was already printed
* by __show_regs() below.
*/
- if (regs && stack == &regs->ip) {
- unwind_next_frame(&state);
- continue;
- }
+ if (regs && stack == &regs->ip)
+ goto next;
if (stack == ret_addr_p)
reliable = 1;
@@ -144,6 +145,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (!reliable)
continue;
+next:
/*
* Get the next frame from the unwinder. No need to
* check for an error: if anything goes wrong, the rest
@@ -153,7 +155,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
/* if the frame has entry regs, print them */
regs = unwind_get_entry_regs(&state);
- if (regs)
+ if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
__show_regs(regs, 0);
}
@@ -265,7 +267,7 @@ int __die(const char *str, struct pt_regs *regs, long err)
#ifdef CONFIG_X86_32
if (user_mode(regs)) {
sp = regs->sp;
- ss = regs->ss & 0xffff;
+ ss = regs->ss;
} else {
sp = kernel_stack_pointer(regs);
savesegment(ss, ss);
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index e5f0b40e66d2..4f0481474903 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -37,7 +37,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
* This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty.
*/
- if (stack < begin || stack > end)
+ if (stack <= begin || stack > end)
return false;
info->type = STACK_TYPE_IRQ;
@@ -62,7 +62,7 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
* This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty.
*/
- if (stack < begin || stack > end)
+ if (stack <= begin || stack > end)
return false;
info->type = STACK_TYPE_SOFTIRQ;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 3e1471d57487..225af4184f06 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -55,7 +55,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
begin = end - (exception_stack_sizes[k] / sizeof(long));
regs = (struct pt_regs *)end - 1;
- if (stack < begin || stack >= end)
+ if (stack <= begin || stack >= end)
continue;
info->type = STACK_TYPE_EXCEPTION + k;
@@ -78,7 +78,7 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
* This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty.
*/
- if (stack < begin || stack > end)
+ if (stack <= begin || stack > end)
return false;
info->type = STACK_TYPE_IRQ;
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 532da61d605c..71c11ad5643e 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -96,7 +96,8 @@ EXPORT_SYMBOL_GPL(e820__mapped_any);
* Note: this function only works correctly once the E820 table is sorted and
* not-overlapping (at least for the range specified), which is the case normally.
*/
-bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
+static struct e820_entry *__e820__mapped_all(u64 start, u64 end,
+ enum e820_type type)
{
int i;
@@ -122,9 +123,28 @@ bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
* coverage of the desired range exists:
*/
if (start >= end)
- return 1;
+ return entry;
}
- return 0;
+
+ return NULL;
+}
+
+/*
+ * This function checks if the entire range <start,end> is mapped with type.
+ */
+bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
+{
+ return __e820__mapped_all(start, end, type);
+}
+
+/*
+ * This function returns the type associated with the range <start,end>.
+ */
+int e820__get_entry_type(u64 start, u64 end)
+{
+ struct e820_entry *entry = __e820__mapped_all(start, end, 0);
+
+ return entry ? entry->type : -EINVAL;
}
/*
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index d907c3d8633f..a4516ca4c4f3 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -527,6 +527,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = {
INTEL_BXT_IDS(&gen9_early_ops),
INTEL_KBL_IDS(&gen9_early_ops),
INTEL_GLK_IDS(&gen9_early_ops),
+ INTEL_CNL_IDS(&gen9_early_ops),
};
static void __init
diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c
new file mode 100644
index 000000000000..f260e452e4f8
--- /dev/null
+++ b/arch/x86/kernel/eisa.c
@@ -0,0 +1,19 @@
+/*
+ * EISA specific code
+ *
+ * This file is licensed under the GPL V2
+ */
+#include <linux/ioport.h>
+#include <linux/eisa.h>
+#include <linux/io.h>
+
+static __init int eisa_bus_probe(void)
+{
+ void __iomem *p = ioremap(0x0FFFD9, 4);
+
+ if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
+ EISA_bus = 1;
+ iounmap(p);
+ return 0;
+}
+subsys_initcall(eisa_bus_probe);
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 6b91e2eb8d3f..9c4e7ba6870c 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -195,7 +195,7 @@ void init_espfix_ap(int cpu)
pte_p = pte_offset_kernel(&pmd, addr);
stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0));
- pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
+ pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask));
for (n = 0; n < ESPFIX_PTE_CLONES; n++)
set_pte(&pte_p[n*PTE_STRIDE], pte);
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 538ec012b371..cf2ce063f65a 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -10,6 +10,7 @@
#include <linux/mm.h>
#include <linux/memblock.h>
+#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/e820/api.h>
@@ -30,6 +31,9 @@ static void __init i386_default_early_setup(void)
asmlinkage __visible void __init i386_start_kernel(void)
{
cr4_init_shadow();
+
+ idt_setup_early_handler();
+
sanitize_boot_params(&boot_params);
x86_early_init_platform_quirks();
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 46c3c73e7f43..bab4fa579450 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -14,6 +14,7 @@
#include <linux/start_kernel.h>
#include <linux/io.h>
#include <linux/memblock.h>
+#include <linux/mem_encrypt.h>
#include <asm/processor.h>
#include <asm/proto.h>
@@ -33,7 +34,6 @@
/*
* Manage page tables very early on.
*/
-extern pgd_t early_top_pgt[PTRS_PER_PGD];
extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
static unsigned int __initdata next_early_pgt;
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
@@ -45,14 +45,17 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
return ptr - (void *)_text + (void *)physaddr;
}
-void __head __startup_64(unsigned long physaddr)
+unsigned long __head __startup_64(unsigned long physaddr,
+ struct boot_params *bp)
{
unsigned long load_delta, *p;
+ unsigned long pgtable_flags;
pgdval_t *pgd;
p4dval_t *p4d;
pudval_t *pud;
pmdval_t *pmd, pmd_entry;
int i;
+ unsigned int *next_pgt_ptr;
/* Is the address too large? */
if (physaddr >> MAX_PHYSMEM_BITS)
@@ -68,6 +71,12 @@ void __head __startup_64(unsigned long physaddr)
if (load_delta & ~PMD_PAGE_MASK)
for (;;);
+ /* Activate Secure Memory Encryption (SME) if supported and enabled */
+ sme_enable(bp);
+
+ /* Include the SME encryption mask in the fixup value */
+ load_delta += sme_get_me_mask();
+
/* Fixup the physical addresses in the page table */
pgd = fixup_pointer(&early_top_pgt, physaddr);
@@ -92,30 +101,34 @@ void __head __startup_64(unsigned long physaddr)
* it avoids problems around wraparound.
*/
- pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
- pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+ next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr);
+ pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
+ pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
+
+ pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
- pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE;
- pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE;
+ pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
+ pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
- p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
- p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+ p4d[i + 0] = (pgdval_t)pud + pgtable_flags;
+ p4d[i + 1] = (pgdval_t)pud + pgtable_flags;
} else {
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
- pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
- pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+ pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
+ pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
}
i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
- pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE;
- pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE;
+ pud[i + 0] = (pudval_t)pmd + pgtable_flags;
+ pud[i + 1] = (pudval_t)pmd + pgtable_flags;
pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
+ pmd_entry += sme_get_me_mask();
pmd_entry += physaddr;
for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
@@ -136,9 +149,30 @@ void __head __startup_64(unsigned long physaddr)
pmd[i] += load_delta;
}
- /* Fixup phys_base */
+ /*
+ * Fixup phys_base - remove the memory encryption mask to obtain
+ * the true physical address.
+ */
p = fixup_pointer(&phys_base, physaddr);
- *p += load_delta;
+ *p += load_delta - sme_get_me_mask();
+
+ /* Encrypt the kernel (if SME is active) */
+ sme_encrypt_kernel();
+
+ /*
+ * Return the SME encryption mask (if SME is active) to be used as a
+ * modifier for the initial pgdir entry programmed into CR3.
+ */
+ return sme_get_me_mask();
+}
+
+unsigned long __startup_secondary_64(void)
+{
+ /*
+ * Return the SME encryption mask (if SME is active) to be used as a
+ * modifier for the initial pgdir entry programmed into CR3.
+ */
+ return sme_get_me_mask();
}
/* Wipe all early page tables except for the kernel symbol map */
@@ -146,17 +180,17 @@ static void __init reset_early_page_tables(void)
{
memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
next_early_pgt = 0;
- write_cr3(__pa_nodebug(early_top_pgt));
+ write_cr3(__sme_pa_nodebug(early_top_pgt));
}
/* Create a new PMD entry */
-int __init early_make_pgtable(unsigned long address)
+int __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
{
unsigned long physaddr = address - __PAGE_OFFSET;
pgdval_t pgd, *pgd_p;
p4dval_t p4d, *p4d_p;
pudval_t pud, *pud_p;
- pmdval_t pmd, *pmd_p;
+ pmdval_t *pmd_p;
/* Invalid address or early pgt is done ? */
if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
@@ -215,12 +249,21 @@ again:
memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
*pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
}
- pmd = (physaddr & PMD_MASK) + early_pmd_flags;
pmd_p[pmd_index(address)] = pmd;
return 0;
}
+int __init early_make_pgtable(unsigned long address)
+{
+ unsigned long physaddr = address - __PAGE_OFFSET;
+ pmdval_t pmd;
+
+ pmd = (physaddr & PMD_MASK) + early_pmd_flags;
+
+ return __early_make_pgtable(address, pmd);
+}
+
/* Don't add a printk in there. printk relies on the PDA which is not initialized
yet. */
static void __init clear_bss(void)
@@ -243,6 +286,12 @@ static void __init copy_bootdata(char *real_mode_data)
char * command_line;
unsigned long cmd_line_ptr;
+ /*
+ * If SME is active, this will create decrypted mappings of the
+ * boot data in advance of the copy operations.
+ */
+ sme_map_bootdata(real_mode_data);
+
memcpy(&boot_params, real_mode_data, sizeof boot_params);
sanitize_boot_params(&boot_params);
cmd_line_ptr = get_cmd_line_ptr();
@@ -250,12 +299,18 @@ static void __init copy_bootdata(char *real_mode_data)
command_line = __va(cmd_line_ptr);
memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
}
+
+ /*
+ * The old boot data is no longer needed and won't be reserved,
+ * freeing up that memory for use by the system. If SME is active,
+ * we need to remove the mappings that were created so that the
+ * memory doesn't remain mapped as decrypted.
+ */
+ sme_unmap_bootdata(real_mode_data);
}
asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
{
- int i;
-
/*
* Build-time sanity checks on the kernel image and module
* area mappings. (these are purely build-time and produce no code)
@@ -279,11 +334,16 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
clear_page(init_top_pgt);
+ /*
+ * SME support may update early_pmd_flags to include the memory
+ * encryption mask, so it needs to be called before anything
+ * that may generate a page fault.
+ */
+ sme_early_init();
+
kasan_early_init();
- for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
- set_intr_gate(i, early_idt_handler_array[i]);
- load_idt((const struct desc_ptr *)&idt_descr);
+ idt_setup_early_handler();
copy_bootdata(__va(real_mode_data));
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 1f85ee8f9439..9ed3074d0d27 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -155,7 +155,6 @@ ENTRY(startup_32)
jmp *%eax
.Lbad_subarch:
-WEAK(lguest_entry)
WEAK(xen_entry)
/* Unknown implementation; there's really
nothing we can do at this point. */
@@ -165,7 +164,6 @@ WEAK(xen_entry)
subarch_entries:
.long .Ldefault_entry /* normal x86/PC */
- .long lguest_entry /* lguest hypervisor */
.long xen_entry /* Xen hypervisor */
.long .Ldefault_entry /* Moorestown MID */
num_subarch_entries = (. - subarch_entries) / 4
@@ -347,7 +345,6 @@ ENTRY(startup_32_smp)
movl %eax,%cr0
lgdt early_gdt_descr
- lidt idt_descr
ljmp $(__KERNEL_CS),$1f
1: movl $(__KERNEL_DS),%eax # reload all the segment registers
movl %eax,%ss # after changing gdt.
@@ -380,37 +377,6 @@ ENDPROC(startup_32_smp)
*/
__INIT
setup_once:
- /*
- * Set up a idt with 256 interrupt gates that push zero if there
- * is no error code and then jump to early_idt_handler_common.
- * It doesn't actually load the idt - that needs to be done on
- * each CPU. Interrupts are enabled elsewhere, when we can be
- * relatively sure everything is ok.
- */
-
- movl $idt_table,%edi
- movl $early_idt_handler_array,%eax
- movl $NUM_EXCEPTION_VECTORS,%ecx
-1:
- movl %eax,(%edi)
- movl %eax,4(%edi)
- /* interrupt gate, dpl=0, present */
- movl $(0x8E000000 + __KERNEL_CS),2(%edi)
- addl $EARLY_IDT_HANDLER_SIZE,%eax
- addl $8,%edi
- loop 1b
-
- movl $256 - NUM_EXCEPTION_VECTORS,%ecx
- movl $ignore_int,%edx
- movl $(__KERNEL_CS << 16),%eax
- movw %dx,%ax /* selector = 0x0010 = cs */
- movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
-2:
- movl %eax,(%edi)
- movl %edx,4(%edi)
- addl $8,%edi
- loop 2b
-
#ifdef CONFIG_CC_STACKPROTECTOR
/*
* Configure the stack canary. The linker can't handle this by
@@ -457,12 +423,9 @@ early_idt_handler_common:
/* The vector number is in pt_regs->gs */
cld
- pushl %fs /* pt_regs->fs */
- movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */
- pushl %es /* pt_regs->es */
- movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */
- pushl %ds /* pt_regs->ds */
- movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */
+ pushl %fs /* pt_regs->fs (__fsh varies by model) */
+ pushl %es /* pt_regs->es (__esh varies by model) */
+ pushl %ds /* pt_regs->ds (__dsh varies by model) */
pushl %eax /* pt_regs->ax */
pushl %ebp /* pt_regs->bp */
pushl %edi /* pt_regs->di */
@@ -479,9 +442,8 @@ early_idt_handler_common:
/* Load the vector number into EDX */
movl PT_GS(%esp), %edx
- /* Load GS into pt_regs->gs and clear high bits */
+ /* Load GS into pt_regs->gs (and maybe clobber __gsh) */
movw %gs, PT_GS(%esp)
- movw $0, PT_GS+2(%esp)
movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */
call early_fixup_exception
@@ -493,18 +455,17 @@ early_idt_handler_common:
popl %edi /* pt_regs->di */
popl %ebp /* pt_regs->bp */
popl %eax /* pt_regs->ax */
- popl %ds /* pt_regs->ds */
- popl %es /* pt_regs->es */
- popl %fs /* pt_regs->fs */
- popl %gs /* pt_regs->gs */
+ popl %ds /* pt_regs->ds (always ignores __dsh) */
+ popl %es /* pt_regs->es (always ignores __esh) */
+ popl %fs /* pt_regs->fs (always ignores __fsh) */
+ popl %gs /* pt_regs->gs (always ignores __gsh) */
decl %ss:early_recursion_flag
addl $4, %esp /* pop pt_regs->orig_ax */
iret
ENDPROC(early_idt_handler_common)
/* This is the default interrupt "handler" :-) */
- ALIGN
-ignore_int:
+ENTRY(early_ignore_irq)
cld
#ifdef CONFIG_PRINTK
pushl %eax
@@ -539,7 +500,8 @@ ignore_int:
hlt_loop:
hlt
jmp hlt_loop
-ENDPROC(ignore_int)
+ENDPROC(early_ignore_irq)
+
__INITDATA
.align 4
GLOBAL(early_recursion_flag)
@@ -628,7 +590,6 @@ int_msg:
.data
.globl boot_gdt_descr
-.globl idt_descr
ALIGN
# early boot GDT descriptor (must use 1:1 address mapping)
@@ -637,11 +598,6 @@ boot_gdt_descr:
.word __BOOT_DS+7
.long boot_gdt - __PAGE_OFFSET
- .word 0 # 32-bit align idt_desc.address
-idt_descr:
- .word IDT_ENTRIES*8-1 # idt contains 256 entries
- .long idt_table
-
# boot GDT descriptor (later on used by CPU#0):
.word 0 # 32 bit align gdt_desc.address
ENTRY(early_gdt_descr)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6225550883df..513cbb012ecc 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -73,12 +73,19 @@ startup_64:
/* Sanitize CPU configuration */
call verify_cpu
+ /*
+ * Perform pagetable fixups. Additionally, if SME is active, encrypt
+ * the kernel and retrieve the modifier (SME encryption mask if SME
+ * is active) to be added to the initial pgdir entry that will be
+ * programmed into CR3.
+ */
leaq _text(%rip), %rdi
pushq %rsi
call __startup_64
popq %rsi
- movq $(early_top_pgt - __START_KERNEL_map), %rax
+ /* Form the CR3 value being sure to include the CR3 modifier */
+ addq $(early_top_pgt - __START_KERNEL_map), %rax
jmp 1f
ENTRY(secondary_startup_64)
/*
@@ -98,7 +105,16 @@ ENTRY(secondary_startup_64)
/* Sanitize CPU configuration */
call verify_cpu
- movq $(init_top_pgt - __START_KERNEL_map), %rax
+ /*
+ * Retrieve the modifier (SME encryption mask if SME is active) to be
+ * added to the initial pgdir entry that will be programmed into CR3.
+ */
+ pushq %rsi
+ call __startup_secondary_64
+ popq %rsi
+
+ /* Form the CR3 value being sure to include the CR3 modifier */
+ addq $(init_top_pgt - __START_KERNEL_map), %rax
1:
/* Enable PAE mode, PGE and LA57 */
@@ -335,9 +351,9 @@ GLOBAL(name)
NEXT_PAGE(early_top_pgt)
.fill 511,8,0
#ifdef CONFIG_X86_5LEVEL
- .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#else
- .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#endif
NEXT_PAGE(early_dynamic_pgts)
@@ -350,15 +366,15 @@ NEXT_PAGE(init_top_pgt)
.fill 512,8,0
#else
NEXT_PAGE(init_top_pgt)
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_PAGE_OFFSET*8, 0
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
NEXT_PAGE(level3_ident_pgt)
- .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+ .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.fill 511, 8, 0
NEXT_PAGE(level2_ident_pgt)
/* Since I easily can, map the first 1G.
@@ -370,14 +386,14 @@ NEXT_PAGE(level2_ident_pgt)
#ifdef CONFIG_X86_5LEVEL
NEXT_PAGE(level4_kernel_pgt)
.fill 511,8,0
- .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#endif
NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
- .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
NEXT_PAGE(level2_kernel_pgt)
/*
@@ -395,7 +411,7 @@ NEXT_PAGE(level2_kernel_pgt)
NEXT_PAGE(level2_fixmap_pgt)
.fill 506,8,0
- .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
/* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
.fill 5,8,0
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 16f82a3aaec7..8ce4212e2b8d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -345,21 +345,10 @@ static int hpet_shutdown(struct clock_event_device *evt, int timer)
return 0;
}
-static int hpet_resume(struct clock_event_device *evt, int timer)
-{
- if (!timer) {
- hpet_enable_legacy_int();
- } else {
- struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
-
- irq_domain_deactivate_irq(irq_get_irq_data(hdev->irq));
- irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
- disable_hardirq(hdev->irq);
- irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
- enable_irq(hdev->irq);
- }
+static int hpet_resume(struct clock_event_device *evt)
+{
+ hpet_enable_legacy_int();
hpet_print_config();
-
return 0;
}
@@ -417,7 +406,7 @@ static int hpet_legacy_set_periodic(struct clock_event_device *evt)
static int hpet_legacy_resume(struct clock_event_device *evt)
{
- return hpet_resume(evt, 0);
+ return hpet_resume(evt);
}
static int hpet_legacy_next_event(unsigned long delta,
@@ -510,8 +499,14 @@ static int hpet_msi_set_periodic(struct clock_event_device *evt)
static int hpet_msi_resume(struct clock_event_device *evt)
{
struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+ struct irq_data *data = irq_get_irq_data(hdev->irq);
+ struct msi_msg msg;
- return hpet_resume(evt, hdev->num);
+ /* Restore the MSI msg and unmask the interrupt */
+ irq_chip_compose_msi_msg(data, &msg);
+ hpet_msi_write(hdev, &msg);
+ hpet_msi_unmask(data);
+ return 0;
}
static int hpet_msi_next_event(unsigned long delta,
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
new file mode 100644
index 000000000000..6107ee1cb8d5
--- /dev/null
+++ b/arch/x86/kernel/idt.c
@@ -0,0 +1,371 @@
+/*
+ * Interrupt descriptor table related code
+ *
+ * This file is licensed under the GPL V2
+ */
+#include <linux/interrupt.h>
+
+#include <asm/traps.h>
+#include <asm/proto.h>
+#include <asm/desc.h>
+
+struct idt_data {
+ unsigned int vector;
+ unsigned int segment;
+ struct idt_bits bits;
+ const void *addr;
+};
+
+#define DPL0 0x0
+#define DPL3 0x3
+
+#define DEFAULT_STACK 0
+
+#define G(_vector, _addr, _ist, _type, _dpl, _segment) \
+ { \
+ .vector = _vector, \
+ .bits.ist = _ist, \
+ .bits.type = _type, \
+ .bits.dpl = _dpl, \
+ .bits.p = 1, \
+ .addr = _addr, \
+ .segment = _segment, \
+ }
+
+/* Interrupt gate */
+#define INTG(_vector, _addr) \
+ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+
+/* System interrupt gate */
+#define SYSG(_vector, _addr) \
+ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS)
+
+/* Interrupt gate with interrupt stack */
+#define ISTG(_vector, _addr, _ist) \
+ G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+
+/* System interrupt gate with interrupt stack */
+#define SISTG(_vector, _addr, _ist) \
+ G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS)
+
+/* Task gate */
+#define TSKG(_vector, _gdt) \
+ G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3)
+
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initdata struct idt_data early_idts[] = {
+ INTG(X86_TRAP_DB, debug),
+ SYSG(X86_TRAP_BP, int3),
+#ifdef CONFIG_X86_32
+ INTG(X86_TRAP_PF, page_fault),
+#endif
+};
+
+/*
+ * The default IDT entries which are set up in trap_init() before
+ * cpu_init() is invoked. Interrupt stacks cannot be used at that point and
+ * the traps which use them are reinitialized with IST after cpu_init() has
+ * set up TSS.
+ */
+static const __initdata struct idt_data def_idts[] = {
+ INTG(X86_TRAP_DE, divide_error),
+ INTG(X86_TRAP_NMI, nmi),
+ INTG(X86_TRAP_BR, bounds),
+ INTG(X86_TRAP_UD, invalid_op),
+ INTG(X86_TRAP_NM, device_not_available),
+ INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun),
+ INTG(X86_TRAP_TS, invalid_TSS),
+ INTG(X86_TRAP_NP, segment_not_present),
+ INTG(X86_TRAP_SS, stack_segment),
+ INTG(X86_TRAP_GP, general_protection),
+ INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug),
+ INTG(X86_TRAP_MF, coprocessor_error),
+ INTG(X86_TRAP_AC, alignment_check),
+ INTG(X86_TRAP_XF, simd_coprocessor_error),
+
+#ifdef CONFIG_X86_32
+ TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS),
+#else
+ INTG(X86_TRAP_DF, double_fault),
+#endif
+ INTG(X86_TRAP_DB, debug),
+ INTG(X86_TRAP_NMI, nmi),
+ INTG(X86_TRAP_BP, int3),
+
+#ifdef CONFIG_X86_MCE
+ INTG(X86_TRAP_MC, &machine_check),
+#endif
+
+ SYSG(X86_TRAP_OF, overflow),
+#if defined(CONFIG_IA32_EMULATION)
+ SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat),
+#elif defined(CONFIG_X86_32)
+ SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32),
+#endif
+};
+
+/*
+ * The APIC and SMP idt entries
+ */
+static const __initdata struct idt_data apic_idts[] = {
+#ifdef CONFIG_SMP
+ INTG(RESCHEDULE_VECTOR, reschedule_interrupt),
+ INTG(CALL_FUNCTION_VECTOR, call_function_interrupt),
+ INTG(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt),
+ INTG(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt),
+ INTG(REBOOT_VECTOR, reboot_interrupt),
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ INTG(THERMAL_APIC_VECTOR, thermal_interrupt),
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ INTG(THRESHOLD_APIC_VECTOR, threshold_interrupt),
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+ INTG(DEFERRED_ERROR_VECTOR, deferred_error_interrupt),
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ INTG(LOCAL_TIMER_VECTOR, apic_timer_interrupt),
+ INTG(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi),
+# ifdef CONFIG_HAVE_KVM
+ INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi),
+ INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi),
+ INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi),
+# endif
+# ifdef CONFIG_IRQ_WORK
+ INTG(IRQ_WORK_VECTOR, irq_work_interrupt),
+# endif
+ INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt),
+ INTG(ERROR_APIC_VECTOR, error_interrupt),
+#endif
+};
+
+#ifdef CONFIG_X86_64
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initdata struct idt_data early_pf_idts[] = {
+ INTG(X86_TRAP_PF, page_fault),
+};
+
+/*
+ * Override for the debug_idt. Same as the default, but with interrupt
+ * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
+ */
+static const __initdata struct idt_data dbg_idts[] = {
+ INTG(X86_TRAP_DB, debug),
+ INTG(X86_TRAP_BP, int3),
+};
+#endif
+
+/* Must be page-aligned because the real IDT is used in a fixmap. */
+gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
+
+struct desc_ptr idt_descr __ro_after_init = {
+ .size = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1,
+ .address = (unsigned long) idt_table,
+};
+
+#ifdef CONFIG_X86_64
+/* No need to be aligned, but done to keep all IDTs defined the same way. */
+gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
+
+/*
+ * The exceptions which use Interrupt stacks. They are setup after
+ * cpu_init() when the TSS has been initialized.
+ */
+static const __initdata struct idt_data ist_idts[] = {
+ ISTG(X86_TRAP_DB, debug, DEBUG_STACK),
+ ISTG(X86_TRAP_NMI, nmi, NMI_STACK),
+ SISTG(X86_TRAP_BP, int3, DEBUG_STACK),
+ ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK),
+#ifdef CONFIG_X86_MCE
+ ISTG(X86_TRAP_MC, &machine_check, MCE_STACK),
+#endif
+};
+
+/*
+ * Override for the debug_idt. Same as the default, but with interrupt
+ * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
+ */
+const struct desc_ptr debug_idt_descr = {
+ .size = IDT_ENTRIES * 16 - 1,
+ .address = (unsigned long) debug_idt_table,
+};
+#endif
+
+static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
+{
+ unsigned long addr = (unsigned long) d->addr;
+
+ gate->offset_low = (u16) addr;
+ gate->segment = (u16) d->segment;
+ gate->bits = d->bits;
+ gate->offset_middle = (u16) (addr >> 16);
+#ifdef CONFIG_X86_64
+ gate->offset_high = (u32) (addr >> 32);
+ gate->reserved = 0;
+#endif
+}
+
+static void
+idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys)
+{
+ gate_desc desc;
+
+ for (; size > 0; t++, size--) {
+ idt_init_desc(&desc, t);
+ write_idt_entry(idt, t->vector, &desc);
+ if (sys)
+ set_bit(t->vector, used_vectors);
+ }
+}
+
+static void set_intr_gate(unsigned int n, const void *addr)
+{
+ struct idt_data data;
+
+ BUG_ON(n > 0xFF);
+
+ memset(&data, 0, sizeof(data));
+ data.vector = n;
+ data.addr = addr;
+ data.segment = __KERNEL_CS;
+ data.bits.type = GATE_INTERRUPT;
+ data.bits.p = 1;
+
+ idt_setup_from_table(idt_table, &data, 1, false);
+}
+
+/**
+ * idt_setup_early_traps - Initialize the idt table with early traps
+ *
+ * On X8664 these traps do not use interrupt stacks as they can't work
+ * before cpu_init() is invoked and sets up TSS. The IST variants are
+ * installed after that.
+ */
+void __init idt_setup_early_traps(void)
+{
+ idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts),
+ true);
+ load_idt(&idt_descr);
+}
+
+/**
+ * idt_setup_traps - Initialize the idt table with default traps
+ */
+void __init idt_setup_traps(void)
+{
+ idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true);
+}
+
+#ifdef CONFIG_X86_64
+/**
+ * idt_setup_early_pf - Initialize the idt table with early pagefault handler
+ *
+ * On X8664 this does not use interrupt stacks as they can't work before
+ * cpu_init() is invoked and sets up TSS. The IST variant is installed
+ * after that.
+ *
+ * FIXME: Why is 32bit and 64bit installing the PF handler at different
+ * places in the early setup code?
+ */
+void __init idt_setup_early_pf(void)
+{
+ idt_setup_from_table(idt_table, early_pf_idts,
+ ARRAY_SIZE(early_pf_idts), true);
+}
+
+/**
+ * idt_setup_ist_traps - Initialize the idt table with traps using IST
+ */
+void __init idt_setup_ist_traps(void)
+{
+ idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true);
+}
+
+/**
+ * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps
+ */
+void __init idt_setup_debugidt_traps(void)
+{
+ memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
+
+ idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts), false);
+}
+#endif
+
+/**
+ * idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates
+ */
+void __init idt_setup_apic_and_irq_gates(void)
+{
+ int i = FIRST_EXTERNAL_VECTOR;
+ void *entry;
+
+ idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true);
+
+ for_each_clear_bit_from(i, used_vectors, FIRST_SYSTEM_VECTOR) {
+ entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR);
+ set_intr_gate(i, entry);
+ }
+
+ for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
+#ifdef CONFIG_X86_LOCAL_APIC
+ set_bit(i, used_vectors);
+ set_intr_gate(i, spurious_interrupt);
+#else
+ entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR);
+ set_intr_gate(i, entry);
+#endif
+ }
+}
+
+/**
+ * idt_setup_early_handler - Initializes the idt table with early handlers
+ */
+void __init idt_setup_early_handler(void)
+{
+ int i;
+
+ for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
+ set_intr_gate(i, early_idt_handler_array[i]);
+#ifdef CONFIG_X86_32
+ for ( ; i < NR_VECTORS; i++)
+ set_intr_gate(i, early_ignore_irq);
+#endif
+ load_idt(&idt_descr);
+}
+
+/**
+ * idt_invalidate - Invalidate interrupt descriptor table
+ * @addr: The virtual address of the 'invalid' IDT
+ */
+void idt_invalidate(void *addr)
+{
+ struct desc_ptr idt = { .address = (unsigned long) addr, .size = 0 };
+
+ load_idt(&idt);
+}
+
+void __init update_intr_gate(unsigned int n, const void *addr)
+{
+ if (WARN_ON_ONCE(!test_bit(n, used_vectors)))
+ return;
+ set_intr_gate(n, addr);
+}
+
+void alloc_intr_gate(unsigned int n, const void *addr)
+{
+ BUG_ON(n < FIRST_SYSTEM_VECTOR);
+ if (!test_and_set_bit(n, used_vectors))
+ set_intr_gate(n, addr);
+}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 4aa03c5a14c9..52089c043160 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -29,9 +29,6 @@ EXPORT_PER_CPU_SYMBOL(irq_regs);
atomic_t irq_err_count;
-/* Function pointer for generic interrupt vector handling */
-void (*x86_platform_ipi_callback)(void) = NULL;
-
/*
* 'what should we do if we get a hw irq event on an illegal vector'.
* each architecture has to answer this themselves.
@@ -87,13 +84,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
seq_puts(p, " APIC ICR read retries\n");
-#endif
if (x86_platform_ipi_callback) {
seq_printf(p, "%*s: ", prec, "PLT");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
seq_puts(p, " Platform interrupts\n");
}
+#endif
#ifdef CONFIG_SMP
seq_printf(p, "%*s: ", prec, "RES");
for_each_online_cpu(j)
@@ -155,6 +152,12 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis);
seq_puts(p, " Posted-interrupt notification event\n");
+ seq_printf(p, "%*s: ", prec, "NPI");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->kvm_posted_intr_nested_ipis);
+ seq_puts(p, " Nested posted-interrupt event\n");
+
seq_printf(p, "%*s: ", prec, "PIW");
for_each_online_cpu(j)
seq_printf(p, "%10u ",
@@ -177,9 +180,9 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += irq_stats(cpu)->apic_perf_irqs;
sum += irq_stats(cpu)->apic_irq_work_irqs;
sum += irq_stats(cpu)->icr_read_retry_count;
-#endif
if (x86_platform_ipi_callback)
sum += irq_stats(cpu)->x86_platform_ipis;
+#endif
#ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
sum += irq_stats(cpu)->irq_call_count;
@@ -253,26 +256,26 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
return 1;
}
+#ifdef CONFIG_X86_LOCAL_APIC
+/* Function pointer for generic interrupt vector handling */
+void (*x86_platform_ipi_callback)(void) = NULL;
/*
* Handler for X86_PLATFORM_IPI_VECTOR.
*/
-void __smp_x86_platform_ipi(void)
-{
- inc_irq_stat(x86_platform_ipis);
-
- if (x86_platform_ipi_callback)
- x86_platform_ipi_callback();
-}
-
__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
entering_ack_irq();
- __smp_x86_platform_ipi();
+ trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
+ inc_irq_stat(x86_platform_ipis);
+ if (x86_platform_ipi_callback)
+ x86_platform_ipi_callback();
+ trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
exiting_irq();
set_irq_regs(old_regs);
}
+#endif
#ifdef CONFIG_HAVE_KVM
static void dummy_handler(void) {}
@@ -313,21 +316,21 @@ __visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
exiting_irq();
set_irq_regs(old_regs);
}
-#endif
-__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs)
+/*
+ * Handler for POSTED_INTERRUPT_NESTED_VECTOR.
+ */
+__visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
entering_ack_irq();
- trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
- __smp_x86_platform_ipi();
- trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
+ inc_irq_stat(kvm_posted_intr_nested_ipis);
exiting_irq();
set_irq_regs(old_regs);
}
+#endif
-EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
#ifdef CONFIG_HOTPLUG_CPU
@@ -412,7 +415,7 @@ int check_irq_vectors_for_cpu_disable(void)
* this w/o holding vector_lock.
*/
for (vector = FIRST_EXTERNAL_VECTOR;
- vector < first_system_vector; vector++) {
+ vector < FIRST_SYSTEM_VECTOR; vector++) {
if (!test_bit(vector, used_vectors) &&
IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) {
if (++count == this_count)
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index 275487872be2..70dee056f92b 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -11,35 +11,23 @@
#include <asm/trace/irq_vectors.h>
#include <linux/interrupt.h>
-static inline void __smp_irq_work_interrupt(void)
-{
- inc_irq_stat(apic_irq_work_irqs);
- irq_work_run();
-}
-
+#ifdef CONFIG_X86_LOCAL_APIC
__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs)
{
ipi_entering_ack_irq();
- __smp_irq_work_interrupt();
- exiting_irq();
-}
-
-__visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs)
-{
- ipi_entering_ack_irq();
trace_irq_work_entry(IRQ_WORK_VECTOR);
- __smp_irq_work_interrupt();
+ inc_irq_stat(apic_irq_work_irqs);
+ irq_work_run();
trace_irq_work_exit(IRQ_WORK_VECTOR);
exiting_irq();
}
void arch_irq_work_raise(void)
{
-#ifdef CONFIG_X86_LOCAL_APIC
if (!arch_irq_work_has_interrupt())
return;
apic->send_IPI_self(IRQ_WORK_VECTOR);
apic_wait_icr_idle();
-#endif
}
+#endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 7468c6987547..1add9e08e83e 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -55,18 +55,6 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
};
-int vector_used_by_percpu_irq(unsigned int vector)
-{
- int cpu;
-
- for_each_online_cpu(cpu) {
- if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector]))
- return 1;
- }
-
- return 0;
-}
-
void __init init_ISA_irqs(void)
{
struct irq_chip *chip = legacy_pic->chip;
@@ -99,98 +87,12 @@ void __init init_IRQ(void)
x86_init.irqs.intr_init();
}
-static void __init smp_intr_init(void)
-{
-#ifdef CONFIG_SMP
- /*
- * The reschedule interrupt is a CPU-to-CPU reschedule-helper
- * IPI, driven by wakeup.
- */
- alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
- /* IPI for generic function call */
- alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
- /* IPI for generic single function call */
- alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
- call_function_single_interrupt);
-
- /* Low priority IPI to cleanup after moving an irq */
- set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
- set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
-
- /* IPI used for rebooting/stopping */
- alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt);
-#endif /* CONFIG_SMP */
-}
-
-static void __init apic_intr_init(void)
-{
- smp_intr_init();
-
-#ifdef CONFIG_X86_THERMAL_VECTOR
- alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-#endif
-#ifdef CONFIG_X86_MCE_THRESHOLD
- alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-#endif
-
-#ifdef CONFIG_X86_MCE_AMD
- alloc_intr_gate(DEFERRED_ERROR_VECTOR, deferred_error_interrupt);
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
- /* self generated IPI for local APIC timer */
- alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
- /* IPI for X86 platform specific use */
- alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
-#ifdef CONFIG_HAVE_KVM
- /* IPI for KVM to deliver posted interrupt */
- alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi);
- /* IPI for KVM to deliver interrupt to wake up tasks */
- alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi);
-#endif
-
- /* IPI vectors for APIC spurious and error interrupts */
- alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
- alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
- /* IRQ work interrupts: */
-# ifdef CONFIG_IRQ_WORK
- alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt);
-# endif
-
-#endif
-}
-
void __init native_init_IRQ(void)
{
- int i;
-
/* Execute any quirks before the call gates are initialised: */
x86_init.irqs.pre_vector_init();
- apic_intr_init();
-
- /*
- * Cover the whole vector space, no vector can escape
- * us. (some of these will be overridden and become
- * 'special' SMP interrupts)
- */
- i = FIRST_EXTERNAL_VECTOR;
-#ifndef CONFIG_X86_LOCAL_APIC
-#define first_system_vector NR_VECTORS
-#endif
- for_each_clear_bit_from(i, used_vectors, first_system_vector) {
- /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
- set_intr_gate(i, irq_entries_start +
- 8 * (i - FIRST_EXTERNAL_VECTOR));
- }
-#ifdef CONFIG_X86_LOCAL_APIC
- for_each_clear_bit_from(i, used_vectors, NR_VECTORS)
- set_intr_gate(i, spurious_interrupt);
-#endif
+ idt_setup_apic_and_irq_gates();
if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs())
setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 38b64587b31b..fd6f8fbbe6f2 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -33,7 +33,6 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
struct setup_data_node *node = file->private_data;
unsigned long remain;
loff_t pos = *ppos;
- struct page *pg;
void *p;
u64 pa;
@@ -47,18 +46,13 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
count = node->len - pos;
pa = node->paddr + sizeof(struct setup_data) + pos;
- pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
- if (PageHighMem(pg)) {
- p = ioremap_cache(pa, count);
- if (!p)
- return -ENXIO;
- } else
- p = __va(pa);
+ p = memremap(pa, count, MEMREMAP_WB);
+ if (!p)
+ return -ENOMEM;
remain = copy_to_user(user_buf, p, count);
- if (PageHighMem(pg))
- iounmap(p);
+ memunmap(p);
if (remain)
return -EFAULT;
@@ -109,7 +103,6 @@ static int __init create_setup_data_nodes(struct dentry *parent)
struct setup_data *data;
int error;
struct dentry *d;
- struct page *pg;
u64 pa_data;
int no = 0;
@@ -126,16 +119,12 @@ static int __init create_setup_data_nodes(struct dentry *parent)
goto err_dir;
}
- pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
- if (PageHighMem(pg)) {
- data = ioremap_cache(pa_data, sizeof(*data));
- if (!data) {
- kfree(node);
- error = -ENXIO;
- goto err_dir;
- }
- } else
- data = __va(pa_data);
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
+ if (!data) {
+ kfree(node);
+ error = -ENOMEM;
+ goto err_dir;
+ }
node->paddr = pa_data;
node->type = data->type;
@@ -143,8 +132,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
error = create_setup_data_node(d, no, node);
pa_data = data->next;
- if (PageHighMem(pg))
- iounmap(data);
+ memunmap(data);
if (error)
goto err_dir;
no++;
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 6b877807598b..f0153714ddac 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -457,6 +457,8 @@ static int arch_copy_kprobe(struct kprobe *p)
int arch_prepare_kprobe(struct kprobe *p)
{
+ int ret;
+
if (alternatives_text_reserved(p->addr, p->addr))
return -EINVAL;
@@ -467,7 +469,13 @@ int arch_prepare_kprobe(struct kprobe *p)
if (!p->ainsn.insn)
return -ENOMEM;
- return arch_copy_kprobe(p);
+ ret = arch_copy_kprobe(p);
+ if (ret) {
+ free_insn_slot(p->ainsn.insn, 0);
+ p->ainsn.insn = NULL;
+ }
+
+ return ret;
}
void arch_arm_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 69ea0bc1cfa3..4f98aad38237 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -39,6 +39,7 @@
#include <asm/insn.h>
#include <asm/debugreg.h>
#include <asm/set_memory.h>
+#include <asm/sections.h>
#include "common.h"
@@ -251,10 +252,12 @@ static int can_optimize(unsigned long paddr)
/*
* Do not optimize in the entry code due to the unstable
- * stack handling.
+ * stack handling and registers setup.
*/
- if ((paddr >= (unsigned long)__entry_text_start) &&
- (paddr < (unsigned long)__entry_text_end))
+ if (((paddr >= (unsigned long)__entry_text_start) &&
+ (paddr < (unsigned long)__entry_text_end)) ||
+ ((paddr >= (unsigned long)__irqentry_text_start) &&
+ (paddr < (unsigned long)__irqentry_text_end)))
return 0;
/* Check there is enough space for a relative jump. */
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index 4afc67f5facc..4b0592ca9e47 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -16,8 +16,8 @@
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/io.h>
-#include <asm/io.h>
#include <asm/setup.h>
static ssize_t version_show(struct kobject *kobj,
@@ -55,7 +55,7 @@ static struct bin_attribute *boot_params_data_attrs[] = {
NULL,
};
-static struct attribute_group boot_params_attr_group = {
+static const struct attribute_group boot_params_attr_group = {
.attrs = boot_params_version_attrs,
.bin_attrs = boot_params_data_attrs,
};
@@ -79,12 +79,12 @@ static int get_setup_data_paddr(int nr, u64 *paddr)
*paddr = pa_data;
return 0;
}
- data = ioremap_cache(pa_data, sizeof(*data));
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
if (!data)
return -ENOMEM;
pa_data = data->next;
- iounmap(data);
+ memunmap(data);
i++;
}
return -EINVAL;
@@ -97,17 +97,17 @@ static int __init get_setup_data_size(int nr, size_t *size)
u64 pa_data = boot_params.hdr.setup_data;
while (pa_data) {
- data = ioremap_cache(pa_data, sizeof(*data));
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
if (!data)
return -ENOMEM;
if (nr == i) {
*size = data->len;
- iounmap(data);
+ memunmap(data);
return 0;
}
pa_data = data->next;
- iounmap(data);
+ memunmap(data);
i++;
}
return -EINVAL;
@@ -127,12 +127,12 @@ static ssize_t type_show(struct kobject *kobj,
ret = get_setup_data_paddr(nr, &paddr);
if (ret)
return ret;
- data = ioremap_cache(paddr, sizeof(*data));
+ data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
if (!data)
return -ENOMEM;
ret = sprintf(buf, "0x%x\n", data->type);
- iounmap(data);
+ memunmap(data);
return ret;
}
@@ -154,7 +154,7 @@ static ssize_t setup_data_data_read(struct file *fp,
ret = get_setup_data_paddr(nr, &paddr);
if (ret)
return ret;
- data = ioremap_cache(paddr, sizeof(*data));
+ data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
if (!data)
return -ENOMEM;
@@ -170,15 +170,15 @@ static ssize_t setup_data_data_read(struct file *fp,
goto out;
ret = count;
- p = ioremap_cache(paddr + sizeof(*data), data->len);
+ p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB);
if (!p) {
ret = -ENOMEM;
goto out;
}
memcpy(buf, p + off, count);
- iounmap(p);
+ memunmap(p);
out:
- iounmap(data);
+ memunmap(data);
return ret;
}
@@ -202,7 +202,7 @@ static struct bin_attribute *setup_data_data_attrs[] = {
NULL,
};
-static struct attribute_group setup_data_attr_group = {
+static const struct attribute_group setup_data_attr_group = {
.attrs = setup_data_type_attrs,
.bin_attrs = setup_data_data_attrs,
};
@@ -250,13 +250,13 @@ static int __init get_setup_data_total_num(u64 pa_data, int *nr)
*nr = 0;
while (pa_data) {
*nr += 1;
- data = ioremap_cache(pa_data, sizeof(*data));
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
if (!data) {
ret = -ENOMEM;
goto out;
}
pa_data = data->next;
- iounmap(data);
+ memunmap(data);
}
out:
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 71c17a5be983..874827b0d7ca 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -151,6 +151,8 @@ void kvm_async_pf_task_wait(u32 token)
if (hlist_unhashed(&n.link))
break;
+ rcu_irq_exit();
+
if (!n.halted) {
local_irq_enable();
schedule();
@@ -159,11 +161,11 @@ void kvm_async_pf_task_wait(u32 token)
/*
* We cannot reschedule. So halt.
*/
- rcu_irq_exit();
native_safe_halt();
local_irq_disable();
- rcu_irq_enter();
}
+
+ rcu_irq_enter();
}
if (!n.halted)
finish_swait(&n.wq, &wait);
@@ -261,7 +263,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
switch (kvm_read_and_reset_pf_reason()) {
default:
- trace_do_page_fault(regs, error_code);
+ do_page_fault(regs, error_code);
break;
case KVM_PV_REASON_PAGE_NOT_PRESENT:
/* page is swapped out by the host. */
@@ -453,7 +455,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
static void __init kvm_apf_trap_init(void)
{
- set_intr_gate(14, async_page_fault);
+ update_intr_gate(X86_TRAP_PF, async_page_fault);
}
void __init kvm_guest_init(void)
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index a870910c8565..f0e64db18ac8 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -21,6 +21,25 @@
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
+static void refresh_ldt_segments(void)
+{
+#ifdef CONFIG_X86_64
+ unsigned short sel;
+
+ /*
+ * Make sure that the cached DS and ES descriptors match the updated
+ * LDT.
+ */
+ savesegment(ds, sel);
+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+ loadsegment(ds, sel);
+
+ savesegment(es, sel);
+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+ loadsegment(es, sel);
+#endif
+}
+
/* context.lock is held for us, so we don't need any locking. */
static void flush_ldt(void *__mm)
{
@@ -32,6 +51,8 @@ static void flush_ldt(void *__mm)
pc = &mm->context;
set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+
+ refresh_ldt_segments();
}
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 8c53c5d7a1bc..00bc751c861c 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -26,18 +26,6 @@
#include <asm/set_memory.h>
#include <asm/debugreg.h>
-static void set_idt(void *newidt, __u16 limit)
-{
- struct desc_ptr curidt;
-
- /* ia32 supports unaliged loads & stores */
- curidt.size = limit;
- curidt.address = (unsigned long)newidt;
-
- load_idt(&curidt);
-}
-
-
static void set_gdt(void *newgdt, __u16 limit)
{
struct desc_ptr curgdt;
@@ -245,7 +233,7 @@ void machine_kexec(struct kimage *image)
* If you want to load them you must set up your own idt & gdt.
*/
set_gdt(phys_to_virt(0), 0);
- set_idt(phys_to_virt(0), 0);
+ idt_invalidate(phys_to_virt(0));
/* now call it */
image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index cb0a30473c23..1f790cf9d38f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -87,7 +87,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
}
pte = pte_offset_kernel(pmd, vaddr);
- set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
return 0;
err:
free_transition_pgtable(image);
@@ -115,6 +115,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
.alloc_pgt_page = alloc_pgt_page,
.context = image,
.page_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .kernpg_flag = _KERNPG_TABLE_NOENC,
};
unsigned long mstart, mend;
pgd_t *level4p;
@@ -334,7 +335,8 @@ void machine_kexec(struct kimage *image)
image->start = relocate_kernel((unsigned long)image->head,
(unsigned long)page_list,
image->start,
- image->preserve_context);
+ image->preserve_context,
+ sme_active());
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
@@ -602,3 +604,22 @@ void arch_kexec_unprotect_crashkres(void)
{
kexec_mark_crashkres(false);
}
+
+int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
+{
+ /*
+ * If SME is active we need to be sure that kexec pages are
+ * not encrypted because when we boot to the new kernel the
+ * pages won't be accessed encrypted (initially).
+ */
+ return set_memory_decrypted((unsigned long)vaddr, pages);
+}
+
+void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
+{
+ /*
+ * If SME is active we need to reset the pages back to being
+ * an encrypted mapping before freeing them.
+ */
+ set_memory_encrypted((unsigned long)vaddr, pages);
+}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index f67bd3205df7..62e7d70aadd5 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -35,6 +35,7 @@
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/setup.h>
+#include <asm/unwind.h>
#if 0
#define DEBUGP(fmt, ...) \
@@ -213,7 +214,7 @@ int module_finalize(const Elf_Ehdr *hdr,
struct module *me)
{
const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
- *para = NULL;
+ *para = NULL, *orc = NULL, *orc_ip = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -225,6 +226,10 @@ int module_finalize(const Elf_Ehdr *hdr,
locks = s;
if (!strcmp(".parainstructions", secstrings + s->sh_name))
para = s;
+ if (!strcmp(".orc_unwind", secstrings + s->sh_name))
+ orc = s;
+ if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
+ orc_ip = s;
}
if (alt) {
@@ -248,6 +253,10 @@ int module_finalize(const Elf_Ehdr *hdr,
/* make jump label nops */
jump_label_apply_nops(me);
+ if (orc && orc_ip)
+ unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
+ (void *)orc->sh_addr, orc->sh_size);
+
return 0;
}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 0d904d759ff1..5cbb3177ed17 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -429,16 +429,16 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
}
}
-static struct mpf_intel *mpf_found;
+static unsigned long mpf_base;
static unsigned long __init get_mpc_size(unsigned long physptr)
{
struct mpc_table *mpc;
unsigned long size;
- mpc = early_ioremap(physptr, PAGE_SIZE);
+ mpc = early_memremap(physptr, PAGE_SIZE);
size = mpc->length;
- early_iounmap(mpc, PAGE_SIZE);
+ early_memunmap(mpc, PAGE_SIZE);
apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
return size;
@@ -450,7 +450,8 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
unsigned long size;
size = get_mpc_size(mpf->physptr);
- mpc = early_ioremap(mpf->physptr, size);
+ mpc = early_memremap(mpf->physptr, size);
+
/*
* Read the physical hardware table. Anything here will
* override the defaults.
@@ -461,10 +462,10 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
#endif
pr_err("BIOS bug, MP table errors detected!...\n");
pr_cont("... disabling SMP support. (tell your hw vendor)\n");
- early_iounmap(mpc, size);
+ early_memunmap(mpc, size);
return -1;
}
- early_iounmap(mpc, size);
+ early_memunmap(mpc, size);
if (early)
return -1;
@@ -497,12 +498,12 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
*/
void __init default_get_smp_config(unsigned int early)
{
- struct mpf_intel *mpf = mpf_found;
+ struct mpf_intel *mpf;
if (!smp_found_config)
return;
- if (!mpf)
+ if (!mpf_base)
return;
if (acpi_lapic && early)
@@ -515,6 +516,12 @@ void __init default_get_smp_config(unsigned int early)
if (acpi_lapic && acpi_ioapic)
return;
+ mpf = early_memremap(mpf_base, sizeof(*mpf));
+ if (!mpf) {
+ pr_err("MPTABLE: error mapping MP table\n");
+ return;
+ }
+
pr_info("Intel MultiProcessor Specification v1.%d\n",
mpf->specification);
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
@@ -529,7 +536,7 @@ void __init default_get_smp_config(unsigned int early)
/*
* Now see if we need to read further.
*/
- if (mpf->feature1 != 0) {
+ if (mpf->feature1) {
if (early) {
/*
* local APIC has default address
@@ -542,8 +549,10 @@ void __init default_get_smp_config(unsigned int early)
construct_default_ISA_mptable(mpf->feature1);
} else if (mpf->physptr) {
- if (check_physptr(mpf, early))
+ if (check_physptr(mpf, early)) {
+ early_memunmap(mpf, sizeof(*mpf));
return;
+ }
} else
BUG();
@@ -552,6 +561,8 @@ void __init default_get_smp_config(unsigned int early)
/*
* Only use the first configuration found.
*/
+
+ early_memunmap(mpf, sizeof(*mpf));
}
static void __init smp_reserve_memory(struct mpf_intel *mpf)
@@ -561,15 +572,16 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf)
static int __init smp_scan_config(unsigned long base, unsigned long length)
{
- unsigned int *bp = phys_to_virt(base);
+ unsigned int *bp;
struct mpf_intel *mpf;
- unsigned long mem;
+ int ret = 0;
apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n",
base, base + length - 1);
BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
+ bp = early_memremap(base, length);
mpf = (struct mpf_intel *)bp;
if ((*bp == SMP_MAGIC_IDENT) &&
(mpf->length == 1) &&
@@ -579,24 +591,26 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 1;
#endif
- mpf_found = mpf;
+ mpf_base = base;
- pr_info("found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
- (unsigned long long) virt_to_phys(mpf),
- (unsigned long long) virt_to_phys(mpf) +
- sizeof(*mpf) - 1, mpf);
+ pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n",
+ base, base + sizeof(*mpf) - 1, mpf);
- mem = virt_to_phys(mpf);
- memblock_reserve(mem, sizeof(*mpf));
+ memblock_reserve(base, sizeof(*mpf));
if (mpf->physptr)
smp_reserve_memory(mpf);
- return 1;
+ ret = 1;
}
- bp += 4;
+ early_memunmap(bp, length);
+
+ if (ret)
+ break;
+
+ base += 16;
length -= 16;
}
- return 0;
+ return ret;
}
void __init default_find_smp_config(void)
@@ -838,29 +852,40 @@ static int __init update_mp_table(void)
char oem[10];
struct mpf_intel *mpf;
struct mpc_table *mpc, *mpc_new;
+ unsigned long size;
if (!enable_update_mptable)
return 0;
- mpf = mpf_found;
- if (!mpf)
+ if (!mpf_base)
return 0;
+ mpf = early_memremap(mpf_base, sizeof(*mpf));
+ if (!mpf) {
+ pr_err("MPTABLE: mpf early_memremap() failed\n");
+ return 0;
+ }
+
/*
* Now see if we need to go further.
*/
- if (mpf->feature1 != 0)
- return 0;
+ if (mpf->feature1)
+ goto do_unmap_mpf;
if (!mpf->physptr)
- return 0;
+ goto do_unmap_mpf;
- mpc = phys_to_virt(mpf->physptr);
+ size = get_mpc_size(mpf->physptr);
+ mpc = early_memremap(mpf->physptr, size);
+ if (!mpc) {
+ pr_err("MPTABLE: mpc early_memremap() failed\n");
+ goto do_unmap_mpf;
+ }
if (!smp_check_mpc(mpc, oem, str))
- return 0;
+ goto do_unmap_mpc;
- pr_info("mpf: %llx\n", (u64)virt_to_phys(mpf));
+ pr_info("mpf: %llx\n", (u64)mpf_base);
pr_info("physptr: %x\n", mpf->physptr);
if (mpc_new_phys && mpc->length > mpc_new_length) {
@@ -878,21 +903,32 @@ static int __init update_mp_table(void)
new = mpf_checksum((unsigned char *)mpc, mpc->length);
if (old == new) {
pr_info("mpc is readonly, please try alloc_mptable instead\n");
- return 0;
+ goto do_unmap_mpc;
}
pr_info("use in-position replacing\n");
} else {
+ mpc_new = early_memremap(mpc_new_phys, mpc_new_length);
+ if (!mpc_new) {
+ pr_err("MPTABLE: new mpc early_memremap() failed\n");
+ goto do_unmap_mpc;
+ }
mpf->physptr = mpc_new_phys;
- mpc_new = phys_to_virt(mpc_new_phys);
memcpy(mpc_new, mpc, mpc->length);
+ early_memunmap(mpc, size);
mpc = mpc_new;
+ size = mpc_new_length;
/* check if we can modify that */
if (mpc_new_phys - mpf->physptr) {
struct mpf_intel *mpf_new;
/* steal 16 bytes from [0, 1k) */
+ mpf_new = early_memremap(0x400 - 16, sizeof(*mpf_new));
+ if (!mpf_new) {
+ pr_err("MPTABLE: new mpf early_memremap() failed\n");
+ goto do_unmap_mpc;
+ }
pr_info("mpf new: %x\n", 0x400 - 16);
- mpf_new = phys_to_virt(0x400 - 16);
memcpy(mpf_new, mpf, 16);
+ early_memunmap(mpf, sizeof(*mpf));
mpf = mpf_new;
mpf->physptr = mpc_new_phys;
}
@@ -909,6 +945,12 @@ static int __init update_mp_table(void)
*/
replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
+do_unmap_mpc:
+ early_memunmap(mpc, size);
+
+do_unmap_mpf:
+ early_memunmap(mpf, sizeof(*mpf));
+
return 0;
}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 446c8aa09b9b..35aafc95e4b8 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -39,26 +39,26 @@
#include <trace/events/nmi.h>
struct nmi_desc {
- spinlock_t lock;
+ raw_spinlock_t lock;
struct list_head head;
};
static struct nmi_desc nmi_desc[NMI_MAX] =
{
{
- .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
.head = LIST_HEAD_INIT(nmi_desc[0].head),
},
{
- .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
.head = LIST_HEAD_INIT(nmi_desc[1].head),
},
{
- .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
.head = LIST_HEAD_INIT(nmi_desc[2].head),
},
{
- .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
.head = LIST_HEAD_INIT(nmi_desc[3].head),
},
@@ -163,7 +163,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
init_irq_work(&action->irq_work, nmi_max_handler);
- spin_lock_irqsave(&desc->lock, flags);
+ raw_spin_lock_irqsave(&desc->lock, flags);
/*
* Indicate if there are multiple registrations on the
@@ -181,7 +181,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
else
list_add_tail_rcu(&action->list, &desc->head);
- spin_unlock_irqrestore(&desc->lock, flags);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
EXPORT_SYMBOL(__register_nmi_handler);
@@ -192,7 +192,7 @@ void unregister_nmi_handler(unsigned int type, const char *name)
struct nmiaction *n;
unsigned long flags;
- spin_lock_irqsave(&desc->lock, flags);
+ raw_spin_lock_irqsave(&desc->lock, flags);
list_for_each_entry_rcu(n, &desc->head, list) {
/*
@@ -207,7 +207,7 @@ void unregister_nmi_handler(unsigned int type, const char *name)
}
}
- spin_unlock_irqrestore(&desc->lock, flags);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
synchronize_rcu();
}
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index bc0a849589bb..a14df9eecfed 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -319,9 +319,6 @@ __visible struct pv_irq_ops pv_irq_ops = {
.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
.safe_halt = native_safe_halt,
.halt = native_halt,
-#ifdef CONFIG_X86_64
- .adjust_exception_frame = paravirt_nop,
-#endif
};
__visible struct pv_cpu_ops pv_cpu_ops = {
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 5e16d3f29594..0accc2404b92 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -93,9 +93,12 @@ again:
if (gfpflags_allow_blocking(flag)) {
page = dma_alloc_from_contiguous(dev, count, get_order(size),
flag);
- if (page && page_to_phys(page) + size > dma_mask) {
- dma_release_from_contiguous(dev, page, count);
- page = NULL;
+ if (page) {
+ addr = phys_to_dma(dev, page_to_phys(page));
+ if (addr + size > dma_mask) {
+ dma_release_from_contiguous(dev, page, count);
+ page = NULL;
+ }
}
}
/* fallback */
@@ -104,7 +107,7 @@ again:
if (!page)
return NULL;
- addr = page_to_phys(page);
+ addr = phys_to_dma(dev, page_to_phys(page));
if (addr + size > dma_mask) {
__free_pages(page, get_order(size));
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a6d404087fe3..4fc3cb60ea11 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -32,7 +32,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
enum dma_data_direction dir,
unsigned long attrs)
{
- dma_addr_t bus = page_to_phys(page) + offset;
+ dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset;
WARN_ON(size == 0);
if (!check_addr("map_single", dev, bus, size))
return NOMMU_MAPPING_ERROR;
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 1e23577e17cf..677077510e30 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -6,12 +6,14 @@
#include <linux/swiotlb.h>
#include <linux/bootmem.h>
#include <linux/dma-mapping.h>
+#include <linux/mem_encrypt.h>
#include <asm/iommu.h>
#include <asm/swiotlb.h>
#include <asm/dma.h>
#include <asm/xen/swiotlb-xen.h>
#include <asm/iommu_table.h>
+
int swiotlb __read_mostly;
void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
@@ -79,8 +81,8 @@ IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
pci_swiotlb_late_init);
/*
- * if 4GB or more detected (and iommu=off not set) return 1
- * and set swiotlb to 1.
+ * If 4GB or more detected (and iommu=off not set) or if SME is active
+ * then set swiotlb to 1 and return 1.
*/
int __init pci_swiotlb_detect_4gb(void)
{
@@ -89,6 +91,15 @@ int __init pci_swiotlb_detect_4gb(void)
if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN)
swiotlb = 1;
#endif
+
+ /*
+ * If SME is active then swiotlb will be set to 1 so that bounce
+ * buffers are allocated and used for devices that do not support
+ * the addressing range required for the encryption mask.
+ */
+ if (sme_active())
+ swiotlb = 1;
+
return swiotlb;
}
IOMMU_INIT(pci_swiotlb_detect_4gb,
diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c
index 91271122f0df..502a77d0adb0 100644
--- a/arch/x86/kernel/platform-quirks.c
+++ b/arch/x86/kernel/platform-quirks.c
@@ -16,7 +16,6 @@ void __init x86_early_init_platform_quirks(void)
x86_platform.legacy.reserve_bios_regions = 1;
break;
case X86_SUBARCH_XEN:
- case X86_SUBARCH_LGUEST:
x86_platform.legacy.devices.pnpbios = 0;
x86_platform.legacy.rtc = 0;
break;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3ca198080ea9..bd6b85fac666 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -355,6 +355,7 @@ bool xen_set_default_idle(void)
return ret;
}
#endif
+
void stop_this_cpu(void *dummy)
{
local_irq_disable();
@@ -365,8 +366,20 @@ void stop_this_cpu(void *dummy)
disable_local_APIC();
mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
- for (;;)
- halt();
+ for (;;) {
+ /*
+ * Use wbinvd followed by hlt to stop the processor. This
+ * provides support for kexec on a processor that supports
+ * SME. With kexec, going from SME inactive to SME active
+ * requires clearing cache entries so that addresses without
+ * the encryption bit set don't corrupt the same physical
+ * address that has the encryption bit set when caches are
+ * flushed. To achieve this a wbinvd is performed followed by
+ * a hlt. Even if the processor is not in the kexec/SME
+ * scenario this only adds a wbinvd to a halting processor.
+ */
+ asm volatile("wbinvd; hlt" : : : "memory");
+ }
}
/*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index c6d6dc5f8bb2..11966251cd42 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -56,7 +56,7 @@
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/vm86.h>
-#include <asm/intel_rdt.h>
+#include <asm/intel_rdt_sched.h>
#include <asm/proto.h>
void __show_regs(struct pt_regs *regs, int all)
@@ -68,7 +68,7 @@ void __show_regs(struct pt_regs *regs, int all)
if (user_mode(regs)) {
sp = regs->sp;
- ss = regs->ss & 0xffff;
+ ss = regs->ss;
gs = get_user_gs(regs);
} else {
sp = kernel_stack_pointer(regs);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c3169be4c596..302e7b2572d1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,7 +52,7 @@
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
-#include <asm/intel_rdt.h>
+#include <asm/intel_rdt_sched.h>
#include <asm/unistd.h>
#ifdef CONFIG_IA32_EMULATION
/* Not included via unistd.h */
@@ -69,8 +69,7 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned int fsindex, gsindex;
unsigned int ds, cs, es;
- printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff,
- (void *)regs->ip);
+ printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip);
printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
regs->sp, regs->flags);
if (regs->orig_ax != -1)
@@ -149,6 +148,123 @@ void release_thread(struct task_struct *dead_task)
}
}
+enum which_selector {
+ FS,
+ GS
+};
+
+/*
+ * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
+ * not available. The goal is to be reasonably fast on non-FSGSBASE systems.
+ * It's forcibly inlined because it'll generate better code and this function
+ * is hot.
+ */
+static __always_inline void save_base_legacy(struct task_struct *prev_p,
+ unsigned short selector,
+ enum which_selector which)
+{
+ if (likely(selector == 0)) {
+ /*
+ * On Intel (without X86_BUG_NULL_SEG), the segment base could
+ * be the pre-existing saved base or it could be zero. On AMD
+ * (with X86_BUG_NULL_SEG), the segment base could be almost
+ * anything.
+ *
+ * This branch is very hot (it's hit twice on almost every
+ * context switch between 64-bit programs), and avoiding
+ * the RDMSR helps a lot, so we just assume that whatever
+ * value is already saved is correct. This matches historical
+ * Linux behavior, so it won't break existing applications.
+ *
+ * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
+ * report that the base is zero, it needs to actually be zero:
+ * see the corresponding logic in load_seg_legacy.
+ */
+ } else {
+ /*
+ * If the selector is 1, 2, or 3, then the base is zero on
+ * !X86_BUG_NULL_SEG CPUs and could be anything on
+ * X86_BUG_NULL_SEG CPUs. In the latter case, Linux
+ * has never attempted to preserve the base across context
+ * switches.
+ *
+ * If selector > 3, then it refers to a real segment, and
+ * saving the base isn't necessary.
+ */
+ if (which == FS)
+ prev_p->thread.fsbase = 0;
+ else
+ prev_p->thread.gsbase = 0;
+ }
+}
+
+static __always_inline void save_fsgs(struct task_struct *task)
+{
+ savesegment(fs, task->thread.fsindex);
+ savesegment(gs, task->thread.gsindex);
+ save_base_legacy(task, task->thread.fsindex, FS);
+ save_base_legacy(task, task->thread.gsindex, GS);
+}
+
+static __always_inline void loadseg(enum which_selector which,
+ unsigned short sel)
+{
+ if (which == FS)
+ loadsegment(fs, sel);
+ else
+ load_gs_index(sel);
+}
+
+static __always_inline void load_seg_legacy(unsigned short prev_index,
+ unsigned long prev_base,
+ unsigned short next_index,
+ unsigned long next_base,
+ enum which_selector which)
+{
+ if (likely(next_index <= 3)) {
+ /*
+ * The next task is using 64-bit TLS, is not using this
+ * segment at all, or is having fun with arcane CPU features.
+ */
+ if (next_base == 0) {
+ /*
+ * Nasty case: on AMD CPUs, we need to forcibly zero
+ * the base.
+ */
+ if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+ loadseg(which, __USER_DS);
+ loadseg(which, next_index);
+ } else {
+ /*
+ * We could try to exhaustively detect cases
+ * under which we can skip the segment load,
+ * but there's really only one case that matters
+ * for performance: if both the previous and
+ * next states are fully zeroed, we can skip
+ * the load.
+ *
+ * (This assumes that prev_base == 0 has no
+ * false positives. This is the case on
+ * Intel-style CPUs.)
+ */
+ if (likely(prev_index | next_index | prev_base))
+ loadseg(which, next_index);
+ }
+ } else {
+ if (prev_index != next_index)
+ loadseg(which, next_index);
+ wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
+ next_base);
+ }
+ } else {
+ /*
+ * The next task is using a real segment. Loading the selector
+ * is sufficient.
+ */
+ loadseg(which, next_index);
+ }
+}
+
int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p, unsigned long tls)
{
@@ -229,10 +345,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
unsigned long new_sp,
unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
+ WARN_ON_ONCE(regs != current_pt_regs());
+
+ if (static_cpu_has(X86_BUG_NULL_SEG)) {
+ /* Loading zero below won't clear the base. */
+ loadsegment(fs, __USER_DS);
+ load_gs_index(__USER_DS);
+ }
+
loadsegment(fs, 0);
loadsegment(es, _ds);
loadsegment(ds, _ds);
load_gs_index(0);
+
regs->ip = new_ip;
regs->sp = new_sp;
regs->cs = _cs;
@@ -277,7 +402,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
- unsigned prev_fsindex, prev_gsindex;
+
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
+ this_cpu_read(irq_count) != -1);
switch_fpu_prepare(prev_fpu, cpu);
@@ -286,8 +413,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*
* (e.g. xen_load_tls())
*/
- savesegment(fs, prev_fsindex);
- savesegment(gs, prev_gsindex);
+ save_fsgs(prev_p);
/*
* Load TLS before restoring any segments so that segment loads
@@ -326,108 +452,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (unlikely(next->ds | prev->ds))
loadsegment(ds, next->ds);
- /*
- * Switch FS and GS.
- *
- * These are even more complicated than DS and ES: they have
- * 64-bit bases are that controlled by arch_prctl. The bases
- * don't necessarily match the selectors, as user code can do
- * any number of things to cause them to be inconsistent.
- *
- * We don't promise to preserve the bases if the selectors are
- * nonzero. We also don't promise to preserve the base if the
- * selector is zero and the base doesn't match whatever was
- * most recently passed to ARCH_SET_FS/GS. (If/when the
- * FSGSBASE instructions are enabled, we'll need to offer
- * stronger guarantees.)
- *
- * As an invariant,
- * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is
- * impossible.
- */
- if (next->fsindex) {
- /* Loading a nonzero value into FS sets the index and base. */
- loadsegment(fs, next->fsindex);
- } else {
- if (next->fsbase) {
- /* Next index is zero but next base is nonzero. */
- if (prev_fsindex)
- loadsegment(fs, 0);
- wrmsrl(MSR_FS_BASE, next->fsbase);
- } else {
- /* Next base and index are both zero. */
- if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
- /*
- * We don't know the previous base and can't
- * find out without RDMSR. Forcibly clear it.
- */
- loadsegment(fs, __USER_DS);
- loadsegment(fs, 0);
- } else {
- /*
- * If the previous index is zero and ARCH_SET_FS
- * didn't change the base, then the base is
- * also zero and we don't need to do anything.
- */
- if (prev->fsbase || prev_fsindex)
- loadsegment(fs, 0);
- }
- }
- }
- /*
- * Save the old state and preserve the invariant.
- * NB: if prev_fsindex == 0, then we can't reliably learn the base
- * without RDMSR because Intel user code can zero it without telling
- * us and AMD user code can program any 32-bit value without telling
- * us.
- */
- if (prev_fsindex)
- prev->fsbase = 0;
- prev->fsindex = prev_fsindex;
-
- if (next->gsindex) {
- /* Loading a nonzero value into GS sets the index and base. */
- load_gs_index(next->gsindex);
- } else {
- if (next->gsbase) {
- /* Next index is zero but next base is nonzero. */
- if (prev_gsindex)
- load_gs_index(0);
- wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase);
- } else {
- /* Next base and index are both zero. */
- if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
- /*
- * We don't know the previous base and can't
- * find out without RDMSR. Forcibly clear it.
- *
- * This contains a pointless SWAPGS pair.
- * Fixing it would involve an explicit check
- * for Xen or a new pvop.
- */
- load_gs_index(__USER_DS);
- load_gs_index(0);
- } else {
- /*
- * If the previous index is zero and ARCH_SET_GS
- * didn't change the base, then the base is
- * also zero and we don't need to do anything.
- */
- if (prev->gsbase || prev_gsindex)
- load_gs_index(0);
- }
- }
- }
- /*
- * Save the old state and preserve the invariant.
- * NB: if prev_gsindex == 0, then we can't reliably learn the base
- * without RDMSR because Intel user code can zero it without telling
- * us and AMD user code can program any 32-bit value without telling
- * us.
- */
- if (prev_gsindex)
- prev->gsbase = 0;
- prev->gsindex = prev_gsindex;
+ load_seg_legacy(prev->fsindex, prev->fsbase,
+ next->fsindex, next->fsbase, FS);
+ load_seg_legacy(prev->gsindex, prev->gsbase,
+ next->gsindex, next->gsbase, GS);
switch_fpu_finish(next_fpu, cpu);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 67393fc88353..54984b142641 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -38,8 +38,6 @@
void (*pm_power_off)(void);
EXPORT_SYMBOL(pm_power_off);
-static const struct desc_ptr no_idt = {};
-
/*
* This is set if we need to go through the 'emergency' path.
* When machine_emergency_restart() is called, we may be on
@@ -471,12 +469,12 @@ static int __init reboot_init(void)
/*
* The DMI quirks table takes precedence. If no quirks entry
- * matches and the ACPI Hardware Reduced bit is set, force EFI
- * reboot.
+ * matches and the ACPI Hardware Reduced bit is set and EFI
+ * runtime services are enabled, force EFI reboot.
*/
rv = dmi_check_system(reboot_dmi_table);
- if (!rv && efi_reboot_required())
+ if (!rv && efi_reboot_required() && !efi_runtime_disabled())
reboot_type = BOOT_EFI;
return 0;
@@ -638,7 +636,7 @@ static void native_machine_emergency_restart(void)
break;
case BOOT_TRIPLE:
- load_idt(&no_idt);
+ idt_invalidate(NULL);
__asm__ __volatile__("int3");
/* We're probably dead after this, but... */
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 98111b38ebfd..307d3bac5f04 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -47,6 +47,7 @@ relocate_kernel:
* %rsi page_list
* %rdx start address
* %rcx preserve_context
+ * %r8 sme_active
*/
/* Save the CPU context, used for jumping back */
@@ -71,6 +72,9 @@ relocate_kernel:
pushq $0
popfq
+ /* Save SME active flag */
+ movq %r8, %r12
+
/*
* get physical address of control page now
* this is impossible after page table switch
@@ -132,6 +136,16 @@ identity_mapped:
/* Flush the TLB (needed?) */
movq %r9, %cr3
+ /*
+ * If SME is active, there could be old encrypted cache line
+ * entries that will conflict with the now unencrypted memory
+ * used by kexec. Flush the caches before copying the kernel.
+ */
+ testq %r12, %r12
+ jz 1f
+ wbinvd
+1:
+
movq %rcx, %r11
call swap_pages
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3486d0498800..9cc16a841745 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -69,6 +69,7 @@
#include <linux/crash_dump.h>
#include <linux/tboot.h>
#include <linux/jiffies.h>
+#include <linux/mem_encrypt.h>
#include <linux/usb/xhci-dbgp.h>
#include <video/edid.h>
@@ -115,6 +116,7 @@
#include <asm/microcode.h>
#include <asm/mmu_context.h>
#include <asm/kaslr.h>
+#include <asm/unwind.h>
/*
* max_low_pfn_mapped: highest direct mapped pfn under 4GB
@@ -374,6 +376,14 @@ static void __init reserve_initrd(void)
!ramdisk_image || !ramdisk_size)
return; /* No initrd provided by bootloader */
+ /*
+ * If SME is active, this memory will be marked encrypted by the
+ * kernel when it is accessed (including relocation). However, the
+ * ramdisk image was loaded decrypted by the bootloader, so make
+ * sure that it is encrypted before accessing it.
+ */
+ sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image);
+
initrd_start = 0;
mapped_size = memblock_mem_size(max_pfn_mapped);
@@ -890,7 +900,7 @@ void __init setup_arch(char **cmdline_p)
*/
olpc_ofw_detect();
- early_trap_init();
+ idt_setup_early_traps();
early_cpu_init();
early_ioremap_init();
@@ -1161,7 +1171,7 @@ void __init setup_arch(char **cmdline_p)
init_mem_mapping();
- early_trap_pf_init();
+ idt_setup_early_pf();
/*
* Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
@@ -1310,6 +1320,8 @@ void __init setup_arch(char **cmdline_p)
if (efi_enabled(EFI_BOOT))
efi_apply_memmap_quirks();
#endif
+
+ unwind_init();
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 10edd1e69a68..6e8fcb6f7e1e 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -155,13 +155,10 @@ static void __init pcpup_populate_pte(unsigned long addr)
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
- struct desc_struct gdt;
+ struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu),
+ 0xFFFFF);
- pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
- 0x2 | DESCTYPE_S, 0x8);
- gdt.s = 1;
- write_gdt_entry(get_cpu_gdt_rw(cpu),
- GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
+ write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S);
#endif
}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index cc30a74e4adb..e04442345fc0 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -256,7 +256,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
sp = current->sas_ss_sp + current->sas_ss_size;
} else if (IS_ENABLED(CONFIG_X86_32) &&
!onsigstack &&
- (regs->ss & 0xffff) != __USER_DS &&
+ regs->ss != __USER_DS &&
!(ka->sa.sa_flags & SA_RESTORER) &&
ka->sa.sa_restorer) {
/* This is the legacy signal stack switching. */
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index d798c0da451c..5c574dff4c1a 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -254,84 +254,45 @@ finish:
}
/*
- * Reschedule call back.
+ * Reschedule call back. KVM uses this interrupt to force a cpu out of
+ * guest mode
*/
-static inline void __smp_reschedule_interrupt(void)
-{
- inc_irq_stat(irq_resched_count);
- scheduler_ipi();
-}
-
__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs)
{
ack_APIC_irq();
- __smp_reschedule_interrupt();
- /*
- * KVM uses this interrupt to force a cpu out of guest mode
- */
-}
-
-__visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs)
-{
- /*
- * Need to call irq_enter() before calling the trace point.
- * __smp_reschedule_interrupt() calls irq_enter/exit() too (in
- * scheduler_ipi(). This is OK, since those functions are allowed
- * to nest.
- */
- ipi_entering_ack_irq();
- trace_reschedule_entry(RESCHEDULE_VECTOR);
- __smp_reschedule_interrupt();
- trace_reschedule_exit(RESCHEDULE_VECTOR);
- exiting_irq();
- /*
- * KVM uses this interrupt to force a cpu out of guest mode
- */
-}
+ inc_irq_stat(irq_resched_count);
-static inline void __smp_call_function_interrupt(void)
-{
- generic_smp_call_function_interrupt();
- inc_irq_stat(irq_call_count);
+ if (trace_resched_ipi_enabled()) {
+ /*
+ * scheduler_ipi() might call irq_enter() as well, but
+ * nested calls are fine.
+ */
+ irq_enter();
+ trace_reschedule_entry(RESCHEDULE_VECTOR);
+ scheduler_ipi();
+ trace_reschedule_exit(RESCHEDULE_VECTOR);
+ irq_exit();
+ return;
+ }
+ scheduler_ipi();
}
__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs)
{
ipi_entering_ack_irq();
- __smp_call_function_interrupt();
- exiting_irq();
-}
-
-__visible void __irq_entry
-smp_trace_call_function_interrupt(struct pt_regs *regs)
-{
- ipi_entering_ack_irq();
trace_call_function_entry(CALL_FUNCTION_VECTOR);
- __smp_call_function_interrupt();
- trace_call_function_exit(CALL_FUNCTION_VECTOR);
- exiting_irq();
-}
-
-static inline void __smp_call_function_single_interrupt(void)
-{
- generic_smp_call_function_single_interrupt();
inc_irq_stat(irq_call_count);
-}
-
-__visible void __irq_entry
-smp_call_function_single_interrupt(struct pt_regs *regs)
-{
- ipi_entering_ack_irq();
- __smp_call_function_single_interrupt();
+ generic_smp_call_function_interrupt();
+ trace_call_function_exit(CALL_FUNCTION_VECTOR);
exiting_irq();
}
-__visible void __irq_entry
-smp_trace_call_function_single_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_call_function_single_interrupt(struct pt_regs *r)
{
ipi_entering_ack_irq();
trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
- __smp_call_function_single_interrupt();
+ inc_irq_stat(irq_call_count);
+ generic_smp_call_function_single_interrupt();
trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
exiting_irq();
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b474c8de7fba..54b9e89d4d6b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -971,7 +971,8 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
* Returns zero if CPU booted OK, else error code from
* ->wakeup_secondary_cpu.
*/
-static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
+static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
+ int *cpu0_nmi_registered)
{
volatile u32 *trampoline_status =
(volatile u32 *) __va(real_mode_header->trampoline_status);
@@ -979,7 +980,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
unsigned long start_ip = real_mode_header->trampoline_start;
unsigned long boot_error = 0;
- int cpu0_nmi_registered = 0;
unsigned long timeout;
idle->thread.sp = (unsigned long)task_pt_regs(idle);
@@ -1035,7 +1035,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
else
boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
- &cpu0_nmi_registered);
+ cpu0_nmi_registered);
if (!boot_error) {
/*
@@ -1080,12 +1080,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
*/
smpboot_restore_warm_reset_vector();
}
- /*
- * Clean up the nmi handler. Do this after the callin and callout sync
- * to avoid impact of possible long unregister time.
- */
- if (cpu0_nmi_registered)
- unregister_nmi_handler(NMI_LOCAL, "wake_cpu0");
return boot_error;
}
@@ -1093,8 +1087,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
{
int apicid = apic->cpu_present_to_apicid(cpu);
+ int cpu0_nmi_registered = 0;
unsigned long flags;
- int err;
+ int err, ret = 0;
WARN_ON(irqs_disabled());
@@ -1131,10 +1126,11 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
common_cpu_up(cpu, tidle);
- err = do_boot_cpu(apicid, cpu, tidle);
+ err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered);
if (err) {
pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
- return -EIO;
+ ret = -EIO;
+ goto unreg_nmi;
}
/*
@@ -1150,7 +1146,15 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
touch_nmi_watchdog();
}
- return 0;
+unreg_nmi:
+ /*
+ * Clean up the nmi handler. Do this after the callin and callout sync
+ * to avoid impact of possible long unregister time.
+ */
+ if (cpu0_nmi_registered)
+ unregister_nmi_handler(NMI_LOCAL, "wake_cpu0");
+
+ return ret;
}
/**
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 5f25cfbd952e..5ee663836c08 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -13,7 +13,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
unsigned long addr, seg;
addr = regs->ip;
- seg = regs->cs & 0xffff;
+ seg = regs->cs;
if (v8086_mode(regs)) {
addr = (addr & 0xffff) + (seg << 4);
return addr;
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 213ddf3e937d..73e4d28112f8 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -21,6 +21,7 @@
#include <asm/compat.h>
#include <asm/ia32.h>
#include <asm/syscalls.h>
+#include <asm/mpx.h>
/*
* Align a virtual address to avoid aliasing in the I$ on AMD F15h.
@@ -100,8 +101,8 @@ out:
return error;
}
-static void find_start_end(unsigned long flags, unsigned long *begin,
- unsigned long *end)
+static void find_start_end(unsigned long addr, unsigned long flags,
+ unsigned long *begin, unsigned long *end)
{
if (!in_compat_syscall() && (flags & MAP_32BIT)) {
/* This is usually used needed to map code in small
@@ -120,7 +121,10 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
}
*begin = get_mmap_base(1);
- *end = in_compat_syscall() ? tasksize_32bit() : tasksize_64bit();
+ if (in_compat_syscall())
+ *end = task_size_32bit();
+ else
+ *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW);
}
unsigned long
@@ -132,10 +136,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct vm_unmapped_area_info info;
unsigned long begin, end;
+ addr = mpx_unmapped_area_check(addr, len, flags);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
if (flags & MAP_FIXED)
return addr;
- find_start_end(flags, &begin, &end);
+ find_start_end(addr, flags, &begin, &end);
if (len > end)
return -ENOMEM;
@@ -171,6 +179,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
unsigned long addr = addr0;
struct vm_unmapped_area_info info;
+ addr = mpx_unmapped_area_check(addr, len, flags);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
/* requested length too big for entire address space */
if (len > TASK_SIZE)
return -ENOMEM;
@@ -195,6 +207,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
info.length = len;
info.low_limit = PAGE_SIZE;
info.high_limit = get_mmap_base(0);
+
+ /*
+ * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+ * in the full address space.
+ *
+ * !in_compat_syscall() check to avoid high addresses for x32.
+ */
+ if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
+ info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
+
info.align_mask = 0;
info.align_offset = pgoff << PAGE_SHIFT;
if (filp) {
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index dcd699baea1b..a106b9719c58 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -93,7 +93,7 @@ static void set_tls_desc(struct task_struct *p, int idx,
while (n-- > 0) {
if (LDT_empty(info) || LDT_zero(info)) {
- desc->a = desc->b = 0;
+ memset(desc, 0, sizeof(*desc));
} else {
fill_ldt(desc, info);
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
index 15515132bf0d..c6636d1f60b9 100644
--- a/arch/x86/kernel/tracepoint.c
+++ b/arch/x86/kernel/tracepoint.c
@@ -4,57 +4,38 @@
* Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com>
*
*/
-#include <asm/hw_irq.h>
-#include <asm/desc.h>
+#include <linux/jump_label.h>
#include <linux/atomic.h>
-atomic_t trace_idt_ctr = ATOMIC_INIT(0);
-struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
- (unsigned long) trace_idt_table };
-
-/* No need to be aligned, but done to keep all IDTs defined the same way. */
-gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
+#include <asm/hw_irq.h>
+#include <asm/desc.h>
-static int trace_irq_vector_refcount;
-static DEFINE_MUTEX(irq_vector_mutex);
+DEFINE_STATIC_KEY_FALSE(trace_pagefault_key);
-static void set_trace_idt_ctr(int val)
+int trace_pagefault_reg(void)
{
- atomic_set(&trace_idt_ctr, val);
- /* Ensure the trace_idt_ctr is set before sending IPI */
- wmb();
+ static_branch_inc(&trace_pagefault_key);
+ return 0;
}
-static void switch_idt(void *arg)
+void trace_pagefault_unreg(void)
{
- unsigned long flags;
-
- local_irq_save(flags);
- load_current_idt();
- local_irq_restore(flags);
+ static_branch_dec(&trace_pagefault_key);
}
-int trace_irq_vector_regfunc(void)
+#ifdef CONFIG_SMP
+
+DEFINE_STATIC_KEY_FALSE(trace_resched_ipi_key);
+
+int trace_resched_ipi_reg(void)
{
- mutex_lock(&irq_vector_mutex);
- if (!trace_irq_vector_refcount) {
- set_trace_idt_ctr(1);
- smp_call_function(switch_idt, NULL, 0);
- switch_idt(NULL);
- }
- trace_irq_vector_refcount++;
- mutex_unlock(&irq_vector_mutex);
+ static_branch_inc(&trace_resched_ipi_key);
return 0;
}
-void trace_irq_vector_unregfunc(void)
+void trace_resched_ipi_unreg(void)
{
- mutex_lock(&irq_vector_mutex);
- trace_irq_vector_refcount--;
- if (!trace_irq_vector_refcount) {
- set_trace_idt_ctr(0);
- smp_call_function(switch_idt, NULL, 0);
- switch_idt(NULL);
- }
- mutex_unlock(&irq_vector_mutex);
+ static_branch_dec(&trace_resched_ipi_key);
}
+
+#endif
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index bf54309b85da..34ea3651362e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -38,11 +38,6 @@
#include <linux/smp.h>
#include <linux/io.h>
-#ifdef CONFIG_EISA
-#include <linux/ioport.h>
-#include <linux/eisa.h>
-#endif
-
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif
@@ -70,20 +65,13 @@
#include <asm/x86_init.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
-
-/* No need to be aligned, but done to keep all IDTs defined the same way. */
-gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss;
#else
#include <asm/processor-flags.h>
#include <asm/setup.h>
#include <asm/proto.h>
#endif
-/* Must be page-aligned because the real IDT is used in a fixmap. */
-gate_desc idt_table[NR_VECTORS] __page_aligned_bss;
-
DECLARE_BITMAP(used_vectors, NR_VECTORS);
-EXPORT_SYMBOL_GPL(used_vectors);
static inline void cond_local_irq_enable(struct pt_regs *regs)
{
@@ -935,87 +923,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
}
#endif
-/* Set of traps needed for early debugging. */
-void __init early_trap_init(void)
-{
- /*
- * Don't use IST to set DEBUG_STACK as it doesn't work until TSS
- * is ready in cpu_init() <-- trap_init(). Before trap_init(),
- * CPU runs at ring 0 so it is impossible to hit an invalid
- * stack. Using the original stack works well enough at this
- * early stage. DEBUG_STACK will be equipped after cpu_init() in
- * trap_init().
- *
- * We don't need to set trace_idt_table like set_intr_gate(),
- * since we don't have trace_debug and it will be reset to
- * 'debug' in trap_init() by set_intr_gate_ist().
- */
- set_intr_gate_notrace(X86_TRAP_DB, debug);
- /* int3 can be called from all */
- set_system_intr_gate(X86_TRAP_BP, &int3);
-#ifdef CONFIG_X86_32
- set_intr_gate(X86_TRAP_PF, page_fault);
-#endif
- load_idt(&idt_descr);
-}
-
-void __init early_trap_pf_init(void)
-{
-#ifdef CONFIG_X86_64
- set_intr_gate(X86_TRAP_PF, page_fault);
-#endif
-}
-
void __init trap_init(void)
{
- int i;
-
-#ifdef CONFIG_EISA
- void __iomem *p = early_ioremap(0x0FFFD9, 4);
-
- if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
- EISA_bus = 1;
- early_iounmap(p, 4);
-#endif
-
- set_intr_gate(X86_TRAP_DE, divide_error);
- set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
- /* int4 can be called from all */
- set_system_intr_gate(X86_TRAP_OF, &overflow);
- set_intr_gate(X86_TRAP_BR, bounds);
- set_intr_gate(X86_TRAP_UD, invalid_op);
- set_intr_gate(X86_TRAP_NM, device_not_available);
-#ifdef CONFIG_X86_32
- set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
-#else
- set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
-#endif
- set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
- set_intr_gate(X86_TRAP_TS, invalid_TSS);
- set_intr_gate(X86_TRAP_NP, segment_not_present);
- set_intr_gate(X86_TRAP_SS, stack_segment);
- set_intr_gate(X86_TRAP_GP, general_protection);
- set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
- set_intr_gate(X86_TRAP_MF, coprocessor_error);
- set_intr_gate(X86_TRAP_AC, alignment_check);
-#ifdef CONFIG_X86_MCE
- set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
-#endif
- set_intr_gate(X86_TRAP_XF, simd_coprocessor_error);
-
- /* Reserve all the builtin and the syscall vector: */
- for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
- set_bit(i, used_vectors);
-
-#ifdef CONFIG_IA32_EMULATION
- set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_compat);
- set_bit(IA32_SYSCALL_VECTOR, used_vectors);
-#endif
-
-#ifdef CONFIG_X86_32
- set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
- set_bit(IA32_SYSCALL_VECTOR, used_vectors);
-#endif
+ idt_setup_traps();
/*
* Set the IDT descriptor to a fixed read-only location, so that the
@@ -1030,20 +940,9 @@ void __init trap_init(void)
*/
cpu_init();
- /*
- * X86_TRAP_DB and X86_TRAP_BP have been set
- * in early_trap_init(). However, ITS works only after
- * cpu_init() loads TSS. See comments in early_trap_init().
- */
- set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
- /* int3 can be called from all */
- set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+ idt_setup_ist_traps();
x86_init.irqs.trap_init();
-#ifdef CONFIG_X86_64
- memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
- set_nmi_gate(X86_TRAP_DB, &debug);
- set_nmi_gate(X86_TRAP_BP, &int3);
-#endif
+ idt_setup_debugidt_traps();
}
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index b9389d72b2f7..d145a0b1f529 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -10,20 +10,22 @@
#define FRAME_HEADER_SIZE (sizeof(long) * 2)
-/*
- * This disables KASAN checking when reading a value from another task's stack,
- * since the other task could be running on another CPU and could have poisoned
- * the stack in the meantime.
- */
-#define READ_ONCE_TASK_STACK(task, x) \
-({ \
- unsigned long val; \
- if (task == current) \
- val = READ_ONCE(x); \
- else \
- val = READ_ONCE_NOCHECK(x); \
- val; \
-})
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return 0;
+
+ return __kernel_text_address(state->ip) ? state->ip : 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return NULL;
+
+ return state->regs ? &state->regs->ip : state->bp + 1;
+}
static void unwind_dump(struct unwind_state *state)
{
@@ -66,15 +68,6 @@ static void unwind_dump(struct unwind_state *state)
}
}
-unsigned long unwind_get_return_address(struct unwind_state *state)
-{
- if (unwind_done(state))
- return 0;
-
- return __kernel_text_address(state->ip) ? state->ip : 0;
-}
-EXPORT_SYMBOL_GPL(unwind_get_return_address);
-
static size_t regs_size(struct pt_regs *regs)
{
/* x86_32 regs from kernel mode are two words shorter: */
@@ -91,10 +84,8 @@ static bool in_entry_code(unsigned long ip)
if (addr >= __entry_text_start && addr < __entry_text_end)
return true;
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
if (addr >= __irqentry_text_start && addr < __irqentry_text_end)
return true;
-#endif
return false;
}
diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c
index 039f36738e49..4f0e17b90463 100644
--- a/arch/x86/kernel/unwind_guess.c
+++ b/arch/x86/kernel/unwind_guess.c
@@ -19,6 +19,11 @@ unsigned long unwind_get_return_address(struct unwind_state *state)
}
EXPORT_SYMBOL_GPL(unwind_get_return_address);
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ return NULL;
+}
+
bool unwind_next_frame(struct unwind_state *state)
{
struct stack_info *info = &state->stack_info;
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
new file mode 100644
index 000000000000..570b70d3f604
--- /dev/null
+++ b/arch/x86/kernel/unwind_orc.c
@@ -0,0 +1,582 @@
+#include <linux/module.h>
+#include <linux/sort.h>
+#include <asm/ptrace.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+#include <asm/orc_types.h>
+#include <asm/orc_lookup.h>
+#include <asm/sections.h>
+
+#define orc_warn(fmt, ...) \
+ printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__)
+
+extern int __start_orc_unwind_ip[];
+extern int __stop_orc_unwind_ip[];
+extern struct orc_entry __start_orc_unwind[];
+extern struct orc_entry __stop_orc_unwind[];
+
+static DEFINE_MUTEX(sort_mutex);
+int *cur_orc_ip_table = __start_orc_unwind_ip;
+struct orc_entry *cur_orc_table = __start_orc_unwind;
+
+unsigned int lookup_num_blocks;
+bool orc_init;
+
+static inline unsigned long orc_ip(const int *ip)
+{
+ return (unsigned long)ip + *ip;
+}
+
+static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table,
+ unsigned int num_entries, unsigned long ip)
+{
+ int *first = ip_table;
+ int *last = ip_table + num_entries - 1;
+ int *mid = first, *found = first;
+
+ if (!num_entries)
+ return NULL;
+
+ /*
+ * Do a binary range search to find the rightmost duplicate of a given
+ * starting address. Some entries are section terminators which are
+ * "weak" entries for ensuring there are no gaps. They should be
+ * ignored when they conflict with a real entry.
+ */
+ while (first <= last) {
+ mid = first + ((last - first) / 2);
+
+ if (orc_ip(mid) <= ip) {
+ found = mid;
+ first = mid + 1;
+ } else
+ last = mid - 1;
+ }
+
+ return u_table + (found - ip_table);
+}
+
+#ifdef CONFIG_MODULES
+static struct orc_entry *orc_module_find(unsigned long ip)
+{
+ struct module *mod;
+
+ mod = __module_address(ip);
+ if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip)
+ return NULL;
+ return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind,
+ mod->arch.num_orcs, ip);
+}
+#else
+static struct orc_entry *orc_module_find(unsigned long ip)
+{
+ return NULL;
+}
+#endif
+
+static struct orc_entry *orc_find(unsigned long ip)
+{
+ if (!orc_init)
+ return NULL;
+
+ /* For non-init vmlinux addresses, use the fast lookup table: */
+ if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) {
+ unsigned int idx, start, stop;
+
+ idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE;
+
+ if (unlikely((idx >= lookup_num_blocks-1))) {
+ orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%lx\n",
+ idx, lookup_num_blocks, ip);
+ return NULL;
+ }
+
+ start = orc_lookup[idx];
+ stop = orc_lookup[idx + 1] + 1;
+
+ if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) ||
+ (__start_orc_unwind + stop > __stop_orc_unwind))) {
+ orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%lx\n",
+ idx, lookup_num_blocks, start, stop, ip);
+ return NULL;
+ }
+
+ return __orc_find(__start_orc_unwind_ip + start,
+ __start_orc_unwind + start, stop - start, ip);
+ }
+
+ /* vmlinux .init slow lookup: */
+ if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext)
+ return __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
+ __stop_orc_unwind_ip - __start_orc_unwind_ip, ip);
+
+ /* Module lookup: */
+ return orc_module_find(ip);
+}
+
+static void orc_sort_swap(void *_a, void *_b, int size)
+{
+ struct orc_entry *orc_a, *orc_b;
+ struct orc_entry orc_tmp;
+ int *a = _a, *b = _b, tmp;
+ int delta = _b - _a;
+
+ /* Swap the .orc_unwind_ip entries: */
+ tmp = *a;
+ *a = *b + delta;
+ *b = tmp - delta;
+
+ /* Swap the corresponding .orc_unwind entries: */
+ orc_a = cur_orc_table + (a - cur_orc_ip_table);
+ orc_b = cur_orc_table + (b - cur_orc_ip_table);
+ orc_tmp = *orc_a;
+ *orc_a = *orc_b;
+ *orc_b = orc_tmp;
+}
+
+static int orc_sort_cmp(const void *_a, const void *_b)
+{
+ struct orc_entry *orc_a;
+ const int *a = _a, *b = _b;
+ unsigned long a_val = orc_ip(a);
+ unsigned long b_val = orc_ip(b);
+
+ if (a_val > b_val)
+ return 1;
+ if (a_val < b_val)
+ return -1;
+
+ /*
+ * The "weak" section terminator entries need to always be on the left
+ * to ensure the lookup code skips them in favor of real entries.
+ * These terminator entries exist to handle any gaps created by
+ * whitelisted .o files which didn't get objtool generation.
+ */
+ orc_a = cur_orc_table + (a - cur_orc_ip_table);
+ return orc_a->sp_reg == ORC_REG_UNDEFINED ? -1 : 1;
+}
+
+#ifdef CONFIG_MODULES
+void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size,
+ void *_orc, size_t orc_size)
+{
+ int *orc_ip = _orc_ip;
+ struct orc_entry *orc = _orc;
+ unsigned int num_entries = orc_ip_size / sizeof(int);
+
+ WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 ||
+ orc_size % sizeof(*orc) != 0 ||
+ num_entries != orc_size / sizeof(*orc));
+
+ /*
+ * The 'cur_orc_*' globals allow the orc_sort_swap() callback to
+ * associate an .orc_unwind_ip table entry with its corresponding
+ * .orc_unwind entry so they can both be swapped.
+ */
+ mutex_lock(&sort_mutex);
+ cur_orc_ip_table = orc_ip;
+ cur_orc_table = orc;
+ sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap);
+ mutex_unlock(&sort_mutex);
+
+ mod->arch.orc_unwind_ip = orc_ip;
+ mod->arch.orc_unwind = orc;
+ mod->arch.num_orcs = num_entries;
+}
+#endif
+
+void __init unwind_init(void)
+{
+ size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip;
+ size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind;
+ size_t num_entries = orc_ip_size / sizeof(int);
+ struct orc_entry *orc;
+ int i;
+
+ if (!num_entries || orc_ip_size % sizeof(int) != 0 ||
+ orc_size % sizeof(struct orc_entry) != 0 ||
+ num_entries != orc_size / sizeof(struct orc_entry)) {
+ orc_warn("WARNING: Bad or missing .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+
+ /* Sort the .orc_unwind and .orc_unwind_ip tables: */
+ sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp,
+ orc_sort_swap);
+
+ /* Initialize the fast lookup table: */
+ lookup_num_blocks = orc_lookup_end - orc_lookup;
+ for (i = 0; i < lookup_num_blocks-1; i++) {
+ orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
+ num_entries,
+ LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i));
+ if (!orc) {
+ orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+
+ orc_lookup[i] = orc - __start_orc_unwind;
+ }
+
+ /* Initialize the ending block: */
+ orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries,
+ LOOKUP_STOP_IP);
+ if (!orc) {
+ orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+ orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind;
+
+ orc_init = true;
+}
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return 0;
+
+ return __kernel_text_address(state->ip) ? state->ip : 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return NULL;
+
+ if (state->regs)
+ return &state->regs->ip;
+
+ if (state->sp)
+ return (unsigned long *)state->sp - 1;
+
+ return NULL;
+}
+
+static bool stack_access_ok(struct unwind_state *state, unsigned long addr,
+ size_t len)
+{
+ struct stack_info *info = &state->stack_info;
+
+ /*
+ * If the address isn't on the current stack, switch to the next one.
+ *
+ * We may have to traverse multiple stacks to deal with the possibility
+ * that info->next_sp could point to an empty stack and the address
+ * could be on a subsequent stack.
+ */
+ while (!on_stack(info, (void *)addr, len))
+ if (get_stack_info(info->next_sp, state->task, info,
+ &state->stack_mask))
+ return false;
+
+ return true;
+}
+
+static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
+ unsigned long *val)
+{
+ if (!stack_access_ok(state, addr, sizeof(long)))
+ return false;
+
+ *val = READ_ONCE_TASK_STACK(state->task, *(unsigned long *)addr);
+ return true;
+}
+
+#define REGS_SIZE (sizeof(struct pt_regs))
+#define SP_OFFSET (offsetof(struct pt_regs, sp))
+#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
+#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))
+
+static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
+ unsigned long *ip, unsigned long *sp, bool full)
+{
+ size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE;
+ size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
+ struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ if (!stack_access_ok(state, addr, regs_size))
+ return false;
+
+ *ip = regs->ip;
+ *sp = regs->sp;
+
+ return true;
+ }
+
+ if (!stack_access_ok(state, addr, sp_offset))
+ return false;
+
+ *ip = regs->ip;
+
+ if (user_mode(regs)) {
+ if (!stack_access_ok(state, addr + sp_offset,
+ REGS_SIZE - SP_OFFSET))
+ return false;
+
+ *sp = regs->sp;
+ } else
+ *sp = (unsigned long)&regs->sp;
+
+ return true;
+}
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+ unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
+ enum stack_type prev_type = state->stack_info.type;
+ struct orc_entry *orc;
+ struct pt_regs *ptregs;
+ bool indirect = false;
+
+ if (unwind_done(state))
+ return false;
+
+ /* Don't let modules unload while we're reading their ORC data. */
+ preempt_disable();
+
+ /* Have we reached the end? */
+ if (state->regs && user_mode(state->regs))
+ goto done;
+
+ /*
+ * Find the orc_entry associated with the text address.
+ *
+ * Decrement call return addresses by one so they work for sibling
+ * calls and calls to noreturn functions.
+ */
+ orc = orc_find(state->signal ? state->ip : state->ip - 1);
+ if (!orc || orc->sp_reg == ORC_REG_UNDEFINED)
+ goto done;
+ orig_ip = state->ip;
+
+ /* Find the previous frame's stack: */
+ switch (orc->sp_reg) {
+ case ORC_REG_SP:
+ sp = state->sp + orc->sp_offset;
+ break;
+
+ case ORC_REG_BP:
+ sp = state->bp + orc->sp_offset;
+ break;
+
+ case ORC_REG_SP_INDIRECT:
+ sp = state->sp + orc->sp_offset;
+ indirect = true;
+ break;
+
+ case ORC_REG_BP_INDIRECT:
+ sp = state->bp + orc->sp_offset;
+ indirect = true;
+ break;
+
+ case ORC_REG_R10:
+ if (!state->regs || !state->full_regs) {
+ orc_warn("missing regs for base reg R10 at ip %p\n",
+ (void *)state->ip);
+ goto done;
+ }
+ sp = state->regs->r10;
+ break;
+
+ case ORC_REG_R13:
+ if (!state->regs || !state->full_regs) {
+ orc_warn("missing regs for base reg R13 at ip %p\n",
+ (void *)state->ip);
+ goto done;
+ }
+ sp = state->regs->r13;
+ break;
+
+ case ORC_REG_DI:
+ if (!state->regs || !state->full_regs) {
+ orc_warn("missing regs for base reg DI at ip %p\n",
+ (void *)state->ip);
+ goto done;
+ }
+ sp = state->regs->di;
+ break;
+
+ case ORC_REG_DX:
+ if (!state->regs || !state->full_regs) {
+ orc_warn("missing regs for base reg DX at ip %p\n",
+ (void *)state->ip);
+ goto done;
+ }
+ sp = state->regs->dx;
+ break;
+
+ default:
+ orc_warn("unknown SP base reg %d for ip %p\n",
+ orc->sp_reg, (void *)state->ip);
+ goto done;
+ }
+
+ if (indirect) {
+ if (!deref_stack_reg(state, sp, &sp))
+ goto done;
+ }
+
+ /* Find IP, SP and possibly regs: */
+ switch (orc->type) {
+ case ORC_TYPE_CALL:
+ ip_p = sp - sizeof(long);
+
+ if (!deref_stack_reg(state, ip_p, &state->ip))
+ goto done;
+
+ state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
+ state->ip, (void *)ip_p);
+
+ state->sp = sp;
+ state->regs = NULL;
+ state->signal = false;
+ break;
+
+ case ORC_TYPE_REGS:
+ if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) {
+ orc_warn("can't dereference registers at %p for ip %p\n",
+ (void *)sp, (void *)orig_ip);
+ goto done;
+ }
+
+ state->regs = (struct pt_regs *)sp;
+ state->full_regs = true;
+ state->signal = true;
+ break;
+
+ case ORC_TYPE_REGS_IRET:
+ if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) {
+ orc_warn("can't dereference iret registers at %p for ip %p\n",
+ (void *)sp, (void *)orig_ip);
+ goto done;
+ }
+
+ ptregs = container_of((void *)sp, struct pt_regs, ip);
+ if ((unsigned long)ptregs >= prev_sp &&
+ on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
+ state->regs = ptregs;
+ state->full_regs = false;
+ } else
+ state->regs = NULL;
+
+ state->signal = true;
+ break;
+
+ default:
+ orc_warn("unknown .orc_unwind entry type %d\n", orc->type);
+ break;
+ }
+
+ /* Find BP: */
+ switch (orc->bp_reg) {
+ case ORC_REG_UNDEFINED:
+ if (state->regs && state->full_regs)
+ state->bp = state->regs->bp;
+ break;
+
+ case ORC_REG_PREV_SP:
+ if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp))
+ goto done;
+ break;
+
+ case ORC_REG_BP:
+ if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp))
+ goto done;
+ break;
+
+ default:
+ orc_warn("unknown BP base reg %d for ip %p\n",
+ orc->bp_reg, (void *)orig_ip);
+ goto done;
+ }
+
+ /* Prevent a recursive loop due to bad ORC data: */
+ if (state->stack_info.type == prev_type &&
+ on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) &&
+ state->sp <= prev_sp) {
+ orc_warn("stack going in the wrong direction? ip=%p\n",
+ (void *)orig_ip);
+ goto done;
+ }
+
+ preempt_enable();
+ return true;
+
+done:
+ preempt_enable();
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ memset(state, 0, sizeof(*state));
+ state->task = task;
+
+ /*
+ * Refuse to unwind the stack of a task while it's executing on another
+ * CPU. This check is racy, but that's ok: the unwinder has other
+ * checks to prevent it from going off the rails.
+ */
+ if (task_on_another_cpu(task))
+ goto done;
+
+ if (regs) {
+ if (user_mode(regs))
+ goto done;
+
+ state->ip = regs->ip;
+ state->sp = kernel_stack_pointer(regs);
+ state->bp = regs->bp;
+ state->regs = regs;
+ state->full_regs = true;
+ state->signal = true;
+
+ } else if (task == current) {
+ asm volatile("lea (%%rip), %0\n\t"
+ "mov %%rsp, %1\n\t"
+ "mov %%rbp, %2\n\t"
+ : "=r" (state->ip), "=r" (state->sp),
+ "=r" (state->bp));
+
+ } else {
+ struct inactive_task_frame *frame = (void *)task->thread.sp;
+
+ state->sp = task->thread.sp;
+ state->bp = READ_ONCE_NOCHECK(frame->bp);
+ state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
+ }
+
+ if (get_stack_info((unsigned long *)state->sp, state->task,
+ &state->stack_info, &state->stack_mask))
+ return;
+
+ /*
+ * The caller can provide the address of the first frame directly
+ * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+ * to start unwinding at. Skip ahead until we reach it.
+ */
+
+ /* When starting from regs, skip the regs frame: */
+ if (regs) {
+ unwind_next_frame(state);
+ return;
+ }
+
+ /* Otherwise, skip ahead to the user-specified starting frame: */
+ while (!unwind_done(state) &&
+ (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+ state->sp <= (unsigned long)first_frame))
+ unwind_next_frame(state);
+
+ return;
+
+done:
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return;
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index c8a3b61be0aa..f05f00acac89 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -24,6 +24,7 @@
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/page_types.h>
+#include <asm/orc_lookup.h>
#include <asm/cache.h>
#include <asm/boot.h>
@@ -148,6 +149,8 @@ SECTIONS
BUG_TABLE
+ ORC_UNWIND_TABLE
+
. = ALIGN(PAGE_SIZE);
__vvar_page = .;