aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/Kconfig3
-rw-r--r--arch/arc/Kconfig1
-rw-r--r--arch/arc/include/asm/arcregs.h2
-rw-r--r--arch/arc/include/asm/cacheflush.h6
-rw-r--r--arch/arc/include/asm/irqflags-arcv2.h6
-rw-r--r--arch/arc/kernel/entry-arcv2.S24
-rw-r--r--arch/arc/kernel/entry-compact.S2
-rw-r--r--arch/arc/kernel/intc-arcv2.c10
-rw-r--r--arch/arc/mm/cache.c28
-rw-r--r--arch/arm/boot/dts/hisi-x5hd2.dtsi4
-rw-r--r--arch/arm64/include/asm/acpi.h2
-rw-r--r--arch/arm64/include/asm/memory.h5
-rw-r--r--arch/arm64/kernel/acpi.c7
-rw-r--r--arch/arm64/kernel/setup.c8
-rw-r--r--arch/microblaze/include/asm/unistd.h2
-rw-r--r--arch/microblaze/include/uapi/asm/unistd.h6
-rw-r--r--arch/microblaze/kernel/cpu/cpuinfo.c6
-rw-r--r--arch/microblaze/kernel/syscall_table.S6
-rw-r--r--arch/microblaze/kernel/timer.c2
-rw-r--r--arch/parisc/Kconfig1
-rw-r--r--arch/parisc/include/asm/elf.h7
-rw-r--r--arch/parisc/include/asm/pdcpat.h2
-rw-r--r--arch/parisc/include/asm/processor.h4
-rw-r--r--arch/parisc/kernel/entry.S12
-rw-r--r--arch/parisc/kernel/firmware.c2
-rw-r--r--arch/parisc/kernel/inventory.c8
-rw-r--r--arch/parisc/kernel/perf.c5
-rw-r--r--arch/parisc/kernel/process.c6
-rw-r--r--arch/parisc/kernel/processor.c29
-rw-r--r--arch/parisc/kernel/sys_parisc.c18
-rw-r--r--arch/parisc/kernel/time.c112
-rw-r--r--arch/powerpc/Kconfig1
-rw-r--r--arch/powerpc/include/asm/ima.h29
-rw-r--r--arch/powerpc/include/asm/kexec.h15
-rw-r--r--arch/powerpc/kernel/Makefile4
-rw-r--r--arch/powerpc/kernel/ima_kexec.c223
-rw-r--r--arch/powerpc/kernel/kexec_elf_64.c2
-rw-r--r--arch/powerpc/kernel/machine_kexec_file_64.c15
-rw-r--r--arch/powerpc/platforms/85xx/corenet_generic.c3
-rw-r--r--arch/x86/Kconfig13
-rw-r--r--arch/x86/events/intel/core.c30
-rw-r--r--arch/x86/events/intel/cqm.c23
-rw-r--r--arch/x86/events/intel/uncore_snbep.c2
-rw-r--r--arch/x86/include/asm/cpufeatures.h4
-rw-r--r--arch/x86/include/asm/intel_rdt.h224
-rw-r--r--arch/x86/include/asm/intel_rdt_common.h27
-rw-r--r--arch/x86/include/asm/kvm_host.h1
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c20
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.c403
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_rdtgroup.c1115
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_schemata.c245
-rw-r--r--arch/x86/kernel/cpu/scattered.c11
-rw-r--r--arch/x86/kernel/process_32.c4
-rw-r--r--arch/x86/kernel/process_64.c4
-rw-r--r--arch/x86/kvm/cpuid.c9
-rw-r--r--arch/x86/kvm/hyperv.c24
-rw-r--r--arch/x86/kvm/vmx.c13
-rw-r--r--arch/x86/kvm/x86.c18
-rw-r--r--arch/xtensa/Kconfig1
-rw-r--r--arch/xtensa/boot/dts/kc705.dts16
-rw-r--r--arch/xtensa/include/asm/Kbuild1
-rw-r--r--arch/xtensa/kernel/Makefile1
-rw-r--r--arch/xtensa/kernel/pci-dma.c21
-rw-r--r--arch/xtensa/kernel/s32c1i_selftest.c128
-rw-r--r--arch/xtensa/kernel/setup.c137
-rw-r--r--arch/xtensa/mm/init.c2
67 files changed, 2744 insertions, 383 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index 19483aea4bbc..99839c23d453 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -5,6 +5,9 @@
config KEXEC_CORE
bool
+config HAVE_IMA_KEXEC
+ bool
+
config OPROFILE
tristate "OProfile system profiling"
depends on PROFILING
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index ab12723d39a0..c75d29077e4a 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -9,6 +9,7 @@
config ARC
def_bool y
select ARC_TIMERS
+ select ARCH_HAS_SG_CHAIN
select ARCH_SUPPORTS_ATOMIC_RMW if ARC_HAS_LLSC
select BUILDTIME_EXTABLE_SORT
select CLONE_BACKWARDS
diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h
index da41a54ea2d7..f659942744de 100644
--- a/arch/arc/include/asm/arcregs.h
+++ b/arch/arc/include/asm/arcregs.h
@@ -244,7 +244,7 @@ struct cpuinfo_arc_mmu {
};
struct cpuinfo_arc_cache {
- unsigned int sz_k:14, line_len:8, assoc:4, ver:4, alias:1, vipt:1;
+ unsigned int sz_k:14, line_len:8, assoc:4, alias:1, vipt:1, pad:4;
};
struct cpuinfo_arc_bpu {
diff --git a/arch/arc/include/asm/cacheflush.h b/arch/arc/include/asm/cacheflush.h
index a093adbdb017..fc662f49c55a 100644
--- a/arch/arc/include/asm/cacheflush.h
+++ b/arch/arc/include/asm/cacheflush.h
@@ -85,6 +85,10 @@ void flush_anon_page(struct vm_area_struct *vma,
*/
#define PG_dc_clean PG_arch_1
+#define CACHE_COLORS_NUM 4
+#define CACHE_COLORS_MSK (CACHE_COLORS_NUM - 1)
+#define CACHE_COLOR(addr) (((unsigned long)(addr) >> (PAGE_SHIFT)) & CACHE_COLORS_MSK)
+
/*
* Simple wrapper over config option
* Bootup code ensures that hardware matches kernel configuration
@@ -94,8 +98,6 @@ static inline int cache_is_vipt_aliasing(void)
return IS_ENABLED(CONFIG_ARC_CACHE_VIPT_ALIASING);
}
-#define CACHE_COLOR(addr) (((unsigned long)(addr) >> (PAGE_SHIFT)) & 1)
-
/*
* checks if two addresses (after page aligning) index into same cache set
*/
diff --git a/arch/arc/include/asm/irqflags-arcv2.h b/arch/arc/include/asm/irqflags-arcv2.h
index e880dfa3fcd3..a64c447b0337 100644
--- a/arch/arc/include/asm/irqflags-arcv2.h
+++ b/arch/arc/include/asm/irqflags-arcv2.h
@@ -38,10 +38,10 @@
#define AUX_IRQ_ACT_BIT_U 31
/*
- * User space should be interruptable even by lowest prio interrupt
- * Safe even if actual interrupt priorities is fewer or even one
+ * Hardware supports 16 priorities (0 highest, 15 lowest)
+ * Linux by default runs at 1, priority 0 reserved for NMI style interrupts
*/
-#define ARCV2_IRQ_DEF_PRIO 15
+#define ARCV2_IRQ_DEF_PRIO 1
/* seed value for status register */
#define ISA_INIT_STATUS_BITS (STATUS_IE_MASK | STATUS_AD_MASK | \
diff --git a/arch/arc/kernel/entry-arcv2.S b/arch/arc/kernel/entry-arcv2.S
index 7a1c124ff021..0b6388a5f0b8 100644
--- a/arch/arc/kernel/entry-arcv2.S
+++ b/arch/arc/kernel/entry-arcv2.S
@@ -67,12 +67,23 @@ ENTRY(handle_interrupt)
INTERRUPT_PROLOGUE irq
- clri ; To make status32.IE agree with CPU internal state
-
-#ifdef CONFIG_TRACE_IRQFLAGS
- TRACE_ASM_IRQ_DISABLE
-#endif
-
+ # irq control APIs local_irq_save/restore/disable/enable fiddle with
+ # global interrupt enable bits in STATUS32 (.IE for 1 prio, .E[] for 2 prio)
+ # However a taken interrupt doesn't clear these bits. Thus irqs_disabled()
+ # query in hard ISR path would return false (since .IE is set) which would
+ # trips genirq interrupt handling asserts.
+ #
+ # So do a "soft" disable of interrutps here.
+ #
+ # Note this disable is only for consistent book-keeping as further interrupts
+ # will be disabled anyways even w/o this. Hardware tracks active interrupts
+ # seperately in AUX_IRQ_ACTIVE.active and will not take new interrupts
+ # unless this one returns (or higher prio becomes pending in 2-prio scheme)
+
+ IRQ_DISABLE
+
+ ; icause is banked: one per priority level
+ ; so a higher prio interrupt taken here won't clobber prev prio icause
lr r0, [ICAUSE]
mov blink, ret_from_exception
@@ -171,6 +182,7 @@ END(EV_TLBProtV)
; All 2 entry points to here already disable interrupts
.Lrestore_regs:
+restore_regs:
# Interrpts are actually disabled from this point on, but will get
# reenabled after we return from interrupt/exception.
diff --git a/arch/arc/kernel/entry-compact.S b/arch/arc/kernel/entry-compact.S
index 98812c1248df..9211707634dc 100644
--- a/arch/arc/kernel/entry-compact.S
+++ b/arch/arc/kernel/entry-compact.S
@@ -259,7 +259,7 @@ ENTRY(EV_TLBProtV)
EXCEPTION_PROLOGUE
- lr r2, [ecr]
+ mov r2, r9 ; ECR set into r9 already
lr r0, [efa] ; Faulting Data address (not part of pt_regs saved above)
; Exception auto-disables further Intr/exceptions.
diff --git a/arch/arc/kernel/intc-arcv2.c b/arch/arc/kernel/intc-arcv2.c
index 62b59409a5d9..994dca7014db 100644
--- a/arch/arc/kernel/intc-arcv2.c
+++ b/arch/arc/kernel/intc-arcv2.c
@@ -14,8 +14,6 @@
#include <linux/irqchip.h>
#include <asm/irq.h>
-static int irq_prio;
-
/*
* Early Hardware specific Interrupt setup
* -Called very early (start_kernel -> setup_arch -> setup_processor)
@@ -24,7 +22,7 @@ static int irq_prio;
*/
void arc_init_IRQ(void)
{
- unsigned int tmp;
+ unsigned int tmp, irq_prio;
struct irq_build {
#ifdef CONFIG_CPU_BIG_ENDIAN
@@ -67,12 +65,12 @@ void arc_init_IRQ(void)
irq_prio = irq_bcr.prio; /* Encoded as N-1 for N levels */
pr_info("archs-intc\t: %d priority levels (default %d)%s\n",
- irq_prio + 1, irq_prio,
+ irq_prio + 1, ARCV2_IRQ_DEF_PRIO,
irq_bcr.firq ? " FIRQ (not used)":"");
/* setup status32, don't enable intr yet as kernel doesn't want */
tmp = read_aux_reg(0xa);
- tmp |= STATUS_AD_MASK | (irq_prio << 1);
+ tmp |= STATUS_AD_MASK | (ARCV2_IRQ_DEF_PRIO << 1);
tmp &= ~STATUS_IE_MASK;
asm volatile("kflag %0 \n"::"r"(tmp));
}
@@ -93,7 +91,7 @@ void arcv2_irq_enable(struct irq_data *data)
{
/* set default priority */
write_aux_reg(AUX_IRQ_SELECT, data->irq);
- write_aux_reg(AUX_IRQ_PRIORITY, irq_prio);
+ write_aux_reg(AUX_IRQ_PRIORITY, ARCV2_IRQ_DEF_PRIO);
/*
* hw auto enables (linux unmask) all by default
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index 50d71695cd4e..ec86ac0e3321 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -40,7 +40,7 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len)
struct cpuinfo_arc_cache *p;
#define PR_CACHE(p, cfg, str) \
- if (!(p)->ver) \
+ if (!(p)->line_len) \
n += scnprintf(buf + n, len - n, str"\t\t: N/A\n"); \
else \
n += scnprintf(buf + n, len - n, \
@@ -54,7 +54,7 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len)
PR_CACHE(&cpuinfo_arc700[c].dcache, CONFIG_ARC_HAS_DCACHE, "D-Cache");
p = &cpuinfo_arc700[c].slc;
- if (p->ver)
+ if (p->line_len)
n += scnprintf(buf + n, len - n,
"SLC\t\t: %uK, %uB Line%s\n",
p->sz_k, p->line_len, IS_USED_RUN(slc_enable));
@@ -104,7 +104,6 @@ static void read_decode_cache_bcr_arcv2(int cpu)
READ_BCR(ARC_REG_SLC_BCR, sbcr);
if (sbcr.ver) {
READ_BCR(ARC_REG_SLC_CFG, slc_cfg);
- p_slc->ver = sbcr.ver;
p_slc->sz_k = 128 << slc_cfg.sz;
l2_line_sz = p_slc->line_len = (slc_cfg.lsz == 0) ? 128 : 64;
}
@@ -152,7 +151,6 @@ void read_decode_cache_bcr(void)
p_ic->line_len = 8 << ibcr.line_len;
p_ic->sz_k = 1 << (ibcr.sz - 1);
- p_ic->ver = ibcr.ver;
p_ic->vipt = 1;
p_ic->alias = p_ic->sz_k/p_ic->assoc/TO_KB(PAGE_SIZE) > 1;
@@ -176,7 +174,6 @@ dc_chk:
p_dc->line_len = 16 << dbcr.line_len;
p_dc->sz_k = 1 << (dbcr.sz - 1);
- p_dc->ver = dbcr.ver;
slc_chk:
if (is_isa_arcv2())
@@ -945,17 +942,13 @@ void arc_cache_init(void)
if (IS_ENABLED(CONFIG_ARC_HAS_ICACHE)) {
struct cpuinfo_arc_cache *ic = &cpuinfo_arc700[cpu].icache;
- if (!ic->ver)
+ if (!ic->line_len)
panic("cache support enabled but non-existent cache\n");
if (ic->line_len != L1_CACHE_BYTES)
panic("ICache line [%d] != kernel Config [%d]",
ic->line_len, L1_CACHE_BYTES);
- if (ic->ver != CONFIG_ARC_MMU_VER)
- panic("Cache ver [%d] doesn't match MMU ver [%d]\n",
- ic->ver, CONFIG_ARC_MMU_VER);
-
/*
* In MMU v4 (HS38x) the aliasing icache config uses IVIL/PTAG
* pair to provide vaddr/paddr respectively, just as in MMU v3
@@ -969,7 +962,7 @@ void arc_cache_init(void)
if (IS_ENABLED(CONFIG_ARC_HAS_DCACHE)) {
struct cpuinfo_arc_cache *dc = &cpuinfo_arc700[cpu].dcache;
- if (!dc->ver)
+ if (!dc->line_len)
panic("cache support enabled but non-existent cache\n");
if (dc->line_len != L1_CACHE_BYTES)
@@ -979,11 +972,16 @@ void arc_cache_init(void)
/* check for D-Cache aliasing on ARCompact: ARCv2 has PIPT */
if (is_isa_arcompact()) {
int handled = IS_ENABLED(CONFIG_ARC_CACHE_VIPT_ALIASING);
-
- if (dc->alias && !handled)
- panic("Enable CONFIG_ARC_CACHE_VIPT_ALIASING\n");
- else if (!dc->alias && handled)
+ int num_colors = dc->sz_k/dc->assoc/TO_KB(PAGE_SIZE);
+
+ if (dc->alias) {
+ if (!handled)
+ panic("Enable CONFIG_ARC_CACHE_VIPT_ALIASING\n");
+ if (CACHE_COLORS_NUM != num_colors)
+ panic("CACHE_COLORS_NUM not optimized for config\n");
+ } else if (!dc->alias && handled) {
panic("Disable CONFIG_ARC_CACHE_VIPT_ALIASING\n");
+ }
}
}
diff --git a/arch/arm/boot/dts/hisi-x5hd2.dtsi b/arch/arm/boot/dts/hisi-x5hd2.dtsi
index c02e092fad8b..6c712a97e1fe 100644
--- a/arch/arm/boot/dts/hisi-x5hd2.dtsi
+++ b/arch/arm/boot/dts/hisi-x5hd2.dtsi
@@ -438,7 +438,7 @@
};
gmac0: ethernet@1840000 {
- compatible = "hisilicon,hix5hd2-gemac", "hisilicon,hisi-gemac-v1";
+ compatible = "hisilicon,hix5hd2-gmac", "hisilicon,hisi-gmac-v1";
reg = <0x1840000 0x1000>,<0x184300c 0x4>;
interrupts = <0 71 4>;
clocks = <&clock HIX5HD2_MAC0_CLK>;
@@ -447,7 +447,7 @@
};
gmac1: ethernet@1841000 {
- compatible = "hisilicon,hix5hd2-gemac", "hisilicon,hisi-gemac-v1";
+ compatible = "hisilicon,hix5hd2-gmac", "hisilicon,hisi-gmac-v1";
reg = <0x1841000 0x1000>,<0x1843010 0x4>;
interrupts = <0 72 4>;
clocks = <&clock HIX5HD2_MAC1_CLK>;
diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h
index d0de0e032bc2..c1976c0adca7 100644
--- a/arch/arm64/include/asm/acpi.h
+++ b/arch/arm64/include/asm/acpi.h
@@ -29,7 +29,7 @@
/* Basic configuration for ACPI */
#ifdef CONFIG_ACPI
-/* ACPI table mapping after acpi_gbl_permanent_mmap is set */
+/* ACPI table mapping after acpi_permanent_mmap is set */
static inline void __iomem *acpi_os_ioremap(acpi_physical_address phys,
acpi_size size)
{
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index b71086d25195..bfe632808d77 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -165,6 +165,11 @@ extern u64 kimage_vaddr;
/* the offset between the kernel virtual and physical mappings */
extern u64 kimage_voffset;
+static inline unsigned long kaslr_offset(void)
+{
+ return kimage_vaddr - KIMAGE_VADDR;
+}
+
/*
* Allow all memory at the discovery stage. We will clip it later.
*/
diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index 252a6d9c1da5..64d9cbd61678 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -132,14 +132,13 @@ static int __init acpi_fadt_sanity_check(void)
struct acpi_table_header *table;
struct acpi_table_fadt *fadt;
acpi_status status;
- acpi_size tbl_size;
int ret = 0;
/*
* FADT is required on arm64; retrieve it to check its presence
* and carry out revision and ACPI HW reduced compliancy tests
*/
- status = acpi_get_table_with_size(ACPI_SIG_FADT, 0, &table, &tbl_size);
+ status = acpi_get_table(ACPI_SIG_FADT, 0, &table);
if (ACPI_FAILURE(status)) {
const char *msg = acpi_format_exception(status);
@@ -170,10 +169,10 @@ static int __init acpi_fadt_sanity_check(void)
out:
/*
- * acpi_get_table_with_size() creates FADT table mapping that
+ * acpi_get_table() creates FADT table mapping that
* should be released after parsing and before resuming boot
*/
- early_acpi_os_unmap_memory(table, tbl_size);
+ acpi_put_table(table);
return ret;
}
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index a53f52ac81c6..b051367e2149 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -338,11 +338,11 @@ subsys_initcall(topology_init);
static int dump_kernel_offset(struct notifier_block *self, unsigned long v,
void *p)
{
- u64 const kaslr_offset = kimage_vaddr - KIMAGE_VADDR;
+ const unsigned long offset = kaslr_offset();
- if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_offset > 0) {
- pr_emerg("Kernel Offset: 0x%llx from 0x%lx\n",
- kaslr_offset, KIMAGE_VADDR);
+ if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && offset > 0) {
+ pr_emerg("Kernel Offset: 0x%lx from 0x%lx\n",
+ offset, KIMAGE_VADDR);
} else {
pr_emerg("Kernel Offset: disabled\n");
}
diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h
index 805ae5d712e8..032fed71223f 100644
--- a/arch/microblaze/include/asm/unistd.h
+++ b/arch/microblaze/include/asm/unistd.h
@@ -38,6 +38,6 @@
#endif /* __ASSEMBLY__ */
-#define __NR_syscalls 392
+#define __NR_syscalls 398
#endif /* _ASM_MICROBLAZE_UNISTD_H */
diff --git a/arch/microblaze/include/uapi/asm/unistd.h b/arch/microblaze/include/uapi/asm/unistd.h
index a8bd3fa28bc7..d8086159d996 100644
--- a/arch/microblaze/include/uapi/asm/unistd.h
+++ b/arch/microblaze/include/uapi/asm/unistd.h
@@ -407,5 +407,11 @@
#define __NR_userfaultfd 389
#define __NR_membarrier 390
#define __NR_mlock2 391
+#define __NR_copy_file_range 392
+#define __NR_preadv2 393
+#define __NR_pwritev2 394
+#define __NR_pkey_mprotect 395
+#define __NR_pkey_alloc 396
+#define __NR_pkey_free 397
#endif /* _UAPI_ASM_MICROBLAZE_UNISTD_H */
diff --git a/arch/microblaze/kernel/cpu/cpuinfo.c b/arch/microblaze/kernel/cpu/cpuinfo.c
index b70bb538f001..96b3f26d16be 100644
--- a/arch/microblaze/kernel/cpu/cpuinfo.c
+++ b/arch/microblaze/kernel/cpu/cpuinfo.c
@@ -49,6 +49,8 @@ const struct cpu_ver_key cpu_ver_lookup[] = {
{"9.3", 0x20},
{"9.4", 0x21},
{"9.5", 0x22},
+ {"9.6", 0x23},
+ {"10.0", 0x24},
{NULL, 0},
};
@@ -75,6 +77,10 @@ const struct family_string_key family_string_lookup[] = {
{"zynq7000", 0x12},
{"UltraScale Virtex", 0x13},
{"UltraScale Kintex", 0x14},
+ {"UltraScale+ Zynq", 0x15},
+ {"UltraScale+ Virtex", 0x16},
+ {"UltraScale+ Kintex", 0x17},
+ {"Spartan7", 0x18},
{NULL, 0},
};
diff --git a/arch/microblaze/kernel/syscall_table.S b/arch/microblaze/kernel/syscall_table.S
index 6b3dd99126d7..6841c2df14d9 100644
--- a/arch/microblaze/kernel/syscall_table.S
+++ b/arch/microblaze/kernel/syscall_table.S
@@ -392,3 +392,9 @@ ENTRY(sys_call_table)
.long sys_userfaultfd
.long sys_membarrier /* 390 */
.long sys_mlock2
+ .long sys_copy_file_range
+ .long sys_preadv2
+ .long sys_pwritev2
+ .long sys_pkey_mprotect /* 395 */
+ .long sys_pkey_alloc
+ .long sys_pkey_free
diff --git a/arch/microblaze/kernel/timer.c b/arch/microblaze/kernel/timer.c
index 5bbf38b916ef..9e954959f605 100644
--- a/arch/microblaze/kernel/timer.c
+++ b/arch/microblaze/kernel/timer.c
@@ -259,7 +259,7 @@ static int __init xilinx_timer_init(struct device_node *timer)
int ret;
if (initialized)
- return;
+ return -EINVAL;
initialized = 1;
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index a14b86587013..3a71f38cdc05 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -7,6 +7,7 @@ config PARISC
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_SYSCALL_TRACEPOINTS
select ARCH_WANT_FRAME_POINTERS
+ select ARCH_HAS_ELF_RANDOMIZE
select RTC_CLASS
select RTC_DRV_GENERIC
select INIT_ALL_POSSIBLE
diff --git a/arch/parisc/include/asm/elf.h b/arch/parisc/include/asm/elf.h
index 78c9fd32c554..a6b2a421571e 100644
--- a/arch/parisc/include/asm/elf.h
+++ b/arch/parisc/include/asm/elf.h
@@ -348,9 +348,10 @@ struct pt_regs; /* forward declaration... */
#define ELF_HWCAP 0
-#define STACK_RND_MASK (is_32bit_task() ? \
- 0x7ff >> (PAGE_SHIFT - 12) : \
- 0x3ffff >> (PAGE_SHIFT - 12))
+/* Masks for stack and mmap randomization */
+#define BRK_RND_MASK (is_32bit_task() ? 0x07ffUL : 0x3ffffUL)
+#define MMAP_RND_MASK (is_32bit_task() ? 0x1fffUL : 0x3ffffUL)
+#define STACK_RND_MASK MMAP_RND_MASK
struct mm_struct;
extern unsigned long arch_randomize_brk(struct mm_struct *);
diff --git a/arch/parisc/include/asm/pdcpat.h b/arch/parisc/include/asm/pdcpat.h
index 47539f117958..e1d289092705 100644
--- a/arch/parisc/include/asm/pdcpat.h
+++ b/arch/parisc/include/asm/pdcpat.h
@@ -289,7 +289,7 @@ extern int pdc_pat_cell_get_number(struct pdc_pat_cell_num *cell_info);
extern int pdc_pat_cell_module(unsigned long *actcnt, unsigned long ploc, unsigned long mod, unsigned long view_type, void *mem_addr);
extern int pdc_pat_cell_num_to_loc(void *, unsigned long);
-extern int pdc_pat_cpu_get_number(struct pdc_pat_cpu_num *cpu_info, void *hpa);
+extern int pdc_pat_cpu_get_number(struct pdc_pat_cpu_num *cpu_info, unsigned long hpa);
extern int pdc_pat_pd_get_addr_map(unsigned long *actual_len, void *mem_addr, unsigned long count, unsigned long offset);
diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h
index ca40741378be..a3661ee6b060 100644
--- a/arch/parisc/include/asm/processor.h
+++ b/arch/parisc/include/asm/processor.h
@@ -93,9 +93,7 @@ struct system_cpuinfo_parisc {
/* Per CPU data structure - ie varies per CPU. */
struct cpuinfo_parisc {
unsigned long it_value; /* Interval Timer at last timer Intr */
- unsigned long it_delta; /* Interval delta (tic_10ms / HZ * 100) */
unsigned long irq_count; /* number of IRQ's since boot */
- unsigned long irq_max_cr16; /* longest time to handle a single IRQ */
unsigned long cpuid; /* aka slot_number or set to NO_PROC_ID */
unsigned long hpa; /* Host Physical address */
unsigned long txn_addr; /* MMIO addr of EIR or id_eid */
@@ -103,8 +101,6 @@ struct cpuinfo_parisc {
unsigned long pending_ipi; /* bitmap of type ipi_message_type */
#endif
unsigned long bh_count; /* number of times bh was invoked */
- unsigned long prof_counter; /* per CPU profiling support */
- unsigned long prof_multiplier; /* per CPU profiling support */
unsigned long fp_rev;
unsigned long fp_model;
unsigned int state;
diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
index 4fcff2dcc9c3..ad4cb1613c57 100644
--- a/arch/parisc/kernel/entry.S
+++ b/arch/parisc/kernel/entry.S
@@ -878,6 +878,9 @@ ENTRY_CFI(syscall_exit_rfi)
STREG %r19,PT_SR7(%r16)
intr_return:
+ /* NOTE: Need to enable interrupts incase we schedule. */
+ ssm PSW_SM_I, %r0
+
/* check for reschedule */
mfctl %cr30,%r1
LDREG TI_FLAGS(%r1),%r19 /* sched.h: TIF_NEED_RESCHED */
@@ -904,11 +907,6 @@ intr_check_sig:
LDREG PT_IASQ1(%r16), %r20
cmpib,COND(=),n 0,%r20,intr_restore /* backward */
- /* NOTE: We need to enable interrupts if we have to deliver
- * signals. We used to do this earlier but it caused kernel
- * stack overflows. */
- ssm PSW_SM_I, %r0
-
copy %r0, %r25 /* long in_syscall = 0 */
#ifdef CONFIG_64BIT
ldo -16(%r30),%r29 /* Reference param save area */
@@ -960,10 +958,6 @@ intr_do_resched:
cmpib,COND(=) 0, %r20, intr_do_preempt
nop
- /* NOTE: We need to enable interrupts if we schedule. We used
- * to do this earlier but it caused kernel stack overflows. */
- ssm PSW_SM_I, %r0
-
#ifdef CONFIG_64BIT
ldo -16(%r30),%r29 /* Reference param save area */
#endif
diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c
index e5d71905cad5..9d797ae4fa22 100644
--- a/arch/parisc/kernel/firmware.c
+++ b/arch/parisc/kernel/firmware.c
@@ -1258,7 +1258,7 @@ int pdc_pat_cell_module(unsigned long *actcnt, unsigned long ploc, unsigned long
*
* Retrieve the cpu number for the cpu at the specified HPA.
*/
-int pdc_pat_cpu_get_number(struct pdc_pat_cpu_num *cpu_info, void *hpa)
+int pdc_pat_cpu_get_number(struct pdc_pat_cpu_num *cpu_info, unsigned long hpa)
{
int retval;
unsigned long flags;
diff --git a/arch/parisc/kernel/inventory.c b/arch/parisc/kernel/inventory.c
index c05d1876d27c..c9789d9c73b4 100644
--- a/arch/parisc/kernel/inventory.c
+++ b/arch/parisc/kernel/inventory.c
@@ -216,9 +216,9 @@ pat_query_module(ulong pcell_loc, ulong mod_index)
register_parisc_device(dev); /* advertise device */
#ifdef DEBUG_PAT
- pdc_pat_cell_mod_maddr_block_t io_pdc_cell;
/* dump what we see so far... */
switch (PAT_GET_ENTITY(dev->mod_info)) {
+ pdc_pat_cell_mod_maddr_block_t io_pdc_cell;
unsigned long i;
case PAT_ENTITY_PROC:
@@ -259,9 +259,9 @@ pat_query_module(ulong pcell_loc, ulong mod_index)
pa_pdc_cell->mod[4 + i * 3]); /* finish (ie end) */
printk(KERN_DEBUG
" IO_VIEW %ld: 0x%016lx 0x%016lx 0x%016lx\n",
- i, io_pdc_cell->mod[2 + i * 3], /* type */
- io_pdc_cell->mod[3 + i * 3], /* start */
- io_pdc_cell->mod[4 + i * 3]); /* finish (ie end) */
+ i, io_pdc_cell.mod[2 + i * 3], /* type */
+ io_pdc_cell.mod[3 + i * 3], /* start */
+ io_pdc_cell.mod[4 + i * 3]); /* finish (ie end) */
}
printk(KERN_DEBUG "\n");
break;
diff --git a/arch/parisc/kernel/perf.c b/arch/parisc/kernel/perf.c
index 518f4f5f1f43..6eabce62463b 100644
--- a/arch/parisc/kernel/perf.c
+++ b/arch/parisc/kernel/perf.c
@@ -301,7 +301,6 @@ static ssize_t perf_read(struct file *file, char __user *buf, size_t cnt, loff_t
static ssize_t perf_write(struct file *file, const char __user *buf, size_t count,
loff_t *ppos)
{
- int err;
size_t image_size;
uint32_t image_type;
uint32_t interface_type;
@@ -320,8 +319,8 @@ static ssize_t perf_write(struct file *file, const char __user *buf, size_t coun
if (count != sizeof(uint32_t))
return -EIO;
- if ((err = copy_from_user(&image_type, buf, sizeof(uint32_t))) != 0)
- return err;
+ if (copy_from_user(&image_type, buf, sizeof(uint32_t)))
+ return -EFAULT;
/* Get the interface type and test type */
interface_type = (image_type >> 16) & 0xffff;
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index 40639439d8b3..ea6603ee8d24 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -276,11 +276,7 @@ void *dereference_function_descriptor(void *ptr)
static inline unsigned long brk_rnd(void)
{
- /* 8MB for 32bit, 1GB for 64bit */
- if (is_32bit_task())
- return (get_random_int() & 0x7ffUL) << PAGE_SHIFT;
- else
- return (get_random_int() & 0x3ffffUL) << PAGE_SHIFT;
+ return (get_random_int() & BRK_RND_MASK) << PAGE_SHIFT;
}
unsigned long arch_randomize_brk(struct mm_struct *mm)
diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c
index 0c2a94a0f751..85de47f4eb59 100644
--- a/arch/parisc/kernel/processor.c
+++ b/arch/parisc/kernel/processor.c
@@ -78,11 +78,6 @@ DEFINE_PER_CPU(struct cpuinfo_parisc, cpu_data);
static void
init_percpu_prof(unsigned long cpunum)
{
- struct cpuinfo_parisc *p;
-
- p = &per_cpu(cpu_data, cpunum);
- p->prof_counter = 1;
- p->prof_multiplier = 1;
}
@@ -99,6 +94,7 @@ static int processor_probe(struct parisc_device *dev)
unsigned long txn_addr;
unsigned long cpuid;
struct cpuinfo_parisc *p;
+ struct pdc_pat_cpu_num cpu_info __maybe_unused;
#ifdef CONFIG_SMP
if (num_online_cpus() >= nr_cpu_ids) {
@@ -123,10 +119,6 @@ static int processor_probe(struct parisc_device *dev)
ulong status;
unsigned long bytecnt;
pdc_pat_cell_mod_maddr_block_t *pa_pdc_cell;
-#undef USE_PAT_CPUID
-#ifdef USE_PAT_CPUID
- struct pdc_pat_cpu_num cpu_info;
-#endif
pa_pdc_cell = kmalloc(sizeof (*pa_pdc_cell), GFP_KERNEL);
if (!pa_pdc_cell)
@@ -145,22 +137,27 @@ static int processor_probe(struct parisc_device *dev)
kfree(pa_pdc_cell);
+ /* get the cpu number */
+ status = pdc_pat_cpu_get_number(&cpu_info, dev->hpa.start);
+ BUG_ON(PDC_OK != status);
+
+ pr_info("Logical CPU #%lu is physical cpu #%lu at location "
+ "0x%lx with hpa %pa\n",
+ cpuid, cpu_info.cpu_num, cpu_info.cpu_loc,
+ &dev->hpa.start);
+
+#undef USE_PAT_CPUID
#ifdef USE_PAT_CPUID
/* We need contiguous numbers for cpuid. Firmware's notion
* of cpuid is for physical CPUs and we just don't care yet.
* We'll care when we need to query PAT PDC about a CPU *after*
* boot time (ie shutdown a CPU from an OS perspective).
*/
- /* get the cpu number */
- status = pdc_pat_cpu_get_number(&cpu_info, dev->hpa.start);
-
- BUG_ON(PDC_OK != status);
-
if (cpu_info.cpu_num >= NR_CPUS) {
- printk(KERN_WARNING "IGNORING CPU at 0x%x,"
+ printk(KERN_WARNING "IGNORING CPU at %pa,"
" cpu_slot_id > NR_CPUS"
" (%ld > %d)\n",
- dev->hpa.start, cpu_info.cpu_num, NR_CPUS);
+ &dev->hpa.start, cpu_info.cpu_num, NR_CPUS);
/* Ignore CPU since it will only crash */
boot_cpu_data.cpu_count--;
return 1;
diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c
index 0a393a04e891..a81e177cac7b 100644
--- a/arch/parisc/kernel/sys_parisc.c
+++ b/arch/parisc/kernel/sys_parisc.c
@@ -225,19 +225,17 @@ static unsigned long mmap_rnd(void)
{
unsigned long rnd = 0;
- /*
- * 8 bits of randomness in 32bit mmaps, 20 address space bits
- * 28 bits of randomness in 64bit mmaps, 40 address space bits
- */
- if (current->flags & PF_RANDOMIZE) {
- if (is_32bit_task())
- rnd = get_random_int() % (1<<8);
- else
- rnd = get_random_int() % (1<<28);
- }
+ if (current->flags & PF_RANDOMIZE)
+ rnd = get_random_int() & MMAP_RND_MASK;
+
return rnd << PAGE_SHIFT;
}
+unsigned long arch_mmap_rnd(void)
+{
+ return (get_random_int() & MMAP_RND_MASK) << PAGE_SHIFT;
+}
+
static unsigned long mmap_legacy_base(void)
{
return TASK_UNMAPPED_BASE + mmap_rnd();
diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index 325f30d82b64..4215f5596c8b 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -59,10 +59,9 @@ static unsigned long clocktick __read_mostly; /* timer cycles per tick */
*/
irqreturn_t __irq_entry timer_interrupt(int irq, void *dev_id)
{
- unsigned long now, now2;
+ unsigned long now;
unsigned long next_tick;
- unsigned long cycles_elapsed, ticks_elapsed = 1;
- unsigned long cycles_remainder;
+ unsigned long ticks_elapsed = 0;
unsigned int cpu = smp_processor_id();
struct cpuinfo_parisc *cpuinfo = &per_cpu(cpu_data, cpu);
@@ -71,102 +70,49 @@ irqreturn_t __irq_entry timer_interrupt(int irq, void *dev_id)
profile_tick(CPU_PROFILING);
- /* Initialize next_tick to the expected tick time. */
+ /* Initialize next_tick to the old expected tick time. */
next_tick = cpuinfo->it_value;
- /* Get current cycle counter (Control Register 16). */
- now = mfctl(16);
-
- cycles_elapsed = now - next_tick;
-
- if ((cycles_elapsed >> 6) < cpt) {
- /* use "cheap" math (add/subtract) instead
- * of the more expensive div/mul method
- */
- cycles_remainder = cycles_elapsed;
- while (cycles_remainder > cpt) {
- cycles_remainder -= cpt;
- ticks_elapsed++;
- }
- } else {
- /* TODO: Reduce this to one fdiv op */
- cycles_remainder = cycles_elapsed % cpt;
- ticks_elapsed += cycles_elapsed / cpt;
- }
-
- /* convert from "division remainder" to "remainder of clock tick" */
- cycles_remainder = cpt - cycles_remainder;
-
- /* Determine when (in CR16 cycles) next IT interrupt will fire.
- * We want IT to fire modulo clocktick even if we miss/skip some.
- * But those interrupts don't in fact get delivered that regularly.
- */
- next_tick = now + cycles_remainder;
+ /* Calculate how many ticks have elapsed. */
+ do {
+ ++ticks_elapsed;
+ next_tick += cpt;
+ now = mfctl(16);
+ } while (next_tick - now > cpt);
+ /* Store (in CR16 cycles) up to when we are accounting right now. */
cpuinfo->it_value = next_tick;
- /* Program the IT when to deliver the next interrupt.
- * Only bottom 32-bits of next_tick are writable in CR16!
- */
- mtctl(next_tick, 16);
+ /* Go do system house keeping. */
+ if (cpu == 0)
+ xtime_update(ticks_elapsed);
+
+ update_process_times(user_mode(get_irq_regs()));
- /* Skip one clocktick on purpose if we missed next_tick.
+ /* Skip clockticks on purpose if we know we would miss those.
* The new CR16 must be "later" than current CR16 otherwise
* itimer would not fire until CR16 wrapped - e.g 4 seconds
* later on a 1Ghz processor. We'll account for the missed
- * tick on the next timer interrupt.
+ * ticks on the next timer interrupt.
+ * We want IT to fire modulo clocktick even if we miss/skip some.
+ * But those interrupts don't in fact get delivered that regularly.
*
* "next_tick - now" will always give the difference regardless
* if one or the other wrapped. If "now" is "bigger" we'll end up
* with a very large unsigned number.
*/
- now2 = mfctl(16);
- if (next_tick - now2 > cpt)
- mtctl(next_tick+cpt, 16);
+ while (next_tick - mfctl(16) > cpt)
+ next_tick += cpt;
-#if 1
-/*
- * GGG: DEBUG code for how many cycles programming CR16 used.
- */
- if (unlikely(now2 - now > 0x3000)) /* 12K cycles */
- printk (KERN_CRIT "timer_interrupt(CPU %d): SLOW! 0x%lx cycles!"
- " cyc %lX rem %lX "
- " next/now %lX/%lX\n",
- cpu, now2 - now, cycles_elapsed, cycles_remainder,
- next_tick, now );
-#endif
-
- /* Can we differentiate between "early CR16" (aka Scenario 1) and
- * "long delay" (aka Scenario 3)? I don't think so.
- *
- * Timer_interrupt will be delivered at least a few hundred cycles
- * after the IT fires. But it's arbitrary how much time passes
- * before we call it "late". I've picked one second.
- *
- * It's important NO printk's are between reading CR16 and
- * setting up the next value. May introduce huge variance.
- */
- if (unlikely(ticks_elapsed > HZ)) {
- /* Scenario 3: very long delay? bad in any case */
- printk (KERN_CRIT "timer_interrupt(CPU %d): delayed!"
- " cycles %lX rem %lX "
- " next/now %lX/%lX\n",
- cpu,
- cycles_elapsed, cycles_remainder,
- next_tick, now );
- }
-
- /* Done mucking with unreliable delivery of interrupts.
- * Go do system house keeping.
+ /* Program the IT when to deliver the next interrupt.
+ * Only bottom 32-bits of next_tick are writable in CR16!
+ * Timer interrupt will be delivered at least a few hundred cycles
+ * after the IT fires, so if we are too close (<= 500 cycles) to the
+ * next cycle, simply skip it.
*/
-
- if (!--cpuinfo->prof_counter) {
- cpuinfo->prof_counter = cpuinfo->prof_multiplier;
- update_process_times(user_mode(get_irq_regs()));
- }
-
- if (cpu == 0)
- xtime_update(ticks_elapsed);
+ if (next_tick - mfctl(16) <= 500)
+ next_tick += cpt;
+ mtctl(next_tick, 16);
return IRQ_HANDLED;
}
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 3da87e198878..a8ee573fe610 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -469,6 +469,7 @@ config KEXEC
config KEXEC_FILE
bool "kexec file based system call"
select KEXEC_CORE
+ select HAVE_IMA_KEXEC
select BUILD_BIN2C
depends on PPC64
depends on CRYPTO=y
diff --git a/arch/powerpc/include/asm/ima.h b/arch/powerpc/include/asm/ima.h
new file mode 100644
index 000000000000..2313bdface34
--- /dev/null
+++ b/arch/powerpc/include/asm/ima.h
@@ -0,0 +1,29 @@
+#ifndef _ASM_POWERPC_IMA_H
+#define _ASM_POWERPC_IMA_H
+
+struct kimage;
+
+int ima_get_kexec_buffer(void **addr, size_t *size);
+int ima_free_kexec_buffer(void);
+
+#ifdef CONFIG_IMA
+void remove_ima_buffer(void *fdt, int chosen_node);
+#else
+static inline void remove_ima_buffer(void *fdt, int chosen_node) {}
+#endif
+
+#ifdef CONFIG_IMA_KEXEC
+int arch_ima_add_kexec_buffer(struct kimage *image, unsigned long load_addr,
+ size_t size);
+
+int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node);
+#else
+static inline int setup_ima_buffer(const struct kimage *image, void *fdt,
+ int chosen_node)
+{
+ remove_ima_buffer(fdt, chosen_node);
+ return 0;
+}
+#endif /* CONFIG_IMA_KEXEC */
+
+#endif /* _ASM_POWERPC_IMA_H */
diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index 6c3b71502fbc..25668bc8cb2a 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -94,11 +94,22 @@ static inline bool kdump_in_progress(void)
#ifdef CONFIG_KEXEC_FILE
extern struct kexec_file_ops kexec_elf64_ops;
+#ifdef CONFIG_IMA_KEXEC
+#define ARCH_HAS_KIMAGE_ARCH
+
+struct kimage_arch {
+ phys_addr_t ima_buffer_addr;
+ size_t ima_buffer_size;
+};
+#endif
+
int setup_purgatory(struct kimage *image, const void *slave_code,
const void *fdt, unsigned long kernel_load_addr,
unsigned long fdt_load_addr);
-int setup_new_fdt(void *fdt, unsigned long initrd_load_addr,
- unsigned long initrd_len, const char *cmdline);
+int setup_new_fdt(const struct kimage *image, void *fdt,
+ unsigned long initrd_load_addr, unsigned long initrd_len,
+ const char *cmdline);
+int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size);
#endif /* CONFIG_KEXEC_FILE */
#else /* !CONFIG_KEXEC_CORE */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index a3a6047fd395..23f8082d7bfa 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -112,6 +112,10 @@ obj-$(CONFIG_PCI_MSI) += msi.o
obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o crash.o \
machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file_$(BITS).o kexec_elf_$(BITS).o
+ifeq ($(CONFIG_HAVE_IMA_KEXEC)$(CONFIG_IMA),yy)
+obj-y += ima_kexec.o
+endif
+
obj-$(CONFIG_AUDIT) += audit.o
obj64-$(CONFIG_AUDIT) += compat_audit.o
diff --git a/arch/powerpc/kernel/ima_kexec.c b/arch/powerpc/kernel/ima_kexec.c
new file mode 100644
index 000000000000..5ea42c937ca9
--- /dev/null
+++ b/arch/powerpc/kernel/ima_kexec.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (C) 2016 IBM Corporation
+ *
+ * Authors:
+ * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/of.h>
+#include <linux/memblock.h>
+#include <linux/libfdt.h>
+
+static int get_addr_size_cells(int *addr_cells, int *size_cells)
+{
+ struct device_node *root;
+
+ root = of_find_node_by_path("/");
+ if (!root)
+ return -EINVAL;
+
+ *addr_cells = of_n_addr_cells(root);
+ *size_cells = of_n_size_cells(root);
+
+ of_node_put(root);
+
+ return 0;
+}
+
+static int do_get_kexec_buffer(const void *prop, int len, unsigned long *addr,
+ size_t *size)
+{
+ int ret, addr_cells, size_cells;
+
+ ret = get_addr_size_cells(&addr_cells, &size_cells);
+ if (ret)
+ return ret;
+
+ if (len < 4 * (addr_cells + size_cells))
+ return -ENOENT;
+
+ *addr = of_read_number(prop, addr_cells);
+ *size = of_read_number(prop + 4 * addr_cells, size_cells);
+
+ return 0;
+}
+
+/**
+ * ima_get_kexec_buffer - get IMA buffer from the previous kernel
+ * @addr: On successful return, set to point to the buffer contents.
+ * @size: On successful return, set to the buffer size.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int ima_get_kexec_buffer(void **addr, size_t *size)
+{
+ int ret, len;
+ unsigned long tmp_addr;
+ size_t tmp_size;
+ const void *prop;
+
+ prop = of_get_property(of_chosen, "linux,ima-kexec-buffer", &len);
+ if (!prop)
+ return -ENOENT;
+
+ ret = do_get_kexec_buffer(prop, len, &tmp_addr, &tmp_size);
+ if (ret)
+ return ret;
+
+ *addr = __va(tmp_addr);
+ *size = tmp_size;
+
+ return 0;
+}
+
+/**
+ * ima_free_kexec_buffer - free memory used by the IMA buffer
+ */
+int ima_free_kexec_buffer(void)
+{
+ int ret;
+ unsigned long addr;
+ size_t size;
+ struct property *prop;
+
+ prop = of_find_property(of_chosen, "linux,ima-kexec-buffer", NULL);
+ if (!prop)
+ return -ENOENT;
+
+ ret = do_get_kexec_buffer(prop->value, prop->length, &addr, &size);
+ if (ret)
+ return ret;
+
+ ret = of_remove_property(of_chosen, prop);
+ if (ret)
+ return ret;
+
+ return memblock_free(addr, size);
+
+}
+
+/**
+ * remove_ima_buffer - remove the IMA buffer property and reservation from @fdt
+ *
+ * The IMA measurement buffer is of no use to a subsequent kernel, so we always
+ * remove it from the device tree.
+ */
+void remove_ima_buffer(void *fdt, int chosen_node)
+{
+ int ret, len;
+ unsigned long addr;
+ size_t size;
+ const void *prop;
+
+ prop = fdt_getprop(fdt, chosen_node, "linux,ima-kexec-buffer", &len);
+ if (!prop)
+ return;
+
+ ret = do_get_kexec_buffer(prop, len, &addr, &size);
+ fdt_delprop(fdt, chosen_node, "linux,ima-kexec-buffer");
+ if (ret)
+ return;
+
+ ret = delete_fdt_mem_rsv(fdt, addr, size);
+ if (!ret)
+ pr_debug("Removed old IMA buffer reservation.\n");
+}
+
+#ifdef CONFIG_IMA_KEXEC
+/**
+ * arch_ima_add_kexec_buffer - do arch-specific steps to add the IMA buffer
+ *
+ * Architectures should use this function to pass on the IMA buffer
+ * information to the next kernel.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int arch_ima_add_kexec_buffer(struct kimage *image, unsigned long load_addr,
+ size_t size)
+{
+ image->arch.ima_buffer_addr = load_addr;
+ image->arch.ima_buffer_size = size;
+
+ return 0;
+}
+
+static int write_number(void *p, u64 value, int cells)
+{
+ if (cells == 1) {
+ u32 tmp;
+
+ if (value > U32_MAX)
+ return -EINVAL;
+
+ tmp = cpu_to_be32(value);
+ memcpy(p, &tmp, sizeof(tmp));
+ } else if (cells == 2) {
+ u64 tmp;
+
+ tmp = cpu_to_be64(value);
+ memcpy(p, &tmp, sizeof(tmp));
+ } else
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * setup_ima_buffer - add IMA buffer information to the fdt
+ * @image: kexec image being loaded.
+ * @fdt: Flattened device tree for the next kernel.
+ * @chosen_node: Offset to the chosen node.
+ *
+ * Return: 0 on success, or negative errno on error.
+ */
+int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node)
+{
+ int ret, addr_cells, size_cells, entry_size;
+ u8 value[16];
+
+ remove_ima_buffer(fdt, chosen_node);
+ if (!image->arch.ima_buffer_size)
+ return 0;
+
+ ret = get_addr_size_cells(&addr_cells, &size_cells);
+ if (ret)
+ return ret;
+
+ entry_size = 4 * (addr_cells + size_cells);
+
+ if (entry_size > sizeof(value))
+ return -EINVAL;
+
+ ret = write_number(value, image->arch.ima_buffer_addr, addr_cells);
+ if (ret)
+ return ret;
+
+ ret = write_number(value + 4 * addr_cells, image->arch.ima_buffer_size,
+ size_cells);
+ if (ret)
+ return ret;
+
+ ret = fdt_setprop(fdt, chosen_node, "linux,ima-kexec-buffer", value,
+ entry_size);
+ if (ret < 0)
+ return -EINVAL;
+
+ ret = fdt_add_mem_rsv(fdt, image->arch.ima_buffer_addr,
+ image->arch.ima_buffer_size);
+ if (ret)
+ return -EINVAL;
+
+ pr_debug("IMA buffer at 0x%llx, size = 0x%zx\n",
+ image->arch.ima_buffer_addr, image->arch.ima_buffer_size);
+
+ return 0;
+}
+#endif /* CONFIG_IMA_KEXEC */
diff --git a/arch/powerpc/kernel/kexec_elf_64.c b/arch/powerpc/kernel/kexec_elf_64.c
index 6acffd34a70f..9a42309b091a 100644
--- a/arch/powerpc/kernel/kexec_elf_64.c
+++ b/arch/powerpc/kernel/kexec_elf_64.c
@@ -627,7 +627,7 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
goto out;
}
- ret = setup_new_fdt(fdt, initrd_load_addr, initrd_len, cmdline);
+ ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len, cmdline);
if (ret)
goto out;
diff --git a/arch/powerpc/kernel/machine_kexec_file_64.c b/arch/powerpc/kernel/machine_kexec_file_64.c
index 7abc8a75ee48..992c0d258e5d 100644
--- a/arch/powerpc/kernel/machine_kexec_file_64.c
+++ b/arch/powerpc/kernel/machine_kexec_file_64.c
@@ -27,6 +27,7 @@
#include <linux/memblock.h>
#include <linux/of_fdt.h>
#include <linux/libfdt.h>
+#include <asm/ima.h>
#define SLAVE_CODE_SIZE 256
@@ -180,7 +181,7 @@ int setup_purgatory(struct kimage *image, const void *slave_code,
*
* Return: 0 on success, or negative errno on error.
*/
-static int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size)
+int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size)
{
int i, ret, num_rsvs = fdt_num_mem_rsv(fdt);
@@ -209,6 +210,7 @@ static int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size
/*
* setup_new_fdt - modify /chosen and memory reservation for the next kernel
+ * @image: kexec image being loaded.
* @fdt: Flattened device tree for the next kernel.
* @initrd_load_addr: Address where the next initrd will be loaded.
* @initrd_len: Size of the next initrd, or 0 if there will be none.
@@ -217,8 +219,9 @@ static int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size
*
* Return: 0 on success, or negative errno on error.
*/
-int setup_new_fdt(void *fdt, unsigned long initrd_load_addr,
- unsigned long initrd_len, const char *cmdline)
+int setup_new_fdt(const struct kimage *image, void *fdt,
+ unsigned long initrd_load_addr, unsigned long initrd_len,
+ const char *cmdline)
{
int ret, chosen_node;
const void *prop;
@@ -328,6 +331,12 @@ int setup_new_fdt(void *fdt, unsigned long initrd_load_addr,
}
}
+ ret = setup_ima_buffer(image, fdt, chosen_node);
+ if (ret) {
+ pr_err("Error setting up the new device tree.\n");
+ return ret;
+ }
+
ret = fdt_setprop(fdt, chosen_node, "linux,booted-from-kexec", NULL, 0);
if (ret) {
pr_err("Error setting up the new device tree.\n");
diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c
index 3803b0addf65..6c0ba75fb256 100644
--- a/arch/powerpc/platforms/85xx/corenet_generic.c
+++ b/arch/powerpc/platforms/85xx/corenet_generic.c
@@ -117,9 +117,6 @@ static const struct of_device_id of_device_ids[] = {
{
.compatible = "fsl,qe",
},
- {
- .compatible = "fsl,fman",
- },
/* The following two are for the Freescale hypervisor */
{
.name = "hypervisor",
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 64024c999531..e487493bbd47 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -412,6 +412,19 @@ config GOLDFISH
def_bool y
depends on X86_GOLDFISH
+config INTEL_RDT_A
+ bool "Intel Resource Director Technology Allocation support"
+ default n
+ depends on X86 && CPU_SUP_INTEL
+ select KERNFS
+ help
+ Select to enable resource allocation which is a sub-feature of
+ Intel Resource Director Technology(RDT). More information about
+ RDT can be found in the Intel x86 Architecture Software
+ Developer Manual.
+
+ Say N if unsure.
+
if X86_32
config X86_EXTENDED_PLATFORM
bool "Support for extended (non-PC) x86 platforms"
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index cb8522290e6a..86138267b68a 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2110,6 +2110,27 @@ again:
GLOBAL_STATUS_LBRS_FROZEN);
if (!status)
goto done;
+ /*
+ * In case multiple PEBS events are sampled at the same time,
+ * it is possible to have GLOBAL_STATUS bit 62 set indicating
+ * PEBS buffer overflow and also seeing at most 3 PEBS counters
+ * having their bits set in the status register. This is a sign
+ * that there was at least one PEBS record pending at the time
+ * of the PMU interrupt. PEBS counters must only be processed
+ * via the drain_pebs() calls and not via the regular sample
+ * processing loop coming after that the function, otherwise
+ * phony regular samples may be generated in the sampling buffer
+ * not marked with the EXACT tag. Another possibility is to have
+ * one PEBS event and at least one non-PEBS event whic hoverflows
+ * while PEBS has armed. In this case, bit 62 of GLOBAL_STATUS will
+ * not be set, yet the overflow status bit for the PEBS counter will
+ * be on Skylake.
+ *
+ * To avoid this problem, we systematically ignore the PEBS-enabled
+ * counters from the GLOBAL_STATUS mask and we always process PEBS
+ * events via drain_pebs().
+ */
+ status &= ~cpuc->pebs_enabled;
/*
* PEBS overflow sets bit 62 in the global status register
@@ -2117,15 +2138,6 @@ again:
if (__test_and_clear_bit(62, (unsigned long *)&status)) {
handled++;
x86_pmu.drain_pebs(regs);
- /*
- * There are cases where, even though, the PEBS ovfl bit is set
- * in GLOBAL_OVF_STATUS, the PEBS events may also have their
- * overflow bits set for their counters. We must clear them
- * here because they have been processed as exact samples in
- * the drain_pebs() routine. They must not be processed again
- * in the for_each_bit_set() loop for regular samples below.
- */
- status &= ~cpuc->pebs_enabled;
status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
}
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index 8f82b02934fa..0c45cc8e64ba 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -7,9 +7,9 @@
#include <linux/perf_event.h>
#include <linux/slab.h>
#include <asm/cpu_device_id.h>
+#include <asm/intel_rdt_common.h>
#include "../perf_event.h"
-#define MSR_IA32_PQR_ASSOC 0x0c8f
#define MSR_IA32_QM_CTR 0x0c8e
#define MSR_IA32_QM_EVTSEL 0x0c8d
@@ -24,32 +24,13 @@ static unsigned int cqm_l3_scale; /* supposedly cacheline size */
static bool cqm_enabled, mbm_enabled;
unsigned int mbm_socket_max;
-/**
- * struct intel_pqr_state - State cache for the PQR MSR
- * @rmid: The cached Resource Monitoring ID
- * @closid: The cached Class Of Service ID
- * @rmid_usecnt: The usage counter for rmid
- *
- * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
- * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
- * contains both parts, so we need to cache them.
- *
- * The cache also helps to avoid pointless updates if the value does
- * not change.
- */
-struct intel_pqr_state {
- u32 rmid;
- u32 closid;
- int rmid_usecnt;
-};
-
/*
* The cached intel_pqr_state is strictly per CPU and can never be
* updated from a remote CPU. Both functions which modify the state
* (intel_cqm_event_start and intel_cqm_event_stop) are called with
* interrupts disabled, which is sufficient for the protection.
*/
-static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
+DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
static struct hrtimer *mbm_timers;
/**
* struct sample - mbm event's (local or total) data
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 272427700d48..e6832be714bc 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -669,7 +669,7 @@ static struct event_constraint snbep_uncore_cbox_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
- EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
+ UNCORE_EVENT_CONSTRAINT(0x1f, 0xe),
UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 6ccbf1aaa7ce..eafee3161d1c 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -189,6 +189,9 @@
#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
+#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
+#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
@@ -222,6 +225,7 @@
#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
+#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */
#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
new file mode 100644
index 000000000000..95ce5c85b009
--- /dev/null
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -0,0 +1,224 @@
+#ifndef _ASM_X86_INTEL_RDT_H
+#define _ASM_X86_INTEL_RDT_H
+
+#ifdef CONFIG_INTEL_RDT_A
+
+#include <linux/kernfs.h>
+#include <linux/jump_label.h>
+
+#include <asm/intel_rdt_common.h>
+
+#define IA32_L3_QOS_CFG 0xc81
+#define IA32_L3_CBM_BASE 0xc90
+#define IA32_L2_CBM_BASE 0xd10
+
+#define L3_QOS_CDP_ENABLE 0x01ULL
+
+/**
+ * struct rdtgroup - store rdtgroup's data in resctrl file system.
+ * @kn: kernfs node
+ * @rdtgroup_list: linked list for all rdtgroups
+ * @closid: closid for this rdtgroup
+ * @cpu_mask: CPUs assigned to this rdtgroup
+ * @flags: status bits
+ * @waitcount: how many cpus expect to find this
+ * group when they acquire rdtgroup_mutex
+ */
+struct rdtgroup {
+ struct kernfs_node *kn;
+ struct list_head rdtgroup_list;
+ int closid;
+ struct cpumask cpu_mask;
+ int flags;
+ atomic_t waitcount;
+};
+
+/* rdtgroup.flags */
+#define RDT_DELETED 1
+
+/* List of all resource groups */
+extern struct list_head rdt_all_groups;
+
+int __init rdtgroup_init(void);
+
+/**
+ * struct rftype - describe each file in the resctrl file system
+ * @name: file name
+ * @mode: access mode
+ * @kf_ops: operations
+ * @seq_show: show content of the file
+ * @write: write to the file
+ */
+struct rftype {
+ char *name;
+ umode_t mode;
+ struct kernfs_ops *kf_ops;
+
+ int (*seq_show)(struct kernfs_open_file *of,
+ struct seq_file *sf, void *v);
+ /*
+ * write() is the generic write callback which maps directly to
+ * kernfs write operation and overrides all other operations.
+ * Maximum write size is determined by ->max_write_len.
+ */
+ ssize_t (*write)(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+};
+
+/**
+ * struct rdt_resource - attributes of an RDT resource
+ * @enabled: Is this feature enabled on this machine
+ * @capable: Is this feature available on this machine
+ * @name: Name to use in "schemata" file
+ * @num_closid: Number of CLOSIDs available
+ * @max_cbm: Largest Cache Bit Mask allowed
+ * @min_cbm_bits: Minimum number of consecutive bits to be set
+ * in a cache bit mask
+ * @domains: All domains for this resource
+ * @num_domains: Number of domains active
+ * @msr_base: Base MSR address for CBMs
+ * @tmp_cbms: Scratch space when updating schemata
+ * @num_tmp_cbms: Number of CBMs in tmp_cbms
+ * @cache_level: Which cache level defines scope of this domain
+ * @cbm_idx_multi: Multiplier of CBM index
+ * @cbm_idx_offset: Offset of CBM index. CBM index is computed by:
+ * closid * cbm_idx_multi + cbm_idx_offset
+ */
+struct rdt_resource {
+ bool enabled;
+ bool capable;
+ char *name;
+ int num_closid;
+ int cbm_len;
+ int min_cbm_bits;
+ u32 max_cbm;
+ struct list_head domains;
+ int num_domains;
+ int msr_base;
+ u32 *tmp_cbms;
+ int num_tmp_cbms;
+ int cache_level;
+ int cbm_idx_multi;
+ int cbm_idx_offset;
+};
+
+/**
+ * struct rdt_domain - group of cpus sharing an RDT resource
+ * @list: all instances of this resource
+ * @id: unique id for this instance
+ * @cpu_mask: which cpus share this resource
+ * @cbm: array of cache bit masks (indexed by CLOSID)
+ */
+struct rdt_domain {
+ struct list_head list;
+ int id;
+ struct cpumask cpu_mask;
+ u32 *cbm;
+};
+
+/**
+ * struct msr_param - set a range of MSRs from a domain
+ * @res: The resource to use
+ * @low: Beginning index from base MSR
+ * @high: End index
+ */
+struct msr_param {
+ struct rdt_resource *res;
+ int low;
+ int high;
+};
+
+extern struct mutex rdtgroup_mutex;
+
+extern struct rdt_resource rdt_resources_all[];
+extern struct rdtgroup rdtgroup_default;
+DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+
+int __init rdtgroup_init(void);
+
+enum {
+ RDT_RESOURCE_L3,
+ RDT_RESOURCE_L3DATA,
+ RDT_RESOURCE_L3CODE,
+ RDT_RESOURCE_L2,
+
+ /* Must be the last */
+ RDT_NUM_RESOURCES,
+};
+
+#define for_each_capable_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->capable)
+
+#define for_each_enabled_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->enabled)
+
+/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
+union cpuid_0x10_1_eax {
+ struct {
+ unsigned int cbm_len:5;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID=1).EDX */
+union cpuid_0x10_1_edx {
+ struct {
+ unsigned int cos_max:16;
+ } split;
+ unsigned int full;
+};
+
+DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid);
+
+void rdt_cbm_update(void *arg);
+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
+void rdtgroup_kn_unlock(struct kernfs_node *kn);
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v);
+
+/*
+ * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
+ *
+ * Following considerations are made so that this has minimal impact
+ * on scheduler hot path:
+ * - This will stay as no-op unless we are running on an Intel SKU
+ * which supports resource control and we enable by mounting the
+ * resctrl file system.
+ * - Caches the per cpu CLOSid values and does the MSR write only
+ * when a task with a different CLOSid is scheduled in.
+ *
+ * Must be called with preemption disabled.
+ */
+static inline void intel_rdt_sched_in(void)
+{
+ if (static_branch_likely(&rdt_enable_key)) {
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+ int closid;
+
+ /*
+ * If this task has a closid assigned, use it.
+ * Else use the closid assigned to this cpu.
+ */
+ closid = current->closid;
+ if (closid == 0)
+ closid = this_cpu_read(cpu_closid);
+
+ if (closid != state->closid) {
+ state->closid = closid;
+ wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid);
+ }
+ }
+}
+
+#else
+
+static inline void intel_rdt_sched_in(void) {}
+
+#endif /* CONFIG_INTEL_RDT_A */
+#endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h
new file mode 100644
index 000000000000..b31081b89407
--- /dev/null
+++ b/arch/x86/include/asm/intel_rdt_common.h
@@ -0,0 +1,27 @@
+#ifndef _ASM_X86_INTEL_RDT_COMMON_H
+#define _ASM_X86_INTEL_RDT_COMMON_H
+
+#define MSR_IA32_PQR_ASSOC 0x0c8f
+
+/**
+ * struct intel_pqr_state - State cache for the PQR MSR
+ * @rmid: The cached Resource Monitoring ID
+ * @closid: The cached Class Of Service ID
+ * @rmid_usecnt: The usage counter for rmid
+ *
+ * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
+ * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
+ * contains both parts, so we need to cache them.
+ *
+ * The cache also helps to avoid pointless updates if the value does
+ * not change.
+ */
+struct intel_pqr_state {
+ u32 rmid;
+ u32 closid;
+ int rmid_usecnt;
+};
+
+DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
+
+#endif /* _ASM_X86_INTEL_RDT_COMMON_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7892530cbacf..2e25038dbd93 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -704,6 +704,7 @@ struct kvm_apic_map {
/* Hyper-V emulation context */
struct kvm_hv {
+ struct mutex hv_lock;
u64 hv_guest_os_id;
u64 hv_hypercall;
u64 hv_tsc_page;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 33b63670bf09..52000010c62e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -32,6 +32,8 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
+obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o
+
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
obj-$(CONFIG_MICROCODE) += microcode/
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index be6337156502..0282b0df004a 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -153,6 +153,7 @@ struct _cpuid4_info_regs {
union _cpuid4_leaf_eax eax;
union _cpuid4_leaf_ebx ebx;
union _cpuid4_leaf_ecx ecx;
+ unsigned int id;
unsigned long size;
struct amd_northbridge *nb;
};
@@ -894,6 +895,8 @@ static void __cache_cpumap_setup(unsigned int cpu, int index,
static void ci_leaf_init(struct cacheinfo *this_leaf,
struct _cpuid4_info_regs *base)
{
+ this_leaf->id = base->id;
+ this_leaf->attributes = CACHE_ID;
this_leaf->level = base->eax.split.level;
this_leaf->type = cache_type_map[base->eax.split.type];
this_leaf->coherency_line_size =
@@ -920,6 +923,22 @@ static int __init_cache_level(unsigned int cpu)
return 0;
}
+/*
+ * The max shared threads number comes from CPUID.4:EAX[25-14] with input
+ * ECX as cache index. Then right shift apicid by the number's order to get
+ * cache id for this cache node.
+ */
+static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ unsigned long num_threads_sharing;
+ int index_msb;
+
+ num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing;
+ index_msb = get_count_order(num_threads_sharing);
+ id4_regs->id = c->apicid >> index_msb;
+}
+
static int __populate_cache_leaves(unsigned int cpu)
{
unsigned int idx, ret;
@@ -931,6 +950,7 @@ static int __populate_cache_leaves(unsigned int cpu)
ret = cpuid4_cache_lookup_regs(idx, &id4_regs);
if (ret)
return ret;
+ get_cache_id(cpu, &id4_regs);
ci_leaf_init(this_leaf++, &id4_regs);
__cache_cpumap_setup(cpu, idx, &id4_regs);
}
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
new file mode 100644
index 000000000000..5a533fefefa0
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -0,0 +1,403 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Tony Luck <tony.luck@intel.com>
+ * Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/cacheinfo.h>
+#include <linux/cpuhotplug.h>
+
+#include <asm/intel-family.h>
+#include <asm/intel_rdt.h>
+
+/* Mutex to protect rdtgroup access. */
+DEFINE_MUTEX(rdtgroup_mutex);
+
+DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid);
+
+#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains)
+
+struct rdt_resource rdt_resources_all[] = {
+ {
+ .name = "L3",
+ .domains = domain_init(RDT_RESOURCE_L3),
+ .msr_base = IA32_L3_CBM_BASE,
+ .min_cbm_bits = 1,
+ .cache_level = 3,
+ .cbm_idx_multi = 1,
+ .cbm_idx_offset = 0
+ },
+ {
+ .name = "L3DATA",
+ .domains = domain_init(RDT_RESOURCE_L3DATA),
+ .msr_base = IA32_L3_CBM_BASE,
+ .min_cbm_bits = 1,
+ .cache_level = 3,
+ .cbm_idx_multi = 2,
+ .cbm_idx_offset = 0
+ },
+ {
+ .name = "L3CODE",
+ .domains = domain_init(RDT_RESOURCE_L3CODE),
+ .msr_base = IA32_L3_CBM_BASE,
+ .min_cbm_bits = 1,
+ .cache_level = 3,
+ .cbm_idx_multi = 2,
+ .cbm_idx_offset = 1
+ },
+ {
+ .name = "L2",
+ .domains = domain_init(RDT_RESOURCE_L2),
+ .msr_base = IA32_L2_CBM_BASE,
+ .min_cbm_bits = 1,
+ .cache_level = 2,
+ .cbm_idx_multi = 1,
+ .cbm_idx_offset = 0
+ },
+};
+
+static int cbm_idx(struct rdt_resource *r, int closid)
+{
+ return closid * r->cbm_idx_multi + r->cbm_idx_offset;
+}
+
+/*
+ * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
+ * as they do not have CPUID enumeration support for Cache allocation.
+ * The check for Vendor/Family/Model is not enough to guarantee that
+ * the MSRs won't #GP fault because only the following SKUs support
+ * CAT:
+ * Intel(R) Xeon(R) CPU E5-2658 v3 @ 2.20GHz
+ * Intel(R) Xeon(R) CPU E5-2648L v3 @ 1.80GHz
+ * Intel(R) Xeon(R) CPU E5-2628L v3 @ 2.00GHz
+ * Intel(R) Xeon(R) CPU E5-2618L v3 @ 2.30GHz
+ * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz
+ * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz
+ *
+ * Probe by trying to write the first of the L3 cach mask registers
+ * and checking that the bits stick. Max CLOSids is always 4 and max cbm length
+ * is always 20 on hsw server parts. The minimum cache bitmask length
+ * allowed for HSW server is always 2 bits. Hardcode all of them.
+ */
+static inline bool cache_alloc_hsw_probe(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+ boot_cpu_data.x86 == 6 &&
+ boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) {
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+ u32 l, h, max_cbm = BIT_MASK(20) - 1;
+
+ if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0))
+ return false;
+ rdmsr(IA32_L3_CBM_BASE, l, h);
+
+ /* If all the bits were set in MSR, return success */
+ if (l != max_cbm)
+ return false;
+
+ r->num_closid = 4;
+ r->cbm_len = 20;
+ r->max_cbm = max_cbm;
+ r->min_cbm_bits = 2;
+ r->capable = true;
+ r->enabled = true;
+
+ return true;
+ }
+
+ return false;
+}
+
+static void rdt_get_config(int idx, struct rdt_resource *r)
+{
+ union cpuid_0x10_1_eax eax;
+ union cpuid_0x10_1_edx edx;
+ u32 ebx, ecx;
+
+ cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full);
+ r->num_closid = edx.split.cos_max + 1;
+ r->cbm_len = eax.split.cbm_len + 1;
+ r->max_cbm = BIT_MASK(eax.split.cbm_len + 1) - 1;
+ r->capable = true;
+ r->enabled = true;
+}
+
+static void rdt_get_cdp_l3_config(int type)
+{
+ struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
+ struct rdt_resource *r = &rdt_resources_all[type];
+
+ r->num_closid = r_l3->num_closid / 2;
+ r->cbm_len = r_l3->cbm_len;
+ r->max_cbm = r_l3->max_cbm;
+ r->capable = true;
+ /*
+ * By default, CDP is disabled. CDP can be enabled by mount parameter
+ * "cdp" during resctrl file system mount time.
+ */
+ r->enabled = false;
+}
+
+static inline bool get_rdt_resources(void)
+{
+ bool ret = false;
+
+ if (cache_alloc_hsw_probe())
+ return true;
+
+ if (!boot_cpu_has(X86_FEATURE_RDT_A))
+ return false;
+
+ if (boot_cpu_has(X86_FEATURE_CAT_L3)) {
+ rdt_get_config(1, &rdt_resources_all[RDT_RESOURCE_L3]);
+ if (boot_cpu_has(X86_FEATURE_CDP_L3)) {
+ rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA);
+ rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE);
+ }
+ ret = true;
+ }
+ if (boot_cpu_has(X86_FEATURE_CAT_L2)) {
+ /* CPUID 0x10.2 fields are same format at 0x10.1 */
+ rdt_get_config(2, &rdt_resources_all[RDT_RESOURCE_L2]);
+ ret = true;
+ }
+
+ return ret;
+}
+
+static int get_cache_id(int cpu, int level)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
+ int i;
+
+ for (i = 0; i < ci->num_leaves; i++) {
+ if (ci->info_list[i].level == level)
+ return ci->info_list[i].id;
+ }
+
+ return -1;
+}
+
+void rdt_cbm_update(void *arg)
+{
+ struct msr_param *m = (struct msr_param *)arg;
+ struct rdt_resource *r = m->res;
+ int i, cpu = smp_processor_id();
+ struct rdt_domain *d;
+
+ list_for_each_entry(d, &r->domains, list) {
+ /* Find the domain that contains this CPU */
+ if (cpumask_test_cpu(cpu, &d->cpu_mask))
+ goto found;
+ }
+ pr_info_once("cpu %d not found in any domain for resource %s\n",
+ cpu, r->name);
+
+ return;
+
+found:
+ for (i = m->low; i < m->high; i++) {
+ int idx = cbm_idx(r, i);
+
+ wrmsrl(r->msr_base + idx, d->cbm[i]);
+ }
+}
+
+/*
+ * rdt_find_domain - Find a domain in a resource that matches input resource id
+ *
+ * Search resource r's domain list to find the resource id. If the resource
+ * id is found in a domain, return the domain. Otherwise, if requested by
+ * caller, return the first domain whose id is bigger than the input id.
+ * The domain list is sorted by id in ascending order.
+ */
+static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+ struct list_head **pos)
+{
+ struct rdt_domain *d;
+ struct list_head *l;
+
+ if (id < 0)
+ return ERR_PTR(id);
+
+ list_for_each(l, &r->domains) {
+ d = list_entry(l, struct rdt_domain, list);
+ /* When id is found, return its domain. */
+ if (id == d->id)
+ return d;
+ /* Stop searching when finding id's position in sorted list. */
+ if (id < d->id)
+ break;
+ }
+
+ if (pos)
+ *pos = l;
+
+ return NULL;
+}
+
+/*
+ * domain_add_cpu - Add a cpu to a resource's domain list.
+ *
+ * If an existing domain in the resource r's domain list matches the cpu's
+ * resource id, add the cpu in the domain.
+ *
+ * Otherwise, a new domain is allocated and inserted into the right position
+ * in the domain list sorted by id in ascending order.
+ *
+ * The order in the domain list is visible to users when we print entries
+ * in the schemata file and schemata input is validated to have the same order
+ * as this list.
+ */
+static void domain_add_cpu(int cpu, struct rdt_resource *r)
+{
+ int i, id = get_cache_id(cpu, r->cache_level);
+ struct list_head *add_pos = NULL;
+ struct rdt_domain *d;
+
+ d = rdt_find_domain(r, id, &add_pos);
+ if (IS_ERR(d)) {
+ pr_warn("Could't find cache id for cpu %d\n", cpu);
+ return;
+ }
+
+ if (d) {
+ cpumask_set_cpu(cpu, &d->cpu_mask);
+ return;
+ }
+
+ d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu));
+ if (!d)
+ return;
+
+ d->id = id;
+
+ d->cbm = kmalloc_array(r->num_closid, sizeof(*d->cbm), GFP_KERNEL);
+ if (!d->cbm) {
+ kfree(d);
+ return;
+ }
+
+ for (i = 0; i < r->num_closid; i++) {
+ int idx = cbm_idx(r, i);
+
+ d->cbm[i] = r->max_cbm;
+ wrmsrl(r->msr_base + idx, d->cbm[i]);
+ }
+
+ cpumask_set_cpu(cpu, &d->cpu_mask);
+ list_add_tail(&d->list, add_pos);
+ r->num_domains++;
+}
+
+static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+{
+ int id = get_cache_id(cpu, r->cache_level);
+ struct rdt_domain *d;
+
+ d = rdt_find_domain(r, id, NULL);
+ if (IS_ERR_OR_NULL(d)) {
+ pr_warn("Could't find cache id for cpu %d\n", cpu);
+ return;
+ }
+
+ cpumask_clear_cpu(cpu, &d->cpu_mask);
+ if (cpumask_empty(&d->cpu_mask)) {
+ r->num_domains--;
+ kfree(d->cbm);
+ list_del(&d->list);
+ kfree(d);
+ }
+}
+
+static void clear_closid(int cpu)
+{
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+
+ per_cpu(cpu_closid, cpu) = 0;
+ state->closid = 0;
+ wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0);
+}
+
+static int intel_rdt_online_cpu(unsigned int cpu)
+{
+ struct rdt_resource *r;
+
+ mutex_lock(&rdtgroup_mutex);
+ for_each_capable_rdt_resource(r)
+ domain_add_cpu(cpu, r);
+ /* The cpu is set in default rdtgroup after online. */
+ cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
+ clear_closid(cpu);
+ mutex_unlock(&rdtgroup_mutex);
+
+ return 0;
+}
+
+static int intel_rdt_offline_cpu(unsigned int cpu)
+{
+ struct rdtgroup *rdtgrp;
+ struct rdt_resource *r;
+
+ mutex_lock(&rdtgroup_mutex);
+ for_each_capable_rdt_resource(r)
+ domain_remove_cpu(cpu, r);
+ list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
+ if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask))
+ break;
+ }
+ clear_closid(cpu);
+ mutex_unlock(&rdtgroup_mutex);
+
+ return 0;
+}
+
+static int __init intel_rdt_late_init(void)
+{
+ struct rdt_resource *r;
+ int state, ret;
+
+ if (!get_rdt_resources())
+ return -ENODEV;
+
+ state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "x86/rdt/cat:online:",
+ intel_rdt_online_cpu, intel_rdt_offline_cpu);
+ if (state < 0)
+ return state;
+
+ ret = rdtgroup_init();
+ if (ret) {
+ cpuhp_remove_state(state);
+ return ret;
+ }
+
+ for_each_capable_rdt_resource(r)
+ pr_info("Intel RDT %s allocation detected\n", r->name);
+
+ return 0;
+}
+
+late_initcall(intel_rdt_late_init);
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
new file mode 100644
index 000000000000..8af04afdfcb9
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -0,0 +1,1115 @@
+/*
+ * User interface for Resource Alloction in Resource Director Technology(RDT)
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Fenghua Yu <fenghua.yu@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/kernfs.h>
+#include <linux/seq_file.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/task_work.h>
+
+#include <uapi/linux/magic.h>
+
+#include <asm/intel_rdt.h>
+#include <asm/intel_rdt_common.h>
+
+DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
+struct kernfs_root *rdt_root;
+struct rdtgroup rdtgroup_default;
+LIST_HEAD(rdt_all_groups);
+
+/* Kernel fs node for "info" directory under root */
+static struct kernfs_node *kn_info;
+
+/*
+ * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
+ * we can keep a bitmap of free CLOSIDs in a single integer.
+ *
+ * Using a global CLOSID across all resources has some advantages and
+ * some drawbacks:
+ * + We can simply set "current->closid" to assign a task to a resource
+ * group.
+ * + Context switch code can avoid extra memory references deciding which
+ * CLOSID to load into the PQR_ASSOC MSR
+ * - We give up some options in configuring resource groups across multi-socket
+ * systems.
+ * - Our choices on how to configure each resource become progressively more
+ * limited as the number of resources grows.
+ */
+static int closid_free_map;
+
+static void closid_init(void)
+{
+ struct rdt_resource *r;
+ int rdt_min_closid = 32;
+
+ /* Compute rdt_min_closid across all resources */
+ for_each_enabled_rdt_resource(r)
+ rdt_min_closid = min(rdt_min_closid, r->num_closid);
+
+ closid_free_map = BIT_MASK(rdt_min_closid) - 1;
+
+ /* CLOSID 0 is always reserved for the default group */
+ closid_free_map &= ~1;
+}
+
+int closid_alloc(void)
+{
+ int closid = ffs(closid_free_map);
+
+ if (closid == 0)
+ return -ENOSPC;
+ closid--;
+ closid_free_map &= ~(1 << closid);
+
+ return closid;
+}
+
+static void closid_free(int closid)
+{
+ closid_free_map |= 1 << closid;
+}
+
+/* set uid and gid of rdtgroup dirs and files to that of the creator */
+static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
+{
+ struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = current_fsuid(),
+ .ia_gid = current_fsgid(), };
+
+ if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
+ gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
+ return 0;
+
+ return kernfs_setattr(kn, &iattr);
+}
+
+static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
+{
+ struct kernfs_node *kn;
+ int ret;
+
+ kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
+ 0, rft->kf_ops, rft, NULL, NULL);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret) {
+ kernfs_remove(kn);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts,
+ int len)
+{
+ struct rftype *rft;
+ int ret;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ for (rft = rfts; rft < rfts + len; rft++) {
+ ret = rdtgroup_add_file(kn, rft);
+ if (ret)
+ goto error;
+ }
+
+ return 0;
+error:
+ pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
+ while (--rft >= rfts)
+ kernfs_remove_by_name(kn, rft->name);
+ return ret;
+}
+
+static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+ struct kernfs_open_file *of = m->private;
+ struct rftype *rft = of->kn->priv;
+
+ if (rft->seq_show)
+ return rft->seq_show(of, m, arg);
+ return 0;
+}
+
+static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct rftype *rft = of->kn->priv;
+
+ if (rft->write)
+ return rft->write(of, buf, nbytes, off);
+
+ return -EINVAL;
+}
+
+static struct kernfs_ops rdtgroup_kf_single_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = rdtgroup_file_write,
+ .seq_show = rdtgroup_seqfile_show,
+};
+
+static int rdtgroup_cpus_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+ if (rdtgrp)
+ seq_printf(s, "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask));
+ else
+ ret = -ENOENT;
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+/*
+ * This is safe against intel_rdt_sched_in() called from __switch_to()
+ * because __switch_to() is executed with interrupts disabled. A local call
+ * from rdt_update_closid() is proteced against __switch_to() because
+ * preemption is disabled.
+ */
+static void rdt_update_cpu_closid(void *closid)
+{
+ if (closid)
+ this_cpu_write(cpu_closid, *(int *)closid);
+ /*
+ * We cannot unconditionally write the MSR because the current
+ * executing task might have its own closid selected. Just reuse
+ * the context switch code.
+ */
+ intel_rdt_sched_in();
+}
+
+/*
+ * Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
+ *
+ * Per task closids must have been set up before calling this function.
+ *
+ * The per cpu closids are updated with the smp function call, when @closid
+ * is not NULL. If @closid is NULL then all affected percpu closids must
+ * have been set up before calling this function.
+ */
+static void
+rdt_update_closid(const struct cpumask *cpu_mask, int *closid)
+{
+ int cpu = get_cpu();
+
+ if (cpumask_test_cpu(cpu, cpu_mask))
+ rdt_update_cpu_closid(closid);
+ smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1);
+ put_cpu();
+}
+
+static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ cpumask_var_t tmpmask, newmask;
+ struct rdtgroup *rdtgrp, *r;
+ int ret;
+
+ if (!buf)
+ return -EINVAL;
+
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+ if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
+ free_cpumask_var(tmpmask);
+ return -ENOMEM;
+ }
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+
+ ret = cpumask_parse(buf, newmask);
+ if (ret)
+ goto unlock;
+
+ /* check that user didn't specify any offline cpus */
+ cpumask_andnot(tmpmask, newmask, cpu_online_mask);
+ if (cpumask_weight(tmpmask)) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ /* Check whether cpus are dropped from this group */
+ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
+ if (cpumask_weight(tmpmask)) {
+ /* Can't drop from default group */
+ if (rdtgrp == &rdtgroup_default) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+ /* Give any dropped cpus to rdtgroup_default */
+ cpumask_or(&rdtgroup_default.cpu_mask,
+ &rdtgroup_default.cpu_mask, tmpmask);
+ rdt_update_closid(tmpmask, &rdtgroup_default.closid);
+ }
+
+ /*
+ * If we added cpus, remove them from previous group that owned them
+ * and update per-cpu closid
+ */
+ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
+ if (cpumask_weight(tmpmask)) {
+ list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
+ if (r == rdtgrp)
+ continue;
+ cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask);
+ }
+ rdt_update_closid(tmpmask, &rdtgrp->closid);
+ }
+
+ /* Done pushing/pulling - update this group with new mask */
+ cpumask_copy(&rdtgrp->cpu_mask, newmask);
+
+unlock:
+ rdtgroup_kn_unlock(of->kn);
+ free_cpumask_var(tmpmask);
+ free_cpumask_var(newmask);
+
+ return ret ?: nbytes;
+}
+
+struct task_move_callback {
+ struct callback_head work;
+ struct rdtgroup *rdtgrp;
+};
+
+static void move_myself(struct callback_head *head)
+{
+ struct task_move_callback *callback;
+ struct rdtgroup *rdtgrp;
+
+ callback = container_of(head, struct task_move_callback, work);
+ rdtgrp = callback->rdtgrp;
+
+ /*
+ * If resource group was deleted before this task work callback
+ * was invoked, then assign the task to root group and free the
+ * resource group.
+ */
+ if (atomic_dec_and_test(&rdtgrp->waitcount) &&
+ (rdtgrp->flags & RDT_DELETED)) {
+ current->closid = 0;
+ kfree(rdtgrp);
+ }
+
+ preempt_disable();
+ /* update PQR_ASSOC MSR to make resource group go into effect */
+ intel_rdt_sched_in();
+ preempt_enable();
+
+ kfree(callback);
+}
+
+static int __rdtgroup_move_task(struct task_struct *tsk,
+ struct rdtgroup *rdtgrp)
+{
+ struct task_move_callback *callback;
+ int ret;
+
+ callback = kzalloc(sizeof(*callback), GFP_KERNEL);
+ if (!callback)
+ return -ENOMEM;
+ callback->work.func = move_myself;
+ callback->rdtgrp = rdtgrp;
+
+ /*
+ * Take a refcount, so rdtgrp cannot be freed before the
+ * callback has been invoked.
+ */
+ atomic_inc(&rdtgrp->waitcount);
+ ret = task_work_add(tsk, &callback->work, true);
+ if (ret) {
+ /*
+ * Task is exiting. Drop the refcount and free the callback.
+ * No need to check the refcount as the group cannot be
+ * deleted before the write function unlocks rdtgroup_mutex.
+ */
+ atomic_dec(&rdtgrp->waitcount);
+ kfree(callback);
+ } else {
+ tsk->closid = rdtgrp->closid;
+ }
+ return ret;
+}
+
+static int rdtgroup_task_write_permission(struct task_struct *task,
+ struct kernfs_open_file *of)
+{
+ const struct cred *tcred = get_task_cred(task);
+ const struct cred *cred = current_cred();
+ int ret = 0;
+
+ /*
+ * Even if we're attaching all tasks in the thread group, we only
+ * need to check permissions on one of them.
+ */
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid))
+ ret = -EPERM;
+
+ put_cred(tcred);
+ return ret;
+}
+
+static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
+ struct kernfs_open_file *of)
+{
+ struct task_struct *tsk;
+ int ret;
+
+ rcu_read_lock();
+ if (pid) {
+ tsk = find_task_by_vpid(pid);
+ if (!tsk) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ } else {
+ tsk = current;
+ }
+
+ get_task_struct(tsk);
+ rcu_read_unlock();
+
+ ret = rdtgroup_task_write_permission(tsk, of);
+ if (!ret)
+ ret = __rdtgroup_move_task(tsk, rdtgrp);
+
+ put_task_struct(tsk);
+ return ret;
+}
+
+static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+ pid_t pid;
+
+ if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
+ return -EINVAL;
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+ if (rdtgrp)
+ ret = rdtgroup_move_task(pid, rdtgrp, of);
+ else
+ ret = -ENOENT;
+
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
+static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
+{
+ struct task_struct *p, *t;
+
+ rcu_read_lock();
+ for_each_process_thread(p, t) {
+ if (t->closid == r->closid)
+ seq_printf(s, "%d\n", t->pid);
+ }
+ rcu_read_unlock();
+}
+
+static int rdtgroup_tasks_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (rdtgrp)
+ show_rdt_tasks(rdtgrp, s);
+ else
+ ret = -ENOENT;
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+/* Files in each rdtgroup */
+static struct rftype rdtgroup_base_files[] = {
+ {
+ .name = "cpus",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_cpus_write,
+ .seq_show = rdtgroup_cpus_show,
+ },
+ {
+ .name = "tasks",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_tasks_write,
+ .seq_show = rdtgroup_tasks_show,
+ },
+ {
+ .name = "schemata",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_schemata_write,
+ .seq_show = rdtgroup_schemata_show,
+ },
+};
+
+static int rdt_num_closids_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%d\n", r->num_closid);
+
+ return 0;
+}
+
+static int rdt_cbm_mask_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%x\n", r->max_cbm);
+
+ return 0;
+}
+
+static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%d\n", r->min_cbm_bits);
+
+ return 0;
+}
+
+/* rdtgroup information files for one cache resource. */
+static struct rftype res_info_files[] = {
+ {
+ .name = "num_closids",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_num_closids_show,
+ },
+ {
+ .name = "cbm_mask",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_cbm_mask_show,
+ },
+ {
+ .name = "min_cbm_bits",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_min_cbm_bits_show,
+ },
+};
+
+static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
+{
+ struct kernfs_node *kn_subdir;
+ struct rdt_resource *r;
+ int ret;
+
+ /* create the directory */
+ kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
+ if (IS_ERR(kn_info))
+ return PTR_ERR(kn_info);
+ kernfs_get(kn_info);
+
+ for_each_enabled_rdt_resource(r) {
+ kn_subdir = kernfs_create_dir(kn_info, r->name,
+ kn_info->mode, r);
+ if (IS_ERR(kn_subdir)) {
+ ret = PTR_ERR(kn_subdir);
+ goto out_destroy;
+ }
+ kernfs_get(kn_subdir);
+ ret = rdtgroup_kn_set_ugid(kn_subdir);
+ if (ret)
+ goto out_destroy;
+ ret = rdtgroup_add_files(kn_subdir, res_info_files,
+ ARRAY_SIZE(res_info_files));
+ if (ret)
+ goto out_destroy;
+ kernfs_activate(kn_subdir);
+ }
+
+ /*
+ * This extra ref will be put in kernfs_remove() and guarantees
+ * that @rdtgrp->kn is always accessible.
+ */
+ kernfs_get(kn_info);
+
+ ret = rdtgroup_kn_set_ugid(kn_info);
+ if (ret)
+ goto out_destroy;
+
+ kernfs_activate(kn_info);
+
+ return 0;
+
+out_destroy:
+ kernfs_remove(kn_info);
+ return ret;
+}
+
+static void l3_qos_cfg_update(void *arg)
+{
+ bool *enable = arg;
+
+ wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
+}
+
+static int set_l3_qos_cfg(struct rdt_resource *r, bool enable)
+{
+ cpumask_var_t cpu_mask;
+ struct rdt_domain *d;
+ int cpu;
+
+ if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ list_for_each_entry(d, &r->domains, list) {
+ /* Pick one CPU from each domain instance to update MSR */
+ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ }
+ cpu = get_cpu();
+ /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
+ if (cpumask_test_cpu(cpu, cpu_mask))
+ l3_qos_cfg_update(&enable);
+ /* Update QOS_CFG MSR on all other cpus in cpu_mask. */
+ smp_call_function_many(cpu_mask, l3_qos_cfg_update, &enable, 1);
+ put_cpu();
+
+ free_cpumask_var(cpu_mask);
+
+ return 0;
+}
+
+static int cdp_enable(void)
+{
+ struct rdt_resource *r_l3data = &rdt_resources_all[RDT_RESOURCE_L3DATA];
+ struct rdt_resource *r_l3code = &rdt_resources_all[RDT_RESOURCE_L3CODE];
+ struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
+ int ret;
+
+ if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable)
+ return -EINVAL;
+
+ ret = set_l3_qos_cfg(r_l3, true);
+ if (!ret) {
+ r_l3->enabled = false;
+ r_l3data->enabled = true;
+ r_l3code->enabled = true;
+ }
+ return ret;
+}
+
+static void cdp_disable(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+
+ r->enabled = r->capable;
+
+ if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) {
+ rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false;
+ rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false;
+ set_l3_qos_cfg(r, false);
+ }
+}
+
+static int parse_rdtgroupfs_options(char *data)
+{
+ char *token, *o = data;
+ int ret = 0;
+
+ while ((token = strsep(&o, ",")) != NULL) {
+ if (!*token)
+ return -EINVAL;
+
+ if (!strcmp(token, "cdp"))
+ ret = cdp_enable();
+ }
+
+ return ret;
+}
+
+/*
+ * We don't allow rdtgroup directories to be created anywhere
+ * except the root directory. Thus when looking for the rdtgroup
+ * structure for a kernfs node we are either looking at a directory,
+ * in which case the rdtgroup structure is pointed at by the "priv"
+ * field, otherwise we have a file, and need only look to the parent
+ * to find the rdtgroup.
+ */
+static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
+{
+ if (kernfs_type(kn) == KERNFS_DIR) {
+ /*
+ * All the resource directories use "kn->priv"
+ * to point to the "struct rdtgroup" for the
+ * resource. "info" and its subdirectories don't
+ * have rdtgroup structures, so return NULL here.
+ */
+ if (kn == kn_info || kn->parent == kn_info)
+ return NULL;
+ else
+ return kn->priv;
+ } else {
+ return kn->parent->priv;
+ }
+}
+
+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
+{
+ struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
+
+ if (!rdtgrp)
+ return NULL;
+
+ atomic_inc(&rdtgrp->waitcount);
+ kernfs_break_active_protection(kn);
+
+ mutex_lock(&rdtgroup_mutex);
+
+ /* Was this group deleted while we waited? */
+ if (rdtgrp->flags & RDT_DELETED)
+ return NULL;
+
+ return rdtgrp;
+}
+
+void rdtgroup_kn_unlock(struct kernfs_node *kn)
+{
+ struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
+
+ if (!rdtgrp)
+ return;
+
+ mutex_unlock(&rdtgroup_mutex);
+
+ if (atomic_dec_and_test(&rdtgrp->waitcount) &&
+ (rdtgrp->flags & RDT_DELETED)) {
+ kernfs_unbreak_active_protection(kn);
+ kernfs_put(kn);
+ kfree(rdtgrp);
+ } else {
+ kernfs_unbreak_active_protection(kn);
+ }
+}
+
+static struct dentry *rdt_mount(struct file_system_type *fs_type,
+ int flags, const char *unused_dev_name,
+ void *data)
+{
+ struct dentry *dentry;
+ int ret;
+
+ mutex_lock(&rdtgroup_mutex);
+ /*
+ * resctrl file system can only be mounted once.
+ */
+ if (static_branch_unlikely(&rdt_enable_key)) {
+ dentry = ERR_PTR(-EBUSY);
+ goto out;
+ }
+
+ ret = parse_rdtgroupfs_options(data);
+ if (ret) {
+ dentry = ERR_PTR(ret);
+ goto out_cdp;
+ }
+
+ closid_init();
+
+ ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
+ if (ret) {
+ dentry = ERR_PTR(ret);
+ goto out_cdp;
+ }
+
+ dentry = kernfs_mount(fs_type, flags, rdt_root,
+ RDTGROUP_SUPER_MAGIC, NULL);
+ if (IS_ERR(dentry))
+ goto out_cdp;
+
+ static_branch_enable(&rdt_enable_key);
+ goto out;
+
+out_cdp:
+ cdp_disable();
+out:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return dentry;
+}
+
+static int reset_all_cbms(struct rdt_resource *r)
+{
+ struct msr_param msr_param;
+ cpumask_var_t cpu_mask;
+ struct rdt_domain *d;
+ int i, cpu;
+
+ if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ msr_param.res = r;
+ msr_param.low = 0;
+ msr_param.high = r->num_closid;
+
+ /*
+ * Disable resource control for this resource by setting all
+ * CBMs in all domains to the maximum mask value. Pick one CPU
+ * from each domain to update the MSRs below.
+ */
+ list_for_each_entry(d, &r->domains, list) {
+ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+
+ for (i = 0; i < r->num_closid; i++)
+ d->cbm[i] = r->max_cbm;
+ }
+ cpu = get_cpu();
+ /* Update CBM on this cpu if it's in cpu_mask. */
+ if (cpumask_test_cpu(cpu, cpu_mask))
+ rdt_cbm_update(&msr_param);
+ /* Update CBM on all other cpus in cpu_mask. */
+ smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1);
+ put_cpu();
+
+ free_cpumask_var(cpu_mask);
+
+ return 0;
+}
+
+/*
+ * Move tasks from one to the other group. If @from is NULL, then all tasks
+ * in the systems are moved unconditionally (used for teardown).
+ *
+ * If @mask is not NULL the cpus on which moved tasks are running are set
+ * in that mask so the update smp function call is restricted to affected
+ * cpus.
+ */
+static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
+ struct cpumask *mask)
+{
+ struct task_struct *p, *t;
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(p, t) {
+ if (!from || t->closid == from->closid) {
+ t->closid = to->closid;
+#ifdef CONFIG_SMP
+ /*
+ * This is safe on x86 w/o barriers as the ordering
+ * of writing to task_cpu() and t->on_cpu is
+ * reverse to the reading here. The detection is
+ * inaccurate as tasks might move or schedule
+ * before the smp function call takes place. In
+ * such a case the function call is pointless, but
+ * there is no other side effect.
+ */
+ if (mask && t->on_cpu)
+ cpumask_set_cpu(task_cpu(t), mask);
+#endif
+ }
+ }
+ read_unlock(&tasklist_lock);
+}
+
+/*
+ * Forcibly remove all of subdirectories under root.
+ */
+static void rmdir_all_sub(void)
+{
+ struct rdtgroup *rdtgrp, *tmp;
+
+ /* Move all tasks to the default resource group */
+ rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
+
+ list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
+ /* Remove each rdtgroup other than root */
+ if (rdtgrp == &rdtgroup_default)
+ continue;
+
+ /*
+ * Give any CPUs back to the default group. We cannot copy
+ * cpu_online_mask because a CPU might have executed the
+ * offline callback already, but is still marked online.
+ */
+ cpumask_or(&rdtgroup_default.cpu_mask,
+ &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+
+ kernfs_remove(rdtgrp->kn);
+ list_del(&rdtgrp->rdtgroup_list);
+ kfree(rdtgrp);
+ }
+ /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
+ get_online_cpus();
+ rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid);
+ put_online_cpus();
+
+ kernfs_remove(kn_info);
+}
+
+static void rdt_kill_sb(struct super_block *sb)
+{
+ struct rdt_resource *r;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ /*Put everything back to default values. */
+ for_each_enabled_rdt_resource(r)
+ reset_all_cbms(r);
+ cdp_disable();
+ rmdir_all_sub();
+ static_branch_disable(&rdt_enable_key);
+ kernfs_kill_sb(sb);
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+static struct file_system_type rdt_fs_type = {
+ .name = "resctrl",
+ .mount = rdt_mount,
+ .kill_sb = rdt_kill_sb,
+};
+
+static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+ umode_t mode)
+{
+ struct rdtgroup *parent, *rdtgrp;
+ struct kernfs_node *kn;
+ int ret, closid;
+
+ /* Only allow mkdir in the root directory */
+ if (parent_kn != rdtgroup_default.kn)
+ return -EPERM;
+
+ /* Do not accept '\n' to avoid unparsable situation. */
+ if (strchr(name, '\n'))
+ return -EINVAL;
+
+ parent = rdtgroup_kn_lock_live(parent_kn);
+ if (!parent) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
+ ret = closid_alloc();
+ if (ret < 0)
+ goto out_unlock;
+ closid = ret;
+
+ /* allocate the rdtgroup. */
+ rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
+ if (!rdtgrp) {
+ ret = -ENOSPC;
+ goto out_closid_free;
+ }
+ rdtgrp->closid = closid;
+ list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
+
+ /* kernfs creates the directory for rdtgrp */
+ kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp);
+ if (IS_ERR(kn)) {
+ ret = PTR_ERR(kn);
+ goto out_cancel_ref;
+ }
+ rdtgrp->kn = kn;
+
+ /*
+ * kernfs_remove() will drop the reference count on "kn" which
+ * will free it. But we still need it to stick around for the
+ * rdtgroup_kn_unlock(kn} call below. Take one extra reference
+ * here, which will be dropped inside rdtgroup_kn_unlock().
+ */
+ kernfs_get(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
+
+ ret = rdtgroup_add_files(kn, rdtgroup_base_files,
+ ARRAY_SIZE(rdtgroup_base_files));
+ if (ret)
+ goto out_destroy;
+
+ kernfs_activate(kn);
+
+ ret = 0;
+ goto out_unlock;
+
+out_destroy:
+ kernfs_remove(rdtgrp->kn);
+out_cancel_ref:
+ list_del(&rdtgrp->rdtgroup_list);
+ kfree(rdtgrp);
+out_closid_free:
+ closid_free(closid);
+out_unlock:
+ rdtgroup_kn_unlock(parent_kn);
+ return ret;
+}
+
+static int rdtgroup_rmdir(struct kernfs_node *kn)
+{
+ int ret, cpu, closid = rdtgroup_default.closid;
+ struct rdtgroup *rdtgrp;
+ cpumask_var_t tmpmask;
+
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ rdtgrp = rdtgroup_kn_lock_live(kn);
+ if (!rdtgrp) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ /* Give any tasks back to the default group */
+ rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
+
+ /* Give any CPUs back to the default group */
+ cpumask_or(&rdtgroup_default.cpu_mask,
+ &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+
+ /* Update per cpu closid of the moved CPUs first */
+ for_each_cpu(cpu, &rdtgrp->cpu_mask)
+ per_cpu(cpu_closid, cpu) = closid;
+ /*
+ * Update the MSR on moved CPUs and CPUs which have moved
+ * task running on them.
+ */
+ cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
+ rdt_update_closid(tmpmask, NULL);
+
+ rdtgrp->flags = RDT_DELETED;
+ closid_free(rdtgrp->closid);
+ list_del(&rdtgrp->rdtgroup_list);
+
+ /*
+ * one extra hold on this, will drop when we kfree(rdtgrp)
+ * in rdtgroup_kn_unlock()
+ */
+ kernfs_get(kn);
+ kernfs_remove(rdtgrp->kn);
+ ret = 0;
+out:
+ rdtgroup_kn_unlock(kn);
+ free_cpumask_var(tmpmask);
+ return ret;
+}
+
+static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
+{
+ if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled)
+ seq_puts(seq, ",cdp");
+ return 0;
+}
+
+static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
+ .mkdir = rdtgroup_mkdir,
+ .rmdir = rdtgroup_rmdir,
+ .show_options = rdtgroup_show_options,
+};
+
+static int __init rdtgroup_setup_root(void)
+{
+ int ret;
+
+ rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
+ KERNFS_ROOT_CREATE_DEACTIVATED,
+ &rdtgroup_default);
+ if (IS_ERR(rdt_root))
+ return PTR_ERR(rdt_root);
+
+ mutex_lock(&rdtgroup_mutex);
+
+ rdtgroup_default.closid = 0;
+ list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
+
+ ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files,
+ ARRAY_SIZE(rdtgroup_base_files));
+ if (ret) {
+ kernfs_destroy_root(rdt_root);
+ goto out;
+ }
+
+ rdtgroup_default.kn = rdt_root->kn;
+ kernfs_activate(rdtgroup_default.kn);
+
+out:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return ret;
+}
+
+/*
+ * rdtgroup_init - rdtgroup initialization
+ *
+ * Setup resctrl file system including set up root, create mount point,
+ * register rdtgroup filesystem, and initialize files under root directory.
+ *
+ * Return: 0 on success or -errno
+ */
+int __init rdtgroup_init(void)
+{
+ int ret = 0;
+
+ ret = rdtgroup_setup_root();
+ if (ret)
+ return ret;
+
+ ret = sysfs_create_mount_point(fs_kobj, "resctrl");
+ if (ret)
+ goto cleanup_root;
+
+ ret = register_filesystem(&rdt_fs_type);
+ if (ret)
+ goto cleanup_mountpoint;
+
+ return 0;
+
+cleanup_mountpoint:
+ sysfs_remove_mount_point(fs_kobj, "resctrl");
+cleanup_root:
+ kernfs_destroy_root(rdt_root);
+
+ return ret;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c
new file mode 100644
index 000000000000..f369cb8db0d5
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c
@@ -0,0 +1,245 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Tony Luck <tony.luck@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernfs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <asm/intel_rdt.h>
+
+/*
+ * Check whether a cache bit mask is valid. The SDM says:
+ * Please note that all (and only) contiguous '1' combinations
+ * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
+ * Additionally Haswell requires at least two bits set.
+ */
+static bool cbm_validate(unsigned long var, struct rdt_resource *r)
+{
+ unsigned long first_bit, zero_bit;
+
+ if (var == 0 || var > r->max_cbm)
+ return false;
+
+ first_bit = find_first_bit(&var, r->cbm_len);
+ zero_bit = find_next_zero_bit(&var, r->cbm_len, first_bit);
+
+ if (find_next_bit(&var, r->cbm_len, zero_bit) < r->cbm_len)
+ return false;
+
+ if ((zero_bit - first_bit) < r->min_cbm_bits)
+ return false;
+ return true;
+}
+
+/*
+ * Read one cache bit mask (hex). Check that it is valid for the current
+ * resource type.
+ */
+static int parse_cbm(char *buf, struct rdt_resource *r)
+{
+ unsigned long data;
+ int ret;
+
+ ret = kstrtoul(buf, 16, &data);
+ if (ret)
+ return ret;
+ if (!cbm_validate(data, r))
+ return -EINVAL;
+ r->tmp_cbms[r->num_tmp_cbms++] = data;
+
+ return 0;
+}
+
+/*
+ * For each domain in this resource we expect to find a series of:
+ * id=mask
+ * separated by ";". The "id" is in decimal, and must appear in the
+ * right order.
+ */
+static int parse_line(char *line, struct rdt_resource *r)
+{
+ char *dom = NULL, *id;
+ struct rdt_domain *d;
+ unsigned long dom_id;
+
+ list_for_each_entry(d, &r->domains, list) {
+ dom = strsep(&line, ";");
+ if (!dom)
+ return -EINVAL;
+ id = strsep(&dom, "=");
+ if (kstrtoul(id, 10, &dom_id) || dom_id != d->id)
+ return -EINVAL;
+ if (parse_cbm(dom, r))
+ return -EINVAL;
+ }
+
+ /* Any garbage at the end of the line? */
+ if (line && line[0])
+ return -EINVAL;
+ return 0;
+}
+
+static int update_domains(struct rdt_resource *r, int closid)
+{
+ struct msr_param msr_param;
+ cpumask_var_t cpu_mask;
+ struct rdt_domain *d;
+ int cpu, idx = 0;
+
+ if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ msr_param.low = closid;
+ msr_param.high = msr_param.low + 1;
+ msr_param.res = r;
+
+ list_for_each_entry(d, &r->domains, list) {
+ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ d->cbm[msr_param.low] = r->tmp_cbms[idx++];
+ }
+ cpu = get_cpu();
+ /* Update CBM on this cpu if it's in cpu_mask. */
+ if (cpumask_test_cpu(cpu, cpu_mask))
+ rdt_cbm_update(&msr_param);
+ /* Update CBM on other cpus. */
+ smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1);
+ put_cpu();
+
+ free_cpumask_var(cpu_mask);
+
+ return 0;
+}
+
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdtgroup *rdtgrp;
+ struct rdt_resource *r;
+ char *tok, *resname;
+ int closid, ret = 0;
+ u32 *l3_cbms = NULL;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+ buf[nbytes - 1] = '\0';
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+
+ closid = rdtgrp->closid;
+
+ /* get scratch space to save all the masks while we validate input */
+ for_each_enabled_rdt_resource(r) {
+ r->tmp_cbms = kcalloc(r->num_domains, sizeof(*l3_cbms),
+ GFP_KERNEL);
+ if (!r->tmp_cbms) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ r->num_tmp_cbms = 0;
+ }
+
+ while ((tok = strsep(&buf, "\n")) != NULL) {
+ resname = strsep(&tok, ":");
+ if (!tok) {
+ ret = -EINVAL;
+ goto out;
+ }
+ for_each_enabled_rdt_resource(r) {
+ if (!strcmp(resname, r->name) &&
+ closid < r->num_closid) {
+ ret = parse_line(tok, r);
+ if (ret)
+ goto out;
+ break;
+ }
+ }
+ if (!r->name) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /* Did the parser find all the masks we need? */
+ for_each_enabled_rdt_resource(r) {
+ if (r->num_tmp_cbms != r->num_domains) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ for_each_enabled_rdt_resource(r) {
+ ret = update_domains(r, closid);
+ if (ret)
+ goto out;
+ }
+
+out:
+ rdtgroup_kn_unlock(of->kn);
+ for_each_enabled_rdt_resource(r) {
+ kfree(r->tmp_cbms);
+ r->tmp_cbms = NULL;
+ }
+ return ret ?: nbytes;
+}
+
+static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)
+{
+ struct rdt_domain *dom;
+ bool sep = false;
+
+ seq_printf(s, "%s:", r->name);
+ list_for_each_entry(dom, &r->domains, list) {
+ if (sep)
+ seq_puts(s, ";");
+ seq_printf(s, "%d=%x", dom->id, dom->cbm[closid]);
+ sep = true;
+ }
+ seq_puts(s, "\n");
+}
+
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ struct rdt_resource *r;
+ int closid, ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (rdtgrp) {
+ closid = rdtgrp->closid;
+ for_each_enabled_rdt_resource(r) {
+ if (closid < r->num_closid)
+ show_doms(s, r, closid);
+ }
+ } else {
+ ret = -ENOENT;
+ }
+ rdtgroup_kn_unlock(of->kn);
+ return ret;
+}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index d1316f9c8329..d9794060fe22 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -20,12 +20,15 @@ struct cpuid_bit {
/* Please keep the leaf sorted by cpuid_bit.level for faster search. */
static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
- { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
- { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 },
+ { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 },
{ X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 },
{ X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 },
- { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
- { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
+ { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
+ { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
+ { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
+ { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
+ { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index d0d744108594..a0ac3e81518a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -53,6 +53,7 @@
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/vm86.h>
+#include <asm/intel_rdt.h>
void __show_regs(struct pt_regs *regs, int all)
{
@@ -296,5 +297,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
this_cpu_write(current_task, next_p);
+ /* Load the Intel cache allocation PQR MSR. */
+ intel_rdt_sched_in();
+
return prev_p;
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a76b65e3e615..a61e141b6891 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -49,6 +49,7 @@
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
+#include <asm/intel_rdt.h>
__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
@@ -476,6 +477,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
loadsegment(ss, __KERNEL_DS);
}
+ /* Load the Intel cache allocation PQR MSR. */
+ intel_rdt_sched_in();
+
return prev_p;
}
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index b2d3cf1ef54a..e85f6bd7b9d5 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -373,16 +373,17 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
const u32 kvm_cpuid_7_0_ebx_x86_features =
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
- F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
- F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
- F(AVX512BW) | F(AVX512VL);
+ F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
+ F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
+ F(SHA_NI) | F(AVX512BW) | F(AVX512VL);
/* cpuid 0xD.1.eax */
const u32 kvm_cpuid_D_1_eax_x86_features =
F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves;
/* cpuid 7.0.ecx*/
- const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/;
+ const u32 kvm_cpuid_7_0_ecx_x86_features =
+ F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/;
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 99cde5220e07..1572c35b4f1a 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -852,6 +852,10 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
return;
+ mutex_lock(&kvm->arch.hyperv.hv_lock);
+ if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+ goto out_unlock;
+
gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
/*
* Because the TSC parameters only vary when there is a
@@ -859,7 +863,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
*/
if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
&tsc_seq, sizeof(tsc_seq))))
- return;
+ goto out_unlock;
/*
* While we're computing and writing the parameters, force the
@@ -868,15 +872,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
hv->tsc_ref.tsc_sequence = 0;
if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
&hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
- return;
+ goto out_unlock;
if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
- return;
+ goto out_unlock;
/* Ensure sequence is zero before writing the rest of the struct. */
smp_wmb();
if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
- return;
+ goto out_unlock;
/*
* Now switch to the TSC page mechanism by writing the sequence.
@@ -891,6 +895,8 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
hv->tsc_ref.tsc_sequence = tsc_seq;
kvm_write_guest(kvm, gfn_to_gpa(gfn),
&hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
+out_unlock:
+ mutex_unlock(&kvm->arch.hyperv.hv_lock);
}
static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
@@ -1142,9 +1148,9 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
if (kvm_hv_msr_partition_wide(msr)) {
int r;
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock);
r = kvm_hv_set_msr_pw(vcpu, msr, data, host);
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
return r;
} else
return kvm_hv_set_msr(vcpu, msr, data, host);
@@ -1155,9 +1161,9 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
if (kvm_hv_msr_partition_wide(msr)) {
int r;
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock);
r = kvm_hv_get_msr_pw(vcpu, msr, pdata);
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
return r;
} else
return kvm_hv_get_msr(vcpu, msr, pdata);
@@ -1165,7 +1171,7 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
bool kvm_hv_hypercall_enabled(struct kvm *kvm)
{
- return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
+ return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE;
}
static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index aae43c6f2472..24db5fb6f575 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1389,10 +1389,10 @@ static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
}
-static inline bool is_exception(u32 intr_info)
+static inline bool is_nmi(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
- == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+ == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
}
static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
@@ -5728,7 +5728,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
if (is_machine_check(intr_info))
return handle_machine_check(vcpu);
- if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
+ if (is_nmi(intr_info))
return 1; /* already handled by vmx_vcpu_run() */
if (is_no_device(intr_info)) {
@@ -7122,7 +7122,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
if (vmptr == vmx->nested.vmxon_ptr) {
nested_vmx_failValid(vcpu,
- VMXERR_VMCLEAR_VMXON_POINTER);
+ VMXERR_VMPTRLD_VMXON_POINTER);
return kvm_skip_emulated_instruction(vcpu);
}
break;
@@ -8170,7 +8170,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
switch (exit_reason) {
case EXIT_REASON_EXCEPTION_NMI:
- if (!is_exception(intr_info))
+ if (is_nmi(intr_info))
return false;
else if (is_page_fault(intr_info))
return enable_ept;
@@ -8765,8 +8765,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
kvm_machine_check();
/* We need to handle NMIs before interrupts are enabled */
- if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
- (exit_intr_info & INTR_INFO_VALID_MASK)) {
+ if (is_nmi(exit_intr_info)) {
kvm_before_handle_nmi(&vmx->vcpu);
asm("int $2");
kvm_after_handle_nmi(&vmx->vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1f0d2383f5ee..445c51b6cf6d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2844,7 +2844,24 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
+ int idx;
+ /*
+ * Disable page faults because we're in atomic context here.
+ * kvm_write_guest_offset_cached() would call might_fault()
+ * that relies on pagefault_disable() to tell if there's a
+ * bug. NOTE: the write to guest memory may not go through if
+ * during postcopy live migration or if there's heavy guest
+ * paging.
+ */
+ pagefault_disable();
+ /*
+ * kvm_memslots() will be called by
+ * kvm_write_guest_offset_cached() so take the srcu lock.
+ */
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_steal_time_set_preempted(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ pagefault_enable();
kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
@@ -7881,6 +7898,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
+ mutex_init(&kvm->arch.hyperv.hv_lock);
spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index f61058617ada..f4126cf997a4 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -15,6 +15,7 @@ config XTENSA
select GENERIC_SCHED_CLOCK
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_API_DEBUG
+ select HAVE_DMA_CONTIGUOUS
select HAVE_EXIT_THREAD
select HAVE_FUNCTION_TRACER
select HAVE_FUTEX_CMPXCHG if !MMU
diff --git a/arch/xtensa/boot/dts/kc705.dts b/arch/xtensa/boot/dts/kc705.dts
index b1f4ee8c9a22..6106bdc097ad 100644
--- a/arch/xtensa/boot/dts/kc705.dts
+++ b/arch/xtensa/boot/dts/kc705.dts
@@ -11,4 +11,20 @@
device_type = "memory";
reg = <0x00000000 0x38000000>;
};
+
+ reserved-memory {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges;
+
+ /* global autoconfigured region for contiguous allocations */
+ linux,cma {
+ compatible = "shared-dma-pool";
+ reusable;
+ size = <0x04000000>;
+ alignment = <0x2000>;
+ alloc-ranges = <0x00000000 0x20000000>;
+ linux,cma-default;
+ };
+ };
};
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 28cf4c5d65ef..b7fbaa56b51a 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -3,6 +3,7 @@ generic-y += bug.h
generic-y += clkdev.h
generic-y += cputime.h
generic-y += div64.h
+generic-y += dma-contiguous.h
generic-y += emergency-restart.h
generic-y += errno.h
generic-y += exec.h
diff --git a/arch/xtensa/kernel/Makefile b/arch/xtensa/kernel/Makefile
index c31f5d5afc7d..264fb89c444e 100644
--- a/arch/xtensa/kernel/Makefile
+++ b/arch/xtensa/kernel/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += mcount.o
obj-$(CONFIG_SMP) += smp.o mxhead.o
obj-$(CONFIG_XTENSA_VARIANT_HAVE_PERF_EVENTS) += perf_event.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+obj-$(CONFIG_S32C1I_SELFTEST) += s32c1i_selftest.o
AFLAGS_head.o += -mtext-section-literals
AFLAGS_mxhead.o += -mtext-section-literals
diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c
index 6a16decf278f..70e362e6038e 100644
--- a/arch/xtensa/kernel/pci-dma.c
+++ b/arch/xtensa/kernel/pci-dma.c
@@ -15,6 +15,7 @@
* Joe Taylor <joe@tensilica.com, joetylr@yahoo.com>
*/
+#include <linux/dma-contiguous.h>
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/mm.h>
@@ -146,6 +147,8 @@ static void *xtensa_dma_alloc(struct device *dev, size_t size,
{
unsigned long ret;
unsigned long uncached = 0;
+ unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ struct page *page = NULL;
/* ignore region speicifiers */
@@ -153,11 +156,18 @@ static void *xtensa_dma_alloc(struct device *dev, size_t size,
if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
flag |= GFP_DMA;
- ret = (unsigned long)__get_free_pages(flag, get_order(size));
- if (ret == 0)
+ if (gfpflags_allow_blocking(flag))
+ page = dma_alloc_from_contiguous(dev, count, get_order(size));
+
+ if (!page)
+ page = alloc_pages(flag, get_order(size));
+
+ if (!page)
return NULL;
+ ret = (unsigned long)page_address(page);
+
/* We currently don't support coherent memory outside KSEG */
BUG_ON(ret < XCHAL_KSEG_CACHED_VADDR ||
@@ -170,16 +180,19 @@ static void *xtensa_dma_alloc(struct device *dev, size_t size,
return (void *)uncached;
}
-static void xtensa_dma_free(struct device *hwdev, size_t size, void *vaddr,
+static void xtensa_dma_free(struct device *dev, size_t size, void *vaddr,
dma_addr_t dma_handle, unsigned long attrs)
{
unsigned long addr = (unsigned long)vaddr +
XCHAL_KSEG_CACHED_VADDR - XCHAL_KSEG_BYPASS_VADDR;
+ struct page *page = virt_to_page(addr);
+ unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
BUG_ON(addr < XCHAL_KSEG_CACHED_VADDR ||
addr > XCHAL_KSEG_CACHED_VADDR + XCHAL_KSEG_SIZE - 1);
- free_pages(addr, get_order(size));
+ if (!dma_release_from_contiguous(dev, page, count))
+ __free_pages(page, get_order(size));
}
static dma_addr_t xtensa_map_page(struct device *dev, struct page *page,
diff --git a/arch/xtensa/kernel/s32c1i_selftest.c b/arch/xtensa/kernel/s32c1i_selftest.c
new file mode 100644
index 000000000000..07e56e3a9a8b
--- /dev/null
+++ b/arch/xtensa/kernel/s32c1i_selftest.c
@@ -0,0 +1,128 @@
+/*
+ * S32C1I selftest.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2016 Cadence Design Systems Inc.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+
+#include <asm/traps.h>
+
+#if XCHAL_HAVE_S32C1I
+
+static int __initdata rcw_word, rcw_probe_pc, rcw_exc;
+
+/*
+ * Basic atomic compare-and-swap, that records PC of S32C1I for probing.
+ *
+ * If *v == cmp, set *v = set. Return previous *v.
+ */
+static inline int probed_compare_swap(int *v, int cmp, int set)
+{
+ int tmp;
+
+ __asm__ __volatile__(
+ " movi %1, 1f\n"
+ " s32i %1, %4, 0\n"
+ " wsr %2, scompare1\n"
+ "1: s32c1i %0, %3, 0\n"
+ : "=a" (set), "=&a" (tmp)
+ : "a" (cmp), "a" (v), "a" (&rcw_probe_pc), "0" (set)
+ : "memory"
+ );
+ return set;
+}
+
+/* Handle probed exception */
+
+static void __init do_probed_exception(struct pt_regs *regs,
+ unsigned long exccause)
+{
+ if (regs->pc == rcw_probe_pc) { /* exception on s32c1i ? */
+ regs->pc += 3; /* skip the s32c1i instruction */
+ rcw_exc = exccause;
+ } else {
+ do_unhandled(regs, exccause);
+ }
+}
+
+/* Simple test of S32C1I (soc bringup assist) */
+
+static int __init check_s32c1i(void)
+{
+ int n, cause1, cause2;
+ void *handbus, *handdata, *handaddr; /* temporarily saved handlers */
+
+ rcw_probe_pc = 0;
+ handbus = trap_set_handler(EXCCAUSE_LOAD_STORE_ERROR,
+ do_probed_exception);
+ handdata = trap_set_handler(EXCCAUSE_LOAD_STORE_DATA_ERROR,
+ do_probed_exception);
+ handaddr = trap_set_handler(EXCCAUSE_LOAD_STORE_ADDR_ERROR,
+ do_probed_exception);
+
+ /* First try an S32C1I that does not store: */
+ rcw_exc = 0;
+ rcw_word = 1;
+ n = probed_compare_swap(&rcw_word, 0, 2);
+ cause1 = rcw_exc;
+
+ /* took exception? */
+ if (cause1 != 0) {
+ /* unclean exception? */
+ if (n != 2 || rcw_word != 1)
+ panic("S32C1I exception error");
+ } else if (rcw_word != 1 || n != 1) {
+ panic("S32C1I compare error");
+ }
+
+ /* Then an S32C1I that stores: */
+ rcw_exc = 0;
+ rcw_word = 0x1234567;
+ n = probed_compare_swap(&rcw_word, 0x1234567, 0xabcde);
+ cause2 = rcw_exc;
+
+ if (cause2 != 0) {
+ /* unclean exception? */
+ if (n != 0xabcde || rcw_word != 0x1234567)
+ panic("S32C1I exception error (b)");
+ } else if (rcw_word != 0xabcde || n != 0x1234567) {
+ panic("S32C1I store error");
+ }
+
+ /* Verify consistency of exceptions: */
+ if (cause1 || cause2) {
+ pr_warn("S32C1I took exception %d, %d\n", cause1, cause2);
+ /* If emulation of S32C1I upon bus error gets implemented,
+ * we can get rid of this panic for single core (not SMP)
+ */
+ panic("S32C1I exceptions not currently supported");
+ }
+ if (cause1 != cause2)
+ panic("inconsistent S32C1I exceptions");
+
+ trap_set_handler(EXCCAUSE_LOAD_STORE_ERROR, handbus);
+ trap_set_handler(EXCCAUSE_LOAD_STORE_DATA_ERROR, handdata);
+ trap_set_handler(EXCCAUSE_LOAD_STORE_ADDR_ERROR, handaddr);
+ return 0;
+}
+
+#else /* XCHAL_HAVE_S32C1I */
+
+/* This condition should not occur with a commercially deployed processor.
+ * Display reminder for early engr test or demo chips / FPGA bitstreams
+ */
+static int __init check_s32c1i(void)
+{
+ pr_warn("Processor configuration lacks atomic compare-and-swap support!\n");
+ return 0;
+}
+
+#endif /* XCHAL_HAVE_S32C1I */
+
+early_initcall(check_s32c1i);
diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c
index 88a044af7504..848e8568fb3c 100644
--- a/arch/xtensa/kernel/setup.c
+++ b/arch/xtensa/kernel/setup.c
@@ -31,10 +31,6 @@
# include <linux/console.h>
#endif
-#ifdef CONFIG_RTC
-# include <linux/timex.h>
-#endif
-
#ifdef CONFIG_PROC_FS
# include <linux/seq_file.h>
#endif
@@ -48,24 +44,22 @@
#include <asm/page.h>
#include <asm/setup.h>
#include <asm/param.h>
-#include <asm/traps.h>
#include <asm/smp.h>
#include <asm/sysmem.h>
#include <platform/hardware.h>
#if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_DUMMY_CONSOLE)
-struct screen_info screen_info = { 0, 24, 0, 0, 0, 80, 0, 0, 0, 24, 1, 16};
-#endif
-
-#ifdef CONFIG_BLK_DEV_FD
-extern struct fd_ops no_fd_ops;
-struct fd_ops *fd_ops;
+struct screen_info screen_info = {
+ .orig_x = 0,
+ .orig_y = 24,
+ .orig_video_cols = 80,
+ .orig_video_lines = 24,
+ .orig_video_isVGA = 1,
+ .orig_video_points = 16,
+};
#endif
-extern struct rtc_ops no_rtc_ops;
-struct rtc_ops *rtc_ops;
-
#ifdef CONFIG_BLK_DEV_INITRD
extern unsigned long initrd_start;
extern unsigned long initrd_end;
@@ -77,7 +71,6 @@ extern int initrd_below_start_ok;
void *dtb_start = __dtb_start;
#endif
-unsigned char aux_device_present;
extern unsigned long loops_per_jiffy;
/* Command line specified as configuration option. */
@@ -317,120 +310,6 @@ extern char _SecondaryResetVector_text_start;
extern char _SecondaryResetVector_text_end;
#endif
-
-#ifdef CONFIG_S32C1I_SELFTEST
-#if XCHAL_HAVE_S32C1I
-
-static int __initdata rcw_word, rcw_probe_pc, rcw_exc;
-
-/*
- * Basic atomic compare-and-swap, that records PC of S32C1I for probing.
- *
- * If *v == cmp, set *v = set. Return previous *v.
- */
-static inline int probed_compare_swap(int *v, int cmp, int set)
-{
- int tmp;
-
- __asm__ __volatile__(
- " movi %1, 1f\n"
- " s32i %1, %4, 0\n"
- " wsr %2, scompare1\n"
- "1: s32c1i %0, %3, 0\n"
- : "=a" (set), "=&a" (tmp)
- : "a" (cmp), "a" (v), "a" (&rcw_probe_pc), "0" (set)
- : "memory"
- );
- return set;
-}
-
-/* Handle probed exception */
-
-static void __init do_probed_exception(struct pt_regs *regs,
- unsigned long exccause)
-{
- if (regs->pc == rcw_probe_pc) { /* exception on s32c1i ? */
- regs->pc += 3; /* skip the s32c1i instruction */
- rcw_exc = exccause;
- } else {
- do_unhandled(regs, exccause);
- }
-}
-
-/* Simple test of S32C1I (soc bringup assist) */
-
-static int __init check_s32c1i(void)
-{
- int n, cause1, cause2;
- void *handbus, *handdata, *handaddr; /* temporarily saved handlers */
-
- rcw_probe_pc = 0;
- handbus = trap_set_handler(EXCCAUSE_LOAD_STORE_ERROR,
- do_probed_exception);
- handdata = trap_set_handler(EXCCAUSE_LOAD_STORE_DATA_ERROR,
- do_probed_exception);
- handaddr = trap_set_handler(EXCCAUSE_LOAD_STORE_ADDR_ERROR,
- do_probed_exception);
-
- /* First try an S32C1I that does not store: */
- rcw_exc = 0;
- rcw_word = 1;
- n = probed_compare_swap(&rcw_word, 0, 2);
- cause1 = rcw_exc;
-
- /* took exception? */
- if (cause1 != 0) {
- /* unclean exception? */
- if (n != 2 || rcw_word != 1)
- panic("S32C1I exception error");
- } else if (rcw_word != 1 || n != 1) {
- panic("S32C1I compare error");
- }
-
- /* Then an S32C1I that stores: */
- rcw_exc = 0;
- rcw_word = 0x1234567;
- n = probed_compare_swap(&rcw_word, 0x1234567, 0xabcde);
- cause2 = rcw_exc;
-
- if (cause2 != 0) {
- /* unclean exception? */
- if (n != 0xabcde || rcw_word != 0x1234567)
- panic("S32C1I exception error (b)");
- } else if (rcw_word != 0xabcde || n != 0x1234567) {
- panic("S32C1I store error");
- }
-
- /* Verify consistency of exceptions: */
- if (cause1 || cause2) {
- pr_warn("S32C1I took exception %d, %d\n", cause1, cause2);
- /* If emulation of S32C1I upon bus error gets implemented,
- we can get rid of this panic for single core (not SMP) */
- panic("S32C1I exceptions not currently supported");
- }
- if (cause1 != cause2)
- panic("inconsistent S32C1I exceptions");
-
- trap_set_handler(EXCCAUSE_LOAD_STORE_ERROR, handbus);
- trap_set_handler(EXCCAUSE_LOAD_STORE_DATA_ERROR, handdata);
- trap_set_handler(EXCCAUSE_LOAD_STORE_ADDR_ERROR, handaddr);
- return 0;
-}
-
-#else /* XCHAL_HAVE_S32C1I */
-
-/* This condition should not occur with a commercially deployed processor.
- Display reminder for early engr test or demo chips / FPGA bitstreams */
-static int __init check_s32c1i(void)
-{
- pr_warn("Processor configuration lacks atomic compare-and-swap support!\n");
- return 0;
-}
-
-#endif /* XCHAL_HAVE_S32C1I */
-early_initcall(check_s32c1i);
-#endif /* CONFIG_S32C1I_SELFTEST */
-
static inline int mem_reserve(unsigned long start, unsigned long end)
{
return memblock_reserve(start, end - start);
diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c
index 80e4cfb2471a..720fe4e8b497 100644
--- a/arch/xtensa/mm/init.c
+++ b/arch/xtensa/mm/init.c
@@ -26,6 +26,7 @@
#include <linux/nodemask.h>
#include <linux/mm.h>
#include <linux/of_fdt.h>
+#include <linux/dma-contiguous.h>
#include <asm/bootparam.h>
#include <asm/page.h>
@@ -60,6 +61,7 @@ void __init bootmem_init(void)
max_low_pfn = min(max_pfn, MAX_LOW_PFN);
memblock_set_current_limit(PFN_PHYS(max_low_pfn));
+ dma_contiguous_reserve(PFN_PHYS(max_low_pfn));
memblock_dump_all();
}