23 files changed, 3131 insertions, 1277 deletions
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4a8697f7d4ef..52000010c62e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -20,13 +20,11 @@ obj-y			:= intel_cacheinfo.o scattered.o topology.o
 obj-y			+= common.o
 obj-y			+= rdrand.o
 obj-y			+= match.o
+obj-y			+= bugs.o
 
 obj-$(CONFIG_PROC_FS)	+= proc.o
 obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
 
-obj-$(CONFIG_X86_32)	+= bugs.o
-obj-$(CONFIG_X86_64)	+= bugs_64.o
-
 obj-$(CONFIG_CPU_SUP_INTEL)		+= intel.o
 obj-$(CONFIG_CPU_SUP_AMD)		+= amd.o
 obj-$(CONFIG_CPU_SUP_CYRIX_32)		+= cyrix.o
@@ -34,6 +32,8 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
 
+obj-$(CONFIG_INTEL_RDT_A)	+= intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o
+
 obj-$(CONFIG_X86_MCE)			+= mcheck/
 obj-$(CONFIG_MTRR)			+= mtrr/
 obj-$(CONFIG_MICROCODE)			+= microcode/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 1e81a37c034e..71cae73a5076 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -20,6 +20,10 @@
 
 #include "cpu.h"
 
+static const int amd_erratum_383[];
+static const int amd_erratum_400[];
+static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
+
 /*
  * nodes_per_socket: Stores the number of nodes per socket.
  * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX
@@ -314,11 +318,30 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 		smp_num_siblings = ((ebx >> 8) & 3) + 1;
 		c->x86_max_cores /= smp_num_siblings;
 		c->cpu_core_id = ebx & 0xff;
+
+		/*
+		 * We may have multiple LLCs if L3 caches exist, so check if we
+		 * have an L3 cache by looking at the L3 cache CPUID leaf.
+		 */
+		if (cpuid_edx(0x80000006)) {
+			if (c->x86 == 0x17) {
+				/*
+				 * LLC is at the core complex level.
+				 * Core complex id is ApicId[3].
+				 */
+				per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+			} else {
+				/* LLC is at the node level. */
+				per_cpu(cpu_llc_id, cpu) = node_id;
+			}
+		}
 	} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
 		u64 value;
 
 		rdmsrl(MSR_FAM10H_NODE_ID, value);
 		node_id = value & 7;
+
+		per_cpu(cpu_llc_id, cpu) = node_id;
 	} else
 		return;
 
@@ -329,9 +352,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_AMD_DCM);
 		cus_per_node = c->x86_max_cores / nodes_per_socket;
 
-		/* store NodeID, use llc_shared_map to store sibling info */
-		per_cpu(cpu_llc_id, cpu) = node_id;
-
 		/* core id has to be in the [0 .. cores_per_node - 1] range */
 		c->cpu_core_id %= cus_per_node;
 	}
@@ -356,15 +376,6 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
 	/* use socket ID also for last level cache */
 	per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
 	amd_get_topology(c);
-
-	/*
-	 * Fix percpu cpu_llc_id here as LLC topology is different
-	 * for Fam17h systems.
-	 */
-	 if (c->x86 != 0x17 || !cpuid_edx(0x80000006))
-		return;
-
-	per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
 #endif
 }
 
@@ -585,11 +596,16 @@ static void early_init_amd(struct cpuinfo_x86 *c)
 	/* F16h erratum 793, CVE-2013-6885 */
 	if (c->x86 == 0x16 && c->x86_model <= 0xf)
 		msr_set_bit(MSR_AMD64_LS_CFG, 15);
-}
 
-static const int amd_erratum_383[];
-static const int amd_erratum_400[];
-static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
+	/*
+	 * Check whether the machine is affected by erratum 400. This is
+	 * used to select the proper idle routine and to enable the check
+	 * whether the machine is affected in arch_post_acpi_init(), which
+	 * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+	 */
+	if (cpu_has_amd_erratum(c, amd_erratum_400))
+		set_cpu_bug(c, X86_BUG_AMD_E400);
+}
 
 static void init_amd_k8(struct cpuinfo_x86 *c)
 {
@@ -770,9 +786,6 @@ static void init_amd(struct cpuinfo_x86 *c)
 	if (c->x86 > 0x11)
 		set_cpu_cap(c, X86_FEATURE_ARAT);
 
-	if (cpu_has_amd_erratum(c, amd_erratum_400))
-		set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
-
 	rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
 
 	/* 3DNow or LM implies PREFETCHW */
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index bd17db15a2c1..a44ef52184df 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -16,15 +16,19 @@
 #include <asm/msr.h>
 #include <asm/paravirt.h>
 #include <asm/alternative.h>
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
 
 void __init check_bugs(void)
 {
 	identify_boot_cpu();
-#ifndef CONFIG_SMP
-	pr_info("CPU: ");
-	print_cpu_info(&boot_cpu_data);
-#endif
 
+	if (!IS_ENABLED(CONFIG_SMP)) {
+		pr_info("CPU: ");
+		print_cpu_info(&boot_cpu_data);
+	}
+
+#ifdef CONFIG_X86_32
 	/*
 	 * Check whether we are able to run this kernel safely on SMP.
 	 *
@@ -40,4 +44,18 @@ void __init check_bugs(void)
 	alternative_instructions();
 
 	fpu__init_check_bugs();
+#else /* CONFIG_X86_64 */
+	alternative_instructions();
+
+	/*
+	 * Make sure the first 2MB area is not mapped by huge pages
+	 * There are typically fixed size MTRRs in there and overlapping
+	 * MTRRs into large pages causes slow downs.
+	 *
+	 * Right now we don't do that with gbpages because there seems
+	 * very little benefit for that case.
+	 */
+	if (!direct_gbpages)
+		set_memory_4k((unsigned long)__va(0), 1);
+#endif
 }
diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c
deleted file mode 100644
index a972ac4c7e7d..000000000000
--- a/arch/x86/kernel/cpu/bugs_64.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- *  Copyright (C) 1994  Linus Torvalds
- *  Copyright (C) 2000  SuSE
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <asm/alternative.h>
-#include <asm/bugs.h>
-#include <asm/processor.h>
-#include <asm/mtrr.h>
-#include <asm/cacheflush.h>
-
-void __init check_bugs(void)
-{
-	identify_boot_cpu();
-#if !defined(CONFIG_SMP)
-	pr_info("CPU: ");
-	print_cpu_info(&boot_cpu_data);
-#endif
-	alternative_instructions();
-
-	/*
-	 * Make sure the first 2MB area is not mapped by huge pages
-	 * There are typically fixed size MTRRs in there and overlapping
-	 * MTRRs into large pages causes slow downs.
-	 *
-	 * Right now we don't do that with gbpages because there seems
-	 * very little benefit for that case.
-	 */
-	if (!direct_gbpages)
-		set_memory_4k((unsigned long)__va(0), 1);
-}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cc9e980c68ec..dc1697ca5191 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -667,13 +667,14 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
 		c->x86_capability[CPUID_1_EDX] = edx;
 	}
 
+	/* Thermal and Power Management Leaf: level 0x00000006 (eax) */
+	if (c->cpuid_level >= 0x00000006)
+		c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
+
 	/* Additional Intel-defined flags: level 0x00000007 */
 	if (c->cpuid_level >= 0x00000007) {
 		cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
-
 		c->x86_capability[CPUID_7_0_EBX] = ebx;
-
-		c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
 		c->x86_capability[CPUID_7_ECX] = ecx;
 	}
 
@@ -979,29 +980,21 @@ static void x86_init_cache_qos(struct cpuinfo_x86 *c)
 }
 
 /*
- * The physical to logical package id mapping is initialized from the
- * acpi/mptables information. Make sure that CPUID actually agrees with
- * that.
+ * Validate that ACPI/mptables have the same information about the
+ * effective APIC id and update the package map.
  */
-static void sanitize_package_id(struct cpuinfo_x86 *c)
+static void validate_apic_and_package_id(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
-	unsigned int pkg, apicid, cpu = smp_processor_id();
+	unsigned int apicid, cpu = smp_processor_id();
 
 	apicid = apic->cpu_present_to_apicid(cpu);
-	pkg = apicid >> boot_cpu_data.x86_coreid_bits;
 
-	if (apicid != c->initial_apicid) {
-		pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x CPUID: %x\n",
+	if (apicid != c->apicid) {
+		pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n",
 		       cpu, apicid, c->initial_apicid);
-		c->initial_apicid = apicid;
-	}
-	if (pkg != c->phys_proc_id) {
-		pr_err(FW_BUG "CPU%u: Using firmware package id %u instead of %u\n",
-		       cpu, pkg, c->phys_proc_id);
-		c->phys_proc_id = pkg;
 	}
-	c->logical_proc_id = topology_phys_to_logical_pkg(pkg);
+	BUG_ON(topology_update_package_map(c->phys_proc_id, cpu));
 #else
 	c->logical_proc_id = 0;
 #endif
@@ -1132,7 +1125,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 #ifdef CONFIG_NUMA
 	numa_add_cpu(smp_processor_id());
 #endif
-	sanitize_package_id(c);
 }
 
 /*
@@ -1172,7 +1164,6 @@ void enable_sep_cpu(void)
 void __init identify_boot_cpu(void)
 {
 	identify_cpu(&boot_cpu_data);
-	init_amd_e400_c1e_mask();
 #ifdef CONFIG_X86_32
 	sysenter_setup();
 	enable_sep_cpu();
@@ -1188,53 +1179,9 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
 	enable_sep_cpu();
 #endif
 	mtrr_ap_init();
+	validate_apic_and_package_id(c);
 }
 
-struct msr_range {
-	unsigned	min;
-	unsigned	max;
-};
-
-static const struct msr_range msr_range_array[] = {
-	{ 0x00000000, 0x00000418},
-	{ 0xc0000000, 0xc000040b},
-	{ 0xc0010000, 0xc0010142},
-	{ 0xc0011000, 0xc001103b},
-};
-
-static void __print_cpu_msr(void)
-{
-	unsigned index_min, index_max;
-	unsigned index;
-	u64 val;
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
-		index_min = msr_range_array[i].min;
-		index_max = msr_range_array[i].max;
-
-		for (index = index_min; index < index_max; index++) {
-			if (rdmsrl_safe(index, &val))
-				continue;
-			pr_info(" MSR%08x: %016llx\n", index, val);
-		}
-	}
-}
-
-static int show_msr;
-
-static __init int setup_show_msr(char *arg)
-{
-	int num;
-
-	get_option(&arg, &num);
-
-	if (num > 0)
-		show_msr = num;
-	return 1;
-}
-__setup("show_msr=", setup_show_msr);
-
 static __init int setup_noclflush(char *arg)
 {
 	setup_clear_cpu_cap(X86_FEATURE_CLFLUSH);
@@ -1268,14 +1215,6 @@ void print_cpu_info(struct cpuinfo_x86 *c)
 		pr_cont(", stepping: 0x%x)\n", c->x86_mask);
 	else
 		pr_cont(")\n");
-
-	print_cpu_msr(c);
-}
-
-void print_cpu_msr(struct cpuinfo_x86 *c)
-{
-	if (c->cpu_index < show_msr)
-		__print_cpu_msr();
 }
 
 static __init int setup_disablecpuid(char *arg)
@@ -1490,11 +1429,8 @@ void cpu_init(void)
 	 */
 	cr4_init_shadow();
 
-	/*
-	 * Load microcode on this cpu if a valid microcode is available.
-	 * This is early microcode loading procedure.
-	 */
-	load_ucode_ap();
+	if (cpu)
+		load_ucode_ap();
 
 	t = &per_cpu(cpu_tss, cpu);
 	oist = &per_cpu(orig_ist, cpu);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index de6626c18e42..0282b0df004a 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -153,6 +153,7 @@ struct _cpuid4_info_regs {
 	union _cpuid4_leaf_eax eax;
 	union _cpuid4_leaf_ebx ebx;
 	union _cpuid4_leaf_ecx ecx;
+	unsigned int id;
 	unsigned long size;
 	struct amd_northbridge *nb;
 };
@@ -894,6 +895,8 @@ static void __cache_cpumap_setup(unsigned int cpu, int index,
 static void ci_leaf_init(struct cacheinfo *this_leaf,
 			 struct _cpuid4_info_regs *base)
 {
+	this_leaf->id = base->id;
+	this_leaf->attributes = CACHE_ID;
 	this_leaf->level = base->eax.split.level;
 	this_leaf->type = cache_type_map[base->eax.split.type];
 	this_leaf->coherency_line_size =
@@ -920,6 +923,22 @@ static int __init_cache_level(unsigned int cpu)
 	return 0;
 }
 
+/*
+ * The max shared threads number comes from CPUID.4:EAX[25-14] with input
+ * ECX as cache index. Then right shift apicid by the number's order to get
+ * cache id for this cache node.
+ */
+static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs)
+{
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	unsigned long num_threads_sharing;
+	int index_msb;
+
+	num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing;
+	index_msb = get_count_order(num_threads_sharing);
+	id4_regs->id = c->apicid >> index_msb;
+}
+
 static int __populate_cache_leaves(unsigned int cpu)
 {
 	unsigned int idx, ret;
@@ -931,9 +950,12 @@ static int __populate_cache_leaves(unsigned int cpu)
 		ret = cpuid4_cache_lookup_regs(idx, &id4_regs);
 		if (ret)
 			return ret;
+		get_cache_id(cpu, &id4_regs);
 		ci_leaf_init(this_leaf++, &id4_regs);
 		__cache_cpumap_setup(cpu, idx, &id4_regs);
 	}
+	this_cpu_ci->cpu_map_populated = true;
+
 	return 0;
 }
 
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
new file mode 100644
index 000000000000..5a533fefefa0
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -0,0 +1,403 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ *    Fenghua Yu <fenghua.yu@intel.com>
+ *    Tony Luck <tony.luck@intel.com>
+ *    Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/cacheinfo.h>
+#include <linux/cpuhotplug.h>
+
+#include <asm/intel-family.h>
+#include <asm/intel_rdt.h>
+
+/* Mutex to protect rdtgroup access. */
+DEFINE_MUTEX(rdtgroup_mutex);
+
+DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid);
+
+#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains)
+
+struct rdt_resource rdt_resources_all[] = {
+	{
+		.name		= "L3",
+		.domains	= domain_init(RDT_RESOURCE_L3),
+		.msr_base	= IA32_L3_CBM_BASE,
+		.min_cbm_bits	= 1,
+		.cache_level	= 3,
+		.cbm_idx_multi	= 1,
+		.cbm_idx_offset	= 0
+	},
+	{
+		.name		= "L3DATA",
+		.domains	= domain_init(RDT_RESOURCE_L3DATA),
+		.msr_base	= IA32_L3_CBM_BASE,
+		.min_cbm_bits	= 1,
+		.cache_level	= 3,
+		.cbm_idx_multi	= 2,
+		.cbm_idx_offset	= 0
+	},
+	{
+		.name		= "L3CODE",
+		.domains	= domain_init(RDT_RESOURCE_L3CODE),
+		.msr_base	= IA32_L3_CBM_BASE,
+		.min_cbm_bits	= 1,
+		.cache_level	= 3,
+		.cbm_idx_multi	= 2,
+		.cbm_idx_offset	= 1
+	},
+	{
+		.name		= "L2",
+		.domains	= domain_init(RDT_RESOURCE_L2),
+		.msr_base	= IA32_L2_CBM_BASE,
+		.min_cbm_bits	= 1,
+		.cache_level	= 2,
+		.cbm_idx_multi	= 1,
+		.cbm_idx_offset	= 0
+	},
+};
+
+static int cbm_idx(struct rdt_resource *r, int closid)
+{
+	return closid * r->cbm_idx_multi + r->cbm_idx_offset;
+}
+
+/*
+ * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
+ * as they do not have CPUID enumeration support for Cache allocation.
+ * The check for Vendor/Family/Model is not enough to guarantee that
+ * the MSRs won't #GP fault because only the following SKUs support
+ * CAT:
+ *	Intel(R) Xeon(R)  CPU E5-2658  v3  @  2.20GHz
+ *	Intel(R) Xeon(R)  CPU E5-2648L v3  @  1.80GHz
+ *	Intel(R) Xeon(R)  CPU E5-2628L v3  @  2.00GHz
+ *	Intel(R) Xeon(R)  CPU E5-2618L v3  @  2.30GHz
+ *	Intel(R) Xeon(R)  CPU E5-2608L v3  @  2.00GHz
+ *	Intel(R) Xeon(R)  CPU E5-2658A v3  @  2.20GHz
+ *
+ * Probe by trying to write the first of the L3 cach mask registers
+ * and checking that the bits stick. Max CLOSids is always 4 and max cbm length
+ * is always 20 on hsw server parts. The minimum cache bitmask length
+ * allowed for HSW server is always 2 bits. Hardcode all of them.
+ */
+static inline bool cache_alloc_hsw_probe(void)
+{
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+	    boot_cpu_data.x86 == 6 &&
+	    boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) {
+		struct rdt_resource *r  = &rdt_resources_all[RDT_RESOURCE_L3];
+		u32 l, h, max_cbm = BIT_MASK(20) - 1;
+
+		if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0))
+			return false;
+		rdmsr(IA32_L3_CBM_BASE, l, h);
+
+		/* If all the bits were set in MSR, return success */
+		if (l != max_cbm)
+			return false;
+
+		r->num_closid = 4;
+		r->cbm_len = 20;
+		r->max_cbm = max_cbm;
+		r->min_cbm_bits = 2;
+		r->capable = true;
+		r->enabled = true;
+
+		return true;
+	}
+
+	return false;
+}
+
+static void rdt_get_config(int idx, struct rdt_resource *r)
+{
+	union cpuid_0x10_1_eax eax;
+	union cpuid_0x10_1_edx edx;
+	u32 ebx, ecx;
+
+	cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full);
+	r->num_closid = edx.split.cos_max + 1;
+	r->cbm_len = eax.split.cbm_len + 1;
+	r->max_cbm = BIT_MASK(eax.split.cbm_len + 1) - 1;
+	r->capable = true;
+	r->enabled = true;
+}
+
+static void rdt_get_cdp_l3_config(int type)
+{
+	struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
+	struct rdt_resource *r = &rdt_resources_all[type];
+
+	r->num_closid = r_l3->num_closid / 2;
+	r->cbm_len = r_l3->cbm_len;
+	r->max_cbm = r_l3->max_cbm;
+	r->capable = true;
+	/*
+	 * By default, CDP is disabled. CDP can be enabled by mount parameter
+	 * "cdp" during resctrl file system mount time.
+	 */
+	r->enabled = false;
+}
+
+static inline bool get_rdt_resources(void)
+{
+	bool ret = false;
+
+	if (cache_alloc_hsw_probe())
+		return true;
+
+	if (!boot_cpu_has(X86_FEATURE_RDT_A))
+		return false;
+
+	if (boot_cpu_has(X86_FEATURE_CAT_L3)) {
+		rdt_get_config(1, &rdt_resources_all[RDT_RESOURCE_L3]);
+		if (boot_cpu_has(X86_FEATURE_CDP_L3)) {
+			rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA);
+			rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE);
+		}
+		ret = true;
+	}
+	if (boot_cpu_has(X86_FEATURE_CAT_L2)) {
+		/* CPUID 0x10.2 fields are same format at 0x10.1 */
+		rdt_get_config(2, &rdt_resources_all[RDT_RESOURCE_L2]);
+		ret = true;
+	}
+
+	return ret;
+}
+
+static int get_cache_id(int cpu, int level)
+{
+	struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
+	int i;
+
+	for (i = 0; i < ci->num_leaves; i++) {
+		if (ci->info_list[i].level == level)
+			return ci->info_list[i].id;
+	}
+
+	return -1;
+}
+
+void rdt_cbm_update(void *arg)
+{
+	struct msr_param *m = (struct msr_param *)arg;
+	struct rdt_resource *r = m->res;
+	int i, cpu = smp_processor_id();
+	struct rdt_domain *d;
+
+	list_for_each_entry(d, &r->domains, list) {
+		/* Find the domain that contains this CPU */
+		if (cpumask_test_cpu(cpu, &d->cpu_mask))
+			goto found;
+	}
+	pr_info_once("cpu %d not found in any domain for resource %s\n",
+		     cpu, r->name);
+
+	return;
+
+found:
+	for (i = m->low; i < m->high; i++) {
+		int idx = cbm_idx(r, i);
+
+		wrmsrl(r->msr_base + idx, d->cbm[i]);
+	}
+}
+
+/*
+ * rdt_find_domain - Find a domain in a resource that matches input resource id
+ *
+ * Search resource r's domain list to find the resource id. If the resource
+ * id is found in a domain, return the domain. Otherwise, if requested by
+ * caller, return the first domain whose id is bigger than the input id.
+ * The domain list is sorted by id in ascending order.
+ */
+static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+					  struct list_head **pos)
+{
+	struct rdt_domain *d;
+	struct list_head *l;
+
+	if (id < 0)
+		return ERR_PTR(id);
+
+	list_for_each(l, &r->domains) {
+		d = list_entry(l, struct rdt_domain, list);
+		/* When id is found, return its domain. */
+		if (id == d->id)
+			return d;
+		/* Stop searching when finding id's position in sorted list. */
+		if (id < d->id)
+			break;
+	}
+
+	if (pos)
+		*pos = l;
+
+	return NULL;
+}
+
+/*
+ * domain_add_cpu - Add a cpu to a resource's domain list.
+ *
+ * If an existing domain in the resource r's domain list matches the cpu's
+ * resource id, add the cpu in the domain.
+ *
+ * Otherwise, a new domain is allocated and inserted into the right position
+ * in the domain list sorted by id in ascending order.
+ *
+ * The order in the domain list is visible to users when we print entries
+ * in the schemata file and schemata input is validated to have the same order
+ * as this list.
+ */
+static void domain_add_cpu(int cpu, struct rdt_resource *r)
+{
+	int i, id = get_cache_id(cpu, r->cache_level);
+	struct list_head *add_pos = NULL;
+	struct rdt_domain *d;
+
+	d = rdt_find_domain(r, id, &add_pos);
+	if (IS_ERR(d)) {
+		pr_warn("Could't find cache id for cpu %d\n", cpu);
+		return;
+	}
+
+	if (d) {
+		cpumask_set_cpu(cpu, &d->cpu_mask);
+		return;
+	}
+
+	d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu));
+	if (!d)
+		return;
+
+	d->id = id;
+
+	d->cbm = kmalloc_array(r->num_closid, sizeof(*d->cbm), GFP_KERNEL);
+	if (!d->cbm) {
+		kfree(d);
+		return;
+	}
+
+	for (i = 0; i < r->num_closid; i++) {
+		int idx = cbm_idx(r, i);
+
+		d->cbm[i] = r->max_cbm;
+		wrmsrl(r->msr_base + idx, d->cbm[i]);
+	}
+
+	cpumask_set_cpu(cpu, &d->cpu_mask);
+	list_add_tail(&d->list, add_pos);
+	r->num_domains++;
+}
+
+static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+{
+	int id = get_cache_id(cpu, r->cache_level);
+	struct rdt_domain *d;
+
+	d = rdt_find_domain(r, id, NULL);
+	if (IS_ERR_OR_NULL(d)) {
+		pr_warn("Could't find cache id for cpu %d\n", cpu);
+		return;
+	}
+
+	cpumask_clear_cpu(cpu, &d->cpu_mask);
+	if (cpumask_empty(&d->cpu_mask)) {
+		r->num_domains--;
+		kfree(d->cbm);
+		list_del(&d->list);
+		kfree(d);
+	}
+}
+
+static void clear_closid(int cpu)
+{
+	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+
+	per_cpu(cpu_closid, cpu) = 0;
+	state->closid = 0;
+	wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0);
+}
+
+static int intel_rdt_online_cpu(unsigned int cpu)
+{
+	struct rdt_resource *r;
+
+	mutex_lock(&rdtgroup_mutex);
+	for_each_capable_rdt_resource(r)
+		domain_add_cpu(cpu, r);
+	/* The cpu is set in default rdtgroup after online. */
+	cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
+	clear_closid(cpu);
+	mutex_unlock(&rdtgroup_mutex);
+
+	return 0;
+}
+
+static int intel_rdt_offline_cpu(unsigned int cpu)
+{
+	struct rdtgroup *rdtgrp;
+	struct rdt_resource *r;
+
+	mutex_lock(&rdtgroup_mutex);
+	for_each_capable_rdt_resource(r)
+		domain_remove_cpu(cpu, r);
+	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
+		if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask))
+			break;
+	}
+	clear_closid(cpu);
+	mutex_unlock(&rdtgroup_mutex);
+
+	return 0;
+}
+
+static int __init intel_rdt_late_init(void)
+{
+	struct rdt_resource *r;
+	int state, ret;
+
+	if (!get_rdt_resources())
+		return -ENODEV;
+
+	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+				  "x86/rdt/cat:online:",
+				  intel_rdt_online_cpu, intel_rdt_offline_cpu);
+	if (state < 0)
+		return state;
+
+	ret = rdtgroup_init();
+	if (ret) {
+		cpuhp_remove_state(state);
+		return ret;
+	}
+
+	for_each_capable_rdt_resource(r)
+		pr_info("Intel RDT %s allocation detected\n", r->name);
+
+	return 0;
+}
+
+late_initcall(intel_rdt_late_init);
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
new file mode 100644
index 000000000000..8af04afdfcb9
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -0,0 +1,1115 @@
+/*
+ * User interface for Resource Alloction in Resource Director Technology(RDT)
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Fenghua Yu <fenghua.yu@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/kernfs.h>
+#include <linux/seq_file.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/task_work.h>
+
+#include <uapi/linux/magic.h>
+
+#include <asm/intel_rdt.h>
+#include <asm/intel_rdt_common.h>
+
+DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
+struct kernfs_root *rdt_root;
+struct rdtgroup rdtgroup_default;
+LIST_HEAD(rdt_all_groups);
+
+/* Kernel fs node for "info" directory under root */
+static struct kernfs_node *kn_info;
+
+/*
+ * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
+ * we can keep a bitmap of free CLOSIDs in a single integer.
+ *
+ * Using a global CLOSID across all resources has some advantages and
+ * some drawbacks:
+ * + We can simply set "current->closid" to assign a task to a resource
+ *   group.
+ * + Context switch code can avoid extra memory references deciding which
+ *   CLOSID to load into the PQR_ASSOC MSR
+ * - We give up some options in configuring resource groups across multi-socket
+ *   systems.
+ * - Our choices on how to configure each resource become progressively more
+ *   limited as the number of resources grows.
+ */
+static int closid_free_map;
+
+static void closid_init(void)
+{
+	struct rdt_resource *r;
+	int rdt_min_closid = 32;
+
+	/* Compute rdt_min_closid across all resources */
+	for_each_enabled_rdt_resource(r)
+		rdt_min_closid = min(rdt_min_closid, r->num_closid);
+
+	closid_free_map = BIT_MASK(rdt_min_closid) - 1;
+
+	/* CLOSID 0 is always reserved for the default group */
+	closid_free_map &= ~1;
+}
+
+int closid_alloc(void)
+{
+	int closid = ffs(closid_free_map);
+
+	if (closid == 0)
+		return -ENOSPC;
+	closid--;
+	closid_free_map &= ~(1 << closid);
+
+	return closid;
+}
+
+static void closid_free(int closid)
+{
+	closid_free_map |= 1 << closid;
+}
+
+/* set uid and gid of rdtgroup dirs and files to that of the creator */
+static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
+{
+	struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
+				.ia_uid = current_fsuid(),
+				.ia_gid = current_fsgid(), };
+
+	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
+	    gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
+		return 0;
+
+	return kernfs_setattr(kn, &iattr);
+}
+
+static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
+{
+	struct kernfs_node *kn;
+	int ret;
+
+	kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
+				  0, rft->kf_ops, rft, NULL, NULL);
+	if (IS_ERR(kn))
+		return PTR_ERR(kn);
+
+	ret = rdtgroup_kn_set_ugid(kn);
+	if (ret) {
+		kernfs_remove(kn);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts,
+			      int len)
+{
+	struct rftype *rft;
+	int ret;
+
+	lockdep_assert_held(&rdtgroup_mutex);
+
+	for (rft = rfts; rft < rfts + len; rft++) {
+		ret = rdtgroup_add_file(kn, rft);
+		if (ret)
+			goto error;
+	}
+
+	return 0;
+error:
+	pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
+	while (--rft >= rfts)
+		kernfs_remove_by_name(kn, rft->name);
+	return ret;
+}
+
+static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+	struct kernfs_open_file *of = m->private;
+	struct rftype *rft = of->kn->priv;
+
+	if (rft->seq_show)
+		return rft->seq_show(of, m, arg);
+	return 0;
+}
+
+static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
+				   size_t nbytes, loff_t off)
+{
+	struct rftype *rft = of->kn->priv;
+
+	if (rft->write)
+		return rft->write(of, buf, nbytes, off);
+
+	return -EINVAL;
+}
+
+static struct kernfs_ops rdtgroup_kf_single_ops = {
+	.atomic_write_len	= PAGE_SIZE,
+	.write			= rdtgroup_file_write,
+	.seq_show		= rdtgroup_seqfile_show,
+};
+
+static int rdtgroup_cpus_show(struct kernfs_open_file *of,
+			      struct seq_file *s, void *v)
+{
+	struct rdtgroup *rdtgrp;
+	int ret = 0;
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+	if (rdtgrp)
+		seq_printf(s, "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask));
+	else
+		ret = -ENOENT;
+	rdtgroup_kn_unlock(of->kn);
+
+	return ret;
+}
+
+/*
+ * This is safe against intel_rdt_sched_in() called from __switch_to()
+ * because __switch_to() is executed with interrupts disabled. A local call
+ * from rdt_update_closid() is proteced against __switch_to() because
+ * preemption is disabled.
+ */
+static void rdt_update_cpu_closid(void *closid)
+{
+	if (closid)
+		this_cpu_write(cpu_closid, *(int *)closid);
+	/*
+	 * We cannot unconditionally write the MSR because the current
+	 * executing task might have its own closid selected. Just reuse
+	 * the context switch code.
+	 */
+	intel_rdt_sched_in();
+}
+
+/*
+ * Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
+ *
+ * Per task closids must have been set up before calling this function.
+ *
+ * The per cpu closids are updated with the smp function call, when @closid
+ * is not NULL. If @closid is NULL then all affected percpu closids must
+ * have been set up before calling this function.
+ */
+static void
+rdt_update_closid(const struct cpumask *cpu_mask, int *closid)
+{
+	int cpu = get_cpu();
+
+	if (cpumask_test_cpu(cpu, cpu_mask))
+		rdt_update_cpu_closid(closid);
+	smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1);
+	put_cpu();
+}
+
+static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
+				   char *buf, size_t nbytes, loff_t off)
+{
+	cpumask_var_t tmpmask, newmask;
+	struct rdtgroup *rdtgrp, *r;
+	int ret;
+
+	if (!buf)
+		return -EINVAL;
+
+	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+		return -ENOMEM;
+	if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
+		free_cpumask_var(tmpmask);
+		return -ENOMEM;
+	}
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (!rdtgrp) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+
+	ret = cpumask_parse(buf, newmask);
+	if (ret)
+		goto unlock;
+
+	/* check that user didn't specify any offline cpus */
+	cpumask_andnot(tmpmask, newmask, cpu_online_mask);
+	if (cpumask_weight(tmpmask)) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	/* Check whether cpus are dropped from this group */
+	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
+	if (cpumask_weight(tmpmask)) {
+		/* Can't drop from default group */
+		if (rdtgrp == &rdtgroup_default) {
+			ret = -EINVAL;
+			goto unlock;
+		}
+		/* Give any dropped cpus to rdtgroup_default */
+		cpumask_or(&rdtgroup_default.cpu_mask,
+			   &rdtgroup_default.cpu_mask, tmpmask);
+		rdt_update_closid(tmpmask, &rdtgroup_default.closid);
+	}
+
+	/*
+	 * If we added cpus, remove them from previous group that owned them
+	 * and update per-cpu closid
+	 */
+	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
+	if (cpumask_weight(tmpmask)) {
+		list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
+			if (r == rdtgrp)
+				continue;
+			cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask);
+		}
+		rdt_update_closid(tmpmask, &rdtgrp->closid);
+	}
+
+	/* Done pushing/pulling - update this group with new mask */
+	cpumask_copy(&rdtgrp->cpu_mask, newmask);
+
+unlock:
+	rdtgroup_kn_unlock(of->kn);
+	free_cpumask_var(tmpmask);
+	free_cpumask_var(newmask);
+
+	return ret ?: nbytes;
+}
+
+struct task_move_callback {
+	struct callback_head	work;
+	struct rdtgroup		*rdtgrp;
+};
+
+static void move_myself(struct callback_head *head)
+{
+	struct task_move_callback *callback;
+	struct rdtgroup *rdtgrp;
+
+	callback = container_of(head, struct task_move_callback, work);
+	rdtgrp = callback->rdtgrp;
+
+	/*
+	 * If resource group was deleted before this task work callback
+	 * was invoked, then assign the task to root group and free the
+	 * resource group.
+	 */
+	if (atomic_dec_and_test(&rdtgrp->waitcount) &&
+	    (rdtgrp->flags & RDT_DELETED)) {
+		current->closid = 0;
+		kfree(rdtgrp);
+	}
+
+	preempt_disable();
+	/* update PQR_ASSOC MSR to make resource group go into effect */
+	intel_rdt_sched_in();
+	preempt_enable();
+
+	kfree(callback);
+}
+
+static int __rdtgroup_move_task(struct task_struct *tsk,
+				struct rdtgroup *rdtgrp)
+{
+	struct task_move_callback *callback;
+	int ret;
+
+	callback = kzalloc(sizeof(*callback), GFP_KERNEL);
+	if (!callback)
+		return -ENOMEM;
+	callback->work.func = move_myself;
+	callback->rdtgrp = rdtgrp;
+
+	/*
+	 * Take a refcount, so rdtgrp cannot be freed before the
+	 * callback has been invoked.
+	 */
+	atomic_inc(&rdtgrp->waitcount);
+	ret = task_work_add(tsk, &callback->work, true);
+	if (ret) {
+		/*
+		 * Task is exiting. Drop the refcount and free the callback.
+		 * No need to check the refcount as the group cannot be
+		 * deleted before the write function unlocks rdtgroup_mutex.
+		 */
+		atomic_dec(&rdtgrp->waitcount);
+		kfree(callback);
+	} else {
+		tsk->closid = rdtgrp->closid;
+	}
+	return ret;
+}
+
+static int rdtgroup_task_write_permission(struct task_struct *task,
+					  struct kernfs_open_file *of)
+{
+	const struct cred *tcred = get_task_cred(task);
+	const struct cred *cred = current_cred();
+	int ret = 0;
+
+	/*
+	 * Even if we're attaching all tasks in the thread group, we only
+	 * need to check permissions on one of them.
+	 */
+	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+	    !uid_eq(cred->euid, tcred->uid) &&
+	    !uid_eq(cred->euid, tcred->suid))
+		ret = -EPERM;
+
+	put_cred(tcred);
+	return ret;
+}
+
+static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
+			      struct kernfs_open_file *of)
+{
+	struct task_struct *tsk;
+	int ret;
+
+	rcu_read_lock();
+	if (pid) {
+		tsk = find_task_by_vpid(pid);
+		if (!tsk) {
+			rcu_read_unlock();
+			return -ESRCH;
+		}
+	} else {
+		tsk = current;
+	}
+
+	get_task_struct(tsk);
+	rcu_read_unlock();
+
+	ret = rdtgroup_task_write_permission(tsk, of);
+	if (!ret)
+		ret = __rdtgroup_move_task(tsk, rdtgrp);
+
+	put_task_struct(tsk);
+	return ret;
+}
+
+static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
+				    char *buf, size_t nbytes, loff_t off)
+{
+	struct rdtgroup *rdtgrp;
+	int ret = 0;
+	pid_t pid;
+
+	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
+		return -EINVAL;
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+	if (rdtgrp)
+		ret = rdtgroup_move_task(pid, rdtgrp, of);
+	else
+		ret = -ENOENT;
+
+	rdtgroup_kn_unlock(of->kn);
+
+	return ret ?: nbytes;
+}
+
+static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
+{
+	struct task_struct *p, *t;
+
+	rcu_read_lock();
+	for_each_process_thread(p, t) {
+		if (t->closid == r->closid)
+			seq_printf(s, "%d\n", t->pid);
+	}
+	rcu_read_unlock();
+}
+
+static int rdtgroup_tasks_show(struct kernfs_open_file *of,
+			       struct seq_file *s, void *v)
+{
+	struct rdtgroup *rdtgrp;
+	int ret = 0;
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (rdtgrp)
+		show_rdt_tasks(rdtgrp, s);
+	else
+		ret = -ENOENT;
+	rdtgroup_kn_unlock(of->kn);
+
+	return ret;
+}
+
+/* Files in each rdtgroup */
+static struct rftype rdtgroup_base_files[] = {
+	{
+		.name		= "cpus",
+		.mode		= 0644,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.write		= rdtgroup_cpus_write,
+		.seq_show	= rdtgroup_cpus_show,
+	},
+	{
+		.name		= "tasks",
+		.mode		= 0644,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.write		= rdtgroup_tasks_write,
+		.seq_show	= rdtgroup_tasks_show,
+	},
+	{
+		.name		= "schemata",
+		.mode		= 0644,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.write		= rdtgroup_schemata_write,
+		.seq_show	= rdtgroup_schemata_show,
+	},
+};
+
+static int rdt_num_closids_show(struct kernfs_open_file *of,
+				struct seq_file *seq, void *v)
+{
+	struct rdt_resource *r = of->kn->parent->priv;
+
+	seq_printf(seq, "%d\n", r->num_closid);
+
+	return 0;
+}
+
+static int rdt_cbm_mask_show(struct kernfs_open_file *of,
+			     struct seq_file *seq, void *v)
+{
+	struct rdt_resource *r = of->kn->parent->priv;
+
+	seq_printf(seq, "%x\n", r->max_cbm);
+
+	return 0;
+}
+
+static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
+			     struct seq_file *seq, void *v)
+{
+	struct rdt_resource *r = of->kn->parent->priv;
+
+	seq_printf(seq, "%d\n", r->min_cbm_bits);
+
+	return 0;
+}
+
+/* rdtgroup information files for one cache resource. */
+static struct rftype res_info_files[] = {
+	{
+		.name		= "num_closids",
+		.mode		= 0444,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.seq_show	= rdt_num_closids_show,
+	},
+	{
+		.name		= "cbm_mask",
+		.mode		= 0444,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.seq_show	= rdt_cbm_mask_show,
+	},
+	{
+		.name		= "min_cbm_bits",
+		.mode		= 0444,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.seq_show	= rdt_min_cbm_bits_show,
+	},
+};
+
+static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
+{
+	struct kernfs_node *kn_subdir;
+	struct rdt_resource *r;
+	int ret;
+
+	/* create the directory */
+	kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
+	if (IS_ERR(kn_info))
+		return PTR_ERR(kn_info);
+	kernfs_get(kn_info);
+
+	for_each_enabled_rdt_resource(r) {
+		kn_subdir = kernfs_create_dir(kn_info, r->name,
+					      kn_info->mode, r);
+		if (IS_ERR(kn_subdir)) {
+			ret = PTR_ERR(kn_subdir);
+			goto out_destroy;
+		}
+		kernfs_get(kn_subdir);
+		ret = rdtgroup_kn_set_ugid(kn_subdir);
+		if (ret)
+			goto out_destroy;
+		ret = rdtgroup_add_files(kn_subdir, res_info_files,
+					 ARRAY_SIZE(res_info_files));
+		if (ret)
+			goto out_destroy;
+		kernfs_activate(kn_subdir);
+	}
+
+	/*
+	 * This extra ref will be put in kernfs_remove() and guarantees
+	 * that @rdtgrp->kn is always accessible.
+	 */
+	kernfs_get(kn_info);
+
+	ret = rdtgroup_kn_set_ugid(kn_info);
+	if (ret)
+		goto out_destroy;
+
+	kernfs_activate(kn_info);
+
+	return 0;
+
+out_destroy:
+	kernfs_remove(kn_info);
+	return ret;
+}
+
+static void l3_qos_cfg_update(void *arg)
+{
+	bool *enable = arg;
+
+	wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
+}
+
+static int set_l3_qos_cfg(struct rdt_resource *r, bool enable)
+{
+	cpumask_var_t cpu_mask;
+	struct rdt_domain *d;
+	int cpu;
+
+	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	list_for_each_entry(d, &r->domains, list) {
+		/* Pick one CPU from each domain instance to update MSR */
+		cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+	}
+	cpu = get_cpu();
+	/* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
+	if (cpumask_test_cpu(cpu, cpu_mask))
+		l3_qos_cfg_update(&enable);
+	/* Update QOS_CFG MSR on all other cpus in cpu_mask. */
+	smp_call_function_many(cpu_mask, l3_qos_cfg_update, &enable, 1);
+	put_cpu();
+
+	free_cpumask_var(cpu_mask);
+
+	return 0;
+}
+
+static int cdp_enable(void)
+{
+	struct rdt_resource *r_l3data = &rdt_resources_all[RDT_RESOURCE_L3DATA];
+	struct rdt_resource *r_l3code = &rdt_resources_all[RDT_RESOURCE_L3CODE];
+	struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
+	int ret;
+
+	if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable)
+		return -EINVAL;
+
+	ret = set_l3_qos_cfg(r_l3, true);
+	if (!ret) {
+		r_l3->enabled = false;
+		r_l3data->enabled = true;
+		r_l3code->enabled = true;
+	}
+	return ret;
+}
+
+static void cdp_disable(void)
+{
+	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+
+	r->enabled = r->capable;
+
+	if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) {
+		rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false;
+		rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false;
+		set_l3_qos_cfg(r, false);
+	}
+}
+
+static int parse_rdtgroupfs_options(char *data)
+{
+	char *token, *o = data;
+	int ret = 0;
+
+	while ((token = strsep(&o, ",")) != NULL) {
+		if (!*token)
+			return -EINVAL;
+
+		if (!strcmp(token, "cdp"))
+			ret = cdp_enable();
+	}
+
+	return ret;
+}
+
+/*
+ * We don't allow rdtgroup directories to be created anywhere
+ * except the root directory. Thus when looking for the rdtgroup
+ * structure for a kernfs node we are either looking at a directory,
+ * in which case the rdtgroup structure is pointed at by the "priv"
+ * field, otherwise we have a file, and need only look to the parent
+ * to find the rdtgroup.
+ */
+static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
+{
+	if (kernfs_type(kn) == KERNFS_DIR) {
+		/*
+		 * All the resource directories use "kn->priv"
+		 * to point to the "struct rdtgroup" for the
+		 * resource. "info" and its subdirectories don't
+		 * have rdtgroup structures, so return NULL here.
+		 */
+		if (kn == kn_info || kn->parent == kn_info)
+			return NULL;
+		else
+			return kn->priv;
+	} else {
+		return kn->parent->priv;
+	}
+}
+
+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
+{
+	struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
+
+	if (!rdtgrp)
+		return NULL;
+
+	atomic_inc(&rdtgrp->waitcount);
+	kernfs_break_active_protection(kn);
+
+	mutex_lock(&rdtgroup_mutex);
+
+	/* Was this group deleted while we waited? */
+	if (rdtgrp->flags & RDT_DELETED)
+		return NULL;
+
+	return rdtgrp;
+}
+
+void rdtgroup_kn_unlock(struct kernfs_node *kn)
+{
+	struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
+
+	if (!rdtgrp)
+		return;
+
+	mutex_unlock(&rdtgroup_mutex);
+
+	if (atomic_dec_and_test(&rdtgrp->waitcount) &&
+	    (rdtgrp->flags & RDT_DELETED)) {
+		kernfs_unbreak_active_protection(kn);
+		kernfs_put(kn);
+		kfree(rdtgrp);
+	} else {
+		kernfs_unbreak_active_protection(kn);
+	}
+}
+
+static struct dentry *rdt_mount(struct file_system_type *fs_type,
+				int flags, const char *unused_dev_name,
+				void *data)
+{
+	struct dentry *dentry;
+	int ret;
+
+	mutex_lock(&rdtgroup_mutex);
+	/*
+	 * resctrl file system can only be mounted once.
+	 */
+	if (static_branch_unlikely(&rdt_enable_key)) {
+		dentry = ERR_PTR(-EBUSY);
+		goto out;
+	}
+
+	ret = parse_rdtgroupfs_options(data);
+	if (ret) {
+		dentry = ERR_PTR(ret);
+		goto out_cdp;
+	}
+
+	closid_init();
+
+	ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
+	if (ret) {
+		dentry = ERR_PTR(ret);
+		goto out_cdp;
+	}
+
+	dentry = kernfs_mount(fs_type, flags, rdt_root,
+			      RDTGROUP_SUPER_MAGIC, NULL);
+	if (IS_ERR(dentry))
+		goto out_cdp;
+
+	static_branch_enable(&rdt_enable_key);
+	goto out;
+
+out_cdp:
+	cdp_disable();
+out:
+	mutex_unlock(&rdtgroup_mutex);
+
+	return dentry;
+}
+
+static int reset_all_cbms(struct rdt_resource *r)
+{
+	struct msr_param msr_param;
+	cpumask_var_t cpu_mask;
+	struct rdt_domain *d;
+	int i, cpu;
+
+	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	msr_param.res = r;
+	msr_param.low = 0;
+	msr_param.high = r->num_closid;
+
+	/*
+	 * Disable resource control for this resource by setting all
+	 * CBMs in all domains to the maximum mask value. Pick one CPU
+	 * from each domain to update the MSRs below.
+	 */
+	list_for_each_entry(d, &r->domains, list) {
+		cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+
+		for (i = 0; i < r->num_closid; i++)
+			d->cbm[i] = r->max_cbm;
+	}
+	cpu = get_cpu();
+	/* Update CBM on this cpu if it's in cpu_mask. */
+	if (cpumask_test_cpu(cpu, cpu_mask))
+		rdt_cbm_update(&msr_param);
+	/* Update CBM on all other cpus in cpu_mask. */
+	smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1);
+	put_cpu();
+
+	free_cpumask_var(cpu_mask);
+
+	return 0;
+}
+
+/*
+ * Move tasks from one to the other group. If @from is NULL, then all tasks
+ * in the systems are moved unconditionally (used for teardown).
+ *
+ * If @mask is not NULL the cpus on which moved tasks are running are set
+ * in that mask so the update smp function call is restricted to affected
+ * cpus.
+ */
+static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
+				 struct cpumask *mask)
+{
+	struct task_struct *p, *t;
+
+	read_lock(&tasklist_lock);
+	for_each_process_thread(p, t) {
+		if (!from || t->closid == from->closid) {
+			t->closid = to->closid;
+#ifdef CONFIG_SMP
+			/*
+			 * This is safe on x86 w/o barriers as the ordering
+			 * of writing to task_cpu() and t->on_cpu is
+			 * reverse to the reading here. The detection is
+			 * inaccurate as tasks might move or schedule
+			 * before the smp function call takes place. In
+			 * such a case the function call is pointless, but
+			 * there is no other side effect.
+			 */
+			if (mask && t->on_cpu)
+				cpumask_set_cpu(task_cpu(t), mask);
+#endif
+		}
+	}
+	read_unlock(&tasklist_lock);
+}
+
+/*
+ * Forcibly remove all of subdirectories under root.
+ */
+static void rmdir_all_sub(void)
+{
+	struct rdtgroup *rdtgrp, *tmp;
+
+	/* Move all tasks to the default resource group */
+	rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
+
+	list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
+		/* Remove each rdtgroup other than root */
+		if (rdtgrp == &rdtgroup_default)
+			continue;
+
+		/*
+		 * Give any CPUs back to the default group. We cannot copy
+		 * cpu_online_mask because a CPU might have executed the
+		 * offline callback already, but is still marked online.
+		 */
+		cpumask_or(&rdtgroup_default.cpu_mask,
+			   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+
+		kernfs_remove(rdtgrp->kn);
+		list_del(&rdtgrp->rdtgroup_list);
+		kfree(rdtgrp);
+	}
+	/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
+	get_online_cpus();
+	rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid);
+	put_online_cpus();
+
+	kernfs_remove(kn_info);
+}
+
+static void rdt_kill_sb(struct super_block *sb)
+{
+	struct rdt_resource *r;
+
+	mutex_lock(&rdtgroup_mutex);
+
+	/*Put everything back to default values. */
+	for_each_enabled_rdt_resource(r)
+		reset_all_cbms(r);
+	cdp_disable();
+	rmdir_all_sub();
+	static_branch_disable(&rdt_enable_key);
+	kernfs_kill_sb(sb);
+	mutex_unlock(&rdtgroup_mutex);
+}
+
+static struct file_system_type rdt_fs_type = {
+	.name    = "resctrl",
+	.mount   = rdt_mount,
+	.kill_sb = rdt_kill_sb,
+};
+
+static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+			  umode_t mode)
+{
+	struct rdtgroup *parent, *rdtgrp;
+	struct kernfs_node *kn;
+	int ret, closid;
+
+	/* Only allow mkdir in the root directory */
+	if (parent_kn != rdtgroup_default.kn)
+		return -EPERM;
+
+	/* Do not accept '\n' to avoid unparsable situation. */
+	if (strchr(name, '\n'))
+		return -EINVAL;
+
+	parent = rdtgroup_kn_lock_live(parent_kn);
+	if (!parent) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	ret = closid_alloc();
+	if (ret < 0)
+		goto out_unlock;
+	closid = ret;
+
+	/* allocate the rdtgroup. */
+	rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
+	if (!rdtgrp) {
+		ret = -ENOSPC;
+		goto out_closid_free;
+	}
+	rdtgrp->closid = closid;
+	list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
+
+	/* kernfs creates the directory for rdtgrp */
+	kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp);
+	if (IS_ERR(kn)) {
+		ret = PTR_ERR(kn);
+		goto out_cancel_ref;
+	}
+	rdtgrp->kn = kn;
+
+	/*
+	 * kernfs_remove() will drop the reference count on "kn" which
+	 * will free it. But we still need it to stick around for the
+	 * rdtgroup_kn_unlock(kn} call below. Take one extra reference
+	 * here, which will be dropped inside rdtgroup_kn_unlock().
+	 */
+	kernfs_get(kn);
+
+	ret = rdtgroup_kn_set_ugid(kn);
+	if (ret)
+		goto out_destroy;
+
+	ret = rdtgroup_add_files(kn, rdtgroup_base_files,
+				 ARRAY_SIZE(rdtgroup_base_files));
+	if (ret)
+		goto out_destroy;
+
+	kernfs_activate(kn);
+
+	ret = 0;
+	goto out_unlock;
+
+out_destroy:
+	kernfs_remove(rdtgrp->kn);
+out_cancel_ref:
+	list_del(&rdtgrp->rdtgroup_list);
+	kfree(rdtgrp);
+out_closid_free:
+	closid_free(closid);
+out_unlock:
+	rdtgroup_kn_unlock(parent_kn);
+	return ret;
+}
+
+static int rdtgroup_rmdir(struct kernfs_node *kn)
+{
+	int ret, cpu, closid = rdtgroup_default.closid;
+	struct rdtgroup *rdtgrp;
+	cpumask_var_t tmpmask;
+
+	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+		return -ENOMEM;
+
+	rdtgrp = rdtgroup_kn_lock_live(kn);
+	if (!rdtgrp) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	/* Give any tasks back to the default group */
+	rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
+
+	/* Give any CPUs back to the default group */
+	cpumask_or(&rdtgroup_default.cpu_mask,
+		   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+
+	/* Update per cpu closid of the moved CPUs first */
+	for_each_cpu(cpu, &rdtgrp->cpu_mask)
+		per_cpu(cpu_closid, cpu) = closid;
+	/*
+	 * Update the MSR on moved CPUs and CPUs which have moved
+	 * task running on them.
+	 */
+	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
+	rdt_update_closid(tmpmask, NULL);
+
+	rdtgrp->flags = RDT_DELETED;
+	closid_free(rdtgrp->closid);
+	list_del(&rdtgrp->rdtgroup_list);
+
+	/*
+	 * one extra hold on this, will drop when we kfree(rdtgrp)
+	 * in rdtgroup_kn_unlock()
+	 */
+	kernfs_get(kn);
+	kernfs_remove(rdtgrp->kn);
+	ret = 0;
+out:
+	rdtgroup_kn_unlock(kn);
+	free_cpumask_var(tmpmask);
+	return ret;
+}
+
+static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
+{
+	if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled)
+		seq_puts(seq, ",cdp");
+	return 0;
+}
+
+static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
+	.mkdir		= rdtgroup_mkdir,
+	.rmdir		= rdtgroup_rmdir,
+	.show_options	= rdtgroup_show_options,
+};
+
+static int __init rdtgroup_setup_root(void)
+{
+	int ret;
+
+	rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
+				      KERNFS_ROOT_CREATE_DEACTIVATED,
+				      &rdtgroup_default);
+	if (IS_ERR(rdt_root))
+		return PTR_ERR(rdt_root);
+
+	mutex_lock(&rdtgroup_mutex);
+
+	rdtgroup_default.closid = 0;
+	list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
+
+	ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files,
+				 ARRAY_SIZE(rdtgroup_base_files));
+	if (ret) {
+		kernfs_destroy_root(rdt_root);
+		goto out;
+	}
+
+	rdtgroup_default.kn = rdt_root->kn;
+	kernfs_activate(rdtgroup_default.kn);
+
+out:
+	mutex_unlock(&rdtgroup_mutex);
+
+	return ret;
+}
+
+/*
+ * rdtgroup_init - rdtgroup initialization
+ *
+ * Setup resctrl file system including set up root, create mount point,
+ * register rdtgroup filesystem, and initialize files under root directory.
+ *
+ * Return: 0 on success or -errno
+ */
+int __init rdtgroup_init(void)
+{
+	int ret = 0;
+
+	ret = rdtgroup_setup_root();
+	if (ret)
+		return ret;
+
+	ret = sysfs_create_mount_point(fs_kobj, "resctrl");
+	if (ret)
+		goto cleanup_root;
+
+	ret = register_filesystem(&rdt_fs_type);
+	if (ret)
+		goto cleanup_mountpoint;
+
+	return 0;
+
+cleanup_mountpoint:
+	sysfs_remove_mount_point(fs_kobj, "resctrl");
+cleanup_root:
+	kernfs_destroy_root(rdt_root);
+
+	return ret;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c
new file mode 100644
index 000000000000..f369cb8db0d5
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c
@@ -0,0 +1,245 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ *    Fenghua Yu <fenghua.yu@intel.com>
+ *    Tony Luck <tony.luck@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/kernfs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <asm/intel_rdt.h>
+
+/*
+ * Check whether a cache bit mask is valid. The SDM says:
+ *	Please note that all (and only) contiguous '1' combinations
+ *	are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
+ * Additionally Haswell requires at least two bits set.
+ */
+static bool cbm_validate(unsigned long var, struct rdt_resource *r)
+{
+	unsigned long first_bit, zero_bit;
+
+	if (var == 0 || var > r->max_cbm)
+		return false;
+
+	first_bit = find_first_bit(&var, r->cbm_len);
+	zero_bit = find_next_zero_bit(&var, r->cbm_len, first_bit);
+
+	if (find_next_bit(&var, r->cbm_len, zero_bit) < r->cbm_len)
+		return false;
+
+	if ((zero_bit - first_bit) < r->min_cbm_bits)
+		return false;
+	return true;
+}
+
+/*
+ * Read one cache bit mask (hex). Check that it is valid for the current
+ * resource type.
+ */
+static int parse_cbm(char *buf, struct rdt_resource *r)
+{
+	unsigned long data;
+	int ret;
+
+	ret = kstrtoul(buf, 16, &data);
+	if (ret)
+		return ret;
+	if (!cbm_validate(data, r))
+		return -EINVAL;
+	r->tmp_cbms[r->num_tmp_cbms++] = data;
+
+	return 0;
+}
+
+/*
+ * For each domain in this resource we expect to find a series of:
+ *	id=mask
+ * separated by ";". The "id" is in decimal, and must appear in the
+ * right order.
+ */
+static int parse_line(char *line, struct rdt_resource *r)
+{
+	char *dom = NULL, *id;
+	struct rdt_domain *d;
+	unsigned long dom_id;
+
+	list_for_each_entry(d, &r->domains, list) {
+		dom = strsep(&line, ";");
+		if (!dom)
+			return -EINVAL;
+		id = strsep(&dom, "=");
+		if (kstrtoul(id, 10, &dom_id) || dom_id != d->id)
+			return -EINVAL;
+		if (parse_cbm(dom, r))
+			return -EINVAL;
+	}
+
+	/* Any garbage at the end of the line? */
+	if (line && line[0])
+		return -EINVAL;
+	return 0;
+}
+
+static int update_domains(struct rdt_resource *r, int closid)
+{
+	struct msr_param msr_param;
+	cpumask_var_t cpu_mask;
+	struct rdt_domain *d;
+	int cpu, idx = 0;
+
+	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	msr_param.low = closid;
+	msr_param.high = msr_param.low + 1;
+	msr_param.res = r;
+
+	list_for_each_entry(d, &r->domains, list) {
+		cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+		d->cbm[msr_param.low] = r->tmp_cbms[idx++];
+	}
+	cpu = get_cpu();
+	/* Update CBM on this cpu if it's in cpu_mask. */
+	if (cpumask_test_cpu(cpu, cpu_mask))
+		rdt_cbm_update(&msr_param);
+	/* Update CBM on other cpus. */
+	smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1);
+	put_cpu();
+
+	free_cpumask_var(cpu_mask);
+
+	return 0;
+}
+
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct rdtgroup *rdtgrp;
+	struct rdt_resource *r;
+	char *tok, *resname;
+	int closid, ret = 0;
+	u32 *l3_cbms = NULL;
+
+	/* Valid input requires a trailing newline */
+	if (nbytes == 0 || buf[nbytes - 1] != '\n')
+		return -EINVAL;
+	buf[nbytes - 1] = '\0';
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (!rdtgrp) {
+		rdtgroup_kn_unlock(of->kn);
+		return -ENOENT;
+	}
+
+	closid = rdtgrp->closid;
+
+	/* get scratch space to save all the masks while we validate input */
+	for_each_enabled_rdt_resource(r) {
+		r->tmp_cbms = kcalloc(r->num_domains, sizeof(*l3_cbms),
+				      GFP_KERNEL);
+		if (!r->tmp_cbms) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		r->num_tmp_cbms = 0;
+	}
+
+	while ((tok = strsep(&buf, "\n")) != NULL) {
+		resname = strsep(&tok, ":");
+		if (!tok) {
+			ret = -EINVAL;
+			goto out;
+		}
+		for_each_enabled_rdt_resource(r) {
+			if (!strcmp(resname, r->name) &&
+			    closid < r->num_closid) {
+				ret = parse_line(tok, r);
+				if (ret)
+					goto out;
+				break;
+			}
+		}
+		if (!r->name) {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	/* Did the parser find all the masks we need? */
+	for_each_enabled_rdt_resource(r) {
+		if (r->num_tmp_cbms != r->num_domains) {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	for_each_enabled_rdt_resource(r) {
+		ret = update_domains(r, closid);
+		if (ret)
+			goto out;
+	}
+
+out:
+	rdtgroup_kn_unlock(of->kn);
+	for_each_enabled_rdt_resource(r) {
+		kfree(r->tmp_cbms);
+		r->tmp_cbms = NULL;
+	}
+	return ret ?: nbytes;
+}
+
+static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)
+{
+	struct rdt_domain *dom;
+	bool sep = false;
+
+	seq_printf(s, "%s:", r->name);
+	list_for_each_entry(dom, &r->domains, list) {
+		if (sep)
+			seq_puts(s, ";");
+		seq_printf(s, "%d=%x", dom->id, dom->cbm[closid]);
+		sep = true;
+	}
+	seq_puts(s, "\n");
+}
+
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+			   struct seq_file *s, void *v)
+{
+	struct rdtgroup *rdtgrp;
+	struct rdt_resource *r;
+	int closid, ret = 0;
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (rdtgrp) {
+		closid = rdtgrp->closid;
+		for_each_enabled_rdt_resource(r) {
+			if (closid < r->num_closid)
+				show_doms(s, r, closid);
+		}
+	} else {
+		ret = -ENOENT;
+	}
+	rdtgroup_kn_unlock(of->kn);
+	return ret;
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 631356c8cca4..c7efbcfbeda6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -311,7 +311,7 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e
 			*msg = s->msg;
 		s->covered = 1;
 		if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
-			if (panic_on_oops || tolerant < 1)
+			if (tolerant < 1)
 				return MCE_PANIC_SEVERITY;
 		}
 		return s->sev;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a7fdf453d895..00ef43233e03 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -43,6 +43,7 @@
 #include <linux/export.h>
 #include <linux/jump_label.h>
 
+#include <asm/intel-family.h>
 #include <asm/processor.h>
 #include <asm/traps.h>
 #include <asm/tlbflush.h>
@@ -135,6 +136,9 @@ void mce_setup(struct mce *m)
 	m->socketid = cpu_data(m->extcpu).phys_proc_id;
 	m->apicid = cpu_data(m->extcpu).initial_apicid;
 	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
+
+	if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
+		rdmsrl(MSR_PPIN, m->ppin);
 }
 
 DEFINE_PER_CPU(struct mce, injectm);
@@ -207,8 +211,12 @@ EXPORT_SYMBOL_GPL(mce_inject_log);
 
 static struct notifier_block mce_srao_nb;
 
+static atomic_t num_notifiers;
+
 void mce_register_decode_chain(struct notifier_block *nb)
 {
+	atomic_inc(&num_notifiers);
+
 	/* Ensure SRAO notifier has the highest priority in the decode chain. */
 	if (nb != &mce_srao_nb && nb->priority == INT_MAX)
 		nb->priority -= 1;
@@ -219,6 +227,8 @@ EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 
 void mce_unregister_decode_chain(struct notifier_block *nb)
 {
+	atomic_dec(&num_notifiers);
+
 	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
 }
 EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
@@ -270,17 +280,17 @@ struct mca_msr_regs msr_ops = {
 	.misc	= misc_reg
 };
 
-static void print_mce(struct mce *m)
+static void __print_mce(struct mce *m)
 {
-	int ret = 0;
-
-	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
-	       m->extcpu, m->mcgstatus, m->bank, m->status);
+	pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
+		 m->extcpu,
+		 (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
+		 m->mcgstatus, m->bank, m->status);
 
 	if (m->ip) {
 		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
 			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
-				m->cs, m->ip);
+			m->cs, m->ip);
 
 		if (m->cs == __KERNEL_CS)
 			print_symbol("{%s}", m->ip);
@@ -308,6 +318,13 @@ static void print_mce(struct mce *m)
 	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
 		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
 		cpu_data(m->extcpu).microcode);
+}
+
+static void print_mce(struct mce *m)
+{
+	int ret = 0;
+
+	__print_mce(m);
 
 	/*
 	 * Print out human-readable details about the MCE error,
@@ -499,7 +516,7 @@ int mce_available(struct cpuinfo_x86 *c)
 
 static void mce_schedule_work(void)
 {
-	if (!mce_gen_pool_empty() && keventd_up())
+	if (!mce_gen_pool_empty())
 		schedule_work(&mce_work);
 }
 
@@ -569,6 +586,32 @@ static struct notifier_block mce_srao_nb = {
 	.priority = INT_MAX,
 };
 
+static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
+				void *data)
+{
+	struct mce *m = (struct mce *)data;
+
+	if (!m)
+		return NOTIFY_DONE;
+
+	/*
+	 * Run the default notifier if we have only the SRAO
+	 * notifier and us registered.
+	 */
+	if (atomic_read(&num_notifiers) > 2)
+		return NOTIFY_DONE;
+
+	__print_mce(m);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block mce_default_nb = {
+	.notifier_call	= mce_default_notifier,
+	/* lowest prio, we want it to run last. */
+	.priority	= 0,
+};
+
 /*
  * Read ADDR and MISC registers.
  */
@@ -667,6 +710,15 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 
 	mce_gather_info(&m, NULL);
 
+	/*
+	 * m.tsc was set in mce_setup(). Clear it if not requested.
+	 *
+	 * FIXME: Propagate @flags to mce_gather_info/mce_setup() to avoid
+	 *	  that dance.
+	 */
+	if (!(flags & MCP_TIMESTAMP))
+		m.tsc = 0;
+
 	for (i = 0; i < mca_cfg.banks; i++) {
 		if (!mce_banks[i].ctl || !test_bit(i, *b))
 			continue;
@@ -674,14 +726,12 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		m.misc = 0;
 		m.addr = 0;
 		m.bank = i;
-		m.tsc = 0;
 
 		barrier();
 		m.status = mce_rdmsrl(msr_ops.status(i));
 		if (!(m.status & MCI_STATUS_VAL))
 			continue;
 
-
 		/*
 		 * Uncorrected or signalled events are handled by the exception
 		 * handler when it is enabled, so don't process those here.
@@ -696,9 +746,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 
 		mce_read_aux(&m, i);
 
-		if (!(flags & MCP_TIMESTAMP))
-			m.tsc = 0;
-
 		severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
 
 		if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
@@ -1355,7 +1402,7 @@ static void mce_timer_fn(unsigned long data)
 	iv = __this_cpu_read(mce_next_interval);
 
 	if (mce_available(this_cpu_ptr(&cpu_info))) {
-		machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
+		machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
 
 		if (mce_intel_cmci_poll()) {
 			iv = mce_adjust_timer(iv);
@@ -1745,6 +1792,14 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t)
 	add_timer_on(t, cpu);
 }
 
+static void __mcheck_cpu_setup_timer(void)
+{
+	struct timer_list *t = this_cpu_ptr(&mce_timer);
+	unsigned int cpu = smp_processor_id();
+
+	setup_pinned_timer(t, mce_timer_fn, cpu);
+}
+
 static void __mcheck_cpu_init_timer(void)
 {
 	struct timer_list *t = this_cpu_ptr(&mce_timer);
@@ -1796,7 +1851,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
 	__mcheck_cpu_init_generic();
 	__mcheck_cpu_init_vendor(c);
 	__mcheck_cpu_init_clear_banks();
-	__mcheck_cpu_init_timer();
+	__mcheck_cpu_setup_timer();
 }
 
 /*
@@ -2138,6 +2193,7 @@ int __init mcheck_init(void)
 {
 	mcheck_intel_therm_init();
 	mce_register_decode_chain(&mce_srao_nb);
+	mce_register_decode_chain(&mce_default_nb);
 	mcheck_vendor_init_severity();
 
 	INIT_WORK(&mce_work, mce_process_work);
@@ -2255,8 +2311,6 @@ static struct bus_type mce_subsys = {
 
 DEFINE_PER_CPU(struct device *, mce_device);
 
-void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
-
 static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
 {
 	return container_of(attr, struct mce_bank, attr);
@@ -2409,6 +2463,10 @@ static int mce_device_create(unsigned int cpu)
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
+	dev = per_cpu(mce_device, cpu);
+	if (dev)
+		return 0;
+
 	dev = kzalloc(sizeof *dev, GFP_KERNEL);
 	if (!dev)
 		return -ENOMEM;
@@ -2468,28 +2526,25 @@ static void mce_device_remove(unsigned int cpu)
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
-static void mce_disable_cpu(void *h)
+static void mce_disable_cpu(void)
 {
-	unsigned long action = *(unsigned long *)h;
-
 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
 		return;
 
-	if (!(action & CPU_TASKS_FROZEN))
+	if (!cpuhp_tasks_frozen)
 		cmci_clear();
 
 	vendor_disable_error_reporting();
 }
 
-static void mce_reenable_cpu(void *h)
+static void mce_reenable_cpu(void)
 {
-	unsigned long action = *(unsigned long *)h;
 	int i;
 
 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
 		return;
 
-	if (!(action & CPU_TASKS_FROZEN))
+	if (!cpuhp_tasks_frozen)
 		cmci_reenable();
 	for (i = 0; i < mca_cfg.banks; i++) {
 		struct mce_bank *b = &mce_banks[i];
@@ -2499,45 +2554,43 @@ static void mce_reenable_cpu(void *h)
 	}
 }
 
-/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int
-mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+static int mce_cpu_dead(unsigned int cpu)
+{
+	mce_intel_hcpu_update(cpu);
+
+	/* intentionally ignoring frozen here */
+	if (!cpuhp_tasks_frozen)
+		cmci_rediscover();
+	return 0;
+}
+
+static int mce_cpu_online(unsigned int cpu)
 {
-	unsigned int cpu = (unsigned long)hcpu;
 	struct timer_list *t = &per_cpu(mce_timer, cpu);
+	int ret;
 
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_ONLINE:
-		mce_device_create(cpu);
-		if (threshold_cpu_callback)
-			threshold_cpu_callback(action, cpu);
-		break;
-	case CPU_DEAD:
-		if (threshold_cpu_callback)
-			threshold_cpu_callback(action, cpu);
-		mce_device_remove(cpu);
-		mce_intel_hcpu_update(cpu);
+	mce_device_create(cpu);
 
-		/* intentionally ignoring frozen here */
-		if (!(action & CPU_TASKS_FROZEN))
-			cmci_rediscover();
-		break;
-	case CPU_DOWN_PREPARE:
-		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
-		del_timer_sync(t);
-		break;
-	case CPU_DOWN_FAILED:
-		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
-		mce_start_timer(cpu, t);
-		break;
+	ret = mce_threshold_create_device(cpu);
+	if (ret) {
+		mce_device_remove(cpu);
+		return ret;
 	}
-
-	return NOTIFY_OK;
+	mce_reenable_cpu();
+	mce_start_timer(cpu, t);
+	return 0;
 }
 
-static struct notifier_block mce_cpu_notifier = {
-	.notifier_call = mce_cpu_callback,
-};
+static int mce_cpu_pre_down(unsigned int cpu)
+{
+	struct timer_list *t = &per_cpu(mce_timer, cpu);
+
+	mce_disable_cpu();
+	del_timer_sync(t);
+	mce_threshold_remove_device(cpu);
+	mce_device_remove(cpu);
+	return 0;
+}
 
 static __init void mce_init_banks(void)
 {
@@ -2559,8 +2612,8 @@ static __init void mce_init_banks(void)
 
 static __init int mcheck_init_device(void)
 {
+	enum cpuhp_state hp_online;
 	int err;
-	int i = 0;
 
 	if (!mce_available(&boot_cpu_data)) {
 		err = -EIO;
@@ -2578,23 +2631,16 @@ static __init int mcheck_init_device(void)
 	if (err)
 		goto err_out_mem;
 
-	cpu_notifier_register_begin();
-	for_each_online_cpu(i) {
-		err = mce_device_create(i);
-		if (err) {
-			/*
-			 * Register notifier anyway (and do not unreg it) so
-			 * that we don't leave undeleted timers, see notifier
-			 * callback above.
-			 */
-			__register_hotcpu_notifier(&mce_cpu_notifier);
-			cpu_notifier_register_done();
-			goto err_device_create;
-		}
-	}
+	err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
+				mce_cpu_dead);
+	if (err)
+		goto err_out_mem;
 
-	__register_hotcpu_notifier(&mce_cpu_notifier);
-	cpu_notifier_register_done();
+	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
+				mce_cpu_online, mce_cpu_pre_down);
+	if (err < 0)
+		goto err_out_online;
+	hp_online = err;
 
 	register_syscore_ops(&mce_syscore_ops);
 
@@ -2607,16 +2653,10 @@ static __init int mcheck_init_device(void)
 
 err_register:
 	unregister_syscore_ops(&mce_syscore_ops);
+	cpuhp_remove_state(hp_online);
 
-err_device_create:
-	/*
-	 * We didn't keep track of which devices were created above, but
-	 * even if we had, the set of online cpus might have changed.
-	 * Play safe and remove for every possible cpu, since
-	 * mce_device_remove() will do the right thing.
-	 */
-	for_each_possible_cpu(i)
-		mce_device_remove(i);
+err_out_online:
+	cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
 
 err_out_mem:
 	free_cpumask_var(mce_device_initialized);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 9b5403462936..ffacfdcacb85 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -24,7 +24,6 @@
 
 #include <asm/amd_nb.h>
 #include <asm/apic.h>
-#include <asm/idle.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
 #include <asm/trace/irq_vectors.h>
@@ -55,6 +54,8 @@
 /* Threshold LVT offset is at MSR0xC0000410[15:12] */
 #define SMCA_THR_LVT_OFF	0xF000
 
+static bool thresholding_en;
+
 static const char * const th_names[] = {
 	"load_store",
 	"insn_fetch",
@@ -69,7 +70,12 @@ static const char * const smca_umc_block_names[] = {
 	"misc_umc"
 };
 
-struct smca_bank_name smca_bank_names[] = {
+struct smca_bank_name {
+	const char *name;	/* Short name for sysfs */
+	const char *long_name;	/* Long name for pretty-printing */
+};
+
+static struct smca_bank_name smca_names[] = {
 	[SMCA_LS]	= { "load_store",	"Load Store Unit" },
 	[SMCA_IF]	= { "insn_fetch",	"Instruction Fetch Unit" },
 	[SMCA_L2_CACHE]	= { "l2_cache",		"L2 Cache" },
@@ -84,9 +90,25 @@ struct smca_bank_name smca_bank_names[] = {
 	[SMCA_PSP]	= { "psp",		"Platform Security Processor" },
 	[SMCA_SMU]	= { "smu",		"System Management Unit" },
 };
-EXPORT_SYMBOL_GPL(smca_bank_names);
 
-static struct smca_hwid_mcatype smca_hwid_mcatypes[] = {
+const char *smca_get_name(enum smca_bank_types t)
+{
+	if (t >= N_SMCA_BANK_TYPES)
+		return NULL;
+
+	return smca_names[t].name;
+}
+
+const char *smca_get_long_name(enum smca_bank_types t)
+{
+	if (t >= N_SMCA_BANK_TYPES)
+		return NULL;
+
+	return smca_names[t].long_name;
+}
+EXPORT_SYMBOL_GPL(smca_get_long_name);
+
+static struct smca_hwid smca_hwid_mcatypes[] = {
 	/* { bank_type, hwid_mcatype, xec_bitmap } */
 
 	/* ZN Core (HWID=0xB0) MCA types */
@@ -116,7 +138,7 @@ static struct smca_hwid_mcatype smca_hwid_mcatypes[] = {
 	{ SMCA_SMU,	 HWID_MCATYPE(0x01, 0x0), 0x1 },
 };
 
-struct smca_bank_info smca_banks[MAX_NR_BANKS];
+struct smca_bank smca_banks[MAX_NR_BANKS];
 EXPORT_SYMBOL_GPL(smca_banks);
 
 /*
@@ -142,35 +164,34 @@ static void default_deferred_error_interrupt(void)
 }
 void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
 
-/*
- * CPU Initialization
- */
-
 static void get_smca_bank_info(unsigned int bank)
 {
 	unsigned int i, hwid_mcatype, cpu = smp_processor_id();
-	struct smca_hwid_mcatype *type;
-	u32 high, instanceId;
-	u16 hwid, mcatype;
+	struct smca_hwid *s_hwid;
+	u32 high, instance_id;
 
 	/* Collect bank_info using CPU 0 for now. */
 	if (cpu)
 		return;
 
-	if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instanceId, &high)) {
+	if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) {
 		pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
 		return;
 	}
 
-	hwid = high & MCI_IPID_HWID;
-	mcatype = (high & MCI_IPID_MCATYPE) >> 16;
-	hwid_mcatype = HWID_MCATYPE(hwid, mcatype);
+	hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID,
+				    (high & MCI_IPID_MCATYPE) >> 16);
 
 	for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
-		type = &smca_hwid_mcatypes[i];
-		if (hwid_mcatype == type->hwid_mcatype) {
-			smca_banks[bank].type = type;
-			smca_banks[bank].type_instance = instanceId;
+		s_hwid = &smca_hwid_mcatypes[i];
+		if (hwid_mcatype == s_hwid->hwid_mcatype) {
+
+			WARN(smca_banks[bank].hwid,
+			     "Bank %s already initialized!\n",
+			     smca_get_name(s_hwid->bank_type));
+
+			smca_banks[bank].hwid = s_hwid;
+			smca_banks[bank].id = instance_id;
 			break;
 		}
 	}
@@ -533,6 +554,206 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 		deferred_error_interrupt_enable(c);
 }
 
+int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
+{
+	u64 dram_base_addr, dram_limit_addr, dram_hole_base;
+	/* We start from the normalized address */
+	u64 ret_addr = norm_addr;
+
+	u32 tmp;
+
+	u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask;
+	u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets;
+	u8 intlv_addr_sel, intlv_addr_bit;
+	u8 num_intlv_bits, hashed_bit;
+	u8 lgcy_mmio_hole_en, base = 0;
+	u8 cs_mask, cs_id = 0;
+	bool hash_enabled = false;
+
+	/* Read D18F0x1B4 (DramOffset), check if base 1 is used. */
+	if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp))
+		goto out_err;
+
+	/* Remove HiAddrOffset from normalized address, if enabled: */
+	if (tmp & BIT(0)) {
+		u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8;
+
+		if (norm_addr >= hi_addr_offset) {
+			ret_addr -= hi_addr_offset;
+			base = 1;
+		}
+	}
+
+	/* Read D18F0x110 (DramBaseAddress). */
+	if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp))
+		goto out_err;
+
+	/* Check if address range is valid. */
+	if (!(tmp & BIT(0))) {
+		pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n",
+			__func__, tmp);
+		goto out_err;
+	}
+
+	lgcy_mmio_hole_en = tmp & BIT(1);
+	intlv_num_chan	  = (tmp >> 4) & 0xF;
+	intlv_addr_sel	  = (tmp >> 8) & 0x7;
+	dram_base_addr	  = (tmp & GENMASK_ULL(31, 12)) << 16;
+
+	/* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */
+	if (intlv_addr_sel > 3) {
+		pr_err("%s: Invalid interleave address select %d.\n",
+			__func__, intlv_addr_sel);
+		goto out_err;
+	}
+
+	/* Read D18F0x114 (DramLimitAddress). */
+	if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp))
+		goto out_err;
+
+	intlv_num_sockets = (tmp >> 8) & 0x1;
+	intlv_num_dies	  = (tmp >> 10) & 0x3;
+	dram_limit_addr	  = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0);
+
+	intlv_addr_bit = intlv_addr_sel + 8;
+
+	/* Re-use intlv_num_chan by setting it equal to log2(#channels) */
+	switch (intlv_num_chan) {
+	case 0:	intlv_num_chan = 0; break;
+	case 1: intlv_num_chan = 1; break;
+	case 3: intlv_num_chan = 2; break;
+	case 5:	intlv_num_chan = 3; break;
+	case 7:	intlv_num_chan = 4; break;
+
+	case 8: intlv_num_chan = 1;
+		hash_enabled = true;
+		break;
+	default:
+		pr_err("%s: Invalid number of interleaved channels %d.\n",
+			__func__, intlv_num_chan);
+		goto out_err;
+	}
+
+	num_intlv_bits = intlv_num_chan;
+
+	if (intlv_num_dies > 2) {
+		pr_err("%s: Invalid number of interleaved nodes/dies %d.\n",
+			__func__, intlv_num_dies);
+		goto out_err;
+	}
+
+	num_intlv_bits += intlv_num_dies;
+
+	/* Add a bit if sockets are interleaved. */
+	num_intlv_bits += intlv_num_sockets;
+
+	/* Assert num_intlv_bits <= 4 */
+	if (num_intlv_bits > 4) {
+		pr_err("%s: Invalid interleave bits %d.\n",
+			__func__, num_intlv_bits);
+		goto out_err;
+	}
+
+	if (num_intlv_bits > 0) {
+		u64 temp_addr_x, temp_addr_i, temp_addr_y;
+		u8 die_id_bit, sock_id_bit, cs_fabric_id;
+
+		/*
+		 * Read FabricBlockInstanceInformation3_CS[BlockFabricID].
+		 * This is the fabric id for this coherent slave. Use
+		 * umc/channel# as instance id of the coherent slave
+		 * for FICAA.
+		 */
+		if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp))
+			goto out_err;
+
+		cs_fabric_id = (tmp >> 8) & 0xFF;
+		die_id_bit   = 0;
+
+		/* If interleaved over more than 1 channel: */
+		if (intlv_num_chan) {
+			die_id_bit = intlv_num_chan;
+			cs_mask	   = (1 << die_id_bit) - 1;
+			cs_id	   = cs_fabric_id & cs_mask;
+		}
+
+		sock_id_bit = die_id_bit;
+
+		/* Read D18F1x208 (SystemFabricIdMask). */
+		if (intlv_num_dies || intlv_num_sockets)
+			if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp))
+				goto out_err;
+
+		/* If interleaved over more than 1 die. */
+		if (intlv_num_dies) {
+			sock_id_bit  = die_id_bit + intlv_num_dies;
+			die_id_shift = (tmp >> 24) & 0xF;
+			die_id_mask  = (tmp >> 8) & 0xFF;
+
+			cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit;
+		}
+
+		/* If interleaved over more than 1 socket. */
+		if (intlv_num_sockets) {
+			socket_id_shift	= (tmp >> 28) & 0xF;
+			socket_id_mask	= (tmp >> 16) & 0xFF;
+
+			cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit;
+		}
+
+		/*
+		 * The pre-interleaved address consists of XXXXXXIIIYYYYY
+		 * where III is the ID for this CS, and XXXXXXYYYYY are the
+		 * address bits from the post-interleaved address.
+		 * "num_intlv_bits" has been calculated to tell us how many "I"
+		 * bits there are. "intlv_addr_bit" tells us how many "Y" bits
+		 * there are (where "I" starts).
+		 */
+		temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0);
+		temp_addr_i = (cs_id << intlv_addr_bit);
+		temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits;
+		ret_addr    = temp_addr_x | temp_addr_i | temp_addr_y;
+	}
+
+	/* Add dram base address */
+	ret_addr += dram_base_addr;
+
+	/* If legacy MMIO hole enabled */
+	if (lgcy_mmio_hole_en) {
+		if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp))
+			goto out_err;
+
+		dram_hole_base = tmp & GENMASK(31, 24);
+		if (ret_addr >= dram_hole_base)
+			ret_addr += (BIT_ULL(32) - dram_hole_base);
+	}
+
+	if (hash_enabled) {
+		/* Save some parentheses and grab ls-bit at the end. */
+		hashed_bit =	(ret_addr >> 12) ^
+				(ret_addr >> 18) ^
+				(ret_addr >> 21) ^
+				(ret_addr >> 30) ^
+				cs_id;
+
+		hashed_bit &= BIT(0);
+
+		if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0)))
+			ret_addr ^= BIT(intlv_addr_bit);
+	}
+
+	/* Is calculated system address is above DRAM limit address? */
+	if (ret_addr > dram_limit_addr)
+		goto out_err;
+
+	*sys_addr = ret_addr;
+	return 0;
+
+out_err:
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
+
 static void
 __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
 {
@@ -645,6 +866,7 @@ static void amd_threshold_interrupt(void)
 {
 	u32 low = 0, high = 0, address = 0;
 	unsigned int bank, block, cpu = smp_processor_id();
+	struct thresh_restart tr;
 
 	/* assume first bank caused it */
 	for (bank = 0; bank < mca_cfg.banks; ++bank) {
@@ -681,6 +903,11 @@ static void amd_threshold_interrupt(void)
 
 log:
 	__log_error(bank, false, true, ((u64)high << 32) | low);
+
+	/* Reset threshold block after logging error. */
+	memset(&tr, 0, sizeof(tr));
+	tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block];
+	threshold_restart_bank(&tr);
 }
 
 /*
@@ -826,10 +1053,10 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
 		return th_names[bank];
 	}
 
-	if (!smca_banks[bank].type)
+	if (!smca_banks[bank].hwid)
 		return NULL;
 
-	bank_type = smca_banks[bank].type->bank_type;
+	bank_type = smca_banks[bank].hwid->bank_type;
 
 	if (b && bank_type == SMCA_UMC) {
 		if (b->block < ARRAY_SIZE(smca_umc_block_names))
@@ -838,8 +1065,8 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
 	}
 
 	snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
-		 "%s_%x", smca_bank_names[bank_type].name,
-			  smca_banks[bank].type_instance);
+		 "%s_%x", smca_get_name(bank_type),
+			  smca_banks[bank].id);
 	return buf_mcatype;
 }
 
@@ -1010,31 +1237,6 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
 	return err;
 }
 
-/* create dir/files for all valid threshold banks */
-static int threshold_create_device(unsigned int cpu)
-{
-	unsigned int bank;
-	struct threshold_bank **bp;
-	int err = 0;
-
-	bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks,
-		     GFP_KERNEL);
-	if (!bp)
-		return -ENOMEM;
-
-	per_cpu(threshold_banks, cpu) = bp;
-
-	for (bank = 0; bank < mca_cfg.banks; ++bank) {
-		if (!(per_cpu(bank_map, cpu) & (1 << bank)))
-			continue;
-		err = threshold_create_bank(cpu, bank);
-		if (err)
-			return err;
-	}
-
-	return err;
-}
-
 static void deallocate_threshold_block(unsigned int cpu,
 						 unsigned int bank)
 {
@@ -1102,48 +1304,71 @@ free_out:
 	per_cpu(threshold_banks, cpu)[bank] = NULL;
 }
 
-static void threshold_remove_device(unsigned int cpu)
+int mce_threshold_remove_device(unsigned int cpu)
 {
 	unsigned int bank;
 
+	if (!thresholding_en)
+		return 0;
+
 	for (bank = 0; bank < mca_cfg.banks; ++bank) {
 		if (!(per_cpu(bank_map, cpu) & (1 << bank)))
 			continue;
 		threshold_remove_bank(cpu, bank);
 	}
 	kfree(per_cpu(threshold_banks, cpu));
+	per_cpu(threshold_banks, cpu) = NULL;
+	return 0;
 }
 
-/* get notified when a cpu comes on/off */
-static void
-amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
+/* create dir/files for all valid threshold banks */
+int mce_threshold_create_device(unsigned int cpu)
 {
-	switch (action) {
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-		threshold_create_device(cpu);
-		break;
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		threshold_remove_device(cpu);
-		break;
-	default:
-		break;
+	unsigned int bank;
+	struct threshold_bank **bp;
+	int err = 0;
+
+	if (!thresholding_en)
+		return 0;
+
+	bp = per_cpu(threshold_banks, cpu);
+	if (bp)
+		return 0;
+
+	bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks,
+		     GFP_KERNEL);
+	if (!bp)
+		return -ENOMEM;
+
+	per_cpu(threshold_banks, cpu) = bp;
+
+	for (bank = 0; bank < mca_cfg.banks; ++bank) {
+		if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+			continue;
+		err = threshold_create_bank(cpu, bank);
+		if (err)
+			goto err;
 	}
+	return err;
+err:
+	mce_threshold_remove_device(cpu);
+	return err;
 }
 
 static __init int threshold_init_device(void)
 {
 	unsigned lcpu = 0;
 
+	if (mce_threshold_vector == amd_threshold_interrupt)
+		thresholding_en = true;
+
 	/* to hit CPUs online before the notifier is up */
 	for_each_online_cpu(lcpu) {
-		int err = threshold_create_device(lcpu);
+		int err = mce_threshold_create_device(lcpu);
 
 		if (err)
 			return err;
 	}
-	threshold_cpu_callback = amd_64_threshold_cpu_callback;
 
 	return 0;
 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 1defb8ea882c..190b3e6cef4d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -11,6 +11,8 @@
 #include <linux/sched.h>
 #include <linux/cpumask.h>
 #include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/intel-family.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
@@ -130,7 +132,7 @@ bool mce_intel_cmci_poll(void)
 	 * Reset the counter if we've logged an error in the last poll
 	 * during the storm.
 	 */
-	if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)))
+	if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)))
 		this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
 	else
 		this_cpu_dec(cmci_backoff_cnt);
@@ -342,7 +344,7 @@ void cmci_recheck(void)
 		return;
 
 	local_irq_save(flags);
-	machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+	machine_check_poll(0, this_cpu_ptr(&mce_banks_owned));
 	local_irq_restore(flags);
 }
 
@@ -464,11 +466,46 @@ static void intel_clear_lmce(void)
 	wrmsrl(MSR_IA32_MCG_EXT_CTL, val);
 }
 
+static void intel_ppin_init(struct cpuinfo_x86 *c)
+{
+	unsigned long long val;
+
+	/*
+	 * Even if testing the presence of the MSR would be enough, we don't
+	 * want to risk the situation where other models reuse this MSR for
+	 * other purposes.
+	 */
+	switch (c->x86_model) {
+	case INTEL_FAM6_IVYBRIDGE_X:
+	case INTEL_FAM6_HASWELL_X:
+	case INTEL_FAM6_BROADWELL_XEON_D:
+	case INTEL_FAM6_BROADWELL_X:
+	case INTEL_FAM6_SKYLAKE_X:
+		if (rdmsrl_safe(MSR_PPIN_CTL, &val))
+			return;
+
+		if ((val & 3UL) == 1UL) {
+			/* PPIN available but disabled: */
+			return;
+		}
+
+		/* If PPIN is disabled, but not locked, try to enable: */
+		if (!(val & 3UL)) {
+			wrmsrl_safe(MSR_PPIN_CTL,  val | 2UL);
+			rdmsrl_safe(MSR_PPIN_CTL, &val);
+		}
+
+		if ((val & 3UL) == 2UL)
+			set_cpu_cap(c, X86_FEATURE_INTEL_PPIN);
+	}
+}
+
 void mce_intel_feature_init(struct cpuinfo_x86 *c)
 {
 	intel_init_thermal(c);
 	intel_init_cmci();
 	intel_init_lmce();
+	intel_ppin_init(c);
 }
 
 void mce_intel_feature_clear(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 6b9dc4d18ccc..465aca8be009 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -26,7 +26,6 @@
 
 #include <asm/processor.h>
 #include <asm/apic.h>
-#include <asm/idle.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
 #include <asm/trace/irq_vectors.h>
@@ -271,58 +270,32 @@ static void thermal_throttle_remove_dev(struct device *dev)
 }
 
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int
-thermal_throttle_cpu_callback(struct notifier_block *nfb,
-			      unsigned long action,
-			      void *hcpu)
+static int thermal_throttle_online(unsigned int cpu)
 {
-	unsigned int cpu = (unsigned long)hcpu;
-	struct device *dev;
-	int err = 0;
-
-	dev = get_cpu_device(cpu);
-
-	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		err = thermal_throttle_add_dev(dev, cpu);
-		WARN_ON(err);
-		break;
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		thermal_throttle_remove_dev(dev);
-		break;
-	}
-	return notifier_from_errno(err);
+	struct device *dev = get_cpu_device(cpu);
+
+	return thermal_throttle_add_dev(dev, cpu);
 }
 
-static struct notifier_block thermal_throttle_cpu_notifier =
+static int thermal_throttle_offline(unsigned int cpu)
 {
-	.notifier_call = thermal_throttle_cpu_callback,
-};
+	struct device *dev = get_cpu_device(cpu);
+
+	thermal_throttle_remove_dev(dev);
+	return 0;
+}
 
 static __init int thermal_throttle_init_device(void)
 {
-	unsigned int cpu = 0;
-	int err;
+	int ret;
 
 	if (!atomic_read(&therm_throt_en))
 		return 0;
 
-	cpu_notifier_register_begin();
-
-	/* connect live CPUs to sysfs */
-	for_each_online_cpu(cpu) {
-		err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
-		WARN_ON(err);
-	}
-
-	__register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
-	cpu_notifier_register_done();
-
-	return 0;
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online",
+				thermal_throttle_online,
+				thermal_throttle_offline);
+	return ret < 0 ? ret : 0;
 }
 device_initcall(thermal_throttle_init_device);
 
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index fcf9ae9384f4..9beb092d68a5 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -6,7 +6,6 @@
 
 #include <asm/irq_vectors.h>
 #include <asm/apic.h>
-#include <asm/idle.h>
 #include <asm/mce.h>
 #include <asm/trace/irq_vectors.h>
 
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
index 220b1a508513..ba12e8aa4a45 100644
--- a/arch/x86/kernel/cpu/microcode/Makefile
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -1,4 +1,4 @@
 microcode-y				:= core.o
 obj-$(CONFIG_MICROCODE)			+= microcode.o
-microcode-$(CONFIG_MICROCODE_INTEL)	+= intel.o intel_lib.o
+microcode-$(CONFIG_MICROCODE_INTEL)	+= intel.o
 microcode-$(CONFIG_MICROCODE_AMD)	+= amd.o
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 017bda12caae..6a31e2691f3a 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -5,6 +5,7 @@
  *  CPUs and later.
  *
  *  Copyright (C) 2008-2011 Advanced Micro Devices Inc.
+ *	          2013-2016 Borislav Petkov <bp@alien8.de>
  *
  *  Author: Peter Oruba <peter.oruba@amd.com>
  *
@@ -39,64 +40,25 @@
 
 static struct equiv_cpu_entry *equiv_cpu_table;
 
-struct ucode_patch {
-	struct list_head plist;
-	void *data;
-	u32 patch_id;
-	u16 equiv_cpu;
-};
-
-static LIST_HEAD(pcache);
-
 /*
  * This points to the current valid container of microcode patches which we will
- * save from the initrd before jettisoning its contents.
+ * save from the initrd/builtin before jettisoning its contents.
  */
-static u8 *container;
-static size_t container_size;
-static bool ucode_builtin;
+struct container {
+	u8 *data;
+	size_t size;
+} cont;
 
 static u32 ucode_new_rev;
 static u8 amd_ucode_patch[PATCH_MAX_SIZE];
 static u16 this_equiv_id;
 
-static struct cpio_data ucode_cpio;
-
-static struct cpio_data __init find_ucode_in_initrd(void)
-{
-#ifdef CONFIG_BLK_DEV_INITRD
-	char *path;
-	void *start;
-	size_t size;
-
-	/*
-	 * Microcode patch container file is prepended to the initrd in cpio
-	 * format. See Documentation/x86/early-microcode.txt
-	 */
-	static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin";
-
-#ifdef CONFIG_X86_32
-	struct boot_params *p;
-
-	/*
-	 * On 32-bit, early load occurs before paging is turned on so we need
-	 * to use physical addresses.
-	 */
-	p       = (struct boot_params *)__pa_nodebug(&boot_params);
-	path    = (char *)__pa_nodebug(ucode_path);
-	start   = (void *)p->hdr.ramdisk_image;
-	size    = p->hdr.ramdisk_size;
-#else
-	path    = ucode_path;
-	start   = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
-	size    = boot_params.hdr.ramdisk_size;
-#endif /* !CONFIG_X86_32 */
-
-	return find_cpio_data(path, start, size, NULL);
-#else
-	return (struct cpio_data){ NULL, 0, "" };
-#endif
-}
+/*
+ * Microcode patch container file is prepended to the initrd in cpio
+ * format. See Documentation/x86/early-microcode.txt
+ */
+static const char
+ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin";
 
 static size_t compute_container_size(u8 *data, u32 total_size)
 {
@@ -135,48 +97,49 @@ static size_t compute_container_size(u8 *data, u32 total_size)
 	return size;
 }
 
+static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table,
+				unsigned int sig)
+{
+	int i = 0;
+
+	if (!equiv_cpu_table)
+		return 0;
+
+	while (equiv_cpu_table[i].installed_cpu != 0) {
+		if (sig == equiv_cpu_table[i].installed_cpu)
+			return equiv_cpu_table[i].equiv_cpu;
+
+		i++;
+	}
+	return 0;
+}
+
 /*
- * Early load occurs before we can vmalloc(). So we look for the microcode
- * patch container file in initrd, traverse equivalent cpu table, look for a
- * matching microcode patch, and update, all in initrd memory in place.
- * When vmalloc() is available for use later -- on 64-bit during first AP load,
- * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
- * load_microcode_amd() to save equivalent cpu table and microcode patches in
- * kernel heap memory.
+ * This scans the ucode blob for the proper container as we can have multiple
+ * containers glued together. Returns the equivalence ID from the equivalence
+ * table or 0 if none found.
  */
-static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
+static u16
+find_proper_container(u8 *ucode, size_t size, struct container *ret_cont)
 {
+	struct container ret = { NULL, 0 };
+	u32 eax, ebx, ecx, edx;
 	struct equiv_cpu_entry *eq;
-	size_t *cont_sz;
-	u32 *header;
-	u8  *data, **cont;
-	u8 (*patch)[PATCH_MAX_SIZE];
-	u16 eq_id = 0;
 	int offset, left;
-	u32 rev, eax, ebx, ecx, edx;
-	u32 *new_rev;
-
-#ifdef CONFIG_X86_32
-	new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
-	cont_sz = (size_t *)__pa_nodebug(&container_size);
-	cont	= (u8 **)__pa_nodebug(&container);
-	patch	= (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
-#else
-	new_rev = &ucode_new_rev;
-	cont_sz = &container_size;
-	cont	= &container;
-	patch	= &amd_ucode_patch;
-#endif
+	u16 eq_id = 0;
+	u32 *header;
+	u8 *data;
 
 	data   = ucode;
 	left   = size;
 	header = (u32 *)data;
 
+
 	/* find equiv cpu table */
 	if (header[0] != UCODE_MAGIC ||
 	    header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
 	    header[2] == 0)                            /* size */
-		return;
+		return eq_id;
 
 	eax = 0x00000001;
 	ecx = 0;
@@ -185,7 +148,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
 	while (left > 0) {
 		eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
 
-		*cont = data;
+		ret.data = data;
 
 		/* Advance past the container header */
 		offset = header[2] + CONTAINER_HDR_SZ;
@@ -194,15 +157,16 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
 
 		eq_id = find_equiv_id(eq, eax);
 		if (eq_id) {
-			this_equiv_id = eq_id;
-			*cont_sz = compute_container_size(*cont, left + offset);
+			ret.size = compute_container_size(ret.data, left + offset);
 
 			/*
 			 * truncate how much we need to iterate over in the
 			 * ucode update loop below
 			 */
-			left = *cont_sz - offset;
-			break;
+			left = ret.size - offset;
+
+			*ret_cont = ret;
+			return eq_id;
 		}
 
 		/*
@@ -212,6 +176,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
 		 */
 		while (left > 0) {
 			header = (u32 *)data;
+
 			if (header[0] == UCODE_MAGIC &&
 			    header[1] == UCODE_EQUIV_CPU_TABLE_TYPE)
 				break;
@@ -226,14 +191,65 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
 		ucode     = data;
 	}
 
-	if (!eq_id) {
-		*cont = NULL;
-		*cont_sz = 0;
-		return;
-	}
+	return eq_id;
+}
+
+static int __apply_microcode_amd(struct microcode_amd *mc_amd)
+{
+	u32 rev, dummy;
+
+	native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
+
+	/* verify patch application was successful */
+	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+	if (rev != mc_amd->hdr.patch_id)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Early load occurs before we can vmalloc(). So we look for the microcode
+ * patch container file in initrd, traverse equivalent cpu table, look for a
+ * matching microcode patch, and update, all in initrd memory in place.
+ * When vmalloc() is available for use later -- on 64-bit during first AP load,
+ * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
+ * load_microcode_amd() to save equivalent cpu table and microcode patches in
+ * kernel heap memory.
+ *
+ * Returns true if container found (sets @ret_cont), false otherwise.
+ */
+static bool apply_microcode_early_amd(void *ucode, size_t size, bool save_patch,
+				      struct container *ret_cont)
+{
+	u8 (*patch)[PATCH_MAX_SIZE];
+	u32 rev, *header, *new_rev;
+	struct container ret;
+	int offset, left;
+	u16 eq_id = 0;
+	u8  *data;
+
+#ifdef CONFIG_X86_32
+	new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
+	patch	= (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
+#else
+	new_rev = &ucode_new_rev;
+	patch	= &amd_ucode_patch;
+#endif
 
 	if (check_current_patch_level(&rev, true))
-		return;
+		return false;
+
+	eq_id = find_proper_container(ucode, size, &ret);
+	if (!eq_id)
+		return false;
+
+	this_equiv_id = eq_id;
+	header = (u32 *)ret.data;
+
+	/* We're pointing to an equiv table, skip over it. */
+	data = ret.data +  header[2] + CONTAINER_HDR_SZ;
+	left = ret.size - (header[2] + CONTAINER_HDR_SZ);
 
 	while (left > 0) {
 		struct microcode_amd *mc;
@@ -252,8 +268,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
 				*new_rev = rev;
 
 				if (save_patch)
-					memcpy(patch, mc,
-					       min_t(u32, header[1], PATCH_MAX_SIZE));
+					memcpy(patch, mc, min_t(u32, header[1], PATCH_MAX_SIZE));
 			}
 		}
 
@@ -261,10 +276,14 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
 		data   += offset;
 		left   -= offset;
 	}
+
+	if (ret_cont)
+		*ret_cont = ret;
+
+	return true;
 }
 
-static bool __init load_builtin_amd_microcode(struct cpio_data *cp,
-					      unsigned int family)
+static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
 {
 #ifdef CONFIG_X86_64
 	char fw_name[36] = "amd-ucode/microcode_amd.bin";
@@ -281,47 +300,49 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp,
 
 void __init load_ucode_amd_bsp(unsigned int family)
 {
+	struct ucode_cpu_info *uci;
+	u32 eax, ebx, ecx, edx;
 	struct cpio_data cp;
-	bool *builtin;
-	void **data;
-	size_t *size;
+	const char *path;
+	bool use_pa;
 
-#ifdef CONFIG_X86_32
-	data =  (void **)__pa_nodebug(&ucode_cpio.data);
-	size = (size_t *)__pa_nodebug(&ucode_cpio.size);
-	builtin = (bool *)__pa_nodebug(&ucode_builtin);
-#else
-	data = &ucode_cpio.data;
-	size = &ucode_cpio.size;
-	builtin = &ucode_builtin;
-#endif
+	if (IS_ENABLED(CONFIG_X86_32)) {
+		uci	= (struct ucode_cpu_info *)__pa_nodebug(ucode_cpu_info);
+		path	= (const char *)__pa_nodebug(ucode_path);
+		use_pa	= true;
+	} else {
+		uci     = ucode_cpu_info;
+		path	= ucode_path;
+		use_pa	= false;
+	}
 
-	*builtin = load_builtin_amd_microcode(&cp, family);
-	if (!*builtin)
-		cp = find_ucode_in_initrd();
+	if (!get_builtin_microcode(&cp, family))
+		cp = find_microcode_in_initrd(path, use_pa);
 
 	if (!(cp.data && cp.size))
 		return;
 
-	*data = cp.data;
-	*size = cp.size;
+	/* Get BSP's CPUID.EAX(1), needed in load_microcode_amd() */
+	eax = 1;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+	uci->cpu_sig.sig = eax;
 
-	apply_ucode_in_initrd(cp.data, cp.size, true);
+	apply_microcode_early_amd(cp.data, cp.size, true, NULL);
 }
 
 #ifdef CONFIG_X86_32
 /*
  * On 32-bit, since AP's early load occurs before paging is turned on, we
- * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
- * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
- * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
+ * cannot traverse cpu_equiv_table and microcode_cache in kernel heap memory.
+ * So during cold boot, AP will apply_ucode_in_initrd() just like the BSP.
+ * In save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
  * which is used upon resume from suspend.
  */
-void load_ucode_amd_ap(void)
+void load_ucode_amd_ap(unsigned int family)
 {
 	struct microcode_amd *mc;
-	size_t *usize;
-	void **ucode;
+	struct cpio_data cp;
 
 	mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
 	if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
@@ -329,60 +350,62 @@ void load_ucode_amd_ap(void)
 		return;
 	}
 
-	ucode = (void *)__pa_nodebug(&container);
-	usize = (size_t *)__pa_nodebug(&container_size);
+	if (!get_builtin_microcode(&cp, family))
+		cp = find_microcode_in_initrd((const char *)__pa_nodebug(ucode_path), true);
 
-	if (!*ucode || !*usize)
+	if (!(cp.data && cp.size))
 		return;
 
-	apply_ucode_in_initrd(*ucode, *usize, false);
-}
-
-static void __init collect_cpu_sig_on_bsp(void *arg)
-{
-	unsigned int cpu = smp_processor_id();
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-	uci->cpu_sig.sig = cpuid_eax(0x00000001);
-}
-
-static void __init get_bsp_sig(void)
-{
-	unsigned int bsp = boot_cpu_data.cpu_index;
-	struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
-
-	if (!uci->cpu_sig.sig)
-		smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
+	/*
+	 * This would set amd_ucode_patch above so that the following APs can
+	 * use it directly instead of going down this path again.
+	 */
+	apply_microcode_early_amd(cp.data, cp.size, true, NULL);
 }
 #else
-void load_ucode_amd_ap(void)
+void load_ucode_amd_ap(unsigned int family)
 {
-	unsigned int cpu = smp_processor_id();
 	struct equiv_cpu_entry *eq;
 	struct microcode_amd *mc;
-	u8 *cont = container;
 	u32 rev, eax;
 	u16 eq_id;
 
-	/* Exit if called on the BSP. */
-	if (!cpu)
+	/* 64-bit runs with paging enabled, thus early==false. */
+	if (check_current_patch_level(&rev, false))
 		return;
 
-	if (!container)
-		return;
+	/* First AP hasn't cached it yet, go through the blob. */
+	if (!cont.data) {
+		struct cpio_data cp = { NULL, 0, "" };
 
-	/*
-	 * 64-bit runs with paging enabled, thus early==false.
-	 */
-	if (check_current_patch_level(&rev, false))
-		return;
+		if (cont.size == -1)
+			return;
 
-	/* Add CONFIG_RANDOMIZE_MEMORY offset. */
-	if (!ucode_builtin)
-		cont += PAGE_OFFSET - __PAGE_OFFSET_BASE;
+reget:
+		if (!get_builtin_microcode(&cp, family)) {
+#ifdef CONFIG_BLK_DEV_INITRD
+			cp = find_cpio_data(ucode_path, (void *)initrd_start,
+					    initrd_end - initrd_start, NULL);
+#endif
+			if (!(cp.data && cp.size)) {
+				/*
+				 * Mark it so that other APs do not scan again
+				 * for no real reason and slow down boot
+				 * needlessly.
+				 */
+				cont.size = -1;
+				return;
+			}
+		}
+
+		if (!apply_microcode_early_amd(cp.data, cp.size, false, &cont)) {
+			cont.size = -1;
+			return;
+		}
+	}
 
 	eax = cpuid_eax(0x00000001);
-	eq  = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ);
+	eq  = (struct equiv_cpu_entry *)(cont.data + CONTAINER_HDR_SZ);
 
 	eq_id = find_equiv_id(eq, eax);
 	if (!eq_id)
@@ -397,61 +420,50 @@ void load_ucode_amd_ap(void)
 		}
 
 	} else {
-		if (!ucode_cpio.data)
-			return;
 
 		/*
 		 * AP has a different equivalence ID than BSP, looks like
 		 * mixed-steppings silicon so go through the ucode blob anew.
 		 */
-		apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false);
+		goto reget;
 	}
 }
-#endif
+#endif /* CONFIG_X86_32 */
+
+static enum ucode_state
+load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size);
 
-int __init save_microcode_in_initrd_amd(void)
+int __init save_microcode_in_initrd_amd(unsigned int fam)
 {
-	unsigned long cont;
-	int retval = 0;
 	enum ucode_state ret;
-	u8 *cont_va;
-	u32 eax;
+	int retval = 0;
+	u16 eq_id;
 
-	if (!container)
-		return -EINVAL;
+	if (!cont.data) {
+		if (IS_ENABLED(CONFIG_X86_32) && (cont.size != -1)) {
+			struct cpio_data cp = { NULL, 0, "" };
 
-#ifdef CONFIG_X86_32
-	get_bsp_sig();
-	cont	= (unsigned long)container;
-	cont_va = __va(container);
-#else
-	/*
-	 * We need the physical address of the container for both bitness since
-	 * boot_params.hdr.ramdisk_image is a physical address.
-	 */
-	cont    = __pa_nodebug(container);
-	cont_va = container;
+#ifdef CONFIG_BLK_DEV_INITRD
+			cp = find_cpio_data(ucode_path, (void *)initrd_start,
+					    initrd_end - initrd_start, NULL);
 #endif
 
-	/*
-	 * Take into account the fact that the ramdisk might get relocated and
-	 * therefore we need to recompute the container's position in virtual
-	 * memory space.
-	 */
-	if (relocated_ramdisk)
-		container = (u8 *)(__va(relocated_ramdisk) +
-			     (cont - boot_params.hdr.ramdisk_image));
-	else
-		container = cont_va;
+			if (!(cp.data && cp.size)) {
+				cont.size = -1;
+				return -EINVAL;
+			}
 
-	/* Add CONFIG_RANDOMIZE_MEMORY offset. */
-	if (!ucode_builtin)
-		container += PAGE_OFFSET - __PAGE_OFFSET_BASE;
+			eq_id = find_proper_container(cp.data, cp.size, &cont);
+			if (!eq_id) {
+				cont.size = -1;
+				return -EINVAL;
+			}
 
-	eax   = cpuid_eax(0x00000001);
-	eax   = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+		} else
+			return -EINVAL;
+	}
 
-	ret = load_microcode_amd(smp_processor_id(), eax, container, container_size);
+	ret = load_microcode_amd(smp_processor_id(), fam, cont.data, cont.size);
 	if (ret != UCODE_OK)
 		retval = -EINVAL;
 
@@ -459,8 +471,8 @@ int __init save_microcode_in_initrd_amd(void)
 	 * This will be freed any msec now, stash patches for the current
 	 * family and switch to patch cache for cpu hotplug, etc later.
 	 */
-	container = NULL;
-	container_size = 0;
+	cont.data = NULL;
+	cont.size = 0;
 
 	return retval;
 }
@@ -478,8 +490,10 @@ void reload_ucode_amd(void)
 		return;
 
 	mc = (struct microcode_amd *)amd_ucode_patch;
+	if (!mc)
+		return;
 
-	if (mc && rev < mc->hdr.patch_id) {
+	if (rev < mc->hdr.patch_id) {
 		if (!__apply_microcode_amd(mc)) {
 			ucode_new_rev = mc->hdr.patch_id;
 			pr_info("reload patch_level=0x%08x\n", ucode_new_rev);
@@ -513,7 +527,7 @@ static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
 {
 	struct ucode_patch *p;
 
-	list_for_each_entry(p, &pcache, plist)
+	list_for_each_entry(p, &microcode_cache, plist)
 		if (p->equiv_cpu == equiv_cpu)
 			return p;
 	return NULL;
@@ -523,7 +537,7 @@ static void update_cache(struct ucode_patch *new_patch)
 {
 	struct ucode_patch *p;
 
-	list_for_each_entry(p, &pcache, plist) {
+	list_for_each_entry(p, &microcode_cache, plist) {
 		if (p->equiv_cpu == new_patch->equiv_cpu) {
 			if (p->patch_id >= new_patch->patch_id)
 				/* we already have the latest patch */
@@ -536,14 +550,14 @@ static void update_cache(struct ucode_patch *new_patch)
 		}
 	}
 	/* no patch found, add it */
-	list_add_tail(&new_patch->plist, &pcache);
+	list_add_tail(&new_patch->plist, &microcode_cache);
 }
 
 static void free_cache(void)
 {
 	struct ucode_patch *p, *tmp;
 
-	list_for_each_entry_safe(p, tmp, &pcache, plist) {
+	list_for_each_entry_safe(p, tmp, &microcode_cache, plist) {
 		__list_del(p->plist.prev, p->plist.next);
 		kfree(p->data);
 		kfree(p);
@@ -663,21 +677,7 @@ bool check_current_patch_level(u32 *rev, bool early)
 	return ret;
 }
 
-int __apply_microcode_amd(struct microcode_amd *mc_amd)
-{
-	u32 rev, dummy;
-
-	native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
-
-	/* verify patch application was successful */
-	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-	if (rev != mc_amd->hdr.patch_id)
-		return -1;
-
-	return 0;
-}
-
-int apply_microcode_amd(int cpu)
+static int apply_microcode_amd(int cpu)
 {
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 	struct microcode_amd *mc_amd;
@@ -860,7 +860,8 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
 	return UCODE_OK;
 }
 
-enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
+static enum ucode_state
+load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
 {
 	enum ucode_state ret;
 
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 5ce5155f0695..2af69d27da62 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
  *	      2006	Shaohua Li <shaohua.li@intel.com>
- *	      2013-2015	Borislav Petkov <bp@alien8.de>
+ *	      2013-2016	Borislav Petkov <bp@alien8.de>
  *
  * X86 CPU microcode early update for Linux:
  *
@@ -39,11 +39,14 @@
 #include <asm/microcode.h>
 #include <asm/processor.h>
 #include <asm/cmdline.h>
+#include <asm/setup.h>
 
-#define MICROCODE_VERSION	"2.01"
+#define DRIVER_VERSION	"2.2"
 
 static struct microcode_ops	*microcode_ops;
-static bool dis_ucode_ldr;
+static bool dis_ucode_ldr = true;
+
+LIST_HEAD(microcode_cache);
 
 /*
  * Synchronization.
@@ -73,6 +76,7 @@ struct cpu_info_ctx {
 static bool __init check_loader_disabled_bsp(void)
 {
 	static const char *__dis_opt_str = "dis_ucode_ldr";
+	u32 a, b, c, d;
 
 #ifdef CONFIG_X86_32
 	const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
@@ -85,8 +89,23 @@ static bool __init check_loader_disabled_bsp(void)
 	bool *res = &dis_ucode_ldr;
 #endif
 
-	if (cmdline_find_option_bool(cmdline, option))
-		*res = true;
+	if (!have_cpuid_p())
+		return *res;
+
+	a = 1;
+	c = 0;
+	native_cpuid(&a, &b, &c, &d);
+
+	/*
+	 * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
+	 * completely accurate as xen pv guests don't see that CPUID bit set but
+	 * that's good enough as they don't land on the BSP path anyway.
+	 */
+	if (c & BIT(31))
+		return *res;
+
+	if (cmdline_find_option_bool(cmdline, option) <= 0)
+		*res = false;
 
 	return *res;
 }
@@ -118,9 +137,6 @@ void __init load_ucode_bsp(void)
 	if (check_loader_disabled_bsp())
 		return;
 
-	if (!have_cpuid_p())
-		return;
-
 	vendor = x86_cpuid_vendor();
 	family = x86_cpuid_family();
 
@@ -154,9 +170,6 @@ void load_ucode_ap(void)
 	if (check_loader_disabled_ap())
 		return;
 
-	if (!have_cpuid_p())
-		return;
-
 	vendor = x86_cpuid_vendor();
 	family = x86_cpuid_family();
 
@@ -167,7 +180,7 @@ void load_ucode_ap(void)
 		break;
 	case X86_VENDOR_AMD:
 		if (family >= 0x10)
-			load_ucode_amd_ap();
+			load_ucode_amd_ap(family);
 		break;
 	default:
 		break;
@@ -185,7 +198,7 @@ static int __init save_microcode_in_initrd(void)
 		break;
 	case X86_VENDOR_AMD:
 		if (c->x86 >= 0x10)
-			return save_microcode_in_initrd_amd();
+			return save_microcode_in_initrd_amd(c->x86);
 		break;
 	default:
 		break;
@@ -194,6 +207,56 @@ static int __init save_microcode_in_initrd(void)
 	return -EINVAL;
 }
 
+struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+	unsigned long start = 0;
+	size_t size;
+
+#ifdef CONFIG_X86_32
+	struct boot_params *params;
+
+	if (use_pa)
+		params = (struct boot_params *)__pa_nodebug(&boot_params);
+	else
+		params = &boot_params;
+
+	size = params->hdr.ramdisk_size;
+
+	/*
+	 * Set start only if we have an initrd image. We cannot use initrd_start
+	 * because it is not set that early yet.
+	 */
+	if (size)
+		start = params->hdr.ramdisk_image;
+
+# else /* CONFIG_X86_64 */
+	size  = (unsigned long)boot_params.ext_ramdisk_size << 32;
+	size |= boot_params.hdr.ramdisk_size;
+
+	if (size) {
+		start  = (unsigned long)boot_params.ext_ramdisk_image << 32;
+		start |= boot_params.hdr.ramdisk_image;
+
+		start += PAGE_OFFSET;
+	}
+# endif
+
+	/*
+	 * Fixup the start address: after reserve_initrd() runs, initrd_start
+	 * has the virtual address of the beginning of the initrd. It also
+	 * possibly relocates the ramdisk. In either case, initrd_start contains
+	 * the updated address so use that instead.
+	 */
+	if (!use_pa && initrd_start)
+		start = initrd_start;
+
+	return find_cpio_data(path, (void *)start, size, NULL);
+#else /* !CONFIG_BLK_DEV_INITRD */
+	return (struct cpio_data){ NULL, 0, "" };
+#endif
+}
+
 void reload_early_microcode(void)
 {
 	int vendor, family;
@@ -453,16 +516,17 @@ static struct attribute_group mc_attr_group = {
 
 static void microcode_fini_cpu(int cpu)
 {
-	microcode_ops->microcode_fini_cpu(cpu);
+	if (microcode_ops->microcode_fini_cpu)
+		microcode_ops->microcode_fini_cpu(cpu);
 }
 
 static enum ucode_state microcode_resume_cpu(int cpu)
 {
-	pr_debug("CPU%d updated upon resume\n", cpu);
-
 	if (apply_microcode_on_target(cpu))
 		return UCODE_ERROR;
 
+	pr_debug("CPU%d updated upon resume\n", cpu);
+
 	return UCODE_OK;
 }
 
@@ -496,6 +560,9 @@ static enum ucode_state microcode_update_cpu(int cpu)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
+	/* Refresh CPU microcode revision after resume. */
+	collect_cpu_info(cpu);
+
 	if (uci->valid)
 		return microcode_resume_cpu(cpu);
 
@@ -579,12 +646,7 @@ static int mc_cpu_down_prep(unsigned int cpu)
 	/* Suspend is in progress, only remove the interface */
 	sysfs_remove_group(&dev->kobj, &mc_attr_group);
 	pr_debug("CPU%d removed\n", cpu);
-	/*
-	 * When a CPU goes offline, don't free up or invalidate the copy of
-	 * the microcode in kernel memory, so that we can reuse it when the
-	 * CPU comes back online without unnecessarily requesting the userspace
-	 * for it again.
-	 */
+
 	return 0;
 }
 
@@ -649,8 +711,7 @@ int __init microcode_init(void)
 	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
 				  mc_cpu_online, mc_cpu_down_prep);
 
-	pr_info("Microcode Update Driver: v" MICROCODE_VERSION
-		" <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
+	pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION);
 
 	return 0;
 
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index cdc0deab00c9..b624b54912e1 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -39,125 +39,83 @@
 #include <asm/setup.h>
 #include <asm/msr.h>
 
-/*
- * Temporary microcode blobs pointers storage. We note here during early load
- * the pointers to microcode blobs we've got from whatever storage (detached
- * initrd, builtin). Later on, we put those into final storage
- * mc_saved_data.mc_saved.
- *
- * Important: those are offsets from the beginning of initrd or absolute
- * addresses within the kernel image when built-in.
- */
-static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT];
-
-static struct mc_saved_data {
-	unsigned int num_saved;
-	struct microcode_intel **mc_saved;
-} mc_saved_data;
+static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
 
-/* Microcode blobs within the initrd. 0 if builtin. */
-static struct ucode_blobs {
-	unsigned long start;
-	bool valid;
-} blobs;
+/* Current microcode patch used in early patching */
+struct microcode_intel *intel_ucode_patch;
 
-/* Go through saved patches and find the one suitable for the current CPU. */
-static enum ucode_state
-find_microcode_patch(struct microcode_intel **saved,
-		     unsigned int num_saved, struct ucode_cpu_info *uci)
+static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
+					unsigned int s2, unsigned int p2)
 {
-	struct microcode_intel *ucode_ptr, *new_mc = NULL;
-	struct microcode_header_intel *mc_hdr;
-	int new_rev, ret, i;
-
-	new_rev = uci->cpu_sig.rev;
-
-	for (i = 0; i < num_saved; i++) {
-		ucode_ptr = saved[i];
-		mc_hdr	  = (struct microcode_header_intel *)ucode_ptr;
-
-		ret = has_newer_microcode(ucode_ptr,
-					  uci->cpu_sig.sig,
-					  uci->cpu_sig.pf,
-					  new_rev);
-		if (!ret)
-			continue;
-
-		new_rev = mc_hdr->rev;
-		new_mc  = ucode_ptr;
-	}
+	if (s1 != s2)
+		return false;
 
-	if (!new_mc)
-		return UCODE_NFOUND;
+	/* Processor flags are either both 0 ... */
+	if (!p1 && !p2)
+		return true;
 
-	uci->mc = (struct microcode_intel *)new_mc;
-	return UCODE_OK;
+	/* ... or they intersect. */
+	return p1 & p2;
 }
 
-static inline void
-copy_ptrs(struct microcode_intel **mc_saved, unsigned long *mc_ptrs,
-	  unsigned long off, int num_saved)
+/*
+ * Returns 1 if update has been found, 0 otherwise.
+ */
+static int find_matching_signature(void *mc, unsigned int csig, int cpf)
 {
+	struct microcode_header_intel *mc_hdr = mc;
+	struct extended_sigtable *ext_hdr;
+	struct extended_signature *ext_sig;
 	int i;
 
-	for (i = 0; i < num_saved; i++)
-		mc_saved[i] = (struct microcode_intel *)(mc_ptrs[i] + off);
-}
-
-#ifdef CONFIG_X86_32
-static void
-microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs)
-{
-	int i;
-	struct microcode_intel ***mc_saved;
+	if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
+		return 1;
 
-	mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved);
+	/* Look for ext. headers: */
+	if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
+		return 0;
 
-	for (i = 0; i < mcs->num_saved; i++) {
-		struct microcode_intel *p;
+	ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
+	ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
 
-		p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i);
-		mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p);
+	for (i = 0; i < ext_hdr->count; i++) {
+		if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
+			return 1;
+		ext_sig++;
 	}
+	return 0;
 }
-#endif
 
-static enum ucode_state
-load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
-	       unsigned long offset, struct ucode_cpu_info *uci)
+/*
+ * Returns 1 if update has been found, 0 otherwise.
+ */
+static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
 {
-	struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
-	unsigned int count = mcs->num_saved;
+	struct microcode_header_intel *mc_hdr = mc;
 
-	if (!mcs->mc_saved) {
-		copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count);
+	if (mc_hdr->rev <= new_rev)
+		return 0;
 
-		return find_microcode_patch(mc_saved_tmp, count, uci);
-	} else {
-#ifdef CONFIG_X86_32
-		microcode_phys(mc_saved_tmp, mcs);
-		return find_microcode_patch(mc_saved_tmp, count, uci);
-#else
-		return find_microcode_patch(mcs->mc_saved, count, uci);
-#endif
-	}
+	return find_matching_signature(mc, csig, cpf);
 }
 
 /*
  * Given CPU signature and a microcode patch, this function finds if the
  * microcode patch has matching family and model with the CPU.
+ *
+ * %true - if there's a match
+ * %false - otherwise
  */
-static enum ucode_state
-matching_model_microcode(struct microcode_header_intel *mc_header,
-			unsigned long sig)
+static bool microcode_matches(struct microcode_header_intel *mc_header,
+			      unsigned long sig)
 {
-	unsigned int fam, model;
-	unsigned int fam_ucode, model_ucode;
-	struct extended_sigtable *ext_header;
 	unsigned long total_size = get_totalsize(mc_header);
 	unsigned long data_size = get_datasize(mc_header);
-	int ext_sigcount, i;
+	struct extended_sigtable *ext_header;
+	unsigned int fam_ucode, model_ucode;
 	struct extended_signature *ext_sig;
+	unsigned int fam, model;
+	int ext_sigcount, i;
 
 	fam   = x86_family(sig);
 	model = x86_model(sig);
@@ -166,11 +124,11 @@ matching_model_microcode(struct microcode_header_intel *mc_header,
 	model_ucode = x86_model(mc_header->sig);
 
 	if (fam == fam_ucode && model == model_ucode)
-		return UCODE_OK;
+		return true;
 
 	/* Look for ext. headers: */
 	if (total_size <= data_size + MC_HEADER_SIZE)
-		return UCODE_NFOUND;
+		return false;
 
 	ext_header   = (void *) mc_header + data_size + MC_HEADER_SIZE;
 	ext_sig      = (void *)ext_header + EXT_HEADER_SIZE;
@@ -181,192 +139,262 @@ matching_model_microcode(struct microcode_header_intel *mc_header,
 		model_ucode = x86_model(ext_sig->sig);
 
 		if (fam == fam_ucode && model == model_ucode)
-			return UCODE_OK;
+			return true;
 
 		ext_sig++;
 	}
-	return UCODE_NFOUND;
+	return false;
 }
 
-static int
-save_microcode(struct mc_saved_data *mcs,
-	       struct microcode_intel **mc_saved_src,
-	       unsigned int num_saved)
+static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size)
 {
-	int i, j;
-	struct microcode_intel **saved_ptr;
-	int ret;
+	struct ucode_patch *p;
 
-	if (!num_saved)
-		return -EINVAL;
+	p = kzalloc(size, GFP_KERNEL);
+	if (!p)
+		return ERR_PTR(-ENOMEM);
 
-	/*
-	 * Copy new microcode data.
-	 */
-	saved_ptr = kcalloc(num_saved, sizeof(struct microcode_intel *), GFP_KERNEL);
-	if (!saved_ptr)
-		return -ENOMEM;
-
-	for (i = 0; i < num_saved; i++) {
-		struct microcode_header_intel *mc_hdr;
-		struct microcode_intel *mc;
-		unsigned long size;
-
-		if (!mc_saved_src[i]) {
-			ret = -EINVAL;
-			goto err;
-		}
+	p->data = kmemdup(data, size, GFP_KERNEL);
+	if (!p->data) {
+		kfree(p);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return p;
+}
+
+static void save_microcode_patch(void *data, unsigned int size)
+{
+	struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
+	struct ucode_patch *iter, *tmp, *p;
+	bool prev_found = false;
+	unsigned int sig, pf;
+
+	mc_hdr = (struct microcode_header_intel *)data;
+
+	list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
+		mc_saved_hdr = (struct microcode_header_intel *)iter->data;
+		sig	     = mc_saved_hdr->sig;
+		pf	     = mc_saved_hdr->pf;
+
+		if (find_matching_signature(data, sig, pf)) {
+			prev_found = true;
 
-		mc     = mc_saved_src[i];
-		mc_hdr = &mc->hdr;
-		size   = get_totalsize(mc_hdr);
+			if (mc_hdr->rev <= mc_saved_hdr->rev)
+				continue;
 
-		saved_ptr[i] = kmemdup(mc, size, GFP_KERNEL);
-		if (!saved_ptr[i]) {
-			ret = -ENOMEM;
-			goto err;
+			p = __alloc_microcode_buf(data, size);
+			if (IS_ERR(p))
+				pr_err("Error allocating buffer %p\n", data);
+			else
+				list_replace(&iter->plist, &p->plist);
 		}
 	}
 
 	/*
-	 * Point to newly saved microcode.
+	 * There weren't any previous patches found in the list cache; save the
+	 * newly found.
 	 */
-	mcs->mc_saved  = saved_ptr;
-	mcs->num_saved = num_saved;
-
-	return 0;
-
-err:
-	for (j = 0; j <= i; j++)
-		kfree(saved_ptr[j]);
-	kfree(saved_ptr);
-
-	return ret;
+	if (!prev_found) {
+		p = __alloc_microcode_buf(data, size);
+		if (IS_ERR(p))
+			pr_err("Error allocating buffer for %p\n", data);
+		else
+			list_add_tail(&p->plist, &microcode_cache);
+	}
 }
 
-/*
- * A microcode patch in ucode_ptr is saved into mc_saved
- * - if it has matching signature and newer revision compared to an existing
- *   patch mc_saved.
- * - or if it is a newly discovered microcode patch.
- *
- * The microcode patch should have matching model with CPU.
- *
- * Returns: The updated number @num_saved of saved microcode patches.
- */
-static unsigned int _save_mc(struct microcode_intel **mc_saved,
-			     u8 *ucode_ptr, unsigned int num_saved)
+static int microcode_sanity_check(void *mc, int print_err)
 {
-	struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
-	unsigned int sig, pf;
-	int found = 0, i;
+	unsigned long total_size, data_size, ext_table_size;
+	struct microcode_header_intel *mc_header = mc;
+	struct extended_sigtable *ext_header = NULL;
+	u32 sum, orig_sum, ext_sigcount = 0, i;
+	struct extended_signature *ext_sig;
 
-	mc_hdr = (struct microcode_header_intel *)ucode_ptr;
+	total_size = get_totalsize(mc_header);
+	data_size = get_datasize(mc_header);
 
-	for (i = 0; i < num_saved; i++) {
-		mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
-		sig	     = mc_saved_hdr->sig;
-		pf	     = mc_saved_hdr->pf;
+	if (data_size + MC_HEADER_SIZE > total_size) {
+		if (print_err)
+			pr_err("Error: bad microcode data file size.\n");
+		return -EINVAL;
+	}
 
-		if (!find_matching_signature(ucode_ptr, sig, pf))
-			continue;
+	if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+		if (print_err)
+			pr_err("Error: invalid/unknown microcode update format.\n");
+		return -EINVAL;
+	}
 
-		found = 1;
+	ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+	if (ext_table_size) {
+		u32 ext_table_sum = 0;
+		u32 *ext_tablep;
 
-		if (mc_hdr->rev <= mc_saved_hdr->rev)
-			continue;
+		if ((ext_table_size < EXT_HEADER_SIZE)
+		 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+			if (print_err)
+				pr_err("Error: truncated extended signature table.\n");
+			return -EINVAL;
+		}
+
+		ext_header = mc + MC_HEADER_SIZE + data_size;
+		if (ext_table_size != exttable_size(ext_header)) {
+			if (print_err)
+				pr_err("Error: extended signature table size mismatch.\n");
+			return -EFAULT;
+		}
+
+		ext_sigcount = ext_header->count;
 
 		/*
-		 * Found an older ucode saved earlier. Replace it with
-		 * this newer one.
+		 * Check extended table checksum: the sum of all dwords that
+		 * comprise a valid table must be 0.
 		 */
-		mc_saved[i] = (struct microcode_intel *)ucode_ptr;
-		break;
+		ext_tablep = (u32 *)ext_header;
+
+		i = ext_table_size / sizeof(u32);
+		while (i--)
+			ext_table_sum += ext_tablep[i];
+
+		if (ext_table_sum) {
+			if (print_err)
+				pr_warn("Bad extended signature table checksum, aborting.\n");
+			return -EINVAL;
+		}
+	}
+
+	/*
+	 * Calculate the checksum of update data and header. The checksum of
+	 * valid update data and header including the extended signature table
+	 * must be 0.
+	 */
+	orig_sum = 0;
+	i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
+	while (i--)
+		orig_sum += ((u32 *)mc)[i];
+
+	if (orig_sum) {
+		if (print_err)
+			pr_err("Bad microcode data checksum, aborting.\n");
+		return -EINVAL;
 	}
 
-	/* Newly detected microcode, save it to memory. */
-	if (i >= num_saved && !found)
-		mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr;
+	if (!ext_table_size)
+		return 0;
 
-	return num_saved;
+	/*
+	 * Check extended signature checksum: 0 => valid.
+	 */
+	for (i = 0; i < ext_sigcount; i++) {
+		ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+			  EXT_SIGNATURE_SIZE * i;
+
+		sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
+		      (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+		if (sum) {
+			if (print_err)
+				pr_err("Bad extended signature checksum, aborting.\n");
+			return -EINVAL;
+		}
+	}
+	return 0;
 }
 
 /*
  * Get microcode matching with BSP's model. Only CPUs with the same model as
  * BSP can stay in the platform.
  */
-static enum ucode_state __init
-get_matching_model_microcode(unsigned long start, void *data, size_t size,
-			     struct mc_saved_data *mcs, unsigned long *mc_ptrs,
-			     struct ucode_cpu_info *uci)
+static struct microcode_intel *
+scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save)
 {
-	struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
 	struct microcode_header_intel *mc_header;
-	unsigned int num_saved = mcs->num_saved;
-	enum ucode_state state = UCODE_OK;
-	unsigned int leftover = size;
-	u8 *ucode_ptr = data;
+	struct microcode_intel *patch = NULL;
 	unsigned int mc_size;
-	int i;
-
-	while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) {
 
-		if (leftover < sizeof(mc_header))
+	while (size) {
+		if (size < sizeof(struct microcode_header_intel))
 			break;
 
-		mc_header = (struct microcode_header_intel *)ucode_ptr;
+		mc_header = (struct microcode_header_intel *)data;
 
 		mc_size = get_totalsize(mc_header);
-		if (!mc_size || mc_size > leftover ||
-			microcode_sanity_check(ucode_ptr, 0) < 0)
+		if (!mc_size ||
+		    mc_size > size ||
+		    microcode_sanity_check(data, 0) < 0)
 			break;
 
-		leftover -= mc_size;
+		size -= mc_size;
 
-		/*
-		 * Since APs with same family and model as the BSP may boot in
-		 * the platform, we need to find and save microcode patches
-		 * with the same family and model as the BSP.
-		 */
-		if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != UCODE_OK) {
-			ucode_ptr += mc_size;
+		if (!microcode_matches(mc_header, uci->cpu_sig.sig)) {
+			data += mc_size;
 			continue;
 		}
 
-		num_saved = _save_mc(mc_saved_tmp, ucode_ptr, num_saved);
+		if (save) {
+			save_microcode_patch(data, mc_size);
+			goto next;
+		}
 
-		ucode_ptr += mc_size;
-	}
 
-	if (leftover) {
-		state = UCODE_ERROR;
-		return state;
-	}
+		if (!patch) {
+			if (!has_newer_microcode(data,
+						 uci->cpu_sig.sig,
+						 uci->cpu_sig.pf,
+						 uci->cpu_sig.rev))
+				goto next;
+
+		} else {
+			struct microcode_header_intel *phdr = &patch->hdr;
+
+			if (!has_newer_microcode(data,
+						 phdr->sig,
+						 phdr->pf,
+						 phdr->rev))
+				goto next;
+		}
 
-	if (!num_saved) {
-		state = UCODE_NFOUND;
-		return state;
+		/* We have a newer patch, save it. */
+		patch = data;
+
+next:
+		data += mc_size;
 	}
 
-	for (i = 0; i < num_saved; i++)
-		mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start;
+	if (size)
+		return NULL;
+
+	return patch;
+}
 
-	mcs->num_saved = num_saved;
+static void cpuid_1(void)
+{
+	/*
+	 * According to the Intel SDM, Volume 3, 9.11.7:
+	 *
+	 *   CPUID returns a value in a model specific register in
+	 *   addition to its usual register return values. The
+	 *   semantics of CPUID cause it to deposit an update ID value
+	 *   in the 64-bit model-specific register at address 08BH
+	 *   (IA32_BIOS_SIGN_ID). If no update is present in the
+	 *   processor, the value in the MSR remains unmodified.
+	 *
+	 * Use native_cpuid -- this code runs very early and we don't
+	 * want to mess with paravirt.
+	 */
+	unsigned int eax = 1, ebx, ecx = 0, edx;
 
-	return state;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
 }
 
 static int collect_cpu_info_early(struct ucode_cpu_info *uci)
 {
 	unsigned int val[2];
 	unsigned int family, model;
-	struct cpu_signature csig;
+	struct cpu_signature csig = { 0 };
 	unsigned int eax, ebx, ecx, edx;
 
-	csig.sig = 0;
-	csig.pf = 0;
-	csig.rev = 0;
-
 	memset(uci, 0, sizeof(*uci));
 
 	eax = 0x00000001;
@@ -374,8 +402,8 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
 	native_cpuid(&eax, &ebx, &ecx, &edx);
 	csig.sig = eax;
 
-	family = x86_family(csig.sig);
-	model  = x86_model(csig.sig);
+	family = x86_family(eax);
+	model  = x86_model(eax);
 
 	if ((model >= 5) || (family > 6)) {
 		/* get processor flags from MSR 0x17 */
@@ -385,7 +413,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
 	native_wrmsrl(MSR_IA32_UCODE_REV, 0);
 
 	/* As documented in the SDM: Do a CPUID 1 here */
-	sync_core();
+	cpuid_1();
 
 	/* get the current revision from MSR 0x8B */
 	native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
@@ -401,40 +429,41 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
 static void show_saved_mc(void)
 {
 #ifdef DEBUG
-	int i, j;
+	int i = 0, j;
 	unsigned int sig, pf, rev, total_size, data_size, date;
 	struct ucode_cpu_info uci;
+	struct ucode_patch *p;
 
-	if (!mc_saved_data.num_saved) {
+	if (list_empty(&microcode_cache)) {
 		pr_debug("no microcode data saved.\n");
 		return;
 	}
-	pr_debug("Total microcode saved: %d\n", mc_saved_data.num_saved);
 
 	collect_cpu_info_early(&uci);
 
-	sig = uci.cpu_sig.sig;
-	pf = uci.cpu_sig.pf;
-	rev = uci.cpu_sig.rev;
+	sig	= uci.cpu_sig.sig;
+	pf	= uci.cpu_sig.pf;
+	rev	= uci.cpu_sig.rev;
 	pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
 
-	for (i = 0; i < mc_saved_data.num_saved; i++) {
+	list_for_each_entry(p, &microcode_cache, plist) {
 		struct microcode_header_intel *mc_saved_header;
 		struct extended_sigtable *ext_header;
-		int ext_sigcount;
 		struct extended_signature *ext_sig;
+		int ext_sigcount;
+
+		mc_saved_header = (struct microcode_header_intel *)p->data;
 
-		mc_saved_header = (struct microcode_header_intel *)
-				  mc_saved_data.mc_saved[i];
-		sig = mc_saved_header->sig;
-		pf = mc_saved_header->pf;
-		rev = mc_saved_header->rev;
-		total_size = get_totalsize(mc_saved_header);
-		data_size = get_datasize(mc_saved_header);
-		date = mc_saved_header->date;
+		sig	= mc_saved_header->sig;
+		pf	= mc_saved_header->pf;
+		rev	= mc_saved_header->rev;
+		date	= mc_saved_header->date;
+
+		total_size	= get_totalsize(mc_saved_header);
+		data_size	= get_datasize(mc_saved_header);
 
 		pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, total size=0x%x, date = %04x-%02x-%02x\n",
-			 i, sig, pf, rev, total_size,
+			 i++, sig, pf, rev, total_size,
 			 date & 0xffff,
 			 date >> 24,
 			 (date >> 16) & 0xff);
@@ -443,7 +472,7 @@ static void show_saved_mc(void)
 		if (total_size <= data_size + MC_HEADER_SIZE)
 			continue;
 
-		ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE;
+		ext_header = (void *)mc_saved_header + data_size + MC_HEADER_SIZE;
 		ext_sigcount = ext_header->count;
 		ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
 
@@ -456,85 +485,43 @@ static void show_saved_mc(void)
 
 			ext_sig++;
 		}
-
 	}
 #endif
 }
 
 /*
- * Save this mc into mc_saved_data. So it will be loaded early when a CPU is
- * hot added or resumes.
- *
- * Please make sure this mc should be a valid microcode patch before calling
- * this function.
+ * Save this microcode patch. It will be loaded early when a CPU is
+ * hot-added or resumes.
  */
-static void save_mc_for_early(u8 *mc)
+static void save_mc_for_early(u8 *mc, unsigned int size)
 {
 #ifdef CONFIG_HOTPLUG_CPU
 	/* Synchronization during CPU hotplug. */
 	static DEFINE_MUTEX(x86_cpu_microcode_mutex);
 
-	struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
-	unsigned int mc_saved_count_init;
-	unsigned int num_saved;
-	struct microcode_intel **mc_saved;
-	int ret, i;
-
 	mutex_lock(&x86_cpu_microcode_mutex);
 
-	mc_saved_count_init = mc_saved_data.num_saved;
-	num_saved = mc_saved_data.num_saved;
-	mc_saved = mc_saved_data.mc_saved;
-
-	if (mc_saved && num_saved)
-		memcpy(mc_saved_tmp, mc_saved,
-		       num_saved * sizeof(struct microcode_intel *));
-	/*
-	 * Save the microcode patch mc in mc_save_tmp structure if it's a newer
-	 * version.
-	 */
-	num_saved = _save_mc(mc_saved_tmp, mc, num_saved);
-
-	/*
-	 * Save the mc_save_tmp in global mc_saved_data.
-	 */
-	ret = save_microcode(&mc_saved_data, mc_saved_tmp, num_saved);
-	if (ret) {
-		pr_err("Cannot save microcode patch.\n");
-		goto out;
-	}
-
+	save_microcode_patch(mc, size);
 	show_saved_mc();
 
-	/*
-	 * Free old saved microcode data.
-	 */
-	if (mc_saved) {
-		for (i = 0; i < mc_saved_count_init; i++)
-			kfree(mc_saved[i]);
-		kfree(mc_saved);
-	}
-
-out:
 	mutex_unlock(&x86_cpu_microcode_mutex);
 #endif
 }
 
-static bool __init load_builtin_intel_microcode(struct cpio_data *cp)
+static bool load_builtin_intel_microcode(struct cpio_data *cp)
 {
-#ifdef CONFIG_X86_64
-	unsigned int eax = 0x00000001, ebx, ecx = 0, edx;
+	unsigned int eax = 1, ebx, ecx = 0, edx;
 	char name[30];
 
+	if (IS_ENABLED(CONFIG_X86_32))
+		return false;
+
 	native_cpuid(&eax, &ebx, &ecx, &edx);
 
 	sprintf(name, "intel-ucode/%02x-%02x-%02x",
 		      x86_family(eax), x86_model(eax), x86_stepping(eax));
 
 	return get_builtin_firmware(cp, name);
-#else
-	return false;
-#endif
 }
 
 /*
@@ -570,8 +557,7 @@ void show_ucode_info_early(void)
 }
 
 /*
- * At this point, we can not call printk() yet. Keep microcode patch number in
- * mc_saved_data.mc_saved and delay printing microcode info in
+ * At this point, we can not call printk() yet. Delay printing microcode info in
  * show_ucode_info_early() until printk() works.
  */
 static void print_ucode(struct ucode_cpu_info *uci)
@@ -627,7 +613,7 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
 	native_wrmsrl(MSR_IA32_UCODE_REV, 0);
 
 	/* As documented in the SDM: Do a CPUID 1 here */
-	sync_core();
+	cpuid_1();
 
 	/* get the current revision from MSR 0x8B */
 	native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
@@ -648,206 +634,140 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
 	return 0;
 }
 
-/*
- * This function converts microcode patch offsets previously stored in
- * mc_tmp_ptrs to pointers and stores the pointers in mc_saved_data.
- */
 int __init save_microcode_in_initrd_intel(void)
 {
-	struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
-	unsigned int count = mc_saved_data.num_saved;
-	unsigned long offset = 0;
-	int ret;
-
-	if (!count)
-		return 0;
+	struct ucode_cpu_info uci;
+	struct cpio_data cp;
 
 	/*
-	 * We have found a valid initrd but it might've been relocated in the
-	 * meantime so get its updated address.
+	 * AP loading didn't find any microcode patch, no need to save anything.
 	 */
-	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && blobs.valid)
-		offset = initrd_start;
-
-	copy_ptrs(mc_saved, mc_tmp_ptrs, offset, count);
+	if (!intel_ucode_patch || IS_ERR(intel_ucode_patch))
+		return 0;
 
-	ret = save_microcode(&mc_saved_data, mc_saved, count);
-	if (ret)
-		pr_err("Cannot save microcode patches from initrd.\n");
-	else
-		show_saved_mc();
+	if (!load_builtin_intel_microcode(&cp))
+		cp = find_microcode_in_initrd(ucode_path, false);
 
-	return ret;
-}
+	if (!(cp.data && cp.size))
+		return 0;
 
-static __init enum ucode_state
-__scan_microcode_initrd(struct cpio_data *cd, struct ucode_blobs *blbp)
-{
-#ifdef CONFIG_BLK_DEV_INITRD
-	static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
-	char *p = IS_ENABLED(CONFIG_X86_32) ? (char *)__pa_nodebug(ucode_name)
-						    : ucode_name;
-# ifdef CONFIG_X86_32
-	unsigned long start = 0, size;
-	struct boot_params *params;
+	collect_cpu_info_early(&uci);
 
-	params = (struct boot_params *)__pa_nodebug(&boot_params);
-	size   = params->hdr.ramdisk_size;
+	scan_microcode(cp.data, cp.size, &uci, true);
 
-	/*
-	 * Set start only if we have an initrd image. We cannot use initrd_start
-	 * because it is not set that early yet.
-	 */
-	start = (size ? params->hdr.ramdisk_image : 0);
+	show_saved_mc();
 
-# else /* CONFIG_X86_64 */
-	unsigned long start = 0, size;
+	return 0;
+}
 
-	size  = (u64)boot_params.ext_ramdisk_size << 32;
-	size |= boot_params.hdr.ramdisk_size;
 
-	if (size) {
-		start  = (u64)boot_params.ext_ramdisk_image << 32;
-		start |= boot_params.hdr.ramdisk_image;
+/*
+ * @res_patch, output: a pointer to the patch we found.
+ */
+static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci)
+{
+	static const char *path;
+	struct cpio_data cp;
+	bool use_pa;
 
-		start += PAGE_OFFSET;
+	if (IS_ENABLED(CONFIG_X86_32)) {
+		path	  = (const char *)__pa_nodebug(ucode_path);
+		use_pa	  = true;
+	} else {
+		path	  = ucode_path;
+		use_pa	  = false;
 	}
-# endif
-
-	*cd = find_cpio_data(p, (void *)start, size, NULL);
-	if (cd->data) {
-		blbp->start = start;
-		blbp->valid = true;
 
-		return UCODE_OK;
-	} else
-#endif /* CONFIG_BLK_DEV_INITRD */
-		return UCODE_ERROR;
-}
+	/* try built-in microcode first */
+	if (!load_builtin_intel_microcode(&cp))
+		cp = find_microcode_in_initrd(path, use_pa);
 
-static __init enum ucode_state
-scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
-	       struct ucode_cpu_info *uci, struct ucode_blobs *blbp)
-{
-	struct cpio_data cd = { NULL, 0, "" };
-	enum ucode_state ret;
+	if (!(cp.data && cp.size))
+		return NULL;
 
-	/* try built-in microcode first */
-	if (load_builtin_intel_microcode(&cd))
-		/*
-		 * Invalidate blobs as we might've gotten an initrd too,
-		 * supplied by the boot loader, by mistake or simply forgotten
-		 * there. That's fine, we ignore it since we've found builtin
-		 * microcode already.
-		 */
-		blbp->valid = false;
-	else {
-		ret = __scan_microcode_initrd(&cd, blbp);
-		if (ret != UCODE_OK)
-			return ret;
-	}
+	collect_cpu_info_early(uci);
 
-	return get_matching_model_microcode(blbp->start, cd.data, cd.size,
-					    mcs, mc_ptrs, uci);
+	return scan_microcode(cp.data, cp.size, uci, false);
 }
 
-static void __init
-_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
-		      struct ucode_blobs *blbp)
+void __init load_ucode_intel_bsp(void)
 {
+	struct microcode_intel *patch;
 	struct ucode_cpu_info uci;
-	enum ucode_state ret;
 
-	collect_cpu_info_early(&uci);
-
-	ret = scan_microcode(mcs, mc_ptrs, &uci, blbp);
-	if (ret != UCODE_OK)
+	patch = __load_ucode_intel(&uci);
+	if (!patch)
 		return;
 
-	ret = load_microcode(mcs, mc_ptrs, blbp->start, &uci);
-	if (ret != UCODE_OK)
-		return;
+	uci.mc = patch;
 
 	apply_microcode_early(&uci, true);
 }
 
-void __init load_ucode_intel_bsp(void)
+void load_ucode_intel_ap(void)
 {
-	struct ucode_blobs *blobs_p;
-	struct mc_saved_data *mcs;
-	unsigned long *ptrs;
+	struct microcode_intel *patch, **iup;
+	struct ucode_cpu_info uci;
 
-#ifdef CONFIG_X86_32
-	mcs	= (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
-	ptrs	= (unsigned long *)__pa_nodebug(&mc_tmp_ptrs);
-	blobs_p	= (struct ucode_blobs *)__pa_nodebug(&blobs);
-#else
-	mcs	= &mc_saved_data;
-	ptrs	= mc_tmp_ptrs;
-	blobs_p = &blobs;
-#endif
+	if (IS_ENABLED(CONFIG_X86_32))
+		iup = (struct microcode_intel **) __pa_nodebug(&intel_ucode_patch);
+	else
+		iup = &intel_ucode_patch;
+
+reget:
+	if (!*iup) {
+		patch = __load_ucode_intel(&uci);
+		if (!patch)
+			return;
+
+		*iup = patch;
+	}
+
+	uci.mc = *iup;
 
-	_load_ucode_intel_bsp(mcs, ptrs, blobs_p);
+	if (apply_microcode_early(&uci, true)) {
+		/* Mixed-silicon system? Try to refetch the proper patch: */
+		*iup = NULL;
+
+		goto reget;
+	}
 }
 
-void load_ucode_intel_ap(void)
+static struct microcode_intel *find_patch(struct ucode_cpu_info *uci)
 {
-	struct ucode_blobs *blobs_p;
-	unsigned long *ptrs, start = 0;
-	struct mc_saved_data *mcs;
-	struct ucode_cpu_info uci;
-	enum ucode_state ret;
-
-#ifdef CONFIG_X86_32
-	mcs	= (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
-	ptrs	= (unsigned long *)__pa_nodebug(mc_tmp_ptrs);
-	blobs_p	= (struct ucode_blobs *)__pa_nodebug(&blobs);
-#else
-	mcs	= &mc_saved_data;
-	ptrs	= mc_tmp_ptrs;
-	blobs_p = &blobs;
-#endif
+	struct microcode_header_intel *phdr;
+	struct ucode_patch *iter, *tmp;
 
-	/*
-	 * If there is no valid ucode previously saved in memory, no need to
-	 * update ucode on this AP.
-	 */
-	if (!mcs->num_saved)
-		return;
+	list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
 
-	if (blobs_p->valid) {
-		start = blobs_p->start;
+		phdr = (struct microcode_header_intel *)iter->data;
 
-		/*
-		 * Pay attention to CONFIG_RANDOMIZE_MEMORY=y as it shuffles
-		 * physmem mapping too and there we have the initrd.
-		 */
-		start += PAGE_OFFSET - __PAGE_OFFSET_BASE;
-	}
+		if (phdr->rev <= uci->cpu_sig.rev)
+			continue;
 
-	collect_cpu_info_early(&uci);
-	ret = load_microcode(mcs, ptrs, start, &uci);
-	if (ret != UCODE_OK)
-		return;
+		if (!find_matching_signature(phdr,
+					     uci->cpu_sig.sig,
+					     uci->cpu_sig.pf))
+			continue;
 
-	apply_microcode_early(&uci, true);
+		return iter->data;
+	}
+	return NULL;
 }
 
 void reload_ucode_intel(void)
 {
+	struct microcode_intel *p;
 	struct ucode_cpu_info uci;
-	enum ucode_state ret;
-
-	if (!mc_saved_data.num_saved)
-		return;
 
 	collect_cpu_info_early(&uci);
 
-	ret = find_microcode_patch(mc_saved_data.mc_saved,
-				   mc_saved_data.num_saved, &uci);
-	if (ret != UCODE_OK)
+	p = find_patch(&uci);
+	if (!p)
 		return;
 
+	uci.mc = p;
+
 	apply_microcode_early(&uci, false);
 }
 
@@ -879,24 +799,6 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
 	return 0;
 }
 
-/*
- * return 0 - no update found
- * return 1 - found update
- */
-static int get_matching_mc(struct microcode_intel *mc, int cpu)
-{
-	struct cpu_signature cpu_sig;
-	unsigned int csig, cpf, crev;
-
-	collect_cpu_info(cpu, &cpu_sig);
-
-	csig = cpu_sig.sig;
-	cpf = cpu_sig.pf;
-	crev = cpu_sig.rev;
-
-	return has_newer_microcode(mc, csig, cpf, crev);
-}
-
 static int apply_microcode_intel(int cpu)
 {
 	struct microcode_intel *mc;
@@ -911,23 +813,19 @@ static int apply_microcode_intel(int cpu)
 
 	uci = ucode_cpu_info + cpu;
 	mc = uci->mc;
-	if (!mc)
-		return 0;
-
-	/*
-	 * Microcode on this CPU could be updated earlier. Only apply the
-	 * microcode patch in mc when it is newer than the one on this
-	 * CPU.
-	 */
-	if (!get_matching_mc(mc, cpu))
-		return 0;
+	if (!mc) {
+		/* Look for a newer patch in our cache: */
+		mc = find_patch(uci);
+		if (!mc)
+			return 0;
+	}
 
 	/* write microcode via MSR 0x79 */
 	wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
 	wrmsrl(MSR_IA32_UCODE_REV, 0);
 
 	/* As documented in the SDM: Do a CPUID 1 here */
-	sync_core();
+	cpuid_1();
 
 	/* get the current revision from MSR 0x8B */
 	rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
@@ -962,7 +860,6 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 	u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
 	int new_rev = uci->cpu_sig.rev;
 	unsigned int leftover = size;
-	enum ucode_state state = UCODE_OK;
 	unsigned int curr_mc_size = 0;
 	unsigned int csig, cpf;
 
@@ -1015,14 +912,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 
 	if (leftover) {
 		vfree(new_mc);
-		state = UCODE_ERROR;
-		goto out;
+		return UCODE_ERROR;
 	}
 
-	if (!new_mc) {
-		state = UCODE_NFOUND;
-		goto out;
-	}
+	if (!new_mc)
+		return UCODE_NFOUND;
 
 	vfree(uci->mc);
 	uci->mc = (struct microcode_intel *)new_mc;
@@ -1032,12 +926,12 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 	 * permanent memory. So it will be loaded early when a CPU is hot added
 	 * or resumes.
 	 */
-	save_mc_for_early(new_mc);
+	save_mc_for_early(new_mc, curr_mc_size);
 
 	pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
 		 cpu, new_rev, uci->cpu_sig.rev);
-out:
-	return state;
+
+	return UCODE_OK;
 }
 
 static int get_ucode_fw(void *to, const void *from, size_t n)
@@ -1081,20 +975,11 @@ request_microcode_user(int cpu, const void __user *buf, size_t size)
 	return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
 }
 
-static void microcode_fini_cpu(int cpu)
-{
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-	vfree(uci->mc);
-	uci->mc = NULL;
-}
-
 static struct microcode_ops microcode_intel_ops = {
 	.request_microcode_user		  = request_microcode_user,
 	.request_microcode_fw             = request_microcode_fw,
 	.collect_cpu_info                 = collect_cpu_info,
 	.apply_microcode                  = apply_microcode_intel,
-	.microcode_fini_cpu               = microcode_fini_cpu,
 };
 
 struct microcode_ops * __init init_intel_microcode(void)
@@ -1109,4 +994,3 @@ struct microcode_ops * __init init_intel_microcode(void)
 
 	return &microcode_intel_ops;
 }
-
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
deleted file mode 100644
index 406cb6c0d9dd..000000000000
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- *	Intel CPU Microcode Update Driver for Linux
- *
- *	Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
- *			   H Peter Anvin" <hpa@zytor.com>
- *
- *	This driver allows to upgrade microcode on Intel processors
- *	belonging to IA-32 family - PentiumPro, Pentium II,
- *	Pentium III, Xeon, Pentium 4, etc.
- *
- *	Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
- *	Software Developer's Manual
- *	Order Number 253668 or free download from:
- *
- *	http://developer.intel.com/Assets/PDF/manual/253668.pdf
- *
- *	For more information, go to http://www.urbanmyth.org/microcode
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- *
- */
-#include <linux/firmware.h>
-#include <linux/uaccess.h>
-#include <linux/kernel.h>
-
-#include <asm/microcode_intel.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-
-static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
-					unsigned int s2, unsigned int p2)
-{
-	if (s1 != s2)
-		return false;
-
-	/* Processor flags are either both 0 ... */
-	if (!p1 && !p2)
-		return true;
-
-	/* ... or they intersect. */
-	return p1 & p2;
-}
-
-int microcode_sanity_check(void *mc, int print_err)
-{
-	unsigned long total_size, data_size, ext_table_size;
-	struct microcode_header_intel *mc_header = mc;
-	struct extended_sigtable *ext_header = NULL;
-	u32 sum, orig_sum, ext_sigcount = 0, i;
-	struct extended_signature *ext_sig;
-
-	total_size = get_totalsize(mc_header);
-	data_size = get_datasize(mc_header);
-
-	if (data_size + MC_HEADER_SIZE > total_size) {
-		if (print_err)
-			pr_err("Error: bad microcode data file size.\n");
-		return -EINVAL;
-	}
-
-	if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
-		if (print_err)
-			pr_err("Error: invalid/unknown microcode update format.\n");
-		return -EINVAL;
-	}
-
-	ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
-	if (ext_table_size) {
-		u32 ext_table_sum = 0;
-		u32 *ext_tablep;
-
-		if ((ext_table_size < EXT_HEADER_SIZE)
-		 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
-			if (print_err)
-				pr_err("Error: truncated extended signature table.\n");
-			return -EINVAL;
-		}
-
-		ext_header = mc + MC_HEADER_SIZE + data_size;
-		if (ext_table_size != exttable_size(ext_header)) {
-			if (print_err)
-				pr_err("Error: extended signature table size mismatch.\n");
-			return -EFAULT;
-		}
-
-		ext_sigcount = ext_header->count;
-
-		/*
-		 * Check extended table checksum: the sum of all dwords that
-		 * comprise a valid table must be 0.
-		 */
-		ext_tablep = (u32 *)ext_header;
-
-		i = ext_table_size / sizeof(u32);
-		while (i--)
-			ext_table_sum += ext_tablep[i];
-
-		if (ext_table_sum) {
-			if (print_err)
-				pr_warn("Bad extended signature table checksum, aborting.\n");
-			return -EINVAL;
-		}
-	}
-
-	/*
-	 * Calculate the checksum of update data and header. The checksum of
-	 * valid update data and header including the extended signature table
-	 * must be 0.
-	 */
-	orig_sum = 0;
-	i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
-	while (i--)
-		orig_sum += ((u32 *)mc)[i];
-
-	if (orig_sum) {
-		if (print_err)
-			pr_err("Bad microcode data checksum, aborting.\n");
-		return -EINVAL;
-	}
-
-	if (!ext_table_size)
-		return 0;
-
-	/*
-	 * Check extended signature checksum: 0 => valid.
-	 */
-	for (i = 0; i < ext_sigcount; i++) {
-		ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
-			  EXT_SIGNATURE_SIZE * i;
-
-		sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
-		      (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
-		if (sum) {
-			if (print_err)
-				pr_err("Bad extended signature checksum, aborting.\n");
-			return -EINVAL;
-		}
-	}
-	return 0;
-}
-
-/*
- * Returns 1 if update has been found, 0 otherwise.
- */
-int find_matching_signature(void *mc, unsigned int csig, int cpf)
-{
-	struct microcode_header_intel *mc_hdr = mc;
-	struct extended_sigtable *ext_hdr;
-	struct extended_signature *ext_sig;
-	int i;
-
-	if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
-		return 1;
-
-	/* Look for ext. headers: */
-	if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
-		return 0;
-
-	ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
-	ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
-
-	for (i = 0; i < ext_hdr->count; i++) {
-		if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
-			return 1;
-		ext_sig++;
-	}
-	return 0;
-}
-
-/*
- * Returns 1 if update has been found, 0 otherwise.
- */
-int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
-{
-	struct microcode_header_intel *mc_hdr = mc;
-
-	if (mc_hdr->rev <= new_rev)
-		return 0;
-
-	return find_matching_signature(mc, csig, cpf);
-}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 8f44c5a50ab8..f37e02e41a77 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -25,12 +25,12 @@
 #include <asm/hyperv.h>
 #include <asm/mshyperv.h>
 #include <asm/desc.h>
-#include <asm/idle.h>
 #include <asm/irq_regs.h>
 #include <asm/i8259.h>
 #include <asm/apic.h>
 #include <asm/timer.h>
 #include <asm/reboot.h>
+#include <asm/nmi.h>
 
 struct ms_hyperv_info ms_hyperv;
 EXPORT_SYMBOL_GPL(ms_hyperv);
@@ -158,6 +158,26 @@ static unsigned char hv_get_nmi_reason(void)
 	return 0;
 }
 
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes
+ * it dificult to process CHANNELMSG_UNLOAD in case of crash. Handle
+ * unknown NMI on the first CPU which gets it.
+ */
+static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
+{
+	static atomic_t nmi_cpu = ATOMIC_INIT(-1);
+
+	if (!unknown_nmi_panic)
+		return NMI_DONE;
+
+	if (atomic_cmpxchg(&nmi_cpu, -1, raw_smp_processor_id()) != -1)
+		return NMI_HANDLED;
+
+	return NMI_DONE;
+}
+#endif
+
 static void __init ms_hyperv_init_platform(void)
 {
 	/*
@@ -183,6 +203,9 @@ static void __init ms_hyperv_init_platform(void)
 		pr_info("HyperV: LAPIC Timer Frequency: %#x\n",
 			lapic_timer_frequency);
 	}
+
+	register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST,
+			     "hv_nmi_unknown");
 #endif
 
 	if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE)
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 1db8dc490b66..d9794060fe22 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -17,11 +17,20 @@ struct cpuid_bit {
 	u32 sub_leaf;
 };
 
-enum cpuid_regs {
-	CR_EAX = 0,
-	CR_ECX,
-	CR_EDX,
-	CR_EBX
+/* Please keep the leaf sorted by cpuid_bit.level for faster search. */
+static const struct cpuid_bit cpuid_bits[] = {
+	{ X86_FEATURE_APERFMPERF,       CPUID_ECX,  0, 0x00000006, 0 },
+	{ X86_FEATURE_EPB,		CPUID_ECX,  3, 0x00000006, 0 },
+	{ X86_FEATURE_INTEL_PT,		CPUID_EBX, 25, 0x00000007, 0 },
+	{ X86_FEATURE_AVX512_4VNNIW,    CPUID_EDX,  2, 0x00000007, 0 },
+	{ X86_FEATURE_AVX512_4FMAPS,    CPUID_EDX,  3, 0x00000007, 0 },
+	{ X86_FEATURE_CAT_L3,		CPUID_EBX,  1, 0x00000010, 0 },
+	{ X86_FEATURE_CAT_L2,		CPUID_EBX,  2, 0x00000010, 0 },
+	{ X86_FEATURE_CDP_L3,		CPUID_ECX,  2, 0x00000010, 1 },
+	{ X86_FEATURE_HW_PSTATE,	CPUID_EDX,  7, 0x80000007, 0 },
+	{ X86_FEATURE_CPB,		CPUID_EDX,  9, 0x80000007, 0 },
+	{ X86_FEATURE_PROC_FEEDBACK,    CPUID_EDX, 11, 0x80000007, 0 },
+	{ 0, 0, 0, 0, 0 }
 };
 
 void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
@@ -30,18 +39,6 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 	u32 regs[4];
 	const struct cpuid_bit *cb;
 
-	static const struct cpuid_bit cpuid_bits[] = {
-		{ X86_FEATURE_INTEL_PT,		CR_EBX,25, 0x00000007, 0 },
-		{ X86_FEATURE_AVX512_4VNNIW,	CR_EDX, 2, 0x00000007, 0 },
-		{ X86_FEATURE_AVX512_4FMAPS,	CR_EDX, 3, 0x00000007, 0 },
-		{ X86_FEATURE_APERFMPERF,	CR_ECX, 0, 0x00000006, 0 },
-		{ X86_FEATURE_EPB,		CR_ECX, 3, 0x00000006, 0 },
-		{ X86_FEATURE_HW_PSTATE,	CR_EDX, 7, 0x80000007, 0 },
-		{ X86_FEATURE_CPB,		CR_EDX, 9, 0x80000007, 0 },
-		{ X86_FEATURE_PROC_FEEDBACK,	CR_EDX,11, 0x80000007, 0 },
-		{ 0, 0, 0, 0, 0 }
-	};
-
 	for (cb = cpuid_bits; cb->feature; cb++) {
 
 		/* Verify that the level is valid */
@@ -50,10 +47,35 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 		    max_level > (cb->level | 0xffff))
 			continue;
 
-		cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
-			    &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
+		cpuid_count(cb->level, cb->sub_leaf, &regs[CPUID_EAX],
+			    &regs[CPUID_EBX], &regs[CPUID_ECX],
+			    &regs[CPUID_EDX]);
 
 		if (regs[cb->reg] & (1 << cb->bit))
 			set_cpu_cap(c, cb->feature);
 	}
 }
+
+u32 get_scattered_cpuid_leaf(unsigned int level, unsigned int sub_leaf,
+			     enum cpuid_regs_idx reg)
+{
+	const struct cpuid_bit *cb;
+	u32 cpuid_val = 0;
+
+	for (cb = cpuid_bits; cb->feature; cb++) {
+
+		if (level > cb->level)
+			continue;
+
+		if (level < cb->level)
+			break;
+
+		if (reg == cb->reg && sub_leaf == cb->sub_leaf) {
+			if (cpu_has(&boot_cpu_data, cb->feature))
+				cpuid_val |= BIT(cb->bit);
+		}
+	}
+
+	return cpuid_val;
+}
+EXPORT_SYMBOL_GPL(get_scattered_cpuid_leaf);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 5130985b758b..891f4dad7b2c 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,11 +24,16 @@
 #include <linux/dmi.h>
 #include <linux/init.h>
 #include <linux/export.h>
+#include <linux/clocksource.h>
 #include <asm/div64.h>
 #include <asm/x86_init.h>
 #include <asm/hypervisor.h>
 #include <asm/timer.h>
 #include <asm/apic.h>
+#include <asm/timer.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt)	"vmware: " fmt
 
 #define CPUID_VMWARE_INFO_LEAF	0x40000000
 #define VMWARE_HYPERVISOR_MAGIC	0x564D5868
@@ -48,6 +53,8 @@
 			"2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) :	\
 			"memory");
 
+static unsigned long vmware_tsc_khz __ro_after_init;
+
 static inline int __vmware_platform(void)
 {
 	uint32_t eax, ebx, ecx, edx;
@@ -57,35 +64,80 @@ static inline int __vmware_platform(void)
 
 static unsigned long vmware_get_tsc_khz(void)
 {
-	uint64_t tsc_hz, lpj;
-	uint32_t eax, ebx, ecx, edx;
+	return vmware_tsc_khz;
+}
 
-	VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
+#ifdef CONFIG_PARAVIRT
+static struct cyc2ns_data vmware_cyc2ns __ro_after_init;
+static int vmw_sched_clock __initdata = 1;
 
-	tsc_hz = eax | (((uint64_t)ebx) << 32);
-	do_div(tsc_hz, 1000);
-	BUG_ON(tsc_hz >> 32);
-	pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
-			 (unsigned long) tsc_hz / 1000,
-			 (unsigned long) tsc_hz % 1000);
-
-	if (!preset_lpj) {
-		lpj = ((u64)tsc_hz * 1000);
-		do_div(lpj, HZ);
-		preset_lpj = lpj;
-	}
+static __init int setup_vmw_sched_clock(char *s)
+{
+	vmw_sched_clock = 0;
+	return 0;
+}
+early_param("no-vmw-sched-clock", setup_vmw_sched_clock);
+
+static unsigned long long vmware_sched_clock(void)
+{
+	unsigned long long ns;
 
-	return tsc_hz;
+	ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul,
+			     vmware_cyc2ns.cyc2ns_shift);
+	ns -= vmware_cyc2ns.cyc2ns_offset;
+	return ns;
 }
 
+static void __init vmware_sched_clock_setup(void)
+{
+	struct cyc2ns_data *d = &vmware_cyc2ns;
+	unsigned long long tsc_now = rdtsc();
+
+	clocks_calc_mult_shift(&d->cyc2ns_mul, &d->cyc2ns_shift,
+			       vmware_tsc_khz, NSEC_PER_MSEC, 0);
+	d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul,
+					   d->cyc2ns_shift);
+
+	pv_time_ops.sched_clock = vmware_sched_clock;
+	pr_info("using sched offset of %llu ns\n", d->cyc2ns_offset);
+}
+
+static void __init vmware_paravirt_ops_setup(void)
+{
+	pv_info.name = "VMware hypervisor";
+	pv_cpu_ops.io_delay = paravirt_nop;
+
+	if (vmware_tsc_khz && vmw_sched_clock)
+		vmware_sched_clock_setup();
+}
+#else
+#define vmware_paravirt_ops_setup() do {} while (0)
+#endif
+
 static void __init vmware_platform_setup(void)
 {
 	uint32_t eax, ebx, ecx, edx;
+	uint64_t lpj, tsc_khz;
 
 	VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
 
 	if (ebx != UINT_MAX) {
+		lpj = tsc_khz = eax | (((uint64_t)ebx) << 32);
+		do_div(tsc_khz, 1000);
+		WARN_ON(tsc_khz >> 32);
+		pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
+			(unsigned long) tsc_khz / 1000,
+			(unsigned long) tsc_khz % 1000);
+
+		if (!preset_lpj) {
+			do_div(lpj, HZ);
+			preset_lpj = lpj;
+		}
+
+		vmware_tsc_khz = tsc_khz;
 		x86_platform.calibrate_tsc = vmware_get_tsc_khz;
+		x86_platform.calibrate_cpu = vmware_get_tsc_khz;
+
 #ifdef CONFIG_X86_LOCAL_APIC
 		/* Skip lapic calibration since we know the bus frequency. */
 		lapic_timer_frequency = ecx / HZ;
@@ -96,6 +148,8 @@ static void __init vmware_platform_setup(void)
 		pr_warn("Failed to get TSC freq from the hypervisor\n");
 	}
 
+	vmware_paravirt_ops_setup();
+
 #ifdef CONFIG_X86_IO_APIC
 	no_timer_check = 1;
 #endif