10 files changed, 1193 insertions, 29 deletions
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index d15a7fe51618..c1799dd1d0d9 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -403,6 +403,7 @@ config CPU_V7M
 	bool
 	select CPU_32v7M
 	select CPU_ABRT_NOMMU
+	select CPU_CACHE_V7M
 	select CPU_CACHE_NOP
 	select CPU_PABRT_LEGACY
 	select CPU_THUMBONLY
@@ -518,6 +519,9 @@ config CPU_CACHE_VIPT
 config CPU_CACHE_FA
 	bool
 
+config CPU_CACHE_V7M
+	bool
+
 if MMU
 # The copy-page model
 config CPU_COPY_V4WT
@@ -750,14 +754,14 @@ config CPU_HIGH_VECTOR
 
 config CPU_ICACHE_DISABLE
 	bool "Disable I-Cache (I-bit)"
-	depends on CPU_CP15 && !(CPU_ARM720T || CPU_ARM740T || CPU_XSCALE || CPU_XSC3)
+	depends on (CPU_CP15 && !(CPU_ARM720T || CPU_ARM740T || CPU_XSCALE || CPU_XSC3)) || CPU_V7M
 	help
 	  Say Y here to disable the processor instruction cache. Unless
 	  you have a reason not to or are unsure, say N.
 
 config CPU_DCACHE_DISABLE
 	bool "Disable D-Cache (C-bit)"
-	depends on CPU_CP15 && !SMP
+	depends on (CPU_CP15 && !SMP) || CPU_V7M
 	help
 	  Say Y here to disable the processor data cache. Unless
 	  you have a reason not to or are unsure, say N.
@@ -792,7 +796,7 @@ config CPU_CACHE_ROUND_ROBIN
 
 config CPU_BPREDICT_DISABLE
 	bool "Disable branch prediction"
-	depends on CPU_ARM1020 || CPU_V6 || CPU_V6K || CPU_MOHAWK || CPU_XSC3 || CPU_V7 || CPU_FA526
+	depends on CPU_ARM1020 || CPU_V6 || CPU_V6K || CPU_MOHAWK || CPU_XSC3 || CPU_V7 || CPU_FA526 || CPU_V7M
 	help
 	  Say Y here to disable branch prediction.  If unsure, say N.
 
@@ -916,6 +920,13 @@ config CACHE_L2X0
 	help
 	  This option enables the L2x0 PrimeCell.
 
+config CACHE_L2X0_PMU
+	bool "L2x0 performance monitor support" if CACHE_L2X0
+	depends on PERF_EVENTS
+	help
+	  This option enables support for the performance monitoring features
+	  of the L220 and PL310 outer cache controllers.
+
 if CACHE_L2X0
 
 config PL310_ERRATA_588369
diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
index 7f76d96ce546..e8698241ece9 100644
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -43,9 +43,11 @@ obj-$(CONFIG_CPU_CACHE_V6)	+= cache-v6.o
 obj-$(CONFIG_CPU_CACHE_V7)	+= cache-v7.o
 obj-$(CONFIG_CPU_CACHE_FA)	+= cache-fa.o
 obj-$(CONFIG_CPU_CACHE_NOP)	+= cache-nop.o
+obj-$(CONFIG_CPU_CACHE_V7M)	+= cache-v7m.o
 
 AFLAGS_cache-v6.o	:=-Wa,-march=armv6
 AFLAGS_cache-v7.o	:=-Wa,-march=armv7-a
+AFLAGS_cache-v7m.o	:=-Wa,-march=armv7-m
 
 obj-$(CONFIG_CPU_COPY_V4WT)	+= copypage-v4wt.o
 obj-$(CONFIG_CPU_COPY_V4WB)	+= copypage-v4wb.o
@@ -101,6 +103,7 @@ AFLAGS_proc-v7.o	:=-Wa,-march=armv7-a
 obj-$(CONFIG_OUTER_CACHE)	+= l2c-common.o
 obj-$(CONFIG_CACHE_FEROCEON_L2)	+= cache-feroceon-l2.o
 obj-$(CONFIG_CACHE_L2X0)	+= cache-l2x0.o l2c-l2x0-resume.o
+obj-$(CONFIG_CACHE_L2X0_PMU)	+= cache-l2x0-pmu.o
 obj-$(CONFIG_CACHE_XSC3L2)	+= cache-xsc3l2.o
 obj-$(CONFIG_CACHE_TAUROS2)	+= cache-tauros2.o
 obj-$(CONFIG_CACHE_UNIPHIER)	+= cache-uniphier.o
diff --git a/arch/arm/mm/cache-l2x0-pmu.c b/arch/arm/mm/cache-l2x0-pmu.c
new file mode 100644
index 000000000000..976d3057272e
--- /dev/null
+++ b/arch/arm/mm/cache-l2x0-pmu.c
@@ -0,0 +1,584 @@
+/*
+ * L220/L310 cache controller support
+ *
+ * Copyright (C) 2016 ARM Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/errno.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <linux/list.h>
+#include <linux/perf_event.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include <asm/hardware/cache-l2x0.h>
+
+#define PMU_NR_COUNTERS 2
+
+static void __iomem *l2x0_base;
+static struct pmu *l2x0_pmu;
+static cpumask_t pmu_cpu;
+
+static const char *l2x0_name;
+
+static ktime_t l2x0_pmu_poll_period;
+static struct hrtimer l2x0_pmu_hrtimer;
+
+/*
+ * The L220/PL310 PMU has two equivalent counters, Counter1 and Counter0.
+ * Registers controlling these are laid out in pairs, in descending order, i.e.
+ * the register for Counter1 comes first, followed by the register for
+ * Counter0.
+ * We ensure that idx 0 -> Counter0, and idx1 -> Counter1.
+ */
+static struct perf_event *events[PMU_NR_COUNTERS];
+
+/* Find an unused counter */
+static int l2x0_pmu_find_idx(void)
+{
+	int i;
+
+	for (i = 0; i < PMU_NR_COUNTERS; i++) {
+		if (!events[i])
+			return i;
+	}
+
+	return -1;
+}
+
+/* How many counters are allocated? */
+static int l2x0_pmu_num_active_counters(void)
+{
+	int i, cnt = 0;
+
+	for (i = 0; i < PMU_NR_COUNTERS; i++) {
+		if (events[i])
+			cnt++;
+	}
+
+	return cnt;
+}
+
+static void l2x0_pmu_counter_config_write(int idx, u32 val)
+{
+	writel_relaxed(val, l2x0_base + L2X0_EVENT_CNT0_CFG - 4 * idx);
+}
+
+static u32 l2x0_pmu_counter_read(int idx)
+{
+	return readl_relaxed(l2x0_base + L2X0_EVENT_CNT0_VAL - 4 * idx);
+}
+
+static void l2x0_pmu_counter_write(int idx, u32 val)
+{
+	writel_relaxed(val, l2x0_base + L2X0_EVENT_CNT0_VAL - 4 * idx);
+}
+
+static void __l2x0_pmu_enable(void)
+{
+	u32 val = readl_relaxed(l2x0_base + L2X0_EVENT_CNT_CTRL);
+	val |= L2X0_EVENT_CNT_CTRL_ENABLE;
+	writel_relaxed(val, l2x0_base + L2X0_EVENT_CNT_CTRL);
+}
+
+static void __l2x0_pmu_disable(void)
+{
+	u32 val = readl_relaxed(l2x0_base + L2X0_EVENT_CNT_CTRL);
+	val &= ~L2X0_EVENT_CNT_CTRL_ENABLE;
+	writel_relaxed(val, l2x0_base + L2X0_EVENT_CNT_CTRL);
+}
+
+static void l2x0_pmu_enable(struct pmu *pmu)
+{
+	if (l2x0_pmu_num_active_counters() == 0)
+		return;
+
+	__l2x0_pmu_enable();
+}
+
+static void l2x0_pmu_disable(struct pmu *pmu)
+{
+	if (l2x0_pmu_num_active_counters() == 0)
+		return;
+
+	__l2x0_pmu_disable();
+}
+
+static void warn_if_saturated(u32 count)
+{
+	if (count != 0xffffffff)
+		return;
+
+	pr_warn_ratelimited("L2X0 counter saturated. Poll period too long\n");
+}
+
+static void l2x0_pmu_event_read(struct perf_event *event)
+{
+	struct hw_perf_event *hw = &event->hw;
+	u64 prev_count, new_count, mask;
+
+	do {
+		 prev_count = local64_read(&hw->prev_count);
+		 new_count = l2x0_pmu_counter_read(hw->idx);
+	} while (local64_xchg(&hw->prev_count, new_count) != prev_count);
+
+	mask = GENMASK_ULL(31, 0);
+	local64_add((new_count - prev_count) & mask, &event->count);
+
+	warn_if_saturated(new_count);
+}
+
+static void l2x0_pmu_event_configure(struct perf_event *event)
+{
+	struct hw_perf_event *hw = &event->hw;
+
+	/*
+	 * The L2X0 counters saturate at 0xffffffff rather than wrapping, so we
+	 * will *always* lose some number of events when a counter saturates,
+	 * and have no way of detecting how many were lost.
+	 *
+	 * To minimize the impact of this, we try to maximize the period by
+	 * always starting counters at zero. To ensure that group ratios are
+	 * representative, we poll periodically to avoid counters saturating.
+	 * See l2x0_pmu_poll().
+	 */
+	local64_set(&hw->prev_count, 0);
+	l2x0_pmu_counter_write(hw->idx, 0);
+}
+
+static enum hrtimer_restart l2x0_pmu_poll(struct hrtimer *hrtimer)
+{
+	unsigned long flags;
+	int i;
+
+	local_irq_save(flags);
+	__l2x0_pmu_disable();
+
+	for (i = 0; i < PMU_NR_COUNTERS; i++) {
+		struct perf_event *event = events[i];
+
+		if (!event)
+			continue;
+
+		l2x0_pmu_event_read(event);
+		l2x0_pmu_event_configure(event);
+	}
+
+	__l2x0_pmu_enable();
+	local_irq_restore(flags);
+
+	hrtimer_forward_now(hrtimer, l2x0_pmu_poll_period);
+	return HRTIMER_RESTART;
+}
+
+
+static void __l2x0_pmu_event_enable(int idx, u32 event)
+{
+	u32 val;
+
+	val = event << L2X0_EVENT_CNT_CFG_SRC_SHIFT;
+	val |= L2X0_EVENT_CNT_CFG_INT_DISABLED;
+	l2x0_pmu_counter_config_write(idx, val);
+}
+
+static void l2x0_pmu_event_start(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hw = &event->hw;
+
+	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+		return;
+
+	if (flags & PERF_EF_RELOAD) {
+		WARN_ON_ONCE(!(hw->state & PERF_HES_UPTODATE));
+		l2x0_pmu_event_configure(event);
+	}
+
+	hw->state = 0;
+
+	__l2x0_pmu_event_enable(hw->idx, hw->config_base);
+}
+
+static void __l2x0_pmu_event_disable(int idx)
+{
+	u32 val;
+
+	val = L2X0_EVENT_CNT_CFG_SRC_DISABLED << L2X0_EVENT_CNT_CFG_SRC_SHIFT;
+	val |= L2X0_EVENT_CNT_CFG_INT_DISABLED;
+	l2x0_pmu_counter_config_write(idx, val);
+}
+
+static void l2x0_pmu_event_stop(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hw = &event->hw;
+
+	if (WARN_ON_ONCE(event->hw.state & PERF_HES_STOPPED))
+		return;
+
+	__l2x0_pmu_event_disable(hw->idx);
+
+	hw->state |= PERF_HES_STOPPED;
+
+	if (flags & PERF_EF_UPDATE) {
+		l2x0_pmu_event_read(event);
+		hw->state |= PERF_HES_UPTODATE;
+	}
+}
+
+static int l2x0_pmu_event_add(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hw = &event->hw;
+	int idx = l2x0_pmu_find_idx();
+
+	if (idx == -1)
+		return -EAGAIN;
+
+	/*
+	 * Pin the timer, so that the overflows are handled by the chosen
+	 * event->cpu (this is the same one as presented in "cpumask"
+	 * attribute).
+	 */
+	if (l2x0_pmu_num_active_counters() == 0)
+		hrtimer_start(&l2x0_pmu_hrtimer, l2x0_pmu_poll_period,
+			      HRTIMER_MODE_REL_PINNED);
+
+	events[idx] = event;
+	hw->idx = idx;
+
+	l2x0_pmu_event_configure(event);
+
+	hw->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+	if (flags & PERF_EF_START)
+		l2x0_pmu_event_start(event, 0);
+
+	return 0;
+}
+
+static void l2x0_pmu_event_del(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hw = &event->hw;
+
+	l2x0_pmu_event_stop(event, PERF_EF_UPDATE);
+
+	events[hw->idx] = NULL;
+	hw->idx = -1;
+
+	if (l2x0_pmu_num_active_counters() == 0)
+		hrtimer_cancel(&l2x0_pmu_hrtimer);
+}
+
+static bool l2x0_pmu_group_is_valid(struct perf_event *event)
+{
+	struct pmu *pmu = event->pmu;
+	struct perf_event *leader = event->group_leader;
+	struct perf_event *sibling;
+	int num_hw = 0;
+
+	if (leader->pmu == pmu)
+		num_hw++;
+	else if (!is_software_event(leader))
+		return false;
+
+	list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
+		if (sibling->pmu == pmu)
+			num_hw++;
+		else if (!is_software_event(sibling))
+			return false;
+	}
+
+	return num_hw <= PMU_NR_COUNTERS;
+}
+
+static int l2x0_pmu_event_init(struct perf_event *event)
+{
+	struct hw_perf_event *hw = &event->hw;
+
+	if (event->attr.type != l2x0_pmu->type)
+		return -ENOENT;
+
+	if (is_sampling_event(event) ||
+	    event->attach_state & PERF_ATTACH_TASK)
+		return -EINVAL;
+
+	if (event->attr.exclude_user   ||
+	    event->attr.exclude_kernel ||
+	    event->attr.exclude_hv     ||
+	    event->attr.exclude_idle   ||
+	    event->attr.exclude_host   ||
+	    event->attr.exclude_guest)
+		return -EINVAL;
+
+	if (event->cpu < 0)
+		return -EINVAL;
+
+	if (event->attr.config & ~L2X0_EVENT_CNT_CFG_SRC_MASK)
+		return -EINVAL;
+
+	hw->config_base = event->attr.config;
+
+	if (!l2x0_pmu_group_is_valid(event))
+		return -EINVAL;
+
+	event->cpu = cpumask_first(&pmu_cpu);
+
+	return 0;
+}
+
+struct l2x0_event_attribute {
+	struct device_attribute attr;
+	unsigned int config;
+	bool pl310_only;
+};
+
+#define L2X0_EVENT_ATTR(_name, _config, _pl310_only)				\
+	(&((struct l2x0_event_attribute[]) {{					\
+		.attr = __ATTR(_name, S_IRUGO, l2x0_pmu_event_show, NULL),	\
+		.config = _config,						\
+		.pl310_only = _pl310_only,					\
+	}})[0].attr.attr)
+
+#define L220_PLUS_EVENT_ATTR(_name, _config)					\
+	L2X0_EVENT_ATTR(_name, _config, false)
+
+#define PL310_EVENT_ATTR(_name, _config)					\
+	L2X0_EVENT_ATTR(_name, _config, true)
+
+static ssize_t l2x0_pmu_event_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct l2x0_event_attribute *lattr;
+
+	lattr = container_of(attr, typeof(*lattr), attr);
+	return snprintf(buf, PAGE_SIZE, "config=0x%x\n", lattr->config);
+}
+
+static umode_t l2x0_pmu_event_attr_is_visible(struct kobject *kobj,
+					      struct attribute *attr,
+					      int unused)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct l2x0_event_attribute *lattr;
+
+	lattr = container_of(attr, typeof(*lattr), attr.attr);
+
+	if (!lattr->pl310_only || strcmp("l2c_310", pmu->name) == 0)
+		return attr->mode;
+
+	return 0;
+}
+
+static struct attribute *l2x0_pmu_event_attrs[] = {
+	L220_PLUS_EVENT_ATTR(co,	0x1),
+	L220_PLUS_EVENT_ATTR(drhit,	0x2),
+	L220_PLUS_EVENT_ATTR(drreq,	0x3),
+	L220_PLUS_EVENT_ATTR(dwhit,	0x4),
+	L220_PLUS_EVENT_ATTR(dwreq,	0x5),
+	L220_PLUS_EVENT_ATTR(dwtreq,	0x6),
+	L220_PLUS_EVENT_ATTR(irhit,	0x7),
+	L220_PLUS_EVENT_ATTR(irreq,	0x8),
+	L220_PLUS_EVENT_ATTR(wa,	0x9),
+	PL310_EVENT_ATTR(ipfalloc,	0xa),
+	PL310_EVENT_ATTR(epfhit,	0xb),
+	PL310_EVENT_ATTR(epfalloc,	0xc),
+	PL310_EVENT_ATTR(srrcvd,	0xd),
+	PL310_EVENT_ATTR(srconf,	0xe),
+	PL310_EVENT_ATTR(epfrcvd,	0xf),
+	NULL
+};
+
+static struct attribute_group l2x0_pmu_event_attrs_group = {
+	.name = "events",
+	.attrs = l2x0_pmu_event_attrs,
+	.is_visible = l2x0_pmu_event_attr_is_visible,
+};
+
+static ssize_t l2x0_pmu_cpumask_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	return cpumap_print_to_pagebuf(true, buf, &pmu_cpu);
+}
+
+static struct device_attribute l2x0_pmu_cpumask_attr =
+		__ATTR(cpumask, S_IRUGO, l2x0_pmu_cpumask_show, NULL);
+
+static struct attribute *l2x0_pmu_cpumask_attrs[] = {
+	&l2x0_pmu_cpumask_attr.attr,
+	NULL,
+};
+
+static struct attribute_group l2x0_pmu_cpumask_attr_group = {
+	.attrs = l2x0_pmu_cpumask_attrs,
+};
+
+static const struct attribute_group *l2x0_pmu_attr_groups[] = {
+	&l2x0_pmu_event_attrs_group,
+	&l2x0_pmu_cpumask_attr_group,
+	NULL,
+};
+
+static void l2x0_pmu_reset(void)
+{
+	int i;
+
+	__l2x0_pmu_disable();
+
+	for (i = 0; i < PMU_NR_COUNTERS; i++)
+		__l2x0_pmu_event_disable(i);
+}
+
+static int l2x0_pmu_offline_cpu(unsigned int cpu)
+{
+	unsigned int target;
+
+	if (!cpumask_test_and_clear_cpu(cpu, &pmu_cpu))
+		return 0;
+
+	target = cpumask_any_but(cpu_online_mask, cpu);
+	if (target >= nr_cpu_ids)
+		return 0;
+
+	perf_pmu_migrate_context(l2x0_pmu, cpu, target);
+	cpumask_set_cpu(target, &pmu_cpu);
+
+	return 0;
+}
+
+void l2x0_pmu_suspend(void)
+{
+	int i;
+
+	if (!l2x0_pmu)
+		return;
+
+	l2x0_pmu_disable(l2x0_pmu);
+
+	for (i = 0; i < PMU_NR_COUNTERS; i++) {
+		if (events[i])
+			l2x0_pmu_event_stop(events[i], PERF_EF_UPDATE);
+	}
+
+}
+
+void l2x0_pmu_resume(void)
+{
+	int i;
+
+	if (!l2x0_pmu)
+		return;
+
+	l2x0_pmu_reset();
+
+	for (i = 0; i < PMU_NR_COUNTERS; i++) {
+		if (events[i])
+			l2x0_pmu_event_start(events[i], PERF_EF_RELOAD);
+	}
+
+	l2x0_pmu_enable(l2x0_pmu);
+}
+
+void __init l2x0_pmu_register(void __iomem *base, u32 part)
+{
+	/*
+	 * Determine whether we support the PMU, and choose the name for sysfs.
+	 * This is also used by l2x0_pmu_event_attr_is_visible to determine
+	 * which events to display, as the PL310 PMU supports a superset of
+	 * L220 events.
+	 *
+	 * The L210 PMU has a different programmer's interface, and is not
+	 * supported by this driver.
+	 *
+	 * We must defer registering the PMU until the perf subsystem is up and
+	 * running, so just stash the name and base, and leave that to another
+	 * initcall.
+	 */
+	switch (part & L2X0_CACHE_ID_PART_MASK) {
+	case L2X0_CACHE_ID_PART_L220:
+		l2x0_name = "l2c_220";
+		break;
+	case L2X0_CACHE_ID_PART_L310:
+		l2x0_name = "l2c_310";
+		break;
+	default:
+		return;
+	}
+
+	l2x0_base = base;
+}
+
+static __init int l2x0_pmu_init(void)
+{
+	int ret;
+
+	if (!l2x0_base)
+		return 0;
+
+	l2x0_pmu = kzalloc(sizeof(*l2x0_pmu), GFP_KERNEL);
+	if (!l2x0_pmu) {
+		pr_warn("Unable to allocate L2x0 PMU\n");
+		return -ENOMEM;
+	}
+
+	*l2x0_pmu = (struct pmu) {
+		.task_ctx_nr = perf_invalid_context,
+		.pmu_enable = l2x0_pmu_enable,
+		.pmu_disable = l2x0_pmu_disable,
+		.read = l2x0_pmu_event_read,
+		.start = l2x0_pmu_event_start,
+		.stop = l2x0_pmu_event_stop,
+		.add = l2x0_pmu_event_add,
+		.del = l2x0_pmu_event_del,
+		.event_init = l2x0_pmu_event_init,
+		.attr_groups = l2x0_pmu_attr_groups,
+	};
+
+	l2x0_pmu_reset();
+
+	/*
+	 * We always use a hrtimer rather than an interrupt.
+	 * See comments in l2x0_pmu_event_configure and l2x0_pmu_poll.
+	 *
+	 * Polling once a second allows the counters to fill up to 1/128th on a
+	 * quad-core test chip with cores clocked at 400MHz. Hopefully this
+	 * leaves sufficient headroom to avoid overflow on production silicon
+	 * at higher frequencies.
+	 */
+	l2x0_pmu_poll_period = ms_to_ktime(1000);
+	hrtimer_init(&l2x0_pmu_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	l2x0_pmu_hrtimer.function = l2x0_pmu_poll;
+
+	cpumask_set_cpu(0, &pmu_cpu);
+	ret = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_ARM_L2X0_ONLINE,
+					"AP_PERF_ARM_L2X0_ONLINE", NULL,
+					l2x0_pmu_offline_cpu);
+	if (ret)
+		goto out_pmu;
+
+	ret = perf_pmu_register(l2x0_pmu, l2x0_name, -1);
+	if (ret)
+		goto out_cpuhp;
+
+	return 0;
+
+out_cpuhp:
+	cpuhp_remove_state_nocalls(CPUHP_AP_PERF_ARM_L2X0_ONLINE);
+out_pmu:
+	kfree(l2x0_pmu);
+	l2x0_pmu = NULL;
+	return ret;
+}
+device_initcall(l2x0_pmu_init);
diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c
index cc12905ae6f8..d1870c777c6e 100644
--- a/arch/arm/mm/cache-l2x0.c
+++ b/arch/arm/mm/cache-l2x0.c
@@ -142,6 +142,8 @@ static void l2c_disable(void)
 {
 	void __iomem *base = l2x0_base;
 
+	l2x0_pmu_suspend();
+
 	outer_cache.flush_all();
 	l2c_write_sec(0, base, L2X0_CTRL);
 	dsb(st);
@@ -159,6 +161,8 @@ static void l2c_resume(void)
 	/* Do not touch the controller if already enabled. */
 	if (!(readl_relaxed(base + L2X0_CTRL) & L2X0_CTRL_EN))
 		l2c_enable(base, l2x0_data->num_lock);
+
+	l2x0_pmu_resume();
 }
 
 /*
@@ -709,9 +713,8 @@ static void __init l2c310_fixup(void __iomem *base, u32 cache_id,
 	if (revision >= L310_CACHE_ID_RTL_R3P0 &&
 	    revision < L310_CACHE_ID_RTL_R3P2) {
 		u32 val = l2x0_saved_regs.prefetch_ctrl;
-		/* I don't think bit23 is required here... but iMX6 does so */
-		if (val & (BIT(30) | BIT(23))) {
-			val &= ~(BIT(30) | BIT(23));
+		if (val & L310_PREFETCH_CTRL_DBL_LINEFILL) {
+			val &= ~L310_PREFETCH_CTRL_DBL_LINEFILL;
 			l2x0_saved_regs.prefetch_ctrl = val;
 			errata[n++] = "752271";
 		}
@@ -892,6 +895,8 @@ static int __init __l2c_init(const struct l2c_init_data *data,
 	pr_info("%s: CACHE_ID 0x%08x, AUX_CTRL 0x%08x\n",
 		data->type, cache_id, aux);
 
+	l2x0_pmu_register(l2x0_base, cache_id);
+
 	return 0;
 }
 
diff --git a/arch/arm/mm/cache-v7m.S b/arch/arm/mm/cache-v7m.S
new file mode 100644
index 000000000000..816a7e44e6f1
--- /dev/null
+++ b/arch/arm/mm/cache-v7m.S
@@ -0,0 +1,453 @@
+/*
+ *  linux/arch/arm/mm/cache-v7m.S
+ *
+ *  Based on linux/arch/arm/mm/cache-v7.S
+ *
+ *  Copyright (C) 2001 Deep Blue Solutions Ltd.
+ *  Copyright (C) 2005 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  This is the "shell" of the ARMv7M processor support.
+ */
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <asm/assembler.h>
+#include <asm/errno.h>
+#include <asm/unwind.h>
+#include <asm/v7m.h>
+
+#include "proc-macros.S"
+
+/* Generic V7M read/write macros for memory mapped cache operations */
+.macro v7m_cache_read, rt, reg
+	movw	\rt, #:lower16:BASEADDR_V7M_SCB + \reg
+	movt	\rt, #:upper16:BASEADDR_V7M_SCB + \reg
+	ldr     \rt, [\rt]
+.endm
+
+.macro v7m_cacheop, rt, tmp, op, c = al
+	movw\c	\tmp, #:lower16:BASEADDR_V7M_SCB + \op
+	movt\c	\tmp, #:upper16:BASEADDR_V7M_SCB + \op
+	str\c	\rt, [\tmp]
+.endm
+
+
+.macro	read_ccsidr, rt
+	v7m_cache_read \rt, V7M_SCB_CCSIDR
+.endm
+
+.macro read_clidr, rt
+	v7m_cache_read \rt, V7M_SCB_CLIDR
+.endm
+
+.macro	write_csselr, rt, tmp
+	v7m_cacheop \rt, \tmp, V7M_SCB_CSSELR
+.endm
+
+/*
+ * dcisw: Invalidate data cache by set/way
+ */
+.macro dcisw, rt, tmp
+	v7m_cacheop \rt, \tmp, V7M_SCB_DCISW
+.endm
+
+/*
+ * dccisw: Clean and invalidate data cache by set/way
+ */
+.macro dccisw, rt, tmp
+	v7m_cacheop \rt, \tmp, V7M_SCB_DCCISW
+.endm
+
+/*
+ * dccimvac: Clean and invalidate data cache line by MVA to PoC.
+ */
+.irp    c,,eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,hs,lo
+.macro dccimvac\c, rt, tmp
+	v7m_cacheop \rt, \tmp, V7M_SCB_DCCIMVAC, \c
+.endm
+.endr
+
+/*
+ * dcimvac: Invalidate data cache line by MVA to PoC
+ */
+.macro dcimvac, rt, tmp
+	v7m_cacheop \rt, \tmp, V7M_SCB_DCIMVAC
+.endm
+
+/*
+ * dccmvau: Clean data cache line by MVA to PoU
+ */
+.macro dccmvau, rt, tmp
+	v7m_cacheop \rt, \tmp, V7M_SCB_DCCMVAU
+.endm
+
+/*
+ * dccmvac: Clean data cache line by MVA to PoC
+ */
+.macro dccmvac,  rt, tmp
+	v7m_cacheop \rt, \tmp, V7M_SCB_DCCMVAC
+.endm
+
+/*
+ * icimvau: Invalidate instruction caches by MVA to PoU
+ */
+.macro icimvau, rt, tmp
+	v7m_cacheop \rt, \tmp, V7M_SCB_ICIMVAU
+.endm
+
+/*
+ * Invalidate the icache, inner shareable if SMP, invalidate BTB for UP.
+ * rt data ignored by ICIALLU(IS), so can be used for the address
+ */
+.macro invalidate_icache, rt
+	v7m_cacheop \rt, \rt, V7M_SCB_ICIALLU
+	mov \rt, #0
+.endm
+
+/*
+ * Invalidate the BTB, inner shareable if SMP.
+ * rt data ignored by BPIALL, so it can be used for the address
+ */
+.macro invalidate_bp, rt
+	v7m_cacheop \rt, \rt, V7M_SCB_BPIALL
+	mov \rt, #0
+.endm
+
+ENTRY(v7m_invalidate_l1)
+	mov	r0, #0
+
+	write_csselr r0, r1
+	read_ccsidr r0
+
+	movw	r1, #0x7fff
+	and	r2, r1, r0, lsr #13
+
+	movw	r1, #0x3ff
+
+	and	r3, r1, r0, lsr #3      @ NumWays - 1
+	add	r2, r2, #1              @ NumSets
+
+	and	r0, r0, #0x7
+	add	r0, r0, #4      @ SetShift
+
+	clz	r1, r3          @ WayShift
+	add	r4, r3, #1      @ NumWays
+1:	sub	r2, r2, #1      @ NumSets--
+	mov	r3, r4          @ Temp = NumWays
+2:	subs	r3, r3, #1      @ Temp--
+	mov	r5, r3, lsl r1
+	mov	r6, r2, lsl r0
+	orr	r5, r5, r6      @ Reg = (Temp<<WayShift)|(NumSets<<SetShift)
+	dcisw	r5, r6
+	bgt	2b
+	cmp	r2, #0
+	bgt	1b
+	dsb	st
+	isb
+	ret	lr
+ENDPROC(v7m_invalidate_l1)
+
+/*
+ *	v7m_flush_icache_all()
+ *
+ *	Flush the whole I-cache.
+ *
+ *	Registers:
+ *	r0 - set to 0
+ */
+ENTRY(v7m_flush_icache_all)
+	invalidate_icache r0
+	ret	lr
+ENDPROC(v7m_flush_icache_all)
+
+/*
+ *	v7m_flush_dcache_all()
+ *
+ *	Flush the whole D-cache.
+ *
+ *	Corrupted registers: r0-r7, r9-r11
+ */
+ENTRY(v7m_flush_dcache_all)
+	dmb					@ ensure ordering with previous memory accesses
+	read_clidr r0
+	mov	r3, r0, lsr #23			@ move LoC into position
+	ands	r3, r3, #7 << 1			@ extract LoC*2 from clidr
+	beq	finished			@ if loc is 0, then no need to clean
+start_flush_levels:
+	mov	r10, #0				@ start clean at cache level 0
+flush_levels:
+	add	r2, r10, r10, lsr #1		@ work out 3x current cache level
+	mov	r1, r0, lsr r2			@ extract cache type bits from clidr
+	and	r1, r1, #7			@ mask of the bits for current cache only
+	cmp	r1, #2				@ see what cache we have at this level
+	blt	skip				@ skip if no cache, or just i-cache
+#ifdef CONFIG_PREEMPT
+	save_and_disable_irqs_notrace r9	@ make cssr&csidr read atomic
+#endif
+	write_csselr r10, r1			@ set current cache level
+	isb					@ isb to sych the new cssr&csidr
+	read_ccsidr r1				@ read the new csidr
+#ifdef CONFIG_PREEMPT
+	restore_irqs_notrace r9
+#endif
+	and	r2, r1, #7			@ extract the length of the cache lines
+	add	r2, r2, #4			@ add 4 (line length offset)
+	movw	r4, #0x3ff
+	ands	r4, r4, r1, lsr #3		@ find maximum number on the way size
+	clz	r5, r4				@ find bit position of way size increment
+	movw	r7, #0x7fff
+	ands	r7, r7, r1, lsr #13		@ extract max number of the index size
+loop1:
+	mov	r9, r7				@ create working copy of max index
+loop2:
+	lsl	r6, r4, r5
+	orr	r11, r10, r6			@ factor way and cache number into r11
+	lsl	r6, r9, r2
+	orr	r11, r11, r6			@ factor index number into r11
+	dccisw	r11, r6				@ clean/invalidate by set/way
+	subs	r9, r9, #1			@ decrement the index
+	bge	loop2
+	subs	r4, r4, #1			@ decrement the way
+	bge	loop1
+skip:
+	add	r10, r10, #2			@ increment cache number
+	cmp	r3, r10
+	bgt	flush_levels
+finished:
+	mov	r10, #0				@ swith back to cache level 0
+	write_csselr r10, r3			@ select current cache level in cssr
+	dsb	st
+	isb
+	ret	lr
+ENDPROC(v7m_flush_dcache_all)
+
+/*
+ *	v7m_flush_cache_all()
+ *
+ *	Flush the entire cache system.
+ *  The data cache flush is now achieved using atomic clean / invalidates
+ *  working outwards from L1 cache. This is done using Set/Way based cache
+ *  maintenance instructions.
+ *  The instruction cache can still be invalidated back to the point of
+ *  unification in a single instruction.
+ *
+ */
+ENTRY(v7m_flush_kern_cache_all)
+	stmfd	sp!, {r4-r7, r9-r11, lr}
+	bl	v7m_flush_dcache_all
+	invalidate_icache r0
+	ldmfd	sp!, {r4-r7, r9-r11, lr}
+	ret	lr
+ENDPROC(v7m_flush_kern_cache_all)
+
+/*
+ *	v7m_flush_cache_all()
+ *
+ *	Flush all TLB entries in a particular address space
+ *
+ *	- mm    - mm_struct describing address space
+ */
+ENTRY(v7m_flush_user_cache_all)
+	/*FALLTHROUGH*/
+
+/*
+ *	v7m_flush_cache_range(start, end, flags)
+ *
+ *	Flush a range of TLB entries in the specified address space.
+ *
+ *	- start - start address (may not be aligned)
+ *	- end   - end address (exclusive, may not be aligned)
+ *	- flags	- vm_area_struct flags describing address space
+ *
+ *	It is assumed that:
+ *	- we have a VIPT cache.
+ */
+ENTRY(v7m_flush_user_cache_range)
+	ret	lr
+ENDPROC(v7m_flush_user_cache_all)
+ENDPROC(v7m_flush_user_cache_range)
+
+/*
+ *	v7m_coherent_kern_range(start,end)
+ *
+ *	Ensure that the I and D caches are coherent within specified
+ *	region.  This is typically used when code has been written to
+ *	a memory region, and will be executed.
+ *
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
+ *
+ *	It is assumed that:
+ *	- the Icache does not read data from the write buffer
+ */
+ENTRY(v7m_coherent_kern_range)
+	/* FALLTHROUGH */
+
+/*
+ *	v7m_coherent_user_range(start,end)
+ *
+ *	Ensure that the I and D caches are coherent within specified
+ *	region.  This is typically used when code has been written to
+ *	a memory region, and will be executed.
+ *
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
+ *
+ *	It is assumed that:
+ *	- the Icache does not read data from the write buffer
+ */
+ENTRY(v7m_coherent_user_range)
+ UNWIND(.fnstart		)
+	dcache_line_size r2, r3
+	sub	r3, r2, #1
+	bic	r12, r0, r3
+1:
+/*
+ * We use open coded version of dccmvau otherwise USER() would
+ * point at movw instruction.
+ */
+	dccmvau	r12, r3
+	add	r12, r12, r2
+	cmp	r12, r1
+	blo	1b
+	dsb	ishst
+	icache_line_size r2, r3
+	sub	r3, r2, #1
+	bic	r12, r0, r3
+2:
+	icimvau r12, r3
+	add	r12, r12, r2
+	cmp	r12, r1
+	blo	2b
+	invalidate_bp r0
+	dsb	ishst
+	isb
+	ret	lr
+ UNWIND(.fnend		)
+ENDPROC(v7m_coherent_kern_range)
+ENDPROC(v7m_coherent_user_range)
+
+/*
+ *	v7m_flush_kern_dcache_area(void *addr, size_t size)
+ *
+ *	Ensure that the data held in the page kaddr is written back
+ *	to the page in question.
+ *
+ *	- addr	- kernel address
+ *	- size	- region size
+ */
+ENTRY(v7m_flush_kern_dcache_area)
+	dcache_line_size r2, r3
+	add	r1, r0, r1
+	sub	r3, r2, #1
+	bic	r0, r0, r3
+1:
+	dccimvac r0, r3		@ clean & invalidate D line / unified line
+	add	r0, r0, r2
+	cmp	r0, r1
+	blo	1b
+	dsb	st
+	ret	lr
+ENDPROC(v7m_flush_kern_dcache_area)
+
+/*
+ *	v7m_dma_inv_range(start,end)
+ *
+ *	Invalidate the data cache within the specified region; we will
+ *	be performing a DMA operation in this region and we want to
+ *	purge old data in the cache.
+ *
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
+ */
+v7m_dma_inv_range:
+	dcache_line_size r2, r3
+	sub	r3, r2, #1
+	tst	r0, r3
+	bic	r0, r0, r3
+	dccimvacne r0, r3
+	subne	r3, r2, #1	@ restore r3, corrupted by v7m's dccimvac
+	tst	r1, r3
+	bic	r1, r1, r3
+	dccimvacne r1, r3
+1:
+	dcimvac r0, r3
+	add	r0, r0, r2
+	cmp	r0, r1
+	blo	1b
+	dsb	st
+	ret	lr
+ENDPROC(v7m_dma_inv_range)
+
+/*
+ *	v7m_dma_clean_range(start,end)
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
+ */
+v7m_dma_clean_range:
+	dcache_line_size r2, r3
+	sub	r3, r2, #1
+	bic	r0, r0, r3
+1:
+	dccmvac r0, r3			@ clean D / U line
+	add	r0, r0, r2
+	cmp	r0, r1
+	blo	1b
+	dsb	st
+	ret	lr
+ENDPROC(v7m_dma_clean_range)
+
+/*
+ *	v7m_dma_flush_range(start,end)
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
+ */
+ENTRY(v7m_dma_flush_range)
+	dcache_line_size r2, r3
+	sub	r3, r2, #1
+	bic	r0, r0, r3
+1:
+	dccimvac r0, r3			 @ clean & invalidate D / U line
+	add	r0, r0, r2
+	cmp	r0, r1
+	blo	1b
+	dsb	st
+	ret	lr
+ENDPROC(v7m_dma_flush_range)
+
+/*
+ *	dma_map_area(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
+ */
+ENTRY(v7m_dma_map_area)
+	add	r1, r1, r0
+	teq	r2, #DMA_FROM_DEVICE
+	beq	v7m_dma_inv_range
+	b	v7m_dma_clean_range
+ENDPROC(v7m_dma_map_area)
+
+/*
+ *	dma_unmap_area(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
+ */
+ENTRY(v7m_dma_unmap_area)
+	add	r1, r1, r0
+	teq	r2, #DMA_TO_DEVICE
+	bne	v7m_dma_inv_range
+	ret	lr
+ENDPROC(v7m_dma_unmap_area)
+
+	.globl	v7m_flush_kern_cache_louis
+	.equ	v7m_flush_kern_cache_louis, v7m_flush_kern_cache_all
+
+	__INITDATA
+
+	@ define struct cpu_cache_fns (see <asm/cacheflush.h> and proc-macros.S)
+	define_cache_functions v7m
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index c6834c0cfd1c..a2302aba5df2 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -436,7 +436,7 @@ static int __init atomic_pool_init(void)
 		gen_pool_set_algo(atomic_pool,
 				gen_pool_first_fit_order_align,
 				(void *)PAGE_SHIFT);
-		pr_info("DMA: preallocated %zd KiB pool for atomic coherent allocations\n",
+		pr_info("DMA: preallocated %zu KiB pool for atomic coherent allocations\n",
 		       atomic_pool_size / 1024);
 		return 0;
 	}
@@ -445,7 +445,7 @@ destroy_genpool:
 	gen_pool_destroy(atomic_pool);
 	atomic_pool = NULL;
 out:
-	pr_err("DMA: failed to allocate %zx KiB pool for atomic coherent allocation\n",
+	pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n",
 	       atomic_pool_size / 1024);
 	return -ENOMEM;
 }
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 62f4d01941f7..4001dd15818d 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -137,7 +137,7 @@ void __init init_default_cache_policy(unsigned long pmd)
 
 	initial_pmd_value = pmd;
 
-	pmd &= PMD_SECT_TEX(1) | PMD_SECT_BUFFERABLE | PMD_SECT_CACHEABLE;
+	pmd &= PMD_SECT_CACHE_MASK;
 
 	for (i = 0; i < ARRAY_SIZE(cache_policies); i++)
 		if (cache_policies[i].pmd == pmd) {
@@ -243,7 +243,7 @@ __setup("noalign", noalign_setup);
 #define PROT_PTE_S2_DEVICE	PROT_PTE_DEVICE
 #define PROT_SECT_DEVICE	PMD_TYPE_SECT|PMD_SECT_AP_WRITE
 
-static struct mem_type mem_types[] = {
+static struct mem_type mem_types[] __ro_after_init = {
 	[MT_DEVICE] = {		  /* Strongly ordered / ARMv6 shared device */
 		.prot_pte	= PROT_PTE_DEVICE | L_PTE_MT_DEV_SHARED |
 				  L_PTE_SHARED,
@@ -728,7 +728,8 @@ static void *__init late_alloc(unsigned long sz)
 {
 	void *ptr = (void *)__get_free_pages(PGALLOC_GFP, get_order(sz));
 
-	BUG_ON(!ptr);
+	if (!ptr || !pgtable_page_ctor(virt_to_page(ptr)))
+		BUG();
 	return ptr;
 }
 
@@ -1155,10 +1156,19 @@ void __init sanity_check_meminfo(void)
 {
 	phys_addr_t memblock_limit = 0;
 	int highmem = 0;
-	phys_addr_t vmalloc_limit = __pa(vmalloc_min - 1) + 1;
+	u64 vmalloc_limit;
 	struct memblock_region *reg;
 	bool should_use_highmem = false;
 
+	/*
+	 * Let's use our own (unoptimized) equivalent of __pa() that is
+	 * not affected by wrap-arounds when sizeof(phys_addr_t) == 4.
+	 * The result is used as the upper bound on physical memory address
+	 * and may itself be outside the valid range for which phys_addr_t
+	 * and therefore __pa() is defined.
+	 */
+	vmalloc_limit = (u64)(uintptr_t)vmalloc_min - PAGE_OFFSET + PHYS_OFFSET;
+
 	for_each_memblock(memory, reg) {
 		phys_addr_t block_start = reg->base;
 		phys_addr_t block_end = reg->base + reg->size;
@@ -1183,10 +1193,11 @@ void __init sanity_check_meminfo(void)
 			if (reg->size > size_limit) {
 				phys_addr_t overlap_size = reg->size - size_limit;
 
-				pr_notice("Truncating RAM at %pa-%pa to -%pa",
-					  &block_start, &block_end, &vmalloc_limit);
-				memblock_remove(vmalloc_limit, overlap_size);
+				pr_notice("Truncating RAM at %pa-%pa",
+					  &block_start, &block_end);
 				block_end = vmalloc_limit;
+				pr_cont(" to -%pa", &block_end);
+				memblock_remove(vmalloc_limit, overlap_size);
 				should_use_highmem = true;
 			}
 		}
diff --git a/arch/arm/mm/proc-macros.S b/arch/arm/mm/proc-macros.S
index c671f345266a..0d40c285bd86 100644
--- a/arch/arm/mm/proc-macros.S
+++ b/arch/arm/mm/proc-macros.S
@@ -7,6 +7,10 @@
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 
+#ifdef CONFIG_CPU_V7M
+#include <asm/v7m.h>
+#endif
+
 /*
  * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm)
  */
@@ -70,7 +74,13 @@
  * on ARMv7.
  */
 	.macro	dcache_line_size, reg, tmp
+#ifdef CONFIG_CPU_V7M
+	movw	\tmp, #:lower16:BASEADDR_V7M_SCB + V7M_SCB_CTR
+	movt	\tmp, #:upper16:BASEADDR_V7M_SCB + V7M_SCB_CTR
+	ldr     \tmp, [\tmp]
+#else
 	mrc	p15, 0, \tmp, c0, c0, 1		@ read ctr
+#endif
 	lsr	\tmp, \tmp, #16
 	and	\tmp, \tmp, #0xf		@ cache line size encoding
 	mov	\reg, #4			@ bytes per word
@@ -82,7 +92,13 @@
  * on ARMv7.
  */
 	.macro	icache_line_size, reg, tmp
+#ifdef CONFIG_CPU_V7M
+	movw	\tmp, #:lower16:BASEADDR_V7M_SCB + V7M_SCB_CTR
+	movt	\tmp, #:upper16:BASEADDR_V7M_SCB + V7M_SCB_CTR
+	ldr     \tmp, [\tmp]
+#else
 	mrc	p15, 0, \tmp, c0, c0, 1		@ read ctr
+#endif
 	and	\tmp, \tmp, #0xf		@ cache line size encoding
 	mov	\reg, #4			@ bytes per word
 	mov	\reg, \reg, lsl \tmp		@ actual cache line size
diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S
index a7123b4e129d..d00d52c9de3e 100644
--- a/arch/arm/mm/proc-v7.S
+++ b/arch/arm/mm/proc-v7.S
@@ -16,6 +16,7 @@
 #include <asm/hwcap.h>
 #include <asm/pgtable-hwdef.h>
 #include <asm/pgtable.h>
+#include <asm/memory.h>
 
 #include "proc-macros.S"
 
diff --git a/arch/arm/mm/proc-v7m.S b/arch/arm/mm/proc-v7m.S
index 7229d8d0be1a..f6d333f09bfe 100644
--- a/arch/arm/mm/proc-v7m.S
+++ b/arch/arm/mm/proc-v7m.S
@@ -74,14 +74,42 @@ ENTRY(cpu_v7m_do_resume)
 ENDPROC(cpu_v7m_do_resume)
 #endif
 
+ENTRY(cpu_cm7_dcache_clean_area)
+	dcache_line_size r2, r3
+	movw	r3, #:lower16:BASEADDR_V7M_SCB + V7M_SCB_DCCMVAC
+	movt	r3, #:upper16:BASEADDR_V7M_SCB + V7M_SCB_DCCMVAC
+
+1:	str	r0, [r3]		@ clean D entry
+	add	r0, r0, r2
+	subs	r1, r1, r2
+	bhi	1b
+	dsb
+	ret	lr
+ENDPROC(cpu_cm7_dcache_clean_area)
+
+ENTRY(cpu_cm7_proc_fin)
+	movw	r2, #:lower16:(BASEADDR_V7M_SCB + V7M_SCB_CCR)
+	movt	r2, #:upper16:(BASEADDR_V7M_SCB + V7M_SCB_CCR)
+	ldr	r0, [r2]
+	bic	r0, r0, #(V7M_SCB_CCR_DC | V7M_SCB_CCR_IC)
+	str	r0, [r2]
+	ret	lr
+ENDPROC(cpu_cm7_proc_fin)
+
 	.section ".text.init", #alloc, #execinstr
 
+__v7m_cm7_setup:
+	mov	r8, #(V7M_SCB_CCR_DC | V7M_SCB_CCR_IC| V7M_SCB_CCR_BP)
+	b	__v7m_setup_cont
 /*
  *	__v7m_setup
  *
  *	This should be able to cover all ARMv7-M cores.
  */
 __v7m_setup:
+	mov	r8, 0
+
+__v7m_setup_cont:
 	@ Configure the vector table base address
 	ldr	r0, =BASEADDR_V7M_SCB
 	ldr	r12, =vector_table
@@ -104,6 +132,7 @@ __v7m_setup:
 	badr	r1, 1f
 	ldr	r5, [r12, #11 * 4]	@ read the SVC vector entry
 	str	r1, [r12, #11 * 4]	@ write the temporary SVC vector entry
+	dsb
 	mov	r6, lr			@ save LR
 	ldr	sp, =init_thread_union + THREAD_START_SP
 	cpsie	i
@@ -116,15 +145,32 @@ __v7m_setup:
 	mov	r1, #1
 	msr	control, r1		@ Thread mode has unpriviledged access
 
+	@ Configure caches (if implemented)
+	teq     r8, #0
+	stmneia	r12, {r0-r6, lr}	@ v7m_invalidate_l1 touches r0-r6
+	blne	v7m_invalidate_l1
+	teq     r8, #0			@ re-evalutae condition
+	ldmneia	r12, {r0-r6, lr}
+
 	@ Configure the System Control Register to ensure 8-byte stack alignment
 	@ Note the STKALIGN bit is either RW or RAO.
-	ldr	r12, [r0, V7M_SCB_CCR]	@ system control register
-	orr	r12, #V7M_SCB_CCR_STKALIGN
-	str	r12, [r0, V7M_SCB_CCR]
+	ldr	r0, [r0, V7M_SCB_CCR]   @ system control register
+	orr	r0, #V7M_SCB_CCR_STKALIGN
+	orr	r0, r0, r8
+
 	ret	lr
 ENDPROC(__v7m_setup)
 
+/*
+ * Cortex-M7 processor functions
+ */
+	globl_equ	cpu_cm7_proc_init,	cpu_v7m_proc_init
+	globl_equ	cpu_cm7_reset,		cpu_v7m_reset
+	globl_equ	cpu_cm7_do_idle,	cpu_v7m_do_idle
+	globl_equ	cpu_cm7_switch_mm,	cpu_v7m_switch_mm
+
 	define_processor_functions v7m, dabort=nommu_early_abort, pabort=legacy_pabort, nommu=1
+	define_processor_functions cm7, dabort=nommu_early_abort, pabort=legacy_pabort, nommu=1
 
 	.section ".rodata"
 	string cpu_arch_name, "armv7m"
@@ -133,6 +179,50 @@ ENDPROC(__v7m_setup)
 
 	.section ".proc.info.init", #alloc
 
+.macro __v7m_proc name, initfunc, cache_fns = nop_cache_fns, hwcaps = 0,  proc_fns = v7m_processor_functions
+	.long	0			/* proc_info_list.__cpu_mm_mmu_flags */
+	.long	0			/* proc_info_list.__cpu_io_mmu_flags */
+	initfn	\initfunc, \name
+	.long	cpu_arch_name
+	.long	cpu_elf_name
+	.long	HWCAP_HALF | HWCAP_THUMB | HWCAP_FAST_MULT | \hwcaps
+	.long	cpu_v7m_name
+	.long   \proc_fns
+	.long	0			/* proc_info_list.tlb */
+	.long	0			/* proc_info_list.user */
+	.long	\cache_fns
+.endm
+
+	/*
+	 * Match ARM Cortex-M7 processor.
+	 */
+	.type	__v7m_cm7_proc_info, #object
+__v7m_cm7_proc_info:
+	.long	0x410fc270		/* ARM Cortex-M7 0xC27 */
+	.long	0xff0ffff0		/* Mask off revision, patch release */
+	__v7m_proc __v7m_cm7_proc_info, __v7m_cm7_setup, hwcaps = HWCAP_EDSP, cache_fns = v7m_cache_fns, proc_fns = cm7_processor_functions
+	.size	__v7m_cm7_proc_info, . - __v7m_cm7_proc_info
+
+	/*
+	 * Match ARM Cortex-M4 processor.
+	 */
+	.type	__v7m_cm4_proc_info, #object
+__v7m_cm4_proc_info:
+	.long	0x410fc240		/* ARM Cortex-M4 0xC24 */
+	.long	0xff0ffff0		/* Mask off revision, patch release */
+	__v7m_proc __v7m_cm4_proc_info, __v7m_setup, hwcaps = HWCAP_EDSP
+	.size	__v7m_cm4_proc_info, . - __v7m_cm4_proc_info
+
+	/*
+	 * Match ARM Cortex-M3 processor.
+	 */
+	.type	__v7m_cm3_proc_info, #object
+__v7m_cm3_proc_info:
+	.long	0x410fc230		/* ARM Cortex-M3 0xC23 */
+	.long	0xff0ffff0		/* Mask off revision, patch release */
+	__v7m_proc __v7m_cm3_proc_info, __v7m_setup
+	.size	__v7m_cm3_proc_info, . - __v7m_cm3_proc_info
+
 	/*
 	 * Match any ARMv7-M processor core.
 	 */
@@ -140,16 +230,6 @@ ENDPROC(__v7m_setup)
 __v7m_proc_info:
 	.long	0x000f0000		@ Required ID value
 	.long	0x000f0000		@ Mask for ID
-	.long   0			@ proc_info_list.__cpu_mm_mmu_flags
-	.long   0			@ proc_info_list.__cpu_io_mmu_flags
-	initfn	__v7m_setup, __v7m_proc_info	@ proc_info_list.__cpu_flush
-	.long	cpu_arch_name
-	.long	cpu_elf_name
-	.long	HWCAP_HALF|HWCAP_THUMB|HWCAP_FAST_MULT
-	.long	cpu_v7m_name
-	.long	v7m_processor_functions	@ proc_info_list.proc
-	.long	0			@ proc_info_list.tlb
-	.long	0			@ proc_info_list.user
-	.long	nop_cache_fns		@ proc_info_list.cache
+	__v7m_proc __v7m_proc_info, __v7m_setup
 	.size	__v7m_proc_info, . - __v7m_proc_info