From 8f65873e47784a390949f0d61e5692dbf2a8253e Mon Sep 17 00:00:00 2001
From: Graf Yang <graf.yang@analog.com>
Date: Tue, 18 Nov 2008 17:48:22 +0800
Subject: Blackfin arch: SMP supporting patchset: Blackfin kernel and memory
 management code

Blackfin dual core BF561 processor can support SMP like features.
https://docs.blackfin.uclinux.org/doku.php?id=linux-kernel:smp-like

In this patch, we provide SMP extend to Blackfin kernel and memory management code

Singed-off-by: Graf Yang <graf.yang@analog.com>
Signed-off-by: Mike Frysinger <vapier.adi@gmail.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
---
 arch/blackfin/kernel/asm-offsets.c |  29 ++++
 arch/blackfin/kernel/bfin_ksyms.c  |  34 ++++
 arch/blackfin/kernel/entry.S       |   1 +
 arch/blackfin/kernel/irqchip.c     |  24 +--
 arch/blackfin/kernel/kgdb.c        |   4 +-
 arch/blackfin/kernel/module.c      |  13 +-
 arch/blackfin/kernel/process.c     |  23 ++-
 arch/blackfin/kernel/ptrace.c      |   8 +-
 arch/blackfin/kernel/reboot.c      |  24 ++-
 arch/blackfin/kernel/setup.c       | 163 ++++++++++++------
 arch/blackfin/kernel/time.c        | 114 +++++++++----
 arch/blackfin/kernel/traps.c       |  56 +++----
 arch/blackfin/mm/init.c            |  60 +++++--
 arch/blackfin/mm/sram-alloc.c      | 336 +++++++++++++++++++++----------------
 14 files changed, 580 insertions(+), 309 deletions(-)

diff --git a/arch/blackfin/kernel/asm-offsets.c b/arch/blackfin/kernel/asm-offsets.c
index 9bb85dd5ccb3..b5df9459d6d5 100644
--- a/arch/blackfin/kernel/asm-offsets.c
+++ b/arch/blackfin/kernel/asm-offsets.c
@@ -56,6 +56,9 @@ int main(void)
 	/* offsets into the thread struct */
 	DEFINE(THREAD_KSP, offsetof(struct thread_struct, ksp));
 	DEFINE(THREAD_USP, offsetof(struct thread_struct, usp));
+	DEFINE(THREAD_SR, offsetof(struct thread_struct, seqstat));
+	DEFINE(PT_SR, offsetof(struct thread_struct, seqstat));
+	DEFINE(THREAD_ESP0, offsetof(struct thread_struct, esp0));
 	DEFINE(THREAD_PC, offsetof(struct thread_struct, pc));
 	DEFINE(KERNEL_STACK_SIZE, THREAD_SIZE);
 
@@ -128,5 +131,31 @@ int main(void)
 	DEFINE(SIGSEGV, SIGSEGV);
 	DEFINE(SIGTRAP, SIGTRAP);
 
+	/* PDA management (in L1 scratchpad) */
+	DEFINE(PDA_SYSCFG, offsetof(struct blackfin_pda, syscfg));
+#ifdef CONFIG_SMP
+	DEFINE(PDA_IRQFLAGS, offsetof(struct blackfin_pda, imask));
+#endif
+	DEFINE(PDA_IPDT, offsetof(struct blackfin_pda, ipdt));
+	DEFINE(PDA_IPDT_SWAPCOUNT, offsetof(struct blackfin_pda, ipdt_swapcount));
+	DEFINE(PDA_DPDT, offsetof(struct blackfin_pda, dpdt));
+	DEFINE(PDA_DPDT_SWAPCOUNT, offsetof(struct blackfin_pda, dpdt_swapcount));
+	DEFINE(PDA_EXIPTR, offsetof(struct blackfin_pda, ex_iptr));
+	DEFINE(PDA_EXOPTR, offsetof(struct blackfin_pda, ex_optr));
+	DEFINE(PDA_EXBUF, offsetof(struct blackfin_pda, ex_buf));
+	DEFINE(PDA_EXIMASK, offsetof(struct blackfin_pda, ex_imask));
+	DEFINE(PDA_EXSTACK, offsetof(struct blackfin_pda, ex_stack));
+#ifdef ANOMALY_05000261
+	DEFINE(PDA_LFRETX, offsetof(struct blackfin_pda, last_cplb_fault_retx));
+#endif
+	DEFINE(PDA_DCPLB, offsetof(struct blackfin_pda, dcplb_fault_addr));
+	DEFINE(PDA_ICPLB, offsetof(struct blackfin_pda, icplb_fault_addr));
+	DEFINE(PDA_RETX, offsetof(struct blackfin_pda, retx));
+	DEFINE(PDA_SEQSTAT, offsetof(struct blackfin_pda, seqstat));
+#ifdef CONFIG_SMP
+	/* Inter-core lock (in L2 SRAM) */
+	DEFINE(SIZEOF_CORELOCK, sizeof(struct corelock_slot));
+#endif
+
 	return 0;
 }
diff --git a/arch/blackfin/kernel/bfin_ksyms.c b/arch/blackfin/kernel/bfin_ksyms.c
index b66f1d4c8344..763c31531e9e 100644
--- a/arch/blackfin/kernel/bfin_ksyms.c
+++ b/arch/blackfin/kernel/bfin_ksyms.c
@@ -68,3 +68,37 @@ EXPORT_SYMBOL(insw_8);
 EXPORT_SYMBOL(outsl);
 EXPORT_SYMBOL(insl);
 EXPORT_SYMBOL(insl_16);
+
+#ifdef CONFIG_SMP
+EXPORT_SYMBOL(__raw_atomic_update_asm);
+EXPORT_SYMBOL(__raw_atomic_clear_asm);
+EXPORT_SYMBOL(__raw_atomic_set_asm);
+EXPORT_SYMBOL(__raw_atomic_xor_asm);
+EXPORT_SYMBOL(__raw_atomic_test_asm);
+EXPORT_SYMBOL(__raw_xchg_1_asm);
+EXPORT_SYMBOL(__raw_xchg_2_asm);
+EXPORT_SYMBOL(__raw_xchg_4_asm);
+EXPORT_SYMBOL(__raw_cmpxchg_1_asm);
+EXPORT_SYMBOL(__raw_cmpxchg_2_asm);
+EXPORT_SYMBOL(__raw_cmpxchg_4_asm);
+EXPORT_SYMBOL(__raw_spin_is_locked_asm);
+EXPORT_SYMBOL(__raw_spin_lock_asm);
+EXPORT_SYMBOL(__raw_spin_trylock_asm);
+EXPORT_SYMBOL(__raw_spin_unlock_asm);
+EXPORT_SYMBOL(__raw_read_lock_asm);
+EXPORT_SYMBOL(__raw_read_trylock_asm);
+EXPORT_SYMBOL(__raw_read_unlock_asm);
+EXPORT_SYMBOL(__raw_write_lock_asm);
+EXPORT_SYMBOL(__raw_write_trylock_asm);
+EXPORT_SYMBOL(__raw_write_unlock_asm);
+EXPORT_SYMBOL(__raw_bit_set_asm);
+EXPORT_SYMBOL(__raw_bit_clear_asm);
+EXPORT_SYMBOL(__raw_bit_toggle_asm);
+EXPORT_SYMBOL(__raw_bit_test_asm);
+EXPORT_SYMBOL(__raw_bit_test_set_asm);
+EXPORT_SYMBOL(__raw_bit_test_clear_asm);
+EXPORT_SYMBOL(__raw_bit_test_toggle_asm);
+EXPORT_SYMBOL(__raw_uncached_fetch_asm);
+EXPORT_SYMBOL(__raw_smp_mark_barrier_asm);
+EXPORT_SYMBOL(__raw_smp_check_barrier_asm);
+#endif
diff --git a/arch/blackfin/kernel/entry.S b/arch/blackfin/kernel/entry.S
index faea88ebb2ef..c0c3fe811228 100644
--- a/arch/blackfin/kernel/entry.S
+++ b/arch/blackfin/kernel/entry.S
@@ -30,6 +30,7 @@
 #include <linux/linkage.h>
 #include <asm/thread_info.h>
 #include <asm/errno.h>
+#include <asm/blackfin.h>
 #include <asm/asm-offsets.h>
 
 #include <asm/context.S>
diff --git a/arch/blackfin/kernel/irqchip.c b/arch/blackfin/kernel/irqchip.c
index 07402f57c9de..9eebb782fd30 100644
--- a/arch/blackfin/kernel/irqchip.c
+++ b/arch/blackfin/kernel/irqchip.c
@@ -36,7 +36,7 @@
 #include <linux/irq.h>
 #include <asm/trace.h>
 
-static unsigned long irq_err_count;
+static atomic_t irq_err_count;
 static spinlock_t irq_controller_lock;
 
 /*
@@ -48,7 +48,7 @@ void dummy_mask_unmask_irq(unsigned int irq)
 
 void ack_bad_irq(unsigned int irq)
 {
-	irq_err_count += 1;
+	atomic_inc(&irq_err_count);
 	printk(KERN_ERR "IRQ: spurious interrupt %d\n", irq);
 }
 EXPORT_SYMBOL(ack_bad_irq);
@@ -72,7 +72,7 @@ static struct irq_desc bad_irq_desc = {
 
 int show_interrupts(struct seq_file *p, void *v)
 {
-	int i = *(loff_t *) v;
+	int i = *(loff_t *) v, j;
 	struct irqaction *action;
 	unsigned long flags;
 
@@ -80,19 +80,20 @@ int show_interrupts(struct seq_file *p, void *v)
 		spin_lock_irqsave(&irq_desc[i].lock, flags);
 		action = irq_desc[i].action;
 		if (!action)
-			goto unlock;
-
-		seq_printf(p, "%3d: %10u ", i, kstat_irqs(i));
+			goto skip;
+		seq_printf(p, "%3d: ", i);
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+		seq_printf(p, " %8s", irq_desc[i].chip->name);
 		seq_printf(p, "  %s", action->name);
 		for (action = action->next; action; action = action->next)
-			seq_printf(p, ", %s", action->name);
+			seq_printf(p, "  %s", action->name);
 
 		seq_putc(p, '\n');
- unlock:
+ skip:
 		spin_unlock_irqrestore(&irq_desc[i].lock, flags);
-	} else if (i == NR_IRQS) {
-		seq_printf(p, "Err: %10lu\n", irq_err_count);
-	}
+	} else if (i == NR_IRQS)
+		seq_printf(p, "Err: %10u\n",  atomic_read(&irq_err_count));
 	return 0;
 }
 
@@ -101,7 +102,6 @@ int show_interrupts(struct seq_file *p, void *v)
  * come via this function.  Instead, they should provide their
  * own 'handler'
  */
-
 #ifdef CONFIG_DO_IRQ_L1
 __attribute__((l1_text))
 #endif
diff --git a/arch/blackfin/kernel/kgdb.c b/arch/blackfin/kernel/kgdb.c
index b795a207742c..ab4022131a2a 100644
--- a/arch/blackfin/kernel/kgdb.c
+++ b/arch/blackfin/kernel/kgdb.c
@@ -363,12 +363,12 @@ void kgdb_passive_cpu_callback(void *info)
 
 void kgdb_roundup_cpus(unsigned long flags)
 {
-	smp_call_function(kgdb_passive_cpu_callback, NULL, 0, 0);
+	smp_call_function(kgdb_passive_cpu_callback, NULL, 0);
 }
 
 void kgdb_roundup_cpu(int cpu, unsigned long flags)
 {
-	smp_call_function_single(cpu, kgdb_passive_cpu_callback, NULL, 0, 0);
+	smp_call_function_single(cpu, kgdb_passive_cpu_callback, NULL, 0);
 }
 #endif
 
diff --git a/arch/blackfin/kernel/module.c b/arch/blackfin/kernel/module.c
index e1bebc80a5bf..2e14cadd4302 100644
--- a/arch/blackfin/kernel/module.c
+++ b/arch/blackfin/kernel/module.c
@@ -343,7 +343,13 @@ apply_relocate_add(Elf_Shdr * sechdrs, const char *strtab,
 		pr_debug("location is %x, value is %x type is %d \n",
 			 (unsigned int) location32, value,
 			 ELF32_R_TYPE(rel[i].r_info));
-
+#ifdef CONFIG_SMP
+		if ((unsigned long)location16 >= COREB_L1_DATA_A_START) {
+			printk(KERN_ERR "module %s: cannot relocate in L1: %u (SMP kernel)",
+				       mod->name, ELF32_R_TYPE(rel[i].r_info));
+			return -ENOEXEC;
+		}
+#endif
 		switch (ELF32_R_TYPE(rel[i].r_info)) {
 
 		case R_pcrel24:
@@ -436,6 +442,7 @@ module_finalize(const Elf_Ehdr * hdr,
 {
 	unsigned int i, strindex = 0, symindex = 0;
 	char *secstrings;
+	long err = 0;
 
 	secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
 
@@ -460,8 +467,10 @@ module_finalize(const Elf_Ehdr * hdr,
 		    (strcmp(".rela.l1.text", secstrings + sechdrs[i].sh_name) == 0) ||
 		    ((strcmp(".rela.text", secstrings + sechdrs[i].sh_name) == 0) &&
 			(hdr->e_flags & (EF_BFIN_CODE_IN_L1|EF_BFIN_CODE_IN_L2))))) {
-			apply_relocate_add((Elf_Shdr *) sechdrs, strtab,
+			err = apply_relocate_add((Elf_Shdr *) sechdrs, strtab,
 					   symindex, i, mod);
+			if (err < 0)
+				return -ENOEXEC;
 		}
 	}
 	return 0;
diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c
index 326e3019cd23..4359ea253010 100644
--- a/arch/blackfin/kernel/process.c
+++ b/arch/blackfin/kernel/process.c
@@ -171,6 +171,13 @@ asmlinkage int bfin_clone(struct pt_regs *regs)
 	unsigned long clone_flags;
 	unsigned long newsp;
 
+#ifdef __ARCH_SYNC_CORE_DCACHE
+	if (current->rt.nr_cpus_allowed == num_possible_cpus()) {
+		current->cpus_allowed = cpumask_of_cpu(smp_processor_id());
+		current->rt.nr_cpus_allowed = 1;
+	}
+#endif
+
 	/* syscall2 puts clone_flags in r0 and usp in r1 */
 	clone_flags = regs->r0;
 	newsp = regs->r1;
@@ -338,22 +345,22 @@ int _access_ok(unsigned long addr, unsigned long size)
 	if (addr >= (unsigned long)__init_begin &&
 	    addr + size <= (unsigned long)__init_end)
 		return 1;
-	if (addr >= L1_SCRATCH_START
-	    && addr + size <= L1_SCRATCH_START + L1_SCRATCH_LENGTH)
+	if (addr >= get_l1_scratch_start()
+	    && addr + size <= get_l1_scratch_start() + L1_SCRATCH_LENGTH)
 		return 1;
 #if L1_CODE_LENGTH != 0
-	if (addr >= L1_CODE_START + (_etext_l1 - _stext_l1)
-	    && addr + size <= L1_CODE_START + L1_CODE_LENGTH)
+	if (addr >= get_l1_code_start() + (_etext_l1 - _stext_l1)
+	    && addr + size <= get_l1_code_start() + L1_CODE_LENGTH)
 		return 1;
 #endif
 #if L1_DATA_A_LENGTH != 0
-	if (addr >= L1_DATA_A_START + (_ebss_l1 - _sdata_l1)
-	    && addr + size <= L1_DATA_A_START + L1_DATA_A_LENGTH)
+	if (addr >= get_l1_data_a_start() + (_ebss_l1 - _sdata_l1)
+	    && addr + size <= get_l1_data_a_start() + L1_DATA_A_LENGTH)
 		return 1;
 #endif
 #if L1_DATA_B_LENGTH != 0
-	if (addr >= L1_DATA_B_START + (_ebss_b_l1 - _sdata_b_l1)
-	    && addr + size <= L1_DATA_B_START + L1_DATA_B_LENGTH)
+	if (addr >= get_l1_data_b_start() + (_ebss_b_l1 - _sdata_b_l1)
+	    && addr + size <= get_l1_data_b_start() + L1_DATA_B_LENGTH)
 		return 1;
 #endif
 #if L2_LENGTH != 0
diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c
index 140bf00e9974..4de44f387dd5 100644
--- a/arch/blackfin/kernel/ptrace.c
+++ b/arch/blackfin/kernel/ptrace.c
@@ -220,8 +220,8 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 				break;
 			pr_debug("ptrace: user address is valid\n");
 
-			if (L1_CODE_LENGTH != 0 && addr >= L1_CODE_START
-			    && addr + sizeof(tmp) <= L1_CODE_START + L1_CODE_LENGTH) {
+			if (L1_CODE_LENGTH != 0 && addr >= get_l1_code_start()
+			    && addr + sizeof(tmp) <= get_l1_code_start() + L1_CODE_LENGTH) {
 				safe_dma_memcpy (&tmp, (const void *)(addr), sizeof(tmp));
 				copied = sizeof(tmp);
 
@@ -300,8 +300,8 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 				break;
 			pr_debug("ptrace: user address is valid\n");
 
-			if (L1_CODE_LENGTH != 0 && addr >= L1_CODE_START
-			    && addr + sizeof(data) <= L1_CODE_START + L1_CODE_LENGTH) {
+			if (L1_CODE_LENGTH != 0 && addr >= get_l1_code_start()
+			    && addr + sizeof(data) <= get_l1_code_start() + L1_CODE_LENGTH) {
 				safe_dma_memcpy ((void *)(addr), &data, sizeof(data));
 				copied = sizeof(data);
 
diff --git a/arch/blackfin/kernel/reboot.c b/arch/blackfin/kernel/reboot.c
index ae97ca407b0d..eeee8cb43360 100644
--- a/arch/blackfin/kernel/reboot.c
+++ b/arch/blackfin/kernel/reboot.c
@@ -21,7 +21,7 @@
  * the core reset.
  */
 __attribute__((l1_text))
-static void bfin_reset(void)
+static void _bfin_reset(void)
 {
 	/* Wait for completion of "system" events such as cache line
 	 * line fills so that we avoid infinite stalls later on as
@@ -66,6 +66,18 @@ static void bfin_reset(void)
 	}
 }
 
+static void bfin_reset(void)
+{
+	if (ANOMALY_05000353 || ANOMALY_05000386)
+		_bfin_reset();
+	else
+		/* the bootrom checks to see how it was reset and will
+		 * automatically perform a software reset for us when
+		 * it starts executing boot
+		 */
+		asm("raise 1;");
+}
+
 __attribute__((weak))
 void native_machine_restart(char *cmd)
 {
@@ -75,14 +87,10 @@ void machine_restart(char *cmd)
 {
 	native_machine_restart(cmd);
 	local_irq_disable();
-	if (ANOMALY_05000353 || ANOMALY_05000386)
-		bfin_reset();
+	if (smp_processor_id())
+		smp_call_function((void *)bfin_reset, 0, 1);
 	else
-		/* the bootrom checks to see how it was reset and will
-		 * automatically perform a software reset for us when
-		 * it starts executing boot
-		 */
-		asm("raise 1;");
+		bfin_reset();
 }
 
 __attribute__((weak))
diff --git a/arch/blackfin/kernel/setup.c b/arch/blackfin/kernel/setup.c
index 71a9a8c53cea..c644d234a02e 100644
--- a/arch/blackfin/kernel/setup.c
+++ b/arch/blackfin/kernel/setup.c
@@ -26,11 +26,10 @@
 #include <asm/blackfin.h>
 #include <asm/cplbinit.h>
 #include <asm/div64.h>
+#include <asm/cpu.h>
 #include <asm/fixed_code.h>
 #include <asm/early_printk.h>
 
-static DEFINE_PER_CPU(struct cpu, cpu_devices);
-
 u16 _bfin_swrst;
 EXPORT_SYMBOL(_bfin_swrst);
 
@@ -79,29 +78,76 @@ static struct change_member *change_point[2*BFIN_MEMMAP_MAX] __initdata;
 static struct bfin_memmap_entry *overlap_list[BFIN_MEMMAP_MAX] __initdata;
 static struct bfin_memmap_entry new_map[BFIN_MEMMAP_MAX] __initdata;
 
-void __init bfin_cache_init(void)
-{
+DEFINE_PER_CPU(struct blackfin_cpudata, cpu_data);
+
 #if defined(CONFIG_BFIN_DCACHE) || defined(CONFIG_BFIN_ICACHE)
-	generate_cplb_tables();
+void __init generate_cplb_tables(void)
+{
+	unsigned int cpu;
+
+	/* Generate per-CPU I&D CPLB tables */
+	for (cpu = 0; cpu < num_possible_cpus(); ++cpu)
+		generate_cplb_tables_cpu(cpu);
+}
 #endif
 
+void __cpuinit bfin_setup_caches(unsigned int cpu)
+{
 #ifdef CONFIG_BFIN_ICACHE
-	bfin_icache_init();
-	printk(KERN_INFO "Instruction Cache Enabled\n");
+#ifdef CONFIG_MPU
+	bfin_icache_init(icplb_tbl[cpu]);
+#else
+	bfin_icache_init(icplb_tables[cpu]);
+#endif
 #endif
 
 #ifdef CONFIG_BFIN_DCACHE
-	bfin_dcache_init();
-	printk(KERN_INFO "Data Cache Enabled"
+#ifdef CONFIG_MPU
+	bfin_dcache_init(dcplb_tbl[cpu]);
+#else
+	bfin_dcache_init(dcplb_tables[cpu]);
+#endif
+#endif
+
+	/*
+	 * In cache coherence emulation mode, we need to have the
+	 * D-cache enabled before running any atomic operation which
+	 * might invove cache invalidation (i.e. spinlock, rwlock).
+	 * So printk's are deferred until then.
+	 */
+#ifdef CONFIG_BFIN_ICACHE
+	printk(KERN_INFO "Instruction Cache Enabled for CPU%u\n", cpu);
+#endif
+#ifdef CONFIG_BFIN_DCACHE
+	printk(KERN_INFO "Data Cache Enabled for CPU%u"
 # if defined CONFIG_BFIN_WB
 		" (write-back)"
 # elif defined CONFIG_BFIN_WT
 		" (write-through)"
 # endif
-		"\n");
+		"\n", cpu);
 #endif
 }
 
+void __cpuinit bfin_setup_cpudata(unsigned int cpu)
+{
+	struct blackfin_cpudata *cpudata = &per_cpu(cpu_data, cpu);
+
+	cpudata->idle = current;
+	cpudata->loops_per_jiffy = loops_per_jiffy;
+	cpudata->cclk = get_cclk();
+	cpudata->imemctl = bfin_read_IMEM_CONTROL();
+	cpudata->dmemctl = bfin_read_DMEM_CONTROL();
+}
+
+void __init bfin_cache_init(void)
+{
+#if defined(CONFIG_BFIN_DCACHE) || defined(CONFIG_BFIN_ICACHE)
+	generate_cplb_tables();
+#endif
+	bfin_setup_caches(0);
+}
+
 void __init bfin_relocate_l1_mem(void)
 {
 	unsigned long l1_code_length;
@@ -230,7 +276,7 @@ static int __init sanitize_memmap(struct bfin_memmap_entry *map, int *pnr_map)
 	/* record all known change-points (starting and ending addresses),
 	   omitting those that are for empty memory regions */
 	chgidx = 0;
-	for (i = 0; i < old_nr; i++)	{
+	for (i = 0; i < old_nr; i++) {
 		if (map[i].size != 0) {
 			change_point[chgidx]->addr = map[i].addr;
 			change_point[chgidx++]->pentry = &map[i];
@@ -238,13 +284,13 @@ static int __init sanitize_memmap(struct bfin_memmap_entry *map, int *pnr_map)
 			change_point[chgidx++]->pentry = &map[i];
 		}
 	}
-	chg_nr = chgidx;    	/* true number of change-points */
+	chg_nr = chgidx;	/* true number of change-points */
 
 	/* sort change-point list by memory addresses (low -> high) */
 	still_changing = 1;
-	while (still_changing)	{
+	while (still_changing) {
 		still_changing = 0;
-		for (i = 1; i < chg_nr; i++)  {
+		for (i = 1; i < chg_nr; i++) {
 			/* if <current_addr> > <last_addr>, swap */
 			/* or, if current=<start_addr> & last=<end_addr>, swap */
 			if ((change_point[i]->addr < change_point[i-1]->addr) ||
@@ -261,10 +307,10 @@ static int __init sanitize_memmap(struct bfin_memmap_entry *map, int *pnr_map)
 	}
 
 	/* create a new memmap, removing overlaps */
-	overlap_entries = 0;	 /* number of entries in the overlap table */
-	new_entry = 0;	 /* index for creating new memmap entries */
-	last_type = 0;		 /* start with undefined memory type */
-	last_addr = 0;		 /* start with 0 as last starting address */
+	overlap_entries = 0;	/* number of entries in the overlap table */
+	new_entry = 0;		/* index for creating new memmap entries */
+	last_type = 0;		/* start with undefined memory type */
+	last_addr = 0;		/* start with 0 as last starting address */
 	/* loop through change-points, determining affect on the new memmap */
 	for (chgidx = 0; chgidx < chg_nr; chgidx++) {
 		/* keep track of all overlapping memmap entries */
@@ -286,14 +332,14 @@ static int __init sanitize_memmap(struct bfin_memmap_entry *map, int *pnr_map)
 			if (overlap_list[i]->type > current_type)
 				current_type = overlap_list[i]->type;
 		/* continue building up new memmap based on this information */
-		if (current_type != last_type)	{
+		if (current_type != last_type) {
 			if (last_type != 0) {
 				new_map[new_entry].size =
 					change_point[chgidx]->addr - last_addr;
 				/* move forward only if the new size was non-zero */
 				if (new_map[new_entry].size != 0)
 					if (++new_entry >= BFIN_MEMMAP_MAX)
-						break; 	/* no more space left for new entries */
+						break;	/* no more space left for new entries */
 			}
 			if (current_type != 0) {
 				new_map[new_entry].addr = change_point[chgidx]->addr;
@@ -303,9 +349,9 @@ static int __init sanitize_memmap(struct bfin_memmap_entry *map, int *pnr_map)
 			last_type = current_type;
 		}
 	}
-	new_nr = new_entry;   /* retain count for new entries */
+	new_nr = new_entry;	/* retain count for new entries */
 
-	/* copy new  mapping into original location */
+	/* copy new mapping into original location */
 	memcpy(map, new_map, new_nr*sizeof(struct bfin_memmap_entry));
 	*pnr_map = new_nr;
 
@@ -361,7 +407,6 @@ static __init int parse_memmap(char *arg)
  *  - "memmap=XXX[KkmM][@][$]XXX[KkmM]" defines a memory region
  *       @ from <start> to <start>+<mem>, type RAM
  *       $ from <start> to <start>+<mem>, type RESERVED
- *
  */
 static __init void parse_cmdline_early(char *cmdline_p)
 {
@@ -383,12 +428,10 @@ static __init void parse_cmdline_early(char *cmdline_p)
 					if (*to != ' ') {
 						if (*to == '$'
 						    || *(to + 1) == '$')
-							reserved_mem_dcache_on =
-							    1;
+							reserved_mem_dcache_on = 1;
 						if (*to == '#'
 						    || *(to + 1) == '#')
-							reserved_mem_icache_on =
-							    1;
+							reserved_mem_icache_on = 1;
 					}
 				}
 			} else if (!memcmp(to, "earlyprintk=", 12)) {
@@ -417,9 +460,8 @@ static __init void parse_cmdline_early(char *cmdline_p)
  *	[_ramend - DMA_UNCACHED_REGION,
  *		_ramend]:			uncached DMA region
  *  [_ramend, physical_mem_end]:	memory not managed by kernel
- *
  */
-static __init void  memory_setup(void)
+static __init void memory_setup(void)
 {
 #ifdef CONFIG_MTD_UCLINUX
 	unsigned long mtd_phys = 0;
@@ -436,7 +478,7 @@ static __init void  memory_setup(void)
 	memory_end = _ramend - DMA_UNCACHED_REGION;
 
 #ifdef CONFIG_MPU
-	/* Round up to multiple of 4MB.  */
+	/* Round up to multiple of 4MB */
 	memory_start = (_ramstart + 0x3fffff) & ~0x3fffff;
 #else
 	memory_start = PAGE_ALIGN(_ramstart);
@@ -616,7 +658,7 @@ static __init void setup_bootmem_allocator(void)
 	end_pfn = memory_end >> PAGE_SHIFT;
 
 	/*
-	 * give all the memory to the bootmap allocator,  tell it to put the
+	 * give all the memory to the bootmap allocator, tell it to put the
 	 * boot mem_map at the start of memory.
 	 */
 	bootmap_size = init_bootmem_node(NODE_DATA(0),
@@ -791,7 +833,11 @@ void __init setup_arch(char **cmdline_p)
 	bfin_write_SWRST(_bfin_swrst | DOUBLE_FAULT);
 #endif
 
+#ifdef CONFIG_SMP
+	if (_bfin_swrst & SWRST_DBL_FAULT_A) {
+#else
 	if (_bfin_swrst & RESET_DOUBLE) {
+#endif
 		printk(KERN_EMERG "Recovering from DOUBLE FAULT event\n");
 #ifdef CONFIG_DEBUG_DOUBLEFAULT
 		/* We assume the crashing kernel, and the current symbol table match */
@@ -835,7 +881,7 @@ void __init setup_arch(char **cmdline_p)
 	printk(KERN_INFO "Blackfin Linux support by http://blackfin.uclinux.org/\n");
 
 	printk(KERN_INFO "Processor Speed: %lu MHz core clock and %lu MHz System Clock\n",
-	       cclk / 1000000,  sclk / 1000000);
+	       cclk / 1000000, sclk / 1000000);
 
 	if (ANOMALY_05000273 && (cclk >> 1) <= sclk)
 		printk("\n\n\nANOMALY_05000273: CCLK must be >= 2*SCLK !!!\n\n\n");
@@ -867,18 +913,21 @@ void __init setup_arch(char **cmdline_p)
 	BUG_ON((char *)&safe_user_instruction - (char *)&fixed_code_start
 		!= SAFE_USER_INSTRUCTION - FIXED_CODE_START);
 
+#ifdef CONFIG_SMP
+	platform_init_cpus();
+#endif
 	init_exception_vectors();
-	bfin_cache_init();
+	bfin_cache_init();	/* Initialize caches for the boot CPU */
 }
 
 static int __init topology_init(void)
 {
-	int cpu;
+	unsigned int cpu;
+	/* Record CPU-private information for the boot processor. */
+	bfin_setup_cpudata(0);
 
 	for_each_possible_cpu(cpu) {
-		struct cpu *c = &per_cpu(cpu_devices, cpu);
-
-		register_cpu(c, cpu);
+		register_cpu(&per_cpu(cpu_data, cpu).cpu, cpu);
 	}
 
 	return 0;
@@ -983,15 +1032,15 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 	char *cpu, *mmu, *fpu, *vendor, *cache;
 	uint32_t revid;
 
-	u_long cclk = 0, sclk = 0;
+	u_long sclk = 0;
 	u_int icache_size = BFIN_ICACHESIZE / 1024, dcache_size = 0, dsup_banks = 0;
+	struct blackfin_cpudata *cpudata = &per_cpu(cpu_data, *(unsigned int *)v);
 
 	cpu = CPU;
 	mmu = "none";
 	fpu = "none";
 	revid = bfin_revid();
 
-	cclk = get_cclk();
 	sclk = get_sclk();
 
 	switch (bfin_read_CHIPID() & CHIPID_MANUFACTURE) {
@@ -1003,10 +1052,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		break;
 	}
 
-	seq_printf(m, "processor\t: %d\n"
-		"vendor_id\t: %s\n",
-		*(unsigned int *)v,
-		vendor);
+	seq_printf(m, "processor\t: %d\n" "vendor_id\t: %s\n",
+		*(unsigned int *)v, vendor);
 
 	if (CPUID == bfin_cpuid())
 		seq_printf(m, "cpu family\t: 0x%04x\n", CPUID);
@@ -1016,7 +1063,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 
 	seq_printf(m, "model name\t: ADSP-%s %lu(MHz CCLK) %lu(MHz SCLK) (%s)\n"
 		"stepping\t: %d\n",
-		cpu, cclk/1000000, sclk/1000000,
+		cpu, cpudata->cclk/1000000, sclk/1000000,
 #ifdef CONFIG_MPU
 		"mpu on",
 #else
@@ -1025,16 +1072,16 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		revid);
 
 	seq_printf(m, "cpu MHz\t\t: %lu.%03lu/%lu.%03lu\n",
-		cclk/1000000, cclk%1000000,
+		cpudata->cclk/1000000, cpudata->cclk%1000000,
 		sclk/1000000, sclk%1000000);
 	seq_printf(m, "bogomips\t: %lu.%02lu\n"
 		"Calibration\t: %lu loops\n",
-		(loops_per_jiffy * HZ) / 500000,
-		((loops_per_jiffy * HZ) / 5000) % 100,
-		(loops_per_jiffy * HZ));
+		(cpudata->loops_per_jiffy * HZ) / 500000,
+		((cpudata->loops_per_jiffy * HZ) / 5000) % 100,
+		(cpudata->loops_per_jiffy * HZ));
 
 	/* Check Cache configutation */
-	switch (bfin_read_DMEM_CONTROL() & (1 << DMC0_P | 1 << DMC1_P)) {
+	switch (cpudata->dmemctl & (1 << DMC0_P | 1 << DMC1_P)) {
 	case ACACHE_BSRAM:
 		cache = "dbank-A/B\t: cache/sram";
 		dcache_size = 16;
@@ -1058,10 +1105,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 	}
 
 	/* Is it turned on? */
-	if ((bfin_read_DMEM_CONTROL() & (ENDCPLB | DMC_ENABLE)) != (ENDCPLB | DMC_ENABLE))
+	if ((cpudata->dmemctl & (ENDCPLB | DMC_ENABLE)) != (ENDCPLB | DMC_ENABLE))
 		dcache_size = 0;
 
-	if ((bfin_read_IMEM_CONTROL() & (IMC | ENICPLB)) != (IMC | ENICPLB))
+	if ((cpudata->imemctl & (IMC | ENICPLB)) != (IMC | ENICPLB))
 		icache_size = 0;
 
 	seq_printf(m, "cache size\t: %d KB(L1 icache) "
@@ -1086,8 +1133,13 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		   "dcache setup\t: %d Super-banks/%d Sub-banks/%d Ways, %d Lines/Way\n",
 		   dsup_banks, BFIN_DSUBBANKS, BFIN_DWAYS,
 		   BFIN_DLINES);
+#ifdef __ARCH_SYNC_CORE_DCACHE
+	seq_printf(m,
+		"SMP Dcache Flushes\t: %lu\n\n",
+		per_cpu(cpu_data, *(unsigned int *)v).dcache_invld_count);
+#endif
 #ifdef CONFIG_BFIN_ICACHE_LOCK
-	switch ((bfin_read_IMEM_CONTROL() >> 3) & WAYALL_L) {
+	switch ((cpudata->imemctl >> 3) & WAYALL_L) {
 	case WAY0_L:
 		seq_printf(m, "Way0 Locked-Down\n");
 		break;
@@ -1136,6 +1188,12 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 	default:
 		seq_printf(m, "No Ways are locked\n");
 	}
+#endif
+	if (*(unsigned int *)v != NR_CPUS-1)
+		return 0;
+
+#if L2_LENGTH
+	seq_printf(m, "L2 SRAM\t\t: %dKB\n", L2_LENGTH/0x400);
 #endif
 	seq_printf(m, "board name\t: %s\n", bfin_board_name);
 	seq_printf(m, "board memory\t: %ld kB (0x%p -> 0x%p)\n",
@@ -1144,6 +1202,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		((int)memory_end - (int)_stext) >> 10,
 		_stext,
 		(void *)memory_end);
+	seq_printf(m, "\n");
 
 	return 0;
 }
diff --git a/arch/blackfin/kernel/time.c b/arch/blackfin/kernel/time.c
index eb2352320454..06de2ce67a9e 100644
--- a/arch/blackfin/kernel/time.c
+++ b/arch/blackfin/kernel/time.c
@@ -34,9 +34,11 @@
 #include <linux/interrupt.h>
 #include <linux/time.h>
 #include <linux/irq.h>
+#include <linux/delay.h>
 
 #include <asm/blackfin.h>
 #include <asm/time.h>
+#include <asm/gptimers.h>
 
 /* This is an NTP setting */
 #define	TICK_SIZE (tick_nsec / 1000)
@@ -46,11 +48,14 @@ static unsigned long gettimeoffset(void);
 
 static struct irqaction bfin_timer_irq = {
 	.name = "BFIN Timer Tick",
+#ifdef CONFIG_IRQ_PER_CPU
+	.flags = IRQF_DISABLED  | IRQF_PERCPU,
+#else
 	.flags = IRQF_DISABLED
+#endif
 };
 
-static void
-time_sched_init(irq_handler_t timer_routine)
+void setup_core_timer(void)
 {
 	u32 tcount;
 
@@ -71,12 +76,41 @@ time_sched_init(irq_handler_t timer_routine)
 	CSYNC();
 
 	bfin_write_TCNTL(7);
+}
+
+#ifdef CONFIG_TICK_SOURCE_SYSTMR0
+void setup_system_timer0(void)
+{
+	/* Power down the core timer, just to play safe. */
+	bfin_write_TCNTL(0);
+
+	disable_gptimers(TIMER0bit);
+	set_gptimer_status(0, TIMER_STATUS_TRUN0);
+	while (get_gptimer_status(0) & TIMER_STATUS_TRUN0)
+		udelay(10);
+
+	set_gptimer_config(0, 0x59); /* IRQ enable, periodic, PWM_OUT, SCLKed, OUT PAD disabled */
+	set_gptimer_period(TIMER0_id, get_sclk() / HZ);
+	set_gptimer_pwidth(TIMER0_id, 1);
+	SSYNC();
+	enable_gptimers(TIMER0bit);
+}
+#endif
 
+static void
+time_sched_init(irqreturn_t(*timer_routine) (int, void *))
+{
+#ifdef CONFIG_TICK_SOURCE_SYSTMR0
+	setup_system_timer0();
+#else
+	setup_core_timer();
+#endif
 	bfin_timer_irq.handler = (irq_handler_t)timer_routine;
-	/* call setup_irq instead of request_irq because request_irq calls
-	 * kmalloc which has not been initialized yet
-	 */
+#ifdef CONFIG_TICK_SOURCE_SYSTMR0
+	setup_irq(IRQ_TIMER0, &bfin_timer_irq);
+#else
 	setup_irq(IRQ_CORETMR, &bfin_timer_irq);
+#endif
 }
 
 /*
@@ -87,17 +121,23 @@ static unsigned long gettimeoffset(void)
 	unsigned long offset;
 	unsigned long clocks_per_jiffy;
 
+#ifdef CONFIG_TICK_SOURCE_SYSTMR0
+	clocks_per_jiffy =  bfin_read_TIMER0_PERIOD();
+	offset =  bfin_read_TIMER0_COUNTER() / \
+		(((clocks_per_jiffy + 1) * HZ) / USEC_PER_SEC);
+
+	if ((get_gptimer_status(0) & TIMER_STATUS_TIMIL0) && offset < (100000 / HZ / 2))
+		offset += (USEC_PER_SEC / HZ);
+#else
 	clocks_per_jiffy = bfin_read_TPERIOD();
-	offset =
-	    (clocks_per_jiffy -
-	     bfin_read_TCOUNT()) / (((clocks_per_jiffy + 1) * HZ) /
-				    USEC_PER_SEC);
+	offset = (clocks_per_jiffy - bfin_read_TCOUNT()) / \
+		(((clocks_per_jiffy + 1) * HZ)  / USEC_PER_SEC);
 
 	/* Check if we just wrapped the counters and maybe missed a tick */
 	if ((bfin_read_ILAT() & (1 << IRQ_CORETMR))
-	    && (offset < (100000 / HZ / 2)))
+		&& (offset < (100000 / HZ / 2)))
 		offset += (USEC_PER_SEC / HZ);
-
+#endif
 	return offset;
 }
 
@@ -120,34 +160,38 @@ irqreturn_t timer_interrupt(int irq, void *dummy)
 	static long last_rtc_update;
 
 	write_seqlock(&xtime_lock);
-
-	do_timer(1);
-
-	profile_tick(CPU_PROFILING);
-
-	/*
-	 * If we have an externally synchronized Linux clock, then update
-	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
-	 * called as close as possible to 500 ms before the new second starts.
-	 */
-
-	if (ntp_synced() &&
-	    xtime.tv_sec > last_rtc_update + 660 &&
-	    (xtime.tv_nsec / NSEC_PER_USEC) >=
-	    500000 - ((unsigned)TICK_SIZE) / 2
-	    && (xtime.tv_nsec / NSEC_PER_USEC) <=
-	    500000 + ((unsigned)TICK_SIZE) / 2) {
-		if (set_rtc_mmss(xtime.tv_sec) == 0)
-			last_rtc_update = xtime.tv_sec;
-		else
-			/* Do it again in 60s. */
-			last_rtc_update = xtime.tv_sec - 600;
+#ifdef CONFIG_TICK_SOURCE_SYSTMR0
+	if (get_gptimer_status(0) & TIMER_STATUS_TIMIL0) {
+#endif
+		do_timer(1);
+
+
+		/*
+		 * If we have an externally synchronized Linux clock, then update
+		 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
+		 * called as close as possible to 500 ms before the new second starts.
+		 */
+
+		if (ntp_synced() &&
+		    xtime.tv_sec > last_rtc_update + 660 &&
+		    (xtime.tv_nsec / NSEC_PER_USEC) >=
+		    500000 - ((unsigned)TICK_SIZE) / 2
+		    && (xtime.tv_nsec / NSEC_PER_USEC) <=
+		    500000 + ((unsigned)TICK_SIZE) / 2) {
+			if (set_rtc_mmss(xtime.tv_sec) == 0)
+				last_rtc_update = xtime.tv_sec;
+			else
+				/* Do it again in 60s. */
+				last_rtc_update = xtime.tv_sec - 600;
+		}
+#ifdef CONFIG_TICK_SOURCE_SYSTMR0
+		set_gptimer_status(0, TIMER_STATUS_TIMIL0);
 	}
+#endif
 	write_sequnlock(&xtime_lock);
 
-#ifndef CONFIG_SMP
 	update_process_times(user_mode(get_irq_regs()));
-#endif
+	profile_tick(CPU_PROFILING);
 
 	return IRQ_HANDLED;
 }
diff --git a/arch/blackfin/kernel/traps.c b/arch/blackfin/kernel/traps.c
index bef025b07443..af7cc43630de 100644
--- a/arch/blackfin/kernel/traps.c
+++ b/arch/blackfin/kernel/traps.c
@@ -75,16 +75,6 @@ void __init trap_init(void)
 	CSYNC();
 }
 
-/*
- * Used to save the RETX, SEQSTAT, I/D CPLB FAULT ADDR
- * values across the transition from exception to IRQ5.
- * We put these in L1, so they are going to be in a valid
- * location during exception context
- */
-__attribute__((l1_data))
-unsigned long saved_retx, saved_seqstat,
-	saved_icplb_fault_addr, saved_dcplb_fault_addr;
-
 static void decode_address(char *buf, unsigned long address)
 {
 #ifdef CONFIG_DEBUG_VERBOSE
@@ -211,18 +201,18 @@ asmlinkage void double_fault_c(struct pt_regs *fp)
 	printk(KERN_EMERG "\n" KERN_EMERG "Double Fault\n");
 #ifdef CONFIG_DEBUG_DOUBLEFAULT_PRINT
 	if (((long)fp->seqstat &  SEQSTAT_EXCAUSE) == VEC_UNCOV) {
+		unsigned int cpu = smp_processor_id();
 		char buf[150];
-		decode_address(buf, saved_retx);
+		decode_address(buf, cpu_pda[cpu].retx);
 		printk(KERN_EMERG "While handling exception (EXCAUSE = 0x%x) at %s:\n",
-			(int)saved_seqstat & SEQSTAT_EXCAUSE, buf);
-		decode_address(buf, saved_dcplb_fault_addr);
+			(unsigned int)cpu_pda[cpu].seqstat & SEQSTAT_EXCAUSE, buf);
+		decode_address(buf, cpu_pda[cpu].dcplb_fault_addr);
 		printk(KERN_NOTICE "   DCPLB_FAULT_ADDR: %s\n", buf);
-		decode_address(buf, saved_icplb_fault_addr);
+		decode_address(buf, cpu_pda[cpu].icplb_fault_addr);
 		printk(KERN_NOTICE "   ICPLB_FAULT_ADDR: %s\n", buf);
 
 		decode_address(buf, fp->retx);
-		printk(KERN_NOTICE "The instruction at %s caused a double exception\n",
-			buf);
+		printk(KERN_NOTICE "The instruction at %s caused a double exception\n", buf);
 	} else
 #endif
 	{
@@ -239,6 +229,9 @@ asmlinkage void trap_c(struct pt_regs *fp)
 {
 #ifdef CONFIG_DEBUG_BFIN_HWTRACE_ON
 	int j;
+#endif
+#ifdef CONFIG_DEBUG_HUNT_FOR_ZERO
+	unsigned int cpu = smp_processor_id();
 #endif
 	int sig = 0;
 	siginfo_t info;
@@ -417,7 +410,7 @@ asmlinkage void trap_c(struct pt_regs *fp)
 		info.si_code = ILL_CPLB_MULHIT;
 		sig = SIGSEGV;
 #ifdef CONFIG_DEBUG_HUNT_FOR_ZERO
-		if (saved_dcplb_fault_addr < FIXED_CODE_START)
+		if (cpu_pda[cpu].dcplb_fault_addr < FIXED_CODE_START)
 			verbose_printk(KERN_NOTICE "NULL pointer access\n");
 		else
 #endif
@@ -471,7 +464,7 @@ asmlinkage void trap_c(struct pt_regs *fp)
 		info.si_code = ILL_CPLB_MULHIT;
 		sig = SIGSEGV;
 #ifdef CONFIG_DEBUG_HUNT_FOR_ZERO
-		if (saved_icplb_fault_addr < FIXED_CODE_START)
+		if (cpu_pda[cpu].icplb_fault_addr < FIXED_CODE_START)
 			verbose_printk(KERN_NOTICE "Jump to NULL address\n");
 		else
 #endif
@@ -960,6 +953,7 @@ void dump_bfin_process(struct pt_regs *fp)
 		else
 			verbose_printk(KERN_NOTICE "COMM= invalid\n");
 
+		printk(KERN_NOTICE "CPU = %d\n", current_thread_info()->cpu);
 		if (!((unsigned long)current->mm & 0x3) && (unsigned long)current->mm >= FIXED_CODE_START)
 			verbose_printk(KERN_NOTICE  "TEXT = 0x%p-0x%p        DATA = 0x%p-0x%p\n"
 				KERN_NOTICE " BSS = 0x%p-0x%p  USER-STACK = 0x%p\n"
@@ -1053,6 +1047,7 @@ void show_regs(struct pt_regs *fp)
 	struct irqaction *action;
 	unsigned int i;
 	unsigned long flags;
+	unsigned int cpu = smp_processor_id();
 
 	verbose_printk(KERN_NOTICE "\n" KERN_NOTICE "SEQUENCER STATUS:\t\t%s\n", print_tainted());
 	verbose_printk(KERN_NOTICE " SEQSTAT: %08lx  IPEND: %04lx  SYSCFG: %04lx\n",
@@ -1112,9 +1107,9 @@ unlock:
 
 	if (((long)fp->seqstat &  SEQSTAT_EXCAUSE) &&
 	    (((long)fp->seqstat & SEQSTAT_EXCAUSE) != VEC_HWERR)) {
-		decode_address(buf, saved_dcplb_fault_addr);
+		decode_address(buf, cpu_pda[cpu].dcplb_fault_addr);
 		verbose_printk(KERN_NOTICE "DCPLB_FAULT_ADDR: %s\n", buf);
-		decode_address(buf, saved_icplb_fault_addr);
+		decode_address(buf, cpu_pda[cpu].icplb_fault_addr);
 		verbose_printk(KERN_NOTICE "ICPLB_FAULT_ADDR: %s\n", buf);
 	}
 
@@ -1153,20 +1148,21 @@ unlock:
 asmlinkage int sys_bfin_spinlock(int *spinlock)__attribute__((l1_text));
 #endif
 
-asmlinkage int sys_bfin_spinlock(int *spinlock)
+static DEFINE_SPINLOCK(bfin_spinlock_lock);
+
+asmlinkage int sys_bfin_spinlock(int *p)
 {
-	int ret = 0;
-	int tmp = 0;
+	int ret, tmp = 0;
 
-	local_irq_disable();
-	ret = get_user(tmp, spinlock);
-	if (ret == 0) {
-		if (tmp)
+	spin_lock(&bfin_spinlock_lock);	/* This would also hold kernel preemption. */
+	ret = get_user(tmp, p);
+	if (likely(ret == 0)) {
+		if (unlikely(tmp))
 			ret = 1;
-		tmp = 1;
-		put_user(tmp, spinlock);
+		else
+			put_user(1, p);
 	}
-	local_irq_enable();
+	spin_unlock(&bfin_spinlock_lock);
 	return ret;
 }
 
diff --git a/arch/blackfin/mm/init.c b/arch/blackfin/mm/init.c
index bc240abb8745..57d306b9c56d 100644
--- a/arch/blackfin/mm/init.c
+++ b/arch/blackfin/mm/init.c
@@ -31,7 +31,8 @@
 #include <linux/bootmem.h>
 #include <linux/uaccess.h>
 #include <asm/bfin-global.h>
-#include <asm/l1layout.h>
+#include <asm/pda.h>
+#include <asm/cplbinit.h>
 #include "blackfin_sram.h"
 
 /*
@@ -53,6 +54,11 @@ static unsigned long empty_bad_page;
 
 unsigned long empty_zero_page;
 
+extern unsigned long exception_stack[NR_CPUS][1024];
+
+struct blackfin_pda cpu_pda[NR_CPUS];
+EXPORT_SYMBOL(cpu_pda);
+
 /*
  * paging_init() continues the virtual memory environment setup which
  * was begun by the code in arch/head.S.
@@ -98,6 +104,42 @@ void __init paging_init(void)
 	}
 }
 
+asmlinkage void init_pda(void)
+{
+	unsigned int cpu = raw_smp_processor_id();
+
+	/* Initialize the PDA fields holding references to other parts
+	   of the memory. The content of such memory is still
+	   undefined at the time of the call, we are only setting up
+	   valid pointers to it. */
+	memset(&cpu_pda[cpu], 0, sizeof(cpu_pda[cpu]));
+
+	cpu_pda[0].next = &cpu_pda[1];
+	cpu_pda[1].next = &cpu_pda[0];
+
+	cpu_pda[cpu].ex_stack = exception_stack[cpu + 1];
+
+#ifdef CONFIG_MPU
+#else
+	cpu_pda[cpu].ipdt = ipdt_tables[cpu];
+	cpu_pda[cpu].dpdt = dpdt_tables[cpu];
+#ifdef CONFIG_CPLB_INFO
+	cpu_pda[cpu].ipdt_swapcount = ipdt_swapcount_tables[cpu];
+	cpu_pda[cpu].dpdt_swapcount = dpdt_swapcount_tables[cpu];
+#endif
+#endif
+
+#ifdef CONFIG_SMP
+	cpu_pda[cpu].imask = 0x1f;
+#endif
+}
+
+void __cpuinit reserve_pda(void)
+{
+	printk(KERN_INFO "PDA for CPU%u reserved at %p\n", smp_processor_id(),
+					&cpu_pda[smp_processor_id()]);
+}
+
 void __init mem_init(void)
 {
 	unsigned int codek = 0, datak = 0, initk = 0;
@@ -141,21 +183,13 @@ void __init mem_init(void)
 
 static int __init sram_init(void)
 {
-	unsigned long tmp;
-
 	/* Initialize the blackfin L1 Memory. */
 	bfin_sram_init();
 
-	/* Allocate this once; never free it.  We assume this gives us a
-	   pointer to the start of L1 scratchpad memory; panic if it
-	   doesn't.  */
-	tmp = (unsigned long)l1sram_alloc(sizeof(struct l1_scratch_task_info));
-	if (tmp != (unsigned long)L1_SCRATCH_TASK_INFO) {
-		printk(KERN_EMERG "mem_init(): Did not get the right address from l1sram_alloc: %08lx != %08lx\n",
-			tmp, (unsigned long)L1_SCRATCH_TASK_INFO);
-		panic("No L1, time to give up\n");
-	}
-
+	/* Reserve the PDA space for the boot CPU right after we
+	 * initialized the scratch memory allocator.
+	 */
+	reserve_pda();
 	return 0;
 }
 pure_initcall(sram_init);
diff --git a/arch/blackfin/mm/sram-alloc.c b/arch/blackfin/mm/sram-alloc.c
index cc6f336e7313..8f82b4c92d07 100644
--- a/arch/blackfin/mm/sram-alloc.c
+++ b/arch/blackfin/mm/sram-alloc.c
@@ -41,8 +41,10 @@
 #include <asm/blackfin.h>
 #include "blackfin_sram.h"
 
-static spinlock_t l1sram_lock, l1_data_sram_lock, l1_inst_sram_lock;
-static spinlock_t l2_sram_lock;
+static DEFINE_PER_CPU(spinlock_t, l1sram_lock) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(spinlock_t, l1_data_sram_lock) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(spinlock_t, l1_inst_sram_lock) ____cacheline_aligned_in_smp;
+static spinlock_t l2_sram_lock ____cacheline_aligned_in_smp;
 
 /* the data structure for L1 scratchpad and DATA SRAM */
 struct sram_piece {
@@ -52,18 +54,22 @@ struct sram_piece {
 	struct sram_piece *next;
 };
 
-static struct sram_piece free_l1_ssram_head, used_l1_ssram_head;
+static DEFINE_PER_CPU(struct sram_piece, free_l1_ssram_head);
+static DEFINE_PER_CPU(struct sram_piece, used_l1_ssram_head);
 
 #if L1_DATA_A_LENGTH != 0
-static struct sram_piece free_l1_data_A_sram_head, used_l1_data_A_sram_head;
+static DEFINE_PER_CPU(struct sram_piece, free_l1_data_A_sram_head);
+static DEFINE_PER_CPU(struct sram_piece, used_l1_data_A_sram_head);
 #endif
 
 #if L1_DATA_B_LENGTH != 0
-static struct sram_piece free_l1_data_B_sram_head, used_l1_data_B_sram_head;
+static DEFINE_PER_CPU(struct sram_piece, free_l1_data_B_sram_head);
+static DEFINE_PER_CPU(struct sram_piece, used_l1_data_B_sram_head);
 #endif
 
 #if L1_CODE_LENGTH != 0
-static struct sram_piece free_l1_inst_sram_head, used_l1_inst_sram_head;
+static DEFINE_PER_CPU(struct sram_piece, free_l1_inst_sram_head);
+static DEFINE_PER_CPU(struct sram_piece, used_l1_inst_sram_head);
 #endif
 
 #if L2_LENGTH != 0
@@ -75,102 +81,115 @@ static struct kmem_cache *sram_piece_cache;
 /* L1 Scratchpad SRAM initialization function */
 static void __init l1sram_init(void)
 {
-	free_l1_ssram_head.next =
-		kmem_cache_alloc(sram_piece_cache, GFP_KERNEL);
-	if (!free_l1_ssram_head.next) {
-		printk(KERN_INFO "Failed to initialize Scratchpad data SRAM\n");
-		return;
+	unsigned int cpu;
+	for (cpu = 0; cpu < num_possible_cpus(); ++cpu) {
+		per_cpu(free_l1_ssram_head, cpu).next =
+			kmem_cache_alloc(sram_piece_cache, GFP_KERNEL);
+		if (!per_cpu(free_l1_ssram_head, cpu).next) {
+			printk(KERN_INFO "Fail to initialize Scratchpad data SRAM.\n");
+			return;
+		}
+
+		per_cpu(free_l1_ssram_head, cpu).next->paddr = (void *)get_l1_scratch_start_cpu(cpu);
+		per_cpu(free_l1_ssram_head, cpu).next->size = L1_SCRATCH_LENGTH;
+		per_cpu(free_l1_ssram_head, cpu).next->pid = 0;
+		per_cpu(free_l1_ssram_head, cpu).next->next = NULL;
+
+		per_cpu(used_l1_ssram_head, cpu).next = NULL;
+
+		/* mutex initialize */
+		spin_lock_init(&per_cpu(l1sram_lock, cpu));
+		printk(KERN_INFO "Blackfin Scratchpad data SRAM: %d KB\n",
+			L1_SCRATCH_LENGTH >> 10);
 	}
-
-	free_l1_ssram_head.next->paddr = (void *)L1_SCRATCH_START;
-	free_l1_ssram_head.next->size = L1_SCRATCH_LENGTH;
-	free_l1_ssram_head.next->pid = 0;
-	free_l1_ssram_head.next->next = NULL;
-
-	used_l1_ssram_head.next = NULL;
-
-	/* mutex initialize */
-	spin_lock_init(&l1sram_lock);
-
-	printk(KERN_INFO "Blackfin Scratchpad data SRAM: %d KB\n",
-	       L1_SCRATCH_LENGTH >> 10);
 }
 
 static void __init l1_data_sram_init(void)
 {
+	unsigned int cpu;
 #if L1_DATA_A_LENGTH != 0
-	free_l1_data_A_sram_head.next =
-		kmem_cache_alloc(sram_piece_cache, GFP_KERNEL);
-	if (!free_l1_data_A_sram_head.next) {
-		printk(KERN_INFO "Failed to initialize L1 Data A SRAM\n");
-		return;
+	for (cpu = 0; cpu < num_possible_cpus(); ++cpu) {
+		per_cpu(free_l1_data_A_sram_head, cpu).next =
+			kmem_cache_alloc(sram_piece_cache, GFP_KERNEL);
+		if (!per_cpu(free_l1_data_A_sram_head, cpu).next) {
+			printk(KERN_INFO "Fail to initialize L1 Data A SRAM.\n");
+			return;
+		}
+
+		per_cpu(free_l1_data_A_sram_head, cpu).next->paddr =
+			(void *)get_l1_data_a_start_cpu(cpu) + (_ebss_l1 - _sdata_l1);
+		per_cpu(free_l1_data_A_sram_head, cpu).next->size =
+			L1_DATA_A_LENGTH - (_ebss_l1 - _sdata_l1);
+		per_cpu(free_l1_data_A_sram_head, cpu).next->pid = 0;
+		per_cpu(free_l1_data_A_sram_head, cpu).next->next = NULL;
+
+		per_cpu(used_l1_data_A_sram_head, cpu).next = NULL;
+
+		printk(KERN_INFO "Blackfin L1 Data A SRAM: %d KB (%d KB free)\n",
+			L1_DATA_A_LENGTH >> 10,
+			per_cpu(free_l1_data_A_sram_head, cpu).next->size >> 10);
 	}
-
-	free_l1_data_A_sram_head.next->paddr =
-		(void *)L1_DATA_A_START + (_ebss_l1 - _sdata_l1);
-	free_l1_data_A_sram_head.next->size =
-		L1_DATA_A_LENGTH - (_ebss_l1 - _sdata_l1);
-	free_l1_data_A_sram_head.next->pid = 0;
-	free_l1_data_A_sram_head.next->next = NULL;
-
-	used_l1_data_A_sram_head.next = NULL;
-
-	printk(KERN_INFO "Blackfin L1 Data A SRAM: %d KB (%d KB free)\n",
-		L1_DATA_A_LENGTH >> 10,
-		free_l1_data_A_sram_head.next->size >> 10);
 #endif
 #if L1_DATA_B_LENGTH != 0
-	free_l1_data_B_sram_head.next =
-		kmem_cache_alloc(sram_piece_cache, GFP_KERNEL);
-	if (!free_l1_data_B_sram_head.next) {
-		printk(KERN_INFO "Failed to initialize L1 Data B SRAM\n");
-		return;
+	for (cpu = 0; cpu < num_possible_cpus(); ++cpu) {
+		per_cpu(free_l1_data_B_sram_head, cpu).next =
+			kmem_cache_alloc(sram_piece_cache, GFP_KERNEL);
+		if (!per_cpu(free_l1_data_B_sram_head, cpu).next) {
+			printk(KERN_INFO "Fail to initialize L1 Data B SRAM.\n");
+			return;
+		}
+
+		per_cpu(free_l1_data_B_sram_head, cpu).next->paddr =
+			(void *)get_l1_data_b_start_cpu(cpu) + (_ebss_b_l1 - _sdata_b_l1);
+		per_cpu(free_l1_data_B_sram_head, cpu).next->size =
+			L1_DATA_B_LENGTH - (_ebss_b_l1 - _sdata_b_l1);
+		per_cpu(free_l1_data_B_sram_head, cpu).next->pid = 0;
+		per_cpu(free_l1_data_B_sram_head, cpu).next->next = NULL;
+
+		per_cpu(used_l1_data_B_sram_head, cpu).next = NULL;
+
+		printk(KERN_INFO "Blackfin L1 Data B SRAM: %d KB (%d KB free)\n",
+			L1_DATA_B_LENGTH >> 10,
+			per_cpu(free_l1_data_B_sram_head, cpu).next->size >> 10);
+		/* mutex initialize */
 	}
-
-	free_l1_data_B_sram_head.next->paddr =
-		(void *)L1_DATA_B_START + (_ebss_b_l1 - _sdata_b_l1);
-	free_l1_data_B_sram_head.next->size =
-		L1_DATA_B_LENGTH - (_ebss_b_l1 - _sdata_b_l1);
-	free_l1_data_B_sram_head.next->pid = 0;
-	free_l1_data_B_sram_head.next->next = NULL;
-
-	used_l1_data_B_sram_head.next = NULL;
-
-	printk(KERN_INFO "Blackfin L1 Data B SRAM: %d KB (%d KB free)\n",
-		L1_DATA_B_LENGTH >> 10,
-		free_l1_data_B_sram_head.next->size >> 10);
 #endif
 
-	/* mutex initialize */
-	spin_lock_init(&l1_data_sram_lock);
+#if L1_DATA_A_LENGTH != 0 || L1_DATA_B_LENGTH != 0
+	for (cpu = 0; cpu < num_possible_cpus(); ++cpu)
+		spin_lock_init(&per_cpu(l1_data_sram_lock, cpu));
+#endif
 }
 
 static void __init l1_inst_sram_init(void)
 {
 #if L1_CODE_LENGTH != 0
-	free_l1_inst_sram_head.next =
-		kmem_cache_alloc(sram_piece_cache, GFP_KERNEL);
-	if (!free_l1_inst_sram_head.next) {
-		printk(KERN_INFO "Failed to initialize L1 Instruction SRAM\n");
-		return;
+	unsigned int cpu;
+	for (cpu = 0; cpu < num_possible_cpus(); ++cpu) {
+		per_cpu(free_l1_inst_sram_head, cpu).next =
+			kmem_cache_alloc(sram_piece_cache, GFP_KERNEL);
+		if (!per_cpu(free_l1_inst_sram_head, cpu).next) {
+			printk(KERN_INFO "Failed to initialize L1 Instruction SRAM\n");
+			return;
+		}
+
+		per_cpu(free_l1_inst_sram_head, cpu).next->paddr =
+			(void *)get_l1_code_start_cpu(cpu) + (_etext_l1 - _stext_l1);
+		per_cpu(free_l1_inst_sram_head, cpu).next->size =
+			L1_CODE_LENGTH - (_etext_l1 - _stext_l1);
+		per_cpu(free_l1_inst_sram_head, cpu).next->pid = 0;
+		per_cpu(free_l1_inst_sram_head, cpu).next->next = NULL;
+
+		per_cpu(used_l1_inst_sram_head, cpu).next = NULL;
+
+		printk(KERN_INFO "Blackfin L1 Instruction SRAM: %d KB (%d KB free)\n",
+			L1_CODE_LENGTH >> 10,
+			per_cpu(free_l1_inst_sram_head, cpu).next->size >> 10);
+
+		/* mutex initialize */
+		spin_lock_init(&per_cpu(l1_inst_sram_lock, cpu));
 	}
-
-	free_l1_inst_sram_head.next->paddr =
-		(void *)L1_CODE_START + (_etext_l1 - _stext_l1);
-	free_l1_inst_sram_head.next->size =
-		L1_CODE_LENGTH - (_etext_l1 - _stext_l1);
-	free_l1_inst_sram_head.next->pid = 0;
-	free_l1_inst_sram_head.next->next = NULL;
-
-	used_l1_inst_sram_head.next = NULL;
-
-	printk(KERN_INFO "Blackfin L1 Instruction SRAM: %d KB (%d KB free)\n",
-		L1_CODE_LENGTH >> 10,
-		free_l1_inst_sram_head.next->size >> 10);
 #endif
-
-	/* mutex initialize */
-	spin_lock_init(&l1_inst_sram_lock);
 }
 
 static void __init l2_sram_init(void)
@@ -179,7 +198,7 @@ static void __init l2_sram_init(void)
 	free_l2_sram_head.next =
 		kmem_cache_alloc(sram_piece_cache, GFP_KERNEL);
 	if (!free_l2_sram_head.next) {
-		printk(KERN_INFO "Failed to initialize L2 SRAM\n");
+		printk(KERN_INFO "Fail to initialize L2 SRAM.\n");
 		return;
 	}
 
@@ -200,6 +219,7 @@ static void __init l2_sram_init(void)
 	/* mutex initialize */
 	spin_lock_init(&l2_sram_lock);
 }
+
 void __init bfin_sram_init(void)
 {
 	sram_piece_cache = kmem_cache_create("sram_piece_cache",
@@ -353,20 +373,20 @@ int sram_free(const void *addr)
 {
 
 #if L1_CODE_LENGTH != 0
-	if (addr >= (void *)L1_CODE_START
-		 && addr < (void *)(L1_CODE_START + L1_CODE_LENGTH))
+	if (addr >= (void *)get_l1_code_start()
+		 && addr < (void *)(get_l1_code_start() + L1_CODE_LENGTH))
 		return l1_inst_sram_free(addr);
 	else
 #endif
 #if L1_DATA_A_LENGTH != 0
-	if (addr >= (void *)L1_DATA_A_START
-		 && addr < (void *)(L1_DATA_A_START + L1_DATA_A_LENGTH))
+	if (addr >= (void *)get_l1_data_a_start()
+		 && addr < (void *)(get_l1_data_a_start() + L1_DATA_A_LENGTH))
 		return l1_data_A_sram_free(addr);
 	else
 #endif
 #if L1_DATA_B_LENGTH != 0
-	if (addr >= (void *)L1_DATA_B_START
-		 && addr < (void *)(L1_DATA_B_START + L1_DATA_B_LENGTH))
+	if (addr >= (void *)get_l1_data_b_start()
+		 && addr < (void *)(get_l1_data_b_start() + L1_DATA_B_LENGTH))
 		return l1_data_B_sram_free(addr);
 	else
 #endif
@@ -384,17 +404,20 @@ void *l1_data_A_sram_alloc(size_t size)
 {
 	unsigned long flags;
 	void *addr = NULL;
+	unsigned int cpu;
 
+	cpu = get_cpu();
 	/* add mutex operation */
-	spin_lock_irqsave(&l1_data_sram_lock, flags);
+	spin_lock_irqsave(&per_cpu(l1_data_sram_lock, cpu), flags);
 
 #if L1_DATA_A_LENGTH != 0
-	addr = _sram_alloc(size, &free_l1_data_A_sram_head,
-			&used_l1_data_A_sram_head);
+	addr = _sram_alloc(size, &per_cpu(free_l1_data_A_sram_head, cpu),
+			&per_cpu(used_l1_data_A_sram_head, cpu));
 #endif
 
 	/* add mutex operation */
-	spin_unlock_irqrestore(&l1_data_sram_lock, flags);
+	spin_unlock_irqrestore(&per_cpu(l1_data_sram_lock, cpu), flags);
+	put_cpu();
 
 	pr_debug("Allocated address in l1_data_A_sram_alloc is 0x%lx+0x%lx\n",
 		 (long unsigned int)addr, size);
@@ -407,19 +430,22 @@ int l1_data_A_sram_free(const void *addr)
 {
 	unsigned long flags;
 	int ret;
+	unsigned int cpu;
 
+	cpu = get_cpu();
 	/* add mutex operation */
-	spin_lock_irqsave(&l1_data_sram_lock, flags);
+	spin_lock_irqsave(&per_cpu(l1_data_sram_lock, cpu), flags);
 
 #if L1_DATA_A_LENGTH != 0
-	ret = _sram_free(addr, &free_l1_data_A_sram_head,
-			&used_l1_data_A_sram_head);
+	ret = _sram_free(addr, &per_cpu(free_l1_data_A_sram_head, cpu),
+			&per_cpu(used_l1_data_A_sram_head, cpu));
 #else
 	ret = -1;
 #endif
 
 	/* add mutex operation */
-	spin_unlock_irqrestore(&l1_data_sram_lock, flags);
+	spin_unlock_irqrestore(&per_cpu(l1_data_sram_lock, cpu), flags);
+	put_cpu();
 
 	return ret;
 }
@@ -430,15 +456,18 @@ void *l1_data_B_sram_alloc(size_t size)
 #if L1_DATA_B_LENGTH != 0
 	unsigned long flags;
 	void *addr;
+	unsigned int cpu;
 
+	cpu = get_cpu();
 	/* add mutex operation */
-	spin_lock_irqsave(&l1_data_sram_lock, flags);
+	spin_lock_irqsave(&per_cpu(l1_data_sram_lock, cpu), flags);
 
-	addr = _sram_alloc(size, &free_l1_data_B_sram_head,
-			&used_l1_data_B_sram_head);
+	addr = _sram_alloc(size, &per_cpu(free_l1_data_B_sram_head, cpu),
+			&per_cpu(used_l1_data_B_sram_head, cpu));
 
 	/* add mutex operation */
-	spin_unlock_irqrestore(&l1_data_sram_lock, flags);
+	spin_unlock_irqrestore(&per_cpu(l1_data_sram_lock, cpu), flags);
+	put_cpu();
 
 	pr_debug("Allocated address in l1_data_B_sram_alloc is 0x%lx+0x%lx\n",
 		 (long unsigned int)addr, size);
@@ -455,15 +484,18 @@ int l1_data_B_sram_free(const void *addr)
 #if L1_DATA_B_LENGTH != 0
 	unsigned long flags;
 	int ret;
+	unsigned int cpu;
 
+	cpu = get_cpu();
 	/* add mutex operation */
-	spin_lock_irqsave(&l1_data_sram_lock, flags);
+	spin_lock_irqsave(&per_cpu(l1_data_sram_lock, cpu), flags);
 
-	ret = _sram_free(addr, &free_l1_data_B_sram_head,
-			&used_l1_data_B_sram_head);
+	ret = _sram_free(addr, &per_cpu(free_l1_data_B_sram_head, cpu),
+			&per_cpu(used_l1_data_B_sram_head, cpu));
 
 	/* add mutex operation */
-	spin_unlock_irqrestore(&l1_data_sram_lock, flags);
+	spin_unlock_irqrestore(&per_cpu(l1_data_sram_lock, cpu), flags);
+	put_cpu();
 
 	return ret;
 #else
@@ -509,15 +541,18 @@ void *l1_inst_sram_alloc(size_t size)
 #if L1_CODE_LENGTH != 0
 	unsigned long flags;
 	void *addr;
+	unsigned int cpu;
 
+	cpu = get_cpu();
 	/* add mutex operation */
-	spin_lock_irqsave(&l1_inst_sram_lock, flags);
+	spin_lock_irqsave(&per_cpu(l1_inst_sram_lock, cpu), flags);
 
-	addr = _sram_alloc(size, &free_l1_inst_sram_head,
-			&used_l1_inst_sram_head);
+	addr = _sram_alloc(size, &per_cpu(free_l1_inst_sram_head, cpu),
+			&per_cpu(used_l1_inst_sram_head, cpu));
 
 	/* add mutex operation */
-	spin_unlock_irqrestore(&l1_inst_sram_lock, flags);
+	spin_unlock_irqrestore(&per_cpu(l1_inst_sram_lock, cpu), flags);
+	put_cpu();
 
 	pr_debug("Allocated address in l1_inst_sram_alloc is 0x%lx+0x%lx\n",
 		 (long unsigned int)addr, size);
@@ -534,15 +569,18 @@ int l1_inst_sram_free(const void *addr)
 #if L1_CODE_LENGTH != 0
 	unsigned long flags;
 	int ret;
+	unsigned int cpu;
 
+	cpu = get_cpu();
 	/* add mutex operation */
-	spin_lock_irqsave(&l1_inst_sram_lock, flags);
+	spin_lock_irqsave(&per_cpu(l1_inst_sram_lock, cpu), flags);
 
-	ret = _sram_free(addr, &free_l1_inst_sram_head,
-			&used_l1_inst_sram_head);
+	ret = _sram_free(addr, &per_cpu(free_l1_inst_sram_head, cpu),
+			&per_cpu(used_l1_inst_sram_head, cpu));
 
 	/* add mutex operation */
-	spin_unlock_irqrestore(&l1_inst_sram_lock, flags);
+	spin_unlock_irqrestore(&per_cpu(l1_inst_sram_lock, cpu), flags);
+	put_cpu();
 
 	return ret;
 #else
@@ -556,15 +594,18 @@ void *l1sram_alloc(size_t size)
 {
 	unsigned long flags;
 	void *addr;
+	unsigned int cpu;
 
+	cpu = get_cpu();
 	/* add mutex operation */
-	spin_lock_irqsave(&l1sram_lock, flags);
+	spin_lock_irqsave(&per_cpu(l1sram_lock, cpu), flags);
 
-	addr = _sram_alloc(size, &free_l1_ssram_head,
-			&used_l1_ssram_head);
+	addr = _sram_alloc(size, &per_cpu(free_l1_ssram_head, cpu),
+			&per_cpu(used_l1_ssram_head, cpu));
 
 	/* add mutex operation */
-	spin_unlock_irqrestore(&l1sram_lock, flags);
+	spin_unlock_irqrestore(&per_cpu(l1sram_lock, cpu), flags);
+	put_cpu();
 
 	return addr;
 }
@@ -574,15 +615,18 @@ void *l1sram_alloc_max(size_t *psize)
 {
 	unsigned long flags;
 	void *addr;
+	unsigned int cpu;
 
+	cpu = get_cpu();
 	/* add mutex operation */
-	spin_lock_irqsave(&l1sram_lock, flags);
+	spin_lock_irqsave(&per_cpu(l1sram_lock, cpu), flags);
 
-	addr = _sram_alloc_max(&free_l1_ssram_head,
-			&used_l1_ssram_head, psize);
+	addr = _sram_alloc_max(&per_cpu(free_l1_ssram_head, cpu),
+			&per_cpu(used_l1_ssram_head, cpu), psize);
 
 	/* add mutex operation */
-	spin_unlock_irqrestore(&l1sram_lock, flags);
+	spin_unlock_irqrestore(&per_cpu(l1sram_lock, cpu), flags);
+	put_cpu();
 
 	return addr;
 }
@@ -592,15 +636,18 @@ int l1sram_free(const void *addr)
 {
 	unsigned long flags;
 	int ret;
+	unsigned int cpu;
 
+	cpu = get_cpu();
 	/* add mutex operation */
-	spin_lock_irqsave(&l1sram_lock, flags);
+	spin_lock_irqsave(&per_cpu(l1sram_lock, cpu), flags);
 
-	ret = _sram_free(addr, &free_l1_ssram_head,
-			&used_l1_ssram_head);
+	ret = _sram_free(addr, &per_cpu(free_l1_ssram_head, cpu),
+			&per_cpu(used_l1_ssram_head, cpu));
 
 	/* add mutex operation */
-	spin_unlock_irqrestore(&l1sram_lock, flags);
+	spin_unlock_irqrestore(&per_cpu(l1sram_lock, cpu), flags);
+	put_cpu();
 
 	return ret;
 }
@@ -761,33 +808,36 @@ static int sram_proc_read(char *buf, char **start, off_t offset, int count,
 		int *eof, void *data)
 {
 	int len = 0;
+	unsigned int cpu;
 
-	if (_sram_proc_read(buf, &len, count, "Scratchpad",
-			&free_l1_ssram_head, &used_l1_ssram_head))
-		goto not_done;
+	for (cpu = 0; cpu < num_possible_cpus(); ++cpu) {
+		if (_sram_proc_read(buf, &len, count, "Scratchpad",
+			&per_cpu(free_l1_ssram_head, cpu), &per_cpu(used_l1_ssram_head, cpu)))
+			goto not_done;
 #if L1_DATA_A_LENGTH != 0
-	if (_sram_proc_read(buf, &len, count, "L1 Data A",
-			&free_l1_data_A_sram_head,
-			&used_l1_data_A_sram_head))
-		goto not_done;
+		if (_sram_proc_read(buf, &len, count, "L1 Data A",
+			&per_cpu(free_l1_data_A_sram_head, cpu),
+			&per_cpu(used_l1_data_A_sram_head, cpu)))
+			goto not_done;
 #endif
 #if L1_DATA_B_LENGTH != 0
-	if (_sram_proc_read(buf, &len, count, "L1 Data B",
-			&free_l1_data_B_sram_head,
-			&used_l1_data_B_sram_head))
-		goto not_done;
+		if (_sram_proc_read(buf, &len, count, "L1 Data B",
+			&per_cpu(free_l1_data_B_sram_head, cpu),
+			&per_cpu(used_l1_data_B_sram_head, cpu)))
+			goto not_done;
 #endif
 #if L1_CODE_LENGTH != 0
-	if (_sram_proc_read(buf, &len, count, "L1 Instruction",
-			&free_l1_inst_sram_head, &used_l1_inst_sram_head))
-		goto not_done;
+		if (_sram_proc_read(buf, &len, count, "L1 Instruction",
+			&per_cpu(free_l1_inst_sram_head, cpu),
+			&per_cpu(used_l1_inst_sram_head, cpu)))
+			goto not_done;
 #endif
+	}
 #if L2_LENGTH != 0
-	if (_sram_proc_read(buf, &len, count, "L2",
-			&free_l2_sram_head, &used_l2_sram_head))
+	if (_sram_proc_read(buf, &len, count, "L2", &free_l2_sram_head,
+		&used_l2_sram_head))
 		goto not_done;
 #endif
-
 	*eof = 1;
  not_done:
 	return len;
-- 
cgit v1.2.3-59-g8ed1b