From 9895f9429cb489ba271c06767531083ae4c4bcbe Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 21 Nov 2007 22:46:14 +0900 Subject: sh: clear/copy_page renames in lib and lib64. Signed-off-by: Paul Mundt --- arch/sh/lib/Makefile | 3 +- arch/sh/lib/clear_page.S | 152 ++++++++++++++++++ arch/sh/lib/copy_page.S | 388 +++++++++++++++++++++++++++++++++++++++++++++ arch/sh/lib64/Makefile | 2 +- arch/sh/lib64/clear_page.S | 54 +++++++ arch/sh/lib64/copy_page.S | 89 +++++++++++ arch/sh/lib64/page_clear.S | 54 ------- arch/sh/lib64/page_copy.S | 89 ----------- arch/sh/mm/Makefile_32 | 3 +- arch/sh/mm/clear_page.S | 152 ------------------ arch/sh/mm/copy_page.S | 388 --------------------------------------------- 11 files changed, 687 insertions(+), 687 deletions(-) create mode 100644 arch/sh/lib/clear_page.S create mode 100644 arch/sh/lib/copy_page.S create mode 100644 arch/sh/lib64/clear_page.S create mode 100644 arch/sh/lib64/copy_page.S delete mode 100644 arch/sh/lib64/page_clear.S delete mode 100644 arch/sh/lib64/page_copy.S delete mode 100644 arch/sh/mm/clear_page.S delete mode 100644 arch/sh/mm/copy_page.S (limited to 'arch') diff --git a/arch/sh/lib/Makefile b/arch/sh/lib/Makefile index 6f7ac9eeb54f..ebb55d1149f5 100644 --- a/arch/sh/lib/Makefile +++ b/arch/sh/lib/Makefile @@ -8,6 +8,7 @@ lib-y = delay.o io.o memset.o memmove.o memchr.o \ memcpy-y := memcpy.o memcpy-$(CONFIG_CPU_SH4) := memcpy-sh4.o -lib-y += $(memcpy-y) +lib-$(CONFIG_MMU) += copy_page.o clear_page.o +lib-y += $(memcpy-y) EXTRA_CFLAGS += -Werror diff --git a/arch/sh/lib/clear_page.S b/arch/sh/lib/clear_page.S new file mode 100644 index 000000000000..7a7c81ee3f01 --- /dev/null +++ b/arch/sh/lib/clear_page.S @@ -0,0 +1,152 @@ +/* + * __clear_user_page, __clear_user, clear_page implementation of SuperH + * + * Copyright (C) 2001 Kaz Kojima + * Copyright (C) 2001, 2002 Niibe Yutaka + * Copyright (C) 2006 Paul Mundt + */ +#include +#include + +/* + * clear_page_slow + * @to: P1 address + * + * void clear_page_slow(void *to) + */ + +/* + * r0 --- scratch + * r4 --- to + * r5 --- to + PAGE_SIZE + */ +ENTRY(clear_page_slow) + mov r4,r5 + mov.l .Llimit,r0 + add r0,r5 + mov #0,r0 + ! +1: +#if defined(CONFIG_CPU_SH3) + mov.l r0,@r4 +#elif defined(CONFIG_CPU_SH4) + movca.l r0,@r4 + mov r4,r1 +#endif + add #32,r4 + mov.l r0,@-r4 + mov.l r0,@-r4 + mov.l r0,@-r4 + mov.l r0,@-r4 + mov.l r0,@-r4 + mov.l r0,@-r4 + mov.l r0,@-r4 +#if defined(CONFIG_CPU_SH4) + ocbwb @r1 +#endif + cmp/eq r5,r4 + bf/s 1b + add #28,r4 + ! + rts + nop +.Llimit: .long (PAGE_SIZE-28) + +ENTRY(__clear_user) + ! + mov #0, r0 + mov #0xe0, r1 ! 0xffffffe0 + ! + ! r4..(r4+31)&~32 -------- not aligned [ Area 0 ] + ! (r4+31)&~32..(r4+r5)&~32 -------- aligned [ Area 1 ] + ! (r4+r5)&~32..r4+r5 -------- not aligned [ Area 2 ] + ! + ! Clear area 0 + mov r4, r2 + ! + tst r1, r5 ! length < 32 + bt .Larea2 ! skip to remainder + ! + add #31, r2 + and r1, r2 + cmp/eq r4, r2 + bt .Larea1 + mov r2, r3 + sub r4, r3 + mov r3, r7 + mov r4, r2 + ! +.L0: dt r3 +0: mov.b r0, @r2 + bf/s .L0 + add #1, r2 + ! + sub r7, r5 + mov r2, r4 +.Larea1: + mov r4, r3 + add r5, r3 + and r1, r3 + cmp/hi r2, r3 + bf .Larea2 + ! + ! Clear area 1 +#if defined(CONFIG_CPU_SH4) +1: movca.l r0, @r2 +#else +1: mov.l r0, @r2 +#endif + add #4, r2 +2: mov.l r0, @r2 + add #4, r2 +3: mov.l r0, @r2 + add #4, r2 +4: mov.l r0, @r2 + add #4, r2 +5: mov.l r0, @r2 + add #4, r2 +6: mov.l r0, @r2 + add #4, r2 +7: mov.l r0, @r2 + add #4, r2 +8: mov.l r0, @r2 + add #4, r2 + cmp/hi r2, r3 + bt/s 1b + nop + ! + ! 
Clear area 2 +.Larea2: + mov r4, r3 + add r5, r3 + cmp/hs r3, r2 + bt/s .Ldone + sub r2, r3 +.L2: dt r3 +9: mov.b r0, @r2 + bf/s .L2 + add #1, r2 + ! +.Ldone: rts + mov #0, r0 ! return 0 as normal return + + ! return the number of bytes remained +.Lbad_clear_user: + mov r4, r0 + add r5, r0 + rts + sub r2, r0 + +.section __ex_table,"a" + .align 2 + .long 0b, .Lbad_clear_user + .long 1b, .Lbad_clear_user + .long 2b, .Lbad_clear_user + .long 3b, .Lbad_clear_user + .long 4b, .Lbad_clear_user + .long 5b, .Lbad_clear_user + .long 6b, .Lbad_clear_user + .long 7b, .Lbad_clear_user + .long 8b, .Lbad_clear_user + .long 9b, .Lbad_clear_user +.previous diff --git a/arch/sh/lib/copy_page.S b/arch/sh/lib/copy_page.S new file mode 100644 index 000000000000..b879545fa28b --- /dev/null +++ b/arch/sh/lib/copy_page.S @@ -0,0 +1,388 @@ +/* + * copy_page, __copy_user_page, __copy_user implementation of SuperH + * + * Copyright (C) 2001 Niibe Yutaka & Kaz Kojima + * Copyright (C) 2002 Toshinobu Sugioka + * Copyright (C) 2006 Paul Mundt + */ +#include +#include + +/* + * copy_page + * @to: P1 address + * @from: P1 address + * + * void copy_page(void *to, void *from) + */ + +/* + * r0, r1, r2, r3, r4, r5, r6, r7 --- scratch + * r8 --- from + PAGE_SIZE + * r9 --- not used + * r10 --- to + * r11 --- from + */ +ENTRY(copy_page) + mov.l r8,@-r15 + mov.l r10,@-r15 + mov.l r11,@-r15 + mov r4,r10 + mov r5,r11 + mov r5,r8 + mov.l .Lpsz,r0 + add r0,r8 + ! +1: mov.l @r11+,r0 + mov.l @r11+,r1 + mov.l @r11+,r2 + mov.l @r11+,r3 + mov.l @r11+,r4 + mov.l @r11+,r5 + mov.l @r11+,r6 + mov.l @r11+,r7 +#if defined(CONFIG_CPU_SH3) + mov.l r0,@r10 +#elif defined(CONFIG_CPU_SH4) + movca.l r0,@r10 + mov r10,r0 +#endif + add #32,r10 + mov.l r7,@-r10 + mov.l r6,@-r10 + mov.l r5,@-r10 + mov.l r4,@-r10 + mov.l r3,@-r10 + mov.l r2,@-r10 + mov.l r1,@-r10 +#if defined(CONFIG_CPU_SH4) + ocbwb @r0 +#endif + cmp/eq r11,r8 + bf/s 1b + add #28,r10 + ! + mov.l @r15+,r11 + mov.l @r15+,r10 + mov.l @r15+,r8 + rts + nop + + .align 2 +.Lpsz: .long PAGE_SIZE +/* + * __kernel_size_t __copy_user(void *to, const void *from, __kernel_size_t n); + * Return the number of bytes NOT copied + */ +#define EX(...) \ + 9999: __VA_ARGS__ ; \ + .section __ex_table, "a"; \ + .long 9999b, 6000f ; \ + .previous +ENTRY(__copy_user) + ! Check if small number of bytes + mov #11,r0 + mov r4,r3 + cmp/gt r0,r6 ! r6 (len) > r0 (11) + bf/s .L_cleanup_loop_no_pop + add r6,r3 ! last destination address + + ! Calculate bytes needed to align to src + mov.l r11,@-r15 + neg r5,r0 + mov.l r10,@-r15 + add #4,r0 + mov.l r9,@-r15 + and #3,r0 + mov.l r8,@-r15 + tst r0,r0 + bt 2f + +1: + ! Copy bytes to long word align src +EX( mov.b @r5+,r1 ) + dt r0 + add #-1,r6 +EX( mov.b r1,@r4 ) + bf/s 1b + add #1,r4 + + ! Jump to appropriate routine depending on dest +2: mov #3,r1 + mov r6, r2 + and r4,r1 + shlr2 r2 + shll2 r1 + mova .L_jump_tbl,r0 + mov.l @(r0,r1),r1 + jmp @r1 + nop + + .align 2 +.L_jump_tbl: + .long .L_dest00 + .long .L_dest01 + .long .L_dest10 + .long .L_dest11 + +/* + * Come here if there are less than 12 bytes to copy + * + * Keep the branch target close, so the bf/s callee doesn't overflow + * and result in a more expensive branch being inserted. This is the + * fast-path for small copies, the jump via the jump table will hit the + * default slow-path cleanup. -PFM. + */ +.L_cleanup_loop_no_pop: + tst r6,r6 ! Check explicitly for zero + bt 1f + +2: +EX( mov.b @r5+,r0 ) + dt r6 +EX( mov.b r0,@r4 ) + bf/s 2b + add #1,r4 + +1: mov #0,r0 ! 
normal return +5000: + +# Exception handler: +.section .fixup, "ax" +6000: + mov.l 8000f,r1 + mov r3,r0 + jmp @r1 + sub r4,r0 + .align 2 +8000: .long 5000b + +.previous + rts + nop + +! Destination = 00 + +.L_dest00: + ! Skip the large copy for small transfers + mov #(32+32-4), r0 + cmp/gt r6, r0 ! r0 (60) > r6 (len) + bt 1f + + ! Align dest to a 32 byte boundary + neg r4,r0 + add #0x20, r0 + and #0x1f, r0 + tst r0, r0 + bt 2f + + sub r0, r6 + shlr2 r0 +3: +EX( mov.l @r5+,r1 ) + dt r0 +EX( mov.l r1,@r4 ) + bf/s 3b + add #4,r4 + +2: +EX( mov.l @r5+,r0 ) +EX( mov.l @r5+,r1 ) +EX( mov.l @r5+,r2 ) +EX( mov.l @r5+,r7 ) +EX( mov.l @r5+,r8 ) +EX( mov.l @r5+,r9 ) +EX( mov.l @r5+,r10 ) +EX( mov.l @r5+,r11 ) +#ifdef CONFIG_CPU_SH4 +EX( movca.l r0,@r4 ) +#else +EX( mov.l r0,@r4 ) +#endif + add #-32, r6 +EX( mov.l r1,@(4,r4) ) + mov #32, r0 +EX( mov.l r2,@(8,r4) ) + cmp/gt r6, r0 ! r0 (32) > r6 (len) +EX( mov.l r7,@(12,r4) ) +EX( mov.l r8,@(16,r4) ) +EX( mov.l r9,@(20,r4) ) +EX( mov.l r10,@(24,r4) ) +EX( mov.l r11,@(28,r4) ) + bf/s 2b + add #32,r4 + +1: mov r6, r0 + shlr2 r0 + tst r0, r0 + bt .L_cleanup +1: +EX( mov.l @r5+,r1 ) + dt r0 +EX( mov.l r1,@r4 ) + bf/s 1b + add #4,r4 + + bra .L_cleanup + nop + +! Destination = 10 + +.L_dest10: + mov r2,r7 + shlr2 r7 + shlr r7 + tst r7,r7 + mov #7,r0 + bt/s 1f + and r0,r2 +2: + dt r7 +#ifdef CONFIG_CPU_LITTLE_ENDIAN +EX( mov.l @r5+,r0 ) +EX( mov.l @r5+,r1 ) +EX( mov.l @r5+,r8 ) +EX( mov.l @r5+,r9 ) +EX( mov.l @r5+,r10 ) +EX( mov.w r0,@r4 ) + add #2,r4 + xtrct r1,r0 + xtrct r8,r1 + xtrct r9,r8 + xtrct r10,r9 + +EX( mov.l r0,@r4 ) +EX( mov.l r1,@(4,r4) ) +EX( mov.l r8,@(8,r4) ) +EX( mov.l r9,@(12,r4) ) + +EX( mov.l @r5+,r1 ) +EX( mov.l @r5+,r8 ) +EX( mov.l @r5+,r0 ) + xtrct r1,r10 + xtrct r8,r1 + xtrct r0,r8 + shlr16 r0 +EX( mov.l r10,@(16,r4) ) +EX( mov.l r1,@(20,r4) ) +EX( mov.l r8,@(24,r4) ) +EX( mov.w r0,@(28,r4) ) + bf/s 2b + add #30,r4 +#else +EX( mov.l @(28,r5),r0 ) +EX( mov.l @(24,r5),r8 ) +EX( mov.l @(20,r5),r9 ) +EX( mov.l @(16,r5),r10 ) +EX( mov.w r0,@(30,r4) ) + add #-2,r4 + xtrct r8,r0 + xtrct r9,r8 + xtrct r10,r9 +EX( mov.l r0,@(28,r4) ) +EX( mov.l r8,@(24,r4) ) +EX( mov.l r9,@(20,r4) ) + +EX( mov.l @(12,r5),r0 ) +EX( mov.l @(8,r5),r8 ) + xtrct r0,r10 +EX( mov.l @(4,r5),r9 ) + mov.l r10,@(16,r4) +EX( mov.l @r5,r10 ) + xtrct r8,r0 + xtrct r9,r8 + xtrct r10,r9 +EX( mov.l r0,@(12,r4) ) +EX( mov.l r8,@(8,r4) ) + swap.w r10,r0 +EX( mov.l r9,@(4,r4) ) +EX( mov.w r0,@(2,r4) ) + + add #32,r5 + bf/s 2b + add #34,r4 +#endif + tst r2,r2 + bt .L_cleanup + +1: ! Read longword, write two words per iteration +EX( mov.l @r5+,r0 ) + dt r2 +#ifdef CONFIG_CPU_LITTLE_ENDIAN +EX( mov.w r0,@r4 ) + shlr16 r0 +EX( mov.w r0,@(2,r4) ) +#else +EX( mov.w r0,@(2,r4) ) + shlr16 r0 +EX( mov.w r0,@r4 ) +#endif + bf/s 1b + add #4,r4 + + bra .L_cleanup + nop + +! Destination = 01 or 11 + +.L_dest01: +.L_dest11: + ! Read longword, write byte, word, byte per iteration +EX( mov.l @r5+,r0 ) + dt r2 +#ifdef CONFIG_CPU_LITTLE_ENDIAN +EX( mov.b r0,@r4 ) + shlr8 r0 + add #1,r4 +EX( mov.w r0,@r4 ) + shlr16 r0 +EX( mov.b r0,@(2,r4) ) + bf/s .L_dest01 + add #3,r4 +#else +EX( mov.b r0,@(3,r4) ) + shlr8 r0 + swap.w r0,r7 +EX( mov.b r7,@r4 ) + add #1,r4 +EX( mov.w r0,@r4 ) + bf/s .L_dest01 + add #3,r4 +#endif + +! Cleanup last few bytes +.L_cleanup: + mov r6,r0 + and #3,r0 + tst r0,r0 + bt .L_exit + mov r0,r6 + +.L_cleanup_loop: +EX( mov.b @r5+,r0 ) + dt r6 +EX( mov.b r0,@r4 ) + bf/s .L_cleanup_loop + add #1,r4 + +.L_exit: + mov #0,r0 ! 
normal return + +5000: + +# Exception handler: +.section .fixup, "ax" +6000: + mov.l 8000f,r1 + mov r3,r0 + jmp @r1 + sub r4,r0 + .align 2 +8000: .long 5000b + +.previous + mov.l @r15+,r8 + mov.l @r15+,r9 + mov.l @r15+,r10 + rts + mov.l @r15+,r11 diff --git a/arch/sh/lib64/Makefile b/arch/sh/lib64/Makefile index 2f4086ac6f99..9950966923a0 100644 --- a/arch/sh/lib64/Makefile +++ b/arch/sh/lib64/Makefile @@ -11,5 +11,5 @@ # Panic should really be compiled as PIC lib-y := udelay.o c-checksum.o dbg.o panic.o memcpy.o copy_user_memcpy.o \ - page_copy.o page_clear.o + copy_page.o clear_page.o diff --git a/arch/sh/lib64/clear_page.S b/arch/sh/lib64/clear_page.S new file mode 100644 index 000000000000..007ab48ecc1c --- /dev/null +++ b/arch/sh/lib64/clear_page.S @@ -0,0 +1,54 @@ +/* + Copyright 2003 Richard Curnow, SuperH (UK) Ltd. + + This file is subject to the terms and conditions of the GNU General Public + License. See the file "COPYING" in the main directory of this archive + for more details. + + Tight version of memset for the case of just clearing a page. It turns out + that having the alloco's spaced out slightly due to the increment/branch + pair causes them to contend less for access to the cache. Similarly, + keeping the stores apart from the allocos causes less contention. => Do two + separate loops. Do multiple stores per loop to amortise the + increment/branch cost a little. + + Parameters: + r2 : source effective address (start of page) + + Always clears 4096 bytes. + + Note : alloco guarded by synco to avoid TAKum03020 erratum + +*/ + + .section .text..SHmedia32,"ax" + .little + + .balign 8 + .global clear_page +clear_page: + pta/l 1f, tr1 + pta/l 2f, tr2 + ptabs/l r18, tr0 + + movi 4096, r7 + add r2, r7, r7 + add r2, r63, r6 +1: + alloco r6, 0 + synco ! TAKum03020 + addi r6, 32, r6 + bgt/l r7, r6, tr1 + + add r2, r63, r6 +2: + st.q r6, 0, r63 + st.q r6, 8, r63 + st.q r6, 16, r63 + st.q r6, 24, r63 + addi r6, 32, r6 + bgt/l r7, r6, tr2 + + blink tr0, r63 + + diff --git a/arch/sh/lib64/copy_page.S b/arch/sh/lib64/copy_page.S new file mode 100644 index 000000000000..0ec6fca63b56 --- /dev/null +++ b/arch/sh/lib64/copy_page.S @@ -0,0 +1,89 @@ +/* + Copyright 2003 Richard Curnow, SuperH (UK) Ltd. + + This file is subject to the terms and conditions of the GNU General Public + License. See the file "COPYING" in the main directory of this archive + for more details. + + Tight version of mempy for the case of just copying a page. + Prefetch strategy empirically optimised against RTL simulations + of SH5-101 cut2 eval chip with Cayman board DDR memory. + + Parameters: + r2 : destination effective address (start of page) + r3 : source effective address (start of page) + + Always copies 4096 bytes. + + Points to review. + * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead. + It seems like the prefetch needs to be at at least 4 lines ahead to get + the data into the cache in time, and the allocos contend with outstanding + prefetches for the same cache set, so it's better to have the numbers + different. + */ + + .section .text..SHmedia32,"ax" + .little + + .balign 8 + .global copy_page +copy_page: + + /* Copy 4096 bytes worth of data from r3 to r2. + Do prefetches 4 lines ahead. + Do alloco 2 lines ahead */ + + pta 1f, tr1 + pta 2f, tr2 + pta 3f, tr3 + ptabs r18, tr0 + +#if 0 + /* TAKum03020 */ + ld.q r3, 0x00, r63 + ld.q r3, 0x20, r63 + ld.q r3, 0x40, r63 + ld.q r3, 0x60, r63 +#endif + alloco r2, 0x00 + synco ! TAKum03020 + alloco r2, 0x20 + synco ! 
TAKum03020 + + movi 3968, r6 + add r2, r6, r6 + addi r6, 64, r7 + addi r7, 64, r8 + sub r3, r2, r60 + addi r60, 8, r61 + addi r61, 8, r62 + addi r62, 8, r23 + addi r60, 0x80, r22 + +/* Minimal code size. The extra branches inside the loop don't cost much + because they overlap with the time spent waiting for prefetches to + complete. */ +1: +#if 0 + /* TAKum03020 */ + bge/u r2, r6, tr2 ! skip prefetch for last 4 lines + ldx.q r2, r22, r63 ! prefetch 4 lines hence +#endif +2: + bge/u r2, r7, tr3 ! skip alloco for last 2 lines + alloco r2, 0x40 ! alloc destination line 2 lines ahead + synco ! TAKum03020 +3: + ldx.q r2, r60, r36 + ldx.q r2, r61, r37 + ldx.q r2, r62, r38 + ldx.q r2, r23, r39 + st.q r2, 0, r36 + st.q r2, 8, r37 + st.q r2, 16, r38 + st.q r2, 24, r39 + addi r2, 32, r2 + bgt/l r8, r2, tr1 + + blink tr0, r63 ! return diff --git a/arch/sh/lib64/page_clear.S b/arch/sh/lib64/page_clear.S deleted file mode 100644 index 007ab48ecc1c..000000000000 --- a/arch/sh/lib64/page_clear.S +++ /dev/null @@ -1,54 +0,0 @@ -/* - Copyright 2003 Richard Curnow, SuperH (UK) Ltd. - - This file is subject to the terms and conditions of the GNU General Public - License. See the file "COPYING" in the main directory of this archive - for more details. - - Tight version of memset for the case of just clearing a page. It turns out - that having the alloco's spaced out slightly due to the increment/branch - pair causes them to contend less for access to the cache. Similarly, - keeping the stores apart from the allocos causes less contention. => Do two - separate loops. Do multiple stores per loop to amortise the - increment/branch cost a little. - - Parameters: - r2 : source effective address (start of page) - - Always clears 4096 bytes. - - Note : alloco guarded by synco to avoid TAKum03020 erratum - -*/ - - .section .text..SHmedia32,"ax" - .little - - .balign 8 - .global clear_page -clear_page: - pta/l 1f, tr1 - pta/l 2f, tr2 - ptabs/l r18, tr0 - - movi 4096, r7 - add r2, r7, r7 - add r2, r63, r6 -1: - alloco r6, 0 - synco ! TAKum03020 - addi r6, 32, r6 - bgt/l r7, r6, tr1 - - add r2, r63, r6 -2: - st.q r6, 0, r63 - st.q r6, 8, r63 - st.q r6, 16, r63 - st.q r6, 24, r63 - addi r6, 32, r6 - bgt/l r7, r6, tr2 - - blink tr0, r63 - - diff --git a/arch/sh/lib64/page_copy.S b/arch/sh/lib64/page_copy.S deleted file mode 100644 index 0ec6fca63b56..000000000000 --- a/arch/sh/lib64/page_copy.S +++ /dev/null @@ -1,89 +0,0 @@ -/* - Copyright 2003 Richard Curnow, SuperH (UK) Ltd. - - This file is subject to the terms and conditions of the GNU General Public - License. See the file "COPYING" in the main directory of this archive - for more details. - - Tight version of mempy for the case of just copying a page. - Prefetch strategy empirically optimised against RTL simulations - of SH5-101 cut2 eval chip with Cayman board DDR memory. - - Parameters: - r2 : destination effective address (start of page) - r3 : source effective address (start of page) - - Always copies 4096 bytes. - - Points to review. - * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead. - It seems like the prefetch needs to be at at least 4 lines ahead to get - the data into the cache in time, and the allocos contend with outstanding - prefetches for the same cache set, so it's better to have the numbers - different. - */ - - .section .text..SHmedia32,"ax" - .little - - .balign 8 - .global copy_page -copy_page: - - /* Copy 4096 bytes worth of data from r3 to r2. - Do prefetches 4 lines ahead. 
- Do alloco 2 lines ahead */ - - pta 1f, tr1 - pta 2f, tr2 - pta 3f, tr3 - ptabs r18, tr0 - -#if 0 - /* TAKum03020 */ - ld.q r3, 0x00, r63 - ld.q r3, 0x20, r63 - ld.q r3, 0x40, r63 - ld.q r3, 0x60, r63 -#endif - alloco r2, 0x00 - synco ! TAKum03020 - alloco r2, 0x20 - synco ! TAKum03020 - - movi 3968, r6 - add r2, r6, r6 - addi r6, 64, r7 - addi r7, 64, r8 - sub r3, r2, r60 - addi r60, 8, r61 - addi r61, 8, r62 - addi r62, 8, r23 - addi r60, 0x80, r22 - -/* Minimal code size. The extra branches inside the loop don't cost much - because they overlap with the time spent waiting for prefetches to - complete. */ -1: -#if 0 - /* TAKum03020 */ - bge/u r2, r6, tr2 ! skip prefetch for last 4 lines - ldx.q r2, r22, r63 ! prefetch 4 lines hence -#endif -2: - bge/u r2, r7, tr3 ! skip alloco for last 2 lines - alloco r2, 0x40 ! alloc destination line 2 lines ahead - synco ! TAKum03020 -3: - ldx.q r2, r60, r36 - ldx.q r2, r61, r37 - ldx.q r2, r62, r38 - ldx.q r2, r23, r39 - st.q r2, 0, r36 - st.q r2, 8, r37 - st.q r2, 16, r38 - st.q r2, 24, r39 - addi r2, 32, r2 - bgt/l r8, r2, tr1 - - blink tr0, r63 ! return diff --git a/arch/sh/mm/Makefile_32 b/arch/sh/mm/Makefile_32 index 095abd14592f..e295db60b91b 100644 --- a/arch/sh/mm/Makefile_32 +++ b/arch/sh/mm/Makefile_32 @@ -12,8 +12,7 @@ obj-$(CONFIG_SH7705_CACHE_32KB) += cache-sh7705.o endif mmu-y := tlb-nommu.o pg-nommu.o -mmu-$(CONFIG_MMU) := fault_32.o clear_page.o copy_page.o tlbflush_32.o \ - ioremap_32.o +mmu-$(CONFIG_MMU) := fault_32.o tlbflush_32.o ioremap_32.o obj-y += $(mmu-y) diff --git a/arch/sh/mm/clear_page.S b/arch/sh/mm/clear_page.S deleted file mode 100644 index 7a7c81ee3f01..000000000000 --- a/arch/sh/mm/clear_page.S +++ /dev/null @@ -1,152 +0,0 @@ -/* - * __clear_user_page, __clear_user, clear_page implementation of SuperH - * - * Copyright (C) 2001 Kaz Kojima - * Copyright (C) 2001, 2002 Niibe Yutaka - * Copyright (C) 2006 Paul Mundt - */ -#include -#include - -/* - * clear_page_slow - * @to: P1 address - * - * void clear_page_slow(void *to) - */ - -/* - * r0 --- scratch - * r4 --- to - * r5 --- to + PAGE_SIZE - */ -ENTRY(clear_page_slow) - mov r4,r5 - mov.l .Llimit,r0 - add r0,r5 - mov #0,r0 - ! -1: -#if defined(CONFIG_CPU_SH3) - mov.l r0,@r4 -#elif defined(CONFIG_CPU_SH4) - movca.l r0,@r4 - mov r4,r1 -#endif - add #32,r4 - mov.l r0,@-r4 - mov.l r0,@-r4 - mov.l r0,@-r4 - mov.l r0,@-r4 - mov.l r0,@-r4 - mov.l r0,@-r4 - mov.l r0,@-r4 -#if defined(CONFIG_CPU_SH4) - ocbwb @r1 -#endif - cmp/eq r5,r4 - bf/s 1b - add #28,r4 - ! - rts - nop -.Llimit: .long (PAGE_SIZE-28) - -ENTRY(__clear_user) - ! - mov #0, r0 - mov #0xe0, r1 ! 0xffffffe0 - ! - ! r4..(r4+31)&~32 -------- not aligned [ Area 0 ] - ! (r4+31)&~32..(r4+r5)&~32 -------- aligned [ Area 1 ] - ! (r4+r5)&~32..r4+r5 -------- not aligned [ Area 2 ] - ! - ! Clear area 0 - mov r4, r2 - ! - tst r1, r5 ! length < 32 - bt .Larea2 ! skip to remainder - ! - add #31, r2 - and r1, r2 - cmp/eq r4, r2 - bt .Larea1 - mov r2, r3 - sub r4, r3 - mov r3, r7 - mov r4, r2 - ! -.L0: dt r3 -0: mov.b r0, @r2 - bf/s .L0 - add #1, r2 - ! - sub r7, r5 - mov r2, r4 -.Larea1: - mov r4, r3 - add r5, r3 - and r1, r3 - cmp/hi r2, r3 - bf .Larea2 - ! - ! Clear area 1 -#if defined(CONFIG_CPU_SH4) -1: movca.l r0, @r2 -#else -1: mov.l r0, @r2 -#endif - add #4, r2 -2: mov.l r0, @r2 - add #4, r2 -3: mov.l r0, @r2 - add #4, r2 -4: mov.l r0, @r2 - add #4, r2 -5: mov.l r0, @r2 - add #4, r2 -6: mov.l r0, @r2 - add #4, r2 -7: mov.l r0, @r2 - add #4, r2 -8: mov.l r0, @r2 - add #4, r2 - cmp/hi r2, r3 - bt/s 1b - nop - ! - ! 
Clear area 2 -.Larea2: - mov r4, r3 - add r5, r3 - cmp/hs r3, r2 - bt/s .Ldone - sub r2, r3 -.L2: dt r3 -9: mov.b r0, @r2 - bf/s .L2 - add #1, r2 - ! -.Ldone: rts - mov #0, r0 ! return 0 as normal return - - ! return the number of bytes remained -.Lbad_clear_user: - mov r4, r0 - add r5, r0 - rts - sub r2, r0 - -.section __ex_table,"a" - .align 2 - .long 0b, .Lbad_clear_user - .long 1b, .Lbad_clear_user - .long 2b, .Lbad_clear_user - .long 3b, .Lbad_clear_user - .long 4b, .Lbad_clear_user - .long 5b, .Lbad_clear_user - .long 6b, .Lbad_clear_user - .long 7b, .Lbad_clear_user - .long 8b, .Lbad_clear_user - .long 9b, .Lbad_clear_user -.previous diff --git a/arch/sh/mm/copy_page.S b/arch/sh/mm/copy_page.S deleted file mode 100644 index b879545fa28b..000000000000 --- a/arch/sh/mm/copy_page.S +++ /dev/null @@ -1,388 +0,0 @@ -/* - * copy_page, __copy_user_page, __copy_user implementation of SuperH - * - * Copyright (C) 2001 Niibe Yutaka & Kaz Kojima - * Copyright (C) 2002 Toshinobu Sugioka - * Copyright (C) 2006 Paul Mundt - */ -#include -#include - -/* - * copy_page - * @to: P1 address - * @from: P1 address - * - * void copy_page(void *to, void *from) - */ - -/* - * r0, r1, r2, r3, r4, r5, r6, r7 --- scratch - * r8 --- from + PAGE_SIZE - * r9 --- not used - * r10 --- to - * r11 --- from - */ -ENTRY(copy_page) - mov.l r8,@-r15 - mov.l r10,@-r15 - mov.l r11,@-r15 - mov r4,r10 - mov r5,r11 - mov r5,r8 - mov.l .Lpsz,r0 - add r0,r8 - ! -1: mov.l @r11+,r0 - mov.l @r11+,r1 - mov.l @r11+,r2 - mov.l @r11+,r3 - mov.l @r11+,r4 - mov.l @r11+,r5 - mov.l @r11+,r6 - mov.l @r11+,r7 -#if defined(CONFIG_CPU_SH3) - mov.l r0,@r10 -#elif defined(CONFIG_CPU_SH4) - movca.l r0,@r10 - mov r10,r0 -#endif - add #32,r10 - mov.l r7,@-r10 - mov.l r6,@-r10 - mov.l r5,@-r10 - mov.l r4,@-r10 - mov.l r3,@-r10 - mov.l r2,@-r10 - mov.l r1,@-r10 -#if defined(CONFIG_CPU_SH4) - ocbwb @r0 -#endif - cmp/eq r11,r8 - bf/s 1b - add #28,r10 - ! - mov.l @r15+,r11 - mov.l @r15+,r10 - mov.l @r15+,r8 - rts - nop - - .align 2 -.Lpsz: .long PAGE_SIZE -/* - * __kernel_size_t __copy_user(void *to, const void *from, __kernel_size_t n); - * Return the number of bytes NOT copied - */ -#define EX(...) \ - 9999: __VA_ARGS__ ; \ - .section __ex_table, "a"; \ - .long 9999b, 6000f ; \ - .previous -ENTRY(__copy_user) - ! Check if small number of bytes - mov #11,r0 - mov r4,r3 - cmp/gt r0,r6 ! r6 (len) > r0 (11) - bf/s .L_cleanup_loop_no_pop - add r6,r3 ! last destination address - - ! Calculate bytes needed to align to src - mov.l r11,@-r15 - neg r5,r0 - mov.l r10,@-r15 - add #4,r0 - mov.l r9,@-r15 - and #3,r0 - mov.l r8,@-r15 - tst r0,r0 - bt 2f - -1: - ! Copy bytes to long word align src -EX( mov.b @r5+,r1 ) - dt r0 - add #-1,r6 -EX( mov.b r1,@r4 ) - bf/s 1b - add #1,r4 - - ! Jump to appropriate routine depending on dest -2: mov #3,r1 - mov r6, r2 - and r4,r1 - shlr2 r2 - shll2 r1 - mova .L_jump_tbl,r0 - mov.l @(r0,r1),r1 - jmp @r1 - nop - - .align 2 -.L_jump_tbl: - .long .L_dest00 - .long .L_dest01 - .long .L_dest10 - .long .L_dest11 - -/* - * Come here if there are less than 12 bytes to copy - * - * Keep the branch target close, so the bf/s callee doesn't overflow - * and result in a more expensive branch being inserted. This is the - * fast-path for small copies, the jump via the jump table will hit the - * default slow-path cleanup. -PFM. - */ -.L_cleanup_loop_no_pop: - tst r6,r6 ! Check explicitly for zero - bt 1f - -2: -EX( mov.b @r5+,r0 ) - dt r6 -EX( mov.b r0,@r4 ) - bf/s 2b - add #1,r4 - -1: mov #0,r0 ! 
normal return -5000: - -# Exception handler: -.section .fixup, "ax" -6000: - mov.l 8000f,r1 - mov r3,r0 - jmp @r1 - sub r4,r0 - .align 2 -8000: .long 5000b - -.previous - rts - nop - -! Destination = 00 - -.L_dest00: - ! Skip the large copy for small transfers - mov #(32+32-4), r0 - cmp/gt r6, r0 ! r0 (60) > r6 (len) - bt 1f - - ! Align dest to a 32 byte boundary - neg r4,r0 - add #0x20, r0 - and #0x1f, r0 - tst r0, r0 - bt 2f - - sub r0, r6 - shlr2 r0 -3: -EX( mov.l @r5+,r1 ) - dt r0 -EX( mov.l r1,@r4 ) - bf/s 3b - add #4,r4 - -2: -EX( mov.l @r5+,r0 ) -EX( mov.l @r5+,r1 ) -EX( mov.l @r5+,r2 ) -EX( mov.l @r5+,r7 ) -EX( mov.l @r5+,r8 ) -EX( mov.l @r5+,r9 ) -EX( mov.l @r5+,r10 ) -EX( mov.l @r5+,r11 ) -#ifdef CONFIG_CPU_SH4 -EX( movca.l r0,@r4 ) -#else -EX( mov.l r0,@r4 ) -#endif - add #-32, r6 -EX( mov.l r1,@(4,r4) ) - mov #32, r0 -EX( mov.l r2,@(8,r4) ) - cmp/gt r6, r0 ! r0 (32) > r6 (len) -EX( mov.l r7,@(12,r4) ) -EX( mov.l r8,@(16,r4) ) -EX( mov.l r9,@(20,r4) ) -EX( mov.l r10,@(24,r4) ) -EX( mov.l r11,@(28,r4) ) - bf/s 2b - add #32,r4 - -1: mov r6, r0 - shlr2 r0 - tst r0, r0 - bt .L_cleanup -1: -EX( mov.l @r5+,r1 ) - dt r0 -EX( mov.l r1,@r4 ) - bf/s 1b - add #4,r4 - - bra .L_cleanup - nop - -! Destination = 10 - -.L_dest10: - mov r2,r7 - shlr2 r7 - shlr r7 - tst r7,r7 - mov #7,r0 - bt/s 1f - and r0,r2 -2: - dt r7 -#ifdef CONFIG_CPU_LITTLE_ENDIAN -EX( mov.l @r5+,r0 ) -EX( mov.l @r5+,r1 ) -EX( mov.l @r5+,r8 ) -EX( mov.l @r5+,r9 ) -EX( mov.l @r5+,r10 ) -EX( mov.w r0,@r4 ) - add #2,r4 - xtrct r1,r0 - xtrct r8,r1 - xtrct r9,r8 - xtrct r10,r9 - -EX( mov.l r0,@r4 ) -EX( mov.l r1,@(4,r4) ) -EX( mov.l r8,@(8,r4) ) -EX( mov.l r9,@(12,r4) ) - -EX( mov.l @r5+,r1 ) -EX( mov.l @r5+,r8 ) -EX( mov.l @r5+,r0 ) - xtrct r1,r10 - xtrct r8,r1 - xtrct r0,r8 - shlr16 r0 -EX( mov.l r10,@(16,r4) ) -EX( mov.l r1,@(20,r4) ) -EX( mov.l r8,@(24,r4) ) -EX( mov.w r0,@(28,r4) ) - bf/s 2b - add #30,r4 -#else -EX( mov.l @(28,r5),r0 ) -EX( mov.l @(24,r5),r8 ) -EX( mov.l @(20,r5),r9 ) -EX( mov.l @(16,r5),r10 ) -EX( mov.w r0,@(30,r4) ) - add #-2,r4 - xtrct r8,r0 - xtrct r9,r8 - xtrct r10,r9 -EX( mov.l r0,@(28,r4) ) -EX( mov.l r8,@(24,r4) ) -EX( mov.l r9,@(20,r4) ) - -EX( mov.l @(12,r5),r0 ) -EX( mov.l @(8,r5),r8 ) - xtrct r0,r10 -EX( mov.l @(4,r5),r9 ) - mov.l r10,@(16,r4) -EX( mov.l @r5,r10 ) - xtrct r8,r0 - xtrct r9,r8 - xtrct r10,r9 -EX( mov.l r0,@(12,r4) ) -EX( mov.l r8,@(8,r4) ) - swap.w r10,r0 -EX( mov.l r9,@(4,r4) ) -EX( mov.w r0,@(2,r4) ) - - add #32,r5 - bf/s 2b - add #34,r4 -#endif - tst r2,r2 - bt .L_cleanup - -1: ! Read longword, write two words per iteration -EX( mov.l @r5+,r0 ) - dt r2 -#ifdef CONFIG_CPU_LITTLE_ENDIAN -EX( mov.w r0,@r4 ) - shlr16 r0 -EX( mov.w r0,@(2,r4) ) -#else -EX( mov.w r0,@(2,r4) ) - shlr16 r0 -EX( mov.w r0,@r4 ) -#endif - bf/s 1b - add #4,r4 - - bra .L_cleanup - nop - -! Destination = 01 or 11 - -.L_dest01: -.L_dest11: - ! Read longword, write byte, word, byte per iteration -EX( mov.l @r5+,r0 ) - dt r2 -#ifdef CONFIG_CPU_LITTLE_ENDIAN -EX( mov.b r0,@r4 ) - shlr8 r0 - add #1,r4 -EX( mov.w r0,@r4 ) - shlr16 r0 -EX( mov.b r0,@(2,r4) ) - bf/s .L_dest01 - add #3,r4 -#else -EX( mov.b r0,@(3,r4) ) - shlr8 r0 - swap.w r0,r7 -EX( mov.b r7,@r4 ) - add #1,r4 -EX( mov.w r0,@r4 ) - bf/s .L_dest01 - add #3,r4 -#endif - -! Cleanup last few bytes -.L_cleanup: - mov r6,r0 - and #3,r0 - tst r0,r0 - bt .L_exit - mov r0,r6 - -.L_cleanup_loop: -EX( mov.b @r5+,r0 ) - dt r6 -EX( mov.b r0,@r4 ) - bf/s .L_cleanup_loop - add #1,r4 - -.L_exit: - mov #0,r0 ! 
normal return
-
-5000:
-
-# Exception handler:
-.section .fixup, "ax"
-6000:
-	mov.l	8000f,r1
-	mov	r3,r0
-	jmp	@r1
-	 sub	r4,r0
-	.align	2
-8000:	.long	5000b
-
-.previous
-	mov.l	@r15+,r8
-	mov.l	@r15+,r9
-	mov.l	@r15+,r10
-	rts
-	mov.l	@r15+,r11
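
For readers following the move: the relocated routines keep their existing contracts. clear_page()/copy_page() always operate on one full page at a kernel-mapped (P1) address, while __clear_user()/__copy_user() may fault partway through a user access, in which case the __ex_table fixup stubs above compute and return the number of bytes left untouched (0 on success). Below is a minimal, illustrative C sketch of those contracts, assuming the default 4 KiB SH page size; the ref_* names and REF_PAGE_SIZE are placeholders introduced here, not kernel API, and the real implementations are the hand-tuned assembly files this patch moves.

/*
 * Illustrative reference only -- not part of the patch.  Fault-free
 * userland model of the contracts implemented by the .S files above.
 */
#include <stddef.h>
#include <string.h>

#define REF_PAGE_SIZE	4096	/* default PAGE_SIZE on sh (assumption) */

/* clear_page()/copy_page(): whole-page operations, no return value. */
static void ref_clear_page(void *to)
{
	memset(to, 0, REF_PAGE_SIZE);
}

static void ref_copy_page(void *to, const void *from)
{
	memcpy(to, from, REF_PAGE_SIZE);
}

/*
 * __clear_user()/__copy_user(): arbitrary length, user pointers.  On a
 * fault the assembly's fixup path returns (end - current), i.e. the
 * count of bytes NOT processed.  This fault-free sketch always
 * succeeds, so it always returns 0.
 */
static size_t ref_clear_user(void *to, size_t n)
{
	memset(to, 0, n);
	return 0;		/* bytes NOT cleared */
}

static size_t ref_copy_user(void *to, const void *from, size_t n)
{
	memcpy(to, from, n);
	return 0;		/* bytes NOT copied */
}

int main(void)
{
	static char src[REF_PAGE_SIZE], dst[REF_PAGE_SIZE];

	ref_clear_page(src);		/* zero one full page */
	ref_copy_page(dst, src);	/* duplicate it */

	/* user-space variants report bytes left undone; 0 == success */
	return (int)(ref_clear_user(src, sizeof(src)) +
		     ref_copy_user(dst, src, sizeof(dst)));
}

Keeping the "bytes remaining" return convention on the user-access variants is what lets callers such as copy_to_user() report short copies directly, without a separate error path.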