21 files changed, 0 insertions, 4052 deletions
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
deleted file mode 100644
index 081fcba01dc0..000000000000
--- a/arch/ia64/lib/Makefile
+++ /dev/null
@@ -1,48 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for ia64-specific library routines.
-# The __*si3/__*di3 objects are assembled from idiv32.S/idiv64.S (see rules below).
-
-lib-y := io.o __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o		\
-	__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o			\
-	checksum.o clear_page.o csum_partial_copy.o			\
-	clear_user.o strncpy_from_user.o strnlen_user.o			\
-	flush.o ip_fast_csum.o do_csum.o				\
-	memset.o strlen.o xor.o
-
-lib-$(CONFIG_ITANIUM)	+= copy_page.o copy_user.o memcpy.o
-lib-$(CONFIG_MCKINLEY)	+= copy_page_mck.o memcpy_mck.o
-
-AFLAGS___divdi3.o	=
-AFLAGS___udivdi3.o	= -DUNSIGNED
-AFLAGS___moddi3.o	= 	     -DMODULO
-AFLAGS___umoddi3.o	= -DUNSIGNED -DMODULO
-
-AFLAGS___divsi3.o	=
-AFLAGS___udivsi3.o	= -DUNSIGNED
-AFLAGS___modsi3.o	=	     -DMODULO
-AFLAGS___umodsi3.o	= -DUNSIGNED -DMODULO
-
-$(obj)/__divdi3.o: $(src)/idiv64.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__udivdi3.o: $(src)/idiv64.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__moddi3.o: $(src)/idiv64.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__umoddi3.o: $(src)/idiv64.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__divsi3.o: $(src)/idiv32.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__udivsi3.o: $(src)/idiv32.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__modsi3.o: $(src)/idiv32.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__umodsi3.o: $(src)/idiv32.S FORCE
-	$(call if_changed_rule,as_o_S)
diff --git a/arch/ia64/lib/checksum.c b/arch/ia64/lib/checksum.c
deleted file mode 100644
index d26517fe3500..000000000000
--- a/arch/ia64/lib/checksum.c
+++ /dev/null
@@ -1,102 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Network checksum routines
- *
- * Copyright (C) 1999, 2003 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *
- * Most of the code coming from arch/alpha/lib/checksum.c
- *
- * This file contains network checksum routines that are better done
- * in an architecture-specific manner, for speed.
- */
-
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <asm/byteorder.h>
-
-static inline unsigned short
-from64to16 (unsigned long x)
-{
-	/* add the two 32-bit halves: the sum fits in 33 bits */
-	x = (x & 0xffffffff) + (x >> 32);
-	/* add the low 16 bits to the (up to 17) bits above them */
-	x = (x & 0xffff) + (x >> 16);
-	/* add the low 16 bits to the few bits still left above them */
-	x = (x & 0xffff) + (x >> 16);
-	/* add in the last possible carry; the return truncates to 16 bits */
-	x = (x & 0xffff) + (x >> 16);
-	return x;
-}
-
-/*
- * Computes the checksum of the TCP/UDP pseudo-header and
- * returns it as a 16-bit value, already complemented.
- */
-__sum16
-csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
-		  __u8 proto, __wsum sum)
-{
-	return (__force __sum16)~from64to16(
-		(__force u64)saddr + (__force u64)daddr +
-		(__force u64)sum + ((len + proto) << 8));	/* NOTE(review): the shift happens at the width of len, before widening to u64 - confirm len range */
-}
-
-EXPORT_SYMBOL(csum_tcpudp_magic);
-
-__wsum	/* same sum as csum_tcpudp_magic, but folded only to 32 bits and not complemented */
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
-		   __u8 proto, __wsum sum)
-{
-	unsigned long result;
-
-	result = (__force u64)saddr + (__force u64)daddr +
-		 (__force u64)sum + ((len + proto) << 8);
-
-	/* Fold down to 32-bits so we don't lose in the typedef-less network stack.  */
-	/* 64 to 33: add the upper half into the lower half */
-	result = (result & 0xffffffff) + (result >> 32);
-	/* 33 to 32: add the single carry bit back in */
-	result = (result & 0xffffffff) + (result >> 32);
-	return (__force __wsum)result;
-}
-EXPORT_SYMBOL(csum_tcpudp_nofold);
-
-extern unsigned long do_csum (const unsigned char *, long);	/* not defined in this file */
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 32-bit boundary
- */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
-{
-	u64 result = do_csum(buff, len);
-
-	/* add in the caller's running sum; this may carry past bit 31 */
-	result += (__force u32)sum;
-	/* 32+c bits -> 32 bits: add the carry back into the low word */
-	result = (result & 0xffffffff) + (result >> 32);
-	return (__force __wsum)result;
-}
-
-EXPORT_SYMBOL(csum_partial);
-
-/*
- * this routine is used for miscellaneous IP-like checksums, mainly
- * in icmp.c
- * NOTE(review): do_csum()'s result is complemented without folding here - confirm it is already 16-bit */
-__sum16 ip_compute_csum (const void *buff, int len)
-{
-	return (__force __sum16)~do_csum(buff,len);
-}
-
-EXPORT_SYMBOL(ip_compute_csum);
diff --git a/arch/ia64/lib/clear_page.S b/arch/ia64/lib/clear_page.S
deleted file mode 100644
index 65b75085c8f4..000000000000
--- a/arch/ia64/lib/clear_page.S
+++ /dev/null
@@ -1,79 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 1999-2002 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
- *
- * 1/06/01 davidm	Tuned for Itanium.
- * 2/12/02 kchen	Tuned for both Itanium and McKinley
- * 3/08/02 davidm	Some more tweaking
- */
-
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-#include <asm/export.h>
-
-#ifdef CONFIG_ITANIUM
-# define L3_LINE_SIZE	64	// Itanium L3 line size
-# define PREFETCH_LINES	9	// lines touched ahead by .fetch (magic number)
-#else
-# define L3_LINE_SIZE	128	// McKinley L3 line size
-# define PREFETCH_LINES	12	// lines touched ahead by .fetch (magic number)
-#endif
-
-#define saved_lc	r2
-#define dst_fetch	r3	// run-ahead pointer, advances one L3 line per store
-#define dst1		r8	// in0 + 16
-#define dst2		r9	// in0 + 32
-#define dst3		r10	// in0 + 48
-#define dst4		r11	// in0 + 64 (only stored through in the McKinley path)
-
-#define dst_last	r31	// in0 + PAGE_SIZE: limit for dst_fetch
-
-GLOBAL_ENTRY(clear_page)
-	.prologue
-	.regstk 1,0,0,0
-	mov r16 = PAGE_SIZE/L3_LINE_SIZE-1	// main loop count, -1=repeat/until
-	.save ar.lc, saved_lc
-	mov saved_lc = ar.lc
-
-	.body
-	mov ar.lc = (PREFETCH_LINES - 1)	// .fetch runs PREFETCH_LINES times
-	mov dst_fetch = in0
-	adds dst1 = 16, in0
-	adds dst2 = 32, in0
-	;;
-.fetch:	stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE	// store f0 at the start of each leading line
-	adds dst3 = 48, in0		// executing this multiple times is harmless
-	br.cloop.sptk.few .fetch
-	;;
-	addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch	// = in0 + PAGE_SIZE
-	mov ar.lc = r16			// one L3 line per iteration
-	adds dst4 = 64, in0
-	;;
-#ifdef CONFIG_ITANIUM
-	// Optimized for Itanium
-1:	stf.spill.nta [dst1] = f0, 64
-	stf.spill.nta [dst2] = f0, 64
-	cmp.lt p8,p0=dst_fetch, dst_last	// p8: run-ahead pointer still inside the page
-	;;
-#else
-	// Optimized for McKinley
-1:	stf.spill.nta [dst1] = f0, 64
-	stf.spill.nta [dst2] = f0, 64
-	stf.spill.nta [dst3] = f0, 64
-	stf.spill.nta [dst4] = f0, 128
-	cmp.lt p8,p0=dst_fetch, dst_last	// p8: run-ahead pointer still inside the page
-	;;
-	stf.spill.nta [dst1] = f0, 64
-	stf.spill.nta [dst2] = f0, 64
-#endif
-	stf.spill.nta [dst3] = f0, 64
-(p8)	stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE	// keep touching lines ahead while p8 holds
-	br.cloop.sptk.few 1b
-	;;
-	mov ar.lc = saved_lc		// restore lc
-	br.ret.sptk.many rp
-END(clear_page)
-EXPORT_SYMBOL(clear_page)
diff --git a/arch/ia64/lib/clear_user.S b/arch/ia64/lib/clear_user.S
deleted file mode 100644
index a28f39d349eb..000000000000
--- a/arch/ia64/lib/clear_user.S
+++ /dev/null
@@ -1,212 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This routine clears to zero a linear memory buffer in user space.
- *
- * Inputs:
- *	in0:	address of buffer
- *	in1:	length of buffer in bytes
- * Outputs:
- *	r8:	number of bytes that didn't get cleared due to a fault
- *
- * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- */
-
-#include <asm/asmmacro.h>
-#include <asm/export.h>
-
-//
-// arguments
-//
-#define buf		r32
-#define len		r33
-
-//
-// local registers
-//
-#define cnt		r16
-#define buf2		r17
-#define saved_lc	r18
-#define saved_pfs	r19
-#define tmp		r20
-#define len2		r21
-#define len3		r22
-
-//
-// Theory of operations:
-//	- we check whether or not the buffer is small, i.e., less than 17
-//	  in which case we do the byte by byte loop.
-//
-//	- Otherwise we go progressively from 1 byte store to 8byte store in
-//	  the head part, the body is a 16byte store loop and we finish with
-//	  the tail for the last (up to) 15 bytes.
-//	  The good point about this breakdown is that the long buffer handling
-//	  contains only 2 branches.
-//
-//	The reason for not using shifting & masking for both the head and the
-//	tail is to stay semantically correct. This routine is not supposed
-//	to write bytes outside of the buffer. While most of the time this would
-//	be ok, we can't tolerate a mistake. A classical example is the case
-//	of multithreaded code where the extra bytes touched are actually owned
-//	by another thread which runs concurrently to ours. Another, less likely,
-//	example is with device drivers where reading an I/O mapped location may
-//	have side effects (same thing for writing).
-//
-
-GLOBAL_ENTRY(__do_clear_user)	// in0=buf, in1=len; r8 = bytes NOT cleared (0 on success)
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc	saved_pfs=ar.pfs,2,0,0,0
-	cmp.eq p6,p0=r0,len		// check for zero length
-	.save ar.lc, saved_lc
-	mov saved_lc=ar.lc		// preserve ar.lc (slow)
-	.body
-	;;				// avoid WAW on CFM
-	adds tmp=-1,len			// br.ctop is repeat/until
-	mov ret0=len			// return value is length at this point
-(p6)	br.ret.spnt.many rp
-	;;
-	cmp.lt p6,p0=16,len		// if len > 16 then long memset
-	mov ar.lc=tmp			// initialize lc for small count (also executed on the long path)
-(p6)	br.cond.dptk .long_do_clear
-	;;				// WAR on ar.lc
-	//
-	// worst case 16 iterations, avg 8 iterations
-	//
-	// We could have played with the predicates to use the extra
-	// M slot for 2 stores/iteration but the cost the initialization
-	// the various counters compared to how long the loop is supposed
-	// to last on average does not make this solution viable.
-	//
-1:
-	EX( .Lexit1, st1 [buf]=r0,1 )
-	adds len=-1,len			// countdown length using len
-	br.cloop.dptk 1b
-	;;				// avoid RAW on ar.lc
-	//
-	// .Lexit1: comes from byte by byte loop
-	//	    len contains bytes left
-.Lexit1:
-	mov ret0=len			// faster than using ar.lc
-	mov ar.lc=saved_lc
-	br.ret.sptk.many rp		// end of short clear_user
-
-
-	//
-	// At this point we know we have more than 16 bytes to clear
-	// so we focus on alignment (no branches required)
-	//
-	// The use of len/len2 for countdown of the number of bytes left
-	// instead of ret0 is due to the fact that the exception code
-	// changes the values of r8.
-	//
-.long_do_clear:
-	tbit.nz p6,p0=buf,0		// odd alignment (for long_do_clear)
-	;;
-	EX( .Lexit3, (p6) st1 [buf]=r0,1 )	// 1-byte aligned
-(p6)	adds len=-1,len;;		// sync because buf is modified
-	tbit.nz p6,p0=buf,1
-	;;
-	EX( .Lexit3, (p6) st2 [buf]=r0,2 )	// 2-byte aligned
-(p6)	adds len=-2,len;;
-	tbit.nz p6,p0=buf,2
-	;;
-	EX( .Lexit3, (p6) st4 [buf]=r0,4 )	// 4-byte aligned
-(p6)	adds len=-4,len;;
-	tbit.nz p6,p0=buf,3
-	;;
-	EX( .Lexit3, (p6) st8 [buf]=r0,8 )	// 8-byte aligned
-(p6)	adds len=-8,len;;
-	shr.u cnt=len,4		// number of 128-bit (2x64bit) words
-	;;
-	cmp.eq p6,p0=r0,cnt
-	adds tmp=-1,cnt
-(p6)	br.cond.dpnt .dotail		// we have less than 16 bytes left
-	;;
-	adds buf2=8,buf			// setup second base pointer
-	mov ar.lc=tmp
-	;;
-
-	//
-	// 16bytes/iteration core loop
-	//
-	// The second store can never generate a fault because
-	// we come into the loop only when we are 16-byte aligned.
-	// This means that if we cross a page then it will always be
-	// in the first store and never in the second.
-	//
-	//
-	// We need to keep track of the remaining length. A possible (optimistic)
-	// way would be to use ar.lc and derive how many byte were left by
-	// doing : left= 16*ar.lc + 16.  this would avoid the addition at
-	// every iteration.
-	// However we need to keep the synchronization point. A template
-	// M;;MB does not exist and thus we can keep the addition at no
-	// extra cycle cost (use a nop slot anyway). It also simplifies the
-	// (unlikely)  error recovery code
-	//
-
-2:	EX(.Lexit3, st8 [buf]=r0,16 )
-	;;				// needed to get len correct when error
-	st8 [buf2]=r0,16
-	adds len=-16,len
-	br.cloop.dptk 2b
-	;;
-	//
-	// tail correction based on len only
-	//
-	// We alternate the use of len3,len2 to allow parallelism and correct
-	// error handling. We also reuse p6/p7 to return correct value.
-	// The addition of len2/len3 does not cost anything more compared to
-	// the regular memset as we had empty slots.
-	//
-.dotail:
-	mov ar.lc=saved_lc		// restore here: .dotail is also entered directly when cnt == 0
-	mov len2=len			// for parallelization of error handling
-	mov len3=len
-	tbit.nz p6,p0=len,3
-	;;
-	EX( .Lexit2, (p6) st8 [buf]=r0,8 )	// at least 8 bytes
-(p6)	adds len3=-8,len2
-	tbit.nz p7,p6=len,2
-	;;
-	EX( .Lexit2, (p7) st4 [buf]=r0,4 )	// at least 4 bytes
-(p7)	adds len2=-4,len3
-	tbit.nz p6,p7=len,1
-	;;
-	EX( .Lexit2, (p6) st2 [buf]=r0,2 )	// at least 2 bytes
-(p6)	adds len3=-2,len2
-	tbit.nz p7,p6=len,0
-	;;
-	EX( .Lexit2, (p7) st1 [buf]=r0 )	// only 1 byte left
-	mov ret0=r0				// success
-	br.ret.sptk.many rp			// end of most likely path
-
-	//
-	// Outlined error handling code
-	//
-
-	//
-	// .Lexit3: comes from head or core loop, restores lc
-	//	    len contains bytes left
-	//
-	//
-	// .Lexit2:
-	//	if p6 -> coming from st8 or st2 : len2 contains what's left
-	//	if p7 -> coming from st4 or st1 : len3 contains what's left
-	// We must restore lc even though it might not have been used.
-.Lexit2:
-	.pred.rel "mutex", p6, p7
-(p6)	mov len=len2
-(p7)	mov len=len3
-	;;
-	//
-	// .Lexit3: comes from head or core loop, or falls through from .Lexit2
-	//	    len contains bytes left
-	//
-.Lexit3:
-	mov ret0=len
-	mov ar.lc=saved_lc
-	br.ret.sptk.many rp
-END(__do_clear_user)
-EXPORT_SYMBOL(__do_clear_user)
diff --git a/arch/ia64/lib/copy_page.S b/arch/ia64/lib/copy_page.S
deleted file mode 100644
index 176f857c522e..000000000000
--- a/arch/ia64/lib/copy_page.S
+++ /dev/null
@@ -1,101 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard copy_page() function
- *
- * Inputs:
- *	in0:	address of target page
- *	in1:	address of source page
- * Output:
- *	no return value
- *
- * Copyright (C) 1999, 2001 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *	David Mosberger <davidm@hpl.hp.com>
- *
- * 4/06/01 davidm	Tuned to make it perform well both for cached and uncached copies.
- */
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-#include <asm/export.h>
-
-#define PIPE_DEPTH	3		// software-pipeline stages (loaded into ar.ec)
-#define EPI		p[PIPE_DEPTH-1]	// predicate of the last stage: gates the stores
-
-#define lcount		r16
-#define saved_pr	r17
-#define saved_lc	r18
-#define saved_pfs	r19
-#define src1		r20
-#define src2		r21
-#define tgt1		r22
-#define tgt2		r23
-#define srcf		r24
-#define tgtf		r25
-#define tgt_last	r26
-
-#define Nrot		((8*PIPE_DEPTH+7)&~7)	// 8 rotating regs per stage, rounded up to a multiple of 8
-
-GLOBAL_ENTRY(copy_page)
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
-
-	.rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \
-	      t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH]
-	.rotp p[PIPE_DEPTH]
-
-	.save ar.lc, saved_lc
-	mov saved_lc=ar.lc
-	mov ar.ec=PIPE_DEPTH
-
-	mov lcount=PAGE_SIZE/64-1	// 64 bytes per iteration, -1 because br.ctop is repeat/until
-	.save pr, saved_pr
-	mov saved_pr=pr
-	mov pr.rot=1<<16		// p16=true, all other rotating predicates false
-
-	.body
-
-	mov src1=in1
-	adds src2=8,in1			// second load stream, 8 bytes after src1
-	mov tgt_last = PAGE_SIZE
-	;;
-	adds tgt2=8,in0			// second store stream, 8 bytes after tgt1
-	add srcf=512,in1		// lfetch pointers start 512 bytes ahead
-	mov ar.lc=lcount
-	mov tgt1=in0
-	add tgtf=512,in0
-	add tgt_last = tgt_last, in0	// tgt_last = in0 + PAGE_SIZE
-	;;
-1:
-(p[0])	ld8 t1[0]=[src1],16
-(EPI)	st8 [tgt1]=t1[PIPE_DEPTH-1],16
-(p[0])	ld8 t2[0]=[src2],16
-(EPI)	st8 [tgt2]=t2[PIPE_DEPTH-1],16
-	cmp.ltu p6,p0 = tgtf, tgt_last	// p6: prefetch pointer still inside the target page
-	;;
-(p[0])	ld8 t3[0]=[src1],16
-(EPI)	st8 [tgt1]=t3[PIPE_DEPTH-1],16
-(p[0])	ld8 t4[0]=[src2],16
-(EPI)	st8 [tgt2]=t4[PIPE_DEPTH-1],16
-	;;
-(p[0])	ld8 t5[0]=[src1],16
-(EPI)	st8 [tgt1]=t5[PIPE_DEPTH-1],16
-(p[0])	ld8 t6[0]=[src2],16
-(EPI)	st8 [tgt2]=t6[PIPE_DEPTH-1],16
-	;;
-(p[0])	ld8 t7[0]=[src1],16
-(EPI)	st8 [tgt1]=t7[PIPE_DEPTH-1],16
-(p[0])	ld8 t8[0]=[src2],16
-(EPI)	st8 [tgt2]=t8[PIPE_DEPTH-1],16
-
-(p6)	lfetch [srcf], 64		// prefetch source and target 64 bytes further each iteration
-(p6)	lfetch [tgtf], 64
-	br.ctop.sptk.few 1b
-	;;
-	mov pr=saved_pr,0xffffffffffff0000	// restore predicates
-	mov ar.pfs=saved_pfs
-	mov ar.lc=saved_lc
-	br.ret.sptk.many rp
-END(copy_page)
-EXPORT_SYMBOL(copy_page)
diff --git a/arch/ia64/lib/copy_page_mck.S b/arch/ia64/lib/copy_page_mck.S
deleted file mode 100644
index d6fd56e4f1c1..000000000000
--- a/arch/ia64/lib/copy_page_mck.S
+++ /dev/null
@@ -1,188 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * McKinley-optimized version of copy_page().
- *
- * Copyright (C) 2002 Hewlett-Packard Co
- *	David Mosberger <davidm@hpl.hp.com>
- *
- * Inputs:
- *	in0:	address of target page
- *	in1:	address of source page
- * Output:
- *	no return value
- *
- * General idea:
- *	- use regular loads and stores to prefetch data to avoid consuming M-slot just for
- *	  lfetches => good for in-cache performance
- *	- avoid l2 bank-conflicts by not storing into the same 16-byte bank within a single
- *	  cycle
- *
- * Principle of operation:
- *	First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes.
- *	To avoid secondary misses in L2, we prefetch both source and destination with a line-size
- *	of 128 bytes.  When both of these lines are in the L2 and the first half of the
- *	source line is in L1, we start copying the remaining words.  The second half of the
- *	source line is prefetched in an earlier iteration, so that by the time we start
- *	accessing it, it's also present in the L1.
- *
- *	We use a software-pipelined loop to control the overall operation.  The pipeline
- *	has 2*PREFETCH_DIST+K stages.  The first PREFETCH_DIST stages are used for prefetching
- *	source cache-lines.  The second PREFETCH_DIST stages are used for prefetching destination
- *	cache-lines, the last K stages are used to copy the cache-line words not copied by
- *	the prefetches.  The four relevant points in the pipelined are called A, B, C, D:
- *	p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line
- *	should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought
- *	into L1D and p[D] is TRUE if a cacheline needs to be copied.
- *
- *	This all sounds very complicated, but thanks to the modulo-scheduled loop support,
- *	the resulting code is very regular and quite easy to follow (once you get the idea).
- *
- *	As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented
- *	as the separate .prefetch_loop.  Logically, this loop performs exactly like the
- *	main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed,
- *	so that each loop iteration is faster (again, good for cached case).
- *
- *	When reading the code, it helps to keep the following picture in mind:
- *
- *	       word 0 word 1
- *            +------+------+---
- *	      |	v[x] | 	t1  | ^
- *	      |	t2   |	t3  | |
- *	      |	t4   |	t5  | |
- *	      |	t6   |	t7  | | 128 bytes
- *     	      |	n[y] | 	t9  | |	(L2 cache line)
- *	      |	t10  | 	t11 | |
- *	      |	t12  | 	t13 | |
- *	      |	t14  | 	t15 | v
- *	      +------+------+---
- *
- *	Here, v[x] is copied by the (memory) prefetch.  n[y] is loaded at p[C]
- *	to fetch the second-half of the L2 cache line into L1, and the tX words are copied in
- *	an order that avoids bank conflicts.
- */
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-#include <asm/export.h>
-
-#define PREFETCH_DIST	8		// McKinley sustains 16 outstanding L2 misses (8 ld, 8 st)
-
-#define src0		r2
-#define src1		r3
-#define dst0		r9
-#define dst1		r10
-#define src_pre_mem	r11
-#define dst_pre_mem	r14
-#define src_pre_l2	r15
-#define dst_pre_l2	r16
-#define t1		r17
-#define t2		r18
-#define t3		r19
-#define t4		r20
-#define t5		t1	// alias!
-#define t6		t2	// alias!
-#define t7		t3	// alias!
-#define t9		t5	// alias!
-#define t10		t4	// alias!
-#define t11		t7	// alias!
-#define t12		t6	// alias!
-#define t14		t10	// alias!
-#define t13		r21
-#define t15		r22
-
-#define saved_lc	r23
-#define saved_pr	r24
-
-#define	A	0
-#define B	(PREFETCH_DIST)
-#define C	(B + PREFETCH_DIST)
-#define D	(C + 3)
-#define N	(D + 1)			// number of pipeline stages (loaded into ar.ec)
-#define Nrot	((N + 7) & ~7)		// N rounded up to a multiple of 8
-
-GLOBAL_ENTRY(copy_page)
-	.prologue
-	alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot
-
-	.rotr v[2*PREFETCH_DIST], n[D-C+1]
-	.rotp p[N]
-
-	.save ar.lc, saved_lc
-	mov saved_lc = ar.lc
-	.save pr, saved_pr
-	mov saved_pr = pr
-	.body
-
-	mov src_pre_mem = in1
-	mov pr.rot = 0x10000			// p16 = 1, other rotating predicates = 0
-	mov ar.ec = 1				// special unrolled loop
-
-	mov dst_pre_mem = in0
-	mov ar.lc = 2*PREFETCH_DIST - 1		// .prefetch_loop runs 2*PREFETCH_DIST times
-
-	add src_pre_l2 = 8*8, in1
-	add dst_pre_l2 = 8*8, in0
-	add src0 = 8, in1			// first t1 src
-	add src1 = 3*8, in1			// first t3 src
-	add dst0 = 8, in0			// first t1 dst
-	add dst1 = 3*8, in0			// first t3 dst
-	mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1	// .line_copy loop count (loaded into ar.lc below)
-	nop.m 0
-	nop.i 0
-	;;
-	// same as .line_copy loop, but with all predicated-off instructions removed:
-.prefetch_loop:
-(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0
-(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2
-	br.ctop.sptk .prefetch_loop
-	;;
-	cmp.eq p16, p0 = r0, r0			// reset p16 to 1 (br.ctop cleared it to zero)
-	mov ar.lc = t1				// with 64KB pages, t1 is too big to fit in 8 bits!
-	mov ar.ec = N				// # of stages in pipeline
-	;;
-.line_copy:
-(p[D])	ld8 t2 = [src0], 3*8			// M0
-(p[D])	ld8 t4 = [src1], 3*8			// M1
-(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2 prefetch dst from memory
-(p[D])	st8 [dst_pre_l2] = n[D-C], 128		// M3 prefetch dst from L2
-	;;
-(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0 prefetch src from memory
-(p[C])	ld8 n[0] = [src_pre_l2], 128		// M1 prefetch src from L2
-(p[D])	st8 [dst0] =  t1, 8			// M2
-(p[D])	st8 [dst1] =  t3, 8			// M3
-	;;
-(p[D])	ld8  t5 = [src0], 8
-(p[D])	ld8  t7 = [src1], 3*8
-(p[D])	st8 [dst0] =  t2, 3*8
-(p[D])	st8 [dst1] =  t4, 3*8
-	;;
-(p[D])	ld8  t6 = [src0], 3*8
-(p[D])	ld8 t10 = [src1], 8
-(p[D])	st8 [dst0] =  t5, 8
-(p[D])	st8 [dst1] =  t7, 3*8
-	;;
-(p[D])	ld8  t9 = [src0], 3*8
-(p[D])	ld8 t11 = [src1], 3*8
-(p[D])	st8 [dst0] =  t6, 3*8
-(p[D])	st8 [dst1] = t10, 8
-	;;
-(p[D])	ld8 t12 = [src0], 8
-(p[D])	ld8 t14 = [src1], 8
-(p[D])	st8 [dst0] =  t9, 3*8
-(p[D])	st8 [dst1] = t11, 3*8
-	;;
-(p[D])	ld8 t13 = [src0], 4*8
-(p[D])	ld8 t15 = [src1], 4*8
-(p[D])	st8 [dst0] = t12, 8
-(p[D])	st8 [dst1] = t14, 8
-	;;
-(p[D-1])ld8  t1 = [src0], 8			// t1/t3 for the next line are loaded one stage early
-(p[D-1])ld8  t3 = [src1], 8
-(p[D])	st8 [dst0] = t13, 4*8
-(p[D])	st8 [dst1] = t15, 4*8
-	br.ctop.sptk .line_copy
-	;;
-	mov ar.lc = saved_lc
-	mov pr = saved_pr, -1			// mask -1: restore all predicates
-	br.ret.sptk.many rp
-END(copy_page)
-EXPORT_SYMBOL(copy_page)
diff --git a/arch/ia64/lib/copy_user.S b/arch/ia64/lib/copy_user.S
deleted file mode 100644
index f681556c6b86..000000000000
--- a/arch/ia64/lib/copy_user.S
+++ /dev/null
@@ -1,613 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the copy_user() routine.
- * It is used to copy data across the kernel/user boundary.
- *
- * The source and destination are always on opposite side of
- * the boundary. When reading from user space we must catch
- * faults on loads. When writing to user space we must catch
- * errors on stores. Note that because of the nature of the copy
- * we don't need to worry about overlapping regions.
- *
- *
- * Inputs:
- *	in0	address of source buffer
- *	in1	address of destination buffer
- *	in2	number of bytes to copy
- *
- * Outputs:
- *	ret0	0 in case of success. The number of bytes NOT copied in
- *		case of error.
- *
- * Copyright (C) 2000-2001 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *
- * Fixme:
- *	- handle the case where we have more than 16 bytes and the alignment
- *	  are different.
- *	- more benchmarking
- *	- fix extraneous stop bit introduced by the EX() macro.
- */
-
-#include <asm/asmmacro.h>
-#include <asm/export.h>
-
-//
-// Tuneable parameters
-//
-#define COPY_BREAK	16	// we do byte copy below (must be >=16)
-#define PIPE_DEPTH	21	// pipe depth
-
-#define EPI		p[PIPE_DEPTH-1]
-
-//
-// arguments
-//
-#define dst		in0
-#define src		in1
-#define len		in2
-
-//
-// local registers
-//
-#define t1		r2	// rshift in bytes
-#define t2		r3	// lshift in bytes
-#define rshift		r14	// right shift in bits
-#define lshift		r15	// left shift in bits
-#define word1		r16
-#define word2		r17
-#define cnt		r18
-#define len2		r19
-#define saved_lc	r20
-#define saved_pr	r21
-#define tmp		r22
-#define val		r23
-#define src1		r24
-#define dst1		r25
-#define src2		r26
-#define dst2		r27
-#define len1		r28
-#define enddst		r29
-#define endsrc		r30
-#define saved_pfs	r31
-
-GLOBAL_ENTRY(__copy_user)
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
-
-	.rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
-	.rotp p[PIPE_DEPTH]
-
-	adds len2=-1,len	// br.ctop is repeat/until
-	mov ret0=r0
-
-	;;			// RAW of cfm when len=0
-	cmp.eq p8,p0=r0,len	// check for zero length
-	.save ar.lc, saved_lc
-	mov saved_lc=ar.lc	// preserve ar.lc (slow)
-(p8)	br.ret.spnt.many rp	// empty mempcy()
-	;;
-	add enddst=dst,len	// first byte after end of source
-	add endsrc=src,len	// first byte after end of destination
-	.save pr, saved_pr
-	mov saved_pr=pr		// preserve predicates
-
-	.body
-
-	mov dst1=dst		// copy because of rotation
-	mov ar.ec=PIPE_DEPTH
-	mov pr.rot=1<<16	// p16=true all others are false
-
-	mov src1=src		// copy because of rotation
-	mov ar.lc=len2		// initialize lc for small count
-	cmp.lt p10,p7=COPY_BREAK,len	// if len > COPY_BREAK then long copy
-
-	xor tmp=src,dst		// same alignment test prepare
-(p10)	br.cond.dptk .long_copy_user
-	;;			// RAW pr.rot/p16 ?
-	//
-	// Now we do the byte by byte loop with software pipeline
-	//
-	// p7 is necessarily false by now
-1:
-	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
-	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
-	br.ctop.dptk.few 1b
-	;;
-	mov ar.lc=saved_lc
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.pfs=saved_pfs		// restore ar.ec
-	br.ret.sptk.many rp		// end of short memcpy
-
-	//
-	// Not 8-byte aligned
-	//
-.diff_align_copy_user:
-	// At this point we know we have more than 16 bytes to copy
-	// and also that src and dest do _not_ have the same alignment.
-	and src2=0x7,src1				// src offset
-	and dst2=0x7,dst1				// dst offset
-	;;
-	// The basic idea is that we copy byte-by-byte at the head so
-	// that we can reach 8-byte alignment for both src1 and dst1.
-	// Then copy the body using software pipelined 8-byte copy,
-	// shifting the two back-to-back words right and left, then copy
-	// the tail by copying byte-by-byte.
-	//
-	// Fault handling. If the byte-by-byte at the head fails on the
-	// load, then restart and finish the pipleline by copying zeros
-	// to the dst1. Then copy zeros for the rest of dst1.
-	// If 8-byte software pipeline fails on the load, do the same as
-	// failure_in3 does. If the byte-by-byte at the tail fails, it is
-	// handled simply by failure_in_pipe1.
-	//
-	// The case p14 represents the source has more bytes in the
-	// the first word (by the shifted part), whereas the p15 needs to
-	// copy some bytes from the 2nd word of the source that has the
-	// tail of the 1st of the destination.
-	//
-
-	//
-	// Optimization. If dst1 is 8-byte aligned (quite common), we don't need
-	// to copy the head to dst1, to start 8-byte copy software pipeline.
-	// We know src1 is not 8-byte aligned in this case.
-	//
-	cmp.eq p14,p15=r0,dst2
-(p15)	br.cond.spnt 1f
-	;;
-	sub t1=8,src2
-	mov t2=src2
-	;;
-	shl rshift=t2,3
-	sub len1=len,t1					// set len1
-	;;
-	sub lshift=64,rshift
-	;;
-	br.cond.spnt .word_copy_user
-	;;
-1:
-	cmp.leu	p14,p15=src2,dst2
-	sub t1=dst2,src2
-	;;
-	.pred.rel "mutex", p14, p15
-(p14)	sub word1=8,src2				// (8 - src offset)
-(p15)	sub t1=r0,t1					// absolute value
-(p15)	sub word1=8,dst2				// (8 - dst offset)
-	;;
-	// For the case p14, we don't need to copy the shifted part to
-	// the 1st word of destination.
-	sub t2=8,t1
-(p14)	sub word1=word1,t1
-	;;
-	sub len1=len,word1				// resulting len
-(p15)	shl rshift=t1,3					// in bits
-(p14)	shl rshift=t2,3
-	;;
-(p14)	sub len1=len1,t1
-	adds cnt=-1,word1
-	;;
-	sub lshift=64,rshift
-	mov ar.ec=PIPE_DEPTH
-	mov pr.rot=1<<16	// p16=true all others are false
-	mov ar.lc=cnt
-	;;
-2:
-	EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
-	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
-	br.ctop.dptk.few 2b
-	;;
-	clrrrb
-	;;
-.word_copy_user:
-	cmp.gtu p9,p0=16,len1
-(p9)	br.cond.spnt 4f			// if (16 > len1) skip 8-byte copy
-	;;
-	shr.u cnt=len1,3		// number of 64-bit words
-	;;
-	adds cnt=-1,cnt
-	;;
-	.pred.rel "mutex", p14, p15
-(p14)	sub src1=src1,t2
-(p15)	sub src1=src1,t1
-	//
-	// Now both src1 and dst1 point to an 8-byte aligned address. And
-	// we have more than 8 bytes to copy.
-	//
-	mov ar.lc=cnt
-	mov ar.ec=PIPE_DEPTH
-	mov pr.rot=1<<16	// p16=true all others are false
-	;;
-3:
-	//
-	// The pipleline consists of 3 stages:
-	// 1 (p16):	Load a word from src1
-	// 2 (EPI_1):	Shift right pair, saving to tmp
-	// 3 (EPI):	Store tmp to dst1
-	//
-	// To make it simple, use at least 2 (p16) loops to set up val1[n]
-	// because we need 2 back-to-back val1[] to get tmp.
-	// Note that this implies EPI_2 must be p18 or greater.
-	//
-
-#define EPI_1		p[PIPE_DEPTH-2]
-#define SWITCH(pred, shift)	cmp.eq pred,p0=shift,rshift
-#define CASE(pred, shift)	\
-	(pred)	br.cond.spnt .copy_user_bit##shift
-#define BODY(rshift)						\
-.copy_user_bit##rshift:						\
-1:								\
-	EX(.failure_out,(EPI) st8 [dst1]=tmp,8);		\
-(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
-	EX(3f,(p16) ld8 val1[1]=[src1],8);			\
-(p16)	mov val1[0]=r0;						\
-	br.ctop.dptk 1b;					\
-	;;							\
-	br.cond.sptk.many .diff_align_do_tail;			\
-2:								\
-(EPI)	st8 [dst1]=tmp,8;					\
-(EPI_1)	shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
-3:								\
-(p16)	mov val1[1]=r0;						\
-(p16)	mov val1[0]=r0;						\
-	br.ctop.dptk 2b;					\
-	;;							\
-	br.cond.sptk.many .failure_in2
-
-	//
-	// Since the instruction 'shrp' requires a fixed 128-bit value
-	// specifying the bits to shift, we need to provide 7 cases
-	// below.
-	//
-	SWITCH(p6, 8)
-	SWITCH(p7, 16)
-	SWITCH(p8, 24)
-	SWITCH(p9, 32)
-	SWITCH(p10, 40)
-	SWITCH(p11, 48)
-	SWITCH(p12, 56)
-	;;
-	CASE(p6, 8)
-	CASE(p7, 16)
-	CASE(p8, 24)
-	CASE(p9, 32)
-	CASE(p10, 40)
-	CASE(p11, 48)
-	CASE(p12, 56)
-	;;
-	BODY(8)
-	BODY(16)
-	BODY(24)
-	BODY(32)
-	BODY(40)
-	BODY(48)
-	BODY(56)
-	;;
-.diff_align_do_tail:
-	.pred.rel "mutex", p14, p15
-(p14)	sub src1=src1,t1
-(p14)	adds dst1=-8,dst1
-(p15)	sub dst1=dst1,t1
-	;;
-4:
-	// Tail correction.
-	//
-	// The problem with this piplelined loop is that the last word is not
-	// loaded and thus parf of the last word written is not correct.
-	// To fix that, we simply copy the tail byte by byte.
-
-	sub len1=endsrc,src1,1
-	clrrrb
-	;;
-	mov ar.ec=PIPE_DEPTH
-	mov pr.rot=1<<16	// p16=true all others are false
-	mov ar.lc=len1
-	;;
-5:
-	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
-	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
-	br.ctop.dptk.few 5b
-	;;
-	mov ar.lc=saved_lc
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-	//
-	// Beginning of long mempcy (i.e. > 16 bytes)
-	//
-.long_copy_user:
-	tbit.nz p6,p7=src1,0	// odd alignment
-	and tmp=7,tmp
-	;;
-	cmp.eq p10,p8=r0,tmp
-	mov len1=len		// copy because of rotation
-(p8)	br.cond.dpnt .diff_align_copy_user
-	;;
-	// At this point we know we have more than 16 bytes to copy
-	// and also that both src and dest have the same alignment
-	// which may not be the one we want. So for now we must move
-	// forward slowly until we reach 16byte alignment: no need to
-	// worry about reaching the end of buffer.
-	//
-	EX(.failure_in1,(p6) ld1 val1[0]=[src1],1)	// 1-byte aligned
-(p6)	adds len1=-1,len1;;
-	tbit.nz p7,p0=src1,1
-	;;
-	EX(.failure_in1,(p7) ld2 val1[1]=[src1],2)	// 2-byte aligned
-(p7)	adds len1=-2,len1;;
-	tbit.nz p8,p0=src1,2
-	;;
-	//
-	// Stop bit not required after ld4 because if we fail on ld4
-	// we have never executed the ld1, therefore st1 is not executed.
-	//
-	EX(.failure_in1,(p8) ld4 val2[0]=[src1],4)	// 4-byte aligned
-	;;
-	EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
-	tbit.nz p9,p0=src1,3
-	;;
-	//
-	// Stop bit not required after ld8 because if we fail on ld8
-	// we have never executed the ld2, therefore st2 is not executed.
-	//
-	EX(.failure_in1,(p9) ld8 val2[1]=[src1],8)	// 8-byte aligned
-	EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
-(p8)	adds len1=-4,len1
-	;;
-	EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
-(p9)	adds len1=-8,len1;;
-	shr.u cnt=len1,4		// number of 128-bit (2x64bit) words
-	;;
-	EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
-	tbit.nz p6,p0=len1,3
-	cmp.eq p7,p0=r0,cnt
-	adds tmp=-1,cnt			// br.ctop is repeat/until
-(p7)	br.cond.dpnt .dotail		// we have less than 16 bytes left
-	;;
-	adds src2=8,src1
-	adds dst2=8,dst1
-	mov ar.lc=tmp
-	;;
-	//
-	// 16bytes/iteration
-	//
-2:
-	EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
-(p16)	ld8 val2[0]=[src2],16
-
-	EX(.failure_out, (EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16)
-(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
-	br.ctop.dptk 2b
-	;;			// RAW on src1 when fall through from loop
-	//
-	// Tail correction based on len only
-	//
-	// No matter where we come from (loop or test) the src1 pointer
-	// is 16 byte aligned AND we have less than 16 bytes to copy.
-	//
-.dotail:
-	EX(.failure_in1,(p6) ld8 val1[0]=[src1],8)	// at least 8 bytes
-	tbit.nz p7,p0=len1,2
-	;;
-	EX(.failure_in1,(p7) ld4 val1[1]=[src1],4)	// at least 4 bytes
-	tbit.nz p8,p0=len1,1
-	;;
-	EX(.failure_in1,(p8) ld2 val2[0]=[src1],2)	// at least 2 bytes
-	tbit.nz p9,p0=len1,0
-	;;
-	EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
-	;;
-	EX(.failure_in1,(p9) ld1 val2[1]=[src1])	// only 1 byte left
-	mov ar.lc=saved_lc
-	;;
-	EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
-	mov pr=saved_pr,0xffffffffffff0000
-	;;
-	EX(.failure_out, (p8)	st2 [dst1]=val2[0],2)
-	mov ar.pfs=saved_pfs
-	;;
-	EX(.failure_out, (p9)	st1 [dst1]=val2[1])
-	br.ret.sptk.many rp
-
-
-	//
-	// Here we handle the case where the byte by byte copy fails
-	// on the load.
-	// Several factors make the zeroing of the rest of the buffer kind of
-	// tricky:
-	//	- the pipeline: loads/stores are not in sync (pipeline)
-	//
-	//	  In the same loop iteration, the dst1 pointer does not directly
-	//	  reflect where the faulty load was.
-	//
-	//	- pipeline effect
-	//	  When you get a fault on load, you may have valid data from
-	//	  previous loads in transit, not yet stored. Such data must be
-	//	  stored normally before moving on to zeroing the rest.
-	//
-	//	- single/multi dispersal independence.
-	//
-	// solution:
-	//	- we don't disrupt the pipeline, i.e. data in transit in
-	//	  the software pipeline will eventually be moved to memory.
-	//	  We simply replace the load with a simple mov and keep the
-	//	  pipeline going. We can't really do this inline because
-	//	  p16 is always reset to 1 when lc > 0.
-	//
-.failure_in_pipe1:
-	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
-1:
-(p16)	mov val1[0]=r0
-(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
-	br.ctop.dptk 1b
-	;;
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.lc=saved_lc
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-	//
-	// This is the case where the byte by byte copy fails on the load
-	// when we copy the head. We need to finish the pipeline and copy
-	// zeros for the rest of the destination. Since this happens
-	// at the top we still need to fill the body and tail.
-.failure_in_pipe2:
-	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
-2:
-(p16)	mov val1[0]=r0
-(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
-	br.ctop.dptk 2b
-	;;
-	sub len=enddst,dst1,1		// precompute len
-	br.cond.dptk.many .failure_in1bis
-	;;
-
-	//
-	// Here we handle the head & tail part when we check for alignment.
-	// The following code handles only the load failures. The
-	// main difficulty comes from the fact that loads/stores are
-	// scheduled. So when you fail on a load, the stores corresponding
-	// to previous successful loads must be executed.
-	//
-	// However some simplifications are possible given the way
-	// things work.
-	//
-	// 1) HEAD
-	// Theory of operation:
-	//
-	//  Page A   | Page B
-	//  ---------|-----
-	//          1|8 x
-	//	  1 2|8 x
-	//	    4|8 x
-	//	  1 4|8 x
-	//        2 4|8 x
-	//      1 2 4|8 x
-	//	     |1
-	//	     |2 x
-	//	     |4 x
-	//
-	// page_size >= 4k (2^12).  (x means 4, 2, 1)
-	// Here we suppose Page A exists and Page B does not.
-	//
-	// As we move towards eight byte alignment we may encounter faults.
-	// The numbers on each page show the size of the load (current alignment).
-	//
-	// Key point:
-	//	- if you fail on 1, 2, 4 then you have never executed any smaller
-	//	  size loads, e.g. failing ld4 means no ld1 nor ld2 executed
-	//	  before.
-	//
-	// This allows us to simplify the cleanup code, because basically you
-	// only have to worry about "pending" stores in the case of a failing
-	// ld8(). Given the way the code is written today, this means only
-	// worry about st2, st4. There we can use the information encapsulated
-	// into the predicates.
-	//
-	// Other key point:
-	//	- if you fail on the ld8 in the head, it means you went straight
-	//	  to it, i.e. 8byte alignment within a nonexistent page.
-	// Again this comes from the fact that if you crossed just for the ld8 then
-	// you are 8byte aligned but also 16byte aligned, therefore you would
-	// either go for the 16byte copy loop OR the ld8 in the tail part.
-	// The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
-	// because it would mean you had 15bytes to copy in which case you
-	// would have defaulted to the byte by byte copy.
-	//
-	//
-	// 2) TAIL
-	// Here we know we have less than 16 bytes AND we are either 8 or 16 byte
-	// aligned.
-	//
-	// Key point:
-	// This means that we either:
-	//		- are right on a page boundary
-	//	OR
-	//		- are at more than 16 bytes from a page boundary with
-	//		  at most 15 bytes to copy: no chance of crossing.
-	//
-	// This allows us to assume that if we fail on a load we haven't possibly
-	// executed any of the previous (tail) ones, so we don't need to do
-	// any stores. For instance, if we fail on ld2, this means we had
-	// 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
-	//
-	// This means that we are in a situation similar to a fault in the
-	// head part. That's nice!
-	//
-.failure_in1:
-	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
-	sub len=endsrc,src1,1
-	//
-	// we know that ret0 can never be zero at this point
-	// because we failed while trying to do a load, i.e. there is still
-	// some work to do.
-	// The failure_in1bis and length problem is taken care of at the
-	// calling side.
-	//
-	;;
-.failure_in1bis:		// from (.failure_in3)
-	mov ar.lc=len		// Continue with a stupid byte store.
-	;;
-5:
-	st1 [dst1]=r0,1
-	br.cloop.dptk 5b
-	;;
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.lc=saved_lc
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-	//
-	// Here we simply restart the loop but instead
-	// of doing loads we fill the pipeline with zeroes
-	// We can't simply store r0 because we may have valid
-	// data in transit in the pipeline.
-	// ar.lc and ar.ec are setup correctly at this point
-	//
-	// we MUST use src1/endsrc here and not dst1/enddst because
-	// of the pipeline effect.
-	//
-.failure_in3:
-	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
-	;;
-2:
-(p16)	mov val1[0]=r0
-(p16)	mov val2[0]=r0
-(EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16
-(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
-	br.ctop.dptk 2b
-	;;
-	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
-	sub len=enddst,dst1,1		// precompute len
-(p6)	br.cond.dptk .failure_in1bis
-	;;
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.lc=saved_lc
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-.failure_in2:
-	sub ret0=endsrc,src1
-	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
-	sub len=enddst,dst1,1		// precompute len
-(p6)	br.cond.dptk .failure_in1bis
-	;;
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.lc=saved_lc
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-	//
-	// handling of failures on stores: that's the easy part
-	//
-.failure_out:
-	sub ret0=enddst,dst1
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.lc=saved_lc
-
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-END(__copy_user)
-EXPORT_SYMBOL(__copy_user)
diff --git a/arch/ia64/lib/csum_partial_copy.c b/arch/ia64/lib/csum_partial_copy.c
deleted file mode 100644
index 917e3138b277..000000000000
--- a/arch/ia64/lib/csum_partial_copy.c
+++ /dev/null
@@ -1,98 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Network Checksum & Copy routine
- *
- * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *
- * Most of the code has been imported from Linux/Alpha
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-
-#include <net/checksum.h>
-
-/*
- * XXX Fixme: those 2 inlines are meant for debugging and will go away
- */
-/* Fold a 64-bit sum down to 16 bits, wrapping each carry back in. */
-static inline unsigned short
-from64to16(unsigned long x)
-{
-	unsigned long acc = (x >> 32) + (x & 0xffffffff);	/* at most 33 bits */
-	int round;
-
-	/* three 16-bit folds: 33 -> 18 -> 17 -> 16 bits, carries wrapped */
-	for (round = 0; round < 3; round++)
-		acc = (acc >> 16) + (acc & 0xffff);
-
-	return acc;
-}
-
-/*
- * Debug reference: ones'-complement sum of buff[0..len) folded to 16 bits,
- * plus the caller's partial sum psum.  Returns psum untouched if len <= 0.
- */
-static inline
-unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum)
-{
-	int odd, count;
-	unsigned long result = 0;	/* buffer-only sum; psum is folded in last */
-
-	if (len <= 0)
-		return psum;
-	odd = 1 & (unsigned long) buff;
-	if (odd) {
-		result = *buff << 8;
-		len--;
-		buff++;
-	}
-	count = len >> 1;		/* nr of 16-bit words.. */
-	if (count) {
-		if (2 & (unsigned long) buff) {
-			result += *(unsigned short *) buff;
-			count--;
-			len -= 2;
-			buff += 2;
-		}
-		count >>= 1;		/* nr of 32-bit words.. */
-		if (count) {
-			if (4 & (unsigned long) buff) {
-				result += *(unsigned int *) buff;
-				count--;
-				len -= 4;
-				buff += 4;
-			}
-			count >>= 1;	/* nr of 64-bit words.. */
-			if (count) {
-				unsigned long carry = 0;
-				do {
-					unsigned long w = *(unsigned long *) buff;
-					count--;
-					buff += 8;
-					result += carry;
-					result += w;
-					carry = (w > result);
-				} while (count);
-				result += carry;
-				result = (result & 0xffffffff) + (result >> 32);
-			}
-			if (len & 4) {
-				result += *(unsigned int *) buff;
-				buff += 4;
-			}
-		}
-		if (len & 2) {
-			result += *(unsigned short *) buff;
-			buff += 2;
-		}
-	}
-	if (len & 1)
-		result += *buff;
-	result = from64to16(result);
-	if (odd)	/* odd start summed the bytes in swapped lanes: swap back */
-		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-	return from64to16(result + psum);	/* psum added last: never overwritten or swapped */
-}
diff --git a/arch/ia64/lib/do_csum.S b/arch/ia64/lib/do_csum.S
deleted file mode 100644
index 6004dad2597c..000000000000
--- a/arch/ia64/lib/do_csum.S
+++ /dev/null
@@ -1,324 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard do_csum() function
- *
- * Return: a 64bit quantity containing the 16bit Internet checksum
- *
- * Inputs:
- *	in0: address of buffer to checksum (char *)
- *	in1: length of the buffer (int)
- *
- * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *
- * 02/04/22	Ken Chen <kenneth.w.chen@intel.com>
- *		Data locality study on the checksum buffer.
- *		More optimization cleanup - remove excessive stop bits.
- * 02/04/08	David Mosberger <davidm@hpl.hp.com>
- *		More cleanup and tuning.
- * 01/04/18	Jun Nakajima <jun.nakajima@intel.com>
- *		Clean up and optimize and the software pipeline, loading two
- *		back-to-back 8-byte words per loop. Clean up the initialization
- *		for the loop. Support the cases where load latency = 1 or 2.
- *		Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
- */
-
-#include <asm/asmmacro.h>
-
-//
-// Theory of operations:
-//	The goal is to go as quickly as possible to the point where
-//	we can checksum 16 bytes/loop. Before reaching that point we must
-//	take care of incorrect alignment of first byte.
-//
-//	The code hereafter also takes care of the "tail" part of the buffer
-//	before entering the core loop, if any. The checksum is a sum so it
-//	allows us to commute operations. So we do the "head" and "tail"
-//	first to finish at full speed in the body. Once we get the head and
-//	tail values, we feed them into the pipeline, very handy initialization.
-//
-//	Of course we deal with the special case where the whole buffer fits
-//	into one 8 byte word. In this case we have only one entry in the pipeline.
-//
-//	We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
-//	possible load latency and also to accommodate for head and tail.
-//
-//	The end of the function deals with folding the checksum from 64bits
-//	down to 16bits taking care of the carry.
-//
-//	This version avoids synchronization in the core loop by also using a
-//	pipeline for the accumulation of the checksum in resultx[] (x=1,2).
-//
-//	 wordx[] (x=1,2)
-//	|---|
-//      |   | 0			: new value loaded in pipeline
-//	|---|
-//      |   | -			: in transit data
-//	|---|
-//      |   | LOAD_LATENCY	: current value to add to checksum
-//	|---|
-//      |   | LOAD_LATENCY+1	: previous value added to checksum
-//      |---|			(previous iteration)
-//
-//	resultx[] (x=1,2)
-//	|---|
-//      |   | 0			: initial value
-//	|---|
-//      |   | LOAD_LATENCY-1	: new checksum
-//	|---|
-//      |   | LOAD_LATENCY	: previous value of checksum
-//	|---|
-//      |   | LOAD_LATENCY+1	: final checksum when out of the loop
-//      |---|
-//
-//
-//	See RFC1071 "Computing the Internet Checksum" for various techniques for
-//	calculating the Internet checksum.
-//
-// NOT YET DONE:
-//	- Maybe another algorithm which would take care of the folding at the
-//	  end in a different manner
-//	- Work with people more knowledgeable than me on the network stack
-//	  to figure out if we could not split the function depending on the
-//	  type of packet or alignment we get. Like the ip_fast_csum() routine
-//	  where we know we have at least 20bytes worth of data to checksum.
-//	- Do a better job of handling small packets.
-//	- Note on prefetching: it was found that under various load, i.e. ftp read/write,
-//	  nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
-//	  on the data that buffer points to (partly because the checksum is often preceded by
-//	  a copy_from_user()).  This finding indicates that lfetch will not be beneficial since
-//	  the data is already in the cache.
-//
-
-#define saved_pfs	r11	// caller's ar.pfs, written by alloc
-#define hmask		r16	// head mask
-#define tmask		r17	// tail mask
-#define first1		r18	// load pointer feeding word1[]
-#define firstval	r19	// 8-byte word containing the first byte
-#define firstoff	r20	// byte offset of buf inside that word
-#define last		r21	// address of word containing last byte
-#define lastval		r22	// 8-byte word containing the last byte
-#define lastoff		r23	// (buf+len) & 7
-#define saved_lc	r24	// caller's ar.lc
-#define saved_pr	r25	// caller's predicates
-#define tmp1		r26	// scratch
-#define tmp2		r27	// scratch
-#define tmp3		r28	// scratch; doubles as "count" in the body
-#define carry1		r29	// carries accumulated for result1[]
-#define carry2		r30	// carries accumulated for result2[]
-#define first2		r31	// load pointer feeding word2[]
-
-#define buf		in0	// buffer address
-#define len		in1	// buffer length in bytes
-
-#define LOAD_LATENCY	2	// XXX fix me
-
-#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
-# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
-#endif
-
-#define PIPE_DEPTH			(LOAD_LATENCY+2)
-#define ELD	p[LOAD_LATENCY]		// end of load
-#define ELD_1	p[LOAD_LATENCY+1]	// and next stage
-
-// unsigned long do_csum(unsigned char *buf,long len)
-// Returns the folded 16-bit sum in ret0; ret0 is 0 when len <= 0.
-GLOBAL_ENTRY(do_csum)
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,2,16,0,16	// 2 in, 16 local, 0 out, 16 rotating
-	.rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
-	.rotp p[PIPE_DEPTH], pC1[2], pC2[2]
-	mov ret0=r0		// in case we have zero length
-	cmp.lt p0,p6=r0,len	// check for zero length or negative (32bit len)
-	;;
-	add tmp1=buf,len	// last byte's address
-	.save pr, saved_pr
-	mov saved_pr=pr		// preserve predicates (rotation)
-(p6)	br.ret.spnt.many rp	// return if zero or negative length
-
-	mov hmask=-1		// initialize head mask
-	tbit.nz p15,p0=buf,0	// is buf an odd address?
-	and first1=-8,buf	// 8-byte align down address of first1 element
-
-	and firstoff=7,buf	// how many bytes off for first1 element
-	mov tmask=-1		// initialize tail mask
-
-	;;
-	adds tmp2=-1,tmp1	// last-1
-	and lastoff=7,tmp1	// how many bytes off for last element
-	;;
-	sub tmp1=8,lastoff	// complement to lastoff
-	and last=-8,tmp2	// address of word containing last byte
-	;;
-	sub tmp3=last,first1	// tmp3=distance from first1 to last
-	.save ar.lc, saved_lc
-	mov saved_lc=ar.lc	// save lc
-	cmp.eq p8,p9=last,first1	// everything fits in one word ?
-
-	ld8 firstval=[first1],8	// load, ahead of time, "first1" word
-	and tmp1=7, tmp1	// make sure that if tmp1==8 -> tmp1=0
-	shl tmp2=firstoff,3	// number of bits
-	;;
-(p9)	ld8 lastval=[last]	// load, ahead of time, "last" word, if needed
-	shl tmp1=tmp1,3		// number of bits
-(p9)	adds tmp3=-8,tmp3	// effectively loaded
-	;;
-(p8)	mov lastval=r0		// we don't need lastval if first1==last
-	shl hmask=hmask,tmp2	// build head mask, mask off [0,first1off[
-	shr.u tmask=tmask,tmp1	// build tail mask, mask off ]8,lastoff]
-	;;
-	.body
-#define count tmp3
-
-(p8)	and hmask=hmask,tmask	// apply tail mask to head mask if 1 word only
-(p9)	and word2[0]=lastval,tmask	// mask last word as appropriate
-	shr.u count=count,3	// how many 8-byte?
-	;;
-	// If count is odd, finish this 8-byte word so that we can
-	// load two back-to-back 8-byte words per loop thereafter.
-	and word1[0]=firstval,hmask	// and mask it as appropriate
-	tbit.nz p10,p11=count,0		// if (count is odd)
-	;;
-(p8)	mov result1[0]=word1[0]
-(p9)	add result1[0]=word1[0],word2[0]
-	;;
-	cmp.ltu p6,p0=result1[0],word1[0]	// check the carry
-	cmp.eq.or.andcm p8,p0=0,count		// exit if zero 8-byte
-	;;
-(p6)	adds result1[0]=1,result1[0]
-(p8)	br.cond.dptk .do_csum_exit	// if (within an 8-byte word)
-(p11)	br.cond.dptk .do_csum16		// if (count is even)
-
-	// Here count is odd.
-	ld8 word1[1]=[first1],8		// load an 8-byte word
-	cmp.eq p9,p10=1,count		// if (count == 1)
-	adds count=-1,count		// loaded an 8-byte word
-	;;
-	add result1[0]=result1[0],word1[1]
-	;;
-	cmp.ltu p6,p0=result1[0],word1[1]
-	;;
-(p6)	adds result1[0]=1,result1[0]
-(p9)	br.cond.sptk .do_csum_exit	// if (count == 1) exit
-	// Fall through to calculate the checksum, feeding result1[0] as
-	// the initial value in result1[0].
-	//
-	// Calculate the checksum loading two 8-byte words per loop.
-	//
-.do_csum16:
-	add first2=8,first1
-	shr.u count=count,1	// we do 16 bytes per loop
-	;;
-	adds count=-1,count
-	mov carry1=r0
-	mov carry2=r0
-	brp.loop.imp 1f,2f
-	;;
-	mov ar.ec=PIPE_DEPTH	// epilog count: drain all pipeline stages
-	mov ar.lc=count	// set lc (count was already decremented by one)
-	mov pr.rot=1<<16	// p16=true, all other rotating predicates false
-	// result1[0] must be initialized in advance.
-	mov result2[0]=r0
-	;;
-	.align 32
-1:
-(ELD_1)	cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
-(pC1[1])adds carry1=1,carry1
-(ELD_1)	cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
-(pC2[1])adds carry2=1,carry2
-(ELD)	add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
-(ELD)	add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
-2:
-(p[0])	ld8 word1[0]=[first1],16
-(p[0])	ld8 word2[0]=[first2],16
-	br.ctop.sptk 1b
-	;;
-	// Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
-(pC1[1])adds carry1=1,carry1	// since we miss the last one
-(pC2[1])adds carry2=1,carry2
-	;;
-	add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
-	add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
-	;;
-	cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
-	cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
-	;;
-(p6)	adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
-(p7)	adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
-	;;
-	add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
-	;;
-	cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
-	;;
-(p6)	adds result1[0]=1,result1[0]
-	;;
-.do_csum_exit:
-	//
-	// now fold 64 into 16 bits taking care of carry
-	// that's not very good because it has lots of sequentiality
-	//
-	mov tmp3=0xffff		// mask for the 16-bit folds
-	zxt4 tmp1=result1[0]
-	shr.u tmp2=result1[0],32
-	;;
-	add result1[0]=tmp1,tmp2
-	;;
-	and tmp1=result1[0],tmp3
-	shr.u tmp2=result1[0],16
-	;;
-	add result1[0]=tmp1,tmp2
-	;;
-	and tmp1=result1[0],tmp3
-	shr.u tmp2=result1[0],16
-	;;
-	add result1[0]=tmp1,tmp2
-	;;
-	and tmp1=result1[0],tmp3
-	shr.u tmp2=result1[0],16
-	;;
-	add ret0=tmp1,tmp2
-	mov pr=saved_pr,0xffffffffffff0000	// restore p16-p63 only
-	;;
-	// if buf was odd then swap bytes
-	mov ar.pfs=saved_pfs		// restore ar.ec
-(p15)	mux1 ret0=ret0,@rev		// reverse word
-	;;
-	mov ar.lc=saved_lc
-(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
-	br.ret.sptk.many rp
-
-//	I (Jun Nakajima) wrote an equivalent code (see below), but it was
-//	not much better than the original. So keep the original there so that
-//	someone else can challenge.
-//
-//	shr.u word1[0]=result1[0],32
-//	zxt4 result1[0]=result1[0]
-//	;;
-//	add result1[0]=result1[0],word1[0]
-//	;;
-//	zxt2 result2[0]=result1[0]
-//	extr.u word1[0]=result1[0],16,16
-//	shr.u carry1=result1[0],32
-//	;;
-//	add result2[0]=result2[0],word1[0]
-//	;;
-//	add result2[0]=result2[0],carry1
-//	;;
-//	extr.u ret0=result2[0],16,16
-//	;;
-//	add ret0=ret0,result2[0]
-//	;;
-//	zxt2 ret0=ret0
-//	mov ar.pfs=saved_pfs		 // restore ar.ec
-//	mov pr=saved_pr,0xffffffffffff0000
-//	;;
-//	// if buf was odd then swap bytes
-//	mov ar.lc=saved_lc
-//(p15)	mux1 ret0=ret0,@rev		// reverse word
-//	;;
-//(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
-//	br.ret.sptk.many rp
-
-END(do_csum)
diff --git a/arch/ia64/lib/flush.S b/arch/ia64/lib/flush.S
deleted file mode 100644
index 8573d59c9ed1..000000000000
--- a/arch/ia64/lib/flush.S
+++ /dev/null
@@ -1,120 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Cache flushing routines.
- *
- * Copyright (C) 1999-2001, 2005 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 05/28/05 Zoltan Menyhart	Dynamic stride size
- */
-
-#include <asm/asmmacro.h>
-#include <asm/export.h>
-
-
-	/*
-	 * flush_icache_range(start,end)
-	 *
-	 *	Make i-cache(s) coherent with d-caches.
-	 *
-	 *	Must deal with range from start to end-1 but nothing else (need to
-	 *	be careful not to touch addresses that may be unmapped).
-	 *
-	 *	Note: "in0" and "in1" are preserved for debugging purposes.
-	 */
-	.section .kprobes.text,"ax"
-GLOBAL_ENTRY(flush_icache_range)
-
-	.prologue
-	alloc	r2=ar.pfs,2,0,0,0	// 2 inputs (start, end); no locals or outputs
-	movl	r3=ia64_i_cache_stride_shift	// address of the stride shift variable
- 	mov	r21=1
-	;;
-	ld8	r20=[r3]		// r20: stride shift
-	sub	r22=in1,r0,1		// last byte address
-	;;
-	shr.u	r23=in0,r20		// start / (stride size)
-	shr.u	r22=r22,r20		// (last byte address) / (stride size)
-	shl	r21=r21,r20		// r21: stride size of the i-cache(s)
-	;;
-	sub	r8=r22,r23		// number of strides - 1
-	shl	r24=r23,r20		// r24: addresses for "fc.i" =
-					//	"start" rounded down to stride boundary
-	.save	ar.lc,r3
-	mov	r3=ar.lc		// save ar.lc
-	;;
-
-	.body
-	mov	ar.lc=r8		// NOTE(review): r8 is -1 if end==start on a stride boundary; confirm callers never pass an empty range
-	;;
-	/*
-	 * 32 byte aligned loop, even number of (actually 2) bundles
-	 */
-.Loop:	fc.i	r24			// issuable on M0 only
-	add	r24=r21,r24		// we flush "stride size" bytes per iteration
-	nop.i	0
-	br.cloop.sptk.few .Loop
-	;;
-	sync.i
-	;;
-	srlz.i
-	;;
-	mov	ar.lc=r3		// restore ar.lc
-	br.ret.sptk.many rp
-END(flush_icache_range)
-EXPORT_SYMBOL_GPL(flush_icache_range)
-
-	/*
-	 * clflush_cache_range(start,size)
-	 *
-	 *	Flush cache lines from start to start+size-1.
-	 *
-	 *	Must deal with range from start to start+size-1 but nothing else
-	 *	(need to be careful not to touch addresses that may be
-	 *	unmapped).
-	 *
-	 *	Note: "in0" and "in1" are preserved for debugging purposes.
-	 */
-	.section .kprobes.text,"ax"
-GLOBAL_ENTRY(clflush_cache_range)
-
-	.prologue
-	alloc	r2=ar.pfs,2,0,0,0	// 2 inputs (start, size); no locals or outputs
-	movl	r3=ia64_cache_stride_shift	// address of the stride shift variable
-	mov	r21=1
-	add     r22=in1,in0		// start + size
-	;;
-	ld8	r20=[r3]		// r20: stride shift
-	sub	r22=r22,r0,1		// last byte address
-	;;
-	shr.u	r23=in0,r20		// start / (stride size)
-	shr.u	r22=r22,r20		// (last byte address) / (stride size)
-	shl	r21=r21,r20		// r21: stride size of the cache(s)
-	;;
-	sub	r8=r22,r23		// number of strides - 1
-	shl	r24=r23,r20		// r24: addresses for "fc" =
-					//	"start" rounded down to stride
-					//	boundary
-	.save	ar.lc,r3
-	mov	r3=ar.lc		// save ar.lc
-	;;
-
-	.body
-	mov	ar.lc=r8		// NOTE(review): r8 is -1 if size==0 and start is stride-aligned; confirm callers never pass size 0
-	;;
-	/*
-	 * 32 byte aligned loop, even number of (actually 2) bundles
-	 */
-.Loop_fc:
-	fc	r24		// issuable on M0 only
-	add	r24=r21,r24	// we flush "stride size" bytes per iteration
-	nop.i	0
-	br.cloop.sptk.few .Loop_fc
-	;;
-	sync.i
-	;;
-	srlz.i
-	;;
-	mov	ar.lc=r3		// restore ar.lc
-	br.ret.sptk.many rp
-END(clflush_cache_range)
diff --git a/arch/ia64/lib/idiv32.S b/arch/ia64/lib/idiv32.S
deleted file mode 100644
index def92b708e6e..000000000000
--- a/arch/ia64/lib/idiv32.S
+++ /dev/null
@@ -1,86 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2000 Hewlett-Packard Co
- * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 32-bit integer division.
- *
- * This code is based on the application note entitled "Divide, Square Root
- * and Remainder Algorithms for the IA-64 Architecture".  This document
- * is available as Intel document number 248725-002 or via the web at
- * http://developer.intel.com/software/opensource/numerics/
- *
- * For more details on the theory behind these algorithms, see "IA-64
- * and Elementary Functions" by Peter Markstein; HP Professional Books
- * (http://www.goodreads.com/book/show/2019887.Ia_64_and_Elementary_Functions)
- */
-
-#include <asm/asmmacro.h>
-#include <asm/export.h>
-
-#ifdef MODULO
-# define OP	mod
-#else
-# define OP	div
-#endif
-
-#ifdef UNSIGNED
-# define SGN	u
-# define EXTEND	zxt4
-# define INT_TO_FP(a,b)	fcvt.xuf.s1 a=b
-# define FP_TO_INT(a,b)	fcvt.fxu.trunc.s1 a=b
-#else
-# define SGN
-# define EXTEND	sxt4
-# define INT_TO_FP(a,b)	fcvt.xf a=b
-# define FP_TO_INT(a,b)	fcvt.fx.trunc.s1 a=b
-#endif
-
-#define PASTE1(a,b)	a##b
-#define PASTE(a,b)	PASTE1(a,b)
-#define NAME		PASTE(PASTE(__,SGN),PASTE(OP,si3))
-
-GLOBAL_ENTRY(NAME)
-	.regstk 2,0,0,0			// 2 inputs (a, b); no locals, outputs or rotating regs
-	// Transfer inputs to FP registers.
-	mov r2 = 0xffdd			// r2 = -34 + 65535 (fp reg format bias)
-	EXTEND in0 = in0		// in0 = a
-	EXTEND in1 = in1		// in1 = b
-	;;
-	setf.sig f8 = in0
-	setf.sig f9 = in1
-#ifdef MODULO
-	sub in1 = r0, in1		// in1 = -b
-#endif
-	;;
-	// Convert the inputs to FP, to avoid FP software-assist faults.
-	INT_TO_FP(f8, f8)
-	INT_TO_FP(f9, f9)
-	;;
-	setf.exp f7 = r2		// f7 = 2^-34
-	frcpa.s1 f6, p6 = f8, f9	// y0 = frcpa(b); p6 gates the refinement steps below
-	;;
-(p6)	fmpy.s1 f8 = f8, f6		// q0 = a*y0
-(p6)	fnma.s1 f6 = f9, f6, f1		// e0 = -b*y0 + 1
-	;;
-#ifdef MODULO
-	setf.sig f9 = in1		// f9 = -b
-#endif
-(p6)	fma.s1 f8 = f6, f8, f8		// q1 = e0*q0 + q0
-(p6)	fma.s1 f6 = f6, f6, f7		// e1 = e0*e0 + 2^-34
-	;;
-#ifdef MODULO
-	setf.sig f7 = in0		// f7 = a (integer form, for the xma below)
-#endif
-(p6)	fma.s1 f6 = f6, f8, f8		// q2 = e1*q1 + q1
-	;;
-	FP_TO_INT(f6, f6)		// q = trunc(q2)
-	;;
-#ifdef MODULO
-	xma.l f6 = f6, f9, f7		// r = q*(-b) + a
-	;;
-#endif
-	getf.sig r8 = f6		// transfer result to result register
-	br.ret.sptk.many rp
-END(NAME)
-EXPORT_SYMBOL(NAME)
diff --git a/arch/ia64/lib/idiv64.S b/arch/ia64/lib/idiv64.S
deleted file mode 100644
index a8ba3bd3d4d8..000000000000
--- a/arch/ia64/lib/idiv64.S
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 1999-2000 Hewlett-Packard Co
- * Copyright (C) 1999-2000 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 64-bit integer division.
- *
- * This code is based on the application note entitled "Divide, Square Root
- * and Remainder Algorithms for the IA-64 Architecture".  This document
- * is available as Intel document number 248725-002 or via the web at
- * http://developer.intel.com/software/opensource/numerics/
- *
- * For more details on the theory behind these algorithms, see "IA-64
- * and Elementary Functions" by Peter Markstein; HP Professional Books
- * (http://www.goodreads.com/book/show/2019887.Ia_64_and_Elementary_Functions)
- */
-
-#include <asm/asmmacro.h>
-#include <asm/export.h>
-
-#ifdef MODULO
-# define OP	mod
-#else
-# define OP	div
-#endif
-
-#ifdef UNSIGNED
-# define SGN	u
-# define INT_TO_FP(a,b)	fcvt.xuf.s1 a=b
-# define FP_TO_INT(a,b)	fcvt.fxu.trunc.s1 a=b
-#else
-# define SGN
-# define INT_TO_FP(a,b)	fcvt.xf a=b
-# define FP_TO_INT(a,b)	fcvt.fx.trunc.s1 a=b
-#endif
-
-#define PASTE1(a,b)	a##b
-#define PASTE(a,b)	PASTE1(a,b)
-#define NAME		PASTE(PASTE(__,SGN),PASTE(OP,di3))
-
-GLOBAL_ENTRY(NAME)
-	.regstk 2,0,0,0			// 2 inputs (a, b); no locals, outputs or rotating regs
-	// Transfer inputs to FP registers.
-	setf.sig f8 = in0		// f8 = a
-	setf.sig f9 = in1		// f9 = b
-	;;
-	// Convert the inputs to FP, to avoid FP software-assist faults.
-	INT_TO_FP(f8, f8)
-	INT_TO_FP(f9, f9)
-	;;
-	frcpa.s1 f11, p6 = f8, f9	// y0 = frcpa(b); p6 gates the refinement steps below
-	;;
-(p6)	fmpy.s1 f7 = f8, f11		// q0 = a*y0
-(p6)	fnma.s1 f6 = f9, f11, f1	// e0 = -b*y0 + 1
-	;;
-(p6)	fma.s1 f10 = f7, f6, f7		// q1 = q0*e0 + q0
-(p6)	fmpy.s1 f7 = f6, f6		// e1 = e0*e0
-	;;
-#ifdef MODULO
-	sub in1 = r0, in1		// in1 = -b
-#endif
-(p6)	fma.s1 f10 = f10, f7, f10	// q2 = q1*e1 + q1
-(p6)	fma.s1 f6 = f11, f6, f11	// y1 = y0*e0 + y0
-	;;
-(p6)	fma.s1 f6 = f6, f7, f6		// y2 = y1*e1 + y1
-(p6)	fnma.s1 f7 = f9, f10, f8	// r = -b*q2 + a
-	;;
-#ifdef MODULO
-	setf.sig f8 = in0		// f8 = a
-	setf.sig f9 = in1		// f9 = -b
-#endif
-(p6)	fma.s1 f11 = f7, f6, f10	// q3 = r*y2 + q2
-	;;
-	FP_TO_INT(f11, f11)		// q = trunc(q3)
-	;;
-#ifdef MODULO
-	xma.l f11 = f11, f9, f8		// r = q*(-b) + a
-	;;
-#endif
-	getf.sig r8 = f11		// transfer result to result register
-	br.ret.sptk.many rp
-END(NAME)
-EXPORT_SYMBOL(NAME)
diff --git a/arch/ia64/lib/io.c b/arch/ia64/lib/io.c
deleted file mode 100644
index c3e02462ed16..000000000000
--- a/arch/ia64/lib/io.c
+++ /dev/null
@@ -1,51 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/module.h>
-#include <linux/types.h>
-
-#include <asm/io.h>
-
-/*
- * Copy count bytes from I/O memory at "from" into normal memory at "to",
- * issuing one readb() per byte.  This needs to be optimized.
- */
-void memcpy_fromio(void *to, const volatile void __iomem *from, long count)
-{
-	char *out = to;
-
-	for (; count; count--, from++) {
-		*out = readb(from);
-		out++;
-	}
-}
-EXPORT_SYMBOL(memcpy_fromio);
-
-/*
- * Copy count bytes from normal memory at "from" into I/O memory at "to",
- * issuing one writeb() per byte.  This needs to be optimized.
- */
-void memcpy_toio(volatile void __iomem *to, const void *from, long count)
-{
-	const char *in = from;
-
-	for (; count; count--, to++) {
-		writeb(*in, to);
-		in++;
-	}
-}
-EXPORT_SYMBOL(memcpy_toio);
-
-/*
- * "memset" on IO memory space: store the low byte of c into each of the
- * count bytes starting at dst, issuing one writeb() per byte.
- * This needs to be optimized.
- */
-void memset_io(volatile void __iomem *dst, int c, long count)
-{
-	const unsigned char fill = (char)(c & 0xff);
-
-	for (; count != 0; count--) {
-		writeb(fill, dst);
-		dst++;
-	}
-}
-EXPORT_SYMBOL(memset_io);
diff --git a/arch/ia64/lib/ip_fast_csum.S b/arch/ia64/lib/ip_fast_csum.S
deleted file mode 100644
index dc9e6e6fe876..000000000000
--- a/arch/ia64/lib/ip_fast_csum.S
+++ /dev/null
@@ -1,148 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Optimized version of the ip_fast_csum() function
- * Used for calculating IP header checksum
- *
- * Return: 16bit checksum, complemented
- *
- * Inputs:
- *      in0: address of buffer to checksum (char *)
- *      in1: length of the buffer in 32-bit words (int)
- *
- * Copyright (C) 2002, 2006 Intel Corp.
- * Copyright (C) 2002, 2006 Ken Chen <kenneth.w.chen@intel.com>
- */
-
-#include <asm/asmmacro.h>
-#include <asm/export.h>
-
-/*
- * Since we know that most likely this function is called with buf aligned
- * on 4-byte boundary and 20 bytes in length, we can execute rather quickly
- * versus calling generic version of do_csum, which has lots of overhead in
- * handling various alignments and sizes.  However, due to lack of constraints
- * put on the function input argument, cases with alignment not on 4-byte or
- * size not equal to 20 bytes will be handled by the generic do_csum function.
- */
-
-#define in0	r32
-#define in1	r33
-#define in2	r34
-#define in3	r35
-#define in4	r36
-#define ret0	r8
-
-GLOBAL_ENTRY(ip_fast_csum)	// checksum in1 32-bit words at in0; complemented sum returned in ret0
-	.prologue
-	.body
-	cmp.ne	p6,p7=5,in1	// size other than 20 byte?
-	and	r14=3,in0	// is it aligned on 4-byte?
-	add	r15=4,in0	// second source pointer
-	;;
-	cmp.ne.or.andcm p6,p7=r14,r0	// misaligned: set p6 (slow path), clear p7 (fast path)
-	;;
-(p7)	ld4	r20=[in0],8	// word 0; in0 now points at word 2
-(p7)	ld4	r21=[r15],8	// word 1; r15 now points at word 3
-(p6)	br.spnt	.generic	// not the common case: hand off to do_csum
-	;;
-	ld4	r22=[in0],8	// word 2; in0 now points at word 4
-	ld4	r23=[r15],8	// word 3
-	;;
-	ld4	r24=[in0]	// word 4
-	add	r20=r20,r21	// word 0 + word 1
-	add	r22=r22,r23	// word 2 + word 3
-	;;
-	add	r20=r20,r22	// sum of words 0..3
-	;;
-	add	r20=r20,r24	// r20 = sum of all five words
-	;;
-	shr.u	ret0=r20,16	// now need to add the carry
-	zxt2	r20=r20	// keep only the low 16 bits
-	;;
-	add	r20=ret0,r20	// fold the upper bits into the low 16
-	;;
-	shr.u	ret0=r20,16	// add carry again
-	zxt2	r20=r20
-	;;
-	add	r20=ret0,r20
-	;;
-	shr.u	ret0=r20,16	// third and final fold
-	zxt2	r20=r20
-	;;
-	add	r20=ret0,r20
-	mov	r9=0xffff	// 16-bit mask
-	;;
-	andcm	ret0=r9,r20	// ret0 = 0xffff & ~r20
-	.restore sp		// reset frame state
-	br.ret.sptk.many b0
-	;;
-
-.generic:	// slow path: any alignment or size, via do_csum
-	.prologue
-	.save ar.pfs, r35
-	alloc	r35=ar.pfs,2,2,2,0	// 2 inputs, 2 locals (r34, r35), 2 outputs
-	.save rp, r34
-	mov	r34=b0	// save return address across the call
-	.body
-	dep.z	out1=in1,2,30	// out1 = (low 30 bits of in1) << 2, i.e. word count -> byte count
-	mov	out0=in0	// out0 = buffer address
-	;;
-	br.call.sptk.many b0=do_csum
-	;;
-	andcm	ret0=-1,ret0	// ret0 = ~ret0 (complement of do_csum's result)
-	mov	ar.pfs=r35	// restore caller's frame state
-	mov	b0=r34	// restore return address
-	br.ret.sptk.many b0
-END(ip_fast_csum)
-EXPORT_SYMBOL(ip_fast_csum)
-
-GLOBAL_ENTRY(csum_ipv6_magic)	// NOTE(review): presumably in0/in1 = src/dst address, in2 = len, in3 = proto, in4 = sum -- confirm against the C prototype
-	ld4	r20=[in0],4	// 1st of four words read from [in0]
-	ld4	r21=[in1],4	// 1st of four words read from [in1]
-	zxt4	in2=in2	// keep only the low 32 bits of in2
-	;;
-	ld4	r22=[in0],4	// 2nd word from [in0]
-	ld4	r23=[in1],4	// 2nd word from [in1]
-	dep	r15=in3,in2,32,16	// r15 = in2 with bits 32-47 replaced by the low 16 bits of in3
-	;;
-	ld4	r24=[in0],4	// 3rd word from [in0]
-	ld4	r25=[in1],4	// 3rd word from [in1]
-	mux1	r15=r15,@rev	// reverse the byte order of r15
-	add	r16=r20,r21
-	add	r17=r22,r23
-	zxt4	in4=in4	// keep only the low 32 bits of in4
-	;;
-	ld4	r26=[in0],4	// 4th word from [in0]
-	ld4	r27=[in1],4	// 4th word from [in1]
-	shr.u	r15=r15,16	// drop the 16 low bits, which are zero after the byte reversal
-	add	r18=r24,r25
-	add	r8=r16,r17
-	;;
-	add	r19=r26,r27
-	add	r8=r8,r18
-	;;
-	add	r8=r8,r19	// r8 = sum of the eight loaded words
-	add	r15=r15,in4	// r15 = byte-swapped in2/in3 field + in4
-	;;
-	add	r8=r8,r15	// r8 = total 64-bit sum
-	;;
-	shr.u	r10=r8,32	// now fold sum into short
-	zxt4	r11=r8
-	;;
-	add	r8=r10,r11	// upper 32 bits + lower 32 bits
-	;;
-	shr.u	r10=r8,16	// yeah, keep it rolling
-	zxt2	r11=r8
-	;;
-	add	r8=r10,r11
-	;;
-	shr.u	r10=r8,16	// three times lucky
-	zxt2	r11=r8
-	;;
-	add	r8=r10,r11
-	mov	r9=0xffff	// 16-bit mask
-	;;
-	andcm	r8=r9,r8	// r8 = 0xffff & ~r8
-	br.ret.sptk.many b0
-END(csum_ipv6_magic)
-EXPORT_SYMBOL(csum_ipv6_magic)
diff --git a/arch/ia64/lib/memcpy.S b/arch/ia64/lib/memcpy.S
deleted file mode 100644
index 91a625fddbf0..000000000000
--- a/arch/ia64/lib/memcpy.S
+++ /dev/null
@@ -1,304 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard memcpy() function
- *
- * Inputs:
- * 	in0:	destination address
- *	in1:	source address
- *	in2:	number of bytes to copy
- * Output:
- * 	r8:	destination address (in0 is returned unchanged)
- *
- * Copyright (C) 2000-2001 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#include <asm/asmmacro.h>
-#include <asm/export.h>
-
-GLOBAL_ENTRY(memcpy)	// copy in2 bytes from in1 to in0; in0 is returned in retval
-
-#	define MEM_LAT	21		/* latency to memory */
-
-#	define dst	r2
-#	define src	r3
-#	define retval	r8
-#	define saved_pfs r9
-#	define saved_lc	r10
-#	define saved_pr	r11
-#	define cnt	r16
-#	define src2	r17
-#	define t0	r18
-#	define t1	r19
-#	define t2	r20
-#	define t3	r21
-#	define t4	r22
-#	define src_end	r23
-
-#	define N	(MEM_LAT + 4)
-#	define Nrot	((N + 7) & ~7)
-
-	/*
-	 * First, check if everything (src, dst, len) is a multiple of eight.  If
-	 * so, we handle everything with no taken branches (other than the loop
-	 * itself) and a small icache footprint.  Otherwise, we jump off to
-	 * the more general copy routine handling arbitrary
-	 * sizes/alignment etc.
-	 */
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
-	.save ar.lc, saved_lc
-	mov saved_lc=ar.lc
-	or t0=in0,in1		// t0 = dst | src
-	;;
-
-	or t0=t0,in2		// t0 = dst | src | len
-	.save pr, saved_pr
-	mov saved_pr=pr
-
-	.body
-
-	cmp.eq p6,p0=in2,r0	// zero length?
-	mov retval=in0		// return dst
-(p6)	br.ret.spnt.many rp	// zero length, return immediately
-	;;
-
-	mov dst=in0		// copy because of rotation
-	shr.u cnt=in2,3		// number of 8-byte words to copy
-	mov pr.rot=1<<16	// p16 = 1, all other rotating predicates = 0
-	;;
-
-	adds cnt=-1,cnt		// br.ctop is repeat/until
-	cmp.gtu p7,p0=16,in2	// copying less than 16 bytes?
-	mov ar.ec=N		// epilogue count = number of pipeline stages
-	;;
-
-	and t0=0x7,t0		// low 3 bits of (dst | src | len)
-	mov ar.lc=cnt
-	;;
-	cmp.ne p6,p0=t0,r0	// p6: dst, src or len is not a multiple of 8
-
-	mov src=in1		// copy because of rotation
-(p7)	br.cond.spnt.few .memcpy_short
-(p6)	br.cond.spnt.few .memcpy_long
-	;;
-	nop.m	0
-	;;
-	nop.m	0
-	nop.i	0
-	;;
-	nop.m	0
-	;;
-	.rotr val[N]
-	.rotp p[N]
-	.align 32
-1: { .mib
-(p[0])	ld8 val[0]=[src],8
-	nop.i 0
-	brp.loop.imp 1b, 2f
-}
-2: { .mfb
-(p[N-1])st8 [dst]=val[N-1],8
-	nop.f 0
-	br.ctop.dptk.few 1b
-}
-	;;
-	mov ar.lc=saved_lc
-	mov pr=saved_pr,-1
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-	/*
-	 * Small (<16 bytes) unaligned copying is done via a simple byte-at-the-time
-	 * copy loop.  This performs relatively poorly on Itanium, but it doesn't
-	 * get used very often (gcc inlines small copies) and due to atomicity
-	 * issues, we want to avoid read-modify-write of entire words.
-	 */
-	.align 32
-.memcpy_short:
-	adds cnt=-1,in2		// br.ctop is repeat/until
-	mov ar.ec=MEM_LAT
-	brp.loop.imp 1f, 2f
-	;;
-	mov ar.lc=cnt
-	;;
-	nop.m	0
-	;;
-	nop.m	0
-	nop.i	0
-	;;
-	nop.m	0
-	;;
-	nop.m	0
-	;;
-	/*
-	 * It is faster to put a stop bit in the loop here because it makes
-	 * the pipeline shorter (and latency is what matters on short copies).
-	 */
-	.align 32
-1: { .mib
-(p[0])	ld1 val[0]=[src],1
-	nop.i 0
-	brp.loop.imp 1b, 2f
-} ;;
-2: { .mfb
-(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
-	nop.f 0
-	br.ctop.dptk.few 1b
-} ;;
-	mov ar.lc=saved_lc
-	mov pr=saved_pr,-1
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-	/*
-	 * Large (>= 16 bytes) copying is done in a fancy way.  Latency isn't
-	 * an overriding concern here, but throughput is.  We first do
-	 * sub-word copying until the destination is aligned, then we check
-	 * if the source is also aligned.  If so, we do a simple load/store-loop
-	 * until there are less than 8 bytes left over and then we do the tail,
-	 * by storing the last few bytes using sub-word copying.  If the source
-	 * is not aligned, we branch off to the non-congruent loop.
-	 *
-	 *   stage:   op:
-	 *         0  ld
-	 *	   :
-	 * MEM_LAT+3  shrp
-	 * MEM_LAT+4  st
-	 *
-	 * On Itanium, the pipeline itself runs without stalls.  However,  br.ctop
-	 * seems to introduce an unavoidable bubble in the pipeline so the overall
-	 * latency is 2 cycles/iteration.  This gives us a _copy_ throughput
-	 * of 4 byte/cycle.  Still not bad.
-	 */
-#	undef N
-#	undef Nrot
-#	define N	(MEM_LAT + 5)		/* number of stages */
-#	define Nrot	((N+1 + 2 + 7) & ~7)	/* number of rotating regs */
-
-#define LOG_LOOP_SIZE	6
-
-.memcpy_long:
-	alloc t3=ar.pfs,3,Nrot,0,Nrot	// resize register frame
-	and t0=-8,src		// t0 = src & ~7
-	and t2=7,src		// t2 = src & 7
-	;;
-	ld8 t0=[t0]		// t0 = 1st source word
-	adds src2=7,src		// src2 = (src + 7)
-	sub t4=r0,dst		// t4 = -dst
-	;;
-	and src2=-8,src2	// src2 = (src + 7) & ~7
-	shl t2=t2,3		// t2 = 8*(src & 7)
-	shl t4=t4,3		// t4 = 8*(-dst): bits 3-5 hold (-dst & 7)
-	;;
-	ld8 t1=[src2]		// t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
-	sub t3=64,t2		// t3 = 64-8*(src & 7)
-	shr.u t0=t0,t2
-	;;
-	add src_end=src,in2
-	shl t1=t1,t3
-	mov pr=t4,0x38		// (p5,p4,p3)=(-dst & 7) = # of bytes needed to 8-byte align dst
-	;;
-	or t0=t0,t1
-	mov cnt=r0
-	adds src_end=-1,src_end
-	;;
-(p3)	st1 [dst]=t0,1
-(p3)	shr.u t0=t0,8
-(p3)	adds cnt=1,cnt
-	;;
-(p4)	st2 [dst]=t0,2
-(p4)	shr.u t0=t0,16
-(p4)	adds cnt=2,cnt
-	;;
-(p5)	st4 [dst]=t0,4
-(p5)	adds cnt=4,cnt
-	and src_end=-8,src_end	// src_end = last word of source buffer
-	;;
-
-	// At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:
-
-1:{	add src=cnt,src			// make src point to remainder of source buffer
-	sub cnt=in2,cnt			// cnt = number of bytes left to copy
-	mov t4=ip
-  }	;;
-	and src2=-8,src			// align source pointer
-	adds t4=.memcpy_loops-1b,t4
-	mov ar.ec=N
-
-	and t0=7,src			// t0 = src & 7
-	shr.u t2=cnt,3			// t2 = number of 8-byte words left to copy
-	shl cnt=cnt,3			// move bits 0-2 to 3-5
-	;;
-
-	.rotr val[N+1], w[2]
-	.rotp p[N]
-
-	cmp.ne p6,p0=t0,r0		// is src aligned, too?
-	shl t0=t0,LOG_LOOP_SIZE		// t0 = (src & 7) << LOG_LOOP_SIZE = offset into .memcpy_loops
-	adds t2=-1,t2			// br.ctop is repeat/until
-	;;
-	add t4=t0,t4
-	mov pr=cnt,0x38			// set (p5,p4,p3) to # of last-word bytes to copy
-	mov ar.lc=t2
-	;;
-	nop.m	0
-	;;
-	nop.m	0
-	nop.i	0
-	;;
-	nop.m	0
-	;;
-(p6)	ld8 val[1]=[src2],8		// prime the pump...
-	mov b6=t4
-	br.sptk.few b6
-	;;
-
-.memcpy_tail:
-	// At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
-	// less than 8) and t0 contains the last few bytes of the src buffer:
-(p5)	st4 [dst]=t0,4
-(p5)	shr.u t0=t0,32
-	mov ar.lc=saved_lc
-	;;
-(p4)	st2 [dst]=t0,2
-(p4)	shr.u t0=t0,16
-	mov ar.pfs=saved_pfs
-	;;
-(p3)	st1 [dst]=t0
-	mov pr=saved_pr,-1
-	br.ret.sptk.many rp
-
-///////////////////////////////////////////////////////
-	.align 64
-
-#define COPY(shift,index)									\
- 1: { .mib											\
-	(p[0])		ld8 val[0]=[src2],8;							\
-	(p[MEM_LAT+3])	shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift;			\
-			brp.loop.imp 1b, 2f							\
-    };												\
- 2: { .mfb											\
-	(p[MEM_LAT+4])	st8 [dst]=w[1],8;							\
-			nop.f 0;								\
-			br.ctop.dptk.few 1b;							\
-    };												\
-			;;									\
-			ld8 val[N-1]=[src_end];	/* load last word (may be same as val[N]) */	\
-			;;									\
-			shrp t0=val[N-1],val[N-index],shift;					\
-			br .memcpy_tail
-.memcpy_loops:
-	COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
-	COPY(8, 0)
-	COPY(16, 0)
-	COPY(24, 0)
-	COPY(32, 0)
-	COPY(40, 0)
-	COPY(48, 0)
-	COPY(56, 0)
-
-END(memcpy)
-EXPORT_SYMBOL(memcpy)
diff --git a/arch/ia64/lib/memcpy_mck.S b/arch/ia64/lib/memcpy_mck.S
deleted file mode 100644
index cc4e6ac914b6..000000000000
--- a/arch/ia64/lib/memcpy_mck.S
+++ /dev/null
@@ -1,659 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Itanium 2-optimized version of memcpy and copy_user function
- *
- * Inputs:
- * 	in0:	destination address
- *	in1:	source address
- *	in2:	number of bytes to copy
- * Output:
- *	for memcpy:    return dest
- * 	for copy_user: return 0 if success,
- *		       or number of bytes NOT copied if error occurred.
- *
- * Copyright (C) 2002 Intel Corp.
- * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
- */
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-#include <asm/export.h>
-
-#define EK(y...) EX(y)
-
-/* McKinley specific optimization */
-
-#define retval		r8
-#define saved_pfs	r31
-#define saved_lc	r10
-#define saved_pr	r11
-#define saved_in0	r14
-#define saved_in1	r15
-#define saved_in2	r16
-
-#define src0		r2
-#define src1		r3
-#define dst0		r17
-#define dst1		r18
-#define cnt		r9
-
-/* r19-r30 are temp for each code section */
-#define PREFETCH_DIST	8
-#define src_pre_mem	r19
-#define dst_pre_mem	r20
-#define src_pre_l2	r21
-#define dst_pre_l2	r22
-#define t1		r23
-#define t2		r24
-#define t3		r25
-#define t4		r26
-#define t5		t1	// alias!
-#define t6		t2	// alias!
-#define t7		t3	// alias!
-#define n8		r27
-#define t9		t5	// alias!
-#define t10		t4	// alias!
-#define t11		t7	// alias!
-#define t12		t6	// alias!
-#define t14		t10	// alias!
-#define t13		r28
-#define t15		r29
-#define tmp		r30
-
-/* defines for long_copy block */
-#define	A	0
-#define B	(PREFETCH_DIST)
-#define C	(B + PREFETCH_DIST)
-#define D	(C + 1)
-#define N	(D + 1)
-#define Nrot	((N + 7) & ~7)
-
-/* alias */
-#define in0		r32
-#define in1		r33
-#define in2		r34
-
-GLOBAL_ENTRY(memcpy)	// entry stub: sets up r28/r29/f6/retval, then joins .common_code
-	and	r28=0x7,in0	// r28 = dest & 7
-	and	r29=0x7,in1	// r29 = src & 7
-	mov	f6=f0	// f6 == f0 is what the "is it memcpy?" test in the exception handler checks
-	mov	retval=in0	// return dest
-	br.cond.sptk .common_code
-	;;
-END(memcpy)
-EXPORT_SYMBOL(memcpy)
-GLOBAL_ENTRY(__copy_user)	// copy in2 bytes from in1 to in0; retval = 0, or # of bytes not copied after a fault
-	.prologue
-// check dest alignment
-	and	r28=0x7,in0	// r28 = dest & 7
-	and	r29=0x7,in1	// r29 = src & 7
-	mov	f6=f1	// f6 != f0 here, so the "is it memcpy?" test in the exception handler fails
-	mov	saved_in0=in0	// save dest pointer
-	mov	saved_in1=in1	// save src pointer
-	mov	retval=r0	// initialize return value
-	;;
-.common_code:
-	cmp.gt	p15,p0=8,in2	// check for small size
-	cmp.ne	p13,p0=0,r28	// check dest alignment
-	cmp.ne	p14,p0=0,r29	// check src alignment
-	add	src0=0,in1	// src0 = src
-	sub	r30=8,r28	// for .align_dest
-	mov	saved_in2=in2	// save len
-	;;
-	add	dst0=0,in0	// dst0 = dest
-	add	dst1=1,in0	// dest odd index
-	cmp.le	p6,p0 = 1,r30	// for .align_dest
-(p15)	br.cond.dpnt .memcpy_short
-(p13)	br.cond.dpnt .align_dest
-(p14)	br.cond.dpnt .unaligned_src
-	;;
-
-// both dest and src are aligned on 8-byte boundary
-.aligned_src:
-	.save ar.pfs, saved_pfs
-	alloc	saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
-	.save pr, saved_pr
-	mov	saved_pr=pr
-
-	shr.u	cnt=in2,7	// cnt = len / 128 (number of 128-byte lines)
-	;;
-	cmp.lt	p6,p0=2*PREFETCH_DIST,cnt	// more than 2*PREFETCH_DIST lines: take .long_copy
-	cmp.lt	p7,p8=1,cnt
-	.save ar.lc, saved_lc
-	mov	saved_lc=ar.lc
-	.body
-	add	cnt=-1,cnt
-	add	src_pre_mem=0,in1	// prefetch src pointer
-	add	dst_pre_mem=0,in0	// prefetch dest pointer
-	;;
-(p7)	mov	ar.lc=cnt	// prefetch count
-(p8)	mov	ar.lc=r0
-(p6)	br.cond.dpnt .long_copy
-	;;
-
-.prefetch:
-	lfetch.fault	  [src_pre_mem], 128
-	lfetch.fault.excl [dst_pre_mem], 128
-	br.cloop.dptk.few .prefetch
-	;;
-
-.medium_copy:
-	and	tmp=31,in2	// copy length after iteration
-	shr.u	r29=in2,5	// number of 32-byte iteration
-	add	dst1=8,dst0	// 2nd dest pointer
-	;;
-	add	cnt=-1,r29	// ctop iteration adjustment
-	cmp.eq	p10,p0=r29,r0	// do we really need to loop?
-	add	src1=8,src0	// 2nd src pointer
-	cmp.le	p6,p0=8,tmp
-	;;
-	cmp.le	p7,p0=16,tmp
-	mov	ar.lc=cnt	// loop setup
-	cmp.eq	p16,p17 = r0,r0
-	mov	ar.ec=2
-(p10)	br.dpnt.few .aligned_src_tail
-	;;
-	TEXT_ALIGN(32)
-1:
-EX(.ex_handler, (p16)	ld8	r34=[src0],16)
-EK(.ex_handler, (p16)	ld8	r38=[src1],16)
-EX(.ex_handler, (p17)	st8	[dst0]=r33,16)
-EK(.ex_handler, (p17)	st8	[dst1]=r37,16)
-	;;
-EX(.ex_handler, (p16)	ld8	r32=[src0],16)
-EK(.ex_handler, (p16)	ld8	r36=[src1],16)
-EX(.ex_handler, (p16)	st8	[dst0]=r34,16)
-EK(.ex_handler, (p16)	st8	[dst1]=r38,16)
-	br.ctop.dptk.few 1b
-	;;
-
-.aligned_src_tail:
-EX(.ex_handler, (p6)	ld8	t1=[src0])
-	mov	ar.lc=saved_lc
-	mov	ar.pfs=saved_pfs
-EX(.ex_hndlr_s, (p7)	ld8	t2=[src1],8)
-	cmp.le	p8,p0=24,tmp
-	and	r21=-8,tmp
-	;;
-EX(.ex_hndlr_s, (p8)	ld8	t3=[src1])
-EX(.ex_handler, (p6)	st8	[dst0]=t1)	// store 1st 8-byte word
-	and	in2=7,tmp	// remaining length
-EX(.ex_hndlr_d, (p7)	st8	[dst1]=t2,8)	// store 2nd 8-byte word
-	add	src0=src0,r21	// setting up src pointer
-	add	dst0=dst0,r21	// setting up dest pointer
-	;;
-EX(.ex_handler, (p8)	st8	[dst1]=t3)	// store 3rd 8-byte word
-	mov	pr=saved_pr,-1
-	br.dptk.many .memcpy_short
-	;;
-
-/* code taken from copy_page_mck */
-.long_copy:
-	.rotr v[2*PREFETCH_DIST]
-	.rotp p[N]
-
-	mov src_pre_mem = src0
-	mov pr.rot = 0x10000
-	mov ar.ec = 1				// special unrolled loop
-
-	mov dst_pre_mem = dst0
-
-	add src_pre_l2 = 8*8, src0
-	add dst_pre_l2 = 8*8, dst0
-	;;
-	add src0 = 8, src_pre_mem		// first t1 src
-	mov ar.lc = 2*PREFETCH_DIST - 1
-	shr.u cnt=in2,7				// number of lines
-	add src1 = 3*8, src_pre_mem		// first t3 src
-	add dst0 = 8, dst_pre_mem		// first t1 dst
-	add dst1 = 3*8, dst_pre_mem		// first t3 dst
-	;;
-	and tmp=127,in2				// remaining bytes after this block
-	add cnt = -(2*PREFETCH_DIST) - 1, cnt
-	// same as .line_copy loop, but with all predicated-off instructions removed:
-.prefetch_loop:
-EX(.ex_hndlr_lcpy_1, (p[A])	ld8 v[A] = [src_pre_mem], 128)		// M0
-EK(.ex_hndlr_lcpy_1, (p[B])	st8 [dst_pre_mem] = v[B], 128)		// M2
-	br.ctop.sptk .prefetch_loop
-	;;
-	cmp.eq p16, p0 = r0, r0			// reset p16 to 1
-	mov ar.lc = cnt
-	mov ar.ec = N				// # of stages in pipeline
-	;;
-.line_copy:
-EX(.ex_handler,	(p[D])	ld8 t2 = [src0], 3*8)			// M0
-EK(.ex_handler,	(p[D])	ld8 t4 = [src1], 3*8)			// M1
-EX(.ex_handler_lcpy,	(p[B])	st8 [dst_pre_mem] = v[B], 128)		// M2 prefetch dst from memory
-EK(.ex_handler_lcpy,	(p[D])	st8 [dst_pre_l2] = n8, 128)		// M3 prefetch dst from L2
-	;;
-EX(.ex_handler_lcpy,	(p[A])	ld8 v[A] = [src_pre_mem], 128)		// M0 prefetch src from memory
-EK(.ex_handler_lcpy,	(p[C])	ld8 n8 = [src_pre_l2], 128)		// M1 prefetch src from L2
-EX(.ex_handler,	(p[D])	st8 [dst0] =  t1, 8)			// M2
-EK(.ex_handler,	(p[D])	st8 [dst1] =  t3, 8)			// M3
-	;;
-EX(.ex_handler,	(p[D])	ld8  t5 = [src0], 8)
-EK(.ex_handler,	(p[D])	ld8  t7 = [src1], 3*8)
-EX(.ex_handler,	(p[D])	st8 [dst0] =  t2, 3*8)
-EK(.ex_handler,	(p[D])	st8 [dst1] =  t4, 3*8)
-	;;
-EX(.ex_handler,	(p[D])	ld8  t6 = [src0], 3*8)
-EK(.ex_handler,	(p[D])	ld8 t10 = [src1], 8)
-EX(.ex_handler,	(p[D])	st8 [dst0] =  t5, 8)
-EK(.ex_handler,	(p[D])	st8 [dst1] =  t7, 3*8)
-	;;
-EX(.ex_handler,	(p[D])	ld8  t9 = [src0], 3*8)
-EK(.ex_handler,	(p[D])	ld8 t11 = [src1], 3*8)
-EX(.ex_handler,	(p[D])	st8 [dst0] =  t6, 3*8)
-EK(.ex_handler,	(p[D])	st8 [dst1] = t10, 8)
-	;;
-EX(.ex_handler,	(p[D])	ld8 t12 = [src0], 8)
-EK(.ex_handler,	(p[D])	ld8 t14 = [src1], 8)
-EX(.ex_handler,	(p[D])	st8 [dst0] =  t9, 3*8)
-EK(.ex_handler,	(p[D])	st8 [dst1] = t11, 3*8)
-	;;
-EX(.ex_handler,	(p[D])	ld8 t13 = [src0], 4*8)
-EK(.ex_handler,	(p[D])	ld8 t15 = [src1], 4*8)
-EX(.ex_handler,	(p[D])	st8 [dst0] = t12, 8)
-EK(.ex_handler,	(p[D])	st8 [dst1] = t14, 8)
-	;;
-EX(.ex_handler,	(p[C])	ld8  t1 = [src0], 8)
-EK(.ex_handler,	(p[C])	ld8  t3 = [src1], 8)
-EX(.ex_handler,	(p[D])	st8 [dst0] = t13, 4*8)
-EK(.ex_handler,	(p[D])	st8 [dst1] = t15, 4*8)
-	br.ctop.sptk .line_copy
-	;;
-
-	add dst0=-8,dst0
-	add src0=-8,src0
-	mov in2=tmp
-	.restore sp
-	br.sptk.many .medium_copy
-	;;
-
-#define BLOCK_SIZE	128*32
-#define blocksize	r23
-#define curlen		r24
-
-// dest is on 8-byte boundary, src is not. We need to do
-// ld8-ld8, shrp, then st8.  Max 8 byte copy per cycle.
-.unaligned_src:
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc	saved_pfs=ar.pfs,3,5,0,8
-	.save ar.lc, saved_lc
-	mov	saved_lc=ar.lc
-	.save pr, saved_pr
-	mov	saved_pr=pr
-	.body
-.4k_block:
-	mov	saved_in0=dst0	// need to save all input arguments
-	mov	saved_in2=in2
-	mov	blocksize=BLOCK_SIZE
-	;;
-	cmp.lt	p6,p7=blocksize,in2
-	mov	saved_in1=src0
-	;;
-(p6)	mov	in2=blocksize
-	;;
-	shr.u	r21=in2,7	// r21 = len / 128 (number of 128-byte lines)
-	shr.u	r22=in2,4	// number of 16-byte iteration
-	and	curlen=15,in2	// copy length after iteration
-	and	r30=7,src0	// source alignment
-	;;
-	cmp.lt	p7,p8=1,r21
-	add	cnt=-1,r21
-	;;
-
-	add	src_pre_mem=0,src0	// prefetch src pointer
-	add	dst_pre_mem=0,dst0	// prefetch dest pointer
-	and	src0=-8,src0		// 1st src pointer
-(p7)	mov	ar.lc = cnt
-(p8)	mov	ar.lc = r0
-	;;
-	TEXT_ALIGN(32)
-1:	lfetch.fault	  [src_pre_mem], 128
-	lfetch.fault.excl [dst_pre_mem], 128
-	br.cloop.dptk.few 1b
-	;;
-
-	shladd	dst1=r22,3,dst0	// 2nd dest pointer
-	shladd	src1=r22,3,src0	// 2nd src pointer
-	cmp.eq	p8,p9=r22,r0	// do we really need to loop?
-	cmp.le	p6,p7=8,curlen;	// have at least 8 byte remaining?
-	add	cnt=-1,r22	// ctop iteration adjustment
-	;;
-EX(.ex_handler, (p9)	ld8	r33=[src0],8)	// loop primer
-EK(.ex_handler, (p9)	ld8	r37=[src1],8)
-(p8)	br.dpnt.few .noloop
-	;;
-
-// The jump address is calculated based on src alignment. The COPYU
-// macro below needs to confine its size to a power of two, so an entry
-// can be calculated using shl instead of an expensive multiply. The
-// size is then hard coded by the following #define to match the
-// actual size.  This makes it somewhat tedious when the COPYU macro gets
-// changed and this needs to be adjusted to match.
-#define LOOP_SIZE 6
-1:
-	mov	r29=ip		// jmp_table thread
-	mov	ar.lc=cnt
-	;;
-	add	r29=.jump_table - 1b - (.jmp1-.jump_table), r29
-	shl	r28=r30, LOOP_SIZE	// jmp_table thread
-	mov	ar.ec=2		// loop setup
-	;;
-	add	r29=r29,r28		// jmp_table thread
-	cmp.eq	p16,p17=r0,r0
-	;;
-	mov	b6=r29			// jmp_table thread
-	;;
-	br.cond.sptk.few b6
-
-// for 8-15 byte case
-// We will skip the loop, but need to replicate the side effect
-// that the loop produces.
-.noloop:
-EX(.ex_handler, (p6)	ld8	r37=[src1],8)
-	add	src0=8,src0
-(p6)	shl	r25=r30,3
-	;;
-EX(.ex_handler, (p6)	ld8	r27=[src1])
-(p6)	shr.u	r28=r37,r25
-(p6)	sub	r26=64,r25
-	;;
-(p6)	shl	r27=r27,r26
-	;;
-(p6)	or	r21=r28,r27
-
-.unaligned_src_tail:
-/* check if we have more than blocksize to copy, if so go back */
-	cmp.gt	p8,p0=saved_in2,blocksize
-	;;
-(p8)	add	dst0=saved_in0,blocksize
-(p8)	add	src0=saved_in1,blocksize
-(p8)	sub	in2=saved_in2,blocksize
-(p8)	br.dpnt	.4k_block
-	;;
-
-/* we have up to 15 bytes to copy in the tail.
- * part of the work is already done in the jump table code
- * we are at the following state.
- * src side:
- * 
- *   xxxxxx xx                   <----- r21 has xxxxxxxx already
- * -------- -------- --------
- * 0        8        16
- *          ^
- *          |
- *          src1
- * 
- * dst
- * -------- -------- --------
- * ^
- * |
- * dst1
- */
-EX(.ex_handler, (p6)	st8	[dst1]=r21,8)	// more than 8 byte to copy
-(p6)	add	curlen=-8,curlen	// update length
-	mov	ar.pfs=saved_pfs
-	;;
-	mov	ar.lc=saved_lc
-	mov	pr=saved_pr,-1
-	mov	in2=curlen	// remaining length
-	mov	dst0=dst1	// dest pointer
-	add	src0=src1,r30	// forward by src alignment
-	;;
-
-// 7 byte or smaller.
-.memcpy_short:
-	cmp.le	p8,p9   = 1,in2
-	cmp.le	p10,p11 = 2,in2
-	cmp.le	p12,p13 = 3,in2
-	cmp.le	p14,p15 = 4,in2
-	add	src1=1,src0	// second src pointer
-	add	dst1=1,dst0	// second dest pointer
-	;;
-
-EX(.ex_handler_short, (p8)	ld1	t1=[src0],2)
-EK(.ex_handler_short, (p10)	ld1	t2=[src1],2)
-(p9)	br.ret.dpnt rp		// 0 byte copy
-	;;
-
-EX(.ex_handler_short, (p8)	st1	[dst0]=t1,2)
-EK(.ex_handler_short, (p10)	st1	[dst1]=t2,2)
-(p11)	br.ret.dpnt rp		// 1 byte copy
-
-EX(.ex_handler_short, (p12)	ld1	t3=[src0],2)
-EK(.ex_handler_short, (p14)	ld1	t4=[src1],2)
-(p13)	br.ret.dpnt rp		// 2 byte copy
-	;;
-
-	cmp.le	p6,p7   = 5,in2
-	cmp.le	p8,p9   = 6,in2
-	cmp.le	p10,p11 = 7,in2
-
-EX(.ex_handler_short, (p12)	st1	[dst0]=t3,2)
-EK(.ex_handler_short, (p14)	st1	[dst1]=t4,2)
-(p15)	br.ret.dpnt rp		// 3 byte copy
-	;;
-
-EX(.ex_handler_short, (p6)	ld1	t5=[src0],2)
-EK(.ex_handler_short, (p8)	ld1	t6=[src1],2)
-(p7)	br.ret.dpnt rp		// 4 byte copy
-	;;
-
-EX(.ex_handler_short, (p6)	st1	[dst0]=t5,2)
-EK(.ex_handler_short, (p8)	st1	[dst1]=t6,2)
-(p9)	br.ret.dptk rp		// 5 byte copy
-
-EX(.ex_handler_short, (p10)	ld1	t7=[src0],2)
-(p11)	br.ret.dptk rp		// 6 byte copy
-	;;
-
-EX(.ex_handler_short, (p10)	st1	[dst0]=t7,2)
-	br.ret.dptk rp		// done all cases
-
-
-/* Align dest to nearest 8-byte boundary. We know we have at
- * least 7 bytes to copy, enough to crawl to 8-byte boundary.
- * Actual number of bytes to crawl depends on the dest alignment.
- * 7 bytes or less is taken care of at .memcpy_short
-
- * src0 - source even index
- * src1 - source  odd index
- * dst0 - dest even index
- * dst1 - dest  odd index
- * r30  - distance to 8-byte boundary
- */
-
-.align_dest:
-	add	src1=1,in1	// source odd index
-	cmp.le	p7,p0 = 2,r30	// for .align_dest
-	cmp.le	p8,p0 = 3,r30	// for .align_dest
-EX(.ex_handler_short, (p6)	ld1	t1=[src0],2)
-	cmp.le	p9,p0 = 4,r30	// for .align_dest
-	cmp.le	p10,p0 = 5,r30
-	;;
-EX(.ex_handler_short, (p7)	ld1	t2=[src1],2)
-EK(.ex_handler_short, (p8)	ld1	t3=[src0],2)
-	cmp.le	p11,p0 = 6,r30
-EX(.ex_handler_short, (p6)	st1	[dst0] = t1,2)
-	cmp.le	p12,p0 = 7,r30
-	;;
-EX(.ex_handler_short, (p9)	ld1	t4=[src1],2)
-EK(.ex_handler_short, (p10)	ld1	t5=[src0],2)
-EX(.ex_handler_short, (p7)	st1	[dst1] = t2,2)
-EK(.ex_handler_short, (p8)	st1	[dst0] = t3,2)
-	;;
-EX(.ex_handler_short, (p11)	ld1	t6=[src1],2)
-EK(.ex_handler_short, (p12)	ld1	t7=[src0],2)
-	cmp.eq	p6,p7=r28,r29	// p6: src and dest share the same alignment, p7: they differ
-EX(.ex_handler_short, (p9)	st1	[dst1] = t4,2)
-EK(.ex_handler_short, (p10)	st1	[dst0] = t5,2)
-	sub	in2=in2,r30	// len -= bytes consumed while aligning dest
-	;;
-EX(.ex_handler_short, (p11)	st1	[dst1] = t6,2)
-EK(.ex_handler_short, (p12)	st1	[dst0] = t7)
-	add	dst0=in0,r30	// setup arguments
-	add	src0=in1,r30
-(p6)	br.cond.dptk .aligned_src
-(p7)	br.cond.dpnt .unaligned_src
-	;;
-
-/* main loop body in jump table format */
-#define COPYU(shift)									\
-1:											\
-EX(.ex_handler,  (p16)	ld8	r32=[src0],8);		/* 1 */				\
-EK(.ex_handler,  (p16)	ld8	r36=[src1],8);						\
-		 (p17)	shrp	r35=r33,r34,shift;;	/* 1 */				\
-EX(.ex_handler,  (p6)	ld8	r22=[src1]);	/* common, prime for tail section */	\
-		 nop.m	0;								\
-		 (p16)	shrp	r38=r36,r37,shift;					\
-EX(.ex_handler,  (p17)	st8	[dst0]=r35,8);		/* 1 */				\
-EK(.ex_handler,  (p17)	st8	[dst1]=r39,8);						\
-		 br.ctop.dptk.few 1b;;							\
-		 (p7)	add	src1=-8,src1;	/* back out for <8 byte case */		\
-		 shrp	r21=r22,r38,shift;	/* speculative work */			\
-		 br.sptk.few .unaligned_src_tail /* branch out of jump table */		\
-		 ;;
-	TEXT_ALIGN(32)
-.jump_table:
-	COPYU(8)	// unaligned cases
-.jmp1:
-	COPYU(16)
-	COPYU(24)
-	COPYU(32)
-	COPYU(40)
-	COPYU(48)
-	COPYU(56)
-
-#undef A
-#undef B
-#undef C
-#undef D
-
-/*
- * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
- * instruction failed in the bundle.  The exception algorithm is that we
- * first figure out the faulting address, then detect if there is any
- * progress made on the copy, if so, redo the copy from last known copied
- * location up to the faulting address (exclusive). In the copy_from_user
- * case, remaining bytes in the kernel buffer will be zeroed.
- *
- * Take copy_from_user as an example, in the code there are multiple loads
- * in a bundle and those multiple loads could span over two pages, the
- * faulting address is calculated as page_round_down(max(src0, src1)).
- * This is based on knowledge that if we can access one byte in a page, we
- * can access any byte in that page.
- *
- * predicate used in the exception handler:
- * p6-p7: direction
- * p10-p11: src faulting addr calculation
- * p12-p13: dst faulting addr calculation
- */
-
-#define A	r19
-#define B	r20
-#define C	r21
-#define D	r22
-#define F	r28
-
-#define saved_retval	loc0
-#define saved_rtlink	loc1
-#define saved_pfs_stack	loc2
-
-.ex_hndlr_s:
-	add	src0=8,src0
-	br.sptk .ex_handler
-	;;
-.ex_hndlr_d:
-	add	dst0=8,dst0
-	br.sptk .ex_handler
-	;;
-.ex_hndlr_lcpy_1:
-	mov	src1=src_pre_mem
-	mov	dst1=dst_pre_mem
-	cmp.gtu	p10,p11=src_pre_mem,saved_in1
-	cmp.gtu	p12,p13=dst_pre_mem,saved_in0
-	;;
-(p10)	add	src0=8,saved_in1
-(p11)	mov	src0=saved_in1
-(p12)	add	dst0=8,saved_in0
-(p13)	mov	dst0=saved_in0
-	br.sptk	.ex_handler
-.ex_handler_lcpy:
-	// in line_copy block, the preload addresses should always be ahead
-	// of the other two src/dst pointers.  Furthermore, src1/dst1 should
-	// always be ahead of src0/dst0.
-	mov	src1=src_pre_mem
-	mov	dst1=dst_pre_mem
-.ex_handler:
-	mov	pr=saved_pr,-1		// first restore pr, lc, and pfs
-	mov	ar.lc=saved_lc
-	mov	ar.pfs=saved_pfs
-	;;
-.ex_handler_short: // faults occurring in these sections didn't change pr, lc, pfs
-	cmp.ltu	p6,p7=saved_in0, saved_in1	// get the copy direction
-	cmp.ltu	p10,p11=src0,src1
-	cmp.ltu	p12,p13=dst0,dst1
-	fcmp.eq	p8,p0=f6,f0		// is it memcpy?
-	mov	tmp = dst0
-	;;
-(p11)	mov	src1 = src0		// pick the larger of the two
-(p13)	mov	dst0 = dst1		// make dst0 the smaller one
-(p13)	mov	dst1 = tmp		// and dst1 the larger one
-	;;
-(p6)	dep	F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary
-(p7)	dep	F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary
-	;;
-(p6)	cmp.le	p14,p0=dst0,saved_in0	// no progress has been made on store
-(p7)	cmp.le	p14,p0=src0,saved_in1	// no progress has been made on load
-	mov	retval=saved_in2
-(p8)	ld1	tmp=[src1]		// force an oops for memcpy call
-(p8)	st1	[dst1]=r0		// force an oops for memcpy call
-(p14)	br.ret.sptk.many rp
-
-/*
- * The remaining bytes to copy are calculated as:
- *
- * A =	(faulting_addr - orig_src)	-> len to faulting ld address
- *	or 
- * 	(faulting_addr - orig_dst)	-> len to faulting st address
- * B =	(cur_dst - orig_dst)		-> len copied so far
- * C =	A - B				-> len that still needs to be copied
- * D =	orig_len - A			-> len that needs to be left alone
- */
-(p6)	sub	A = F, saved_in0
-(p7)	sub	A = F, saved_in1
-	clrrrb
-	;;
-	alloc	saved_pfs_stack=ar.pfs,3,3,3,0
-	cmp.lt	p8,p0=A,r0
-	sub	B = dst0, saved_in0	// how many byte copied so far
-	;;
-(p8)	mov	A = 0;			// A shouldn't be negative, cap it
-	;;
-	sub	C = A, B
-	sub	D = saved_in2, A
-	;;
-	cmp.gt	p8,p0=C,r0		// at least 1 byte still to copy?
-	mov	r8=0
-	mov	saved_retval = D
-	mov	saved_rtlink = b0
-
-	add	out0=saved_in0, B
-	add	out1=saved_in1, B
-	mov	out2=C
-(p8)	br.call.sptk.few b0=__copy_user	// recursive call
-	;;
-
-	add	saved_retval=saved_retval,r8	// above might return non-zero value
-	;;
-
-	mov	retval=saved_retval
-	mov	ar.pfs=saved_pfs_stack
-	mov	b0=saved_rtlink
-	br.ret.sptk.many rp
-
-/* end of McKinley specific optimization */
-END(__copy_user)
-EXPORT_SYMBOL(__copy_user)
diff --git a/arch/ia64/lib/memset.S b/arch/ia64/lib/memset.S
deleted file mode 100644
index 07a8b92c6496..000000000000
--- a/arch/ia64/lib/memset.S
+++ /dev/null
@@ -1,365 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Optimized version of the standard memset() function.
-
-   Copyright (c) 2002 Hewlett-Packard Co/CERN
-	Sverre Jarp <Sverre.Jarp@cern.ch>
-
-   Return: dest
-
-   Inputs:
-        in0:    dest
-        in1:    value
-        in2:    count
-
-   The algorithm is fairly straightforward: set byte by byte until
-   we get to a 16B-aligned address, then loop on 128 B chunks using an
-   early store as prefetching, then loop on 32B chunks, then clear remaining
-   words, finally clear remaining bytes.
-   Since a stf.spill f0 can store 16B in one go, we use this instruction
-   to get peak speed when value = 0.  */
-
-#include <asm/asmmacro.h>
-#include <asm/export.h>
-#undef ret
-
-#define dest		in0
-#define value		in1
-#define	cnt		in2
-
-#define tmp		r31
-#define save_lc		r30
-#define ptr0		r29
-#define ptr1		r28
-#define ptr2		r27
-#define ptr3		r26
-#define ptr9 		r24
-#define	loopcnt		r23
-#define linecnt		r22
-#define bytecnt		r21
-
-#define fvalue		f6
-
-// This routine uses only scratch predicate registers (p6 - p15)
-#define p_scr		p6			// default register for same-cycle branches
-#define p_nz		p7
-#define p_zr		p8
-#define p_unalgn	p9
-#define p_y		p11
-#define p_n		p12
-#define p_yy		p13
-#define p_nn		p14
-
-#define MIN1		15
-#define MIN1P1HALF	8
-#define LINE_SIZE	128
-#define LSIZE_SH        7			// shift amount
-#define PREF_AHEAD	8
-
-GLOBAL_ENTRY(memset)
-{ .mmi
-	.prologue
-	alloc	tmp = ar.pfs, 3, 0, 0, 0
-	lfetch.nt1 [dest]			// prefetch the first destination line
-	.save   ar.lc, save_lc
-	mov.i	save_lc = ar.lc
-	.body
-} { .mmi
-	mov	ret0 = dest			// return value
-	cmp.ne	p_nz, p_zr = value, r0		// use stf.spill if value is zero
-	cmp.eq	p_scr, p0 = cnt, r0
-;; }
-{ .mmi
-	and	ptr2 = -(MIN1+1), dest		// aligned address
-	and	tmp = MIN1, dest		// prepare to check for correct alignment
-	tbit.nz p_y, p_n = dest, 0		// Do we have an odd address? (M_B_U)
-} { .mib
-	mov	ptr1 = dest
-	mux1	value = value, @brcst		// create 8 identical bytes in word
-(p_scr)	br.ret.dpnt.many rp			// return immediately if count = 0
-;; }
-{ .mib
-	cmp.ne	p_unalgn, p0 = tmp, r0		// is dest off a 16B boundary ?
-} { .mib
-	sub	bytecnt = (MIN1+1), tmp		// NB: # of bytes to move is 1 higher than loopcnt
-	cmp.gt	p_scr, p0 = 16, cnt		// is it a minimalistic task?
-(p_scr)	br.cond.dptk.many .move_bytes_unaligned	// go move just a few (M_B_U)
-;; }
-{ .mmi
-(p_unalgn) add	ptr1 = (MIN1+1), ptr2		// after alignment
-(p_unalgn) add	ptr2 = MIN1P1HALF, ptr2		// after alignment
-(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3	// should we do a st8 ?
-;; }
-{ .mib
-(p_y)	add	cnt = -8, cnt			// account for the st8 below
-(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2	// should we do a st4 ?
-} { .mib
-(p_y)	st8	[ptr2] = value,-4		// store 8 bytes, then step ptr2 back by 4
-(p_n)	add	ptr2 = 4, ptr2			// no st8: step ptr2 forward by 4 instead
-;; }
-{ .mib
-(p_yy)	add	cnt = -4, cnt			// account for the st4 below
-(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1	// should we do a st2 ?
-} { .mib
-(p_yy)	st4	[ptr2] = value,-2		// store 4 bytes, then step ptr2 back by 2
-(p_nn)	add	ptr2 = 2, ptr2			// no st4: step ptr2 forward by 2 instead
-;; }
-{ .mmi
-	mov	tmp = LINE_SIZE+1		// for compare
-(p_y)	add	cnt = -2, cnt			// account for the st2 below
-(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0	// should we do a st1 ?
-} { .mmi
-	setf.sig fvalue=value			// transfer value to FLP side
-(p_y)	st2	[ptr2] = value,-1		// store 2 bytes, then step ptr2 back by 1
-(p_n)	add	ptr2 = 1, ptr2			// no st2: step ptr2 forward by 1 instead
-;; }
-
-{ .mmi
-(p_yy)	st1	[ptr2] = value 			// store the single leading byte
-  	cmp.gt	p_scr, p0 = tmp, cnt		// is it a minimalistic task?
-} { .mbb
-(p_yy)	add	cnt = -1, cnt			// account for the st1
-(p_scr)	br.cond.dpnt.many .fraction_of_line	// go move just a few
-;; }
-
-{ .mib
-	nop.m 0
-	shr.u	linecnt = cnt, LSIZE_SH		// number of whole LINE_SIZE chunks
-(p_zr)	br.cond.dptk.many .l1b			// Jump to use stf.spill
-;; }
-
-	TEXT_ALIGN(32) // --------------------- //  L1A: store ahead into cache lines; fill later
-{ .mmi
-	and	tmp = -(LINE_SIZE), cnt		// compute end of range
-	mov	ptr9 = ptr1			// used for prefetching
-	and	cnt = (LINE_SIZE-1), cnt	// remainder
-} { .mmi
-	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
-	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
-;; }
-{ .mmi
-(p_scr)	add	loopcnt = -1, linecnt		// fewer than PREF_AHEAD lines: use linecnt
-	add	ptr2 = 8, ptr1			// start of stores (beyond prefetch stores)
-	add	ptr1 = tmp, ptr1		// first address beyond total range
-;; }
-{ .mmi
-	add	tmp = -1, linecnt		// next loop count
-	mov.i	ar.lc = loopcnt			// trip count for the store-ahead loop
-;; }
-.pref_l1a:
-{ .mib
-	stf8 [ptr9] = fvalue, 128		// Do stores one cache line apart
-	nop.i	0
-	br.cloop.dptk.few .pref_l1a
-;; }
-{ .mmi
-	add	ptr0 = 16, ptr2			// Two stores in parallel
-	mov.i	ar.lc = tmp			// trip count for the fill loop
-;; }
-.l1ax:
- { .mmi
-	stf8 [ptr2] = fvalue, 8
-	stf8 [ptr0] = fvalue, 8
- ;; }
- { .mmi
-	stf8 [ptr2] = fvalue, 24
-	stf8 [ptr0] = fvalue, 24
- ;; }
- { .mmi
-	stf8 [ptr2] = fvalue, 8
-	stf8 [ptr0] = fvalue, 8
- ;; }
- { .mmi
-	stf8 [ptr2] = fvalue, 24
-	stf8 [ptr0] = fvalue, 24
- ;; }
- { .mmi
-	stf8 [ptr2] = fvalue, 8
-	stf8 [ptr0] = fvalue, 8
- ;; }
- { .mmi
-	stf8 [ptr2] = fvalue, 24
-	stf8 [ptr0] = fvalue, 24
- ;; }
- { .mmi
-	stf8 [ptr2] = fvalue, 8
-	stf8 [ptr0] = fvalue, 32
- 	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
- ;; }
-{ .mmb
-	stf8 [ptr2] = fvalue, 24
-(p_scr)	stf8 [ptr9] = fvalue, 128		// store ahead into the next line
-	br.cloop.dptk.few .l1ax
-;; }
-{ .mbb
-	cmp.le  p_scr, p0 = 8, cnt		// 8 or more bytes left ?
-(p_scr) br.cond.dpnt.many  .fraction_of_line	// Branch no. 2
-	br.cond.dpnt.many  .move_bytes_from_alignment	// Branch no. 3
-;; }
-
-	TEXT_ALIGN(32)
-.l1b:	// ------------------------------------ //  L1B: store ahead into cache lines; fill later
-{ .mmi
-	and	tmp = -(LINE_SIZE), cnt		// compute end of range
-	mov	ptr9 = ptr1			// used for prefetching
-	and	cnt = (LINE_SIZE-1), cnt	// remainder
-} { .mmi
-	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
-	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
-;; }
-{ .mmi
-(p_scr)	add	loopcnt = -1, linecnt		// fewer than PREF_AHEAD lines: use linecnt
-	add	ptr2 = 16, ptr1			// start of stores (beyond prefetch stores)
-	add	ptr1 = tmp, ptr1		// first address beyond total range
-;; }
-{ .mmi
-	add	tmp = -1, linecnt		// next loop count
-	mov.i	ar.lc = loopcnt			// trip count for the store-ahead loop
-;; }
-.pref_l1b:
-{ .mib
-	stf.spill [ptr9] = f0, 128		// Do stores one cache line apart
-	nop.i   0
-	br.cloop.dptk.few .pref_l1b
-;; }
-{ .mmi
-	add	ptr0 = 16, ptr2			// Two stores in parallel
-	mov.i	ar.lc = tmp			// trip count for the fill loop
-;; }
-.l1bx:
- { .mmi
-	stf.spill [ptr2] = f0, 32
-	stf.spill [ptr0] = f0, 32
- ;; }
- { .mmi
-	stf.spill [ptr2] = f0, 32
-	stf.spill [ptr0] = f0, 32
- ;; }
- { .mmi
-	stf.spill [ptr2] = f0, 32
-	stf.spill [ptr0] = f0, 64
- 	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
- ;; }
-{ .mmb
-	stf.spill [ptr2] = f0, 32
-(p_scr)	stf.spill [ptr9] = f0, 128		// store ahead into the next line
-	br.cloop.dptk.few .l1bx
-;; }
-{ .mib
-	cmp.gt  p_scr, p0 = 8, cnt		// just a few bytes left ?
-(p_scr)	br.cond.dpnt.many  .move_bytes_from_alignment	// yes: finish with st4/st2/st1
-;; }
-
-.fraction_of_line:
-{ .mib
-	add	ptr2 = 16, ptr1			// second store stream, 16B ahead of ptr1
-	shr.u	loopcnt = cnt, 5   		// loopcnt = cnt / 32
-;; }
-{ .mib
-	cmp.eq	p_scr, p0 = loopcnt, r0		// no whole 32B chunk ?
-	add	loopcnt = -1, loopcnt		// br.cloop iterates ar.lc + 1 times
-(p_scr)	br.cond.dpnt.many .store_words
-;; }
-{ .mib
-	and	cnt = 0x1f, cnt			// compute the remaining cnt
-	mov.i   ar.lc = loopcnt
-;; }
-	TEXT_ALIGN(32)
-.l2:	// ------------------------------------ //  L2A:  store 32B in 2 cycles
-{ .mmb
-	stf8	[ptr1] = fvalue, 8
-	stf8	[ptr2] = fvalue, 8
-;; } { .mmb
-	stf8	[ptr1] = fvalue, 24
-	stf8	[ptr2] = fvalue, 24
-	br.cloop.dptk.many .l2
-;; }
-.store_words:
-{ .mib
-	cmp.gt	p_scr, p0 = 8, cnt		// just a few bytes left ?
-(p_scr)	br.cond.dpnt.many .move_bytes_from_alignment	// Branch
-;; }
-
-{ .mmi
-	stf8	[ptr1] = fvalue, 8		// store
-	cmp.le	p_y, p_n = 16, cnt		// room for a second 8B word ?
-	add	cnt = -8, cnt			// subtract
-;; }
-{ .mmi
-(p_y)	stf8	[ptr1] = fvalue, 8		// store
-(p_y)	cmp.le.unc p_yy, p_nn = 16, cnt		// room for a third 8B word ?
-(p_y)	add	cnt = -8, cnt			// subtract
-;; }
-{ .mmi						// store
-(p_yy)	stf8	[ptr1] = fvalue, 8
-(p_yy)	add	cnt = -8, cnt			// subtract
-;; }
-
-.move_bytes_from_alignment:
-{ .mib
-	cmp.eq	p_scr, p0 = cnt, r0		// nothing left ?
-	tbit.nz.unc p_y, p0 = cnt, 2		// should we terminate with a st4 ?
-(p_scr)	br.cond.dpnt.few .restore_and_exit
-;; }
-{ .mib
-(p_y)	st4	[ptr1] = value,4
-	tbit.nz.unc p_yy, p0 = cnt, 1		// should we terminate with a st2 ?
-;; }
-{ .mib
-(p_yy)	st2	[ptr1] = value,2
-	tbit.nz.unc p_y, p0 = cnt, 0		// should we terminate with a st1 ?
-;; }
-
-{ .mib
-(p_y)	st1	[ptr1] = value
-;; }
-.restore_and_exit:
-{ .mib
-	nop.m	0
-	mov.i	ar.lc = save_lc			// put back the caller's loop counter
-	br.ret.sptk.many rp
-;; }
-
-.move_bytes_unaligned:
-{ .mmi
-       .pred.rel "mutex",p_y, p_n
-       .pred.rel "mutex",p_yy, p_nn
-(p_n)	cmp.le  p_yy, p_nn = 4, cnt		// even address: 4 or more bytes to set ?
-(p_y)	cmp.le  p_yy, p_nn = 5, cnt		// odd address: 4 or more after the lead byte ?
-(p_n)	add	ptr2 = 2, ptr1
-} { .mmi
-(p_y)	add	ptr2 = 3, ptr1
-(p_y)	st1	[ptr1] = value, 1		// fill 1 (odd-aligned) byte [15, 14 (or less) left]
-(p_y)	add	cnt = -1, cnt
-;; }
-{ .mmi
-(p_yy)	cmp.le.unc p_y, p0 = 8, cnt
-	add	ptr3 = ptr1, cnt		// prepare last store
-	mov.i	ar.lc = save_lc
-} { .mmi
-(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
-(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [11, 10 (or less) left]
-(p_yy)	add	cnt = -4, cnt
-;; }
-{ .mmi
-(p_y)	cmp.le.unc p_yy, p0 = 8, cnt
-	add	ptr3 = -1, ptr3			// last store
-	tbit.nz p_scr, p0 = cnt, 1		// will there be a st2 at the end ?
-} { .mmi
-(p_y)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
-(p_y)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [7, 6 (or less) left]
-(p_y)	add	cnt = -4, cnt
-;; }
-{ .mmi
-(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
-(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [3, 2 (or less) left]
-	tbit.nz p_y, p0 = cnt, 0		// will there be a st1 at the end ?
-} { .mmi
-(p_yy)	add	cnt = -4, cnt
-;; }
-{ .mmb
-(p_scr)	st2	[ptr1] = value			// fill 2 (aligned) bytes
-(p_y)	st1	[ptr3] = value			// fill last byte (using ptr3)
-	br.ret.sptk.many rp
-}
-END(memset)
-EXPORT_SYMBOL(memset)
diff --git a/arch/ia64/lib/strlen.S b/arch/ia64/lib/strlen.S
deleted file mode 100644
index d66de5966974..000000000000
--- a/arch/ia64/lib/strlen.S
+++ /dev/null
@@ -1,195 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard strlen() function
- *
- *
- * Inputs:
- *	in0	address of string
- *
- * Outputs:
- *	ret0	the number of characters in the string (0 if empty string)
- *	does not count the \0
- *
- * Copyright (C) 1999, 2001 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *
- * 09/24/99 S.Eranian add speculation recovery code
- */
-
-#include <asm/asmmacro.h>
-#include <asm/export.h>
-
-//
-//
-// This is an enhanced version of the basic strlen. it includes a combination
-// of compute zero index (czx), parallel comparisons, speculative loads and
-// loop unroll using rotating registers.
-//
-// General Ideas about the algorithm:
-//	  The goal is to look at the string in chunks of 8 bytes.
-//	  so we need to do a few extra checks at the beginning because the
-//	  string may not be 8-byte aligned. In this case we load the 8byte
-//	  quantity which includes the start of the string and mask the unused
-//	  bytes with 0xff to avoid confusing czx.
-//	  We use speculative loads and software pipelining to hide memory
-//	  latency and do read ahead safely. This way we defer any exception.
-//
-//	  Because we don't want the kernel to be relying on particular
-//	  settings of the DCR register, we provide recovery code in case
-//	  speculation fails. The recovery code is going to "redo" the work using
-//	  only normal loads. If we still get a fault then we generate a
-//	  kernel panic. Otherwise we return the strlen as usual.
-//
-//	  The fact that speculation may fail can be caused, for instance, by
-//	  the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
-//	  a NaT bit will be set if the translation is not present. The normal
-//	  load, on the other hand, will cause the translation to be inserted
-//	  if the mapping exists.
-//
-//	  It should be noted that we execute recovery code only when we need
-//	  to use the data that has been speculatively loaded: we don't execute
-//	  recovery code on pure read ahead data.
-//
-// Remarks:
-//	- the cmp r0,r0 is used as a fast way to initialize a predicate
-//	  register to 1. This is required to make sure that we get the parallel
-//	  compare correct.
-//
-//	- we don't use the epilogue counter to exit the loop but we need to set
-//	  it to zero beforehand.
-//
-//	- after the loop we must test for Nat values because neither the
-//	  czx nor cmp instruction raise a NaT consumption fault. We must be
-//	  careful not to look too far for a Nat for which we don't care.
-//	  For instance we don't need to look at a NaT in val2 if the zero byte
-//	  was in val1.
-//
-//	- Clearly performance tuning is required.
-//
-//
-//
-#define saved_pfs	r11
-#define	tmp		r10
-#define base		r16
-#define orig		r17
-#define saved_pr	r18
-#define src		r19
-#define mask		r20
-#define val		r21
-#define val1		r22
-#define val2		r23
-
-GLOBAL_ENTRY(strlen)
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8
-
-	.rotr v[2], w[2]	// declares our 4 aliases
-
-	extr.u tmp=in0,0,3	// tmp=least significant 3 bits
-	mov orig=in0		// keep track of initial byte address
-	dep src=0,in0,0,3	// src=8byte-aligned in0 address
-	.save pr, saved_pr
-	mov saved_pr=pr		// preserve predicates (rotation)
-	;;
-
-	.body
-
-	ld8 v[1]=[src],8	// must not speculate: can fail here
-	shl tmp=tmp,3		// multiply by 8bits/byte
-	mov mask=-1		// our mask
-	;;
-	ld8.s w[1]=[src],8	// speculatively load next
-	cmp.eq p6,p0=r0,r0	// sets p6 to true for cmp.and
-	sub tmp=64,tmp		// how many bits to shift our mask on the right
-	;;
-	shr.u	mask=mask,tmp	// zero enough bits to hold v[1] valuable part
-	mov ar.ec=r0		// clear epilogue counter (saved in ar.pfs)
-	;;
-	add base=-16,src	// keep track of aligned base
-	or v[1]=v[1],mask	// now we have a safe initial byte pattern
-	;;
-1:
-	ld8.s v[0]=[src],8	// speculatively load next
-	czx1.r val1=v[1]	// search 0 byte from right
-	czx1.r val2=w[1]	// search 0 byte from right following 8bytes
-	;;
-	ld8.s w[0]=[src],8	// speculatively load next to next
-	cmp.eq.and p6,p0=8,val1	// p6 = p6 and val1==8
-	cmp.eq.and p6,p0=8,val2	// p6 = p6 and val2==8
-(p6)	br.wtop.dptk 1b		// loop until p6 == 0
-	;;
-	//
-	// We must try the recovery code iff
-	// val1_is_nat || (val1==8 && val2_is_nat)
-	//
-	// XXX Fixme
-	//	- there must be a better way of doing the test
-	//
-	cmp.eq  p8,p9=8,val1	// p8 = no zero in val1, p9 = val1 had zero (disambiguate)
-	tnat.nz p6,p7=val1	// test NaT on val1
-(p6)	br.cond.spnt .recover	// jump to recovery if val1 is NaT
-	;;
-	//
-	// if we come here p7 is true, i.e., initialized for // cmp
-	//
-	cmp.eq.and  p7,p0=8,val1// val1==8?
-	tnat.nz.and p7,p0=val2	// test NaT of val2
-(p7)	br.cond.spnt .recover	// jump to recovery if val2 is NaT
-	;;
-(p8)	mov val1=val2		// the other test got us out of the loop
-(p8)	adds src=-16,src	// correct position when 3 ahead
-(p9)	adds src=-24,src	// correct position when 4 ahead
-	;;
-	sub ret0=src,orig	// distance from base
-	sub tmp=8,val1		// which byte in word
-	mov pr=saved_pr,0xffffffffffff0000	// restore only p16-p63 (the rotating predicates)
-	;;
-	sub ret0=ret0,tmp	// adjust
-	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
-	br.ret.sptk.many rp	// end of normal execution
-
-	//
-	// Outlined recovery code when speculation failed
-	//
-	// This time we don't use speculation and rely on the normal exception
-	// mechanism. That's why the loop is not as good as the previous one:
-	// read ahead is not possible
-	//
-	// IMPORTANT:
-	// Please note that in the case of strlen() as opposed to strlen_user()
-	// we don't use the exception mechanism, as this function is not
-	// supposed to fail. If that happens it means we have a bug and the
-	// code will cause a kernel fault.
-	//
-	// XXX Fixme
-	//	- today we restart from the beginning of the string instead
-	//	  of trying to continue where we left off.
-	//
-.recover:
-	ld8 val=[base],8	// will fail if unrecoverable fault
-	;;
-	or val=val,mask		// remask first bytes
-	cmp.eq p0,p6=r0,r0	// nullify first ld8 in loop
-	;;
-	//
-	// ar.ec is still zero here
-	//
-2:
-(p6)	ld8 val=[base],8	// will fail if unrecoverable fault
-	;;
-	czx1.r val1=val		// search 0 byte from right
-	;;
-	cmp.eq p6,p0=8,val1	// val1==8 ?
-(p6)	br.wtop.dptk 2b		// loop until p6 == 0
-	;;			// (avoid WAW on p63)
-	sub ret0=base,orig	// distance from base
-	sub tmp=8,val1		// which byte in word
-	mov pr=saved_pr,0xffffffffffff0000	// restore only p16-p63 (the rotating predicates)
-	;;
-	sub ret0=ret0,tmp	// length=now - back -1
-	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
-	br.ret.sptk.many rp	// end of successful recovery code
-END(strlen)
-EXPORT_SYMBOL(strlen)
diff --git a/arch/ia64/lib/strncpy_from_user.S b/arch/ia64/lib/strncpy_from_user.S
deleted file mode 100644
index 49eb81b69cd2..000000000000
--- a/arch/ia64/lib/strncpy_from_user.S
+++ /dev/null
@@ -1,47 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Just like strncpy() except that if a fault occurs during copying,
- * -EFAULT is returned.
- *
- * Inputs:
- *	in0:	address of destination buffer
- *	in1:	address of string to be copied
- *	in2:	length of buffer in bytes
- * Outputs:
- *	r8:	-EFAULT in case of fault or number of bytes copied if no fault
- *
- * Copyright (C) 1998-2001 Hewlett-Packard Co
- * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 00/03/06 D. Mosberger Fixed to return proper return value (bug found by
- *			 by Andreas Schwab <schwab@suse.de>).
- */
-
-#include <asm/asmmacro.h>
-#include <asm/export.h>
-
-GLOBAL_ENTRY(__strncpy_from_user)
-	alloc r2=ar.pfs,3,0,0,0	// 3 inputs, no locals/outputs/rotating registers
-	mov r8=0		// default return value: nothing copied
-	mov r9=in1		// remember where the source string starts
-	;;
-	add r10=in1,in2		// r10 = source address at which the buffer is full
-	cmp.eq p6,p0=r0,in2	// zero-length buffer ?
-(p6)	br.ret.spnt.many rp	// yes: return 0 without touching memory
-
-	// XXX braindead copy loop---this needs to be optimized
-.Loop1:
-	EX(.Lexit, ld1 r8=[in1],1)	// fetch next source byte, advance in1; fault -> .Lexit
-	;;
-	EX(.Lexit, st1 [in0]=r8,1)	// store it, advance in0; fault -> .Lexit
-	cmp.ne p6,p7=r8,r0	// p6 = byte was non-NUL, p7 = byte was NUL
-	;;
-(p6)	cmp.ne.unc p8,p0=in1,r10	// p8 = non-NUL byte and buffer not yet full
-(p8)	br.cond.dpnt.few .Loop1	// keep copying
-	;;
-(p6)	mov r8=in2		// buffer filled up---return buffer length
-(p7)	sub r8=in1,r9,1		// return string length (excluding NUL character)
-[.Lexit:]			// also the EX() fixup target; r8 is presumably set to -EFAULT by the fault handler (see header) - confirm
-	br.ret.sptk.many rp
-END(__strncpy_from_user)
-EXPORT_SYMBOL(__strncpy_from_user)
diff --git a/arch/ia64/lib/strnlen_user.S b/arch/ia64/lib/strnlen_user.S
deleted file mode 100644
index 4b684d4da106..000000000000
--- a/arch/ia64/lib/strnlen_user.S
+++ /dev/null
@@ -1,48 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Returns 0 if an exception occurs before finding the NUL or reaching the
- * supplied limit (N), a value greater than N if the string is longer than
- * the limit, else strlen + 1 (the count includes the terminating NUL).
- *
- * Inputs:
- *	in0:	address of buffer
- *	in1:	string length limit N
- * Outputs:
- *	r8:	0 in case of fault, strlen(buffer)+1 otherwise
- *
- * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <asm/asmmacro.h>
-#include <asm/export.h>
-
-GLOBAL_ENTRY(__strnlen_user)
-	.prologue
-	alloc r2=ar.pfs,2,0,0,0		// 2 inputs, no locals/outputs/rotating registers
-	.save ar.lc, r16
-	mov r16=ar.lc			// preserve ar.lc
-
-	.body
-
-	add r3=-1,in1			// N-1: br.cloop iterates ar.lc+1 times; NOTE(review): N == 0 would load -1 - confirm callers never pass 0
-	;;
-	mov ar.lc=r3			// examine at most N bytes
-	mov r9=0			// r9 = number of bytes examined so far
-	;;
-	// XXX braindead strlen loop---this needs to be optimized
-.Loop1:
-	EXCLR(.Lexit, ld1 r8=[in0],1)	// fetch next byte, advance in0; fault -> .Lexit (EXCLR presumably also zeroes r9 for the documented 0 return - confirm)
-	add r9=1,r9			// count this byte (the NUL gets counted too)
-	;;
-	cmp.eq p6,p0=r8,r0		// was it the terminating NUL ?
-(p6)	br.cond.dpnt .Lexit		// yes: return the count
-	br.cloop.dptk.few .Loop1	// no: continue while ar.lc != 0
-
-	add r9=1,in1			// NUL not found---return N+1
-	;;
-.Lexit:
-	mov r8=r9			// return value
-	mov ar.lc=r16			// restore ar.lc
-	br.ret.sptk.many rp
-END(__strnlen_user)
-EXPORT_SYMBOL(__strnlen_user)
diff --git a/arch/ia64/lib/xor.S b/arch/ia64/lib/xor.S
deleted file mode 100644
index 5413dafe6b2e..000000000000
--- a/arch/ia64/lib/xor.S
+++ /dev/null
@@ -1,181 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * arch/ia64/lib/xor.S
- *
- * Optimized RAID-5 checksumming functions for IA-64.
- */
-
-#include <asm/asmmacro.h>
-#include <asm/export.h>
-
-GLOBAL_ENTRY(xor_ia64_2)
-	.prologue
-	.fframe 0
-	.save ar.pfs, r31
-	alloc r31 = ar.pfs, 3, 0, 13, 16	// 3 inputs, 13 outputs, 16 rotating registers
-	.save ar.lc, r30
-	mov r30 = ar.lc				// preserve ar.lc
-	.save pr, r29
-	mov r29 = pr				// preserve predicates (rotated by the loop)
-	;;
-	.body
-	mov r8 = in1				// r8 = store pointer: results overwrite the in1 buffer
-	mov ar.ec = 6 + 2			// epilogue count = number of pipeline stages
-	shr in0 = in0, 3			// in0 / 8; in0 is presumably a byte count - confirm against caller
-	;;
-	adds in0 = -1, in0			// br.ctop iterates ar.lc + 1 times
-	mov r16 = in1				// r16 = load pointer for source 1
-	mov r17 = in2				// r17 = load pointer for source 2
-	;;
-	mov ar.lc = in0
-	mov pr.rot = 1 << 16			// p16 = 1, other rotating predicates 0: only stage 0 active
-	;;
-	.rotr s1[6+1], s2[6+1], d[2]
-	.rotp p[6+2]
-0:
-(p[0])	ld8.nta s1[0] = [r16], 8		// stage 0: load 8 bytes from each source
-(p[0])	ld8.nta s2[0] = [r17], 8
-(p[6])	xor d[0] = s1[6], s2[6]			// stage 6: combine the words loaded 6 rotations ago
-(p[6+1])st8.nta [r8] = d[1], 8			// stage 7: store the result of the previous rotation
-	nop.f 0
-	br.ctop.dptk.few 0b
-	;;
-	mov ar.lc = r30				// restore ar.lc
-	mov pr = r29, -1			// restore all predicates
-	br.ret.sptk.few rp
-END(xor_ia64_2)
-EXPORT_SYMBOL(xor_ia64_2)
-
-GLOBAL_ENTRY(xor_ia64_3)
-	.prologue
-	.fframe 0
-	.save ar.pfs, r31
-	alloc r31 = ar.pfs, 4, 0, 20, 24	// 4 inputs, 20 outputs, 24 rotating registers
-	.save ar.lc, r30
-	mov r30 = ar.lc				// preserve ar.lc
-	.save pr, r29
-	mov r29 = pr				// preserve predicates (rotated by the loop)
-	;;
-	.body
-	mov r8 = in1				// r8 = store pointer: results overwrite the in1 buffer
-	mov ar.ec = 6 + 2			// epilogue count = number of pipeline stages
-	shr in0 = in0, 3			// in0 / 8; in0 is presumably a byte count - confirm against caller
-	;;
-	adds in0 = -1, in0			// br.ctop iterates ar.lc + 1 times
-	mov r16 = in1				// r16 = load pointer for source 1
-	mov r17 = in2				// r17 = load pointer for source 2
-	;;
-	mov r18 = in3				// r18 = load pointer for source 3
-	mov ar.lc = in0
-	mov pr.rot = 1 << 16			// p16 = 1, other rotating predicates 0: only stage 0 active
-	;;
-	.rotr s1[6+1], s2[6+1], s3[6+1], d[2]
-	.rotp p[6+2]
-0:
-(p[0])	ld8.nta s1[0] = [r16], 8		// stage 0: load 8 bytes from sources 1 and 2
-(p[0])	ld8.nta s2[0] = [r17], 8
-(p[6])	xor d[0] = s1[6], s2[6]			// stage 6: start combining the words loaded 6 rotations ago
-	;;
-(p[0])	ld8.nta s3[0] = [r18], 8		// stage 0: load 8 bytes from source 3
-(p[6+1])st8.nta [r8] = d[1], 8			// stage 7: store the result of the previous rotation
-(p[6])	xor d[0] = d[0], s3[6]			// stage 6: fold in source 3
-	br.ctop.dptk.few 0b
-	;;
-	mov ar.lc = r30				// restore ar.lc
-	mov pr = r29, -1			// restore all predicates
-	br.ret.sptk.few rp
-END(xor_ia64_3)
-EXPORT_SYMBOL(xor_ia64_3)
-
-GLOBAL_ENTRY(xor_ia64_4)
-	.prologue
-	.fframe 0
-	.save ar.pfs, r31
-	alloc r31 = ar.pfs, 5, 0, 27, 32	// 5 inputs, 27 outputs, 32 rotating registers
-	.save ar.lc, r30
-	mov r30 = ar.lc				// preserve ar.lc
-	.save pr, r29
-	mov r29 = pr				// preserve predicates (rotated by the loop)
-	;;
-	.body
-	mov r8 = in1				// r8 = store pointer: results overwrite the in1 buffer
-	mov ar.ec = 6 + 2			// epilogue count = number of pipeline stages
-	shr in0 = in0, 3			// in0 / 8; in0 is presumably a byte count - confirm against caller
-	;;
-	adds in0 = -1, in0			// br.ctop iterates ar.lc + 1 times
-	mov r16 = in1				// r16 = load pointer for source 1
-	mov r17 = in2				// r17 = load pointer for source 2
-	;;
-	mov r18 = in3				// r18 = load pointer for source 3
-	mov ar.lc = in0
-	mov pr.rot = 1 << 16			// p16 = 1, other rotating predicates 0: only stage 0 active
-	mov r19 = in4				// r19 = load pointer for source 4
-	;;
-	.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
-	.rotp p[6+2]
-0:
-(p[0])	ld8.nta s1[0] = [r16], 8		// stage 0: load 8 bytes from each of the 4 sources
-(p[0])	ld8.nta s2[0] = [r17], 8
-(p[6])	xor d[0] = s1[6], s2[6]			// stage 6: combine sources 1 and 2
-(p[0])	ld8.nta s3[0] = [r18], 8
-(p[0])	ld8.nta s4[0] = [r19], 8
-(p[6])	xor r20 = s3[6], s4[6]			// stage 6: combine sources 3 and 4 (r20 = scratch)
-	;;
-(p[6+1])st8.nta [r8] = d[1], 8			// stage 7: store the result of the previous rotation
-(p[6])	xor d[0] = d[0], r20			// stage 6: merge the two partial results
-	br.ctop.dptk.few 0b
-	;;
-	mov ar.lc = r30				// restore ar.lc
-	mov pr = r29, -1			// restore all predicates
-	br.ret.sptk.few rp
-END(xor_ia64_4)
-EXPORT_SYMBOL(xor_ia64_4)
-
-GLOBAL_ENTRY(xor_ia64_5)
-	.prologue
-	.fframe 0
-	.save ar.pfs, r31
-	alloc r31 = ar.pfs, 6, 0, 34, 40	// 6 inputs, 34 outputs, 40 rotating registers
-	.save ar.lc, r30
-	mov r30 = ar.lc				// preserve ar.lc
-	.save pr, r29
-	mov r29 = pr				// preserve predicates (rotated by the loop)
-	;;
-	.body
-	mov r8 = in1				// r8 = store pointer: results overwrite the in1 buffer
-	mov ar.ec = 6 + 2			// epilogue count = number of pipeline stages
-	shr in0 = in0, 3			// in0 / 8; in0 is presumably a byte count - confirm against caller
-	;;
-	adds in0 = -1, in0			// br.ctop iterates ar.lc + 1 times
-	mov r16 = in1				// r16 = load pointer for source 1
-	mov r17 = in2				// r17 = load pointer for source 2
-	;;
-	mov r18 = in3				// r18 = load pointer for source 3
-	mov ar.lc = in0
-	mov pr.rot = 1 << 16			// p16 = 1, other rotating predicates 0: only stage 0 active
-	mov r19 = in4				// r19 = load pointer for source 4
-	mov r20 = in5				// r20 = load pointer for source 5
-	;;
-	.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
-	.rotp p[6+2]
-0:
-(p[0])	ld8.nta s1[0] = [r16], 8		// stage 0: load 8 bytes from sources 1-4
-(p[0])	ld8.nta s2[0] = [r17], 8
-(p[6])	xor d[0] = s1[6], s2[6]			// stage 6: combine sources 1 and 2
-(p[0])	ld8.nta s3[0] = [r18], 8
-(p[0])	ld8.nta s4[0] = [r19], 8
-(p[6])	xor r21 = s3[6], s4[6]			// stage 6: combine sources 3 and 4 (r21 = scratch)
-	;;
-(p[0])	ld8.nta s5[0] = [r20], 8		// stage 0: load 8 bytes from source 5
-(p[6+1])st8.nta [r8] = d[1], 8			// stage 7: store the result of the previous rotation
-(p[6])	xor d[0] = d[0], r21			// stage 6: merge the two partial results
-	;;
-(p[6])	  xor d[0] = d[0], s5[6]		// stage 6: fold in source 5
-	nop.f 0
-	br.ctop.dptk.few 0b
-	;;
-	mov ar.lc = r30				// restore ar.lc
-	mov pr = r29, -1			// restore all predicates
-	br.ret.sptk.few rp
-END(xor_ia64_5)
-EXPORT_SYMBOL(xor_ia64_5)