14 files changed, 1511 insertions, 49 deletions
diff --git a/arch/loongarch/lib/Makefile b/arch/loongarch/lib/Makefile
index e36635fccb69..a77bf160bfc4 100644
--- a/arch/loongarch/lib/Makefile
+++ b/arch/loongarch/lib/Makefile
@@ -3,4 +3,9 @@
 # Makefile for LoongArch-specific library files.
 #
 
-lib-y	+= delay.o clear_user.o copy_user.o dump_tlb.o
+lib-y	+= delay.o memset.o memcpy.o memmove.o \
+	   clear_user.o copy_user.o csum.o dump_tlb.o unaligned.o
+
+obj-$(CONFIG_CPU_HAS_LSX) += xor_simd.o xor_simd_glue.o
+
+obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
diff --git a/arch/loongarch/lib/clear_user.S b/arch/loongarch/lib/clear_user.S
index 16ba2b8dd68a..be741544e62b 100644
--- a/arch/loongarch/lib/clear_user.S
+++ b/arch/loongarch/lib/clear_user.S
@@ -3,30 +3,31 @@
  * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
  */
 
+#include <linux/export.h>
+#include <asm/alternative-asm.h>
 #include <asm/asm.h>
 #include <asm/asmmacro.h>
-#include <asm/export.h>
+#include <asm/asm-extable.h>
+#include <asm/cpu.h>
 #include <asm/regdef.h>
 
-.macro fixup_ex from, to, offset, fix
-.if \fix
-	.section .fixup, "ax"
-\to:	addi.d	a0, a1, \offset
-	jr	ra
-	.previous
-.endif
-	.section __ex_table, "a"
-	PTR	\from\()b, \to\()b
-	.previous
-.endm
+SYM_FUNC_START(__clear_user)
+	/*
+	 * CPUs with hardware unaligned access (UAL) get the fast variant instead
+	 */
+	ALTERNATIVE	"b __clear_user_generic",	\
+			"b __clear_user_fast", CPU_FEATURE_UAL
+SYM_FUNC_END(__clear_user)
+
+EXPORT_SYMBOL(__clear_user)
 
 /*
- * unsigned long __clear_user(void *addr, size_t size)
+ * unsigned long __clear_user_generic(void *addr, size_t size)
  *
  * a0: addr
  * a1: size
  */
-SYM_FUNC_START(__clear_user)
+SYM_FUNC_START(__clear_user_generic)
 	beqz	a1, 2f
 
 1:	st.b	zero, a0, 0
@@ -37,7 +38,169 @@ SYM_FUNC_START(__clear_user)
 2:	move	a0, a1
 	jr	ra
 
-	fixup_ex 1, 3, 0, 1
-SYM_FUNC_END(__clear_user)
+	_asm_extable 1b, 2b
+SYM_FUNC_END(__clear_user_generic)
 
-EXPORT_SYMBOL(__clear_user)
+/*
+ * unsigned long __clear_user_fast(void *addr, unsigned long size)
+ *
+ * a0: addr; on return, the number of bytes that could not be cleared
+ * a1: size
+ */
+SYM_FUNC_START(__clear_user_fast)
+	sltui	t0, a1, 9		/* t0 = (size < 9) */
+	bnez	t0, .Lsmall		/* 0..8 bytes: jump table below */
+
+	add.d	a2, a0, a1		/* a2 = end of the range */
+0:	st.d	zero, a0, 0		/* head: first 8 bytes, possibly unaligned */
+
+	/* align up address */
+	addi.d	a0, a0, 8
+	bstrins.d	a0, zero, 2, 0	/* clear bits 2:0 -> 8-byte aligned */
+
+	addi.d	a3, a2, -64
+	bgeu	a0, a3, .Llt64		/* skip unless more than 64 bytes remain */
+
+	/* set 64 bytes at a time */
+.Lloop64:
+1:	st.d	zero, a0, 0
+2:	st.d	zero, a0, 8
+3:	st.d	zero, a0, 16
+4:	st.d	zero, a0, 24
+5:	st.d	zero, a0, 32
+6:	st.d	zero, a0, 40
+7:	st.d	zero, a0, 48
+8:	st.d	zero, a0, 56
+	addi.d	a0, a0, 64
+	bltu	a0, a3, .Lloop64
+
+	/* set the remaining bytes */
+.Llt64:
+	addi.d	a3, a2, -32
+	bgeu	a0, a3, .Llt32		/* skip unless more than 32 bytes remain */
+9:	st.d	zero, a0, 0
+10:	st.d	zero, a0, 8
+11:	st.d	zero, a0, 16
+12:	st.d	zero, a0, 24
+	addi.d	a0, a0, 32
+
+.Llt32:
+	addi.d	a3, a2, -16
+	bgeu	a0, a3, .Llt16		/* skip unless more than 16 bytes remain */
+13:	st.d	zero, a0, 0
+14:	st.d	zero, a0, 8
+	addi.d	a0, a0, 16
+
+.Llt16:
+	addi.d	a3, a2, -8
+	bgeu	a0, a3, .Llt8		/* skip unless more than 8 bytes remain */
+15:	st.d	zero, a0, 0
+	addi.d	a0, a0, 8
+
+.Llt8:
+16:	st.d	zero, a2, -8		/* tail: last 8 bytes, may overlap earlier stores */
+
+	/* return */
+	move	a0, zero
+	jr	ra
+
+	.align	4
+.Lsmall:
+	pcaddi	t0, 4			/* t0 = PC + 16: the slot for size 0 */
+	slli.d	a2, a1, 4		/* one 16-byte slot per size value */
+	add.d	t0, t0, a2
+	jr	t0
+
+	.align	4			/* size 0 */
+	move	a0, zero
+	jr	ra
+
+	.align	4			/* size 1 */
+17:	st.b	zero, a0, 0
+	move	a0, zero
+	jr	ra
+
+	.align	4			/* size 2 */
+18:	st.h	zero, a0, 0
+	move	a0, zero
+	jr	ra
+
+	.align	4			/* size 3 */
+19:	st.h	zero, a0, 0
+20:	st.b	zero, a0, 2
+	move	a0, zero
+	jr	ra
+
+	.align	4			/* size 4 */
+21:	st.w	zero, a0, 0
+	move	a0, zero
+	jr	ra
+
+	.align	4			/* size 5 */
+22:	st.w	zero, a0, 0
+23:	st.b	zero, a0, 4
+	move	a0, zero
+	jr	ra
+
+	.align	4			/* size 6 */
+24:	st.w	zero, a0, 0
+25:	st.h	zero, a0, 4
+	move	a0, zero
+	jr	ra
+
+	.align	4			/* size 7 */
+26:	st.w	zero, a0, 0
+27:	st.w	zero, a0, 3		/* overlapping words cover bytes 0..6 */
+	move	a0, zero
+	jr	ra
+
+	.align	4			/* size 8 */
+28:	st.d	zero, a0, 0
+	move	a0, zero
+	jr	ra
+
+	/* fixup and ex_table */
+.Llarge_fixup:
+	sub.d	a1, a2, a0		/* a1 = end - a0: bytes still to clear */
+
+.Lsmall_fixup:
+29:	st.b	zero, a0, 0		/* retry byte by byte until the fault repeats */
+	addi.d	a0, a0, 1
+	addi.d	a1, a1, -1
+	bgt	a1, zero, 29b
+
+.Lexit:
+	move	a0, a1			/* return the count left uncleared */
+	jr	ra
+
+	_asm_extable 0b, .Lsmall_fixup
+	_asm_extable 1b, .Llarge_fixup
+	_asm_extable 2b, .Llarge_fixup
+	_asm_extable 3b, .Llarge_fixup
+	_asm_extable 4b, .Llarge_fixup
+	_asm_extable 5b, .Llarge_fixup
+	_asm_extable 6b, .Llarge_fixup
+	_asm_extable 7b, .Llarge_fixup
+	_asm_extable 8b, .Llarge_fixup
+	_asm_extable 9b, .Llarge_fixup
+	_asm_extable 10b, .Llarge_fixup
+	_asm_extable 11b, .Llarge_fixup
+	_asm_extable 12b, .Llarge_fixup
+	_asm_extable 13b, .Llarge_fixup
+	_asm_extable 14b, .Llarge_fixup
+	_asm_extable 15b, .Llarge_fixup
+	_asm_extable 16b, .Llarge_fixup
+	_asm_extable 17b, .Lexit
+	_asm_extable 18b, .Lsmall_fixup
+	_asm_extable 19b, .Lsmall_fixup
+	_asm_extable 20b, .Lsmall_fixup
+	_asm_extable 21b, .Lsmall_fixup
+	_asm_extable 22b, .Lsmall_fixup
+	_asm_extable 23b, .Lsmall_fixup
+	_asm_extable 24b, .Lsmall_fixup
+	_asm_extable 25b, .Lsmall_fixup
+	_asm_extable 26b, .Lsmall_fixup
+	_asm_extable 27b, .Lsmall_fixup
+	_asm_extable 28b, .Lsmall_fixup
+	_asm_extable 29b, .Lexit
+SYM_FUNC_END(__clear_user_fast)
diff --git a/arch/loongarch/lib/copy_user.S b/arch/loongarch/lib/copy_user.S
index 97d20327a69e..feec3d362803 100644
--- a/arch/loongarch/lib/copy_user.S
+++ b/arch/loongarch/lib/copy_user.S
@@ -3,31 +3,32 @@
  * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
  */
 
+#include <linux/export.h>
+#include <asm/alternative-asm.h>
 #include <asm/asm.h>
 #include <asm/asmmacro.h>
-#include <asm/export.h>
+#include <asm/asm-extable.h>
+#include <asm/cpu.h>
 #include <asm/regdef.h>
 
-.macro fixup_ex from, to, offset, fix
-.if \fix
-	.section .fixup, "ax"
-\to:	addi.d	a0, a2, \offset
-	jr	ra
-	.previous
-.endif
-	.section __ex_table, "a"
-	PTR	\from\()b, \to\()b
-	.previous
-.endm
+SYM_FUNC_START(__copy_user)
+	/*
+	 * CPUs with hardware unaligned access (UAL) get the fast variant instead
+	 */
+	ALTERNATIVE	"b __copy_user_generic",	\
+			"b __copy_user_fast", CPU_FEATURE_UAL
+SYM_FUNC_END(__copy_user)
+
+EXPORT_SYMBOL(__copy_user)
 
 /*
- * unsigned long __copy_user(void *to, const void *from, size_t n)
+ * unsigned long __copy_user_generic(void *to, const void *from, size_t n)
  *
  * a0: to
  * a1: from
  * a2: n
  */
-SYM_FUNC_START(__copy_user)
+SYM_FUNC_START(__copy_user_generic)
 	beqz	a2, 3f
 
 1:	ld.b	t0, a1, 0
@@ -40,8 +41,240 @@ SYM_FUNC_START(__copy_user)
 3:	move	a0, a2
 	jr	ra
 
-	fixup_ex 1, 4, 0, 1
-	fixup_ex 2, 4, 0, 0
-SYM_FUNC_END(__copy_user)
+	_asm_extable 1b, 3b
+	_asm_extable 2b, 3b
+SYM_FUNC_END(__copy_user_generic)
 
-EXPORT_SYMBOL(__copy_user)
+/*
+ * unsigned long __copy_user_fast(void *to, const void *from, unsigned long n)
+ *
+ * a0: to; on return, the number of bytes that could not be copied
+ * a1: from
+ * a2: n
+ */
+SYM_FUNC_START(__copy_user_fast)
+	sltui	t0, a2, 9		/* t0 = (n < 9) */
+	bnez	t0, .Lsmall		/* 0..8 bytes: jump table below */
+
+0:	ld.d	t0, a1, 0		/* head: first 8 bytes, possibly unaligned */
+1:	st.d	t0, a0, 0
+	add.d	a3, a1, a2		/* a3 = end of source */
+	add.d	a2, a0, a2		/* a2 = end of destination */
+
+	/* align up destination address */
+	andi	t1, a0, 7
+	sub.d	t0, zero, t1
+	addi.d	t0, t0, 8		/* t0 = 8 - (to & 7), i.e. 1..8 */
+	add.d	a1, a1, t0
+	add.d	a0, a0, t0
+
+	addi.d	a4, a3, -64
+	bgeu	a1, a4, .Llt64		/* skip unless more than 64 bytes remain */
+
+	/* copy 64 bytes at a time */
+.Lloop64:
+2:	ld.d	t0, a1, 0
+3:	ld.d	t1, a1, 8
+4:	ld.d	t2, a1, 16
+5:	ld.d	t3, a1, 24
+6:	ld.d	t4, a1, 32
+7:	ld.d	t5, a1, 40
+8:	ld.d	t6, a1, 48
+9:	ld.d	t7, a1, 56
+10:	st.d	t0, a0, 0
+11:	st.d	t1, a0, 8
+12:	st.d	t2, a0, 16
+13:	st.d	t3, a0, 24
+14:	st.d	t4, a0, 32
+15:	st.d	t5, a0, 40
+16:	st.d	t6, a0, 48
+17:	st.d	t7, a0, 56
+	addi.d	a1, a1, 64		/* advance only after all stores, so fixups see the block base */
+	addi.d	a0, a0, 64
+	bltu	a1, a4, .Lloop64
+
+	/* copy the remaining bytes */
+.Llt64:
+	addi.d	a4, a3, -32
+	bgeu	a1, a4, .Llt32		/* skip unless more than 32 bytes remain */
+18:	ld.d	t0, a1, 0
+19:	ld.d	t1, a1, 8
+20:	ld.d	t2, a1, 16
+21:	ld.d	t3, a1, 24
+22:	st.d	t0, a0, 0
+23:	st.d	t1, a0, 8
+24:	st.d	t2, a0, 16
+25:	st.d	t3, a0, 24
+	addi.d	a1, a1, 32
+	addi.d	a0, a0, 32
+
+.Llt32:
+	addi.d	a4, a3, -16
+	bgeu	a1, a4, .Llt16		/* skip unless more than 16 bytes remain */
+26:	ld.d	t0, a1, 0
+27:	ld.d	t1, a1, 8
+28:	st.d	t0, a0, 0
+29:	st.d	t1, a0, 8
+	addi.d	a1, a1, 16
+	addi.d	a0, a0, 16
+
+.Llt16:
+	addi.d	a4, a3, -8
+	bgeu	a1, a4, .Llt8		/* skip unless more than 8 bytes remain */
+30:	ld.d	t0, a1, 0
+31:	st.d	t0, a0, 0
+	addi.d	a1, a1, 8
+	addi.d	a0, a0, 8
+
+.Llt8:
+32:	ld.d	t0, a3, -8		/* tail: last 8 bytes, addressed from both ends */
+33:	st.d	t0, a2, -8
+
+	/* return */
+	move	a0, zero
+	jr	ra
+
+	.align	5
+.Lsmall:
+	pcaddi	t0, 8			/* t0 = PC + 32: the slot for n == 0 */
+	slli.d	a3, a2, 5		/* one 32-byte slot per value of n */
+	add.d	t0, t0, a3
+	jr	t0
+
+	.align	5			/* n == 0 */
+	move	a0, zero
+	jr	ra
+
+	.align	5			/* n == 1 */
+34:	ld.b	t0, a1, 0
+35:	st.b	t0, a0, 0
+	move	a0, zero
+	jr	ra
+
+	.align	5			/* n == 2 */
+36:	ld.h	t0, a1, 0
+37:	st.h	t0, a0, 0
+	move	a0, zero
+	jr	ra
+
+	.align	5			/* n == 3 */
+38:	ld.h	t0, a1, 0
+39:	ld.b	t1, a1, 2
+40:	st.h	t0, a0, 0
+41:	st.b	t1, a0, 2
+	move	a0, zero
+	jr	ra
+
+	.align	5			/* n == 4 */
+42:	ld.w	t0, a1, 0
+43:	st.w	t0, a0, 0
+	move	a0, zero
+	jr	ra
+
+	.align	5			/* n == 5 */
+44:	ld.w	t0, a1, 0
+45:	ld.b	t1, a1, 4
+46:	st.w	t0, a0, 0
+47:	st.b	t1, a0, 4
+	move	a0, zero
+	jr	ra
+
+	.align	5			/* n == 6 */
+48:	ld.w	t0, a1, 0
+49:	ld.h	t1, a1, 4
+50:	st.w	t0, a0, 0
+51:	st.h	t1, a0, 4
+	move	a0, zero
+	jr	ra
+
+	.align	5			/* n == 7 */
+52:	ld.w	t0, a1, 0
+53:	ld.w	t1, a1, 3		/* overlapping words cover bytes 0..6 */
+54:	st.w	t0, a0, 0
+55:	st.w	t1, a0, 3
+	move	a0, zero
+	jr	ra
+
+	.align	5			/* n == 8 */
+56:	ld.d	t0, a1, 0
+57:	st.d	t0, a0, 0
+	move	a0, zero
+	jr	ra
+
+	/* fixup and ex_table */
+.Llarge_fixup:
+	sub.d	a2, a2, a0		/* a2 = dst end - dst: bytes still to copy */
+
+.Lsmall_fixup:
+58:	ld.b	t0, a1, 0		/* retry byte by byte until the fault repeats */
+59:	st.b	t0, a0, 0
+	addi.d	a0, a0, 1
+	addi.d	a1, a1, 1
+	addi.d	a2, a2, -1
+	bgt	a2, zero, 58b
+
+.Lexit:
+	move	a0, a2			/* return the count left uncopied */
+	jr	ra
+
+	_asm_extable 0b, .Lsmall_fixup
+	_asm_extable 1b, .Lsmall_fixup
+	_asm_extable 2b, .Llarge_fixup
+	_asm_extable 3b, .Llarge_fixup
+	_asm_extable 4b, .Llarge_fixup
+	_asm_extable 5b, .Llarge_fixup
+	_asm_extable 6b, .Llarge_fixup
+	_asm_extable 7b, .Llarge_fixup
+	_asm_extable 8b, .Llarge_fixup
+	_asm_extable 9b, .Llarge_fixup
+	_asm_extable 10b, .Llarge_fixup
+	_asm_extable 11b, .Llarge_fixup
+	_asm_extable 12b, .Llarge_fixup
+	_asm_extable 13b, .Llarge_fixup
+	_asm_extable 14b, .Llarge_fixup
+	_asm_extable 15b, .Llarge_fixup
+	_asm_extable 16b, .Llarge_fixup
+	_asm_extable 17b, .Llarge_fixup
+	_asm_extable 18b, .Llarge_fixup
+	_asm_extable 19b, .Llarge_fixup
+	_asm_extable 20b, .Llarge_fixup
+	_asm_extable 21b, .Llarge_fixup
+	_asm_extable 22b, .Llarge_fixup
+	_asm_extable 23b, .Llarge_fixup
+	_asm_extable 24b, .Llarge_fixup
+	_asm_extable 25b, .Llarge_fixup
+	_asm_extable 26b, .Llarge_fixup
+	_asm_extable 27b, .Llarge_fixup
+	_asm_extable 28b, .Llarge_fixup
+	_asm_extable 29b, .Llarge_fixup
+	_asm_extable 30b, .Llarge_fixup
+	_asm_extable 31b, .Llarge_fixup
+	_asm_extable 32b, .Llarge_fixup
+	_asm_extable 33b, .Llarge_fixup
+	_asm_extable 34b, .Lexit
+	_asm_extable 35b, .Lexit
+	_asm_extable 36b, .Lsmall_fixup
+	_asm_extable 37b, .Lsmall_fixup
+	_asm_extable 38b, .Lsmall_fixup
+	_asm_extable 39b, .Lsmall_fixup
+	_asm_extable 40b, .Lsmall_fixup
+	_asm_extable 41b, .Lsmall_fixup
+	_asm_extable 42b, .Lsmall_fixup
+	_asm_extable 43b, .Lsmall_fixup
+	_asm_extable 44b, .Lsmall_fixup
+	_asm_extable 45b, .Lsmall_fixup
+	_asm_extable 46b, .Lsmall_fixup
+	_asm_extable 47b, .Lsmall_fixup
+	_asm_extable 48b, .Lsmall_fixup
+	_asm_extable 49b, .Lsmall_fixup
+	_asm_extable 50b, .Lsmall_fixup
+	_asm_extable 51b, .Lsmall_fixup
+	_asm_extable 52b, .Lsmall_fixup
+	_asm_extable 53b, .Lsmall_fixup
+	_asm_extable 54b, .Lsmall_fixup
+	_asm_extable 55b, .Lsmall_fixup
+	_asm_extable 56b, .Lsmall_fixup
+	_asm_extable 57b, .Lsmall_fixup
+	_asm_extable 58b, .Lexit
+	_asm_extable 59b, .Lexit
+SYM_FUNC_END(__copy_user_fast)
diff --git a/arch/loongarch/lib/csum.c b/arch/loongarch/lib/csum.c
new file mode 100644
index 000000000000..a5e84b403c3b
--- /dev/null
+++ b/arch/loongarch/lib/csum.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2019-2020 Arm Ltd.
+
+#include <linux/compiler.h>
+#include <linux/kasan-checks.h>
+#include <linux/kernel.h>
+
+#include <net/checksum.h>
+
+static u64 accumulate(u64 sum, u64 data)
+{
+	u64 total = sum + data;
+
+	/* end-around carry: the add wrapped iff total < data */
+	return total + (total < data);
+}
+
+/*
+ * Fold @len bytes at @buff into a 16-bit ones' complement sum; 0 if @len <= 0.
+ * We over-read the buffer, so KASAN instrumentation is off and checked by hand.
+ */
+unsigned int __no_sanitize_address do_csum(const unsigned char *buff, int len)
+{
+	unsigned int offset, shift, sum;
+	const u64 *ptr;
+	u64 data, sum64 = 0;
+
+	if (unlikely(len <= 0))	/* negative @len would over-read and over-shift */
+		return 0;
+
+	offset = (unsigned long)buff & 7;
+	/*
+	 * This is to all intents and purposes safe, since rounding down cannot
+	 * result in a different page or cache line being accessed, and @buff
+	 * should absolutely not be pointing to anything read-sensitive. We do,
+	 * however, have to be careful not to piss off KASAN, which means using
+	 * unchecked reads to accommodate the head and tail, for which we'll
+	 * compensate with an explicit check up-front.
+	 */
+	kasan_check_read(buff, len);
+	ptr = (u64 *)(buff - offset);
+	len = len + offset - 8;
+
+	/*
+	 * Head: zero out any excess leading bytes. Shifting back by the same
+	 * amount should be at least as fast as any other way of handling the
+	 * odd/even alignment, and means we can ignore it until the very end.
+	 */
+	shift = offset * 8;
+	data = *ptr++;
+	data = (data >> shift) << shift;
+
+	/*
+	 * Body: straightforward aligned loads from here on (the paired loads
+	 * underlying the quadword type still only need dword alignment). The
+	 * main loop strictly excludes the tail, so the second loop will always
+	 * run at least once.
+	 */
+	while (unlikely(len > 64)) {
+		__uint128_t tmp1, tmp2, tmp3, tmp4;
+
+		tmp1 = *(__uint128_t *)ptr;
+		tmp2 = *(__uint128_t *)(ptr + 2);
+		tmp3 = *(__uint128_t *)(ptr + 4);
+		tmp4 = *(__uint128_t *)(ptr + 6);
+
+		len -= 64;
+		ptr += 8;
+
+		/* This is the "don't dump the carry flag into a GPR" idiom */
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		tmp2 += (tmp2 >> 64) | (tmp2 << 64);
+		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
+		tmp4 += (tmp4 >> 64) | (tmp4 << 64);
+		tmp1 = ((tmp1 >> 64) << 64) | (tmp2 >> 64);
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		tmp3 = ((tmp3 >> 64) << 64) | (tmp4 >> 64);
+		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
+		tmp1 = ((tmp1 >> 64) << 64) | (tmp3 >> 64);
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		tmp1 = ((tmp1 >> 64) << 64) | sum64;
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		sum64 = tmp1 >> 64;
+	}
+	while (len > 8) {
+		__uint128_t tmp;
+
+		sum64 = accumulate(sum64, data);
+		tmp = *(__uint128_t *)ptr;
+
+		len -= 16;
+		ptr += 2;
+
+		data = tmp >> 64;
+		sum64 = accumulate(sum64, tmp);
+	}
+	if (len > 0) {
+		sum64 = accumulate(sum64, data);
+		data = *ptr;
+		len -= 8;
+	}
+	/*
+	 * Tail: zero any over-read bytes similarly to the head, again
+	 * preserving odd/even alignment.
+	 */
+	shift = len * -8;
+	data = (data << shift) >> shift;
+	sum64 = accumulate(sum64, data);
+
+	/* Finally, folding */
+	sum64 += (sum64 >> 32) | (sum64 << 32);
+	sum = sum64 >> 32;
+	sum += (sum >> 16) | (sum << 16);
+	if (offset & 1)
+		return (u16)swab32(sum);
+
+	return sum >> 16;
+}
+
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+			const struct in6_addr *daddr,
+			__u32 len, __u8 proto, __wsum csum)
+{
+	__uint128_t s128 = *(const __uint128_t *)saddr->s6_addr;
+	__uint128_t d128 = *(const __uint128_t *)daddr->s6_addr;
+	u64 acc = (__force u64)csum;
+
+	/* length and protocol terms: three 32-bit values cannot overflow u64 */
+	acc += (__force u32)htonl(len);
+	acc += (u32)proto << 24;
+
+	/* fold each 128-bit address into its high 64 bits, end-around carry */
+	s128 += (s128 >> 64) | (s128 << 64);
+	d128 += (d128 >> 64) | (d128 << 64);
+	acc = accumulate(acc, s128 >> 64);
+	acc = accumulate(acc, d128 >> 64);
+	/* 64 -> 32 bits here; csum_fold() does the rest */
+	acc += ((acc >> 32) | (acc << 32));
+	return csum_fold((__force __wsum)(acc >> 32));
+}
+EXPORT_SYMBOL(csum_ipv6_magic);
diff --git a/arch/loongarch/lib/dump_tlb.c b/arch/loongarch/lib/dump_tlb.c
index cda2c6bc7f09..0b886a6e260f 100644
--- a/arch/loongarch/lib/dump_tlb.c
+++ b/arch/loongarch/lib/dump_tlb.c
@@ -18,11 +18,11 @@ void dump_tlb_regs(void)
 {
 	const int field = 2 * sizeof(unsigned long);
 
-	pr_info("Index    : %0x\n", read_csr_tlbidx());
-	pr_info("PageSize : %0x\n", read_csr_pagesize());
-	pr_info("EntryHi  : %0*llx\n", field, read_csr_entryhi());
-	pr_info("EntryLo0 : %0*llx\n", field, read_csr_entrylo0());
-	pr_info("EntryLo1 : %0*llx\n", field, read_csr_entrylo1());
+	pr_info("Index    : 0x%0x\n", read_csr_tlbidx());
+	pr_info("PageSize : 0x%0x\n", read_csr_pagesize());
+	pr_info("EntryHi  : 0x%0*lx\n", field, read_csr_entryhi());
+	pr_info("EntryLo0 : 0x%0*lx\n", field, read_csr_entrylo0());
+	pr_info("EntryLo1 : 0x%0*lx\n", field, read_csr_entrylo1());
 }
 
 static void dump_tlb(int first, int last)
@@ -33,8 +33,8 @@ static void dump_tlb(int first, int last)
 	unsigned int s_index, s_asid;
 	unsigned int pagesize, c0, c1, i;
 	unsigned long asidmask = cpu_asid_mask(&current_cpu_data);
-	int pwidth = 11;
-	int vwidth = 11;
+	int pwidth = 16;
+	int vwidth = 16;
 	int asidwidth = DIV_ROUND_UP(ilog2(asidmask) + 1, 4);
 
 	s_entryhi = read_csr_entryhi();
@@ -64,22 +64,22 @@ static void dump_tlb(int first, int last)
 		/*
 		 * Only print entries in use
 		 */
-		pr_info("Index: %2d pgsize=%x ", i, (1 << pagesize));
+		pr_info("Index: %4d pgsize=0x%x ", i, (1 << pagesize));
 
 		c0 = (entrylo0 & ENTRYLO_C) >> ENTRYLO_C_SHIFT;
 		c1 = (entrylo1 & ENTRYLO_C) >> ENTRYLO_C_SHIFT;
 
-		pr_cont("va=%0*lx asid=%0*lx",
+		pr_cont("va=0x%0*lx asid=0x%0*lx",
 			vwidth, (entryhi & ~0x1fffUL), asidwidth, asid & asidmask);
 
 		/* NR/NX are in awkward places, so mask them off separately */
 		pa = entrylo0 & ~(ENTRYLO_NR | ENTRYLO_NX);
 		pa = pa & PAGE_MASK;
 		pr_cont("\n\t[");
-		pr_cont("ri=%d xi=%d ",
+		pr_cont("nr=%d nx=%d ",
 			(entrylo0 & ENTRYLO_NR) ? 1 : 0,
 			(entrylo0 & ENTRYLO_NX) ? 1 : 0);
-		pr_cont("pa=%0*llx c=%d d=%d v=%d g=%d plv=%lld] [",
+		pr_cont("pa=0x%0*llx c=%d d=%d v=%d g=%d plv=%lld] [",
 			pwidth, pa, c0,
 			(entrylo0 & ENTRYLO_D) ? 1 : 0,
 			(entrylo0 & ENTRYLO_V) ? 1 : 0,
@@ -88,10 +88,10 @@ static void dump_tlb(int first, int last)
 		/* NR/NX are in awkward places, so mask them off separately */
 		pa = entrylo1 & ~(ENTRYLO_NR | ENTRYLO_NX);
 		pa = pa & PAGE_MASK;
-		pr_cont("ri=%d xi=%d ",
+		pr_cont("nr=%d nx=%d ",
 			(entrylo1 & ENTRYLO_NR) ? 1 : 0,
 			(entrylo1 & ENTRYLO_NX) ? 1 : 0);
-		pr_cont("pa=%0*llx c=%d d=%d v=%d g=%d plv=%lld]\n",
+		pr_cont("pa=0x%0*llx c=%d d=%d v=%d g=%d plv=%lld]\n",
 			pwidth, pa, c1,
 			(entrylo1 & ENTRYLO_D) ? 1 : 0,
 			(entrylo1 & ENTRYLO_V) ? 1 : 0,
diff --git a/arch/loongarch/lib/error-inject.c b/arch/loongarch/lib/error-inject.c
new file mode 100644
index 000000000000..afc9e1c7c973
--- /dev/null
+++ b/arch/loongarch/lib/error-inject.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/error-injection.h>
+#include <linux/kprobes.h>
+
+void override_function_with_return(struct pt_regs *regs)
+{
+	instruction_pointer_set(regs, regs->regs[1]);	/* r1 is $ra in the LoongArch ABI: resume in the caller */
+}
+NOKPROBE_SYMBOL(override_function_with_return);
diff --git a/arch/loongarch/lib/memcpy.S b/arch/loongarch/lib/memcpy.S
new file mode 100644
index 000000000000..fa1148878d2b
--- /dev/null
+++ b/arch/loongarch/lib/memcpy.S
@@ -0,0 +1,199 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
+ */
+
+#include <linux/export.h>
+#include <asm/alternative-asm.h>
+#include <asm/asm.h>
+#include <asm/asmmacro.h>
+#include <asm/cpu.h>
+#include <asm/regdef.h>
+
+.section .noinstr.text, "ax"
+
+SYM_FUNC_START(memcpy)
+	/*
+	 * CPUs with hardware unaligned access (UAL) get the fast variant instead
+	 */
+	ALTERNATIVE	"b __memcpy_generic", \
+			"b __memcpy_fast", CPU_FEATURE_UAL
+SYM_FUNC_END(memcpy)
+SYM_FUNC_ALIAS(__memcpy, memcpy)
+
+EXPORT_SYMBOL(memcpy)
+EXPORT_SYMBOL(__memcpy)
+
+_ASM_NOKPROBE(memcpy)
+_ASM_NOKPROBE(__memcpy)
+
+/*
+ * void *__memcpy_generic(void *dst, const void *src, size_t n)
+ * Byte-at-a-time forward copy, lowest address first; returns dst in a0.
+ * a0: dst
+ * a1: src
+ * a2: n
+ */
+SYM_FUNC_START(__memcpy_generic)
+	move	a3, a0			/* keep dst for the return value */
+	beqz	a2, 2f
+
+1:	ld.b	t0, a1, 0
+	st.b	t0, a0, 0
+	addi.d	a0, a0, 1
+	addi.d	a1, a1, 1
+	addi.d	a2, a2, -1
+	bgt	a2, zero, 1b
+
+2:	move	a0, a3
+	jr	ra
+SYM_FUNC_END(__memcpy_generic)
+_ASM_NOKPROBE(__memcpy_generic)
+
+	.align	5			/* jump table below: one 32-byte slot per n in 0..8 */
+SYM_FUNC_START_NOALIGN(__memcpy_small)
+	pcaddi	t0, 8			/* t0 = PC + 32: the slot for n == 0 */
+	slli.d	a2, a2, 5		/* a2 = n * 32 */
+	add.d	t0, t0, a2
+	jr	t0
+
+	.align	5			/* n == 0 */
+0:	jr	ra
+
+	.align	5			/* n == 1 */
+1:	ld.b	t0, a1, 0
+	st.b	t0, a0, 0
+	jr	ra
+
+	.align	5			/* n == 2 */
+2:	ld.h	t0, a1, 0
+	st.h	t0, a0, 0
+	jr	ra
+
+	.align	5			/* n == 3; every slot loads all data before storing */
+3:	ld.h	t0, a1, 0
+	ld.b	t1, a1, 2
+	st.h	t0, a0, 0
+	st.b	t1, a0, 2
+	jr	ra
+
+	.align	5			/* n == 4 */
+4:	ld.w	t0, a1, 0
+	st.w	t0, a0, 0
+	jr	ra
+
+	.align	5			/* n == 5 */
+5:	ld.w	t0, a1, 0
+	ld.b	t1, a1, 4
+	st.w	t0, a0, 0
+	st.b	t1, a0, 4
+	jr	ra
+
+	.align	5			/* n == 6 */
+6:	ld.w	t0, a1, 0
+	ld.h	t1, a1, 4
+	st.w	t0, a0, 0
+	st.h	t1, a0, 4
+	jr	ra
+
+	.align	5			/* n == 7 */
+7:	ld.w	t0, a1, 0
+	ld.w	t1, a1, 3		/* overlapping words cover bytes 0..6 */
+	st.w	t0, a0, 0
+	st.w	t1, a0, 3
+	jr	ra
+
+	.align	5			/* n == 8 */
+8:	ld.d	t0, a1, 0
+	st.d	t0, a0, 0
+	jr	ra
+SYM_FUNC_END(__memcpy_small)
+_ASM_NOKPROBE(__memcpy_small)
+
+/*
+ * void *__memcpy_fast(void *dst, const void *src, size_t n)
+ * Forward copy using unaligned 8-byte accesses; a0 is never modified, so dst is returned.
+ * a0: dst
+ * a1: src
+ * a2: n
+ */
+SYM_FUNC_START(__memcpy_fast)
+	sltui	t0, a2, 9		/* t0 = (n < 9) */
+	bnez	t0, __memcpy_small
+
+	add.d	a3, a1, a2		/* a3 = end of source */
+	add.d	a2, a0, a2		/* a2 = end of destination */
+	ld.d	a6, a1, 0		/* first 8 source bytes, stored at .Llt8 */
+	ld.d	a7, a3, -8		/* last 8 source bytes, stored at .Llt8 */
+
+	/* align up destination address */
+	andi	t1, a0, 7
+	sub.d	t0, zero, t1
+	addi.d	t0, t0, 8		/* t0 = 8 - (dst & 7), i.e. 1..8 */
+	add.d	a1, a1, t0
+	add.d	a5, a0, t0		/* a5 = aligned write cursor */
+
+	addi.d	a4, a3, -64
+	bgeu	a1, a4, .Llt64		/* skip unless more than 64 bytes remain */
+
+	/* copy 64 bytes at a time */
+.Lloop64:
+	ld.d	t0, a1, 0
+	ld.d	t1, a1, 8
+	ld.d	t2, a1, 16
+	ld.d	t3, a1, 24
+	ld.d	t4, a1, 32
+	ld.d	t5, a1, 40
+	ld.d	t6, a1, 48
+	ld.d	t7, a1, 56
+	addi.d	a1, a1, 64
+	st.d	t0, a5, 0
+	st.d	t1, a5, 8
+	st.d	t2, a5, 16
+	st.d	t3, a5, 24
+	st.d	t4, a5, 32
+	st.d	t5, a5, 40
+	st.d	t6, a5, 48
+	st.d	t7, a5, 56
+	addi.d	a5, a5, 64
+	bltu	a1, a4, .Lloop64
+
+	/* copy the remaining bytes */
+.Llt64:
+	addi.d	a4, a3, -32
+	bgeu	a1, a4, .Llt32		/* skip unless more than 32 bytes remain */
+	ld.d	t0, a1, 0
+	ld.d	t1, a1, 8
+	ld.d	t2, a1, 16
+	ld.d	t3, a1, 24
+	addi.d	a1, a1, 32
+	st.d	t0, a5, 0
+	st.d	t1, a5, 8
+	st.d	t2, a5, 16
+	st.d	t3, a5, 24
+	addi.d	a5, a5, 32
+
+.Llt32:
+	addi.d	a4, a3, -16
+	bgeu	a1, a4, .Llt16		/* skip unless more than 16 bytes remain */
+	ld.d	t0, a1, 0
+	ld.d	t1, a1, 8
+	addi.d	a1, a1, 16
+	st.d	t0, a5, 0
+	st.d	t1, a5, 8
+	addi.d	a5, a5, 16
+
+.Llt16:
+	addi.d	a4, a3, -8
+	bgeu	a1, a4, .Llt8		/* skip unless more than 8 bytes remain */
+	ld.d	t0, a1, 0
+	st.d	t0, a5, 0
+
+.Llt8:
+	st.d	a6, a0, 0		/* head saved at entry */
+	st.d	a7, a2, -8		/* tail saved at entry */
+
+	/* return */
+	jr	ra
+SYM_FUNC_END(__memcpy_fast)
+_ASM_NOKPROBE(__memcpy_fast)
diff --git a/arch/loongarch/lib/memmove.S b/arch/loongarch/lib/memmove.S
new file mode 100644
index 000000000000..82dae062fec8
--- /dev/null
+++ b/arch/loongarch/lib/memmove.S
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
+ */
+
+#include <linux/export.h>
+#include <asm/alternative-asm.h>
+#include <asm/asm.h>
+#include <asm/asmmacro.h>
+#include <asm/cpu.h>
+#include <asm/regdef.h>
+
+.section .noinstr.text, "ax"
+
+SYM_FUNC_START(memmove)
+	blt	a0, a1, __memcpy	/* dst < src (signed compare): copy forwards */
+	blt	a1, a0, __rmemcpy	/* src < dst (signed compare): copy backwards */
+	jr	ra			/* dst == src: nothing to do, a0 is already dst */
+SYM_FUNC_END(memmove)
+SYM_FUNC_ALIAS(__memmove, memmove)
+
+EXPORT_SYMBOL(memmove)
+EXPORT_SYMBOL(__memmove)
+
+_ASM_NOKPROBE(memmove)
+_ASM_NOKPROBE(__memmove)
+
+SYM_FUNC_START(__rmemcpy)
+	/*
+	 * CPUs with hardware unaligned access (UAL) get the fast variant instead
+	 */
+	ALTERNATIVE	"b __rmemcpy_generic", \
+			"b __rmemcpy_fast", CPU_FEATURE_UAL
+SYM_FUNC_END(__rmemcpy)
+_ASM_NOKPROBE(__rmemcpy)
+
+/*
+ * void *__rmemcpy_generic(void *dst, const void *src, size_t n)
+ * Byte-at-a-time backward copy, highest address first; returns dst in a0.
+ * a0: dst
+ * a1: src
+ * a2: n
+ */
+SYM_FUNC_START(__rmemcpy_generic)
+	move	a3, a0			/* keep dst for the return value */
+	beqz	a2, 2f
+
+	add.d	a0, a0, a2		/* a0 = one past the end of dst */
+	add.d	a1, a1, a2		/* a1 = one past the end of src */
+
+1:	ld.b	t0, a1, -1
+	st.b	t0, a0, -1
+	addi.d	a0, a0, -1
+	addi.d	a1, a1, -1
+	addi.d	a2, a2, -1
+	bgt	a2, zero, 1b
+
+2:	move	a0, a3
+	jr	ra
+SYM_FUNC_END(__rmemcpy_generic)
+_ASM_NOKPROBE(__rmemcpy_generic)
+
+/*
+ * void *__rmemcpy_fast(void *dst, const void *src, size_t n)
+ * Backward copy (highest address first); a0 is never modified, so dst is returned.
+ * a0: dst
+ * a1: src
+ * a2: n
+ */
+SYM_FUNC_START(__rmemcpy_fast)
+	sltui	t0, a2, 9		/* t0 = (n < 9) */
+	bnez	t0, __memcpy_small	/* its slots load everything before storing */
+
+	add.d	a3, a1, a2		/* a3 = end of source */
+	add.d	a2, a0, a2		/* a2 = end of destination */
+	ld.d	a6, a1, 0		/* first 8 source bytes, stored at .Llt8 */
+	ld.d	a7, a3, -8		/* last 8 source bytes, stored at .Llt8 */
+
+	/* align down the end of the destination */
+	andi	t1, a2, 7
+	sub.d	a3, a3, t1		/* move the source cursor back by the same amount */
+	sub.d	a5, a2, t1		/* a5 = dst end rounded down to 8 bytes */
+
+	addi.d	a4, a1, 64
+	bgeu	a4, a3, .Llt64		/* skip unless more than 64 bytes remain */
+
+	/* copy 64 bytes at a time */
+.Lloop64:
+	ld.d	t0, a3, -8
+	ld.d	t1, a3, -16
+	ld.d	t2, a3, -24
+	ld.d	t3, a3, -32
+	ld.d	t4, a3, -40
+	ld.d	t5, a3, -48
+	ld.d	t6, a3, -56
+	ld.d	t7, a3, -64
+	addi.d	a3, a3, -64
+	st.d	t0, a5, -8
+	st.d	t1, a5, -16
+	st.d	t2, a5, -24
+	st.d	t3, a5, -32
+	st.d	t4, a5, -40
+	st.d	t5, a5, -48
+	st.d	t6, a5, -56
+	st.d	t7, a5, -64
+	addi.d	a5, a5, -64
+	bltu	a4, a3, .Lloop64
+
+	/* copy the remaining bytes */
+.Llt64:
+	addi.d	a4, a1, 32
+	bgeu	a4, a3, .Llt32		/* skip unless more than 32 bytes remain */
+	ld.d	t0, a3, -8
+	ld.d	t1, a3, -16
+	ld.d	t2, a3, -24
+	ld.d	t3, a3, -32
+	addi.d	a3, a3, -32
+	st.d	t0, a5, -8
+	st.d	t1, a5, -16
+	st.d	t2, a5, -24
+	st.d	t3, a5, -32
+	addi.d	a5, a5, -32
+
+.Llt32:
+	addi.d	a4, a1, 16
+	bgeu	a4, a3, .Llt16		/* skip unless more than 16 bytes remain */
+	ld.d	t0, a3, -8
+	ld.d	t1, a3, -16
+	addi.d	a3, a3, -16
+	st.d	t0, a5, -8
+	st.d	t1, a5, -16
+	addi.d	a5, a5, -16
+
+.Llt16:
+	addi.d	a4, a1, 8
+	bgeu	a4, a3, .Llt8		/* skip unless more than 8 bytes remain */
+	ld.d	t0, a3, -8
+	st.d	t0, a5, -8
+
+.Llt8:
+	st.d	a6, a0, 0		/* head saved at entry */
+	st.d	a7, a2, -8		/* tail saved at entry */
+
+	/* return */
+	jr	ra
+SYM_FUNC_END(__rmemcpy_fast)
+_ASM_NOKPROBE(__rmemcpy_fast)
diff --git a/arch/loongarch/lib/memset.S b/arch/loongarch/lib/memset.S
new file mode 100644
index 000000000000..06d3ca54cbfe
--- /dev/null
+++ b/arch/loongarch/lib/memset.S
@@ -0,0 +1,168 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
+ */
+
+#include <linux/export.h>
+#include <asm/alternative-asm.h>
+#include <asm/asm.h>
+#include <asm/asmmacro.h>
+#include <asm/cpu.h>
+#include <asm/regdef.h>
+
+.macro fill_to_64 r0
+	bstrins.d \r0, \r0, 15, 8	/* bits 15:8  = bits 7:0  -> byte replicated x2 */
+	bstrins.d \r0, \r0, 31, 16	/* bits 31:16 = bits 15:0 -> x4 */
+	bstrins.d \r0, \r0, 63, 32	/* bits 63:32 = bits 31:0 -> x8 */
+.endm
+
+.section .noinstr.text, "ax"
+
+SYM_FUNC_START(memset)
+	/*
+	 * CPUs with hardware unaligned access (UAL) get the fast variant instead
+	 */
+	ALTERNATIVE	"b __memset_generic", \
+			"b __memset_fast", CPU_FEATURE_UAL
+SYM_FUNC_END(memset)
+SYM_FUNC_ALIAS(__memset, memset)
+
+EXPORT_SYMBOL(memset)
+EXPORT_SYMBOL(__memset)
+
+_ASM_NOKPROBE(memset)
+_ASM_NOKPROBE(__memset)
+
+/*
+ * void *__memset_generic(void *s, int c, size_t n)
+ * Byte-at-a-time fill with the low byte of c; returns s in a0.
+ * a0: s
+ * a1: c
+ * a2: n
+ */
+SYM_FUNC_START(__memset_generic)
+	move	a3, a0			/* keep s for the return value */
+	beqz	a2, 2f
+
+1:	st.b	a1, a0, 0
+	addi.d	a0, a0, 1
+	addi.d	a2, a2, -1
+	bgt	a2, zero, 1b
+
+2:	move	a0, a3
+	jr	ra
+SYM_FUNC_END(__memset_generic)
+_ASM_NOKPROBE(__memset_generic)
+
+/*
+ * void *__memset_fast(void *s, int c, size_t n)
+ * Fill using unaligned 8-byte stores; a0 is never modified, so s is returned.
+ * a0: s
+ * a1: c
+ * a2: n
+ */
+SYM_FUNC_START(__memset_fast)
+	/* fill a1 to 64 bits */
+	fill_to_64 a1
+
+	sltui	t0, a2, 9		/* t0 = (n < 9) */
+	bnez	t0, .Lsmall		/* 0..8 bytes: jump table below */
+
+	add.d	a2, a0, a2		/* a2 = end of the range */
+	st.d	a1, a0, 0		/* head: first 8 bytes, possibly unaligned */
+
+	/* align up address */
+	addi.d	a3, a0, 8
+	bstrins.d	a3, zero, 2, 0	/* a3 = aligned write cursor */
+
+	addi.d	a4, a2, -64
+	bgeu	a3, a4, .Llt64		/* skip unless more than 64 bytes remain */
+
+	/* set 64 bytes at a time */
+.Lloop64:
+	st.d	a1, a3, 0
+	st.d	a1, a3, 8
+	st.d	a1, a3, 16
+	st.d	a1, a3, 24
+	st.d	a1, a3, 32
+	st.d	a1, a3, 40
+	st.d	a1, a3, 48
+	st.d	a1, a3, 56
+	addi.d	a3, a3, 64
+	bltu	a3, a4, .Lloop64
+
+	/* set the remaining bytes */
+.Llt64:
+	addi.d	a4, a2, -32
+	bgeu	a3, a4, .Llt32		/* skip unless more than 32 bytes remain */
+	st.d	a1, a3, 0
+	st.d	a1, a3, 8
+	st.d	a1, a3, 16
+	st.d	a1, a3, 24
+	addi.d	a3, a3, 32
+
+.Llt32:
+	addi.d	a4, a2, -16
+	bgeu	a3, a4, .Llt16		/* skip unless more than 16 bytes remain */
+	st.d	a1, a3, 0
+	st.d	a1, a3, 8
+	addi.d	a3, a3, 16
+
+.Llt16:
+	addi.d	a4, a2, -8
+	bgeu	a3, a4, .Llt8		/* skip unless more than 8 bytes remain */
+	st.d	a1, a3, 0
+
+.Llt8:
+	st.d	a1, a2, -8		/* tail: last 8 bytes, may overlap earlier stores */
+
+	/* return */
+	jr	ra
+
+	.align	4
+.Lsmall:
+	pcaddi	t0, 4			/* t0 = PC + 16: the slot for n == 0 */
+	slli.d	a2, a2, 4		/* one 16-byte slot per value of n */
+	add.d	t0, t0, a2
+	jr	t0
+
+	.align	4			/* n == 0 */
+0:	jr	ra
+
+	.align	4			/* n == 1 */
+1:	st.b	a1, a0, 0
+	jr	ra
+
+	.align	4			/* n == 2 */
+2:	st.h	a1, a0, 0
+	jr	ra
+
+	.align	4			/* n == 3 */
+3:	st.h	a1, a0, 0
+	st.b	a1, a0, 2
+	jr	ra
+
+	.align	4			/* n == 4 */
+4:	st.w	a1, a0, 0
+	jr	ra
+
+	.align	4			/* n == 5 */
+5:	st.w	a1, a0, 0
+	st.b	a1, a0, 4
+	jr	ra
+
+	.align	4			/* n == 6 */
+6:	st.w	a1, a0, 0
+	st.h	a1, a0, 4
+	jr	ra
+
+	.align	4			/* n == 7 */
+7:	st.w	a1, a0, 0
+	st.w	a1, a0, 3		/* overlapping words cover bytes 0..6 */
+	jr	ra
+
+	.align	4			/* n == 8 */
+8:	st.d	a1, a0, 0
+	jr	ra
+SYM_FUNC_END(__memset_fast)
+_ASM_NOKPROBE(__memset_fast)
diff --git a/arch/loongarch/lib/unaligned.S b/arch/loongarch/lib/unaligned.S
new file mode 100644
index 000000000000..185f82d85810
--- /dev/null
+++ b/arch/loongarch/lib/unaligned.S
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/asm.h>
+#include <asm/asmmacro.h>
+#include <asm/asm-extable.h>
+#include <asm/errno.h>
+#include <asm/regdef.h>
+
+.L_fixup_handle_unaligned:		/* extable target shared by unaligned_read/unaligned_write */
+	li.w	a0, -EFAULT		/* report the faulting access to the caller */
+	jr	ra
+
+/*
+ * unsigned long unaligned_read(void *addr, void *value, unsigned long n, bool sign)
+ * Returns 0 on success, -EFAULT if n == 0 or an access faults.
+ * a0: addr	(n bytes are read from here, one byte at a time)
+ * a1: value	(the assembled 64-bit result is stored here)
+ * a2: n	(byte count; NOTE(review): n > 8 is not rejected here)
+ * a3: sign	(non-zero: the byte at addr + n - 1 is sign-extended)
+ */
+SYM_FUNC_START(unaligned_read)
+	beqz	a2, 5f
+
+	li.w	t2, 0			/* t2 = value assembled so far */
+	addi.d	t0, a2, -1
+	slli.d	t1, t0, 3		/* t1 = shift for the top byte: (n - 1) * 8 */
+	add.d 	a0, a0, t0		/* start at the last byte and walk down */
+
+	beqz	a3, 2f
+1:	ld.b	t3, a0, 0		/* signed read: only this first (top) byte */
+	b	3f
+
+2:	ld.bu	t3, a0, 0		/* all remaining bytes are zero-extended */
+3:	sll.d	t3, t3, t1
+	or	t2, t2, t3
+	addi.d	t1, t1, -8
+	addi.d	a0, a0, -1
+	addi.d	a2, a2, -1
+	bgtz	a2, 2b
+4:	st.d	t2, a1, 0
+
+	move	a0, a2			/* a2 == 0 here: success */
+	jr	ra
+
+5:	li.w    a0, -EFAULT
+	jr	ra
+
+	_asm_extable 1b, .L_fixup_handle_unaligned
+	_asm_extable 2b, .L_fixup_handle_unaligned
+	_asm_extable 4b, .L_fixup_handle_unaligned
+SYM_FUNC_END(unaligned_read)
+
+/*
+ * unsigned long unaligned_write(void *addr, unsigned long value, unsigned long n)
+ * Returns 0 on success, -EFAULT if n == 0 or a store faults.
+ * a0: addr	(destination, written one byte at a time)
+ * a1: value	(low n bytes are stored, least significant byte first)
+ * a2: n	(byte count; NOTE(review): n > 8 is not rejected here)
+ */
+SYM_FUNC_START(unaligned_write)
+	beqz	a2, 3f
+
+	li.w	t0, 0			/* t0 = current shift in bits */
+1:	srl.d	t1, a1, t0		/* bring the next byte down to bits 7:0 */
+2:	st.b	t1, a0, 0
+	addi.d	t0, t0, 8
+	addi.d	a2, a2, -1
+	addi.d	a0, a0, 1
+	bgtz	a2, 1b
+
+	move	a0, a2			/* a2 == 0 here: success */
+	jr	ra
+
+3:	li.w    a0, -EFAULT
+	jr	ra
+
+	_asm_extable 2b, .L_fixup_handle_unaligned
+SYM_FUNC_END(unaligned_write)
diff --git a/arch/loongarch/lib/xor_simd.c b/arch/loongarch/lib/xor_simd.c
new file mode 100644
index 000000000000..84cd24b728c4
--- /dev/null
+++ b/arch/loongarch/lib/xor_simd.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * LoongArch SIMD XOR operations
+ *
+ * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
+ */
+
+#include "xor_simd.h"
+
+/*
+ * Process one cache line (64 bytes) per loop. This is assuming all future
+ * popular LoongArch cores are similar performance-characteristics-wise to the
+ * current models.
+ */
+#define LINE_WIDTH 64
+
+#ifdef CONFIG_CPU_HAS_LSX
+
+#define LD(reg, base, offset)	\
+	"vld $vr" #reg ", %[" #base "], " #offset "\n\t"
+#define ST(reg, base, offset)	\
+	"vst $vr" #reg ", %[" #base "], " #offset "\n\t"
+#define XOR(dj, k)	"vxor.v $vr" #dj ", $vr" #dj ", $vr" #k "\n\t"	/* vr<dj> ^= vr<k> */
+
+#define LD_INOUT_LINE(base)	\
+	LD(0, base, 0)		\
+	LD(1, base, 16)		\
+	LD(2, base, 32)		\
+	LD(3, base, 48)		/* vr0..vr3 at 16-byte steps: one LINE_WIDTH */
+
+#define LD_AND_XOR_LINE(base)	\
+	LD(4, base, 0)		\
+	LD(5, base, 16)		\
+	LD(6, base, 32)		\
+	LD(7, base, 48)		\
+	XOR(0, 4)		\
+	XOR(1, 5)		\
+	XOR(2, 6)		\
+	XOR(3, 7)		/* vr0..vr3 accumulate; vr4..vr7 are scratch */
+
+#define ST_LINE(base)		\
+	ST(0, base, 0)		\
+	ST(1, base, 16)		\
+	ST(2, base, 32)		\
+	ST(3, base, 48)		/* write the accumulated line back */
+
+#define XOR_FUNC_NAME(nr) __xor_lsx_##nr	/* names __xor_lsx_<nr> */
+#include "xor_template.c"	/* presumably expands the macros above into the functions; template not visible here */
+
+#undef LD
+#undef ST
+#undef XOR
+#undef LD_INOUT_LINE
+#undef LD_AND_XOR_LINE
+#undef ST_LINE
+#undef XOR_FUNC_NAME
+
+#endif /* CONFIG_CPU_HAS_LSX */
+
+#ifdef CONFIG_CPU_HAS_LASX
+
+#define LD(reg, base, offset)	\
+	"xvld $xr" #reg ", %[" #base "], " #offset "\n\t"
+#define ST(reg, base, offset)	\
+	"xvst $xr" #reg ", %[" #base "], " #offset "\n\t"
+#define XOR(dj, k)	"xvxor.v $xr" #dj ", $xr" #dj ", $xr" #k "\n\t"	/* xr<dj> ^= xr<k> */
+
+#define LD_INOUT_LINE(base)	\
+	LD(0, base, 0)		\
+	LD(1, base, 32)		/* xr0..xr1 at 32-byte steps: one LINE_WIDTH */
+
+#define LD_AND_XOR_LINE(base)	\
+	LD(2, base, 0)		\
+	LD(3, base, 32)		\
+	XOR(0, 2)		\
+	XOR(1, 3)		/* xr0..xr1 accumulate; xr2..xr3 are scratch */
+
+#define ST_LINE(base)		\
+	ST(0, base, 0)		\
+	ST(1, base, 32)		/* write the accumulated line back */
+
+#define XOR_FUNC_NAME(nr) __xor_lasx_##nr	/* names __xor_lasx_<nr> */
+#include "xor_template.c"	/* presumably expands the macros above into the functions; template not visible here */
+
+#undef LD
+#undef ST
+#undef XOR
+#undef LD_INOUT_LINE
+#undef LD_AND_XOR_LINE
+#undef ST_LINE
+#undef XOR_FUNC_NAME
+
+#endif /* CONFIG_CPU_HAS_LASX */
diff --git a/arch/loongarch/lib/xor_simd.h b/arch/loongarch/lib/xor_simd.h
new file mode 100644
index 000000000000..f50f32514d80
--- /dev/null
+++ b/arch/loongarch/lib/xor_simd.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Simple interface to link xor_simd.c and xor_simd_glue.c
+ *
+ * Separating these files ensures that no SIMD instructions are run outside of
+ * the kfpu critical section.
+ */
+
+#ifndef __LOONGARCH_LIB_XOR_SIMD_H
+#define __LOONGARCH_LIB_XOR_SIMD_H
+
+#ifdef CONFIG_CPU_HAS_LSX	/* raw LSX kernels; xor_simd_glue.c wraps each in kernel_fpu_begin()/end() */
+void __xor_lsx_2(unsigned long bytes, unsigned long * __restrict p1,
+		 const unsigned long * __restrict p2);
+void __xor_lsx_3(unsigned long bytes, unsigned long * __restrict p1,
+		 const unsigned long * __restrict p2, const unsigned long * __restrict p3);
+void __xor_lsx_4(unsigned long bytes, unsigned long * __restrict p1,
+		 const unsigned long * __restrict p2, const unsigned long * __restrict p3,
+		 const unsigned long * __restrict p4);
+void __xor_lsx_5(unsigned long bytes, unsigned long * __restrict p1,
+		 const unsigned long * __restrict p2, const unsigned long * __restrict p3,
+		 const unsigned long * __restrict p4, const unsigned long * __restrict p5);
+#endif /* CONFIG_CPU_HAS_LSX */
+
+#ifdef CONFIG_CPU_HAS_LASX	/* raw LASX kernels; xor_simd_glue.c wraps each in kernel_fpu_begin()/end() */
+void __xor_lasx_2(unsigned long bytes, unsigned long * __restrict p1,
+		  const unsigned long * __restrict p2);
+void __xor_lasx_3(unsigned long bytes, unsigned long * __restrict p1,
+		  const unsigned long * __restrict p2, const unsigned long * __restrict p3);
+void __xor_lasx_4(unsigned long bytes, unsigned long * __restrict p1,
+		  const unsigned long * __restrict p2, const unsigned long * __restrict p3,
+		  const unsigned long * __restrict p4);
+void __xor_lasx_5(unsigned long bytes, unsigned long * __restrict p1,
+		  const unsigned long * __restrict p2, const unsigned long * __restrict p3,
+		  const unsigned long * __restrict p4, const unsigned long * __restrict p5);
+#endif /* CONFIG_CPU_HAS_LASX */
+
+#endif /* __LOONGARCH_LIB_XOR_SIMD_H */
diff --git a/arch/loongarch/lib/xor_simd_glue.c b/arch/loongarch/lib/xor_simd_glue.c
new file mode 100644
index 000000000000..393f689dbcf6
--- /dev/null
+++ b/arch/loongarch/lib/xor_simd_glue.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * LoongArch SIMD XOR operations
+ *
+ * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
+ */
+
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <asm/fpu.h>
+#include <asm/xor_simd.h>
+#include "xor_simd.h"
+
+#define MAKE_XOR_GLUE_2(flavor)	/* xor_<flavor>_2(): 2-buffer XOR under kfpu */	\
+void xor_##flavor##_2(unsigned long bytes, unsigned long * __restrict p1,	\
+		      const unsigned long * __restrict p2)			\
+{										\
+	kernel_fpu_begin();	/* enter kfpu critical section */		\
+	__xor_##flavor##_2(bytes, p1, p2);	/* raw SIMD kernel */		\
+	kernel_fpu_end();	/* leave kfpu critical section */		\
+}										\
+EXPORT_SYMBOL_GPL(xor_##flavor##_2)
+
+#define MAKE_XOR_GLUE_3(flavor)	/* xor_<flavor>_3(): 3-buffer XOR under kfpu */	\
+void xor_##flavor##_3(unsigned long bytes, unsigned long * __restrict p1,	\
+		      const unsigned long * __restrict p2,			\
+		      const unsigned long * __restrict p3)			\
+{										\
+	kernel_fpu_begin();	/* enter kfpu critical section */		\
+	__xor_##flavor##_3(bytes, p1, p2, p3);	/* raw SIMD kernel */		\
+	kernel_fpu_end();	/* leave kfpu critical section */		\
+}										\
+EXPORT_SYMBOL_GPL(xor_##flavor##_3)
+
+#define MAKE_XOR_GLUE_4(flavor)	/* xor_<flavor>_4(): 4-buffer XOR under kfpu */	\
+void xor_##flavor##_4(unsigned long bytes, unsigned long * __restrict p1,	\
+		      const unsigned long * __restrict p2,			\
+		      const unsigned long * __restrict p3,			\
+		      const unsigned long * __restrict p4)			\
+{										\
+	kernel_fpu_begin();	/* enter kfpu critical section */		\
+	__xor_##flavor##_4(bytes, p1, p2, p3, p4);	/* raw SIMD kernel */	\
+	kernel_fpu_end();	/* leave kfpu critical section */		\
+}										\
+EXPORT_SYMBOL_GPL(xor_##flavor##_4)
+
+#define MAKE_XOR_GLUE_5(flavor)	/* xor_<flavor>_5(): 5-buffer XOR under kfpu */	\
+void xor_##flavor##_5(unsigned long bytes, unsigned long * __restrict p1,	\
+		      const unsigned long * __restrict p2,			\
+		      const unsigned long * __restrict p3,			\
+		      const unsigned long * __restrict p4,			\
+		      const unsigned long * __restrict p5)			\
+{										\
+	kernel_fpu_begin();	/* enter kfpu critical section */		\
+	__xor_##flavor##_5(bytes, p1, p2, p3, p4, p5);	/* raw SIMD kernel */	\
+	kernel_fpu_end();	/* leave kfpu critical section */		\
+}										\
+EXPORT_SYMBOL_GPL(xor_##flavor##_5)
+
+#define MAKE_XOR_GLUES(flavor)	/* emit the 2..5-buffer wrappers */	\
+	MAKE_XOR_GLUE_2(flavor);	\
+	MAKE_XOR_GLUE_3(flavor);	\
+	MAKE_XOR_GLUE_4(flavor);	\
+	MAKE_XOR_GLUE_5(flavor)
+
+#ifdef CONFIG_CPU_HAS_LSX
+MAKE_XOR_GLUES(lsx);	/* xor_lsx_2() .. xor_lsx_5() */
+#endif /* CONFIG_CPU_HAS_LSX */
+
+#ifdef CONFIG_CPU_HAS_LASX
+MAKE_XOR_GLUES(lasx);	/* xor_lasx_2() .. xor_lasx_5() */
+#endif /* CONFIG_CPU_HAS_LASX */
diff --git a/arch/loongarch/lib/xor_template.c b/arch/loongarch/lib/xor_template.c
new file mode 100644
index 000000000000..0358ced7fe33
--- /dev/null
+++ b/arch/loongarch/lib/xor_template.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
+ *
+ * Template for XOR operations, instantiated in xor_simd.c.
+ *
+ * Expected preprocessor definitions:
+ *
+ * - LINE_WIDTH
+ * - XOR_FUNC_NAME(nr)
+ * - LD_INOUT_LINE(buf)
+ * - LD_AND_XOR_LINE(buf)
+ * - ST_LINE(buf)
+ */
+
+/* v1 ^= v2 over bytes / LINE_WIDTH whole lines; no-op if bytes < LINE_WIDTH. */
+void XOR_FUNC_NAME(2)(unsigned long bytes, unsigned long * __restrict v1,
+		      const unsigned long * __restrict v2)
+{
+	unsigned long lines = bytes / LINE_WIDTH;
+
+	for (; lines > 0; lines--) {
+		__asm__ __volatile__ (
+			LD_INOUT_LINE(v1)
+			LD_AND_XOR_LINE(v2)
+			ST_LINE(v1)
+		: : [v1] "r"(v1), [v2] "r"(v2) : "memory"
+		);
+
+		v1 += LINE_WIDTH / sizeof(unsigned long);
+		v2 += LINE_WIDTH / sizeof(unsigned long);
+	}
+}
+
+/* v1 ^= v2 ^ v3 over bytes / LINE_WIDTH whole lines; no-op if bytes < LINE_WIDTH. */
+void XOR_FUNC_NAME(3)(unsigned long bytes, unsigned long * __restrict v1,
+		      const unsigned long * __restrict v2,
+		      const unsigned long * __restrict v3)
+{
+	unsigned long lines = bytes / LINE_WIDTH;
+
+	for (; lines > 0; lines--) {
+		__asm__ __volatile__ (
+			LD_INOUT_LINE(v1)
+			LD_AND_XOR_LINE(v2)
+			LD_AND_XOR_LINE(v3)
+			ST_LINE(v1)
+		: : [v1] "r"(v1), [v2] "r"(v2), [v3] "r"(v3) : "memory"
+		);
+
+		v1 += LINE_WIDTH / sizeof(unsigned long);
+		v2 += LINE_WIDTH / sizeof(unsigned long);
+		v3 += LINE_WIDTH / sizeof(unsigned long);
+	}
+}
+
+/* v1 ^= v2 ^ v3 ^ v4 over bytes / LINE_WIDTH whole lines; no-op if bytes < LINE_WIDTH. */
+void XOR_FUNC_NAME(4)(unsigned long bytes, unsigned long * __restrict v1,
+		      const unsigned long * __restrict v2,
+		      const unsigned long * __restrict v3,
+		      const unsigned long * __restrict v4)
+{
+	unsigned long lines = bytes / LINE_WIDTH;
+
+	for (; lines > 0; lines--) {
+		__asm__ __volatile__ (
+			LD_INOUT_LINE(v1)
+			LD_AND_XOR_LINE(v2)
+			LD_AND_XOR_LINE(v3)
+			LD_AND_XOR_LINE(v4)
+			ST_LINE(v1)
+		: : [v1] "r"(v1), [v2] "r"(v2), [v3] "r"(v3), [v4] "r"(v4)
+		: "memory"
+		);
+
+		v1 += LINE_WIDTH / sizeof(unsigned long);
+		v2 += LINE_WIDTH / sizeof(unsigned long);
+		v3 += LINE_WIDTH / sizeof(unsigned long);
+		v4 += LINE_WIDTH / sizeof(unsigned long);
+	}
+}
+
+/* v1 ^= v2 ^ v3 ^ v4 ^ v5 over bytes / LINE_WIDTH whole lines; no-op if bytes < LINE_WIDTH. */
+void XOR_FUNC_NAME(5)(unsigned long bytes, unsigned long * __restrict v1,
+		      const unsigned long * __restrict v2,
+		      const unsigned long * __restrict v3,
+		      const unsigned long * __restrict v4,
+		      const unsigned long * __restrict v5)
+{
+	unsigned long lines = bytes / LINE_WIDTH;
+
+	for (; lines > 0; lines--) {
+		__asm__ __volatile__ (
+			LD_INOUT_LINE(v1)
+			LD_AND_XOR_LINE(v2)
+			LD_AND_XOR_LINE(v3)
+			LD_AND_XOR_LINE(v4)
+			LD_AND_XOR_LINE(v5)
+			ST_LINE(v1)
+		: : [v1] "r"(v1), [v2] "r"(v2), [v3] "r"(v3), [v4] "r"(v4),
+		    [v5] "r"(v5) : "memory"
+		);
+
+		v1 += LINE_WIDTH / sizeof(unsigned long);
+		v2 += LINE_WIDTH / sizeof(unsigned long);
+		v3 += LINE_WIDTH / sizeof(unsigned long);
+		v4 += LINE_WIDTH / sizeof(unsigned long);
+		v5 += LINE_WIDTH / sizeof(unsigned long);
+	}
+}