aboutsummaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
authorKristina Martsenko <kristina.martsenko@arm.com>2024-09-30 17:10:50 +0100
committerCatalin Marinas <catalin.marinas@arm.com>2024-10-17 16:42:51 +0100
commit836ed3c4e473fef9e0814a2ba6dd40f9656c03f1 (patch)
treefcd2bcf0f4a761f16fadedfd87ac1eaf0df1188a
parentarm64: mops: Document booting requirement for HCR_EL2.MCE2 (diff)
downloadwireguard-linux-836ed3c4e473fef9e0814a2ba6dd40f9656c03f1.tar.xz
wireguard-linux-836ed3c4e473fef9e0814a2ba6dd40f9656c03f1.zip
arm64: lib: Use MOPS for memcpy() routines
Make memcpy(), memmove() and memset() use the Armv8.8 FEAT_MOPS instructions when implemented on the CPU. The CPY*/SET* instructions copy or set a block of memory of arbitrary size and alignment. They can be interrupted by the CPU and the copying resumed later. Their performance is expected to be close to the best generic copy/set sequence of loads/stores for a given CPU. Using them in the kernel's copy/set routines therefore avoids the need to periodically rewrite the routines to optimize for new microarchitectures. It could also lead to a performance improvement for some CPUs and systems. With this change the kernel will always use the instructions if they are implemented on the CPU (and have not been disabled by the arm64.nomops command line parameter). When not implemented the usual routines will be used (patched via alternatives). Note, we need to patch B/NOP instead of the whole sequence to avoid executing a partially patched sequence in case the compiler generates a mem*() call inside the alternatives patching code. Note that MOPS instructions have relaxed behavior on Device memory, but it is expected that these routines are not generally used on MMIO. Note: For memcpy(), this uses the CPY* instructions instead of CPYF*, as CPY* allows overlaps between the source and destination buffers, and despite contradicting the C standard, compilers require that memcpy() work on exactly overlapping source and destination: https://gcc.gnu.org/onlinedocs/gcc/Standards.html#C-Language https://reviews.llvm.org/D86993 Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com> Link: https://lore.kernel.org/r/20240930161051.3777828-5-kristina.martsenko@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Diffstat (limited to '')
-rw-r--r--arch/arm64/Kconfig3
-rw-r--r--arch/arm64/lib/memcpy.S19
-rw-r--r--arch/arm64/lib/memset.S20
3 files changed, 40 insertions, 2 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3e29b44d2d7b..d0fe90ea704d 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2155,6 +2155,9 @@ config ARM64_EPAN
if the cpu does not implement the feature.
endmenu # "ARMv8.7 architectural features"
+config AS_HAS_MOPS
+ def_bool $(as-instr,.arch_extension mops)
+
menu "ARMv8.9 architectural features"
config ARM64_POE
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index 4ab48d49c451..9b99106fb95f 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -57,7 +57,7 @@
The loop tail is handled by always copying 64 bytes from the end.
*/
-SYM_FUNC_START(__pi_memcpy)
+SYM_FUNC_START_LOCAL(__pi_memcpy_generic)
add srcend, src, count
add dstend, dstin, count
cmp count, 128
@@ -238,7 +238,24 @@ L(copy64_from_start):
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin]
ret
+SYM_FUNC_END(__pi_memcpy_generic)
+
+#ifdef CONFIG_AS_HAS_MOPS
+ .arch_extension mops
+SYM_FUNC_START(__pi_memcpy)
+alternative_if_not ARM64_HAS_MOPS
+ b __pi_memcpy_generic
+alternative_else_nop_endif
+
+ mov dst, dstin
+ cpyp [dst]!, [src]!, count!
+ cpym [dst]!, [src]!, count!
+ cpye [dst]!, [src]!, count!
+ ret
SYM_FUNC_END(__pi_memcpy)
+#else
+SYM_FUNC_ALIAS(__pi_memcpy, __pi_memcpy_generic)
+#endif
SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
EXPORT_SYMBOL(__memcpy)
diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S
index a5aebe82ad73..97157da65ec6 100644
--- a/arch/arm64/lib/memset.S
+++ b/arch/arm64/lib/memset.S
@@ -26,6 +26,7 @@
*/
dstin .req x0
+val_x .req x1
val .req w1
count .req x2
tmp1 .req x3
@@ -42,7 +43,7 @@ dst .req x8
tmp3w .req w9
tmp3 .req x9
-SYM_FUNC_START(__pi_memset)
+SYM_FUNC_START_LOCAL(__pi_memset_generic)
mov dst, dstin /* Preserve return value. */
and A_lw, val, #255
orr A_lw, A_lw, A_lw, lsl #8
@@ -201,7 +202,24 @@ SYM_FUNC_START(__pi_memset)
ands count, count, zva_bits_x
b.ne .Ltail_maybe_long
ret
+SYM_FUNC_END(__pi_memset_generic)
+
+#ifdef CONFIG_AS_HAS_MOPS
+ .arch_extension mops
+SYM_FUNC_START(__pi_memset)
+alternative_if_not ARM64_HAS_MOPS
+ b __pi_memset_generic
+alternative_else_nop_endif
+
+ mov dst, dstin
+ setp [dst]!, count!, val_x
+ setm [dst]!, count!, val_x
+ sete [dst]!, count!, val_x
+ ret
SYM_FUNC_END(__pi_memset)
+#else
+SYM_FUNC_ALIAS(__pi_memset, __pi_memset_generic)
+#endif
SYM_FUNC_ALIAS(__memset, __pi_memset)
EXPORT_SYMBOL(__memset)