Diffstat (limited to 'arch/arm64/lib')
-rw-r--r--  arch/arm64/lib/clear_page.S    |  15
-rw-r--r--  arch/arm64/lib/copy_page.S     |   5
-rw-r--r--  arch/arm64/lib/crc32.S         |  87
-rw-r--r--  arch/arm64/lib/delay.c         |  12
-rw-r--r--  arch/arm64/lib/insn.c          | 284
-rw-r--r--  arch/arm64/lib/kasan_sw_tags.S |   4
-rw-r--r--  arch/arm64/lib/memchr.S        |   5
-rw-r--r--  arch/arm64/lib/memcmp.S        |   6
-rw-r--r--  arch/arm64/lib/memcpy.S        |  21
-rw-r--r--  arch/arm64/lib/memset.S        |  12
-rw-r--r--  arch/arm64/lib/mte.S           |  18
-rw-r--r--  arch/arm64/lib/strchr.S        |   6
-rw-r--r--  arch/arm64/lib/strcmp.S        | 246
-rw-r--r--  arch/arm64/lib/strlen.S        |   6
-rw-r--r--  arch/arm64/lib/strncmp.S       | 241
-rw-r--r--  arch/arm64/lib/strnlen.S       |   6
-rw-r--r--  arch/arm64/lib/strrchr.S       |   5
-rw-r--r--  arch/arm64/lib/xor-neon.c      | 177
18 files changed, 844 insertions(+), 312 deletions(-)
diff --git a/arch/arm64/lib/clear_page.S b/arch/arm64/lib/clear_page.S
index b84b179edba3..ebde40e7fa2b 100644
--- a/arch/arm64/lib/clear_page.S
+++ b/arch/arm64/lib/clear_page.S
@@ -14,8 +14,9 @@
* Parameters:
* x0 - dest
*/
-SYM_FUNC_START_PI(clear_page)
+SYM_FUNC_START(__pi_clear_page)
mrs x1, dczid_el0
+ tbnz x1, #4, 2f /* Branch if DC ZVA is prohibited */
and w1, w1, #0xf
mov x2, #4
lsl x1, x2, x1
@@ -25,5 +26,15 @@ SYM_FUNC_START_PI(clear_page)
tst x0, #(PAGE_SIZE - 1)
b.ne 1b
ret
-SYM_FUNC_END_PI(clear_page)
+
+2: stnp xzr, xzr, [x0]
+ stnp xzr, xzr, [x0, #16]
+ stnp xzr, xzr, [x0, #32]
+ stnp xzr, xzr, [x0, #48]
+ add x0, x0, #64
+ tst x0, #(PAGE_SIZE - 1)
+ b.ne 2b
+ ret
+SYM_FUNC_END(__pi_clear_page)
+SYM_FUNC_ALIAS(clear_page, __pi_clear_page)
EXPORT_SYMBOL(clear_page)
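
The tbnz above keys off DCZID_EL0.DZP (bit 4): when set, DC ZVA is prohibited and the routine falls back to the STNP loop; otherwise the zeroing block size is 4 << DCZID_EL0.BS bytes (BS is the log2 of the block size in 4-byte words). A minimal C sketch of that decoding, with illustrative names rather than kernel API:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define DCZID_DZP	(1u << 4)	/* DC ZVA prohibited when set */
#define DCZID_BS_MASK	0xfu		/* log2(block size in 4-byte words) */

/* Zeroing block size implied by a DCZID_EL0 value, in bytes. */
static size_t zva_block_bytes(uint64_t dczid)
{
	return (size_t)4 << (dczid & DCZID_BS_MASK);
}

/* True when the DC ZVA fast path may be used. */
static bool zva_allowed(uint64_t dczid)
{
	return !(dczid & DCZID_DZP);
}
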
diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S
index 29144f4cd449..c336d2ffdec5 100644
--- a/arch/arm64/lib/copy_page.S
+++ b/arch/arm64/lib/copy_page.S
@@ -17,7 +17,7 @@
* x0 - dest
* x1 - src
*/
-SYM_FUNC_START_PI(copy_page)
+SYM_FUNC_START(__pi_copy_page)
alternative_if ARM64_HAS_NO_HW_PREFETCH
// Prefetch three cache lines ahead.
prfm pldl1strm, [x1, #128]
@@ -75,5 +75,6 @@ alternative_else_nop_endif
stnp x16, x17, [x0, #112 - 256]
ret
-SYM_FUNC_END_PI(copy_page)
+SYM_FUNC_END(__pi_copy_page)
+SYM_FUNC_ALIAS(copy_page, __pi_copy_page)
EXPORT_SYMBOL(copy_page)
diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S
index 0f9e10ecda23..8340dccff46f 100644
--- a/arch/arm64/lib/crc32.S
+++ b/arch/arm64/lib/crc32.S
@@ -11,7 +11,44 @@
.arch armv8-a+crc
- .macro __crc32, c
+ .macro byteorder, reg, be
+ .if \be
+CPU_LE( rev \reg, \reg )
+ .else
+CPU_BE( rev \reg, \reg )
+ .endif
+ .endm
+
+ .macro byteorder16, reg, be
+ .if \be
+CPU_LE( rev16 \reg, \reg )
+ .else
+CPU_BE( rev16 \reg, \reg )
+ .endif
+ .endm
+
+ .macro bitorder, reg, be
+ .if \be
+ rbit \reg, \reg
+ .endif
+ .endm
+
+ .macro bitorder16, reg, be
+ .if \be
+ rbit \reg, \reg
+ lsr \reg, \reg, #16
+ .endif
+ .endm
+
+ .macro bitorder8, reg, be
+ .if \be
+ rbit \reg, \reg
+ lsr \reg, \reg, #24
+ .endif
+ .endm
+
+ .macro __crc32, c, be=0
+ bitorder w0, \be
cmp x2, #16
b.lt 8f // less than 16 bytes
@@ -24,10 +61,14 @@
add x8, x8, x1
add x1, x1, x7
ldp x5, x6, [x8]
-CPU_BE( rev x3, x3 )
-CPU_BE( rev x4, x4 )
-CPU_BE( rev x5, x5 )
-CPU_BE( rev x6, x6 )
+ byteorder x3, \be
+ byteorder x4, \be
+ byteorder x5, \be
+ byteorder x6, \be
+ bitorder x3, \be
+ bitorder x4, \be
+ bitorder x5, \be
+ bitorder x6, \be
tst x7, #8
crc32\c\()x w8, w0, x3
@@ -55,33 +96,43 @@ CPU_BE( rev x6, x6 )
32: ldp x3, x4, [x1], #32
sub x2, x2, #32
ldp x5, x6, [x1, #-16]
-CPU_BE( rev x3, x3 )
-CPU_BE( rev x4, x4 )
-CPU_BE( rev x5, x5 )
-CPU_BE( rev x6, x6 )
+ byteorder x3, \be
+ byteorder x4, \be
+ byteorder x5, \be
+ byteorder x6, \be
+ bitorder x3, \be
+ bitorder x4, \be
+ bitorder x5, \be
+ bitorder x6, \be
crc32\c\()x w0, w0, x3
crc32\c\()x w0, w0, x4
crc32\c\()x w0, w0, x5
crc32\c\()x w0, w0, x6
cbnz x2, 32b
-0: ret
+0: bitorder w0, \be
+ ret
8: tbz x2, #3, 4f
ldr x3, [x1], #8
-CPU_BE( rev x3, x3 )
+ byteorder x3, \be
+ bitorder x3, \be
crc32\c\()x w0, w0, x3
4: tbz x2, #2, 2f
ldr w3, [x1], #4
-CPU_BE( rev w3, w3 )
+ byteorder w3, \be
+ bitorder w3, \be
crc32\c\()w w0, w0, w3
2: tbz x2, #1, 1f
ldrh w3, [x1], #2
-CPU_BE( rev16 w3, w3 )
+ byteorder16 w3, \be
+ bitorder16 w3, \be
crc32\c\()h w0, w0, w3
1: tbz x2, #0, 0f
ldrb w3, [x1]
+ bitorder8 w3, \be
crc32\c\()b w0, w0, w3
-0: ret
+0: bitorder w0, \be
+ ret
.endm
.align 5
@@ -99,3 +150,11 @@ alternative_if_not ARM64_HAS_CRC32
alternative_else_nop_endif
__crc32 c
SYM_FUNC_END(__crc32c_le)
+
+ .align 5
+SYM_FUNC_START(crc32_be)
+alternative_if_not ARM64_HAS_CRC32
+ b crc32_be_base
+alternative_else_nop_endif
+ __crc32 be=1
+SYM_FUNC_END(crc32_be)
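
The bitorder macros let the reflected (little-endian) CRC32 instructions compute the unreflected big-endian CRC: inputs and the running CRC are bit-reversed on the way in, and the result is reversed back on the way out. As a slow but assumption-free model of what crc32_be must return (polynomial 0x04c11db7, matching lib/crc32.c):

#include <stddef.h>
#include <stdint.h>

static uint32_t crc32_be_ref(uint32_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= (uint32_t)*p++ << 24;
		for (int i = 0; i < 8; i++)
			crc = (crc & 0x80000000u) ? (crc << 1) ^ 0x04c11db7u
						  : crc << 1;
	}
	return crc;
}
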
diff --git a/arch/arm64/lib/delay.c b/arch/arm64/lib/delay.c
index 1688af0a4c97..5b7890139bc2 100644
--- a/arch/arm64/lib/delay.c
+++ b/arch/arm64/lib/delay.c
@@ -27,7 +27,17 @@ void __delay(unsigned long cycles)
{
cycles_t start = get_cycles();
- if (arch_timer_evtstrm_available()) {
+ if (cpus_have_const_cap(ARM64_HAS_WFXT)) {
+ u64 end = start + cycles;
+
+ /*
+ * Start with WFIT. If an interrupt makes us resume
+ * early, use a WFET loop to complete the delay.
+ */
+ wfit(end);
+ while ((get_cycles() - start) < cycles)
+ wfet(end);
+ } else if (arch_timer_evtstrm_available()) {
const cycles_t timer_evt_period =
USECS_TO_CYCLES(ARCH_TIMER_EVT_STREAM_PERIOD_US);
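
The guard in the WFET loop is wrap-safe: with unsigned cycle counters, now - start yields the elapsed count modulo 2^64 even if the counter wraps between the two reads. A standalone check of that arithmetic:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t start = UINT64_MAX - 5;	/* counter just before wrap */
	uint64_t now = 10;			/* sampled after the wrap */

	assert(now - start == 16);		/* elapsed ticks, mod 2^64 */
	return 0;
}
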
diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c
index fccfe363e567..49e972beeac7 100644
--- a/arch/arm64/lib/insn.c
+++ b/arch/arm64/lib/insn.c
@@ -299,36 +299,31 @@ static u32 aarch64_insn_encode_register(enum aarch64_insn_register_type type,
return insn;
}
+static const u32 aarch64_insn_ldst_size[] = {
+ [AARCH64_INSN_SIZE_8] = 0,
+ [AARCH64_INSN_SIZE_16] = 1,
+ [AARCH64_INSN_SIZE_32] = 2,
+ [AARCH64_INSN_SIZE_64] = 3,
+};
+
static u32 aarch64_insn_encode_ldst_size(enum aarch64_insn_size_type type,
u32 insn)
{
u32 size;
- switch (type) {
- case AARCH64_INSN_SIZE_8:
- size = 0;
- break;
- case AARCH64_INSN_SIZE_16:
- size = 1;
- break;
- case AARCH64_INSN_SIZE_32:
- size = 2;
- break;
- case AARCH64_INSN_SIZE_64:
- size = 3;
- break;
- default:
+ if (type < AARCH64_INSN_SIZE_8 || type > AARCH64_INSN_SIZE_64) {
pr_err("%s: unknown size encoding %d\n", __func__, type);
return AARCH64_BREAK_FAULT;
}
+ size = aarch64_insn_ldst_size[type];
insn &= ~GENMASK(31, 30);
insn |= size << 30;
return insn;
}
-static inline long branch_imm_common(unsigned long pc, unsigned long addr,
+static inline long label_imm_common(unsigned long pc, unsigned long addr,
long range)
{
long offset;
@@ -359,7 +354,7 @@ u32 __kprobes aarch64_insn_gen_branch_imm(unsigned long pc, unsigned long addr,
* ARM64 virtual address arrangement guarantees all kernel and module
* texts are within +/-128M.
*/
- offset = branch_imm_common(pc, addr, SZ_128M);
+ offset = label_imm_common(pc, addr, SZ_128M);
if (offset >= SZ_128M)
return AARCH64_BREAK_FAULT;
@@ -387,7 +382,7 @@ u32 aarch64_insn_gen_comp_branch_imm(unsigned long pc, unsigned long addr,
u32 insn;
long offset;
- offset = branch_imm_common(pc, addr, SZ_1M);
+ offset = label_imm_common(pc, addr, SZ_1M);
if (offset >= SZ_1M)
return AARCH64_BREAK_FAULT;
@@ -426,7 +421,7 @@ u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr,
u32 insn;
long offset;
- offset = branch_imm_common(pc, addr, SZ_1M);
+ offset = label_imm_common(pc, addr, SZ_1M);
insn = aarch64_insn_get_bcond_value();
@@ -504,6 +499,72 @@ u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg,
offset);
}
+u32 aarch64_insn_gen_load_store_imm(enum aarch64_insn_register reg,
+ enum aarch64_insn_register base,
+ unsigned int imm,
+ enum aarch64_insn_size_type size,
+ enum aarch64_insn_ldst_type type)
+{
+ u32 insn;
+ u32 shift;
+
+ if (size < AARCH64_INSN_SIZE_8 || size > AARCH64_INSN_SIZE_64) {
+ pr_err("%s: unknown size encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ shift = aarch64_insn_ldst_size[size];
+ if (imm & ~(BIT(12 + shift) - BIT(shift))) {
+ pr_err("%s: invalid imm: %d\n", __func__, imm);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ imm >>= shift;
+
+ switch (type) {
+ case AARCH64_INSN_LDST_LOAD_IMM_OFFSET:
+ insn = aarch64_insn_get_ldr_imm_value();
+ break;
+ case AARCH64_INSN_LDST_STORE_IMM_OFFSET:
+ insn = aarch64_insn_get_str_imm_value();
+ break;
+ default:
+ pr_err("%s: unknown load/store encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_ldst_size(size, insn);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
+ base);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm);
+}
+
+u32 aarch64_insn_gen_load_literal(unsigned long pc, unsigned long addr,
+ enum aarch64_insn_register reg,
+ bool is64bit)
+{
+ u32 insn;
+ long offset;
+
+ offset = label_imm_common(pc, addr, SZ_1M);
+ if (offset >= SZ_1M)
+ return AARCH64_BREAK_FAULT;
+
+ insn = aarch64_insn_get_ldr_lit_value();
+
+ if (is64bit)
+ insn |= BIT(30);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn,
+ offset >> 2);
+}
+
u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1,
enum aarch64_insn_register reg2,
enum aarch64_insn_register base,
@@ -578,10 +639,16 @@ u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg,
switch (type) {
case AARCH64_INSN_LDST_LOAD_EX:
+ case AARCH64_INSN_LDST_LOAD_ACQ_EX:
insn = aarch64_insn_get_load_ex_value();
+ if (type == AARCH64_INSN_LDST_LOAD_ACQ_EX)
+ insn |= BIT(15);
break;
case AARCH64_INSN_LDST_STORE_EX:
+ case AARCH64_INSN_LDST_STORE_REL_EX:
insn = aarch64_insn_get_store_ex_value();
+ if (type == AARCH64_INSN_LDST_STORE_REL_EX)
+ insn |= BIT(15);
break;
default:
pr_err("%s: unknown load/store exclusive encoding %d\n", __func__, type);
@@ -603,12 +670,65 @@ u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg,
state);
}
-u32 aarch64_insn_gen_ldadd(enum aarch64_insn_register result,
- enum aarch64_insn_register address,
- enum aarch64_insn_register value,
- enum aarch64_insn_size_type size)
+#ifdef CONFIG_ARM64_LSE_ATOMICS
+static u32 aarch64_insn_encode_ldst_order(enum aarch64_insn_mem_order_type type,
+ u32 insn)
+{
+ u32 order;
+
+ switch (type) {
+ case AARCH64_INSN_MEM_ORDER_NONE:
+ order = 0;
+ break;
+ case AARCH64_INSN_MEM_ORDER_ACQ:
+ order = 2;
+ break;
+ case AARCH64_INSN_MEM_ORDER_REL:
+ order = 1;
+ break;
+ case AARCH64_INSN_MEM_ORDER_ACQREL:
+ order = 3;
+ break;
+ default:
+ pr_err("%s: unknown mem order %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn &= ~GENMASK(23, 22);
+ insn |= order << 22;
+
+ return insn;
+}
+
+u32 aarch64_insn_gen_atomic_ld_op(enum aarch64_insn_register result,
+ enum aarch64_insn_register address,
+ enum aarch64_insn_register value,
+ enum aarch64_insn_size_type size,
+ enum aarch64_insn_mem_atomic_op op,
+ enum aarch64_insn_mem_order_type order)
{
- u32 insn = aarch64_insn_get_ldadd_value();
+ u32 insn;
+
+ switch (op) {
+ case AARCH64_INSN_MEM_ATOMIC_ADD:
+ insn = aarch64_insn_get_ldadd_value();
+ break;
+ case AARCH64_INSN_MEM_ATOMIC_CLR:
+ insn = aarch64_insn_get_ldclr_value();
+ break;
+ case AARCH64_INSN_MEM_ATOMIC_EOR:
+ insn = aarch64_insn_get_ldeor_value();
+ break;
+ case AARCH64_INSN_MEM_ATOMIC_SET:
+ insn = aarch64_insn_get_ldset_value();
+ break;
+ case AARCH64_INSN_MEM_ATOMIC_SWP:
+ insn = aarch64_insn_get_swp_value();
+ break;
+ default:
+ pr_err("%s: unimplemented mem atomic op %d\n", __func__, op);
+ return AARCH64_BREAK_FAULT;
+ }
switch (size) {
case AARCH64_INSN_SIZE_32:
@@ -621,6 +741,8 @@ u32 aarch64_insn_gen_ldadd(enum aarch64_insn_register result,
insn = aarch64_insn_encode_ldst_size(size, insn);
+ insn = aarch64_insn_encode_ldst_order(order, insn);
+
insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn,
result);
@@ -631,17 +753,68 @@ u32 aarch64_insn_gen_ldadd(enum aarch64_insn_register result,
value);
}
-u32 aarch64_insn_gen_stadd(enum aarch64_insn_register address,
- enum aarch64_insn_register value,
- enum aarch64_insn_size_type size)
+static u32 aarch64_insn_encode_cas_order(enum aarch64_insn_mem_order_type type,
+ u32 insn)
{
- /*
- * STADD is simply encoded as an alias for LDADD with XZR as
- * the destination register.
- */
- return aarch64_insn_gen_ldadd(AARCH64_INSN_REG_ZR, address,
- value, size);
+ u32 order;
+
+ switch (type) {
+ case AARCH64_INSN_MEM_ORDER_NONE:
+ order = 0;
+ break;
+ case AARCH64_INSN_MEM_ORDER_ACQ:
+ order = BIT(22);
+ break;
+ case AARCH64_INSN_MEM_ORDER_REL:
+ order = BIT(15);
+ break;
+ case AARCH64_INSN_MEM_ORDER_ACQREL:
+ order = BIT(15) | BIT(22);
+ break;
+ default:
+ pr_err("%s: unknown mem order %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn &= ~(BIT(15) | BIT(22));
+ insn |= order;
+
+ return insn;
+}
+
+u32 aarch64_insn_gen_cas(enum aarch64_insn_register result,
+ enum aarch64_insn_register address,
+ enum aarch64_insn_register value,
+ enum aarch64_insn_size_type size,
+ enum aarch64_insn_mem_order_type order)
+{
+ u32 insn;
+
+ switch (size) {
+ case AARCH64_INSN_SIZE_32:
+ case AARCH64_INSN_SIZE_64:
+ break;
+ default:
+ pr_err("%s: unimplemented size encoding %d\n", __func__, size);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_get_cas_value();
+
+ insn = aarch64_insn_encode_ldst_size(size, insn);
+
+ insn = aarch64_insn_encode_cas_order(order, insn);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn,
+ result);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
+ address);
+
+ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn,
+ value);
}
+#endif
static u32 aarch64_insn_encode_prfm_imm(enum aarch64_insn_prfm_type type,
enum aarch64_insn_prfm_target target,
@@ -1379,7 +1552,7 @@ static u32 aarch64_encode_immediate(u64 imm,
* Compute the rotation to get a continuous set of
* ones, with the first bit set at position 0
*/
- ror = fls(~imm);
+ ror = fls64(~imm);
}
/*
@@ -1456,3 +1629,48 @@ u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant,
insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn);
return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, Rm);
}
+
+u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type)
+{
+ u32 opt;
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_MB_SY:
+ opt = 0xf;
+ break;
+ case AARCH64_INSN_MB_ST:
+ opt = 0xe;
+ break;
+ case AARCH64_INSN_MB_LD:
+ opt = 0xd;
+ break;
+ case AARCH64_INSN_MB_ISH:
+ opt = 0xb;
+ break;
+ case AARCH64_INSN_MB_ISHST:
+ opt = 0xa;
+ break;
+ case AARCH64_INSN_MB_ISHLD:
+ opt = 0x9;
+ break;
+ case AARCH64_INSN_MB_NSH:
+ opt = 0x7;
+ break;
+ case AARCH64_INSN_MB_NSHST:
+ opt = 0x6;
+ break;
+ case AARCH64_INSN_MB_NSHLD:
+ opt = 0x5;
+ break;
+ default:
+ pr_err("%s: unknown dmb type %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_get_dmb_value();
+ insn &= ~GENMASK(11, 8);
+ insn |= (opt << 8);
+
+ return insn;
+}
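
In aarch64_insn_gen_load_store_imm() the immediate is an unsigned 12-bit field scaled by the access size, so the mask check rejects offsets that are misaligned or out of range; for a 64-bit access (shift of 3) legal offsets are 0, 8, ..., 32760. A sketch of the same check with illustrative names:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Offsets representable by the scaled 12-bit LDR/STR immediate field. */
static bool ldst_imm_ok(uint64_t imm, unsigned int shift)
{
	uint64_t valid = (1ull << (12 + shift)) - (1ull << shift);

	return (imm & ~valid) == 0;
}

int main(void)
{
	assert(ldst_imm_ok(32760, 3));	/* max 64-bit offset: 4095 * 8 */
	assert(!ldst_imm_ok(32768, 3));	/* out of range after scaling */
	assert(!ldst_imm_ok(4, 3));	/* misaligned for a 64-bit access */
	return 0;
}
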
diff --git a/arch/arm64/lib/kasan_sw_tags.S b/arch/arm64/lib/kasan_sw_tags.S
index 5b04464c045e..20784ce75def 100644
--- a/arch/arm64/lib/kasan_sw_tags.S
+++ b/arch/arm64/lib/kasan_sw_tags.S
@@ -38,9 +38,7 @@
* incremented by 256 prior to return).
*/
SYM_CODE_START(__hwasan_tag_mismatch)
-#ifdef BTI_C
- BTI_C
-#endif
+ bti c
add x29, sp, #232
stp x2, x3, [sp, #8 * 2]
stp x4, x5, [sp, #8 * 4]
diff --git a/arch/arm64/lib/memchr.S b/arch/arm64/lib/memchr.S
index 7c2276fdab54..37a9f2a4f7f4 100644
--- a/arch/arm64/lib/memchr.S
+++ b/arch/arm64/lib/memchr.S
@@ -38,7 +38,7 @@
.p2align 4
nop
-SYM_FUNC_START_WEAK_PI(memchr)
+SYM_FUNC_START(__pi_memchr)
and chrin, chrin, #0xff
lsr wordcnt, cntin, #3
cbz wordcnt, L(byte_loop)
@@ -71,5 +71,6 @@ CPU_LE( rev tmp, tmp)
L(not_found):
mov result, #0
ret
-SYM_FUNC_END_PI(memchr)
+SYM_FUNC_END(__pi_memchr)
+SYM_FUNC_ALIAS_WEAK(memchr, __pi_memchr)
EXPORT_SYMBOL_NOKASAN(memchr)
diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S
index 7d956384222f..a5ccf2c55f91 100644
--- a/arch/arm64/lib/memcmp.S
+++ b/arch/arm64/lib/memcmp.S
@@ -32,7 +32,7 @@
#define tmp1 x7
#define tmp2 x8
-SYM_FUNC_START_WEAK_PI(memcmp)
+SYM_FUNC_START(__pi_memcmp)
subs limit, limit, 8
b.lo L(less8)
@@ -134,6 +134,6 @@ L(byte_loop):
b.eq L(byte_loop)
sub result, data1w, data2w
ret
-
-SYM_FUNC_END_PI(memcmp)
+SYM_FUNC_END(__pi_memcmp)
+SYM_FUNC_ALIAS_WEAK(memcmp, __pi_memcmp)
EXPORT_SYMBOL_NOKASAN(memcmp)
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index b82fd64ee1e1..4ab48d49c451 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -57,10 +57,7 @@
The loop tail is handled by always copying 64 bytes from the end.
*/
-SYM_FUNC_START_ALIAS(__memmove)
-SYM_FUNC_START_WEAK_ALIAS_PI(memmove)
-SYM_FUNC_START_ALIAS(__memcpy)
-SYM_FUNC_START_WEAK_PI(memcpy)
+SYM_FUNC_START(__pi_memcpy)
add srcend, src, count
add dstend, dstin, count
cmp count, 128
@@ -241,12 +238,16 @@ L(copy64_from_start):
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin]
ret
+SYM_FUNC_END(__pi_memcpy)
-SYM_FUNC_END_PI(memcpy)
-EXPORT_SYMBOL(memcpy)
-SYM_FUNC_END_ALIAS(__memcpy)
+SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
EXPORT_SYMBOL(__memcpy)
-SYM_FUNC_END_ALIAS_PI(memmove)
-EXPORT_SYMBOL(memmove)
-SYM_FUNC_END_ALIAS(__memmove)
+SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
+EXPORT_SYMBOL(memcpy)
+
+SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)
+
+SYM_FUNC_ALIAS(__memmove, __pi_memmove)
EXPORT_SYMBOL(__memmove)
+SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
+EXPORT_SYMBOL(memmove)
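
The __pi_ ("position independent") symbol is now the only implementation; __memcpy is a strong alias to it and memcpy a weak one that instrumented builds may override. A userspace model of that layout, assuming SYM_FUNC_ALIAS and SYM_FUNC_ALIAS_WEAK behave like the GNU alias attributes sketched here:

#include <stdio.h>

static void __pi_impl(void)
{
	puts("real implementation");
}

/* Strong alias: always resolves to __pi_impl. */
extern __typeof(__pi_impl) strong_name __attribute__((alias("__pi_impl")));

/* Weak alias: another object file may supply its own definition. */
extern __typeof(__pi_impl) weak_name __attribute__((weak, alias("__pi_impl")));

int main(void)
{
	strong_name();
	weak_name();
	return 0;
}
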
diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S
index a9c1c9a01ea9..a5aebe82ad73 100644
--- a/arch/arm64/lib/memset.S
+++ b/arch/arm64/lib/memset.S
@@ -42,8 +42,7 @@ dst .req x8
tmp3w .req w9
tmp3 .req x9
-SYM_FUNC_START_ALIAS(__memset)
-SYM_FUNC_START_WEAK_PI(memset)
+SYM_FUNC_START(__pi_memset)
mov dst, dstin /* Preserve return value. */
and A_lw, val, #255
orr A_lw, A_lw, A_lw, lsl #8
@@ -202,7 +201,10 @@ SYM_FUNC_START_WEAK_PI(memset)
ands count, count, zva_bits_x
b.ne .Ltail_maybe_long
ret
-SYM_FUNC_END_PI(memset)
-EXPORT_SYMBOL(memset)
-SYM_FUNC_END_ALIAS(__memset)
+SYM_FUNC_END(__pi_memset)
+
+SYM_FUNC_ALIAS(__memset, __pi_memset)
EXPORT_SYMBOL(__memset)
+
+SYM_FUNC_ALIAS_WEAK(memset, __pi_memset)
+EXPORT_SYMBOL(memset)
diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S
index e83643b3995f..1b7c93ae7e63 100644
--- a/arch/arm64/lib/mte.S
+++ b/arch/arm64/lib/mte.S
@@ -18,7 +18,7 @@
*/
.macro multitag_transfer_size, reg, tmp
mrs_s \reg, SYS_GMID_EL1
- ubfx \reg, \reg, #SYS_GMID_EL1_BS_SHIFT, #SYS_GMID_EL1_BS_SIZE
+ ubfx \reg, \reg, #GMID_EL1_BS_SHIFT, #GMID_EL1_BS_SIZE
mov \tmp, #4
lsl \reg, \tmp, \reg
.endm
@@ -43,17 +43,23 @@ SYM_FUNC_END(mte_clear_page_tags)
* x0 - address to the beginning of the page
*/
SYM_FUNC_START(mte_zero_clear_page_tags)
+ and x0, x0, #(1 << MTE_TAG_SHIFT) - 1 // clear the tag
mrs x1, dczid_el0
+ tbnz x1, #4, 2f // Branch if DC GZVA is prohibited
and w1, w1, #0xf
mov x2, #4
lsl x1, x2, x1
- and x0, x0, #(1 << MTE_TAG_SHIFT) - 1 // clear the tag
1: dc gzva, x0
add x0, x0, x1
tst x0, #(PAGE_SIZE - 1)
b.ne 1b
ret
+
+2: stz2g x0, [x0], #(MTE_GRANULE_SIZE * 2)
+ tst x0, #(PAGE_SIZE - 1)
+ b.ne 2b
+ ret
SYM_FUNC_END(mte_zero_clear_page_tags)
/*
@@ -87,7 +93,7 @@ SYM_FUNC_START(mte_copy_tags_from_user)
mov x3, x1
cbz x2, 2f
1:
- user_ldst 2f, ldtrb, w4, x1, 0
+USER(2f, ldtrb w4, [x1])
lsl x4, x4, #MTE_TAG_SHIFT
stg x4, [x0], #MTE_GRANULE_SIZE
add x1, x1, #1
@@ -114,7 +120,7 @@ SYM_FUNC_START(mte_copy_tags_to_user)
1:
ldg x4, [x1]
ubfx x4, x4, #MTE_TAG_SHIFT, #MTE_TAG_SIZE
- user_ldst 2f, sttrb, w4, x0, 0
+USER(2f, sttrb w4, [x0])
add x0, x0, #1
add x1, x1, #MTE_GRANULE_SIZE
subs x2, x2, #1
@@ -128,7 +134,7 @@ SYM_FUNC_END(mte_copy_tags_to_user)
/*
* Save the tags in a page
* x0 - page address
- * x1 - tag storage
+ * x1 - tag storage, MTE_PAGE_TAG_STORAGE bytes
*/
SYM_FUNC_START(mte_save_page_tags)
multitag_transfer_size x7, x5
@@ -152,7 +158,7 @@ SYM_FUNC_END(mte_save_page_tags)
/*
* Restore the tags in a page
* x0 - page address
- * x1 - tag storage
+ * x1 - tag storage, MTE_PAGE_TAG_STORAGE bytes
*/
SYM_FUNC_START(mte_restore_page_tags)
multitag_transfer_size x7, x5
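
The MTE_PAGE_TAG_STORAGE sizing referenced in the comments follows from one 4-bit tag per 16-byte granule, so a 4 KiB page carries 256 tags in 128 bytes. A sketch of the arithmetic (macro names mirror the kernel's; the 4 KiB page size is an assumption):

#include <assert.h>

#define PAGE_SIZE		4096	/* assumed 4 KiB pages */
#define MTE_GRANULE_SIZE	16	/* bytes covered by one tag */
#define MTE_TAG_SIZE		4	/* bits per tag */

#define MTE_GRANULES_PER_PAGE	(PAGE_SIZE / MTE_GRANULE_SIZE)
#define MTE_PAGE_TAG_STORAGE	(MTE_GRANULES_PER_PAGE * MTE_TAG_SIZE / 8)

int main(void)
{
	assert(MTE_GRANULES_PER_PAGE == 256);
	assert(MTE_PAGE_TAG_STORAGE == 128);	/* bytes of tags per page */
	return 0;
}
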
diff --git a/arch/arm64/lib/strchr.S b/arch/arm64/lib/strchr.S
index 1f47eae3b0d6..94ee67a6b212 100644
--- a/arch/arm64/lib/strchr.S
+++ b/arch/arm64/lib/strchr.S
@@ -18,7 +18,7 @@
* Returns:
* x0 - address of first occurrence of 'c' or 0
*/
-SYM_FUNC_START_WEAK(strchr)
+SYM_FUNC_START(__pi_strchr)
and w1, w1, #0xff
1: ldrb w2, [x0], #1
cmp w2, w1
@@ -28,5 +28,7 @@ SYM_FUNC_START_WEAK(strchr)
cmp w2, w1
csel x0, x0, xzr, eq
ret
-SYM_FUNC_END(strchr)
+SYM_FUNC_END(__pi_strchr)
+
+SYM_FUNC_ALIAS_WEAK(strchr, __pi_strchr)
EXPORT_SYMBOL_NOKASAN(strchr)
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
index 83bcad72ec97..9b89b4533607 100644
--- a/arch/arm64/lib/strcmp.S
+++ b/arch/arm64/lib/strcmp.S
@@ -1,9 +1,9 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Copyright (c) 2012-2021, Arm Limited.
+ * Copyright (c) 2012-2022, Arm Limited.
*
* Adapted from the original at:
- * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/strcmp.S
+ * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strcmp.S
*/
#include <linux/linkage.h>
@@ -11,166 +11,180 @@
/* Assumptions:
*
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64.
+ * MTE compatible.
*/
#define L(label) .L ## label
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-/* Parameters and result. */
#define src1 x0
#define src2 x1
#define result x0
-/* Internal variables. */
#define data1 x2
#define data1w w2
#define data2 x3
#define data2w w3
#define has_nul x4
#define diff x5
+#define off1 x5
#define syndrome x6
-#define tmp1 x7
-#define tmp2 x8
-#define tmp3 x9
-#define zeroones x10
-#define pos x11
-
- /* Start of performance-critical section -- one 64B cache line. */
- .align 6
-SYM_FUNC_START_WEAK_PI(strcmp)
- eor tmp1, src1, src2
- mov zeroones, #REP8_01
- tst tmp1, #7
+#define tmp x6
+#define data3 x7
+#define zeroones x8
+#define shift x9
+#define off2 x10
+
+/* On big-endian early bytes are at MSB and on little-endian LSB.
+ LS_FW means shifting towards early bytes. */
+#ifdef __AARCH64EB__
+# define LS_FW lsl
+#else
+# define LS_FW lsr
+#endif
+
+/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word.
+ Since carry propagation makes 0x1 bytes before a NUL byte appear
+ NUL too in big-endian, byte-reverse the data before the NUL check. */
+
+
+SYM_FUNC_START(__pi_strcmp)
+ sub off2, src2, src1
+ mov zeroones, REP8_01
+ and tmp, src1, 7
+ tst off2, 7
b.ne L(misaligned8)
- ands tmp1, src1, #7
- b.ne L(mutual_align)
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
+ cbnz tmp, L(mutual_align)
+
+ .p2align 4
+
L(loop_aligned):
- ldr data1, [src1], #8
- ldr data2, [src2], #8
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
L(start_realigned):
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+#ifdef __AARCH64EB__
+ rev tmp, data1
+ sub has_nul, tmp, zeroones
+ orr tmp, tmp, REP8_7f
+#else
+ sub has_nul, data1, zeroones
+ orr tmp, data1, REP8_7f
+#endif
+ bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_aligned)
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
orr syndrome, diff, has_nul
- cbz syndrome, L(loop_aligned)
- /* End of performance-critical section -- one 64B cache line. */
-
L(end):
-#ifndef __AARCH64EB__
+#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
- Shifting left now will bring the critical information into the
- top bits. */
- clz pos, syndrome
rev data2, data2
- lsl data1, data1, pos
- lsl data2, data2, pos
- /* But we need to zero-extend (char is unsigned) the value and then
- perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
- ret
-#else
- /* For big-endian we cannot use the trick with the syndrome value
- as carry-propagation can corrupt the upper bits if the trailing
- bytes in the string contain 0x01. */
- /* However, if there is no NUL byte in the dword, we can generate
- the result directly. We can't just subtract the bytes as the
- MSB might be significant. */
- cbnz has_nul, 1f
- cmp data1, data2
- cset result, ne
- cneg result, result, lo
- ret
-1:
- /* Re-compute the NUL-byte detection, using a byte-reversed value. */
- rev tmp3, data1
- sub tmp1, tmp3, zeroones
- orr tmp2, tmp3, #REP8_7f
- bic has_nul, tmp1, tmp2
- rev has_nul, has_nul
- orr syndrome, diff, has_nul
- clz pos, syndrome
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
+#endif
+ clz shift, syndrome
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
- lsl data1, data1, pos
- lsl data2, data2, pos
+ lsl data1, data1, shift
+ lsl data2, data2, shift
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
+ lsr data1, data1, 56
+ sub result, data1, data2, lsr 56
ret
-#endif
+
+ .p2align 4
L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
- the bytes that preceed the start point. */
- bic src1, src1, #7
- bic src2, src2, #7
- lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
- ldr data1, [src1], #8
- neg tmp1, tmp1 /* Bits to alignment -64. */
- ldr data2, [src2], #8
- mov tmp2, #~0
-#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
-#else
- /* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
-#endif
- orr data1, data1, tmp2
- orr data2, data2, tmp2
+ the bytes that precede the start point. */
+ bic src1, src1, 7
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
+ neg shift, src2, lsl 3 /* Bits to alignment -64. */
+ mov tmp, -1
+ LS_FW tmp, tmp, shift
+ orr data1, data1, tmp
+ orr data2, data2, tmp
b L(start_realigned)
L(misaligned8):
/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
- checking to make sure that we don't access beyond page boundary in
- SRC2. */
- tst src1, #7
- b.eq L(loop_misaligned)
+ checking to make sure that we don't access beyond the end of SRC2. */
+ cbz tmp, L(src1_aligned)
L(do_misaligned):
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- cmp data1w, #1
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ cmp data1w, 0
+ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
b.ne L(done)
- tst src1, #7
+ tst src1, 7
b.ne L(do_misaligned)
-L(loop_misaligned):
- /* Test if we are within the last dword of the end of a 4K page. If
- yes then jump back to the misaligned loop to copy a byte at a time. */
- and tmp1, src2, #0xff8
- eor tmp1, tmp1, #0xff8
- cbz tmp1, L(do_misaligned)
- ldr data1, [src1], #8
- ldr data2, [src2], #8
-
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+L(src1_aligned):
+ neg shift, src2, lsl 3
+ bic src2, src2, 7
+ ldr data3, [src2], 8
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ lsr tmp, zeroones, shift
+ orr data3, data3, tmp
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ bics has_nul, has_nul, tmp
+ b.ne L(tail)
+
+ sub off1, src2, src1
+
+ .p2align 4
+
+L(loop_unaligned):
+ ldr data3, [src1, off1]
+ ldr data2, [src1, off2]
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ ldr data1, [src1], 8
+ bics has_nul, has_nul, tmp
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_unaligned)
+
+ lsl tmp, has_nul, shift
+#ifdef __AARCH64EB__
+ rev tmp, tmp
+#endif
+ eor diff, data1, data2
+ orr syndrome, diff, tmp
+ cbnz syndrome, L(end)
+L(tail):
+ ldr data1, [src1]
+ neg shift, shift
+ lsr data2, data3, shift
+ lsr has_nul, has_nul, shift
+#ifdef __AARCH64EB__
+ rev data2, data2
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
orr syndrome, diff, has_nul
- cbz syndrome, L(loop_misaligned)
b L(end)
L(done):
sub result, data1, data2
ret
-
-SYM_FUNC_END_PI(strcmp)
-EXPORT_SYMBOL_NOHWKASAN(strcmp)
+SYM_FUNC_END(__pi_strcmp)
+SYM_FUNC_ALIAS_WEAK(strcmp, __pi_strcmp)
+EXPORT_SYMBOL_NOHWKASAN(strcmp)
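
The NUL-detection identity quoted in the header comment can be checked in isolation: (x - 0x01..01) & ~(x | 0x7f..7f) is non-zero exactly when some byte of x is zero; carry propagation may also flag bytes before the NUL (hence the byte-reversal on big-endian), but existence is exact. A standalone check:

#include <assert.h>
#include <stdint.h>

#define REP8_01 0x0101010101010101ull
#define REP8_7f 0x7f7f7f7f7f7f7f7full

static uint64_t has_nul(uint64_t x)
{
	return (x - REP8_01) & ~(x | REP8_7f);
}

int main(void)
{
	assert(has_nul(0x6162630061626364ull) != 0);	/* embedded NUL byte */
	assert(has_nul(0x6162636461626364ull) == 0);	/* no NUL byte */
	return 0;
}
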
diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S
index 1648790e91b3..4919fe81ae54 100644
--- a/arch/arm64/lib/strlen.S
+++ b/arch/arm64/lib/strlen.S
@@ -79,7 +79,7 @@
whether the first fetch, which may be misaligned, crosses a page
boundary. */
-SYM_FUNC_START_WEAK_PI(strlen)
+SYM_FUNC_START(__pi_strlen)
and tmp1, srcin, MIN_PAGE_SIZE - 1
mov zeroones, REP8_01
cmp tmp1, MIN_PAGE_SIZE - 16
@@ -208,6 +208,6 @@ L(page_cross):
csel data1, data1, tmp4, eq
csel data2, data2, tmp2, eq
b L(page_cross_entry)
-
-SYM_FUNC_END_PI(strlen)
+SYM_FUNC_END(__pi_strlen)
+SYM_FUNC_ALIAS_WEAK(strlen, __pi_strlen)
EXPORT_SYMBOL_NOKASAN(strlen)
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
index e42bcfcd37e6..fe7bbc0b42a7 100644
--- a/arch/arm64/lib/strncmp.S
+++ b/arch/arm64/lib/strncmp.S
@@ -1,9 +1,9 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Copyright (c) 2013-2021, Arm Limited.
+ * Copyright (c) 2013-2022, Arm Limited.
*
* Adapted from the original at:
- * https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/strncmp.S
+ * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strncmp.S
*/
#include <linux/linkage.h>
@@ -11,14 +11,14 @@
/* Assumptions:
*
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64.
+ * MTE compatible.
*/
#define L(label) .L ## label
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
/* Parameters and result. */
#define src1 x0
@@ -39,12 +39,26 @@
#define tmp3 x10
#define zeroones x11
#define pos x12
-#define limit_wd x13
-#define mask x14
-#define endloop x15
+#define mask x13
+#define endloop x14
#define count mask
+#define offset pos
+#define neg_offset x15
-SYM_FUNC_START_WEAK_PI(strncmp)
+/* Define endian dependent shift operations.
+ On big-endian early bytes are at MSB and on little-endian LSB.
+ LS_FW means shifting towards early bytes.
+ LS_BK means shifting towards later bytes.
+ */
+#ifdef __AARCH64EB__
+#define LS_FW lsl
+#define LS_BK lsr
+#else
+#define LS_FW lsr
+#define LS_BK lsl
+#endif
+
+SYM_FUNC_START(__pi_strncmp)
cbz limit, L(ret0)
eor tmp1, src1, src2
mov zeroones, #REP8_01
@@ -52,9 +66,6 @@ SYM_FUNC_START_WEAK_PI(strncmp)
and count, src1, #7
b.ne L(misaligned8)
cbnz count, L(mutual_align)
- /* Calculate the number of full and partial words -1. */
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
- lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
@@ -64,56 +75,52 @@ L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
L(start_realigned):
- subs limit_wd, limit_wd, #1
+ subs limit, limit, #8
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, pl /* Last Dword or differences. */
+ csinv endloop, diff, xzr, hi /* Last Dword or differences. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
b.eq L(loop_aligned)
/* End of main loop */
- /* Not reached the limit, must have found the end or a diff. */
- tbz limit_wd, #63, L(not_limit)
-
- /* Limit % 8 == 0 => all bytes significant. */
- ands limit, limit, #7
- b.eq L(not_limit)
-
- lsl limit, limit, #3 /* Bits -> bytes. */
- mov mask, #~0
-#ifdef __AARCH64EB__
- lsr mask, mask, limit
-#else
- lsl mask, mask, limit
-#endif
- bic data1, data1, mask
- bic data2, data2, mask
-
- /* Make sure that the NUL byte is marked in the syndrome. */
- orr has_nul, has_nul, mask
-
-L(not_limit):
+L(full_check):
+#ifndef __AARCH64EB__
orr syndrome, diff, has_nul
-
-#ifndef __AARCH64EB__
+ add limit, limit, 8 /* Rewind limit to before last subs. */
+L(syndrome_check):
+ /* Limit was reached. Check if the NUL byte or the difference
+ is before the limit. */
rev syndrome, syndrome
rev data1, data1
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
- Shifting left now will bring the critical information into the
- top bits. */
clz pos, syndrome
rev data2, data2
lsl data1, data1, pos
+ cmp limit, pos, lsr #3
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
+ csel result, result, xzr, hi
ret
#else
+ /* Not reached the limit, must have found the end or a diff. */
+ tbz limit, #63, L(not_limit)
+ add tmp1, limit, 8
+ cbz tmp1, L(not_limit)
+
+ lsl limit, tmp1, #3 /* Bits -> bytes. */
+ mov mask, #~0
+ lsr mask, mask, limit
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ /* Make sure that the NUL byte is marked in the syndrome. */
+ orr has_nul, has_nul, mask
+
+L(not_limit):
/* For big-endian we cannot use the trick with the syndrome value
as carry-propagation can corrupt the upper bits if the trailing
bytes in the string contain 0x01. */
@@ -134,10 +141,11 @@ L(not_limit):
rev has_nul, has_nul
orr syndrome, diff, has_nul
clz pos, syndrome
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
+L(end_quick):
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
@@ -159,22 +167,12 @@ L(mutual_align):
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
-#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
-#else
- /* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
-#endif
- and tmp3, limit_wd, #7
- lsr limit_wd, limit_wd, #3
- /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
- add limit, limit, count
- add tmp3, tmp3, count
+ LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
+ /* Adjust the limit and ensure it doesn't overflow. */
+ adds limit, limit, count
+ csinv limit, limit, xzr, lo
orr data1, data1, tmp2
orr data2, data2, tmp2
- add limit_wd, limit_wd, tmp3, lsr #3
b L(start_realigned)
.p2align 4
@@ -197,13 +195,11 @@ L(done):
/* Align the SRC1 to a dword by doing a bytewise compare and then do
the dword loop. */
L(try_misaligned_words):
- lsr limit_wd, limit, #3
- cbz count, L(do_misaligned)
+ cbz count, L(src1_aligned)
neg count, count
and count, count, #7
sub limit, limit, count
- lsr limit_wd, limit, #3
L(page_end_loop):
ldrb data1w, [src1], #1
@@ -214,48 +210,101 @@ L(page_end_loop):
subs count, count, #1
b.hi L(page_end_loop)
-L(do_misaligned):
- /* Prepare ourselves for the next page crossing. Unlike the aligned
- loop, we fetch 1 less dword because we risk crossing bounds on
- SRC2. */
- mov count, #8
- subs limit_wd, limit_wd, #1
- b.lo L(done_loop)
-L(loop_misaligned):
- and tmp2, src2, #0xff8
- eor tmp2, tmp2, #0xff8
- cbz tmp2, L(page_end_loop)
+ /* The following diagram explains the comparison of misaligned strings.
+ The bytes are shown in natural order. For little-endian, it is
+ reversed in the registers. The "x" bytes are before the string.
+ The "|" separates data that is loaded at one time.
+ src1 | a a a a a a a a | b b b c c c c c | . . .
+ src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
+
+ After shifting in each step, the data looks like this:
+ STEP_A STEP_B STEP_C
+ data1 a a a a a a a a b b b c c c c c b b b c c c c c
+ data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
+ The bytes with "0" are eliminated from the syndrome via mask.
+
+ Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
+ time from SRC2. The comparison happens in 3 steps. After each step
+ the loop can exit, or read from SRC1 or SRC2. */
+L(src1_aligned):
+ /* Calculate offset from 8 byte alignment to string start in bits. No
+ need to mask offset since shifts are ignoring upper bits. */
+ lsl offset, src2, #3
+ bic src2, src2, #0xf
+ mov mask, -1
+ neg neg_offset, offset
ldr data1, [src1], #8
- ldr data2, [src2], #8
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
- ccmp diff, #0, #0, eq
- b.ne L(not_limit)
- subs limit_wd, limit_wd, #1
- b.pl L(loop_misaligned)
+ ldp tmp1, tmp2, [src2], #16
+ LS_BK mask, mask, neg_offset
+ and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
+ /* Skip the first compare if data in tmp1 is irrelevant. */
+ tbnz offset, 6, L(misaligned_mid_loop)
-L(done_loop):
- /* We found a difference or a NULL before the limit was reached. */
- and limit, limit, #7
- cbz limit, L(not_limit)
- /* Read the last word. */
- sub src1, src1, 8
- sub src2, src2, 8
- ldr data1, [src1, limit]
- ldr data2, [src2, limit]
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
+L(loop_misaligned):
+ /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
+ LS_FW data2, tmp1, offset
+ LS_BK tmp1, tmp2, neg_offset
+ subs limit, limit, #8
+ orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
+ sub has_nul, data1, zeroones
eor diff, data1, data2 /* Non-zero if differences found. */
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
- ccmp diff, #0, #0, eq
- b.ne L(not_limit)
+ orr tmp3, data1, #REP8_7f
+ csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
+ orr tmp3, endloop, has_nul
+ cbnz tmp3, L(full_check)
+
+ ldr data1, [src1], #8
+L(misaligned_mid_loop):
+ /* STEP_B: Compare first part of data1 to second part of tmp2. */
+ LS_FW data2, tmp2, offset
+#ifdef __AARCH64EB__
+ /* For big-endian we do a byte reverse to avoid carry-propagation
+ problem described above. This way we can reuse the has_nul in the
+ next step and also use syndrome value trick at the end. */
+ rev tmp3, data1
+ #define data1_fixed tmp3
+#else
+ #define data1_fixed data1
+#endif
+ sub has_nul, data1_fixed, zeroones
+ orr tmp3, data1_fixed, #REP8_7f
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ cmp limit, neg_offset, lsr #3
+ orr syndrome, diff, has_nul
+ bic syndrome, syndrome, mask /* Ignore later bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ /* STEP_C: Compare second part of data1 to first part of tmp1. */
+ ldp tmp1, tmp2, [src2], #16
+ cmp limit, #8
+ LS_BK data2, tmp1, neg_offset
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ orr syndrome, diff, has_nul
+ and syndrome, syndrome, mask /* Ignore earlier bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ ldr data1, [src1], #8
+ sub limit, limit, #8
+ b L(loop_misaligned)
+
+#ifdef __AARCH64EB__
+L(syndrome_check):
+ clz pos, syndrome
+ cmp pos, limit, lsl #3
+ b.lo L(end_quick)
+#endif
L(ret0):
mov result, #0
ret
-
-SYM_FUNC_END_PI(strncmp)
-EXPORT_SYMBOL_NOHWKASAN(strncmp)
+SYM_FUNC_END(__pi_strncmp)
+SYM_FUNC_ALIAS_WEAK(strncmp, __pi_strncmp)
+EXPORT_SYMBOL_NOHWKASAN(strncmp)
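
In the mutual_align path, the adds/csinv pair clamps the rewound limit so the addition cannot wrap: if adding count carries out, limit saturates to all ones. The equivalent scalar logic:

#include <assert.h>
#include <stdint.h>

/* Mirrors "adds limit, limit, count; csinv limit, limit, xzr, lo". */
static uint64_t limit_add_sat(uint64_t limit, uint64_t count)
{
	uint64_t sum = limit + count;

	return sum >= count ? sum : UINT64_MAX;	/* saturate on carry out */
}

int main(void)
{
	assert(limit_add_sat(100, 7) == 107);
	assert(limit_add_sat(UINT64_MAX - 3, 7) == UINT64_MAX);
	return 0;
}
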
diff --git a/arch/arm64/lib/strnlen.S b/arch/arm64/lib/strnlen.S
index b72913a99038..d5ac0e10a01d 100644
--- a/arch/arm64/lib/strnlen.S
+++ b/arch/arm64/lib/strnlen.S
@@ -47,7 +47,7 @@ limit_wd .req x14
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
-SYM_FUNC_START_WEAK_PI(strnlen)
+SYM_FUNC_START(__pi_strnlen)
cbz limit, .Lhit_limit
mov zeroones, #REP8_01
bic src, srcin, #15
@@ -156,5 +156,7 @@ CPU_LE( lsr tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */
.Lhit_limit:
mov len, limit
ret
-SYM_FUNC_END_PI(strnlen)
+SYM_FUNC_END(__pi_strnlen)
+
+SYM_FUNC_ALIAS_WEAK(strnlen, __pi_strnlen)
EXPORT_SYMBOL_NOKASAN(strnlen)
diff --git a/arch/arm64/lib/strrchr.S b/arch/arm64/lib/strrchr.S
index 13132d1ed6d1..a5123cf0ce12 100644
--- a/arch/arm64/lib/strrchr.S
+++ b/arch/arm64/lib/strrchr.S
@@ -18,7 +18,7 @@
* Returns:
* x0 - address of last occurrence of 'c' or 0
*/
-SYM_FUNC_START_WEAK_PI(strrchr)
+SYM_FUNC_START(__pi_strrchr)
mov x3, #0
and w1, w1, #0xff
1: ldrb w2, [x0], #1
@@ -29,5 +29,6 @@ SYM_FUNC_START_WEAK_PI(strrchr)
b 1b
2: mov x0, x3
ret
-SYM_FUNC_END_PI(strrchr)
+SYM_FUNC_END(__pi_strrchr)
+SYM_FUNC_ALIAS_WEAK(strrchr, __pi_strrchr)
EXPORT_SYMBOL_NOKASAN(strrchr)
diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
index 11bf4f8aca68..96b171995d19 100644
--- a/arch/arm64/lib/xor-neon.c
+++ b/arch/arm64/lib/xor-neon.c
@@ -10,8 +10,8 @@
#include <linux/module.h>
#include <asm/neon-intrinsics.h>
-void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
- unsigned long *p2)
+void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2)
{
uint64_t *dp1 = (uint64_t *)p1;
uint64_t *dp2 = (uint64_t *)p2;
@@ -37,8 +37,9 @@ void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
} while (--lines > 0);
}
-void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
- unsigned long *p2, unsigned long *p3)
+void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3)
{
uint64_t *dp1 = (uint64_t *)p1;
uint64_t *dp2 = (uint64_t *)p2;
@@ -72,8 +73,10 @@ void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
} while (--lines > 0);
}
-void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
- unsigned long *p2, unsigned long *p3, unsigned long *p4)
+void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4)
{
uint64_t *dp1 = (uint64_t *)p1;
uint64_t *dp2 = (uint64_t *)p2;
@@ -115,9 +118,11 @@ void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
} while (--lines > 0);
}
-void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
- unsigned long *p2, unsigned long *p3,
- unsigned long *p4, unsigned long *p5)
+void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4,
+ const unsigned long * __restrict p5)
{
uint64_t *dp1 = (uint64_t *)p1;
uint64_t *dp2 = (uint64_t *)p2;
@@ -167,7 +172,7 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
} while (--lines > 0);
}
-struct xor_block_template const xor_block_inner_neon = {
+struct xor_block_template xor_block_inner_neon __ro_after_init = {
.name = "__inner_neon__",
.do_2 = xor_arm64_neon_2,
.do_3 = xor_arm64_neon_3,
@@ -176,6 +181,158 @@ struct xor_block_template const xor_block_inner_neon = {
};
EXPORT_SYMBOL(xor_block_inner_neon);
+static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
+{
+ uint64x2_t res;
+
+ asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
+ "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
+ : "=w"(res) : "w"(p), "w"(q), "w"(r));
+ return res;
+}
+
+static void xor_arm64_eor3_3(unsigned long bytes,
+ unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3)
+{
+ uint64_t *dp1 = (uint64_t *)p1;
+ uint64_t *dp2 = (uint64_t *)p2;
+ uint64_t *dp3 = (uint64_t *)p3;
+
+ register uint64x2_t v0, v1, v2, v3;
+ long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+ do {
+ /* p1 ^= p2 ^ p3 */
+ v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+ vld1q_u64(dp3 + 0));
+ v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+ vld1q_u64(dp3 + 2));
+ v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+ vld1q_u64(dp3 + 4));
+ v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+ vld1q_u64(dp3 + 6));
+
+ /* store */
+ vst1q_u64(dp1 + 0, v0);
+ vst1q_u64(dp1 + 2, v1);
+ vst1q_u64(dp1 + 4, v2);
+ vst1q_u64(dp1 + 6, v3);
+
+ dp1 += 8;
+ dp2 += 8;
+ dp3 += 8;
+ } while (--lines > 0);
+}
+
+static void xor_arm64_eor3_4(unsigned long bytes,
+ unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4)
+{
+ uint64_t *dp1 = (uint64_t *)p1;
+ uint64_t *dp2 = (uint64_t *)p2;
+ uint64_t *dp3 = (uint64_t *)p3;
+ uint64_t *dp4 = (uint64_t *)p4;
+
+ register uint64x2_t v0, v1, v2, v3;
+ long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+ do {
+ /* p1 ^= p2 ^ p3 */
+ v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+ vld1q_u64(dp3 + 0));
+ v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+ vld1q_u64(dp3 + 2));
+ v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+ vld1q_u64(dp3 + 4));
+ v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+ vld1q_u64(dp3 + 6));
+
+ /* p1 ^= p4 */
+ v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
+ v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
+ v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
+ v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
+
+ /* store */
+ vst1q_u64(dp1 + 0, v0);
+ vst1q_u64(dp1 + 2, v1);
+ vst1q_u64(dp1 + 4, v2);
+ vst1q_u64(dp1 + 6, v3);
+
+ dp1 += 8;
+ dp2 += 8;
+ dp3 += 8;
+ dp4 += 8;
+ } while (--lines > 0);
+}
+
+static void xor_arm64_eor3_5(unsigned long bytes,
+ unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4,
+ const unsigned long * __restrict p5)
+{
+ uint64_t *dp1 = (uint64_t *)p1;
+ uint64_t *dp2 = (uint64_t *)p2;
+ uint64_t *dp3 = (uint64_t *)p3;
+ uint64_t *dp4 = (uint64_t *)p4;
+ uint64_t *dp5 = (uint64_t *)p5;
+
+ register uint64x2_t v0, v1, v2, v3;
+ long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+ do {
+ /* p1 ^= p2 ^ p3 */
+ v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+ vld1q_u64(dp3 + 0));
+ v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+ vld1q_u64(dp3 + 2));
+ v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+ vld1q_u64(dp3 + 4));
+ v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+ vld1q_u64(dp3 + 6));
+
+ /* p1 ^= p4 ^ p5 */
+ v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
+ v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
+ v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
+ v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
+
+ /* store */
+ vst1q_u64(dp1 + 0, v0);
+ vst1q_u64(dp1 + 2, v1);
+ vst1q_u64(dp1 + 4, v2);
+ vst1q_u64(dp1 + 6, v3);
+
+ dp1 += 8;
+ dp2 += 8;
+ dp3 += 8;
+ dp4 += 8;
+ dp5 += 8;
+ } while (--lines > 0);
+}
+
+static int __init xor_neon_init(void)
+{
+ if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
+ xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
+ xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
+ xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
+ }
+ return 0;
+}
+module_init(xor_neon_init);
+
+static void __exit xor_neon_exit(void)
+{
+}
+module_exit(xor_neon_exit);
+
MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");
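
EOR3 (from the SHA3 extension) folds three vector XOR operands into one instruction; the helper computes p ^ q ^ r per 128-bit lane, and because XOR is associative the chained eor3/veorq folding above yields the same parity as a plain XOR chain. A scalar reference for one 64-bit lane:

#include <assert.h>
#include <stdint.h>

static uint64_t eor3_scalar(uint64_t p, uint64_t q, uint64_t r)
{
	return p ^ q ^ r;	/* one lane of the EOR3 helper */
}

int main(void)
{
	uint64_t a = 0x1111, b = 0x2222, c = 0x4444, d = 0x8888, e = 0xf000;

	assert(eor3_scalar(eor3_scalar(a, b, c), d, e) ==
	       (a ^ b ^ c ^ d ^ e));
	return 0;
}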