From 7ef377c4d4abb7a2a74fc319dc1bce46f2449af7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 13 Apr 2025 08:43:50 -0700 Subject: lib/crc: make the CPU feature static keys __ro_after_init All of the CRC library's CPU feature static_keys are initialized by initcalls and never change afterwards, so there's no need for them to be in the regular .data section. Put them in .data..ro_after_init instead. Reviewed-by: "Martin K. Petersen" Acked-by: Ard Biesheuvel Acked-by: Heiko Carstens # s390 Link: https://lore.kernel.org/r/20250413154350.10819-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm/lib/crc-t10dif-glue.c | 4 ++-- arch/arm/lib/crc32-glue.c | 4 ++-- arch/arm64/lib/crc-t10dif-glue.c | 4 ++-- arch/loongarch/lib/crc32-loongarch.c | 2 +- arch/mips/lib/crc32-mips.c | 2 +- arch/powerpc/lib/crc-t10dif-glue.c | 2 +- arch/powerpc/lib/crc32-glue.c | 2 +- arch/s390/lib/crc32-glue.c | 2 +- arch/sparc/lib/crc32_glue.c | 2 +- arch/x86/lib/crc-t10dif-glue.c | 2 +- arch/x86/lib/crc32-glue.c | 4 ++-- arch/x86/lib/crc64-glue.c | 2 +- 12 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/arm/lib/crc-t10dif-glue.c b/arch/arm/lib/crc-t10dif-glue.c index 6efad3d78284..382437094bdd 100644 --- a/arch/arm/lib/crc-t10dif-glue.c +++ b/arch/arm/lib/crc-t10dif-glue.c @@ -16,8 +16,8 @@ #include #include -static DEFINE_STATIC_KEY_FALSE(have_neon); -static DEFINE_STATIC_KEY_FALSE(have_pmull); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull); #define CRC_T10DIF_PMULL_CHUNK_SIZE 16U diff --git a/arch/arm/lib/crc32-glue.c b/arch/arm/lib/crc32-glue.c index 4340351dbde8..7ef7db9c0de7 100644 --- a/arch/arm/lib/crc32-glue.c +++ b/arch/arm/lib/crc32-glue.c @@ -18,8 +18,8 @@ #include #include -static DEFINE_STATIC_KEY_FALSE(have_crc32); -static DEFINE_STATIC_KEY_FALSE(have_pmull); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull); #define PMULL_MIN_LEN 64 /* min size of buffer for pmull functions */ diff --git a/arch/arm64/lib/crc-t10dif-glue.c b/arch/arm64/lib/crc-t10dif-glue.c index bacd18f23168..99d0b5668a28 100644 --- a/arch/arm64/lib/crc-t10dif-glue.c +++ b/arch/arm64/lib/crc-t10dif-glue.c @@ -17,8 +17,8 @@ #include #include -static DEFINE_STATIC_KEY_FALSE(have_asimd); -static DEFINE_STATIC_KEY_FALSE(have_pmull); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_asimd); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull); #define CRC_T10DIF_PMULL_CHUNK_SIZE 16U diff --git a/arch/loongarch/lib/crc32-loongarch.c b/arch/loongarch/lib/crc32-loongarch.c index c44ee4f32557..8e6d1f517e73 100644 --- a/arch/loongarch/lib/crc32-loongarch.c +++ b/arch/loongarch/lib/crc32-loongarch.c @@ -26,7 +26,7 @@ do { \ #define CRC32(crc, value, size) _CRC32(crc, value, size, crc) #define CRC32C(crc, value, size) _CRC32(crc, value, size, crcc) -static DEFINE_STATIC_KEY_FALSE(have_crc32); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) { diff --git a/arch/mips/lib/crc32-mips.c b/arch/mips/lib/crc32-mips.c index 676a4b3e290b..84df361e7181 100644 --- a/arch/mips/lib/crc32-mips.c +++ b/arch/mips/lib/crc32-mips.c @@ -62,7 +62,7 @@ do { \ #define CRC32C(crc, value, size) \ _CRC32(crc, value, size, crc32c) -static DEFINE_STATIC_KEY_FALSE(have_crc32); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) { diff --git 
a/arch/powerpc/lib/crc-t10dif-glue.c b/arch/powerpc/lib/crc-t10dif-glue.c index f411b0120cc5..ddd5c4088f50 100644 --- a/arch/powerpc/lib/crc-t10dif-glue.c +++ b/arch/powerpc/lib/crc-t10dif-glue.c @@ -21,7 +21,7 @@ #define VECTOR_BREAKPOINT 64 -static DEFINE_STATIC_KEY_FALSE(have_vec_crypto); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto); u32 __crct10dif_vpmsum(u32 crc, unsigned char const *p, size_t len); diff --git a/arch/powerpc/lib/crc32-glue.c b/arch/powerpc/lib/crc32-glue.c index dbd10f339183..42f2dd3c85dd 100644 --- a/arch/powerpc/lib/crc32-glue.c +++ b/arch/powerpc/lib/crc32-glue.c @@ -13,7 +13,7 @@ #define VECTOR_BREAKPOINT 512 -static DEFINE_STATIC_KEY_FALSE(have_vec_crypto); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto); u32 __crc32c_vpmsum(u32 crc, const u8 *p, size_t len); diff --git a/arch/s390/lib/crc32-glue.c b/arch/s390/lib/crc32-glue.c index 124214a27340..8f20a8e595c3 100644 --- a/arch/s390/lib/crc32-glue.c +++ b/arch/s390/lib/crc32-glue.c @@ -18,7 +18,7 @@ #define VX_ALIGNMENT 16L #define VX_ALIGN_MASK (VX_ALIGNMENT - 1) -static DEFINE_STATIC_KEY_FALSE(have_vxrs); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vxrs); /* * DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension diff --git a/arch/sparc/lib/crc32_glue.c b/arch/sparc/lib/crc32_glue.c index a70752c729cf..d34e7cc7e1a1 100644 --- a/arch/sparc/lib/crc32_glue.c +++ b/arch/sparc/lib/crc32_glue.c @@ -17,7 +17,7 @@ #include #include -static DEFINE_STATIC_KEY_FALSE(have_crc32c_opcode); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32c_opcode); u32 crc32_le_arch(u32 crc, const u8 *data, size_t len) { diff --git a/arch/x86/lib/crc-t10dif-glue.c b/arch/x86/lib/crc-t10dif-glue.c index f89c335cde3c..d073b3678edc 100644 --- a/arch/x86/lib/crc-t10dif-glue.c +++ b/arch/x86/lib/crc-t10dif-glue.c @@ -9,7 +9,7 @@ #include #include "crc-pclmul-template.h" -static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16); diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c index e3f93b17ac3f..e6a6285cfca8 100644 --- a/arch/x86/lib/crc32-glue.c +++ b/arch/x86/lib/crc32-glue.c @@ -11,8 +11,8 @@ #include #include "crc-pclmul-template.h" -static DEFINE_STATIC_KEY_FALSE(have_crc32); -static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32); diff --git a/arch/x86/lib/crc64-glue.c b/arch/x86/lib/crc64-glue.c index b0e1b719ecbf..1214ee726c16 100644 --- a/arch/x86/lib/crc64-glue.c +++ b/arch/x86/lib/crc64-glue.c @@ -9,7 +9,7 @@ #include #include "crc-pclmul-template.h" -static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); DECLARE_CRC_PCLMUL_FUNCS(crc64_msb, u64); DECLARE_CRC_PCLMUL_FUNCS(crc64_lsb, u64); -- cgit v1.2.3-59-g8ed1b From 93b988cf8e4c68b823e70a02e4c7c39eaa0053be Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 17 Apr 2025 14:30:56 +0200 Subject: s390/crc32: Remove have_vxrs static key Replace the have_vxrs static key with a cpu_has_vx() call. cpu_has_vx() resolves into a compile time constant (true) if the kernel is compiled for z13 or newer. Otherwise it generates an unconditional one instruction branch, which is patched based on CPU alternatives. 
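To make the tradeoff concrete, here is a minimal sketch of the before/after dispatch pattern. The static key API, cpu_have_feature()/S390_CPU_FEATURE_VXRS and cpu_has_vx() are the real interfaces referenced in this patch; crc32_vx_impl() and crc32_sw_impl() are illustrative stand-ins, not the actual s390 symbols:

#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/types.h>
/* s390 arch headers providing cpu_have_feature() and cpu_has_vx() omitted */

/* Hypothetical backends standing in for the vector and software CRC code. */
u32 crc32_vx_impl(u32 crc, const u8 *p, size_t len);
u32 crc32_sw_impl(u32 crc, const u8 *p, size_t len);

/* Before: a static key, flipped once from an initcall. */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vxrs);

static int __init crc32_s390_init(void)
{
	if (cpu_have_feature(S390_CPU_FEATURE_VXRS))
		static_branch_enable(&have_vxrs);
	return 0;
}
arch_initcall(crc32_s390_init);

static u32 crc32_dispatch_old(u32 crc, const u8 *p, size_t len)
{
	if (static_branch_likely(&have_vxrs))
		return crc32_vx_impl(crc, p, len);
	return crc32_sw_impl(crc, p, len);
}

/*
 * After: no key and no initcall needed; cpu_has_vx() folds to true when the
 * kernel is built for z13 or newer, otherwise it is a single branch patched
 * via CPU alternatives.
 */
static u32 crc32_dispatch_new(u32 crc, const u8 *p, size_t len)
{
	if (cpu_has_vx())
		return crc32_vx_impl(crc, p, len);
	return crc32_sw_impl(crc, p, len);
}
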
In any case the generated code is at least as good as before and avoids static key handling. Signed-off-by: Heiko Carstens Link: https://lore.kernel.org/r/20250417125318.12521F12-hca@linux.ibm.com/ Signed-off-by: Eric Biggers --- arch/s390/lib/crc32-glue.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/s390/lib/crc32-glue.c b/arch/s390/lib/crc32-glue.c index 8f20a8e595c3..649ed7e8b99c 100644 --- a/arch/s390/lib/crc32-glue.c +++ b/arch/s390/lib/crc32-glue.c @@ -18,8 +18,6 @@ #define VX_ALIGNMENT 16L #define VX_ALIGN_MASK (VX_ALIGNMENT - 1) -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vxrs); - /* * DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension * @@ -34,8 +32,7 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vxrs); unsigned long prealign, aligned, remaining; \ DECLARE_KERNEL_FPU_ONSTACK16(vxstate); \ \ - if (datalen < VX_MIN_LEN + VX_ALIGN_MASK || \ - !static_branch_likely(&have_vxrs)) \ + if (datalen < VX_MIN_LEN + VX_ALIGN_MASK || !cpu_has_vx()) \ return ___crc32_sw(crc, data, datalen); \ \ if ((unsigned long)data & VX_ALIGN_MASK) { \ @@ -66,8 +63,6 @@ DEFINE_CRC32_VX(crc32c_arch, crc32c_le_vgfm_16, crc32c_base) static int __init crc32_s390_init(void) { - if (cpu_have_feature(S390_CPU_FEATURE_VXRS)) - static_branch_enable(&have_vxrs); return 0; } arch_initcall(crc32_s390_init); @@ -79,10 +74,11 @@ module_exit(crc32_s390_exit); u32 crc32_optimizations(void) { - if (static_key_enabled(&have_vxrs)) + if (cpu_has_vx()) { return CRC32_LE_OPTIMIZATION | CRC32_BE_OPTIMIZATION | CRC32C_OPTIMIZATION; + } return 0; } EXPORT_SYMBOL(crc32_optimizations); -- cgit v1.2.3-59-g8ed1b From fea9ad4dde9bf6c65e72da0d4c1ae7969d0bb8bd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 17 Apr 2025 09:38:29 -0700 Subject: s390/crc32: Remove no-op module init and exit functions Now that the crc32-s390 module init function is a no-op, there is no need to define it. Remove it. The removal of the init function also makes the exit function unnecessary, so remove that too. Acked-by: Heiko Carstens Link: https://lore.kernel.org/r/20250417163829.4599-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/s390/lib/crc32-glue.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arch/s390/lib/crc32-glue.c b/arch/s390/lib/crc32-glue.c index 649ed7e8b99c..3c4b344417c1 100644 --- a/arch/s390/lib/crc32-glue.c +++ b/arch/s390/lib/crc32-glue.c @@ -61,17 +61,6 @@ DEFINE_CRC32_VX(crc32_le_arch, crc32_le_vgfm_16, crc32_le_base) DEFINE_CRC32_VX(crc32_be_arch, crc32_be_vgfm_16, crc32_be_base) DEFINE_CRC32_VX(crc32c_arch, crc32c_le_vgfm_16, crc32c_base) -static int __init crc32_s390_init(void) -{ - return 0; -} -arch_initcall(crc32_s390_init); - -static void __exit crc32_s390_exit(void) -{ -} -module_exit(crc32_s390_exit); - u32 crc32_optimizations(void) { if (cpu_has_vx()) { -- cgit v1.2.3-59-g8ed1b From 6cc25e4b7c819c183a21d6b9a4bcec84229131d1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 23 Apr 2025 17:20:32 -0700 Subject: arm/crc: drop "glue" from filenames The use of the term "glue" in filenames is a Crypto API-ism that rarely shows up elsewhere in lib/ or arch/*/lib/. I think adopting it there was a mistake. The library just uses standard functions, so the amount of code that could be considered "glue" is quite small. And while often the C functions just wrap the assembly functions, there are also cases like crc32c_arch() in arch/x86/lib/crc32-glue.c that blur the line by in-lining the actual implementation into the C function. 
That's not "glue code", but rather the actual code. Therefore, let's drop "glue" from the filenames and instead use e.g. crc32.c instead of crc32-glue.c. Reviewed-by: "Martin K. Petersen" Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250424002038.179114-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm/lib/Makefile | 4 +- arch/arm/lib/crc-t10dif-glue.c | 72 ------------------------ arch/arm/lib/crc-t10dif.c | 72 ++++++++++++++++++++++++ arch/arm/lib/crc32-glue.c | 123 ----------------------------------------- arch/arm/lib/crc32.c | 123 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 197 insertions(+), 197 deletions(-) delete mode 100644 arch/arm/lib/crc-t10dif-glue.c create mode 100644 arch/arm/lib/crc-t10dif.c delete mode 100644 arch/arm/lib/crc32-glue.c create mode 100644 arch/arm/lib/crc32.c diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index 007874320937..d05dd672bcd9 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile @@ -47,7 +47,7 @@ endif obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o obj-$(CONFIG_CRC32_ARCH) += crc32-arm.o -crc32-arm-y := crc32-glue.o crc32-core.o +crc32-arm-y := crc32.o crc32-core.o obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-arm.o -crc-t10dif-arm-y := crc-t10dif-glue.o crc-t10dif-core.o +crc-t10dif-arm-y := crc-t10dif.o crc-t10dif-core.o diff --git a/arch/arm/lib/crc-t10dif-glue.c b/arch/arm/lib/crc-t10dif-glue.c deleted file mode 100644 index 382437094bdd..000000000000 --- a/arch/arm/lib/crc-t10dif-glue.c +++ /dev/null @@ -1,72 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions - * - * Copyright (C) 2016 Linaro Ltd - */ - -#include -#include -#include -#include -#include - -#include - -#include -#include - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull); - -#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U - -asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len); -asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len, - u8 out[16]); - -u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) -{ - if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) { - if (static_branch_likely(&have_pmull)) { - if (crypto_simd_usable()) { - kernel_neon_begin(); - crc = crc_t10dif_pmull64(crc, data, length); - kernel_neon_end(); - return crc; - } - } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && - static_branch_likely(&have_neon) && - crypto_simd_usable()) { - u8 buf[16] __aligned(16); - - kernel_neon_begin(); - crc_t10dif_pmull8(crc, data, length, buf); - kernel_neon_end(); - - return crc_t10dif_generic(0, buf, sizeof(buf)); - } - } - return crc_t10dif_generic(crc, data, length); -} -EXPORT_SYMBOL(crc_t10dif_arch); - -static int __init crc_t10dif_arm_init(void) -{ - if (elf_hwcap & HWCAP_NEON) { - static_branch_enable(&have_neon); - if (elf_hwcap2 & HWCAP2_PMULL) - static_branch_enable(&have_pmull); - } - return 0; -} -arch_initcall(crc_t10dif_arm_init); - -static void __exit crc_t10dif_arm_exit(void) -{ -} -module_exit(crc_t10dif_arm_exit); - -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_DESCRIPTION("Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions"); -MODULE_LICENSE("GPL v2"); diff --git a/arch/arm/lib/crc-t10dif.c b/arch/arm/lib/crc-t10dif.c new file mode 100644 index 000000000000..382437094bdd --- /dev/null +++ b/arch/arm/lib/crc-t10dif.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Accelerated 
CRC-T10DIF using ARM NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 Linaro Ltd + */ + +#include +#include +#include +#include +#include + +#include + +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull); + +#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U + +asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len); +asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len, + u8 out[16]); + +u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) +{ + if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) { + if (static_branch_likely(&have_pmull)) { + if (crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc_t10dif_pmull64(crc, data, length); + kernel_neon_end(); + return crc; + } + } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && + static_branch_likely(&have_neon) && + crypto_simd_usable()) { + u8 buf[16] __aligned(16); + + kernel_neon_begin(); + crc_t10dif_pmull8(crc, data, length, buf); + kernel_neon_end(); + + return crc_t10dif_generic(0, buf, sizeof(buf)); + } + } + return crc_t10dif_generic(crc, data, length); +} +EXPORT_SYMBOL(crc_t10dif_arch); + +static int __init crc_t10dif_arm_init(void) +{ + if (elf_hwcap & HWCAP_NEON) { + static_branch_enable(&have_neon); + if (elf_hwcap2 & HWCAP2_PMULL) + static_branch_enable(&have_pmull); + } + return 0; +} +arch_initcall(crc_t10dif_arm_init); + +static void __exit crc_t10dif_arm_exit(void) +{ +} +module_exit(crc_t10dif_arm_exit); + +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_DESCRIPTION("Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/arm/lib/crc32-glue.c b/arch/arm/lib/crc32-glue.c deleted file mode 100644 index 7ef7db9c0de7..000000000000 --- a/arch/arm/lib/crc32-glue.c +++ /dev/null @@ -1,123 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions - * - * Copyright (C) 2016 Linaro Ltd - */ - -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull); - -#define PMULL_MIN_LEN 64 /* min size of buffer for pmull functions */ - -asmlinkage u32 crc32_pmull_le(const u8 buf[], u32 len, u32 init_crc); -asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], u32 len); - -asmlinkage u32 crc32c_pmull_le(const u8 buf[], u32 len, u32 init_crc); -asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], u32 len); - -static u32 crc32_le_scalar(u32 crc, const u8 *p, size_t len) -{ - if (static_branch_likely(&have_crc32)) - return crc32_armv8_le(crc, p, len); - return crc32_le_base(crc, p, len); -} - -u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) -{ - if (len >= PMULL_MIN_LEN + 15 && - static_branch_likely(&have_pmull) && crypto_simd_usable()) { - size_t n = -(uintptr_t)p & 15; - - /* align p to 16-byte boundary */ - if (n) { - crc = crc32_le_scalar(crc, p, n); - p += n; - len -= n; - } - n = round_down(len, 16); - kernel_neon_begin(); - crc = crc32_pmull_le(p, n, crc); - kernel_neon_end(); - p += n; - len -= n; - } - return crc32_le_scalar(crc, p, len); -} -EXPORT_SYMBOL(crc32_le_arch); - -static u32 crc32c_scalar(u32 crc, const u8 *p, size_t len) -{ - if (static_branch_likely(&have_crc32)) - return crc32c_armv8_le(crc, p, len); - return crc32c_base(crc, p, len); -} - -u32 crc32c_arch(u32 crc, const u8 *p, size_t 
len) -{ - if (len >= PMULL_MIN_LEN + 15 && - static_branch_likely(&have_pmull) && crypto_simd_usable()) { - size_t n = -(uintptr_t)p & 15; - - /* align p to 16-byte boundary */ - if (n) { - crc = crc32c_scalar(crc, p, n); - p += n; - len -= n; - } - n = round_down(len, 16); - kernel_neon_begin(); - crc = crc32c_pmull_le(p, n, crc); - kernel_neon_end(); - p += n; - len -= n; - } - return crc32c_scalar(crc, p, len); -} -EXPORT_SYMBOL(crc32c_arch); - -u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) -{ - return crc32_be_base(crc, p, len); -} -EXPORT_SYMBOL(crc32_be_arch); - -static int __init crc32_arm_init(void) -{ - if (elf_hwcap2 & HWCAP2_CRC32) - static_branch_enable(&have_crc32); - if (elf_hwcap2 & HWCAP2_PMULL) - static_branch_enable(&have_pmull); - return 0; -} -arch_initcall(crc32_arm_init); - -static void __exit crc32_arm_exit(void) -{ -} -module_exit(crc32_arm_exit); - -u32 crc32_optimizations(void) -{ - if (elf_hwcap2 & (HWCAP2_CRC32 | HWCAP2_PMULL)) - return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION; - return 0; -} -EXPORT_SYMBOL(crc32_optimizations); - -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_DESCRIPTION("Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions"); -MODULE_LICENSE("GPL v2"); diff --git a/arch/arm/lib/crc32.c b/arch/arm/lib/crc32.c new file mode 100644 index 000000000000..7ef7db9c0de7 --- /dev/null +++ b/arch/arm/lib/crc32.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 Linaro Ltd + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull); + +#define PMULL_MIN_LEN 64 /* min size of buffer for pmull functions */ + +asmlinkage u32 crc32_pmull_le(const u8 buf[], u32 len, u32 init_crc); +asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], u32 len); + +asmlinkage u32 crc32c_pmull_le(const u8 buf[], u32 len, u32 init_crc); +asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], u32 len); + +static u32 crc32_le_scalar(u32 crc, const u8 *p, size_t len) +{ + if (static_branch_likely(&have_crc32)) + return crc32_armv8_le(crc, p, len); + return crc32_le_base(crc, p, len); +} + +u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) +{ + if (len >= PMULL_MIN_LEN + 15 && + static_branch_likely(&have_pmull) && crypto_simd_usable()) { + size_t n = -(uintptr_t)p & 15; + + /* align p to 16-byte boundary */ + if (n) { + crc = crc32_le_scalar(crc, p, n); + p += n; + len -= n; + } + n = round_down(len, 16); + kernel_neon_begin(); + crc = crc32_pmull_le(p, n, crc); + kernel_neon_end(); + p += n; + len -= n; + } + return crc32_le_scalar(crc, p, len); +} +EXPORT_SYMBOL(crc32_le_arch); + +static u32 crc32c_scalar(u32 crc, const u8 *p, size_t len) +{ + if (static_branch_likely(&have_crc32)) + return crc32c_armv8_le(crc, p, len); + return crc32c_base(crc, p, len); +} + +u32 crc32c_arch(u32 crc, const u8 *p, size_t len) +{ + if (len >= PMULL_MIN_LEN + 15 && + static_branch_likely(&have_pmull) && crypto_simd_usable()) { + size_t n = -(uintptr_t)p & 15; + + /* align p to 16-byte boundary */ + if (n) { + crc = crc32c_scalar(crc, p, n); + p += n; + len -= n; + } + n = round_down(len, 16); + kernel_neon_begin(); + crc = crc32c_pmull_le(p, n, crc); + kernel_neon_end(); + p += n; + len -= n; + } + return crc32c_scalar(crc, p, len); +} +EXPORT_SYMBOL(crc32c_arch); + +u32 
crc32_be_arch(u32 crc, const u8 *p, size_t len) +{ + return crc32_be_base(crc, p, len); +} +EXPORT_SYMBOL(crc32_be_arch); + +static int __init crc32_arm_init(void) +{ + if (elf_hwcap2 & HWCAP2_CRC32) + static_branch_enable(&have_crc32); + if (elf_hwcap2 & HWCAP2_PMULL) + static_branch_enable(&have_pmull); + return 0; +} +arch_initcall(crc32_arm_init); + +static void __exit crc32_arm_exit(void) +{ +} +module_exit(crc32_arm_exit); + +u32 crc32_optimizations(void) +{ + if (elf_hwcap2 & (HWCAP2_CRC32 | HWCAP2_PMULL)) + return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION; + return 0; +} +EXPORT_SYMBOL(crc32_optimizations); + +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_DESCRIPTION("Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3-59-g8ed1b From db6108d3ac91b7f09df366e0627d81803c703513 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 23 Apr 2025 17:20:33 -0700 Subject: arm64/crc: drop "glue" from filenames The use of the term "glue" in filenames is a Crypto API-ism that rarely shows up elsewhere in lib/ or arch/*/lib/. I think adopting it there was a mistake. The library just uses standard functions, so the amount of code that could be considered "glue" is quite small. And while often the C functions just wrap the assembly functions, there are also cases like crc32c_arch() in arch/x86/lib/crc32-glue.c that blur the line by in-lining the actual implementation into the C function. That's not "glue code", but rather the actual code. Therefore, let's drop "glue" from the filenames and instead use e.g. crc32.c instead of crc32-glue.c. Reviewed-by: "Martin K. Petersen" Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250424002038.179114-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/lib/Makefile | 4 +- arch/arm64/lib/crc-t10dif-glue.c | 73 -------- arch/arm64/lib/crc-t10dif.c | 73 ++++++++ arch/arm64/lib/crc32-core.S | 362 +++++++++++++++++++++++++++++++++++++++ arch/arm64/lib/crc32-glue.c | 99 ----------- arch/arm64/lib/crc32.S | 362 --------------------------------------- arch/arm64/lib/crc32.c | 99 +++++++++++ 7 files changed, 536 insertions(+), 536 deletions(-) delete mode 100644 arch/arm64/lib/crc-t10dif-glue.c create mode 100644 arch/arm64/lib/crc-t10dif.c create mode 100644 arch/arm64/lib/crc32-core.S delete mode 100644 arch/arm64/lib/crc32-glue.c delete mode 100644 arch/arm64/lib/crc32.S create mode 100644 arch/arm64/lib/crc32.c diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 4d49dff721a8..d97e290619bc 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -14,10 +14,10 @@ endif lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o obj-$(CONFIG_CRC32_ARCH) += crc32-arm64.o -crc32-arm64-y := crc32.o crc32-glue.o +crc32-arm64-y := crc32.o crc32-core.o obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-arm64.o -crc-t10dif-arm64-y := crc-t10dif-glue.o crc-t10dif-core.o +crc-t10dif-arm64-y := crc-t10dif.o crc-t10dif-core.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/arm64/lib/crc-t10dif-glue.c b/arch/arm64/lib/crc-t10dif-glue.c deleted file mode 100644 index 99d0b5668a28..000000000000 --- a/arch/arm64/lib/crc-t10dif-glue.c +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions - * - * Copyright (C) 2016 - 2017 Linaro Ltd - */ - -#include -#include -#include -#include -#include -#include - -#include - -#include -#include - -static 
__ro_after_init DEFINE_STATIC_KEY_FALSE(have_asimd); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull); - -#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U - -asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len, - u8 out[16]); -asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); - -u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) -{ - if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) { - if (static_branch_likely(&have_pmull)) { - if (crypto_simd_usable()) { - kernel_neon_begin(); - crc = crc_t10dif_pmull_p64(crc, data, length); - kernel_neon_end(); - return crc; - } - } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && - static_branch_likely(&have_asimd) && - crypto_simd_usable()) { - u8 buf[16]; - - kernel_neon_begin(); - crc_t10dif_pmull_p8(crc, data, length, buf); - kernel_neon_end(); - - return crc_t10dif_generic(0, buf, sizeof(buf)); - } - } - return crc_t10dif_generic(crc, data, length); -} -EXPORT_SYMBOL(crc_t10dif_arch); - -static int __init crc_t10dif_arm64_init(void) -{ - if (cpu_have_named_feature(ASIMD)) { - static_branch_enable(&have_asimd); - if (cpu_have_named_feature(PMULL)) - static_branch_enable(&have_pmull); - } - return 0; -} -arch_initcall(crc_t10dif_arm64_init); - -static void __exit crc_t10dif_arm64_exit(void) -{ -} -module_exit(crc_t10dif_arm64_exit); - -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_DESCRIPTION("CRC-T10DIF using arm64 NEON and Crypto Extensions"); -MODULE_LICENSE("GPL v2"); diff --git a/arch/arm64/lib/crc-t10dif.c b/arch/arm64/lib/crc-t10dif.c new file mode 100644 index 000000000000..99d0b5668a28 --- /dev/null +++ b/arch/arm64/lib/crc-t10dif.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 - 2017 Linaro Ltd + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_asimd); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull); + +#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U + +asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len, + u8 out[16]); +asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); + +u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) +{ + if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) { + if (static_branch_likely(&have_pmull)) { + if (crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc_t10dif_pmull_p64(crc, data, length); + kernel_neon_end(); + return crc; + } + } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && + static_branch_likely(&have_asimd) && + crypto_simd_usable()) { + u8 buf[16]; + + kernel_neon_begin(); + crc_t10dif_pmull_p8(crc, data, length, buf); + kernel_neon_end(); + + return crc_t10dif_generic(0, buf, sizeof(buf)); + } + } + return crc_t10dif_generic(crc, data, length); +} +EXPORT_SYMBOL(crc_t10dif_arch); + +static int __init crc_t10dif_arm64_init(void) +{ + if (cpu_have_named_feature(ASIMD)) { + static_branch_enable(&have_asimd); + if (cpu_have_named_feature(PMULL)) + static_branch_enable(&have_pmull); + } + return 0; +} +arch_initcall(crc_t10dif_arm64_init); + +static void __exit crc_t10dif_arm64_exit(void) +{ +} +module_exit(crc_t10dif_arm64_exit); + +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_DESCRIPTION("CRC-T10DIF using arm64 NEON and Crypto Extensions"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/arm64/lib/crc32-core.S b/arch/arm64/lib/crc32-core.S new file mode 100644 index 
000000000000..68825317460f --- /dev/null +++ b/arch/arm64/lib/crc32-core.S @@ -0,0 +1,362 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Accelerated CRC32(C) using AArch64 CRC and PMULL instructions + * + * Copyright (C) 2016 - 2018 Linaro Ltd. + * Copyright (C) 2024 Google LLC + * + * Author: Ard Biesheuvel + */ + +#include +#include + + .cpu generic+crc+crypto + + .macro bitle, reg + .endm + + .macro bitbe, reg + rbit \reg, \reg + .endm + + .macro bytele, reg + .endm + + .macro bytebe, reg + rbit \reg, \reg + lsr \reg, \reg, #24 + .endm + + .macro hwordle, reg +CPU_BE( rev16 \reg, \reg ) + .endm + + .macro hwordbe, reg +CPU_LE( rev \reg, \reg ) + rbit \reg, \reg +CPU_BE( lsr \reg, \reg, #16 ) + .endm + + .macro le, regs:vararg + .irp r, \regs +CPU_BE( rev \r, \r ) + .endr + .endm + + .macro be, regs:vararg + .irp r, \regs +CPU_LE( rev \r, \r ) + .endr + .irp r, \regs + rbit \r, \r + .endr + .endm + + .macro __crc32, c, order=le + bit\order w0 + cmp x2, #16 + b.lt 8f // less than 16 bytes + + and x7, x2, #0x1f + and x2, x2, #~0x1f + cbz x7, 32f // multiple of 32 bytes + + and x8, x7, #0xf + ldp x3, x4, [x1] + add x8, x8, x1 + add x1, x1, x7 + ldp x5, x6, [x8] + \order x3, x4, x5, x6 + + tst x7, #8 + crc32\c\()x w8, w0, x3 + csel x3, x3, x4, eq + csel w0, w0, w8, eq + tst x7, #4 + lsr x4, x3, #32 + crc32\c\()w w8, w0, w3 + csel x3, x3, x4, eq + csel w0, w0, w8, eq + tst x7, #2 + lsr w4, w3, #16 + crc32\c\()h w8, w0, w3 + csel w3, w3, w4, eq + csel w0, w0, w8, eq + tst x7, #1 + crc32\c\()b w8, w0, w3 + csel w0, w0, w8, eq + tst x7, #16 + crc32\c\()x w8, w0, x5 + crc32\c\()x w8, w8, x6 + csel w0, w0, w8, eq + cbz x2, 0f + +32: ldp x3, x4, [x1], #32 + sub x2, x2, #32 + ldp x5, x6, [x1, #-16] + \order x3, x4, x5, x6 + crc32\c\()x w0, w0, x3 + crc32\c\()x w0, w0, x4 + crc32\c\()x w0, w0, x5 + crc32\c\()x w0, w0, x6 + cbnz x2, 32b +0: bit\order w0 + ret + +8: tbz x2, #3, 4f + ldr x3, [x1], #8 + \order x3 + crc32\c\()x w0, w0, x3 +4: tbz x2, #2, 2f + ldr w3, [x1], #4 + \order w3 + crc32\c\()w w0, w0, w3 +2: tbz x2, #1, 1f + ldrh w3, [x1], #2 + hword\order w3 + crc32\c\()h w0, w0, w3 +1: tbz x2, #0, 0f + ldrb w3, [x1] + byte\order w3 + crc32\c\()b w0, w0, w3 +0: bit\order w0 + ret + .endm + + .align 5 +SYM_FUNC_START(crc32_le_arm64) + __crc32 +SYM_FUNC_END(crc32_le_arm64) + + .align 5 +SYM_FUNC_START(crc32c_le_arm64) + __crc32 c +SYM_FUNC_END(crc32c_le_arm64) + + .align 5 +SYM_FUNC_START(crc32_be_arm64) + __crc32 order=be +SYM_FUNC_END(crc32_be_arm64) + + in .req x1 + len .req x2 + + /* + * w0: input CRC at entry, output CRC at exit + * x1: pointer to input buffer + * x2: length of input in bytes + */ + .macro crc4way, insn, table, order=le + bit\order w0 + lsr len, len, #6 // len := # of 64-byte blocks + + /* Process up to 64 blocks of 64 bytes at a time */ +.La\@: mov x3, #64 + cmp len, #64 + csel x3, x3, len, hi // x3 := min(len, 64) + sub len, len, x3 + + /* Divide the input into 4 contiguous blocks */ + add x4, x3, x3, lsl #1 // x4 := 3 * x3 + add x7, in, x3, lsl #4 // x7 := in + 16 * x3 + add x8, in, x3, lsl #5 // x8 := in + 32 * x3 + add x9, in, x4, lsl #4 // x9 := in + 16 * x4 + + /* Load the folding coefficients from the lookup table */ + adr_l x5, \table - 12 // entry 0 omitted + add x5, x5, x4, lsl #2 // x5 += 12 * x3 + ldp s0, s1, [x5] + ldr s2, [x5, #8] + + /* Zero init partial CRCs for this iteration */ + mov w4, wzr + mov w5, wzr + mov w6, wzr + mov x17, xzr + +.Lb\@: sub x3, x3, #1 + \insn w6, w6, x17 + ldp x10, x11, [in], #16 + ldp x12, x13, [x7], #16 + ldp x14, x15, 
[x8], #16 + ldp x16, x17, [x9], #16 + + \order x10, x11, x12, x13, x14, x15, x16, x17 + + /* Apply the CRC transform to 4 16-byte blocks in parallel */ + \insn w0, w0, x10 + \insn w4, w4, x12 + \insn w5, w5, x14 + \insn w6, w6, x16 + \insn w0, w0, x11 + \insn w4, w4, x13 + \insn w5, w5, x15 + cbnz x3, .Lb\@ + + /* Combine the 4 partial results into w0 */ + mov v3.d[0], x0 + mov v4.d[0], x4 + mov v5.d[0], x5 + pmull v0.1q, v0.1d, v3.1d + pmull v1.1q, v1.1d, v4.1d + pmull v2.1q, v2.1d, v5.1d + eor v0.8b, v0.8b, v1.8b + eor v0.8b, v0.8b, v2.8b + mov x5, v0.d[0] + eor x5, x5, x17 + \insn w0, w6, x5 + + mov in, x9 + cbnz len, .La\@ + + bit\order w0 + ret + .endm + + .align 5 +SYM_FUNC_START(crc32c_le_arm64_4way) + crc4way crc32cx, .L0 +SYM_FUNC_END(crc32c_le_arm64_4way) + + .align 5 +SYM_FUNC_START(crc32_le_arm64_4way) + crc4way crc32x, .L1 +SYM_FUNC_END(crc32_le_arm64_4way) + + .align 5 +SYM_FUNC_START(crc32_be_arm64_4way) + crc4way crc32x, .L1, be +SYM_FUNC_END(crc32_be_arm64_4way) + + .section .rodata, "a", %progbits + .align 6 +.L0: .long 0xddc0152b, 0xba4fc28e, 0x493c7d27 + .long 0x0715ce53, 0x9e4addf8, 0xba4fc28e + .long 0xc96cfdc0, 0x0715ce53, 0xddc0152b + .long 0xab7aff2a, 0x0d3b6092, 0x9e4addf8 + .long 0x299847d5, 0x878a92a7, 0x39d3b296 + .long 0xb6dd949b, 0xab7aff2a, 0x0715ce53 + .long 0xa60ce07b, 0x83348832, 0x47db8317 + .long 0xd270f1a2, 0xb9e02b86, 0x0d3b6092 + .long 0x65863b64, 0xb6dd949b, 0xc96cfdc0 + .long 0xb3e32c28, 0xbac2fd7b, 0x878a92a7 + .long 0xf285651c, 0xce7f39f4, 0xdaece73e + .long 0x271d9844, 0xd270f1a2, 0xab7aff2a + .long 0x6cb08e5c, 0x2b3cac5d, 0x2162d385 + .long 0xcec3662e, 0x1b03397f, 0x83348832 + .long 0x8227bb8a, 0xb3e32c28, 0x299847d5 + .long 0xd7a4825c, 0xdd7e3b0c, 0xb9e02b86 + .long 0xf6076544, 0x10746f3c, 0x18b33a4e + .long 0x98d8d9cb, 0x271d9844, 0xb6dd949b + .long 0x57a3d037, 0x93a5f730, 0x78d9ccb7 + .long 0x3771e98f, 0x6b749fb2, 0xbac2fd7b + .long 0xe0ac139e, 0xcec3662e, 0xa60ce07b + .long 0x6f345e45, 0xe6fc4e6a, 0xce7f39f4 + .long 0xa2b73df1, 0xb0cd4768, 0x61d82e56 + .long 0x86d8e4d2, 0xd7a4825c, 0xd270f1a2 + .long 0xa90fd27a, 0x0167d312, 0xc619809d + .long 0xca6ef3ac, 0x26f6a60a, 0x2b3cac5d + .long 0x4597456a, 0x98d8d9cb, 0x65863b64 + .long 0xc9c8b782, 0x68bce87a, 0x1b03397f + .long 0x62ec6c6d, 0x6956fc3b, 0xebb883bd + .long 0x2342001e, 0x3771e98f, 0xb3e32c28 + .long 0xe8b6368b, 0x2178513a, 0x064f7f26 + .long 0x9ef68d35, 0x170076fa, 0xdd7e3b0c + .long 0x0b0bf8ca, 0x6f345e45, 0xf285651c + .long 0x02ee03b2, 0xff0dba97, 0x10746f3c + .long 0x135c83fd, 0xf872e54c, 0xc7a68855 + .long 0x00bcf5f6, 0x86d8e4d2, 0x271d9844 + .long 0x58ca5f00, 0x5bb8f1bc, 0x8e766a0c + .long 0xded288f8, 0xb3af077a, 0x93a5f730 + .long 0x37170390, 0xca6ef3ac, 0x6cb08e5c + .long 0xf48642e9, 0xdd66cbbb, 0x6b749fb2 + .long 0xb25b29f2, 0xe9e28eb4, 0x1393e203 + .long 0x45cddf4e, 0xc9c8b782, 0xcec3662e + .long 0xdfd94fb2, 0x93e106a4, 0x96c515bb + .long 0x021ac5ef, 0xd813b325, 0xe6fc4e6a + .long 0x8e1450f7, 0x2342001e, 0x8227bb8a + .long 0xe0cdcf86, 0x6d9a4957, 0xb0cd4768 + .long 0x613eee91, 0xd2c3ed1a, 0x39c7ff35 + .long 0xbedc6ba1, 0x9ef68d35, 0xd7a4825c + .long 0x0cd1526a, 0xf2271e60, 0x0ab3844b + .long 0xd6c3a807, 0x2664fd8b, 0x0167d312 + .long 0x1d31175f, 0x02ee03b2, 0xf6076544 + .long 0x4be7fd90, 0x363bd6b3, 0x26f6a60a + .long 0x6eeed1c9, 0x5fabe670, 0xa741c1bf + .long 0xb3a6da94, 0x00bcf5f6, 0x98d8d9cb + .long 0x2e7d11a7, 0x17f27698, 0x49c3cc9c + .long 0x889774e1, 0xaa7c7ad5, 0x68bce87a + .long 0x8a074012, 0xded288f8, 0x57a3d037 + .long 0xbd0bb25f, 0x6d390dec, 0x6956fc3b + .long 
0x3be3c09b, 0x6353c1cc, 0x42d98888 + .long 0x465a4eee, 0xf48642e9, 0x3771e98f + .long 0x2e5f3c8c, 0xdd35bc8d, 0xb42ae3d9 + .long 0xa52f58ec, 0x9a5ede41, 0x2178513a + .long 0x47972100, 0x45cddf4e, 0xe0ac139e + .long 0x359674f7, 0xa51b6135, 0x170076fa + +.L1: .long 0xaf449247, 0x81256527, 0xccaa009e + .long 0x57c54819, 0x1d9513d7, 0x81256527 + .long 0x3f41287a, 0x57c54819, 0xaf449247 + .long 0xf5e48c85, 0x910eeec1, 0x1d9513d7 + .long 0x1f0c2cdd, 0x9026d5b1, 0xae0b5394 + .long 0x71d54a59, 0xf5e48c85, 0x57c54819 + .long 0x1c63267b, 0xfe807bbd, 0x0cbec0ed + .long 0xd31343ea, 0xe95c1271, 0x910eeec1 + .long 0xf9d9c7ee, 0x71d54a59, 0x3f41287a + .long 0x9ee62949, 0xcec97417, 0x9026d5b1 + .long 0xa55d1514, 0xf183c71b, 0xd1df2327 + .long 0x21aa2b26, 0xd31343ea, 0xf5e48c85 + .long 0x9d842b80, 0xeea395c4, 0x3c656ced + .long 0xd8110ff1, 0xcd669a40, 0xfe807bbd + .long 0x3f9e9356, 0x9ee62949, 0x1f0c2cdd + .long 0x1d6708a0, 0x0c30f51d, 0xe95c1271 + .long 0xef82aa68, 0xdb3935ea, 0xb918a347 + .long 0xd14bcc9b, 0x21aa2b26, 0x71d54a59 + .long 0x99cce860, 0x356d209f, 0xff6f2fc2 + .long 0xd8af8e46, 0xc352f6de, 0xcec97417 + .long 0xf1996890, 0xd8110ff1, 0x1c63267b + .long 0x631bc508, 0xe95c7216, 0xf183c71b + .long 0x8511c306, 0x8e031a19, 0x9b9bdbd0 + .long 0xdb3839f3, 0x1d6708a0, 0xd31343ea + .long 0x7a92fffb, 0xf7003835, 0x4470ac44 + .long 0x6ce68f2a, 0x00eba0c8, 0xeea395c4 + .long 0x4caaa263, 0xd14bcc9b, 0xf9d9c7ee + .long 0xb46f7cff, 0x9a1b53c8, 0xcd669a40 + .long 0x60290934, 0x81b6f443, 0x6d40f445 + .long 0x8e976a7d, 0xd8af8e46, 0x9ee62949 + .long 0xdcf5088a, 0x9dbdc100, 0x145575d5 + .long 0x1753ab84, 0xbbf2f6d6, 0x0c30f51d + .long 0x255b139e, 0x631bc508, 0xa55d1514 + .long 0xd784eaa8, 0xce26786c, 0xdb3935ea + .long 0x6d2c864a, 0x8068c345, 0x2586d334 + .long 0x02072e24, 0xdb3839f3, 0x21aa2b26 + .long 0x06689b0a, 0x5efd72f5, 0xe0575528 + .long 0x1e52f5ea, 0x4117915b, 0x356d209f + .long 0x1d3d1db6, 0x6ce68f2a, 0x9d842b80 + .long 0x3796455c, 0xb8e0e4a8, 0xc352f6de + .long 0xdf3a4eb3, 0xc55a2330, 0xb84ffa9c + .long 0x28ae0976, 0xb46f7cff, 0xd8110ff1 + .long 0x9764bc8d, 0xd7e7a22c, 0x712510f0 + .long 0x13a13e18, 0x3e9a43cd, 0xe95c7216 + .long 0xb8ee242e, 0x8e976a7d, 0x3f9e9356 + .long 0x0c540e7b, 0x753c81ff, 0x8e031a19 + .long 0x9924c781, 0xb9220208, 0x3edcde65 + .long 0x3954de39, 0x1753ab84, 0x1d6708a0 + .long 0xf32238b5, 0xbec81497, 0x9e70b943 + .long 0xbbd2cd2c, 0x0925d861, 0xf7003835 + .long 0xcc401304, 0xd784eaa8, 0xef82aa68 + .long 0x4987e684, 0x6044fbb0, 0x00eba0c8 + .long 0x3aa11427, 0x18fe3b4a, 0x87441142 + .long 0x297aad60, 0x02072e24, 0xd14bcc9b + .long 0xf60c5e51, 0x6ef6f487, 0x5b7fdd0a + .long 0x632d78c5, 0x3fc33de4, 0x9a1b53c8 + .long 0x25b8822a, 0x1e52f5ea, 0x99cce860 + .long 0xd4fc84bc, 0x1af62fb8, 0x81b6f443 + .long 0x5690aa32, 0xa91fdefb, 0x688a110e + .long 0x1357a093, 0x3796455c, 0xd8af8e46 + .long 0x798fdd33, 0xaaa18a37, 0x357b9517 + .long 0xc2815395, 0x54d42691, 0x9dbdc100 + .long 0x21cfc0f7, 0x28ae0976, 0xf1996890 + .long 0xa0decef3, 0x7b4aa8b7, 0xbbf2f6d6 diff --git a/arch/arm64/lib/crc32-glue.c b/arch/arm64/lib/crc32-glue.c deleted file mode 100644 index ed3acd71178f..000000000000 --- a/arch/arm64/lib/crc32-glue.c +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only - -#include -#include -#include - -#include -#include -#include -#include - -#include - -// The minimum input length to consider the 4-way interleaved code path -static const size_t min_len = 1024; - -asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len); -asmlinkage u32 
crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len); -asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len); - -asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); -asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); -asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len); - -u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) -{ - if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) - return crc32_le_base(crc, p, len); - - if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { - kernel_neon_begin(); - crc = crc32_le_arm64_4way(crc, p, len); - kernel_neon_end(); - - p += round_down(len, 64); - len %= 64; - - if (!len) - return crc; - } - - return crc32_le_arm64(crc, p, len); -} -EXPORT_SYMBOL(crc32_le_arch); - -u32 crc32c_arch(u32 crc, const u8 *p, size_t len) -{ - if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) - return crc32c_base(crc, p, len); - - if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { - kernel_neon_begin(); - crc = crc32c_le_arm64_4way(crc, p, len); - kernel_neon_end(); - - p += round_down(len, 64); - len %= 64; - - if (!len) - return crc; - } - - return crc32c_le_arm64(crc, p, len); -} -EXPORT_SYMBOL(crc32c_arch); - -u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) -{ - if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) - return crc32_be_base(crc, p, len); - - if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { - kernel_neon_begin(); - crc = crc32_be_arm64_4way(crc, p, len); - kernel_neon_end(); - - p += round_down(len, 64); - len %= 64; - - if (!len) - return crc; - } - - return crc32_be_arm64(crc, p, len); -} -EXPORT_SYMBOL(crc32_be_arch); - -u32 crc32_optimizations(void) -{ - if (alternative_has_cap_likely(ARM64_HAS_CRC32)) - return CRC32_LE_OPTIMIZATION | - CRC32_BE_OPTIMIZATION | - CRC32C_OPTIMIZATION; - return 0; -} -EXPORT_SYMBOL(crc32_optimizations); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("arm64-optimized CRC32 functions"); diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S deleted file mode 100644 index 68825317460f..000000000000 --- a/arch/arm64/lib/crc32.S +++ /dev/null @@ -1,362 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Accelerated CRC32(C) using AArch64 CRC and PMULL instructions - * - * Copyright (C) 2016 - 2018 Linaro Ltd. 
- * Copyright (C) 2024 Google LLC - * - * Author: Ard Biesheuvel - */ - -#include -#include - - .cpu generic+crc+crypto - - .macro bitle, reg - .endm - - .macro bitbe, reg - rbit \reg, \reg - .endm - - .macro bytele, reg - .endm - - .macro bytebe, reg - rbit \reg, \reg - lsr \reg, \reg, #24 - .endm - - .macro hwordle, reg -CPU_BE( rev16 \reg, \reg ) - .endm - - .macro hwordbe, reg -CPU_LE( rev \reg, \reg ) - rbit \reg, \reg -CPU_BE( lsr \reg, \reg, #16 ) - .endm - - .macro le, regs:vararg - .irp r, \regs -CPU_BE( rev \r, \r ) - .endr - .endm - - .macro be, regs:vararg - .irp r, \regs -CPU_LE( rev \r, \r ) - .endr - .irp r, \regs - rbit \r, \r - .endr - .endm - - .macro __crc32, c, order=le - bit\order w0 - cmp x2, #16 - b.lt 8f // less than 16 bytes - - and x7, x2, #0x1f - and x2, x2, #~0x1f - cbz x7, 32f // multiple of 32 bytes - - and x8, x7, #0xf - ldp x3, x4, [x1] - add x8, x8, x1 - add x1, x1, x7 - ldp x5, x6, [x8] - \order x3, x4, x5, x6 - - tst x7, #8 - crc32\c\()x w8, w0, x3 - csel x3, x3, x4, eq - csel w0, w0, w8, eq - tst x7, #4 - lsr x4, x3, #32 - crc32\c\()w w8, w0, w3 - csel x3, x3, x4, eq - csel w0, w0, w8, eq - tst x7, #2 - lsr w4, w3, #16 - crc32\c\()h w8, w0, w3 - csel w3, w3, w4, eq - csel w0, w0, w8, eq - tst x7, #1 - crc32\c\()b w8, w0, w3 - csel w0, w0, w8, eq - tst x7, #16 - crc32\c\()x w8, w0, x5 - crc32\c\()x w8, w8, x6 - csel w0, w0, w8, eq - cbz x2, 0f - -32: ldp x3, x4, [x1], #32 - sub x2, x2, #32 - ldp x5, x6, [x1, #-16] - \order x3, x4, x5, x6 - crc32\c\()x w0, w0, x3 - crc32\c\()x w0, w0, x4 - crc32\c\()x w0, w0, x5 - crc32\c\()x w0, w0, x6 - cbnz x2, 32b -0: bit\order w0 - ret - -8: tbz x2, #3, 4f - ldr x3, [x1], #8 - \order x3 - crc32\c\()x w0, w0, x3 -4: tbz x2, #2, 2f - ldr w3, [x1], #4 - \order w3 - crc32\c\()w w0, w0, w3 -2: tbz x2, #1, 1f - ldrh w3, [x1], #2 - hword\order w3 - crc32\c\()h w0, w0, w3 -1: tbz x2, #0, 0f - ldrb w3, [x1] - byte\order w3 - crc32\c\()b w0, w0, w3 -0: bit\order w0 - ret - .endm - - .align 5 -SYM_FUNC_START(crc32_le_arm64) - __crc32 -SYM_FUNC_END(crc32_le_arm64) - - .align 5 -SYM_FUNC_START(crc32c_le_arm64) - __crc32 c -SYM_FUNC_END(crc32c_le_arm64) - - .align 5 -SYM_FUNC_START(crc32_be_arm64) - __crc32 order=be -SYM_FUNC_END(crc32_be_arm64) - - in .req x1 - len .req x2 - - /* - * w0: input CRC at entry, output CRC at exit - * x1: pointer to input buffer - * x2: length of input in bytes - */ - .macro crc4way, insn, table, order=le - bit\order w0 - lsr len, len, #6 // len := # of 64-byte blocks - - /* Process up to 64 blocks of 64 bytes at a time */ -.La\@: mov x3, #64 - cmp len, #64 - csel x3, x3, len, hi // x3 := min(len, 64) - sub len, len, x3 - - /* Divide the input into 4 contiguous blocks */ - add x4, x3, x3, lsl #1 // x4 := 3 * x3 - add x7, in, x3, lsl #4 // x7 := in + 16 * x3 - add x8, in, x3, lsl #5 // x8 := in + 32 * x3 - add x9, in, x4, lsl #4 // x9 := in + 16 * x4 - - /* Load the folding coefficients from the lookup table */ - adr_l x5, \table - 12 // entry 0 omitted - add x5, x5, x4, lsl #2 // x5 += 12 * x3 - ldp s0, s1, [x5] - ldr s2, [x5, #8] - - /* Zero init partial CRCs for this iteration */ - mov w4, wzr - mov w5, wzr - mov w6, wzr - mov x17, xzr - -.Lb\@: sub x3, x3, #1 - \insn w6, w6, x17 - ldp x10, x11, [in], #16 - ldp x12, x13, [x7], #16 - ldp x14, x15, [x8], #16 - ldp x16, x17, [x9], #16 - - \order x10, x11, x12, x13, x14, x15, x16, x17 - - /* Apply the CRC transform to 4 16-byte blocks in parallel */ - \insn w0, w0, x10 - \insn w4, w4, x12 - \insn w5, w5, x14 - \insn w6, w6, x16 - \insn w0, w0, x11 - 
\insn w4, w4, x13 - \insn w5, w5, x15 - cbnz x3, .Lb\@ - - /* Combine the 4 partial results into w0 */ - mov v3.d[0], x0 - mov v4.d[0], x4 - mov v5.d[0], x5 - pmull v0.1q, v0.1d, v3.1d - pmull v1.1q, v1.1d, v4.1d - pmull v2.1q, v2.1d, v5.1d - eor v0.8b, v0.8b, v1.8b - eor v0.8b, v0.8b, v2.8b - mov x5, v0.d[0] - eor x5, x5, x17 - \insn w0, w6, x5 - - mov in, x9 - cbnz len, .La\@ - - bit\order w0 - ret - .endm - - .align 5 -SYM_FUNC_START(crc32c_le_arm64_4way) - crc4way crc32cx, .L0 -SYM_FUNC_END(crc32c_le_arm64_4way) - - .align 5 -SYM_FUNC_START(crc32_le_arm64_4way) - crc4way crc32x, .L1 -SYM_FUNC_END(crc32_le_arm64_4way) - - .align 5 -SYM_FUNC_START(crc32_be_arm64_4way) - crc4way crc32x, .L1, be -SYM_FUNC_END(crc32_be_arm64_4way) - - .section .rodata, "a", %progbits - .align 6 -.L0: .long 0xddc0152b, 0xba4fc28e, 0x493c7d27 - .long 0x0715ce53, 0x9e4addf8, 0xba4fc28e - .long 0xc96cfdc0, 0x0715ce53, 0xddc0152b - .long 0xab7aff2a, 0x0d3b6092, 0x9e4addf8 - .long 0x299847d5, 0x878a92a7, 0x39d3b296 - .long 0xb6dd949b, 0xab7aff2a, 0x0715ce53 - .long 0xa60ce07b, 0x83348832, 0x47db8317 - .long 0xd270f1a2, 0xb9e02b86, 0x0d3b6092 - .long 0x65863b64, 0xb6dd949b, 0xc96cfdc0 - .long 0xb3e32c28, 0xbac2fd7b, 0x878a92a7 - .long 0xf285651c, 0xce7f39f4, 0xdaece73e - .long 0x271d9844, 0xd270f1a2, 0xab7aff2a - .long 0x6cb08e5c, 0x2b3cac5d, 0x2162d385 - .long 0xcec3662e, 0x1b03397f, 0x83348832 - .long 0x8227bb8a, 0xb3e32c28, 0x299847d5 - .long 0xd7a4825c, 0xdd7e3b0c, 0xb9e02b86 - .long 0xf6076544, 0x10746f3c, 0x18b33a4e - .long 0x98d8d9cb, 0x271d9844, 0xb6dd949b - .long 0x57a3d037, 0x93a5f730, 0x78d9ccb7 - .long 0x3771e98f, 0x6b749fb2, 0xbac2fd7b - .long 0xe0ac139e, 0xcec3662e, 0xa60ce07b - .long 0x6f345e45, 0xe6fc4e6a, 0xce7f39f4 - .long 0xa2b73df1, 0xb0cd4768, 0x61d82e56 - .long 0x86d8e4d2, 0xd7a4825c, 0xd270f1a2 - .long 0xa90fd27a, 0x0167d312, 0xc619809d - .long 0xca6ef3ac, 0x26f6a60a, 0x2b3cac5d - .long 0x4597456a, 0x98d8d9cb, 0x65863b64 - .long 0xc9c8b782, 0x68bce87a, 0x1b03397f - .long 0x62ec6c6d, 0x6956fc3b, 0xebb883bd - .long 0x2342001e, 0x3771e98f, 0xb3e32c28 - .long 0xe8b6368b, 0x2178513a, 0x064f7f26 - .long 0x9ef68d35, 0x170076fa, 0xdd7e3b0c - .long 0x0b0bf8ca, 0x6f345e45, 0xf285651c - .long 0x02ee03b2, 0xff0dba97, 0x10746f3c - .long 0x135c83fd, 0xf872e54c, 0xc7a68855 - .long 0x00bcf5f6, 0x86d8e4d2, 0x271d9844 - .long 0x58ca5f00, 0x5bb8f1bc, 0x8e766a0c - .long 0xded288f8, 0xb3af077a, 0x93a5f730 - .long 0x37170390, 0xca6ef3ac, 0x6cb08e5c - .long 0xf48642e9, 0xdd66cbbb, 0x6b749fb2 - .long 0xb25b29f2, 0xe9e28eb4, 0x1393e203 - .long 0x45cddf4e, 0xc9c8b782, 0xcec3662e - .long 0xdfd94fb2, 0x93e106a4, 0x96c515bb - .long 0x021ac5ef, 0xd813b325, 0xe6fc4e6a - .long 0x8e1450f7, 0x2342001e, 0x8227bb8a - .long 0xe0cdcf86, 0x6d9a4957, 0xb0cd4768 - .long 0x613eee91, 0xd2c3ed1a, 0x39c7ff35 - .long 0xbedc6ba1, 0x9ef68d35, 0xd7a4825c - .long 0x0cd1526a, 0xf2271e60, 0x0ab3844b - .long 0xd6c3a807, 0x2664fd8b, 0x0167d312 - .long 0x1d31175f, 0x02ee03b2, 0xf6076544 - .long 0x4be7fd90, 0x363bd6b3, 0x26f6a60a - .long 0x6eeed1c9, 0x5fabe670, 0xa741c1bf - .long 0xb3a6da94, 0x00bcf5f6, 0x98d8d9cb - .long 0x2e7d11a7, 0x17f27698, 0x49c3cc9c - .long 0x889774e1, 0xaa7c7ad5, 0x68bce87a - .long 0x8a074012, 0xded288f8, 0x57a3d037 - .long 0xbd0bb25f, 0x6d390dec, 0x6956fc3b - .long 0x3be3c09b, 0x6353c1cc, 0x42d98888 - .long 0x465a4eee, 0xf48642e9, 0x3771e98f - .long 0x2e5f3c8c, 0xdd35bc8d, 0xb42ae3d9 - .long 0xa52f58ec, 0x9a5ede41, 0x2178513a - .long 0x47972100, 0x45cddf4e, 0xe0ac139e - .long 0x359674f7, 0xa51b6135, 0x170076fa - -.L1: 
.long 0xaf449247, 0x81256527, 0xccaa009e - .long 0x57c54819, 0x1d9513d7, 0x81256527 - .long 0x3f41287a, 0x57c54819, 0xaf449247 - .long 0xf5e48c85, 0x910eeec1, 0x1d9513d7 - .long 0x1f0c2cdd, 0x9026d5b1, 0xae0b5394 - .long 0x71d54a59, 0xf5e48c85, 0x57c54819 - .long 0x1c63267b, 0xfe807bbd, 0x0cbec0ed - .long 0xd31343ea, 0xe95c1271, 0x910eeec1 - .long 0xf9d9c7ee, 0x71d54a59, 0x3f41287a - .long 0x9ee62949, 0xcec97417, 0x9026d5b1 - .long 0xa55d1514, 0xf183c71b, 0xd1df2327 - .long 0x21aa2b26, 0xd31343ea, 0xf5e48c85 - .long 0x9d842b80, 0xeea395c4, 0x3c656ced - .long 0xd8110ff1, 0xcd669a40, 0xfe807bbd - .long 0x3f9e9356, 0x9ee62949, 0x1f0c2cdd - .long 0x1d6708a0, 0x0c30f51d, 0xe95c1271 - .long 0xef82aa68, 0xdb3935ea, 0xb918a347 - .long 0xd14bcc9b, 0x21aa2b26, 0x71d54a59 - .long 0x99cce860, 0x356d209f, 0xff6f2fc2 - .long 0xd8af8e46, 0xc352f6de, 0xcec97417 - .long 0xf1996890, 0xd8110ff1, 0x1c63267b - .long 0x631bc508, 0xe95c7216, 0xf183c71b - .long 0x8511c306, 0x8e031a19, 0x9b9bdbd0 - .long 0xdb3839f3, 0x1d6708a0, 0xd31343ea - .long 0x7a92fffb, 0xf7003835, 0x4470ac44 - .long 0x6ce68f2a, 0x00eba0c8, 0xeea395c4 - .long 0x4caaa263, 0xd14bcc9b, 0xf9d9c7ee - .long 0xb46f7cff, 0x9a1b53c8, 0xcd669a40 - .long 0x60290934, 0x81b6f443, 0x6d40f445 - .long 0x8e976a7d, 0xd8af8e46, 0x9ee62949 - .long 0xdcf5088a, 0x9dbdc100, 0x145575d5 - .long 0x1753ab84, 0xbbf2f6d6, 0x0c30f51d - .long 0x255b139e, 0x631bc508, 0xa55d1514 - .long 0xd784eaa8, 0xce26786c, 0xdb3935ea - .long 0x6d2c864a, 0x8068c345, 0x2586d334 - .long 0x02072e24, 0xdb3839f3, 0x21aa2b26 - .long 0x06689b0a, 0x5efd72f5, 0xe0575528 - .long 0x1e52f5ea, 0x4117915b, 0x356d209f - .long 0x1d3d1db6, 0x6ce68f2a, 0x9d842b80 - .long 0x3796455c, 0xb8e0e4a8, 0xc352f6de - .long 0xdf3a4eb3, 0xc55a2330, 0xb84ffa9c - .long 0x28ae0976, 0xb46f7cff, 0xd8110ff1 - .long 0x9764bc8d, 0xd7e7a22c, 0x712510f0 - .long 0x13a13e18, 0x3e9a43cd, 0xe95c7216 - .long 0xb8ee242e, 0x8e976a7d, 0x3f9e9356 - .long 0x0c540e7b, 0x753c81ff, 0x8e031a19 - .long 0x9924c781, 0xb9220208, 0x3edcde65 - .long 0x3954de39, 0x1753ab84, 0x1d6708a0 - .long 0xf32238b5, 0xbec81497, 0x9e70b943 - .long 0xbbd2cd2c, 0x0925d861, 0xf7003835 - .long 0xcc401304, 0xd784eaa8, 0xef82aa68 - .long 0x4987e684, 0x6044fbb0, 0x00eba0c8 - .long 0x3aa11427, 0x18fe3b4a, 0x87441142 - .long 0x297aad60, 0x02072e24, 0xd14bcc9b - .long 0xf60c5e51, 0x6ef6f487, 0x5b7fdd0a - .long 0x632d78c5, 0x3fc33de4, 0x9a1b53c8 - .long 0x25b8822a, 0x1e52f5ea, 0x99cce860 - .long 0xd4fc84bc, 0x1af62fb8, 0x81b6f443 - .long 0x5690aa32, 0xa91fdefb, 0x688a110e - .long 0x1357a093, 0x3796455c, 0xd8af8e46 - .long 0x798fdd33, 0xaaa18a37, 0x357b9517 - .long 0xc2815395, 0x54d42691, 0x9dbdc100 - .long 0x21cfc0f7, 0x28ae0976, 0xf1996890 - .long 0xa0decef3, 0x7b4aa8b7, 0xbbf2f6d6 diff --git a/arch/arm64/lib/crc32.c b/arch/arm64/lib/crc32.c new file mode 100644 index 000000000000..ed3acd71178f --- /dev/null +++ b/arch/arm64/lib/crc32.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include + +#include +#include +#include +#include + +#include + +// The minimum input length to consider the 4-way interleaved code path +static const size_t min_len = 1024; + +asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len); + +asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, 
size_t len); +asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len); + +u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return crc32_le_base(crc, p, len); + + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32_le_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + + return crc32_le_arm64(crc, p, len); +} +EXPORT_SYMBOL(crc32_le_arch); + +u32 crc32c_arch(u32 crc, const u8 *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return crc32c_base(crc, p, len); + + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32c_le_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + + return crc32c_le_arm64(crc, p, len); +} +EXPORT_SYMBOL(crc32c_arch); + +u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return crc32_be_base(crc, p, len); + + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32_be_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + + return crc32_be_arm64(crc, p, len); +} +EXPORT_SYMBOL(crc32_be_arch); + +u32 crc32_optimizations(void) +{ + if (alternative_has_cap_likely(ARM64_HAS_CRC32)) + return CRC32_LE_OPTIMIZATION | + CRC32_BE_OPTIMIZATION | + CRC32C_OPTIMIZATION; + return 0; +} +EXPORT_SYMBOL(crc32_optimizations); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("arm64-optimized CRC32 functions"); -- cgit v1.2.3-59-g8ed1b From 436490e86814ae409c1586fc2a8045455f735418 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 23 Apr 2025 17:20:34 -0700 Subject: powerpc/crc: drop "glue" from filenames The use of the term "glue" in filenames is a Crypto API-ism that rarely shows up elsewhere in lib/ or arch/*/lib/. I think adopting it there was a mistake. The library just uses standard functions, so the amount of code that could be considered "glue" is quite small. And while often the C functions just wrap the assembly functions, there are also cases like crc32c_arch() in arch/x86/lib/crc32-glue.c that blur the line by in-lining the actual implementation into the C function. That's not "glue code", but rather the actual code. Therefore, let's drop "glue" from the filenames and instead use e.g. crc32.c instead of crc32-glue.c. Reviewed-by: "Martin K. 
Petersen" Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250424002038.179114-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/powerpc/lib/Makefile | 4 +- arch/powerpc/lib/crc-t10dif-glue.c | 83 ---------------------------------- arch/powerpc/lib/crc-t10dif.c | 83 ++++++++++++++++++++++++++++++++++ arch/powerpc/lib/crc32-glue.c | 92 -------------------------------------- arch/powerpc/lib/crc32.c | 92 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 177 insertions(+), 177 deletions(-) delete mode 100644 arch/powerpc/lib/crc-t10dif-glue.c create mode 100644 arch/powerpc/lib/crc-t10dif.c delete mode 100644 arch/powerpc/lib/crc32-glue.c create mode 100644 arch/powerpc/lib/crc32.c diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index dd8a4b52a0cc..27f8a0143860 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -79,9 +79,9 @@ CFLAGS_xor_vmx.o += -mhard-float -maltivec $(call cc-option,-mabi=altivec) CFLAGS_xor_vmx.o += -isystem $(shell $(CC) -print-file-name=include) obj-$(CONFIG_CRC32_ARCH) += crc32-powerpc.o -crc32-powerpc-y := crc32-glue.o crc32c-vpmsum_asm.o +crc32-powerpc-y := crc32.o crc32c-vpmsum_asm.o obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-powerpc.o -crc-t10dif-powerpc-y := crc-t10dif-glue.o crct10dif-vpmsum_asm.o +crc-t10dif-powerpc-y := crc-t10dif.o crct10dif-vpmsum_asm.o obj-$(CONFIG_PPC64) += $(obj64-y) diff --git a/arch/powerpc/lib/crc-t10dif-glue.c b/arch/powerpc/lib/crc-t10dif-glue.c deleted file mode 100644 index ddd5c4088f50..000000000000 --- a/arch/powerpc/lib/crc-t10dif-glue.c +++ /dev/null @@ -1,83 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Calculate a CRC T10-DIF with vpmsum acceleration - * - * Copyright 2017, Daniel Axtens, IBM Corporation. 
- * [based on crc32c-vpmsum_glue.c] - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define VMX_ALIGN 16 -#define VMX_ALIGN_MASK (VMX_ALIGN-1) - -#define VECTOR_BREAKPOINT 64 - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto); - -u32 __crct10dif_vpmsum(u32 crc, unsigned char const *p, size_t len); - -u16 crc_t10dif_arch(u16 crci, const u8 *p, size_t len) -{ - unsigned int prealign; - unsigned int tail; - u32 crc = crci; - - if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || - !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable()) - return crc_t10dif_generic(crc, p, len); - - if ((unsigned long)p & VMX_ALIGN_MASK) { - prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); - crc = crc_t10dif_generic(crc, p, prealign); - len -= prealign; - p += prealign; - } - - if (len & ~VMX_ALIGN_MASK) { - crc <<= 16; - preempt_disable(); - pagefault_disable(); - enable_kernel_altivec(); - crc = __crct10dif_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); - disable_kernel_altivec(); - pagefault_enable(); - preempt_enable(); - crc >>= 16; - } - - tail = len & VMX_ALIGN_MASK; - if (tail) { - p += len & ~VMX_ALIGN_MASK; - crc = crc_t10dif_generic(crc, p, tail); - } - - return crc & 0xffff; -} -EXPORT_SYMBOL(crc_t10dif_arch); - -static int __init crc_t10dif_powerpc_init(void) -{ - if (cpu_has_feature(CPU_FTR_ARCH_207S) && - (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO)) - static_branch_enable(&have_vec_crypto); - return 0; -} -arch_initcall(crc_t10dif_powerpc_init); - -static void __exit crc_t10dif_powerpc_exit(void) -{ -} -module_exit(crc_t10dif_powerpc_exit); - -MODULE_AUTHOR("Daniel Axtens "); -MODULE_DESCRIPTION("CRCT10DIF using vector polynomial multiply-sum instructions"); -MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/lib/crc-t10dif.c b/arch/powerpc/lib/crc-t10dif.c new file mode 100644 index 000000000000..ddd5c4088f50 --- /dev/null +++ b/arch/powerpc/lib/crc-t10dif.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Calculate a CRC T10-DIF with vpmsum acceleration + * + * Copyright 2017, Daniel Axtens, IBM Corporation. 
+ * [based on crc32c-vpmsum_glue.c] + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define VMX_ALIGN 16 +#define VMX_ALIGN_MASK (VMX_ALIGN-1) + +#define VECTOR_BREAKPOINT 64 + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto); + +u32 __crct10dif_vpmsum(u32 crc, unsigned char const *p, size_t len); + +u16 crc_t10dif_arch(u16 crci, const u8 *p, size_t len) +{ + unsigned int prealign; + unsigned int tail; + u32 crc = crci; + + if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || + !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable()) + return crc_t10dif_generic(crc, p, len); + + if ((unsigned long)p & VMX_ALIGN_MASK) { + prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); + crc = crc_t10dif_generic(crc, p, prealign); + len -= prealign; + p += prealign; + } + + if (len & ~VMX_ALIGN_MASK) { + crc <<= 16; + preempt_disable(); + pagefault_disable(); + enable_kernel_altivec(); + crc = __crct10dif_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); + disable_kernel_altivec(); + pagefault_enable(); + preempt_enable(); + crc >>= 16; + } + + tail = len & VMX_ALIGN_MASK; + if (tail) { + p += len & ~VMX_ALIGN_MASK; + crc = crc_t10dif_generic(crc, p, tail); + } + + return crc & 0xffff; +} +EXPORT_SYMBOL(crc_t10dif_arch); + +static int __init crc_t10dif_powerpc_init(void) +{ + if (cpu_has_feature(CPU_FTR_ARCH_207S) && + (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO)) + static_branch_enable(&have_vec_crypto); + return 0; +} +arch_initcall(crc_t10dif_powerpc_init); + +static void __exit crc_t10dif_powerpc_exit(void) +{ +} +module_exit(crc_t10dif_powerpc_exit); + +MODULE_AUTHOR("Daniel Axtens "); +MODULE_DESCRIPTION("CRCT10DIF using vector polynomial multiply-sum instructions"); +MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/lib/crc32-glue.c b/arch/powerpc/lib/crc32-glue.c deleted file mode 100644 index 42f2dd3c85dd..000000000000 --- a/arch/powerpc/lib/crc32-glue.c +++ /dev/null @@ -1,92 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include -#include -#include -#include -#include - -#define VMX_ALIGN 16 -#define VMX_ALIGN_MASK (VMX_ALIGN-1) - -#define VECTOR_BREAKPOINT 512 - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto); - -u32 __crc32c_vpmsum(u32 crc, const u8 *p, size_t len); - -u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) -{ - return crc32_le_base(crc, p, len); -} -EXPORT_SYMBOL(crc32_le_arch); - -u32 crc32c_arch(u32 crc, const u8 *p, size_t len) -{ - unsigned int prealign; - unsigned int tail; - - if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || - !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable()) - return crc32c_base(crc, p, len); - - if ((unsigned long)p & VMX_ALIGN_MASK) { - prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); - crc = crc32c_base(crc, p, prealign); - len -= prealign; - p += prealign; - } - - if (len & ~VMX_ALIGN_MASK) { - preempt_disable(); - pagefault_disable(); - enable_kernel_altivec(); - crc = __crc32c_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); - disable_kernel_altivec(); - pagefault_enable(); - preempt_enable(); - } - - tail = len & VMX_ALIGN_MASK; - if (tail) { - p += len & ~VMX_ALIGN_MASK; - crc = crc32c_base(crc, p, tail); - } - - return crc; -} -EXPORT_SYMBOL(crc32c_arch); - -u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) -{ - return crc32_be_base(crc, p, len); -} -EXPORT_SYMBOL(crc32_be_arch); - -static int __init crc32_powerpc_init(void) -{ - if (cpu_has_feature(CPU_FTR_ARCH_207S) && - 
(cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO)) - static_branch_enable(&have_vec_crypto); - return 0; -} -arch_initcall(crc32_powerpc_init); - -static void __exit crc32_powerpc_exit(void) -{ -} -module_exit(crc32_powerpc_exit); - -u32 crc32_optimizations(void) -{ - if (static_key_enabled(&have_vec_crypto)) - return CRC32C_OPTIMIZATION; - return 0; -} -EXPORT_SYMBOL(crc32_optimizations); - -MODULE_AUTHOR("Anton Blanchard "); -MODULE_DESCRIPTION("CRC32C using vector polynomial multiply-sum instructions"); -MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/lib/crc32.c b/arch/powerpc/lib/crc32.c new file mode 100644 index 000000000000..42f2dd3c85dd --- /dev/null +++ b/arch/powerpc/lib/crc32.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include + +#define VMX_ALIGN 16 +#define VMX_ALIGN_MASK (VMX_ALIGN-1) + +#define VECTOR_BREAKPOINT 512 + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto); + +u32 __crc32c_vpmsum(u32 crc, const u8 *p, size_t len); + +u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) +{ + return crc32_le_base(crc, p, len); +} +EXPORT_SYMBOL(crc32_le_arch); + +u32 crc32c_arch(u32 crc, const u8 *p, size_t len) +{ + unsigned int prealign; + unsigned int tail; + + if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || + !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable()) + return crc32c_base(crc, p, len); + + if ((unsigned long)p & VMX_ALIGN_MASK) { + prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); + crc = crc32c_base(crc, p, prealign); + len -= prealign; + p += prealign; + } + + if (len & ~VMX_ALIGN_MASK) { + preempt_disable(); + pagefault_disable(); + enable_kernel_altivec(); + crc = __crc32c_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); + disable_kernel_altivec(); + pagefault_enable(); + preempt_enable(); + } + + tail = len & VMX_ALIGN_MASK; + if (tail) { + p += len & ~VMX_ALIGN_MASK; + crc = crc32c_base(crc, p, tail); + } + + return crc; +} +EXPORT_SYMBOL(crc32c_arch); + +u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) +{ + return crc32_be_base(crc, p, len); +} +EXPORT_SYMBOL(crc32_be_arch); + +static int __init crc32_powerpc_init(void) +{ + if (cpu_has_feature(CPU_FTR_ARCH_207S) && + (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO)) + static_branch_enable(&have_vec_crypto); + return 0; +} +arch_initcall(crc32_powerpc_init); + +static void __exit crc32_powerpc_exit(void) +{ +} +module_exit(crc32_powerpc_exit); + +u32 crc32_optimizations(void) +{ + if (static_key_enabled(&have_vec_crypto)) + return CRC32C_OPTIMIZATION; + return 0; +} +EXPORT_SYMBOL(crc32_optimizations); + +MODULE_AUTHOR("Anton Blanchard "); +MODULE_DESCRIPTION("CRC32C using vector polynomial multiply-sum instructions"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3-59-g8ed1b From b4fa54d654b3531261d1fde3cb73ceae7a98806f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 23 Apr 2025 17:20:35 -0700 Subject: powerpc/crc: rename crc32-vpmsum_core.S to crc-vpmsum-template.S Rename crc32-vpmsum_core.S to crc-vpmsum-template.S to properly convey that (a) it actually generates code for both 32-bit and 16-bit CRCs, not just 32-bit CRCs; and (b) it has "template" semantics, like x86's crc-pclmul-template.S, in the sense that it's included by other files. Reviewed-by: "Martin K. 
Petersen" Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250424002038.179114-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/powerpc/lib/crc-vpmsum-template.S | 746 ++++++++++++++++++++++++++++++++ arch/powerpc/lib/crc32-vpmsum_core.S | 746 -------------------------------- arch/powerpc/lib/crc32c-vpmsum_asm.S | 2 +- arch/powerpc/lib/crct10dif-vpmsum_asm.S | 2 +- 4 files changed, 748 insertions(+), 748 deletions(-) create mode 100644 arch/powerpc/lib/crc-vpmsum-template.S delete mode 100644 arch/powerpc/lib/crc32-vpmsum_core.S diff --git a/arch/powerpc/lib/crc-vpmsum-template.S b/arch/powerpc/lib/crc-vpmsum-template.S new file mode 100644 index 000000000000..b0f87f595b26 --- /dev/null +++ b/arch/powerpc/lib/crc-vpmsum-template.S @@ -0,0 +1,746 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Core of the accelerated CRC algorithm. + * In your file, define the constants and CRC_FUNCTION_NAME + * Then include this file. + * + * Calculate the checksum of data that is 16 byte aligned and a multiple of + * 16 bytes. + * + * The first step is to reduce it to 1024 bits. We do this in 8 parallel + * chunks in order to mask the latency of the vpmsum instructions. If we + * have more than 32 kB of data to checksum we repeat this step multiple + * times, passing in the previous 1024 bits. + * + * The next step is to reduce the 1024 bits to 64 bits. This step adds + * 32 bits of 0s to the end - this matches what a CRC does. We just + * calculate constants that land the data in this 32 bits. + * + * We then use fixed point Barrett reduction to compute a mod n over GF(2) + * for n = CRC using POWER8 instructions. We use x = 32. + * + * https://en.wikipedia.org/wiki/Barrett_reduction + * + * Copyright (C) 2015 Anton Blanchard , IBM +*/ + +#include +#include + +#define MAX_SIZE 32768 + + .text + +#if defined(__BIG_ENDIAN__) && defined(REFLECT) +#define BYTESWAP_DATA +#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) +#define BYTESWAP_DATA +#else +#undef BYTESWAP_DATA +#endif + +#define off16 r25 +#define off32 r26 +#define off48 r27 +#define off64 r28 +#define off80 r29 +#define off96 r30 +#define off112 r31 + +#define const1 v24 +#define const2 v25 + +#define byteswap v26 +#define mask_32bit v27 +#define mask_64bit v28 +#define zeroes v29 + +#ifdef BYTESWAP_DATA +#define VPERM(A, B, C, D) vperm A, B, C, D +#else +#define VPERM(A, B, C, D) +#endif + +/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */ +FUNC_START(CRC_FUNCTION_NAME) + std r31,-8(r1) + std r30,-16(r1) + std r29,-24(r1) + std r28,-32(r1) + std r27,-40(r1) + std r26,-48(r1) + std r25,-56(r1) + + li off16,16 + li off32,32 + li off48,48 + li off64,64 + li off80,80 + li off96,96 + li off112,112 + li r0,0 + + /* Enough room for saving 10 non volatile VMX registers */ + subi r6,r1,56+10*16 + subi r7,r1,56+2*16 + + stvx v20,0,r6 + stvx v21,off16,r6 + stvx v22,off32,r6 + stvx v23,off48,r6 + stvx v24,off64,r6 + stvx v25,off80,r6 + stvx v26,off96,r6 + stvx v27,off112,r6 + stvx v28,0,r7 + stvx v29,off16,r7 + + mr r10,r3 + + vxor zeroes,zeroes,zeroes + vspltisw v0,-1 + + vsldoi mask_32bit,zeroes,v0,4 + vsldoi mask_64bit,zeroes,v0,8 + + /* Get the initial value into v8 */ + vxor v8,v8,v8 + MTVRD(v8, R3) +#ifdef REFLECT + vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */ +#else + vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */ +#endif + +#ifdef BYTESWAP_DATA + LOAD_REG_ADDR(r3, .byteswap_constant) + lvx byteswap,0,r3 + addi r3,r3,16 +#endif + + cmpdi r5,256 + blt .Lshort + + rldicr 
r6,r5,0,56 + + /* Checksum in blocks of MAX_SIZE */ +1: lis r7,MAX_SIZE@h + ori r7,r7,MAX_SIZE@l + mr r9,r7 + cmpd r6,r7 + bgt 2f + mr r7,r6 +2: subf r6,r7,r6 + + /* our main loop does 128 bytes at a time */ + srdi r7,r7,7 + + /* + * Work out the offset into the constants table to start at. Each + * constant is 16 bytes, and it is used against 128 bytes of input + * data - 128 / 16 = 8 + */ + sldi r8,r7,4 + srdi r9,r9,3 + subf r8,r8,r9 + + /* We reduce our final 128 bytes in a separate step */ + addi r7,r7,-1 + mtctr r7 + + LOAD_REG_ADDR(r3, .constants) + + /* Find the start of our constants */ + add r3,r3,r8 + + /* zero v0-v7 which will contain our checksums */ + vxor v0,v0,v0 + vxor v1,v1,v1 + vxor v2,v2,v2 + vxor v3,v3,v3 + vxor v4,v4,v4 + vxor v5,v5,v5 + vxor v6,v6,v6 + vxor v7,v7,v7 + + lvx const1,0,r3 + + /* + * If we are looping back to consume more data we use the values + * already in v16-v23. + */ + cmpdi r0,1 + beq 2f + + /* First warm up pass */ + lvx v16,0,r4 + lvx v17,off16,r4 + VPERM(v16,v16,v16,byteswap) + VPERM(v17,v17,v17,byteswap) + lvx v18,off32,r4 + lvx v19,off48,r4 + VPERM(v18,v18,v18,byteswap) + VPERM(v19,v19,v19,byteswap) + lvx v20,off64,r4 + lvx v21,off80,r4 + VPERM(v20,v20,v20,byteswap) + VPERM(v21,v21,v21,byteswap) + lvx v22,off96,r4 + lvx v23,off112,r4 + VPERM(v22,v22,v22,byteswap) + VPERM(v23,v23,v23,byteswap) + addi r4,r4,8*16 + + /* xor in initial value */ + vxor v16,v16,v8 + +2: bdz .Lfirst_warm_up_done + + addi r3,r3,16 + lvx const2,0,r3 + + /* Second warm up pass */ + VPMSUMD(v8,v16,const1) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 + + VPMSUMD(v9,v17,const1) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + VPMSUMD(v10,v18,const1) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + VPMSUMD(v11,v19,const1) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + ori r2,r2,0 + + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori r2,r2,0 + + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdz .Lfirst_cool_down + + /* + * main loop. We modulo schedule it such that it takes three iterations + * to complete - first iteration load, second iteration vpmsum, third + * iteration xor. 
+ */ + .balign 16 +4: lvx const1,0,r3 + addi r3,r3,16 + ori r2,r2,0 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const2) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const2) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const2) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const2) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + lvx const2,0,r3 + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdnz 4b + +.Lfirst_cool_down: + /* First cool down pass */ + lvx const1,0,r3 + addi r3,r3,16 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const1) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const1) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const1) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const1) + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + ori r2,r2,0 + +.Lsecond_cool_down: + /* Second cool down pass */ + vxor v0,v0,v8 + vxor v1,v1,v9 + vxor v2,v2,v10 + vxor v3,v3,v11 + vxor v4,v4,v12 + vxor v5,v5,v13 + vxor v6,v6,v14 + vxor v7,v7,v15 + +#ifdef REFLECT + /* + * vpmsumd produces a 96 bit result in the least significant bits + * of the register. Since we are bit reflected we have to shift it + * left 32 bits so it occupies the least significant bits in the + * bit reflected domain. + */ + vsldoi v0,v0,zeroes,4 + vsldoi v1,v1,zeroes,4 + vsldoi v2,v2,zeroes,4 + vsldoi v3,v3,zeroes,4 + vsldoi v4,v4,zeroes,4 + vsldoi v5,v5,zeroes,4 + vsldoi v6,v6,zeroes,4 + vsldoi v7,v7,zeroes,4 +#endif + + /* xor with last 1024 bits */ + lvx v8,0,r4 + lvx v9,off16,r4 + VPERM(v8,v8,v8,byteswap) + VPERM(v9,v9,v9,byteswap) + lvx v10,off32,r4 + lvx v11,off48,r4 + VPERM(v10,v10,v10,byteswap) + VPERM(v11,v11,v11,byteswap) + lvx v12,off64,r4 + lvx v13,off80,r4 + VPERM(v12,v12,v12,byteswap) + VPERM(v13,v13,v13,byteswap) + lvx v14,off96,r4 + lvx v15,off112,r4 + VPERM(v14,v14,v14,byteswap) + VPERM(v15,v15,v15,byteswap) + + addi r4,r4,8*16 + + vxor v16,v0,v8 + vxor v17,v1,v9 + vxor v18,v2,v10 + vxor v19,v3,v11 + vxor v20,v4,v12 + vxor v21,v5,v13 + vxor v22,v6,v14 + vxor v23,v7,v15 + + li r0,1 + cmpdi r6,0 + addi r6,r6,128 + bne 1b + + /* Work out how many bytes we have left */ + andi. 
r5,r5,127 + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,128 + add r3,r3,r6 + + /* How many 16 byte chunks are in the tail */ + srdi r7,r5,4 + mtctr r7 + + /* + * Reduce the previously calculated 1024 bits to 64 bits, shifting + * 32 bits to include the trailing 32 bits of zeros + */ + lvx v0,0,r3 + lvx v1,off16,r3 + lvx v2,off32,r3 + lvx v3,off48,r3 + lvx v4,off64,r3 + lvx v5,off80,r3 + lvx v6,off96,r3 + lvx v7,off112,r3 + addi r3,r3,8*16 + + VPMSUMW(v0,v16,v0) + VPMSUMW(v1,v17,v1) + VPMSUMW(v2,v18,v2) + VPMSUMW(v3,v19,v3) + VPMSUMW(v4,v20,v4) + VPMSUMW(v5,v21,v5) + VPMSUMW(v6,v22,v6) + VPMSUMW(v7,v23,v7) + + /* Now reduce the tail (0 - 112 bytes) */ + cmpdi r7,0 + beq 1f + + lvx v16,0,r4 + lvx v17,0,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off16,r4 + lvx v17,off16,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off32,r4 + lvx v17,off32,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off48,r4 + lvx v17,off48,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off64,r4 + lvx v17,off64,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off80,r4 + lvx v17,off80,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off96,r4 + lvx v17,off96,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + + /* Now xor all the parallel chunks together */ +1: vxor v0,v0,v1 + vxor v2,v2,v3 + vxor v4,v4,v5 + vxor v6,v6,v7 + + vxor v0,v0,v2 + vxor v4,v4,v6 + + vxor v0,v0,v4 + +.Lbarrett_reduction: + /* Barrett constants */ + LOAD_REG_ADDR(r3, .barrett_constants) + + lvx const1,0,r3 + lvx const2,off16,r3 + + vsldoi v1,v0,v0,8 + vxor v0,v0,v1 /* xor two 64 bit results together */ + +#ifdef REFLECT + /* shift left one bit */ + vspltisb v1,1 + vsl v0,v0,v1 +#endif + + vand v0,v0,mask_64bit +#ifndef REFLECT + /* + * Now for the Barrett reduction algorithm. The idea is to calculate q, + * the multiple of our polynomial that we need to subtract. By + * doing the computation 2x bits higher (ie 64 bits) and shifting the + * result back down 2x bits, we round down to the nearest multiple. + */ + VPMSUMD(v1,v0,const1) /* ma */ + vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */ + VPMSUMD(v1,v1,const2) /* qn */ + vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Get the result into r3. We need to shift it left 8 bytes: + * V0 [ 0 1 2 X ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */ +#else + /* + * The reflected version of Barrett reduction. Instead of bit + * reflecting our data (which is expensive to do), we bit reflect our + * constants and our algorithm, which means the intermediate data in + * our vector registers goes from 0-63 instead of 63-0. We can reflect + * the algorithm because we don't carry in mod 2 arithmetic. + */ + vand v1,v0,mask_32bit /* bottom 32 bits of a */ + VPMSUMD(v1,v1,const1) /* ma */ + vand v1,v1,mask_32bit /* bottom 32bits of ma */ + VPMSUMD(v1,v1,const2) /* qn */ + vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Since we are bit reflected, the result (ie the low 32 bits) is in + * the high 32 bits. 
We just need to shift it left 4 bytes + * V0 [ 0 1 X 3 ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */ +#endif + + /* Get it into r3 */ + MFVRD(R3, v0) + +.Lout: + subi r6,r1,56+10*16 + subi r7,r1,56+2*16 + + lvx v20,0,r6 + lvx v21,off16,r6 + lvx v22,off32,r6 + lvx v23,off48,r6 + lvx v24,off64,r6 + lvx v25,off80,r6 + lvx v26,off96,r6 + lvx v27,off112,r6 + lvx v28,0,r7 + lvx v29,off16,r7 + + ld r31,-8(r1) + ld r30,-16(r1) + ld r29,-24(r1) + ld r28,-32(r1) + ld r27,-40(r1) + ld r26,-48(r1) + ld r25,-56(r1) + + blr + +.Lfirst_warm_up_done: + lvx const1,0,r3 + addi r3,r3,16 + + VPMSUMD(v8,v16,const1) + VPMSUMD(v9,v17,const1) + VPMSUMD(v10,v18,const1) + VPMSUMD(v11,v19,const1) + VPMSUMD(v12,v20,const1) + VPMSUMD(v13,v21,const1) + VPMSUMD(v14,v22,const1) + VPMSUMD(v15,v23,const1) + + b .Lsecond_cool_down + +.Lshort: + cmpdi r5,0 + beq .Lzero + + LOAD_REG_ADDR(r3, .short_constants) + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,256 + add r3,r3,r6 + + /* How many 16 byte chunks? */ + srdi r7,r5,4 + mtctr r7 + + vxor v19,v19,v19 + vxor v20,v20,v20 + + lvx v0,0,r4 + lvx v16,0,r3 + VPERM(v0,v0,v16,byteswap) + vxor v0,v0,v8 /* xor in initial value */ + VPMSUMW(v0,v0,v16) + bdz .Lv0 + + lvx v1,off16,r4 + lvx v17,off16,r3 + VPERM(v1,v1,v17,byteswap) + VPMSUMW(v1,v1,v17) + bdz .Lv1 + + lvx v2,off32,r4 + lvx v16,off32,r3 + VPERM(v2,v2,v16,byteswap) + VPMSUMW(v2,v2,v16) + bdz .Lv2 + + lvx v3,off48,r4 + lvx v17,off48,r3 + VPERM(v3,v3,v17,byteswap) + VPMSUMW(v3,v3,v17) + bdz .Lv3 + + lvx v4,off64,r4 + lvx v16,off64,r3 + VPERM(v4,v4,v16,byteswap) + VPMSUMW(v4,v4,v16) + bdz .Lv4 + + lvx v5,off80,r4 + lvx v17,off80,r3 + VPERM(v5,v5,v17,byteswap) + VPMSUMW(v5,v5,v17) + bdz .Lv5 + + lvx v6,off96,r4 + lvx v16,off96,r3 + VPERM(v6,v6,v16,byteswap) + VPMSUMW(v6,v6,v16) + bdz .Lv6 + + lvx v7,off112,r4 + lvx v17,off112,r3 + VPERM(v7,v7,v17,byteswap) + VPMSUMW(v7,v7,v17) + bdz .Lv7 + + addi r3,r3,128 + addi r4,r4,128 + + lvx v8,0,r4 + lvx v16,0,r3 + VPERM(v8,v8,v16,byteswap) + VPMSUMW(v8,v8,v16) + bdz .Lv8 + + lvx v9,off16,r4 + lvx v17,off16,r3 + VPERM(v9,v9,v17,byteswap) + VPMSUMW(v9,v9,v17) + bdz .Lv9 + + lvx v10,off32,r4 + lvx v16,off32,r3 + VPERM(v10,v10,v16,byteswap) + VPMSUMW(v10,v10,v16) + bdz .Lv10 + + lvx v11,off48,r4 + lvx v17,off48,r3 + VPERM(v11,v11,v17,byteswap) + VPMSUMW(v11,v11,v17) + bdz .Lv11 + + lvx v12,off64,r4 + lvx v16,off64,r3 + VPERM(v12,v12,v16,byteswap) + VPMSUMW(v12,v12,v16) + bdz .Lv12 + + lvx v13,off80,r4 + lvx v17,off80,r3 + VPERM(v13,v13,v17,byteswap) + VPMSUMW(v13,v13,v17) + bdz .Lv13 + + lvx v14,off96,r4 + lvx v16,off96,r3 + VPERM(v14,v14,v16,byteswap) + VPMSUMW(v14,v14,v16) + bdz .Lv14 + + lvx v15,off112,r4 + lvx v17,off112,r3 + VPERM(v15,v15,v17,byteswap) + VPMSUMW(v15,v15,v17) + +.Lv15: vxor v19,v19,v15 +.Lv14: vxor v20,v20,v14 +.Lv13: vxor v19,v19,v13 +.Lv12: vxor v20,v20,v12 +.Lv11: vxor v19,v19,v11 +.Lv10: vxor v20,v20,v10 +.Lv9: vxor v19,v19,v9 +.Lv8: vxor v20,v20,v8 +.Lv7: vxor v19,v19,v7 +.Lv6: vxor v20,v20,v6 +.Lv5: vxor v19,v19,v5 +.Lv4: vxor v20,v20,v4 +.Lv3: vxor v19,v19,v3 +.Lv2: vxor v20,v20,v2 +.Lv1: vxor v19,v19,v1 +.Lv0: vxor v20,v20,v0 + + vxor v0,v19,v20 + + b .Lbarrett_reduction + +.Lzero: + mr r3,r10 + b .Lout + +FUNC_END(CRC_FUNCTION_NAME) diff --git a/arch/powerpc/lib/crc32-vpmsum_core.S b/arch/powerpc/lib/crc32-vpmsum_core.S deleted file mode 100644 index b0f87f595b26..000000000000 --- a/arch/powerpc/lib/crc32-vpmsum_core.S +++ /dev/null @@ -1,746 +0,0 @@ -/* SPDX-License-Identifier: 
GPL-2.0-or-later */ -/* - * Core of the accelerated CRC algorithm. - * In your file, define the constants and CRC_FUNCTION_NAME - * Then include this file. - * - * Calculate the checksum of data that is 16 byte aligned and a multiple of - * 16 bytes. - * - * The first step is to reduce it to 1024 bits. We do this in 8 parallel - * chunks in order to mask the latency of the vpmsum instructions. If we - * have more than 32 kB of data to checksum we repeat this step multiple - * times, passing in the previous 1024 bits. - * - * The next step is to reduce the 1024 bits to 64 bits. This step adds - * 32 bits of 0s to the end - this matches what a CRC does. We just - * calculate constants that land the data in this 32 bits. - * - * We then use fixed point Barrett reduction to compute a mod n over GF(2) - * for n = CRC using POWER8 instructions. We use x = 32. - * - * https://en.wikipedia.org/wiki/Barrett_reduction - * - * Copyright (C) 2015 Anton Blanchard , IBM -*/ - -#include -#include - -#define MAX_SIZE 32768 - - .text - -#if defined(__BIG_ENDIAN__) && defined(REFLECT) -#define BYTESWAP_DATA -#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) -#define BYTESWAP_DATA -#else -#undef BYTESWAP_DATA -#endif - -#define off16 r25 -#define off32 r26 -#define off48 r27 -#define off64 r28 -#define off80 r29 -#define off96 r30 -#define off112 r31 - -#define const1 v24 -#define const2 v25 - -#define byteswap v26 -#define mask_32bit v27 -#define mask_64bit v28 -#define zeroes v29 - -#ifdef BYTESWAP_DATA -#define VPERM(A, B, C, D) vperm A, B, C, D -#else -#define VPERM(A, B, C, D) -#endif - -/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */ -FUNC_START(CRC_FUNCTION_NAME) - std r31,-8(r1) - std r30,-16(r1) - std r29,-24(r1) - std r28,-32(r1) - std r27,-40(r1) - std r26,-48(r1) - std r25,-56(r1) - - li off16,16 - li off32,32 - li off48,48 - li off64,64 - li off80,80 - li off96,96 - li off112,112 - li r0,0 - - /* Enough room for saving 10 non volatile VMX registers */ - subi r6,r1,56+10*16 - subi r7,r1,56+2*16 - - stvx v20,0,r6 - stvx v21,off16,r6 - stvx v22,off32,r6 - stvx v23,off48,r6 - stvx v24,off64,r6 - stvx v25,off80,r6 - stvx v26,off96,r6 - stvx v27,off112,r6 - stvx v28,0,r7 - stvx v29,off16,r7 - - mr r10,r3 - - vxor zeroes,zeroes,zeroes - vspltisw v0,-1 - - vsldoi mask_32bit,zeroes,v0,4 - vsldoi mask_64bit,zeroes,v0,8 - - /* Get the initial value into v8 */ - vxor v8,v8,v8 - MTVRD(v8, R3) -#ifdef REFLECT - vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */ -#else - vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */ -#endif - -#ifdef BYTESWAP_DATA - LOAD_REG_ADDR(r3, .byteswap_constant) - lvx byteswap,0,r3 - addi r3,r3,16 -#endif - - cmpdi r5,256 - blt .Lshort - - rldicr r6,r5,0,56 - - /* Checksum in blocks of MAX_SIZE */ -1: lis r7,MAX_SIZE@h - ori r7,r7,MAX_SIZE@l - mr r9,r7 - cmpd r6,r7 - bgt 2f - mr r7,r6 -2: subf r6,r7,r6 - - /* our main loop does 128 bytes at a time */ - srdi r7,r7,7 - - /* - * Work out the offset into the constants table to start at. 
Each - * constant is 16 bytes, and it is used against 128 bytes of input - * data - 128 / 16 = 8 - */ - sldi r8,r7,4 - srdi r9,r9,3 - subf r8,r8,r9 - - /* We reduce our final 128 bytes in a separate step */ - addi r7,r7,-1 - mtctr r7 - - LOAD_REG_ADDR(r3, .constants) - - /* Find the start of our constants */ - add r3,r3,r8 - - /* zero v0-v7 which will contain our checksums */ - vxor v0,v0,v0 - vxor v1,v1,v1 - vxor v2,v2,v2 - vxor v3,v3,v3 - vxor v4,v4,v4 - vxor v5,v5,v5 - vxor v6,v6,v6 - vxor v7,v7,v7 - - lvx const1,0,r3 - - /* - * If we are looping back to consume more data we use the values - * already in v16-v23. - */ - cmpdi r0,1 - beq 2f - - /* First warm up pass */ - lvx v16,0,r4 - lvx v17,off16,r4 - VPERM(v16,v16,v16,byteswap) - VPERM(v17,v17,v17,byteswap) - lvx v18,off32,r4 - lvx v19,off48,r4 - VPERM(v18,v18,v18,byteswap) - VPERM(v19,v19,v19,byteswap) - lvx v20,off64,r4 - lvx v21,off80,r4 - VPERM(v20,v20,v20,byteswap) - VPERM(v21,v21,v21,byteswap) - lvx v22,off96,r4 - lvx v23,off112,r4 - VPERM(v22,v22,v22,byteswap) - VPERM(v23,v23,v23,byteswap) - addi r4,r4,8*16 - - /* xor in initial value */ - vxor v16,v16,v8 - -2: bdz .Lfirst_warm_up_done - - addi r3,r3,16 - lvx const2,0,r3 - - /* Second warm up pass */ - VPMSUMD(v8,v16,const1) - lvx v16,0,r4 - VPERM(v16,v16,v16,byteswap) - ori r2,r2,0 - - VPMSUMD(v9,v17,const1) - lvx v17,off16,r4 - VPERM(v17,v17,v17,byteswap) - ori r2,r2,0 - - VPMSUMD(v10,v18,const1) - lvx v18,off32,r4 - VPERM(v18,v18,v18,byteswap) - ori r2,r2,0 - - VPMSUMD(v11,v19,const1) - lvx v19,off48,r4 - VPERM(v19,v19,v19,byteswap) - ori r2,r2,0 - - VPMSUMD(v12,v20,const1) - lvx v20,off64,r4 - VPERM(v20,v20,v20,byteswap) - ori r2,r2,0 - - VPMSUMD(v13,v21,const1) - lvx v21,off80,r4 - VPERM(v21,v21,v21,byteswap) - ori r2,r2,0 - - VPMSUMD(v14,v22,const1) - lvx v22,off96,r4 - VPERM(v22,v22,v22,byteswap) - ori r2,r2,0 - - VPMSUMD(v15,v23,const1) - lvx v23,off112,r4 - VPERM(v23,v23,v23,byteswap) - - addi r4,r4,8*16 - - bdz .Lfirst_cool_down - - /* - * main loop. We modulo schedule it such that it takes three iterations - * to complete - first iteration load, second iteration vpmsum, third - * iteration xor. 
- */ - .balign 16 -4: lvx const1,0,r3 - addi r3,r3,16 - ori r2,r2,0 - - vxor v0,v0,v8 - VPMSUMD(v8,v16,const2) - lvx v16,0,r4 - VPERM(v16,v16,v16,byteswap) - ori r2,r2,0 - - vxor v1,v1,v9 - VPMSUMD(v9,v17,const2) - lvx v17,off16,r4 - VPERM(v17,v17,v17,byteswap) - ori r2,r2,0 - - vxor v2,v2,v10 - VPMSUMD(v10,v18,const2) - lvx v18,off32,r4 - VPERM(v18,v18,v18,byteswap) - ori r2,r2,0 - - vxor v3,v3,v11 - VPMSUMD(v11,v19,const2) - lvx v19,off48,r4 - VPERM(v19,v19,v19,byteswap) - lvx const2,0,r3 - ori r2,r2,0 - - vxor v4,v4,v12 - VPMSUMD(v12,v20,const1) - lvx v20,off64,r4 - VPERM(v20,v20,v20,byteswap) - ori r2,r2,0 - - vxor v5,v5,v13 - VPMSUMD(v13,v21,const1) - lvx v21,off80,r4 - VPERM(v21,v21,v21,byteswap) - ori r2,r2,0 - - vxor v6,v6,v14 - VPMSUMD(v14,v22,const1) - lvx v22,off96,r4 - VPERM(v22,v22,v22,byteswap) - ori r2,r2,0 - - vxor v7,v7,v15 - VPMSUMD(v15,v23,const1) - lvx v23,off112,r4 - VPERM(v23,v23,v23,byteswap) - - addi r4,r4,8*16 - - bdnz 4b - -.Lfirst_cool_down: - /* First cool down pass */ - lvx const1,0,r3 - addi r3,r3,16 - - vxor v0,v0,v8 - VPMSUMD(v8,v16,const1) - ori r2,r2,0 - - vxor v1,v1,v9 - VPMSUMD(v9,v17,const1) - ori r2,r2,0 - - vxor v2,v2,v10 - VPMSUMD(v10,v18,const1) - ori r2,r2,0 - - vxor v3,v3,v11 - VPMSUMD(v11,v19,const1) - ori r2,r2,0 - - vxor v4,v4,v12 - VPMSUMD(v12,v20,const1) - ori r2,r2,0 - - vxor v5,v5,v13 - VPMSUMD(v13,v21,const1) - ori r2,r2,0 - - vxor v6,v6,v14 - VPMSUMD(v14,v22,const1) - ori r2,r2,0 - - vxor v7,v7,v15 - VPMSUMD(v15,v23,const1) - ori r2,r2,0 - -.Lsecond_cool_down: - /* Second cool down pass */ - vxor v0,v0,v8 - vxor v1,v1,v9 - vxor v2,v2,v10 - vxor v3,v3,v11 - vxor v4,v4,v12 - vxor v5,v5,v13 - vxor v6,v6,v14 - vxor v7,v7,v15 - -#ifdef REFLECT - /* - * vpmsumd produces a 96 bit result in the least significant bits - * of the register. Since we are bit reflected we have to shift it - * left 32 bits so it occupies the least significant bits in the - * bit reflected domain. - */ - vsldoi v0,v0,zeroes,4 - vsldoi v1,v1,zeroes,4 - vsldoi v2,v2,zeroes,4 - vsldoi v3,v3,zeroes,4 - vsldoi v4,v4,zeroes,4 - vsldoi v5,v5,zeroes,4 - vsldoi v6,v6,zeroes,4 - vsldoi v7,v7,zeroes,4 -#endif - - /* xor with last 1024 bits */ - lvx v8,0,r4 - lvx v9,off16,r4 - VPERM(v8,v8,v8,byteswap) - VPERM(v9,v9,v9,byteswap) - lvx v10,off32,r4 - lvx v11,off48,r4 - VPERM(v10,v10,v10,byteswap) - VPERM(v11,v11,v11,byteswap) - lvx v12,off64,r4 - lvx v13,off80,r4 - VPERM(v12,v12,v12,byteswap) - VPERM(v13,v13,v13,byteswap) - lvx v14,off96,r4 - lvx v15,off112,r4 - VPERM(v14,v14,v14,byteswap) - VPERM(v15,v15,v15,byteswap) - - addi r4,r4,8*16 - - vxor v16,v0,v8 - vxor v17,v1,v9 - vxor v18,v2,v10 - vxor v19,v3,v11 - vxor v20,v4,v12 - vxor v21,v5,v13 - vxor v22,v6,v14 - vxor v23,v7,v15 - - li r0,1 - cmpdi r6,0 - addi r6,r6,128 - bne 1b - - /* Work out how many bytes we have left */ - andi. 
r5,r5,127 - - /* Calculate where in the constant table we need to start */ - subfic r6,r5,128 - add r3,r3,r6 - - /* How many 16 byte chunks are in the tail */ - srdi r7,r5,4 - mtctr r7 - - /* - * Reduce the previously calculated 1024 bits to 64 bits, shifting - * 32 bits to include the trailing 32 bits of zeros - */ - lvx v0,0,r3 - lvx v1,off16,r3 - lvx v2,off32,r3 - lvx v3,off48,r3 - lvx v4,off64,r3 - lvx v5,off80,r3 - lvx v6,off96,r3 - lvx v7,off112,r3 - addi r3,r3,8*16 - - VPMSUMW(v0,v16,v0) - VPMSUMW(v1,v17,v1) - VPMSUMW(v2,v18,v2) - VPMSUMW(v3,v19,v3) - VPMSUMW(v4,v20,v4) - VPMSUMW(v5,v21,v5) - VPMSUMW(v6,v22,v6) - VPMSUMW(v7,v23,v7) - - /* Now reduce the tail (0 - 112 bytes) */ - cmpdi r7,0 - beq 1f - - lvx v16,0,r4 - lvx v17,0,r3 - VPERM(v16,v16,v16,byteswap) - VPMSUMW(v16,v16,v17) - vxor v0,v0,v16 - bdz 1f - - lvx v16,off16,r4 - lvx v17,off16,r3 - VPERM(v16,v16,v16,byteswap) - VPMSUMW(v16,v16,v17) - vxor v0,v0,v16 - bdz 1f - - lvx v16,off32,r4 - lvx v17,off32,r3 - VPERM(v16,v16,v16,byteswap) - VPMSUMW(v16,v16,v17) - vxor v0,v0,v16 - bdz 1f - - lvx v16,off48,r4 - lvx v17,off48,r3 - VPERM(v16,v16,v16,byteswap) - VPMSUMW(v16,v16,v17) - vxor v0,v0,v16 - bdz 1f - - lvx v16,off64,r4 - lvx v17,off64,r3 - VPERM(v16,v16,v16,byteswap) - VPMSUMW(v16,v16,v17) - vxor v0,v0,v16 - bdz 1f - - lvx v16,off80,r4 - lvx v17,off80,r3 - VPERM(v16,v16,v16,byteswap) - VPMSUMW(v16,v16,v17) - vxor v0,v0,v16 - bdz 1f - - lvx v16,off96,r4 - lvx v17,off96,r3 - VPERM(v16,v16,v16,byteswap) - VPMSUMW(v16,v16,v17) - vxor v0,v0,v16 - - /* Now xor all the parallel chunks together */ -1: vxor v0,v0,v1 - vxor v2,v2,v3 - vxor v4,v4,v5 - vxor v6,v6,v7 - - vxor v0,v0,v2 - vxor v4,v4,v6 - - vxor v0,v0,v4 - -.Lbarrett_reduction: - /* Barrett constants */ - LOAD_REG_ADDR(r3, .barrett_constants) - - lvx const1,0,r3 - lvx const2,off16,r3 - - vsldoi v1,v0,v0,8 - vxor v0,v0,v1 /* xor two 64 bit results together */ - -#ifdef REFLECT - /* shift left one bit */ - vspltisb v1,1 - vsl v0,v0,v1 -#endif - - vand v0,v0,mask_64bit -#ifndef REFLECT - /* - * Now for the Barrett reduction algorithm. The idea is to calculate q, - * the multiple of our polynomial that we need to subtract. By - * doing the computation 2x bits higher (ie 64 bits) and shifting the - * result back down 2x bits, we round down to the nearest multiple. - */ - VPMSUMD(v1,v0,const1) /* ma */ - vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */ - VPMSUMD(v1,v1,const2) /* qn */ - vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ - - /* - * Get the result into r3. We need to shift it left 8 bytes: - * V0 [ 0 1 2 X ] - * V0 [ 0 X 2 3 ] - */ - vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */ -#else - /* - * The reflected version of Barrett reduction. Instead of bit - * reflecting our data (which is expensive to do), we bit reflect our - * constants and our algorithm, which means the intermediate data in - * our vector registers goes from 0-63 instead of 63-0. We can reflect - * the algorithm because we don't carry in mod 2 arithmetic. - */ - vand v1,v0,mask_32bit /* bottom 32 bits of a */ - VPMSUMD(v1,v1,const1) /* ma */ - vand v1,v1,mask_32bit /* bottom 32bits of ma */ - VPMSUMD(v1,v1,const2) /* qn */ - vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ - - /* - * Since we are bit reflected, the result (ie the low 32 bits) is in - * the high 32 bits. 
We just need to shift it left 4 bytes - * V0 [ 0 1 X 3 ] - * V0 [ 0 X 2 3 ] - */ - vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */ -#endif - - /* Get it into r3 */ - MFVRD(R3, v0) - -.Lout: - subi r6,r1,56+10*16 - subi r7,r1,56+2*16 - - lvx v20,0,r6 - lvx v21,off16,r6 - lvx v22,off32,r6 - lvx v23,off48,r6 - lvx v24,off64,r6 - lvx v25,off80,r6 - lvx v26,off96,r6 - lvx v27,off112,r6 - lvx v28,0,r7 - lvx v29,off16,r7 - - ld r31,-8(r1) - ld r30,-16(r1) - ld r29,-24(r1) - ld r28,-32(r1) - ld r27,-40(r1) - ld r26,-48(r1) - ld r25,-56(r1) - - blr - -.Lfirst_warm_up_done: - lvx const1,0,r3 - addi r3,r3,16 - - VPMSUMD(v8,v16,const1) - VPMSUMD(v9,v17,const1) - VPMSUMD(v10,v18,const1) - VPMSUMD(v11,v19,const1) - VPMSUMD(v12,v20,const1) - VPMSUMD(v13,v21,const1) - VPMSUMD(v14,v22,const1) - VPMSUMD(v15,v23,const1) - - b .Lsecond_cool_down - -.Lshort: - cmpdi r5,0 - beq .Lzero - - LOAD_REG_ADDR(r3, .short_constants) - - /* Calculate where in the constant table we need to start */ - subfic r6,r5,256 - add r3,r3,r6 - - /* How many 16 byte chunks? */ - srdi r7,r5,4 - mtctr r7 - - vxor v19,v19,v19 - vxor v20,v20,v20 - - lvx v0,0,r4 - lvx v16,0,r3 - VPERM(v0,v0,v16,byteswap) - vxor v0,v0,v8 /* xor in initial value */ - VPMSUMW(v0,v0,v16) - bdz .Lv0 - - lvx v1,off16,r4 - lvx v17,off16,r3 - VPERM(v1,v1,v17,byteswap) - VPMSUMW(v1,v1,v17) - bdz .Lv1 - - lvx v2,off32,r4 - lvx v16,off32,r3 - VPERM(v2,v2,v16,byteswap) - VPMSUMW(v2,v2,v16) - bdz .Lv2 - - lvx v3,off48,r4 - lvx v17,off48,r3 - VPERM(v3,v3,v17,byteswap) - VPMSUMW(v3,v3,v17) - bdz .Lv3 - - lvx v4,off64,r4 - lvx v16,off64,r3 - VPERM(v4,v4,v16,byteswap) - VPMSUMW(v4,v4,v16) - bdz .Lv4 - - lvx v5,off80,r4 - lvx v17,off80,r3 - VPERM(v5,v5,v17,byteswap) - VPMSUMW(v5,v5,v17) - bdz .Lv5 - - lvx v6,off96,r4 - lvx v16,off96,r3 - VPERM(v6,v6,v16,byteswap) - VPMSUMW(v6,v6,v16) - bdz .Lv6 - - lvx v7,off112,r4 - lvx v17,off112,r3 - VPERM(v7,v7,v17,byteswap) - VPMSUMW(v7,v7,v17) - bdz .Lv7 - - addi r3,r3,128 - addi r4,r4,128 - - lvx v8,0,r4 - lvx v16,0,r3 - VPERM(v8,v8,v16,byteswap) - VPMSUMW(v8,v8,v16) - bdz .Lv8 - - lvx v9,off16,r4 - lvx v17,off16,r3 - VPERM(v9,v9,v17,byteswap) - VPMSUMW(v9,v9,v17) - bdz .Lv9 - - lvx v10,off32,r4 - lvx v16,off32,r3 - VPERM(v10,v10,v16,byteswap) - VPMSUMW(v10,v10,v16) - bdz .Lv10 - - lvx v11,off48,r4 - lvx v17,off48,r3 - VPERM(v11,v11,v17,byteswap) - VPMSUMW(v11,v11,v17) - bdz .Lv11 - - lvx v12,off64,r4 - lvx v16,off64,r3 - VPERM(v12,v12,v16,byteswap) - VPMSUMW(v12,v12,v16) - bdz .Lv12 - - lvx v13,off80,r4 - lvx v17,off80,r3 - VPERM(v13,v13,v17,byteswap) - VPMSUMW(v13,v13,v17) - bdz .Lv13 - - lvx v14,off96,r4 - lvx v16,off96,r3 - VPERM(v14,v14,v16,byteswap) - VPMSUMW(v14,v14,v16) - bdz .Lv14 - - lvx v15,off112,r4 - lvx v17,off112,r3 - VPERM(v15,v15,v17,byteswap) - VPMSUMW(v15,v15,v17) - -.Lv15: vxor v19,v19,v15 -.Lv14: vxor v20,v20,v14 -.Lv13: vxor v19,v19,v13 -.Lv12: vxor v20,v20,v12 -.Lv11: vxor v19,v19,v11 -.Lv10: vxor v20,v20,v10 -.Lv9: vxor v19,v19,v9 -.Lv8: vxor v20,v20,v8 -.Lv7: vxor v19,v19,v7 -.Lv6: vxor v20,v20,v6 -.Lv5: vxor v19,v19,v5 -.Lv4: vxor v20,v20,v4 -.Lv3: vxor v19,v19,v3 -.Lv2: vxor v20,v20,v2 -.Lv1: vxor v19,v19,v1 -.Lv0: vxor v20,v20,v0 - - vxor v0,v19,v20 - - b .Lbarrett_reduction - -.Lzero: - mr r3,r10 - b .Lout - -FUNC_END(CRC_FUNCTION_NAME) diff --git a/arch/powerpc/lib/crc32c-vpmsum_asm.S b/arch/powerpc/lib/crc32c-vpmsum_asm.S index bf442004ea1f..1b35c55cce0a 100644 --- a/arch/powerpc/lib/crc32c-vpmsum_asm.S +++ b/arch/powerpc/lib/crc32c-vpmsum_asm.S @@ -839,4 +839,4 @@ #define 
CRC_FUNCTION_NAME __crc32c_vpmsum #define REFLECT -#include "crc32-vpmsum_core.S" +#include "crc-vpmsum-template.S" diff --git a/arch/powerpc/lib/crct10dif-vpmsum_asm.S b/arch/powerpc/lib/crct10dif-vpmsum_asm.S index f0b93a0fe168..47a6266d89a8 100644 --- a/arch/powerpc/lib/crct10dif-vpmsum_asm.S +++ b/arch/powerpc/lib/crct10dif-vpmsum_asm.S @@ -842,4 +842,4 @@ .octa 0x0000000000000000000000018bb70000 #define CRC_FUNCTION_NAME __crct10dif_vpmsum -#include "crc32-vpmsum_core.S" +#include "crc-vpmsum-template.S" -- cgit v1.2.3-59-g8ed1b From fa7ed85c9bdcd408fa5e85577a64a4d2a10dd807 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 23 Apr 2025 17:20:36 -0700 Subject: s390/crc: drop "glue" from filenames The use of the term "glue" in filenames is a Crypto API-ism that does not show up elsewhere in lib/. I think adopting it there was a mistake. The library just uses standard functions, so the amount of code that could be considered "glue" is quite small. And while often the C functions just wrap the assembly functions, there are also cases like crc32c_arch() in arch/x86/lib/crc32-glue.c that blur the line by in-lining the actual implementation into the C function. That's not "glue code", but rather the actual code. Therefore, let's drop "glue" from the filenames and instead use e.g. crc32.c instead of crc32-glue.c. Reviewed-by: "Martin K. Petersen" Acked-by: Ard Biesheuvel Acked-by: Heiko Carstens Link: https://lore.kernel.org/r/20250424002038.179114-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/s390/lib/Makefile | 2 +- arch/s390/lib/crc32-glue.c | 77 ---------------------------------------------- arch/s390/lib/crc32.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 78 deletions(-) delete mode 100644 arch/s390/lib/crc32-glue.c create mode 100644 arch/s390/lib/crc32.c diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile index 14bbfe50033c..271a1c407121 100644 --- a/arch/s390/lib/Makefile +++ b/arch/s390/lib/Makefile @@ -26,4 +26,4 @@ lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o obj-$(CONFIG_EXPOLINE_EXTERN) += expoline.o obj-$(CONFIG_CRC32_ARCH) += crc32-s390.o -crc32-s390-y := crc32-glue.o crc32le-vx.o crc32be-vx.o +crc32-s390-y := crc32.o crc32le-vx.o crc32be-vx.o diff --git a/arch/s390/lib/crc32-glue.c b/arch/s390/lib/crc32-glue.c deleted file mode 100644 index 3c4b344417c1..000000000000 --- a/arch/s390/lib/crc32-glue.c +++ /dev/null @@ -1,77 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * CRC-32 implemented with the z/Architecture Vector Extension Facility. - * - * Copyright IBM Corp. 2015 - * Author(s): Hendrik Brueckner - */ -#define KMSG_COMPONENT "crc32-vx" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include -#include -#include -#include -#include "crc32-vx.h" - -#define VX_MIN_LEN 64 -#define VX_ALIGNMENT 16L -#define VX_ALIGN_MASK (VX_ALIGNMENT - 1) - -/* - * DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension - * - * Creates a function to perform a particular CRC-32 computation. Depending - * on the message buffer, the hardware-accelerated or software implementation - * is used. Note that the message buffer is aligned to improve fetch - * operations of VECTOR LOAD MULTIPLE instructions. 
- */ -#define DEFINE_CRC32_VX(___fname, ___crc32_vx, ___crc32_sw) \ - u32 ___fname(u32 crc, const u8 *data, size_t datalen) \ - { \ - unsigned long prealign, aligned, remaining; \ - DECLARE_KERNEL_FPU_ONSTACK16(vxstate); \ - \ - if (datalen < VX_MIN_LEN + VX_ALIGN_MASK || !cpu_has_vx()) \ - return ___crc32_sw(crc, data, datalen); \ - \ - if ((unsigned long)data & VX_ALIGN_MASK) { \ - prealign = VX_ALIGNMENT - \ - ((unsigned long)data & VX_ALIGN_MASK); \ - datalen -= prealign; \ - crc = ___crc32_sw(crc, data, prealign); \ - data = (void *)((unsigned long)data + prealign); \ - } \ - \ - aligned = datalen & ~VX_ALIGN_MASK; \ - remaining = datalen & VX_ALIGN_MASK; \ - \ - kernel_fpu_begin(&vxstate, KERNEL_VXR_LOW); \ - crc = ___crc32_vx(crc, data, aligned); \ - kernel_fpu_end(&vxstate, KERNEL_VXR_LOW); \ - \ - if (remaining) \ - crc = ___crc32_sw(crc, data + aligned, remaining); \ - \ - return crc; \ - } \ - EXPORT_SYMBOL(___fname); - -DEFINE_CRC32_VX(crc32_le_arch, crc32_le_vgfm_16, crc32_le_base) -DEFINE_CRC32_VX(crc32_be_arch, crc32_be_vgfm_16, crc32_be_base) -DEFINE_CRC32_VX(crc32c_arch, crc32c_le_vgfm_16, crc32c_base) - -u32 crc32_optimizations(void) -{ - if (cpu_has_vx()) { - return CRC32_LE_OPTIMIZATION | - CRC32_BE_OPTIMIZATION | - CRC32C_OPTIMIZATION; - } - return 0; -} -EXPORT_SYMBOL(crc32_optimizations); - -MODULE_AUTHOR("Hendrik Brueckner "); -MODULE_DESCRIPTION("CRC-32 algorithms using z/Architecture Vector Extension Facility"); -MODULE_LICENSE("GPL"); diff --git a/arch/s390/lib/crc32.c b/arch/s390/lib/crc32.c new file mode 100644 index 000000000000..3c4b344417c1 --- /dev/null +++ b/arch/s390/lib/crc32.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * CRC-32 implemented with the z/Architecture Vector Extension Facility. + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner + */ +#define KMSG_COMPONENT "crc32-vx" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include "crc32-vx.h" + +#define VX_MIN_LEN 64 +#define VX_ALIGNMENT 16L +#define VX_ALIGN_MASK (VX_ALIGNMENT - 1) + +/* + * DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension + * + * Creates a function to perform a particular CRC-32 computation. Depending + * on the message buffer, the hardware-accelerated or software implementation + * is used. Note that the message buffer is aligned to improve fetch + * operations of VECTOR LOAD MULTIPLE instructions. 
+ */ +#define DEFINE_CRC32_VX(___fname, ___crc32_vx, ___crc32_sw) \ + u32 ___fname(u32 crc, const u8 *data, size_t datalen) \ + { \ + unsigned long prealign, aligned, remaining; \ + DECLARE_KERNEL_FPU_ONSTACK16(vxstate); \ + \ + if (datalen < VX_MIN_LEN + VX_ALIGN_MASK || !cpu_has_vx()) \ + return ___crc32_sw(crc, data, datalen); \ + \ + if ((unsigned long)data & VX_ALIGN_MASK) { \ + prealign = VX_ALIGNMENT - \ + ((unsigned long)data & VX_ALIGN_MASK); \ + datalen -= prealign; \ + crc = ___crc32_sw(crc, data, prealign); \ + data = (void *)((unsigned long)data + prealign); \ + } \ + \ + aligned = datalen & ~VX_ALIGN_MASK; \ + remaining = datalen & VX_ALIGN_MASK; \ + \ + kernel_fpu_begin(&vxstate, KERNEL_VXR_LOW); \ + crc = ___crc32_vx(crc, data, aligned); \ + kernel_fpu_end(&vxstate, KERNEL_VXR_LOW); \ + \ + if (remaining) \ + crc = ___crc32_sw(crc, data + aligned, remaining); \ + \ + return crc; \ + } \ + EXPORT_SYMBOL(___fname); + +DEFINE_CRC32_VX(crc32_le_arch, crc32_le_vgfm_16, crc32_le_base) +DEFINE_CRC32_VX(crc32_be_arch, crc32_be_vgfm_16, crc32_be_base) +DEFINE_CRC32_VX(crc32c_arch, crc32c_le_vgfm_16, crc32c_base) + +u32 crc32_optimizations(void) +{ + if (cpu_has_vx()) { + return CRC32_LE_OPTIMIZATION | + CRC32_BE_OPTIMIZATION | + CRC32C_OPTIMIZATION; + } + return 0; +} +EXPORT_SYMBOL(crc32_optimizations); + +MODULE_AUTHOR("Hendrik Brueckner "); +MODULE_DESCRIPTION("CRC-32 algorithms using z/Architecture Vector Extension Facility"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3-59-g8ed1b From ee858d83c59d95e08551a9dc270bedca4b72137d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 23 Apr 2025 17:20:37 -0700 Subject: sparc/crc: drop "glue" from filenames The use of the term "glue" in filenames is a Crypto API-ism that rarely shows up elsewhere in lib/ or arch/*/lib/. I think adopting it there was a mistake. The library just uses standard functions, so the amount of code that could be considered "glue" is quite small. And while often the C functions just wrap the assembly functions, there are also cases like crc32c_arch() in arch/x86/lib/crc32-glue.c that blur the line by in-lining the actual implementation into the C function. That's not "glue code", but rather the actual code. Therefore, let's drop "glue" from the filenames and instead use e.g. crc32.c instead of crc32-glue.c. Reviewed-by: "Martin K. 
Petersen" Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250424002038.179114-7-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/sparc/lib/Makefile | 2 +- arch/sparc/lib/crc32.c | 93 +++++++++++++++++++++++++++++++++++++++++++++ arch/sparc/lib/crc32_glue.c | 93 --------------------------------------------- 3 files changed, 94 insertions(+), 94 deletions(-) create mode 100644 arch/sparc/lib/crc32.c delete mode 100644 arch/sparc/lib/crc32_glue.c diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index 5724d0f356eb..ef8860eb3f3d 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile @@ -54,4 +54,4 @@ obj-$(CONFIG_SPARC64) += iomap.o obj-$(CONFIG_SPARC32) += atomic32.o obj-$(CONFIG_SPARC64) += PeeCeeI.o obj-$(CONFIG_CRC32_ARCH) += crc32-sparc.o -crc32-sparc-y := crc32_glue.o crc32c_asm.o +crc32-sparc-y := crc32.o crc32c_asm.o diff --git a/arch/sparc/lib/crc32.c b/arch/sparc/lib/crc32.c new file mode 100644 index 000000000000..428fd5588e93 --- /dev/null +++ b/arch/sparc/lib/crc32.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* CRC32c (Castagnoli), sparc64 crc32c opcode accelerated + * + * This is based largely upon arch/x86/crypto/crc32c-intel.c + * + * Copyright (C) 2008 Intel Corporation + * Authors: Austin Zhang + * Kent Liu + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32c_opcode); + +u32 crc32_le_arch(u32 crc, const u8 *data, size_t len) +{ + return crc32_le_base(crc, data, len); +} +EXPORT_SYMBOL(crc32_le_arch); + +void crc32c_sparc64(u32 *crcp, const u64 *data, size_t len); + +u32 crc32c_arch(u32 crc, const u8 *data, size_t len) +{ + size_t n = -(uintptr_t)data & 7; + + if (!static_branch_likely(&have_crc32c_opcode)) + return crc32c_base(crc, data, len); + + if (n) { + /* Data isn't 8-byte aligned. Align it. */ + n = min(n, len); + crc = crc32c_base(crc, data, n); + data += n; + len -= n; + } + n = len & ~7U; + if (n) { + crc32c_sparc64(&crc, (const u64 *)data, n); + data += n; + len -= n; + } + if (len) + crc = crc32c_base(crc, data, len); + return crc; +} +EXPORT_SYMBOL(crc32c_arch); + +u32 crc32_be_arch(u32 crc, const u8 *data, size_t len) +{ + return crc32_be_base(crc, data, len); +} +EXPORT_SYMBOL(crc32_be_arch); + +static int __init crc32_sparc_init(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return 0; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_CRC32C)) + return 0; + + static_branch_enable(&have_crc32c_opcode); + pr_info("Using sparc64 crc32c opcode optimized CRC32C implementation\n"); + return 0; +} +arch_initcall(crc32_sparc_init); + +static void __exit crc32_sparc_exit(void) +{ +} +module_exit(crc32_sparc_exit); + +u32 crc32_optimizations(void) +{ + if (static_key_enabled(&have_crc32c_opcode)) + return CRC32C_OPTIMIZATION; + return 0; +} +EXPORT_SYMBOL(crc32_optimizations); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("CRC32c (Castagnoli), sparc64 crc32c opcode accelerated"); diff --git a/arch/sparc/lib/crc32_glue.c b/arch/sparc/lib/crc32_glue.c deleted file mode 100644 index d34e7cc7e1a1..000000000000 --- a/arch/sparc/lib/crc32_glue.c +++ /dev/null @@ -1,93 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* Glue code for CRC32C optimized for sparc64 crypto opcodes. 
- * - * This is based largely upon arch/x86/crypto/crc32c-intel.c - * - * Copyright (C) 2008 Intel Corporation - * Authors: Austin Zhang - * Kent Liu - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32c_opcode); - -u32 crc32_le_arch(u32 crc, const u8 *data, size_t len) -{ - return crc32_le_base(crc, data, len); -} -EXPORT_SYMBOL(crc32_le_arch); - -void crc32c_sparc64(u32 *crcp, const u64 *data, size_t len); - -u32 crc32c_arch(u32 crc, const u8 *data, size_t len) -{ - size_t n = -(uintptr_t)data & 7; - - if (!static_branch_likely(&have_crc32c_opcode)) - return crc32c_base(crc, data, len); - - if (n) { - /* Data isn't 8-byte aligned. Align it. */ - n = min(n, len); - crc = crc32c_base(crc, data, n); - data += n; - len -= n; - } - n = len & ~7U; - if (n) { - crc32c_sparc64(&crc, (const u64 *)data, n); - data += n; - len -= n; - } - if (len) - crc = crc32c_base(crc, data, len); - return crc; -} -EXPORT_SYMBOL(crc32c_arch); - -u32 crc32_be_arch(u32 crc, const u8 *data, size_t len) -{ - return crc32_be_base(crc, data, len); -} -EXPORT_SYMBOL(crc32_be_arch); - -static int __init crc32_sparc_init(void) -{ - unsigned long cfr; - - if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) - return 0; - - __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); - if (!(cfr & CFR_CRC32C)) - return 0; - - static_branch_enable(&have_crc32c_opcode); - pr_info("Using sparc64 crc32c opcode optimized CRC32C implementation\n"); - return 0; -} -arch_initcall(crc32_sparc_init); - -static void __exit crc32_sparc_exit(void) -{ -} -module_exit(crc32_sparc_exit); - -u32 crc32_optimizations(void) -{ - if (static_key_enabled(&have_crc32c_opcode)) - return CRC32C_OPTIMIZATION; - return 0; -} -EXPORT_SYMBOL(crc32_optimizations); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("CRC32c (Castagnoli), sparc64 crc32c opcode accelerated"); -- cgit v1.2.3-59-g8ed1b From 35984c730dea1a9fdd5d931f298849605850856a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 23 Apr 2025 17:20:38 -0700 Subject: x86/crc: drop "glue" from filenames The use of the term "glue" in filenames is a Crypto API-ism that rarely shows up elsewhere in lib/ or arch/*/lib/. I think adopting it there was a mistake. The library just uses standard functions, so the amount of code that could be considered "glue" is quite small. And while often the C functions just wrap the assembly functions, there are also cases like crc32c_arch() in arch/x86/lib/crc32-glue.c that blur the line by in-lining the actual implementation into the C function. That's not "glue code", but rather the actual code. Therefore, let's drop "glue" from the filenames and instead use e.g. crc32.c instead of crc32-glue.c. Reviewed-by: "Martin K. 
Petersen" Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250424002038.179114-8-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/x86/lib/Makefile | 6 +-- arch/x86/lib/crc-t10dif-glue.c | 40 --------------- arch/x86/lib/crc-t10dif.c | 40 +++++++++++++++ arch/x86/lib/crc32-glue.c | 111 ----------------------------------------- arch/x86/lib/crc32.c | 111 +++++++++++++++++++++++++++++++++++++++++ arch/x86/lib/crc64-glue.c | 50 ------------------- arch/x86/lib/crc64.c | 50 +++++++++++++++++++ 7 files changed, 204 insertions(+), 204 deletions(-) delete mode 100644 arch/x86/lib/crc-t10dif-glue.c create mode 100644 arch/x86/lib/crc-t10dif.c delete mode 100644 arch/x86/lib/crc32-glue.c create mode 100644 arch/x86/lib/crc32.c delete mode 100644 arch/x86/lib/crc64-glue.c create mode 100644 arch/x86/lib/crc64.c diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 1c50352eb49f..7cf8681cba0f 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -39,14 +39,14 @@ lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o lib-$(CONFIG_MITIGATION_RETPOLINE) += retpoline.o obj-$(CONFIG_CRC32_ARCH) += crc32-x86.o -crc32-x86-y := crc32-glue.o crc32-pclmul.o +crc32-x86-y := crc32.o crc32-pclmul.o crc32-x86-$(CONFIG_64BIT) += crc32c-3way.o obj-$(CONFIG_CRC64_ARCH) += crc64-x86.o -crc64-x86-y := crc64-glue.o crc64-pclmul.o +crc64-x86-y := crc64.o crc64-pclmul.o obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-x86.o -crc-t10dif-x86-y := crc-t10dif-glue.o crc16-msb-pclmul.o +crc-t10dif-x86-y := crc-t10dif.o crc16-msb-pclmul.o obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o obj-y += iomem.o diff --git a/arch/x86/lib/crc-t10dif-glue.c b/arch/x86/lib/crc-t10dif-glue.c deleted file mode 100644 index d073b3678edc..000000000000 --- a/arch/x86/lib/crc-t10dif-glue.c +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * CRC-T10DIF using [V]PCLMULQDQ instructions - * - * Copyright 2024 Google LLC - */ - -#include -#include -#include "crc-pclmul-template.h" - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); - -DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16); - -u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len) -{ - CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts, - have_pclmulqdq); - return crc_t10dif_generic(crc, p, len); -} -EXPORT_SYMBOL(crc_t10dif_arch); - -static int __init crc_t10dif_x86_init(void) -{ - if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { - static_branch_enable(&have_pclmulqdq); - INIT_CRC_PCLMUL(crc16_msb); - } - return 0; -} -arch_initcall(crc_t10dif_x86_init); - -static void __exit crc_t10dif_x86_exit(void) -{ -} -module_exit(crc_t10dif_x86_exit); - -MODULE_DESCRIPTION("CRC-T10DIF using [V]PCLMULQDQ instructions"); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/lib/crc-t10dif.c b/arch/x86/lib/crc-t10dif.c new file mode 100644 index 000000000000..d073b3678edc --- /dev/null +++ b/arch/x86/lib/crc-t10dif.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * CRC-T10DIF using [V]PCLMULQDQ instructions + * + * Copyright 2024 Google LLC + */ + +#include +#include +#include "crc-pclmul-template.h" + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); + +DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16); + +u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len) +{ + CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts, + have_pclmulqdq); + return crc_t10dif_generic(crc, p, len); +} +EXPORT_SYMBOL(crc_t10dif_arch); + +static int __init crc_t10dif_x86_init(void) +{ + if 
(boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { + static_branch_enable(&have_pclmulqdq); + INIT_CRC_PCLMUL(crc16_msb); + } + return 0; +} +arch_initcall(crc_t10dif_x86_init); + +static void __exit crc_t10dif_x86_exit(void) +{ +} +module_exit(crc_t10dif_x86_exit); + +MODULE_DESCRIPTION("CRC-T10DIF using [V]PCLMULQDQ instructions"); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c deleted file mode 100644 index e6a6285cfca8..000000000000 --- a/arch/x86/lib/crc32-glue.c +++ /dev/null @@ -1,111 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * x86-optimized CRC32 functions - * - * Copyright (C) 2008 Intel Corporation - * Copyright 2012 Xyratex Technology Limited - * Copyright 2024 Google LLC - */ - -#include -#include -#include "crc-pclmul-template.h" - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); - -DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32); - -u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) -{ - CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts, - have_pclmulqdq); - return crc32_le_base(crc, p, len); -} -EXPORT_SYMBOL(crc32_le_arch); - -#ifdef CONFIG_X86_64 -#define CRC32_INST "crc32q %1, %q0" -#else -#define CRC32_INST "crc32l %1, %0" -#endif - -/* - * Use carryless multiply version of crc32c when buffer size is >= 512 to - * account for FPU state save/restore overhead. - */ -#define CRC32C_PCLMUL_BREAKEVEN 512 - -asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len); - -u32 crc32c_arch(u32 crc, const u8 *p, size_t len) -{ - size_t num_longs; - - if (!static_branch_likely(&have_crc32)) - return crc32c_base(crc, p, len); - - if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN && - static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) { - kernel_fpu_begin(); - crc = crc32c_x86_3way(crc, p, len); - kernel_fpu_end(); - return crc; - } - - for (num_longs = len / sizeof(unsigned long); - num_longs != 0; num_longs--, p += sizeof(unsigned long)) - asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p)); - - if (sizeof(unsigned long) > 4 && (len & 4)) { - asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p)); - p += 4; - } - if (len & 2) { - asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p)); - p += 2; - } - if (len & 1) - asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p)); - - return crc; -} -EXPORT_SYMBOL(crc32c_arch); - -u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) -{ - return crc32_be_base(crc, p, len); -} -EXPORT_SYMBOL(crc32_be_arch); - -static int __init crc32_x86_init(void) -{ - if (boot_cpu_has(X86_FEATURE_XMM4_2)) - static_branch_enable(&have_crc32); - if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { - static_branch_enable(&have_pclmulqdq); - INIT_CRC_PCLMUL(crc32_lsb); - } - return 0; -} -arch_initcall(crc32_x86_init); - -static void __exit crc32_x86_exit(void) -{ -} -module_exit(crc32_x86_exit); - -u32 crc32_optimizations(void) -{ - u32 optimizations = 0; - - if (static_key_enabled(&have_crc32)) - optimizations |= CRC32C_OPTIMIZATION; - if (static_key_enabled(&have_pclmulqdq)) - optimizations |= CRC32_LE_OPTIMIZATION; - return optimizations; -} -EXPORT_SYMBOL(crc32_optimizations); - -MODULE_DESCRIPTION("x86-optimized CRC32 functions"); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/lib/crc32.c b/arch/x86/lib/crc32.c new file mode 100644 index 000000000000..e6a6285cfca8 --- /dev/null +++ b/arch/x86/lib/crc32.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * 
x86-optimized CRC32 functions + * + * Copyright (C) 2008 Intel Corporation + * Copyright 2012 Xyratex Technology Limited + * Copyright 2024 Google LLC + */ + +#include +#include +#include "crc-pclmul-template.h" + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); + +DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32); + +u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) +{ + CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts, + have_pclmulqdq); + return crc32_le_base(crc, p, len); +} +EXPORT_SYMBOL(crc32_le_arch); + +#ifdef CONFIG_X86_64 +#define CRC32_INST "crc32q %1, %q0" +#else +#define CRC32_INST "crc32l %1, %0" +#endif + +/* + * Use carryless multiply version of crc32c when buffer size is >= 512 to + * account for FPU state save/restore overhead. + */ +#define CRC32C_PCLMUL_BREAKEVEN 512 + +asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len); + +u32 crc32c_arch(u32 crc, const u8 *p, size_t len) +{ + size_t num_longs; + + if (!static_branch_likely(&have_crc32)) + return crc32c_base(crc, p, len); + + if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN && + static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) { + kernel_fpu_begin(); + crc = crc32c_x86_3way(crc, p, len); + kernel_fpu_end(); + return crc; + } + + for (num_longs = len / sizeof(unsigned long); + num_longs != 0; num_longs--, p += sizeof(unsigned long)) + asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p)); + + if (sizeof(unsigned long) > 4 && (len & 4)) { + asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p)); + p += 4; + } + if (len & 2) { + asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p)); + p += 2; + } + if (len & 1) + asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p)); + + return crc; +} +EXPORT_SYMBOL(crc32c_arch); + +u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) +{ + return crc32_be_base(crc, p, len); +} +EXPORT_SYMBOL(crc32_be_arch); + +static int __init crc32_x86_init(void) +{ + if (boot_cpu_has(X86_FEATURE_XMM4_2)) + static_branch_enable(&have_crc32); + if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { + static_branch_enable(&have_pclmulqdq); + INIT_CRC_PCLMUL(crc32_lsb); + } + return 0; +} +arch_initcall(crc32_x86_init); + +static void __exit crc32_x86_exit(void) +{ +} +module_exit(crc32_x86_exit); + +u32 crc32_optimizations(void) +{ + u32 optimizations = 0; + + if (static_key_enabled(&have_crc32)) + optimizations |= CRC32C_OPTIMIZATION; + if (static_key_enabled(&have_pclmulqdq)) + optimizations |= CRC32_LE_OPTIMIZATION; + return optimizations; +} +EXPORT_SYMBOL(crc32_optimizations); + +MODULE_DESCRIPTION("x86-optimized CRC32 functions"); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/lib/crc64-glue.c b/arch/x86/lib/crc64-glue.c deleted file mode 100644 index 1214ee726c16..000000000000 --- a/arch/x86/lib/crc64-glue.c +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * CRC64 using [V]PCLMULQDQ instructions - * - * Copyright 2025 Google LLC - */ - -#include -#include -#include "crc-pclmul-template.h" - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); - -DECLARE_CRC_PCLMUL_FUNCS(crc64_msb, u64); -DECLARE_CRC_PCLMUL_FUNCS(crc64_lsb, u64); - -u64 crc64_be_arch(u64 crc, const u8 *p, size_t len) -{ - CRC_PCLMUL(crc, p, len, crc64_msb, crc64_msb_0x42f0e1eba9ea3693_consts, - have_pclmulqdq); - return crc64_be_generic(crc, p, len); -} -EXPORT_SYMBOL_GPL(crc64_be_arch); - -u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len) 
-{ - CRC_PCLMUL(crc, p, len, crc64_lsb, crc64_lsb_0x9a6c9329ac4bc9b5_consts, - have_pclmulqdq); - return crc64_nvme_generic(crc, p, len); -} -EXPORT_SYMBOL_GPL(crc64_nvme_arch); - -static int __init crc64_x86_init(void) -{ - if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { - static_branch_enable(&have_pclmulqdq); - INIT_CRC_PCLMUL(crc64_msb); - INIT_CRC_PCLMUL(crc64_lsb); - } - return 0; -} -arch_initcall(crc64_x86_init); - -static void __exit crc64_x86_exit(void) -{ -} -module_exit(crc64_x86_exit); - -MODULE_DESCRIPTION("CRC64 using [V]PCLMULQDQ instructions"); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/lib/crc64.c b/arch/x86/lib/crc64.c new file mode 100644 index 000000000000..1214ee726c16 --- /dev/null +++ b/arch/x86/lib/crc64.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * CRC64 using [V]PCLMULQDQ instructions + * + * Copyright 2025 Google LLC + */ + +#include +#include +#include "crc-pclmul-template.h" + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); + +DECLARE_CRC_PCLMUL_FUNCS(crc64_msb, u64); +DECLARE_CRC_PCLMUL_FUNCS(crc64_lsb, u64); + +u64 crc64_be_arch(u64 crc, const u8 *p, size_t len) +{ + CRC_PCLMUL(crc, p, len, crc64_msb, crc64_msb_0x42f0e1eba9ea3693_consts, + have_pclmulqdq); + return crc64_be_generic(crc, p, len); +} +EXPORT_SYMBOL_GPL(crc64_be_arch); + +u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len) +{ + CRC_PCLMUL(crc, p, len, crc64_lsb, crc64_lsb_0x9a6c9329ac4bc9b5_consts, + have_pclmulqdq); + return crc64_nvme_generic(crc, p, len); +} +EXPORT_SYMBOL_GPL(crc64_nvme_arch); + +static int __init crc64_x86_init(void) +{ + if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { + static_branch_enable(&have_pclmulqdq); + INIT_CRC_PCLMUL(crc64_msb); + INIT_CRC_PCLMUL(crc64_lsb); + } + return 0; +} +arch_initcall(crc64_x86_init); + +static void __exit crc64_x86_exit(void) +{ +} +module_exit(crc64_x86_exit); + +MODULE_DESCRIPTION("CRC64 using [V]PCLMULQDQ instructions"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3-59-g8ed1b From 46e3311607d6c18a760fba4afbd5d24d42abb0f3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 28 Apr 2025 09:24:58 -0700 Subject: crypto: crc32 - remove "generic" from file and module names Since crc32_generic.c and crc32c_generic.c now expose both the generic and architecture-optimized implementations via the crypto_shash API, rather than just the generic implementations as they originally did, remove the "generic" part of the filenames and module names: crypto/crc32-generic.c => crypto/crc32.c crypto/crc32c-generic.c => crypto/crc32c.c crc32-generic.ko => crc32-cryptoapi.ko crc32c-generic.ko => crc32c-cryptoapi.ko The reason for adding the -cryptoapi suffixes to the module names is to avoid a module name collision with crc32.ko which is the library API. We could instead rename the library module to libcrc32.ko. However, while lib/crypto/ uses that convention, the rest of lib/ doesn't. Since the library API is the primary API for CRC-32, I'd like to keep the unsuffixed name for it and make the Crypto API modules use a suffix. 
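For readers who want to see how the renamed drivers are reached at run time, a minimal kernel caller might look like the sketch below (hypothetical function and variable names, not part of this patch). Both drivers register under the algorithm name "crc32", so callers keep requesting "crc32" and the crypto core selects "crc32-$(ARCH)" (priority 150) over "crc32-generic" (priority 100) whenever the arch-optimized flavor was registered:

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/printk.h>
#include <linux/types.h>

/* Hypothetical example: compute a CRC-32 digest through the shash API. */
static int crc32_shash_demo(const u8 *data, unsigned int len, u8 digest[4])
{
	struct crypto_shash *tfm;
	int err;

	tfm = crypto_alloc_shash("crc32", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	/* Reports "crc32-generic" or the arch driver, e.g. "crc32-x86". */
	pr_info("crc32 provided by %s\n", crypto_shash_driver_name(tfm));

	{
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		/* Digest is the raw little-endian CRC, no final inversion. */
		err = crypto_shash_digest(desc, data, len, digest);
	}

	crypto_free_shash(tfm);
	return err;
}
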
Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20250428162458.29732-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/Makefile | 10 ++- crypto/crc32.c | 182 +++++++++++++++++++++++++++++++++++++++ crypto/crc32_generic.c | 182 --------------------------------------- crypto/crc32c.c | 222 ++++++++++++++++++++++++++++++++++++++++++++++++ crypto/crc32c_generic.c | 222 ------------------------------------------------ 5 files changed, 410 insertions(+), 408 deletions(-) create mode 100644 crypto/crc32.c delete mode 100644 crypto/crc32_generic.c create mode 100644 crypto/crc32c.c delete mode 100644 crypto/crc32c_generic.c diff --git a/crypto/Makefile b/crypto/Makefile index 0e6ab5ffd3f7..186f968baa39 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -152,10 +152,12 @@ obj-$(CONFIG_CRYPTO_CHACHA20) += chacha_generic.o obj-$(CONFIG_CRYPTO_POLY1305) += poly1305_generic.o obj-$(CONFIG_CRYPTO_DEFLATE) += deflate.o obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o -obj-$(CONFIG_CRYPTO_CRC32C) += crc32c_generic.o -obj-$(CONFIG_CRYPTO_CRC32) += crc32_generic.o -CFLAGS_crc32c_generic.o += -DARCH=$(ARCH) -CFLAGS_crc32_generic.o += -DARCH=$(ARCH) +obj-$(CONFIG_CRYPTO_CRC32C) += crc32c-cryptoapi.o +crc32c-cryptoapi-y := crc32c.o +CFLAGS_crc32c.o += -DARCH=$(ARCH) +obj-$(CONFIG_CRYPTO_CRC32) += crc32-cryptoapi.o +crc32-cryptoapi-y := crc32.o +CFLAGS_crc32.o += -DARCH=$(ARCH) obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o authencesn.o obj-$(CONFIG_CRYPTO_KRB5ENC) += krb5enc.o obj-$(CONFIG_CRYPTO_LZO) += lzo.o lzo-rle.o diff --git a/crypto/crc32.c b/crypto/crc32.c new file mode 100644 index 000000000000..783a30b27398 --- /dev/null +++ b/crypto/crc32.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2012 Xyratex Technology Limited + */ + +/* + * This is crypto api shash wrappers to crc32_le. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +/** No default init with ~0 */ +static int crc32_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = 0; + + return 0; +} + +/* + * Setting the seed allows arbitrary accumulators and flexible XOR policy + * If your algorithm starts with ~0, then XOR with ~0 before you set + * the seed. 
+ */ +static int crc32_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) + return -EINVAL; + *mctx = get_unaligned_le32(key); + return 0; +} + +static int crc32_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crcp = shash_desc_ctx(desc); + + *crcp = *mctx; + + return 0; +} + +static int crc32_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + + *crcp = crc32_le_base(*crcp, data, len); + return 0; +} + +static int crc32_update_arch(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + + *crcp = crc32_le(*crcp, data, len); + return 0; +} + +/* No final XOR 0xFFFFFFFF, like crc32_le */ +static int __crc32_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + put_unaligned_le32(crc32_le_base(*crcp, data, len), out); + return 0; +} + +static int __crc32_finup_arch(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + put_unaligned_le32(crc32_le(*crcp, data, len), out); + return 0; +} + +static int crc32_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_finup(shash_desc_ctx(desc), data, len, out); +} + +static int crc32_finup_arch(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_finup_arch(shash_desc_ctx(desc), data, len, out); +} + +static int crc32_final(struct shash_desc *desc, u8 *out) +{ + u32 *crcp = shash_desc_ctx(desc); + + put_unaligned_le32(*crcp, out); + return 0; +} + +static int crc32_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_finup(crypto_shash_ctx(desc->tfm), data, len, out); +} + +static int crc32_digest_arch(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_finup_arch(crypto_shash_ctx(desc->tfm), data, len, out); +} + +static struct shash_alg algs[] = {{ + .setkey = crc32_setkey, + .init = crc32_init, + .update = crc32_update, + .final = crc32_final, + .finup = crc32_finup, + .digest = crc32_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + + .base.cra_name = "crc32", + .base.cra_driver_name = "crc32-generic", + .base.cra_priority = 100, + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_blocksize = CHKSUM_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(u32), + .base.cra_module = THIS_MODULE, + .base.cra_init = crc32_cra_init, +}, { + .setkey = crc32_setkey, + .init = crc32_init, + .update = crc32_update_arch, + .final = crc32_final, + .finup = crc32_finup_arch, + .digest = crc32_digest_arch, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + + .base.cra_name = "crc32", + .base.cra_driver_name = "crc32-" __stringify(ARCH), + .base.cra_priority = 150, + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_blocksize = CHKSUM_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(u32), + .base.cra_module = THIS_MODULE, + .base.cra_init = crc32_cra_init, +}}; + +static int num_algs; + +static int __init crc32_mod_init(void) +{ + /* register the arch flavor only if it differs from the generic one */ + num_algs = 1 + ((crc32_optimizations() & CRC32_LE_OPTIMIZATION) != 0); + + return crypto_register_shashes(algs, num_algs); +} + +static void __exit crc32_mod_fini(void) +{ + crypto_unregister_shashes(algs, num_algs); +} + +subsys_initcall(crc32_mod_init); +module_exit(crc32_mod_fini); + +MODULE_AUTHOR("Alexander Boyko "); 
+MODULE_DESCRIPTION("CRC32 calculations wrapper for lib/crc32"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_CRYPTO("crc32"); +MODULE_ALIAS_CRYPTO("crc32-generic"); diff --git a/crypto/crc32_generic.c b/crypto/crc32_generic.c deleted file mode 100644 index 783a30b27398..000000000000 --- a/crypto/crc32_generic.c +++ /dev/null @@ -1,182 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2012 Xyratex Technology Limited - */ - -/* - * This is crypto api shash wrappers to crc32_le. - */ - -#include -#include -#include -#include -#include -#include -#include - -#define CHKSUM_BLOCK_SIZE 1 -#define CHKSUM_DIGEST_SIZE 4 - -/** No default init with ~0 */ -static int crc32_cra_init(struct crypto_tfm *tfm) -{ - u32 *key = crypto_tfm_ctx(tfm); - - *key = 0; - - return 0; -} - -/* - * Setting the seed allows arbitrary accumulators and flexible XOR policy - * If your algorithm starts with ~0, then XOR with ~0 before you set - * the seed. - */ -static int crc32_setkey(struct crypto_shash *hash, const u8 *key, - unsigned int keylen) -{ - u32 *mctx = crypto_shash_ctx(hash); - - if (keylen != sizeof(u32)) - return -EINVAL; - *mctx = get_unaligned_le32(key); - return 0; -} - -static int crc32_init(struct shash_desc *desc) -{ - u32 *mctx = crypto_shash_ctx(desc->tfm); - u32 *crcp = shash_desc_ctx(desc); - - *crcp = *mctx; - - return 0; -} - -static int crc32_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - u32 *crcp = shash_desc_ctx(desc); - - *crcp = crc32_le_base(*crcp, data, len); - return 0; -} - -static int crc32_update_arch(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - u32 *crcp = shash_desc_ctx(desc); - - *crcp = crc32_le(*crcp, data, len); - return 0; -} - -/* No final XOR 0xFFFFFFFF, like crc32_le */ -static int __crc32_finup(u32 *crcp, const u8 *data, unsigned int len, - u8 *out) -{ - put_unaligned_le32(crc32_le_base(*crcp, data, len), out); - return 0; -} - -static int __crc32_finup_arch(u32 *crcp, const u8 *data, unsigned int len, - u8 *out) -{ - put_unaligned_le32(crc32_le(*crcp, data, len), out); - return 0; -} - -static int crc32_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32_finup(shash_desc_ctx(desc), data, len, out); -} - -static int crc32_finup_arch(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32_finup_arch(shash_desc_ctx(desc), data, len, out); -} - -static int crc32_final(struct shash_desc *desc, u8 *out) -{ - u32 *crcp = shash_desc_ctx(desc); - - put_unaligned_le32(*crcp, out); - return 0; -} - -static int crc32_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32_finup(crypto_shash_ctx(desc->tfm), data, len, out); -} - -static int crc32_digest_arch(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32_finup_arch(crypto_shash_ctx(desc->tfm), data, len, out); -} - -static struct shash_alg algs[] = {{ - .setkey = crc32_setkey, - .init = crc32_init, - .update = crc32_update, - .final = crc32_final, - .finup = crc32_finup, - .digest = crc32_digest, - .descsize = sizeof(u32), - .digestsize = CHKSUM_DIGEST_SIZE, - - .base.cra_name = "crc32", - .base.cra_driver_name = "crc32-generic", - .base.cra_priority = 100, - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .base.cra_blocksize = CHKSUM_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(u32), - .base.cra_module = THIS_MODULE, - .base.cra_init = crc32_cra_init, -}, { - .setkey = crc32_setkey, - .init = crc32_init, - .update = 
crc32_update_arch, - .final = crc32_final, - .finup = crc32_finup_arch, - .digest = crc32_digest_arch, - .descsize = sizeof(u32), - .digestsize = CHKSUM_DIGEST_SIZE, - - .base.cra_name = "crc32", - .base.cra_driver_name = "crc32-" __stringify(ARCH), - .base.cra_priority = 150, - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .base.cra_blocksize = CHKSUM_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(u32), - .base.cra_module = THIS_MODULE, - .base.cra_init = crc32_cra_init, -}}; - -static int num_algs; - -static int __init crc32_mod_init(void) -{ - /* register the arch flavor only if it differs from the generic one */ - num_algs = 1 + ((crc32_optimizations() & CRC32_LE_OPTIMIZATION) != 0); - - return crypto_register_shashes(algs, num_algs); -} - -static void __exit crc32_mod_fini(void) -{ - crypto_unregister_shashes(algs, num_algs); -} - -subsys_initcall(crc32_mod_init); -module_exit(crc32_mod_fini); - -MODULE_AUTHOR("Alexander Boyko "); -MODULE_DESCRIPTION("CRC32 calculations wrapper for lib/crc32"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_CRYPTO("crc32"); -MODULE_ALIAS_CRYPTO("crc32-generic"); diff --git a/crypto/crc32c.c b/crypto/crc32c.c new file mode 100644 index 000000000000..b1a36d32dc50 --- /dev/null +++ b/crypto/crc32c.c @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Cryptographic API. + * + * CRC32C chksum + * + *@Article{castagnoli-crc, + * author = { Guy Castagnoli and Stefan Braeuer and Martin Herrman}, + * title = {{Optimization of Cyclic Redundancy-Check Codes with 24 + * and 32 Parity Bits}}, + * journal = IEEE Transactions on Communication, + * year = {1993}, + * volume = {41}, + * number = {6}, + * pages = {}, + * month = {June}, + *} + * Used by the iSCSI driver, possibly others, and derived from + * the iscsi-crc.c module of the linux-iscsi driver at + * http://linux-iscsi.sourceforge.net. + * + * Following the example of lib/crc32, this function is intended to be + * flexible and useful for all users. Modules that currently have their + * own crc32c, but hopefully may be able to use this one are: + * net/sctp (please add all your doco to here if you change to + * use this one!) + * + * + * Copyright (c) 2004 Cisco Systems, Inc. + * Copyright (c) 2008 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +struct chksum_ctx { + u32 key; +}; + +struct chksum_desc_ctx { + u32 crc; +}; + +/* + * Steps through buffer one byte at a time, calculates reflected + * crc using table. + */ + +static int chksum_init(struct shash_desc *desc) +{ + struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm); + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = mctx->key; + + return 0; +} + +/* + * Setting the seed allows arbitrary accumulators and flexible XOR policy + * If your algorithm starts with ~0, then XOR with ~0 before you set + * the seed. 
+ */ +static int chksum_setkey(struct crypto_shash *tfm, const u8 *key, + unsigned int keylen) +{ + struct chksum_ctx *mctx = crypto_shash_ctx(tfm); + + if (keylen != sizeof(mctx->key)) + return -EINVAL; + mctx->key = get_unaligned_le32(key); + return 0; +} + +static int chksum_update(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = crc32c_base(ctx->crc, data, length); + return 0; +} + +static int chksum_update_arch(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = crc32c(ctx->crc, data, length); + return 0; +} + +static int chksum_final(struct shash_desc *desc, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + put_unaligned_le32(~ctx->crc, out); + return 0; +} + +static int __chksum_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out) +{ + put_unaligned_le32(~crc32c_base(*crcp, data, len), out); + return 0; +} + +static int __chksum_finup_arch(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + put_unaligned_le32(~crc32c(*crcp, data, len), out); + return 0; +} + +static int chksum_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(&ctx->crc, data, len, out); +} + +static int chksum_finup_arch(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup_arch(&ctx->crc, data, len, out); +} + +static int chksum_digest(struct shash_desc *desc, const u8 *data, + unsigned int length, u8 *out) +{ + struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm); + + return __chksum_finup(&mctx->key, data, length, out); +} + +static int chksum_digest_arch(struct shash_desc *desc, const u8 *data, + unsigned int length, u8 *out) +{ + struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm); + + return __chksum_finup_arch(&mctx->key, data, length, out); +} + +static int crc32c_cra_init(struct crypto_tfm *tfm) +{ + struct chksum_ctx *mctx = crypto_tfm_ctx(tfm); + + mctx->key = ~0; + return 0; +} + +static struct shash_alg algs[] = {{ + .digestsize = CHKSUM_DIGEST_SIZE, + .setkey = chksum_setkey, + .init = chksum_init, + .update = chksum_update, + .final = chksum_final, + .finup = chksum_finup, + .digest = chksum_digest, + .descsize = sizeof(struct chksum_desc_ctx), + + .base.cra_name = "crc32c", + .base.cra_driver_name = "crc32c-generic", + .base.cra_priority = 100, + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_blocksize = CHKSUM_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct chksum_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_init = crc32c_cra_init, +}, { + .digestsize = CHKSUM_DIGEST_SIZE, + .setkey = chksum_setkey, + .init = chksum_init, + .update = chksum_update_arch, + .final = chksum_final, + .finup = chksum_finup_arch, + .digest = chksum_digest_arch, + .descsize = sizeof(struct chksum_desc_ctx), + + .base.cra_name = "crc32c", + .base.cra_driver_name = "crc32c-" __stringify(ARCH), + .base.cra_priority = 150, + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_blocksize = CHKSUM_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct chksum_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_init = crc32c_cra_init, +}}; + +static int num_algs; + +static int __init crc32c_mod_init(void) +{ + /* register the arch flavor only if it differs from the generic one */ + num_algs = 1 + 
((crc32_optimizations() & CRC32C_OPTIMIZATION) != 0); + + return crypto_register_shashes(algs, num_algs); +} + +static void __exit crc32c_mod_fini(void) +{ + crypto_unregister_shashes(algs, num_algs); +} + +subsys_initcall(crc32c_mod_init); +module_exit(crc32c_mod_fini); + +MODULE_AUTHOR("Clay Haapala "); +MODULE_DESCRIPTION("CRC32c (Castagnoli) calculations wrapper for lib/crc32c"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_CRYPTO("crc32c"); +MODULE_ALIAS_CRYPTO("crc32c-generic"); diff --git a/crypto/crc32c_generic.c b/crypto/crc32c_generic.c deleted file mode 100644 index b1a36d32dc50..000000000000 --- a/crypto/crc32c_generic.c +++ /dev/null @@ -1,222 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Cryptographic API. - * - * CRC32C chksum - * - *@Article{castagnoli-crc, - * author = { Guy Castagnoli and Stefan Braeuer and Martin Herrman}, - * title = {{Optimization of Cyclic Redundancy-Check Codes with 24 - * and 32 Parity Bits}}, - * journal = IEEE Transactions on Communication, - * year = {1993}, - * volume = {41}, - * number = {6}, - * pages = {}, - * month = {June}, - *} - * Used by the iSCSI driver, possibly others, and derived from - * the iscsi-crc.c module of the linux-iscsi driver at - * http://linux-iscsi.sourceforge.net. - * - * Following the example of lib/crc32, this function is intended to be - * flexible and useful for all users. Modules that currently have their - * own crc32c, but hopefully may be able to use this one are: - * net/sctp (please add all your doco to here if you change to - * use this one!) - * - * - * Copyright (c) 2004 Cisco Systems, Inc. - * Copyright (c) 2008 Herbert Xu - */ - -#include -#include -#include -#include -#include -#include -#include - -#define CHKSUM_BLOCK_SIZE 1 -#define CHKSUM_DIGEST_SIZE 4 - -struct chksum_ctx { - u32 key; -}; - -struct chksum_desc_ctx { - u32 crc; -}; - -/* - * Steps through buffer one byte at a time, calculates reflected - * crc using table. - */ - -static int chksum_init(struct shash_desc *desc) -{ - struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm); - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - ctx->crc = mctx->key; - - return 0; -} - -/* - * Setting the seed allows arbitrary accumulators and flexible XOR policy - * If your algorithm starts with ~0, then XOR with ~0 before you set - * the seed. 
- */ -static int chksum_setkey(struct crypto_shash *tfm, const u8 *key, - unsigned int keylen) -{ - struct chksum_ctx *mctx = crypto_shash_ctx(tfm); - - if (keylen != sizeof(mctx->key)) - return -EINVAL; - mctx->key = get_unaligned_le32(key); - return 0; -} - -static int chksum_update(struct shash_desc *desc, const u8 *data, - unsigned int length) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - ctx->crc = crc32c_base(ctx->crc, data, length); - return 0; -} - -static int chksum_update_arch(struct shash_desc *desc, const u8 *data, - unsigned int length) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - ctx->crc = crc32c(ctx->crc, data, length); - return 0; -} - -static int chksum_final(struct shash_desc *desc, u8 *out) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - put_unaligned_le32(~ctx->crc, out); - return 0; -} - -static int __chksum_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out) -{ - put_unaligned_le32(~crc32c_base(*crcp, data, len), out); - return 0; -} - -static int __chksum_finup_arch(u32 *crcp, const u8 *data, unsigned int len, - u8 *out) -{ - put_unaligned_le32(~crc32c(*crcp, data, len), out); - return 0; -} - -static int chksum_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - return __chksum_finup(&ctx->crc, data, len, out); -} - -static int chksum_finup_arch(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - return __chksum_finup_arch(&ctx->crc, data, len, out); -} - -static int chksum_digest(struct shash_desc *desc, const u8 *data, - unsigned int length, u8 *out) -{ - struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm); - - return __chksum_finup(&mctx->key, data, length, out); -} - -static int chksum_digest_arch(struct shash_desc *desc, const u8 *data, - unsigned int length, u8 *out) -{ - struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm); - - return __chksum_finup_arch(&mctx->key, data, length, out); -} - -static int crc32c_cra_init(struct crypto_tfm *tfm) -{ - struct chksum_ctx *mctx = crypto_tfm_ctx(tfm); - - mctx->key = ~0; - return 0; -} - -static struct shash_alg algs[] = {{ - .digestsize = CHKSUM_DIGEST_SIZE, - .setkey = chksum_setkey, - .init = chksum_init, - .update = chksum_update, - .final = chksum_final, - .finup = chksum_finup, - .digest = chksum_digest, - .descsize = sizeof(struct chksum_desc_ctx), - - .base.cra_name = "crc32c", - .base.cra_driver_name = "crc32c-generic", - .base.cra_priority = 100, - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .base.cra_blocksize = CHKSUM_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct chksum_ctx), - .base.cra_module = THIS_MODULE, - .base.cra_init = crc32c_cra_init, -}, { - .digestsize = CHKSUM_DIGEST_SIZE, - .setkey = chksum_setkey, - .init = chksum_init, - .update = chksum_update_arch, - .final = chksum_final, - .finup = chksum_finup_arch, - .digest = chksum_digest_arch, - .descsize = sizeof(struct chksum_desc_ctx), - - .base.cra_name = "crc32c", - .base.cra_driver_name = "crc32c-" __stringify(ARCH), - .base.cra_priority = 150, - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .base.cra_blocksize = CHKSUM_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct chksum_ctx), - .base.cra_module = THIS_MODULE, - .base.cra_init = crc32c_cra_init, -}}; - -static int num_algs; - -static int __init crc32c_mod_init(void) -{ - /* register the arch flavor only if it differs from the generic one */ - num_algs = 1 + 
((crc32_optimizations() & CRC32C_OPTIMIZATION) != 0); - - return crypto_register_shashes(algs, num_algs); -} - -static void __exit crc32c_mod_fini(void) -{ - crypto_unregister_shashes(algs, num_algs); -} - -subsys_initcall(crc32c_mod_init); -module_exit(crc32c_mod_fini); - -MODULE_AUTHOR("Clay Haapala "); -MODULE_DESCRIPTION("CRC32c (Castagnoli) calculations wrapper for lib/crc32c"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_CRYPTO("crc32c"); -MODULE_ALIAS_CRYPTO("crc32c-generic"); -- cgit v1.2.3-59-g8ed1b From 648c7fb16f609c53d2659fefb5088af619485ab4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 9 May 2025 20:59:59 -0700 Subject: lib/crc: make arch-optimized code use subsys_initcall Make the architecture-optimized CRC code do its CPU feature checks in subsys_initcalls instead of arch_initcalls. This makes it consistent with arch/*/lib/crypto/ and ensures that it runs after initcalls that possibly could be a prerequisite for kernel-mode FPU, such as x86's xfd_update_static_branch() and loongarch's init_euen_mask(). Note: as far as I can tell, x86's xfd_update_static_branch() isn't *actually* needed for kernel-mode FPU. loongarch's init_euen_mask() is needed to enable save/restore of the vector registers, but loongarch doesn't yet have any CRC or crypto code that uses vector registers anyway. Regardless, let's be consistent with arch/*/lib/crypto/ and robust against any potential future dependency on an arch_initcall. Link: https://lore.kernel.org/r/20250510035959.87995-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm/lib/crc-t10dif.c | 2 +- arch/arm/lib/crc32.c | 2 +- arch/arm64/lib/crc-t10dif.c | 2 +- arch/loongarch/lib/crc32-loongarch.c | 2 +- arch/mips/lib/crc32-mips.c | 2 +- arch/powerpc/lib/crc-t10dif.c | 2 +- arch/powerpc/lib/crc32.c | 2 +- arch/sparc/lib/crc32.c | 2 +- arch/x86/lib/crc-t10dif.c | 2 +- arch/x86/lib/crc32.c | 2 +- arch/x86/lib/crc64.c | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/arm/lib/crc-t10dif.c b/arch/arm/lib/crc-t10dif.c index 382437094bdd..1093f8ec13b0 100644 --- a/arch/arm/lib/crc-t10dif.c +++ b/arch/arm/lib/crc-t10dif.c @@ -60,7 +60,7 @@ static int __init crc_t10dif_arm_init(void) } return 0; } -arch_initcall(crc_t10dif_arm_init); +subsys_initcall(crc_t10dif_arm_init); static void __exit crc_t10dif_arm_exit(void) { diff --git a/arch/arm/lib/crc32.c b/arch/arm/lib/crc32.c index 7ef7db9c0de7..f2bef8849c7c 100644 --- a/arch/arm/lib/crc32.c +++ b/arch/arm/lib/crc32.c @@ -103,7 +103,7 @@ static int __init crc32_arm_init(void) static_branch_enable(&have_pmull); return 0; } -arch_initcall(crc32_arm_init); +subsys_initcall(crc32_arm_init); static void __exit crc32_arm_exit(void) { diff --git a/arch/arm64/lib/crc-t10dif.c b/arch/arm64/lib/crc-t10dif.c index 99d0b5668a28..c2ffe4fdb59d 100644 --- a/arch/arm64/lib/crc-t10dif.c +++ b/arch/arm64/lib/crc-t10dif.c @@ -61,7 +61,7 @@ static int __init crc_t10dif_arm64_init(void) } return 0; } -arch_initcall(crc_t10dif_arm64_init); +subsys_initcall(crc_t10dif_arm64_init); static void __exit crc_t10dif_arm64_exit(void) { diff --git a/arch/loongarch/lib/crc32-loongarch.c b/arch/loongarch/lib/crc32-loongarch.c index 8e6d1f517e73..b37cd8537b45 100644 --- a/arch/loongarch/lib/crc32-loongarch.c +++ b/arch/loongarch/lib/crc32-loongarch.c @@ -114,7 +114,7 @@ static int __init crc32_loongarch_init(void) static_branch_enable(&have_crc32); return 0; } -arch_initcall(crc32_loongarch_init); +subsys_initcall(crc32_loongarch_init); static void __exit crc32_loongarch_exit(void) { diff --git 
a/arch/mips/lib/crc32-mips.c b/arch/mips/lib/crc32-mips.c index 84df361e7181..45e4d2c9fbf5 100644 --- a/arch/mips/lib/crc32-mips.c +++ b/arch/mips/lib/crc32-mips.c @@ -163,7 +163,7 @@ static int __init crc32_mips_init(void) static_branch_enable(&have_crc32); return 0; } -arch_initcall(crc32_mips_init); +subsys_initcall(crc32_mips_init); static void __exit crc32_mips_exit(void) { diff --git a/arch/powerpc/lib/crc-t10dif.c b/arch/powerpc/lib/crc-t10dif.c index ddd5c4088f50..4253842cc50d 100644 --- a/arch/powerpc/lib/crc-t10dif.c +++ b/arch/powerpc/lib/crc-t10dif.c @@ -71,7 +71,7 @@ static int __init crc_t10dif_powerpc_init(void) static_branch_enable(&have_vec_crypto); return 0; } -arch_initcall(crc_t10dif_powerpc_init); +subsys_initcall(crc_t10dif_powerpc_init); static void __exit crc_t10dif_powerpc_exit(void) { diff --git a/arch/powerpc/lib/crc32.c b/arch/powerpc/lib/crc32.c index 42f2dd3c85dd..77e5a37006f0 100644 --- a/arch/powerpc/lib/crc32.c +++ b/arch/powerpc/lib/crc32.c @@ -72,7 +72,7 @@ static int __init crc32_powerpc_init(void) static_branch_enable(&have_vec_crypto); return 0; } -arch_initcall(crc32_powerpc_init); +subsys_initcall(crc32_powerpc_init); static void __exit crc32_powerpc_exit(void) { diff --git a/arch/sparc/lib/crc32.c b/arch/sparc/lib/crc32.c index 428fd5588e93..40d4720a42a1 100644 --- a/arch/sparc/lib/crc32.c +++ b/arch/sparc/lib/crc32.c @@ -74,7 +74,7 @@ static int __init crc32_sparc_init(void) pr_info("Using sparc64 crc32c opcode optimized CRC32C implementation\n"); return 0; } -arch_initcall(crc32_sparc_init); +subsys_initcall(crc32_sparc_init); static void __exit crc32_sparc_exit(void) { diff --git a/arch/x86/lib/crc-t10dif.c b/arch/x86/lib/crc-t10dif.c index d073b3678edc..db7ce59c31ac 100644 --- a/arch/x86/lib/crc-t10dif.c +++ b/arch/x86/lib/crc-t10dif.c @@ -29,7 +29,7 @@ static int __init crc_t10dif_x86_init(void) } return 0; } -arch_initcall(crc_t10dif_x86_init); +subsys_initcall(crc_t10dif_x86_init); static void __exit crc_t10dif_x86_exit(void) { diff --git a/arch/x86/lib/crc32.c b/arch/x86/lib/crc32.c index e6a6285cfca8..d09343e2cea9 100644 --- a/arch/x86/lib/crc32.c +++ b/arch/x86/lib/crc32.c @@ -88,7 +88,7 @@ static int __init crc32_x86_init(void) } return 0; } -arch_initcall(crc32_x86_init); +subsys_initcall(crc32_x86_init); static void __exit crc32_x86_exit(void) { diff --git a/arch/x86/lib/crc64.c b/arch/x86/lib/crc64.c index 1214ee726c16..351a09f5813e 100644 --- a/arch/x86/lib/crc64.c +++ b/arch/x86/lib/crc64.c @@ -39,7 +39,7 @@ static int __init crc64_x86_init(void) } return 0; } -arch_initcall(crc64_x86_init); +subsys_initcall(crc64_x86_init); static void __exit crc64_x86_exit(void) { -- cgit v1.2.3-59-g8ed1b From e8d72b766adcde14188e68968f3cd05f4321691d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 10 May 2025 22:21:51 -0700 Subject: MAINTAINERS: add crc_kunit.c back to CRC LIBRARY Restore lib/tests/crc_kunit.c to CRC LIBRARY following the rename in commit db6fe4d61ece ("lib: Move KUnit tests into tests/ subdirectory") which made it no longer match the file pattern lib/crc*. 
Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250511052151.420228-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index c59316109e3f..ac70e19c53cf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6253,6 +6253,7 @@ F: Documentation/staging/crc* F: arch/*/lib/crc* F: include/linux/crc* F: lib/crc* +F: lib/tests/crc_kunit.c F: scripts/gen-crc-consts.py CREATIVE SB0540 -- cgit v1.2.3-59-g8ed1b From 0769ebe279c0f07f5f01ef2dca9f425803039755 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 12 May 2025 19:21:14 -0700 Subject: w1: ds2406: use crc16() instead of crc16_byte() loop Instead of looping through each byte and calling crc16_byte(), instead just call crc16() on the whole buffer. No functional change. Acked-by: Krzysztof Kozlowski Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250513022115.39109-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- drivers/w1/slaves/w1_ds2406.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/w1/slaves/w1_ds2406.c b/drivers/w1/slaves/w1_ds2406.c index 1cae9b243ff8..76026d615111 100644 --- a/drivers/w1/slaves/w1_ds2406.c +++ b/drivers/w1/slaves/w1_ds2406.c @@ -29,8 +29,6 @@ static ssize_t w1_f12_read_state( { u8 w1_buf[6] = {W1_F12_FUNC_READ_STATUS, 7, 0, 0, 0, 0}; struct w1_slave *sl = kobj_to_w1_slave(kobj); - u16 crc = 0; - int i; ssize_t rtnval = 1; if (off != 0) @@ -47,9 +45,7 @@ static ssize_t w1_f12_read_state( w1_write_block(sl->master, w1_buf, 3); w1_read_block(sl->master, w1_buf+3, 3); - for (i = 0; i < 6; i++) - crc = crc16_byte(crc, w1_buf[i]); - if (crc == 0xb001) /* good read? */ + if (crc16(0, w1_buf, sizeof(w1_buf)) == 0xb001) /* good read? */ *buf = ((w1_buf[3]>>5)&3)|0x30; else rtnval = -EIO; @@ -66,8 +62,6 @@ static ssize_t w1_f12_write_output( { struct w1_slave *sl = kobj_to_w1_slave(kobj); u8 w1_buf[6] = {W1_F12_FUNC_WRITE_STATUS, 7, 0, 0, 0, 0}; - u16 crc = 0; - int i; ssize_t rtnval = 1; if (count != 1 || off != 0) @@ -83,9 +77,7 @@ static ssize_t w1_f12_write_output( w1_buf[3] = (((*buf)&3)<<5)|0x1F; w1_write_block(sl->master, w1_buf, 4); w1_read_block(sl->master, w1_buf+4, 2); - for (i = 0; i < 6; i++) - crc = crc16_byte(crc, w1_buf[i]); - if (crc == 0xb001) /* good read? */ + if (crc16(0, w1_buf, sizeof(w1_buf)) == 0xb001) /* good read? */ w1_write_8(sl->master, 0xFF); else rtnval = -EIO; -- cgit v1.2.3-59-g8ed1b From 3937f6db6e932c560a0f9ee2cd2a4fdcc314dadf Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 12 May 2025 19:21:15 -0700 Subject: lib/crc16: unexport crc16_table and crc16_byte() Now that neither crc16_table nor crc16_byte() is used outside lib/crc16.c, fold them into lib/crc16.c. 
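The caller-visible effect of folding crc16_byte() away is that per-byte loops collapse into a single whole-buffer crc16() call, exactly as the w1_ds2406 conversion above does. A minimal hypothetical helper (names are illustrative, not from this series):

#include <linux/crc16.h>
#include <linux/types.h>

/* Check a buffer that ends with the device-supplied CRC-16 bytes. */
static bool buf_crc16_good(const u8 *buf, size_t len)
{
	/*
	 * One call over the whole buffer replaces the old
	 * "crc = crc16_byte(crc, *p++)" loop; 0xb001 is the same
	 * "good read" residue the w1_ds2406 driver checks for.
	 */
	return crc16(0, buf, len) == 0xb001;
}
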
Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250513022115.39109-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc16.h | 9 +-------- lib/crc16.c | 9 ++++----- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/include/linux/crc16.h b/include/linux/crc16.h index 9fa74529b317..b861d969b161 100644 --- a/include/linux/crc16.h +++ b/include/linux/crc16.h @@ -15,14 +15,7 @@ #include -extern u16 const crc16_table[256]; - -extern u16 crc16(u16 crc, const u8 *buffer, size_t len); - -static inline u16 crc16_byte(u16 crc, const u8 data) -{ - return (crc >> 8) ^ crc16_table[(crc ^ data) & 0xff]; -} +u16 crc16(u16 crc, const u8 *p, size_t len); #endif /* __CRC16_H */ diff --git a/lib/crc16.c b/lib/crc16.c index 5c3a803c01e0..9c71eda9bf4b 100644 --- a/lib/crc16.c +++ b/lib/crc16.c @@ -8,7 +8,7 @@ #include /** CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1) */ -u16 const crc16_table[256] = { +static const u16 crc16_table[256] = { 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, @@ -42,20 +42,19 @@ u16 const crc16_table[256] = { 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 }; -EXPORT_SYMBOL(crc16_table); /** * crc16 - compute the CRC-16 for the data buffer * @crc: previous CRC value - * @buffer: data pointer + * @p: data pointer * @len: number of bytes in the buffer * * Returns the updated CRC value. */ -u16 crc16(u16 crc, u8 const *buffer, size_t len) +u16 crc16(u16 crc, const u8 *p, size_t len) { while (len--) - crc = crc16_byte(crc, *buffer++); + crc = (crc >> 8) ^ crc16_table[(crc & 0xff) ^ *p++]; return crc; } EXPORT_SYMBOL(crc16); -- cgit v1.2.3-59-g8ed1b From 289c99bec7eed918ab37c62cbb29a2e3f58fb1fb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 13 May 2025 22:24:09 -0700 Subject: lib/crc32: add SPDX license identifier lib/crc32.c and include/linux/crc32.h got missed by the bulk SPDX conversion because of the nonstandard explanation of the license. However, crc32.c clearly states that it's licensed under the GNU General Public License, Version 2. And the comment in crc32.h clearly indicates that it's meant to have the same license as crc32.c. Therefore, apply SPDX-License-Identifier: GPL-2.0-only to both files. Reviewed-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250514052409.194822-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc32.h | 5 +---- lib/crc32.c | 4 +--- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 69c2e8bb3782..569dc13f139f 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -1,7 +1,4 @@ -/* - * crc32.h - * See linux/lib/crc32.c for license and changes - */ +/* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _LINUX_CRC32_H #define _LINUX_CRC32_H diff --git a/lib/crc32.c b/lib/crc32.c index fddd424ff224..e690026f44f7 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin * cleaned up code to current version of sparse and added the slicing-by-8 @@ -19,9 +20,6 @@ * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0. * fs/jffs2 uses seed 0, doesn't xor with ~0. * fs/partitions/efi.c uses seed ~0, xor's with ~0. 
- * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ /* see: Documentation/staging/crc32.rst for a description of algorithms */ -- cgit v1.2.3-59-g8ed1b