x86/crc: add "template" for [V]PCLMULQDQ based CRC functions
The Linux kernel implements many variants of CRC, such as crc16,
crc_t10dif, crc32_le, crc32c, crc32_be, crc64_nvme, and crc64_be. On
x86, except for crc32c which has special scalar instructions, the
fastest way to compute any of these CRCs on any message of length
roughly >= 16 bytes is to use the SIMD carryless multiplication
instructions PCLMULQDQ or VPCLMULQDQ. Depending on the available CPU
features this can mean PCLMULQDQ+SSE4.1, VPCLMULQDQ+AVX2,
VPCLMULQDQ+AVX10/256, or VPCLMULQDQ+AVX10/512 (or the AVX512 equivalents
to AVX10/*). This results in a total of 20+ CRC implementations being
potentially needed to properly optimize all CRCs that someone cares
about for x86. Besides crc32c, currently only crc32_le and crc_t10dif
are actually optimized for x86, and they only use PCLMULQDQ, which means
they can be 2-4x slower than what is possible with VPCLMULQDQ.
Fortunately, at a high level the code that is needed for any
[V]PCLMULQDQ based CRC implementation is mostly the same. Therefore,
this patch introduces an assembly macro that expands into the body of a
[V]PCLMULQDQ based CRC function for a given number of bits (8, 16, 32,
or 64), bit order (lsb-first or msb-first), vector length, and AVX level.
The function expects to be passed a constants table, specific to the
polynomial desired, that was generated by the script previously added.
When two CRC variants share the same number of bits and bit order, the
same functions can be reused, with only the constants table differing.
A new C header is also added to make it easy to integrate the new
assembly code using a static call.
The result is that it becomes straightforward to wire up an optimized
implementation of any CRC-8, CRC-16, CRC-32, or CRC-64 for x86. Later
patches will wire up specific CRC variants.
Although this new template allows easily generating many functions, care
was taken to still keep the binary size fairly low. Each generated
function is only 550 to 850 bytes depending on the CRC variant and
target CPU features. And only one function per CRC variant is actually
used at runtime (since all functions support all lengths >= 16 bytes).
Note that a similar approach should also work for other architectures
that have carryless multiplication instructions, such as arm64.
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20250210174540.161705-4-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are
 * instantiated by crc-pclmul-template.S
 *
 * Copyright 2025 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */
#ifndef _CRC_PCLMUL_TEMPLATE_H
#define _CRC_PCLMUL_TEMPLATE_H

#include <asm/cpufeatures.h>
#include <asm/simd.h>
#include <crypto/internal/simd.h>
#include <linux/static_call.h>
#include "crc-pclmul-consts.h"

#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t)			\
crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len,		\
			  const void *consts_ptr);			\
crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len,	\
			    const void *consts_ptr);			\
crc_t prefix##_vpclmul_avx512(crc_t crc, const u8 *p, size_t len,	\
			      const void *consts_ptr);			\
DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse)

static inline bool have_vpclmul(void)
{
	return boot_cpu_has(X86_FEATURE_VPCLMULQDQ) &&
	       boot_cpu_has(X86_FEATURE_AVX2) &&
	       cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL);
}

static inline bool have_avx512(void)
{
	return boot_cpu_has(X86_FEATURE_AVX512BW) &&
	       boot_cpu_has(X86_FEATURE_AVX512VL) &&
	       !boot_cpu_has(X86_FEATURE_PREFER_YMM) &&
	       cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL);
}

/*
 * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16
 * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD.
 *
 * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions.
 * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(),
 * varying by CPU and factors such as which parts of the "FPU" state userspace
 * has touched, which could result in a larger cutoff being better.  Indeed, a
 * larger cutoff is usually better for a *single* message.  However, the
 * overhead of the FPU section gets amortized if multiple FPU sections get
 * executed before returning to userspace, since the XSAVE and XRSTOR occur only
 * once.  Considering that and the fact that the [V]PCLMULQDQ code is lighter on
 * the dcache than the table-based code is, a 16-byte cutoff seems to work well.
 */
#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq)	\
do {									\
	if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) &&	\
	    crypto_simd_usable()) {					\
		const void *consts_ptr;					\
									\
		consts_ptr = (consts).fold_across_128_bits_consts;	\
		kernel_fpu_begin();					\
		crc = static_call(prefix##_pclmul)((crc), (p), (len),	\
						   consts_ptr);		\
		kernel_fpu_end();					\
		return crc;						\
	}								\
} while (0)

#endif /* _CRC_PCLMUL_TEMPLATE_H */
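
As an illustration of how the pieces above fit together, the following is a
minimal sketch of the per-variant glue code a later patch might add.  The
prefix crc32_lsb, the constants table name crc32_lsb_0xedb88320_consts, the
static key name, and the crc32_le_base() fallback are assumptions for the sake
of the example, not part of this patch:

/* Hypothetical glue for crc32_le; all names below are illustrative. */
static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);

DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);

u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
{
	/*
	 * Take the [V]PCLMULQDQ path (which returns early) when len >= 16,
	 * the CPU has PCLMULQDQ, and SIMD is usable in this context.
	 */
	CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
		   have_pclmulqdq);
	/* Otherwise fall back to the generic table-based implementation. */
	return crc32_le_base(crc, p, len);
}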
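Selecting the best implementation at boot could then look roughly like the
sketch below.  The init function name and the crc32_lsb symbols are again
assumptions; the point is the flow implied by the helpers above: enable the
static key when PCLMULQDQ is present, then upgrade the static call to the AVX2
or AVX-512 function when have_vpclmul() and have_avx512() report support:

static int __init crc32_x86_init(void)
{
	if (!boot_cpu_has(X86_FEATURE_PCLMULQDQ))
		return 0;
	/* PCLMULQDQ+SSE baseline is usable; allow CRC_PCLMUL() to run. */
	static_branch_enable(&have_pclmulqdq);
	if (have_vpclmul()) {
		if (have_avx512())
			static_call_update(crc32_lsb_pclmul,
					   crc32_lsb_vpclmul_avx512);
		else
			static_call_update(crc32_lsb_pclmul,
					   crc32_lsb_vpclmul_avx2);
	}
	return 0;
}
arch_initcall(crc32_x86_init);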