
Improve crc32c() performance on lengths >= 512 bytes by using
crc32_lsb_vpclmul_avx512() instead of crc32c_x86_3way() when the CPU
supports VPCLMULQDQ and has a "good" implementation of AVX-512. For now
that means AMD Zen 4 and later, and Intel Sapphire Rapids and later.
Pass crc32_lsb_vpclmul_avx512() the table of constants needed to make
it use the CRC-32C polynomial.

Rationale: VPCLMULQDQ performance has improved on newer CPUs, making
crc32_lsb_vpclmul_avx512() faster than crc32c_x86_3way(), even though
crc32_lsb_vpclmul_avx512() is designed for generic 32-bit CRCs and does
not utilize x86_64's dedicated CRC-32C instructions.

Performance results for len=4096 using crc_kunit:

    CPU                          Before (MB/s)   After (MB/s)
    ==========================   =============   ============
    AMD Zen 4 (Genoa)                    19868          28618
    AMD Zen 5 (Ryzen AI 9 365)           24080          46940
    AMD Zen 5 (Turin)                    29566          58468
    Intel Sapphire Rapids                22340          73794
    Intel Emerald Rapids                 24696          78666

Performance results for len=512 using crc_kunit:

    CPU                          Before (MB/s)   After (MB/s)
    ==========================   =============   ============
    AMD Zen 4 (Genoa)                     7251           7758
    AMD Zen 5 (Ryzen AI 9 365)           17481          19135
    AMD Zen 5 (Turin)                    21332          25424
    Intel Sapphire Rapids                18886          29312
    Intel Emerald Rapids                 19675          29045

That being said, in the above benchmarks the ZMM registers are "warm",
so they don't quite tell the whole story. While much improved compared
to older Intel CPUs, Intel still has ~2000 ns of ZMM warm-up time during
which 512-bit instructions execute 4 times more slowly than they
normally do. In contrast, AMD does better and has virtually zero ZMM
warm-up time (at most ~60 ns). Thus, while this change is always
beneficial on AMD, strictly speaking there are cases in which it is not
beneficial on Intel, e.g. a small number of 512-byte messages with
"cold" ZMM registers. But typically, it is beneficial even on Intel.

Note that on AMD Zen 3--5, crc32c() performance could be further
improved with implementations that interleave crc32q and VPCLMULQDQ
instructions. Unfortunately, it appears that a different such
implementation would be optimal on *each* of these microarchitectures.
Such improvements are left for future work. This commit just improves
how we choose among the implementations we already have.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250719224938.126512-3-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
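In condensed form (feature and length checks abridged), the selection step
this commit adds to crc32c_arch() in the file below reads:

        kernel_fpu_begin();
        if (static_branch_likely(&have_vpclmul_avx512))
                /* generic VPCLMULQDQ folding, fed the CRC-32C constants */
                crc = crc32_lsb_vpclmul_avx512(crc, p, len,
                                crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
        else
                crc = crc32c_x86_3way(crc, p, len);
        kernel_fpu_end();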
// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86-optimized CRC32 functions
 *
 * Copyright (C) 2008 Intel Corporation
 * Copyright 2012 Xyratex Technology Limited
 * Copyright 2024 Google LLC
 */

#include "crc-pclmul-template.h"

/* CPU features, detected once at initialization time by crc32_mod_init_arch() */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);

DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);

/*
 * CRC-32 (LSB-first, polynomial 0xedb88320): use the [V]PCLMULQDQ-based code
 * when worthwhile, otherwise fall back to the generic table-based version.
 */
static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
{
        CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
                   have_pclmulqdq);
        return crc32_le_base(crc, p, len);
}

/* crc32 instruction for one machine word: crc32q (8 bytes) on x86_64, crc32l (4 bytes) on 32-bit */
#ifdef CONFIG_X86_64
#define CRC32_INST "crc32q %1, %q0"
#else
#define CRC32_INST "crc32l %1, %0"
#endif

/*
 * Use carryless multiply version of crc32c when buffer size is >= 512 to
 * account for FPU state save/restore overhead.
 */
#define CRC32C_PCLMUL_BREAKEVEN	512

asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);

static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
{
        size_t num_longs;

        if (!static_branch_likely(&have_crc32))
                return crc32c_base(crc, p, len);

        if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
            static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
                /*
                 * Long length, the vector registers are usable, and the CPU is
                 * 64-bit and supports both CRC32 and PCLMULQDQ instructions.
                 * It is worthwhile to divide the data into multiple streams,
                 * CRC them independently, and combine them using PCLMULQDQ.
                 * crc32c_x86_3way() does this using 3 streams, which is the
                 * most that x86_64 CPUs have traditionally been capable of.
                 *
                 * However, due to improved VPCLMULQDQ performance on newer
                 * CPUs, use crc32_lsb_vpclmul_avx512() instead of
                 * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
                 * "good" implementation of AVX-512.
                 *
                 * Future work: the optimal strategy on Zen 3--5 is actually to
                 * use both crc32q and VPCLMULQDQ in parallel. Unfortunately,
                 * different numbers of streams and vector lengths are optimal
                 * on each CPU microarchitecture, making it challenging to take
                 * advantage of this. (Zen 5 even supports 7 parallel crc32q, a
                 * major upgrade.) For now, just choose between
                 * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512(). The latter
                 * is needed anyway for crc32_le(), so we just reuse it here.
                 */
                kernel_fpu_begin();
                if (static_branch_likely(&have_vpclmul_avx512))
                        crc = crc32_lsb_vpclmul_avx512(crc, p, len,
                                        crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
                else
                        crc = crc32c_x86_3way(crc, p, len);
                kernel_fpu_end();
                return crc;
        }

        /*
         * Short length, XMM registers unusable, or the CPU is 32-bit; but the
         * CPU supports CRC32 instructions. Just issue a single stream of CRC32
         * instructions inline. While this doesn't use the CPU's CRC32
         * throughput very well, it avoids the need to combine streams. Stream
         * combination would be inefficient here.
         */

        for (num_longs = len / sizeof(unsigned long);
             num_longs != 0; num_longs--, p += sizeof(unsigned long))
                asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));

        if (sizeof(unsigned long) > 4 && (len & 4)) {
                asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
                p += 4;
        }
        if (len & 2) {
                asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
                p += 2;
        }
        if (len & 1)
                asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));

        return crc;
}

#define crc32_be_arch crc32_be_base /* not implemented on this arch */

#define crc32_mod_init_arch crc32_mod_init_arch
/*
 * Runs once at init time: detect CPU features and select the fastest
 * available crc32_lsb ([V]PCLMULQDQ-based) implementation.
 */
static inline void crc32_mod_init_arch(void)
{
        if (boot_cpu_has(X86_FEATURE_XMM4_2))
                static_branch_enable(&have_crc32);
        if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
                static_branch_enable(&have_pclmulqdq);
                if (have_vpclmul()) {
                        if (have_avx512()) {
                                static_call_update(crc32_lsb_pclmul,
                                                   crc32_lsb_vpclmul_avx512);
                                static_branch_enable(&have_vpclmul_avx512);
                        } else {
                                static_call_update(crc32_lsb_pclmul,
                                                   crc32_lsb_vpclmul_avx2);
                        }
                }
        }
}

static inline u32 crc32_optimizations_arch(void)
{
        u32 optimizations = 0;

        if (static_key_enabled(&have_crc32))
                optimizations |= CRC32C_OPTIMIZATION;
        if (static_key_enabled(&have_pclmulqdq))
                optimizations |= CRC32_LE_OPTIMIZATION;
        return optimizations;
}
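For context, a minimal caller-side sketch (not part of the file above) of how
kernel code reaches these routines through the generic CRC API, assuming the
usual helpers exported by <linux/crc32.h> (crc32_le() and crc32c(), both
taking (crc, buf, len)); the all-ones seed and final inversion shown are the
conventional CRC-32/CRC-32C usage and are the caller's choice:

#include <linux/crc32.h>
#include <linux/types.h>

/* Conventional CRC-32C (Castagnoli): seed with all-ones, invert the result. */
static u32 example_crc32c(const u8 *buf, size_t len)
{
        return ~crc32c(~0U, buf, len);
}

/* Conventional IEEE CRC-32 (polynomial 0xedb88320), same seeding convention. */
static u32 example_crc32(const u8 *buf, size_t len)
{
        return ~crc32_le(~0U, buf, len);
}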