/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * CRC constants generated by:
 *
 *	./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc32_lsb_0x82f63b78,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
 *
 * Do not edit manually.
 */
/*
|
|
|
|
* CRC folding constants generated for most-significant-bit-first CRC-16 using
|
|
|
|
* G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
|
|
|
|
*/
|
|
|
|
static const struct {
|
|
|
|
u8 bswap_mask[16];
|
|
|
|
u64 fold_across_2048_bits_consts[2];
|
|
|
|
u64 fold_across_1024_bits_consts[2];
|
|
|
|
u64 fold_across_512_bits_consts[2];
|
|
|
|
u64 fold_across_256_bits_consts[2];
|
|
|
|
u64 fold_across_128_bits_consts[2];
|
|
|
|
u8 shuf_table[48];
|
|
|
|
u64 barrett_reduction_consts[2];
|
|
|
|
} crc16_msb_0x8bb7_consts ____cacheline_aligned __maybe_unused = {
|
|
|
|
.bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
|
|
|
|
.fold_across_2048_bits_consts = {
|
|
|
|
0xdccf000000000000, /* LO64_TERMS: (x^2000 mod G) * x^48 */
|
|
|
|
0x4b0b000000000000, /* HI64_TERMS: (x^2064 mod G) * x^48 */
|
|
|
|
},
|
|
|
|
.fold_across_1024_bits_consts = {
|
|
|
|
0x9d9d000000000000, /* LO64_TERMS: (x^976 mod G) * x^48 */
|
|
|
|
0x7cf5000000000000, /* HI64_TERMS: (x^1040 mod G) * x^48 */
|
|
|
|
},
|
|
|
|
.fold_across_512_bits_consts = {
|
|
|
|
0x044c000000000000, /* LO64_TERMS: (x^464 mod G) * x^48 */
|
|
|
|
0xe658000000000000, /* HI64_TERMS: (x^528 mod G) * x^48 */
|
|
|
|
},
|
|
|
|
.fold_across_256_bits_consts = {
|
|
|
|
0x6ee3000000000000, /* LO64_TERMS: (x^208 mod G) * x^48 */
|
|
|
|
0xe7b5000000000000, /* HI64_TERMS: (x^272 mod G) * x^48 */
|
|
|
|
},
|
|
|
|
.fold_across_128_bits_consts = {
|
|
|
|
0x2d56000000000000, /* LO64_TERMS: (x^80 mod G) * x^48 */
|
|
|
|
0x06df000000000000, /* HI64_TERMS: (x^144 mod G) * x^48 */
|
|
|
|
},
|
|
|
|
.shuf_table = {
|
|
|
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
|
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
|
|
},
|
|
|
|
.barrett_reduction_consts = {
|
|
|
|
0x8bb7000000000000, /* LO64_TERMS: (G - x^16) * x^48 */
|
|
|
|
0xf65a57f81d33a48a, /* HI64_TERMS: (floor(x^79 / G) * x) - x^64 */
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2025-02-10 09:26:44 -08:00
|
|
|
/*
|
|
|
|
* CRC folding constants generated for least-significant-bit-first CRC-32 using
|
|
|
|
* G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
|
|
|
|
* x^5 + x^4 + x^2 + x^1 + x^0
|
|
|
|
*/
|
|
|
|
static const struct {
|
|
|
|
u64 fold_across_2048_bits_consts[2];
|
|
|
|
u64 fold_across_1024_bits_consts[2];
|
|
|
|
u64 fold_across_512_bits_consts[2];
|
|
|
|
u64 fold_across_256_bits_consts[2];
|
|
|
|
u64 fold_across_128_bits_consts[2];
|
|
|
|
u8 shuf_table[48];
|
|
|
|
u64 barrett_reduction_consts[2];
|
|
|
|
} crc32_lsb_0xedb88320_consts ____cacheline_aligned __maybe_unused = {
|
|
|
|
.fold_across_2048_bits_consts = {
|
|
|
|
0x00000000ce3371cb, /* HI64_TERMS: (x^2079 mod G) * x^32 */
|
|
|
|
0x00000000e95c1271, /* LO64_TERMS: (x^2015 mod G) * x^32 */
|
|
|
|
},
|
|
|
|
.fold_across_1024_bits_consts = {
|
|
|
|
0x0000000033fff533, /* HI64_TERMS: (x^1055 mod G) * x^32 */
|
|
|
|
0x00000000910eeec1, /* LO64_TERMS: (x^991 mod G) * x^32 */
|
|
|
|
},
|
|
|
|
.fold_across_512_bits_consts = {
|
|
|
|
0x000000008f352d95, /* HI64_TERMS: (x^543 mod G) * x^32 */
|
|
|
|
0x000000001d9513d7, /* LO64_TERMS: (x^479 mod G) * x^32 */
|
|
|
|
},
|
|
|
|
.fold_across_256_bits_consts = {
|
|
|
|
0x00000000f1da05aa, /* HI64_TERMS: (x^287 mod G) * x^32 */
|
|
|
|
0x0000000081256527, /* LO64_TERMS: (x^223 mod G) * x^32 */
|
|
|
|
},
|
|
|
|
.fold_across_128_bits_consts = {
|
|
|
|
0x00000000ae689191, /* HI64_TERMS: (x^159 mod G) * x^32 */
|
|
|
|
0x00000000ccaa009e, /* LO64_TERMS: (x^95 mod G) * x^32 */
|
|
|
|
},
|
|
|
|
.shuf_table = {
|
|
|
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
|
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
|
|
},
|
|
|
|
.barrett_reduction_consts = {
|
|
|
|
0xb4e5b025f7011641, /* HI64_TERMS: floor(x^95 / G) */
|
|
|
|
0x00000001db710640, /* LO64_TERMS: (G - x^32) * x^31 */
|
|
|
|
},
|
|
|
|
};
|
2025-02-10 09:26:45 -08:00
|
|
|
|
lib/crc: x86/crc32c: Enable VPCLMULQDQ optimization where beneficial
Improve crc32c() performance on lengths >= 512 bytes by using
crc32_lsb_vpclmul_avx512() instead of crc32c_x86_3way(), when the CPU
supports VPCLMULQDQ and has a "good" implementation of AVX-512. For now
that means AMD Zen 4 and later, and Intel Sapphire Rapids and later.
Pass crc32_lsb_vpclmul_avx512() the table of constants needed to make it
use the CRC-32C polynomial.
Rationale: VPCLMULQDQ performance has improved on newer CPUs, making
crc32_lsb_vpclmul_avx512() faster than crc32c_x86_3way(), even though
crc32_lsb_vpclmul_avx512() is designed for generic 32-bit CRCs and does
not utilize x86_64's dedicated CRC-32C instructions.
Performance results for len=4096 using crc_kunit:
CPU Before (MB/s) After (MB/s)
====================== ============= ============
AMD Zen 4 (Genoa) 19868 28618
AMD Zen 5 (Ryzen AI 9 365) 24080 46940
AMD Zen 5 (Turin) 29566 58468
Intel Sapphire Rapids 22340 73794
Intel Emerald Rapids 24696 78666
Performance results for len=512 using crc_kunit:
CPU Before (MB/s) After (MB/s)
====================== ============= ============
AMD Zen 4 (Genoa) 7251 7758
AMD Zen 5 (Ryzen AI 9 365) 17481 19135
AMD Zen 5 (Turin) 21332 25424
Intel Sapphire Rapids 18886 29312
Intel Emerald Rapids 19675 29045
That being said, in the above benchmarks the ZMM registers are "warm",
so they don't quite tell the whole story. While significantly improved
from older Intel CPUs, Intel still has ~2000 ns of ZMM warm-up time
where 512-bit instructions execute 4 times more slowly than they
normally do. In contrast, AMD does better and has virtually zero ZMM
warm-up time (at most ~60 ns). Thus, while this change is always
beneficial on AMD, strictly speaking there are cases in which it is not
beneficial on Intel, e.g. a small number of 512-byte messages with
"cold" ZMM registers. But typically, it is beneficial even on Intel.
Note that on AMD Zen 3--5, crc32c() performance could be further
improved with implementations that interleave crc32q and VPCLMULQDQ
instructions. Unfortunately, it appears that a different such
implementation would be optimal on *each* of these microarchitectures.
Such improvements are left for future work. This commit just improves
the way that we choose the implementations we already have.
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250719224938.126512-3-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
2025-07-19 15:49:38 -07:00
|
|
|
/*
|
|
|
|
* CRC folding constants generated for least-significant-bit-first CRC-32 using
|
|
|
|
* G(x) = x^32 + x^28 + x^27 + x^26 + x^25 + x^23 + x^22 + x^20 + x^19 + x^18 +
|
|
|
|
* x^14 + x^13 + x^11 + x^10 + x^9 + x^8 + x^6 + x^0
|
|
|
|
*/
|
|
|
|
static const struct {
|
|
|
|
u64 fold_across_2048_bits_consts[2];
|
|
|
|
u64 fold_across_1024_bits_consts[2];
|
|
|
|
u64 fold_across_512_bits_consts[2];
|
|
|
|
u64 fold_across_256_bits_consts[2];
|
|
|
|
u64 fold_across_128_bits_consts[2];
|
|
|
|
u8 shuf_table[48];
|
|
|
|
u64 barrett_reduction_consts[2];
|
|
|
|
} crc32_lsb_0x82f63b78_consts ____cacheline_aligned __maybe_unused = {
|
|
|
|
.fold_across_2048_bits_consts = {
|
|
|
|
0x00000000dcb17aa4, /* HI64_TERMS: (x^2079 mod G) * x^32 */
|
|
|
|
0x00000000b9e02b86, /* LO64_TERMS: (x^2015 mod G) * x^32 */
|
|
|
|
},
|
|
|
|
.fold_across_1024_bits_consts = {
|
|
|
|
0x000000006992cea2, /* HI64_TERMS: (x^1055 mod G) * x^32 */
|
|
|
|
0x000000000d3b6092, /* LO64_TERMS: (x^991 mod G) * x^32 */
|
|
|
|
},
|
|
|
|
.fold_across_512_bits_consts = {
|
|
|
|
0x00000000740eef02, /* HI64_TERMS: (x^543 mod G) * x^32 */
|
|
|
|
0x000000009e4addf8, /* LO64_TERMS: (x^479 mod G) * x^32 */
|
|
|
|
},
|
|
|
|
.fold_across_256_bits_consts = {
|
|
|
|
0x000000003da6d0cb, /* HI64_TERMS: (x^287 mod G) * x^32 */
|
|
|
|
0x00000000ba4fc28e, /* LO64_TERMS: (x^223 mod G) * x^32 */
|
|
|
|
},
|
|
|
|
.fold_across_128_bits_consts = {
|
|
|
|
0x00000000f20c0dfe, /* HI64_TERMS: (x^159 mod G) * x^32 */
|
|
|
|
0x00000000493c7d27, /* LO64_TERMS: (x^95 mod G) * x^32 */
|
|
|
|
},
|
|
|
|
.shuf_table = {
|
|
|
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
|
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
|
|
},
|
|
|
|
.barrett_reduction_consts = {
|
|
|
|
0x4869ec38dea713f1, /* HI64_TERMS: floor(x^95 / G) */
|
|
|
|
0x0000000105ec76f0, /* LO64_TERMS: (G - x^32) * x^31 */
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2025-02-10 09:26:45 -08:00
|
|
|
/*
|
|
|
|
* CRC folding constants generated for most-significant-bit-first CRC-64 using
|
|
|
|
* G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
|
|
|
|
* x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
|
|
|
|
* x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
|
|
|
|
* x^7 + x^4 + x^1 + x^0
|
|
|
|
*/
|
|
|
|
static const struct {
|
|
|
|
u8 bswap_mask[16];
|
|
|
|
u64 fold_across_2048_bits_consts[2];
|
|
|
|
u64 fold_across_1024_bits_consts[2];
|
|
|
|
u64 fold_across_512_bits_consts[2];
|
|
|
|
u64 fold_across_256_bits_consts[2];
|
|
|
|
u64 fold_across_128_bits_consts[2];
|
|
|
|
u8 shuf_table[48];
|
|
|
|
u64 barrett_reduction_consts[2];
|
|
|
|
} crc64_msb_0x42f0e1eba9ea3693_consts ____cacheline_aligned __maybe_unused = {
|
|
|
|
.bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
|
|
|
|
.fold_across_2048_bits_consts = {
|
|
|
|
0x7f52691a60ddc70d, /* LO64_TERMS: (x^2048 mod G) * x^0 */
|
|
|
|
0x7036b0389f6a0c82, /* HI64_TERMS: (x^2112 mod G) * x^0 */
|
|
|
|
},
|
|
|
|
.fold_across_1024_bits_consts = {
|
|
|
|
0x05cf79dea9ac37d6, /* LO64_TERMS: (x^1024 mod G) * x^0 */
|
|
|
|
0x001067e571d7d5c2, /* HI64_TERMS: (x^1088 mod G) * x^0 */
|
|
|
|
},
|
|
|
|
.fold_across_512_bits_consts = {
|
|
|
|
0x5f6843ca540df020, /* LO64_TERMS: (x^512 mod G) * x^0 */
|
|
|
|
0xddf4b6981205b83f, /* HI64_TERMS: (x^576 mod G) * x^0 */
|
|
|
|
},
|
|
|
|
.fold_across_256_bits_consts = {
|
|
|
|
0x571bee0a227ef92b, /* LO64_TERMS: (x^256 mod G) * x^0 */
|
|
|
|
0x44bef2a201b5200c, /* HI64_TERMS: (x^320 mod G) * x^0 */
|
|
|
|
},
|
|
|
|
.fold_across_128_bits_consts = {
|
|
|
|
0x05f5c3c7eb52fab6, /* LO64_TERMS: (x^128 mod G) * x^0 */
|
|
|
|
0x4eb938a7d257740e, /* HI64_TERMS: (x^192 mod G) * x^0 */
|
|
|
|
},
|
|
|
|
.shuf_table = {
|
|
|
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
|
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
|
|
},
|
|
|
|
.barrett_reduction_consts = {
|
|
|
|
0x42f0e1eba9ea3693, /* LO64_TERMS: (G - x^64) * x^0 */
|
|
|
|
0x578d29d06cc4f872, /* HI64_TERMS: (floor(x^127 / G) * x) - x^64 */
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* CRC folding constants generated for least-significant-bit-first CRC-64 using
|
|
|
|
* G(x) = x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 +
|
|
|
|
* x^47 + x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 +
|
|
|
|
* x^26 + x^23 + x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 +
|
|
|
|
* x^4 + x^3 + x^0
|
|
|
|
*/
|
|
|
|
static const struct {
|
|
|
|
u64 fold_across_2048_bits_consts[2];
|
|
|
|
u64 fold_across_1024_bits_consts[2];
|
|
|
|
u64 fold_across_512_bits_consts[2];
|
|
|
|
u64 fold_across_256_bits_consts[2];
|
|
|
|
u64 fold_across_128_bits_consts[2];
|
|
|
|
u8 shuf_table[48];
|
|
|
|
u64 barrett_reduction_consts[2];
|
|
|
|
} crc64_lsb_0x9a6c9329ac4bc9b5_consts ____cacheline_aligned __maybe_unused = {
|
|
|
|
.fold_across_2048_bits_consts = {
|
|
|
|
0x37ccd3e14069cabc, /* HI64_TERMS: (x^2111 mod G) * x^0 */
|
|
|
|
0xa043808c0f782663, /* LO64_TERMS: (x^2047 mod G) * x^0 */
|
|
|
|
},
|
|
|
|
.fold_across_1024_bits_consts = {
|
|
|
|
0xa1ca681e733f9c40, /* HI64_TERMS: (x^1087 mod G) * x^0 */
|
|
|
|
0x5f852fb61e8d92dc, /* LO64_TERMS: (x^1023 mod G) * x^0 */
|
|
|
|
},
|
|
|
|
.fold_across_512_bits_consts = {
|
|
|
|
0x0c32cdb31e18a84a, /* HI64_TERMS: (x^575 mod G) * x^0 */
|
|
|
|
0x62242240ace5045a, /* LO64_TERMS: (x^511 mod G) * x^0 */
|
|
|
|
},
|
|
|
|
.fold_across_256_bits_consts = {
|
|
|
|
0xb0bc2e589204f500, /* HI64_TERMS: (x^319 mod G) * x^0 */
|
|
|
|
0xe1e0bb9d45d7a44c, /* LO64_TERMS: (x^255 mod G) * x^0 */
|
|
|
|
},
|
|
|
|
.fold_across_128_bits_consts = {
|
|
|
|
0xeadc41fd2ba3d420, /* HI64_TERMS: (x^191 mod G) * x^0 */
|
|
|
|
0x21e9761e252621ac, /* LO64_TERMS: (x^127 mod G) * x^0 */
|
|
|
|
},
|
|
|
|
.shuf_table = {
|
|
|
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
|
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
|
|
},
|
|
|
|
.barrett_reduction_consts = {
|
|
|
|
0x27ecfa329aef9f77, /* HI64_TERMS: floor(x^127 / G) */
|
|
|
|
0x34d926535897936a, /* LO64_TERMS: (G - x^64 - x^0) / x */
|
|
|
|
},
|
|
|
|
};
|