mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-04-13 09:59:31 +00:00

Add x86_64 [V]PCLMULQDQ optimized implementations of crc64_be() and
crc64_nvme() by wiring them up to crc-pclmul-template.S.

crc64_be() is used by bcache and bcachefs, and crc64_nvme() is used by
blk-integrity. Both features can CRC large amounts of data, and the
developers of both features have expressed interest in having these CRCs
be optimized. So this optimization should be worthwhile. (See
https://lore.kernel.org/r/v36sousjd5ukqlkpdxslvpu7l37zbu7d7slgc2trjjqwty2bny@qgzew34feo2r
and
https://lore.kernel.org/r/20220222163144.1782447-11-kbusch@kernel.org)

Benchmark results on AMD Ryzen 9 9950X (Zen 5) using crc_kunit:

crc64_be:

    Length     Before         After
    ------     ------         -----
         1     633 MB/s       477 MB/s
        16     717 MB/s      2517 MB/s
        64     715 MB/s      7525 MB/s
       127     714 MB/s     10002 MB/s
       128     713 MB/s     13344 MB/s
       200     715 MB/s     15752 MB/s
       256     714 MB/s     22933 MB/s
       511     715 MB/s     28025 MB/s
       512     714 MB/s     49772 MB/s
      1024     715 MB/s     65261 MB/s
      3173     714 MB/s     78773 MB/s
      4096     714 MB/s     83315 MB/s
     16384     714 MB/s     89487 MB/s

crc64_nvme:

    Length     Before         After
    ------     ------         -----
         1     716 MB/s       474 MB/s
        16     717 MB/s      3303 MB/s
        64     713 MB/s      7940 MB/s
       127     715 MB/s      9867 MB/s
       128     714 MB/s     13698 MB/s
       200     715 MB/s     15995 MB/s
       256     714 MB/s     23479 MB/s
       511     714 MB/s     28013 MB/s
       512     715 MB/s     51533 MB/s
      1024     715 MB/s     66788 MB/s
      3173     715 MB/s     79182 MB/s
      4096     715 MB/s     83966 MB/s
     16384     715 MB/s     89739 MB/s

Acked-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: "Martin K. Petersen" <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20250210174540.161705-7-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
195 lines
7.4 KiB
C
195 lines
7.4 KiB
C
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * CRC constants generated by:
 *
 *	./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
 *
 * Do not edit manually.
 */

/*
|
|
* CRC folding constants generated for most-significant-bit-first CRC-16 using
|
|
* G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
|
|
*/
|
|
static const struct {
|
|
u8 bswap_mask[16];
|
|
u64 fold_across_2048_bits_consts[2];
|
|
u64 fold_across_1024_bits_consts[2];
|
|
u64 fold_across_512_bits_consts[2];
|
|
u64 fold_across_256_bits_consts[2];
|
|
u64 fold_across_128_bits_consts[2];
|
|
u8 shuf_table[48];
|
|
u64 barrett_reduction_consts[2];
|
|
} crc16_msb_0x8bb7_consts ____cacheline_aligned __maybe_unused = {
|
|
.bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
|
|
.fold_across_2048_bits_consts = {
|
|
0xdccf000000000000, /* LO64_TERMS: (x^2000 mod G) * x^48 */
|
|
0x4b0b000000000000, /* HI64_TERMS: (x^2064 mod G) * x^48 */
|
|
},
|
|
.fold_across_1024_bits_consts = {
|
|
0x9d9d000000000000, /* LO64_TERMS: (x^976 mod G) * x^48 */
|
|
0x7cf5000000000000, /* HI64_TERMS: (x^1040 mod G) * x^48 */
|
|
},
|
|
.fold_across_512_bits_consts = {
|
|
0x044c000000000000, /* LO64_TERMS: (x^464 mod G) * x^48 */
|
|
0xe658000000000000, /* HI64_TERMS: (x^528 mod G) * x^48 */
|
|
},
|
|
.fold_across_256_bits_consts = {
|
|
0x6ee3000000000000, /* LO64_TERMS: (x^208 mod G) * x^48 */
|
|
0xe7b5000000000000, /* HI64_TERMS: (x^272 mod G) * x^48 */
|
|
},
|
|
.fold_across_128_bits_consts = {
|
|
0x2d56000000000000, /* LO64_TERMS: (x^80 mod G) * x^48 */
|
|
0x06df000000000000, /* HI64_TERMS: (x^144 mod G) * x^48 */
|
|
},
|
|
.shuf_table = {
|
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
},
|
|
.barrett_reduction_consts = {
|
|
0x8bb7000000000000, /* LO64_TERMS: (G - x^16) * x^48 */
|
|
0xf65a57f81d33a48a, /* HI64_TERMS: (floor(x^79 / G) * x) - x^64 */
|
|
},
|
|
};
|
|
|
|
/*
|
|
* CRC folding constants generated for least-significant-bit-first CRC-32 using
|
|
* G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
|
|
* x^5 + x^4 + x^2 + x^1 + x^0
|
|
*/
|
|
static const struct {
|
|
u64 fold_across_2048_bits_consts[2];
|
|
u64 fold_across_1024_bits_consts[2];
|
|
u64 fold_across_512_bits_consts[2];
|
|
u64 fold_across_256_bits_consts[2];
|
|
u64 fold_across_128_bits_consts[2];
|
|
u8 shuf_table[48];
|
|
u64 barrett_reduction_consts[2];
|
|
} crc32_lsb_0xedb88320_consts ____cacheline_aligned __maybe_unused = {
|
|
.fold_across_2048_bits_consts = {
|
|
0x00000000ce3371cb, /* HI64_TERMS: (x^2079 mod G) * x^32 */
|
|
0x00000000e95c1271, /* LO64_TERMS: (x^2015 mod G) * x^32 */
|
|
},
|
|
.fold_across_1024_bits_consts = {
|
|
0x0000000033fff533, /* HI64_TERMS: (x^1055 mod G) * x^32 */
|
|
0x00000000910eeec1, /* LO64_TERMS: (x^991 mod G) * x^32 */
|
|
},
|
|
.fold_across_512_bits_consts = {
|
|
0x000000008f352d95, /* HI64_TERMS: (x^543 mod G) * x^32 */
|
|
0x000000001d9513d7, /* LO64_TERMS: (x^479 mod G) * x^32 */
|
|
},
|
|
.fold_across_256_bits_consts = {
|
|
0x00000000f1da05aa, /* HI64_TERMS: (x^287 mod G) * x^32 */
|
|
0x0000000081256527, /* LO64_TERMS: (x^223 mod G) * x^32 */
|
|
},
|
|
.fold_across_128_bits_consts = {
|
|
0x00000000ae689191, /* HI64_TERMS: (x^159 mod G) * x^32 */
|
|
0x00000000ccaa009e, /* LO64_TERMS: (x^95 mod G) * x^32 */
|
|
},
|
|
.shuf_table = {
|
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
},
|
|
.barrett_reduction_consts = {
|
|
0xb4e5b025f7011641, /* HI64_TERMS: floor(x^95 / G) */
|
|
0x00000001db710640, /* LO64_TERMS: (G - x^32) * x^31 */
|
|
},
|
|
};
|
|
|
|
/*
|
|
* CRC folding constants generated for most-significant-bit-first CRC-64 using
|
|
* G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
|
|
* x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
|
|
* x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
|
|
* x^7 + x^4 + x^1 + x^0
|
|
*/
|
|
static const struct {
|
|
u8 bswap_mask[16];
|
|
u64 fold_across_2048_bits_consts[2];
|
|
u64 fold_across_1024_bits_consts[2];
|
|
u64 fold_across_512_bits_consts[2];
|
|
u64 fold_across_256_bits_consts[2];
|
|
u64 fold_across_128_bits_consts[2];
|
|
u8 shuf_table[48];
|
|
u64 barrett_reduction_consts[2];
|
|
} crc64_msb_0x42f0e1eba9ea3693_consts ____cacheline_aligned __maybe_unused = {
|
|
.bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
|
|
.fold_across_2048_bits_consts = {
|
|
0x7f52691a60ddc70d, /* LO64_TERMS: (x^2048 mod G) * x^0 */
|
|
0x7036b0389f6a0c82, /* HI64_TERMS: (x^2112 mod G) * x^0 */
|
|
},
|
|
.fold_across_1024_bits_consts = {
|
|
0x05cf79dea9ac37d6, /* LO64_TERMS: (x^1024 mod G) * x^0 */
|
|
0x001067e571d7d5c2, /* HI64_TERMS: (x^1088 mod G) * x^0 */
|
|
},
|
|
.fold_across_512_bits_consts = {
|
|
0x5f6843ca540df020, /* LO64_TERMS: (x^512 mod G) * x^0 */
|
|
0xddf4b6981205b83f, /* HI64_TERMS: (x^576 mod G) * x^0 */
|
|
},
|
|
.fold_across_256_bits_consts = {
|
|
0x571bee0a227ef92b, /* LO64_TERMS: (x^256 mod G) * x^0 */
|
|
0x44bef2a201b5200c, /* HI64_TERMS: (x^320 mod G) * x^0 */
|
|
},
|
|
.fold_across_128_bits_consts = {
|
|
0x05f5c3c7eb52fab6, /* LO64_TERMS: (x^128 mod G) * x^0 */
|
|
0x4eb938a7d257740e, /* HI64_TERMS: (x^192 mod G) * x^0 */
|
|
},
|
|
.shuf_table = {
|
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
},
|
|
.barrett_reduction_consts = {
|
|
0x42f0e1eba9ea3693, /* LO64_TERMS: (G - x^64) * x^0 */
|
|
0x578d29d06cc4f872, /* HI64_TERMS: (floor(x^127 / G) * x) - x^64 */
|
|
},
|
|
};
|
|
|
|
/*
|
|
* CRC folding constants generated for least-significant-bit-first CRC-64 using
|
|
* G(x) = x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 +
|
|
* x^47 + x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 +
|
|
* x^26 + x^23 + x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 +
|
|
* x^4 + x^3 + x^0
|
|
*/
|
|
static const struct {
|
|
u64 fold_across_2048_bits_consts[2];
|
|
u64 fold_across_1024_bits_consts[2];
|
|
u64 fold_across_512_bits_consts[2];
|
|
u64 fold_across_256_bits_consts[2];
|
|
u64 fold_across_128_bits_consts[2];
|
|
u8 shuf_table[48];
|
|
u64 barrett_reduction_consts[2];
|
|
} crc64_lsb_0x9a6c9329ac4bc9b5_consts ____cacheline_aligned __maybe_unused = {
|
|
.fold_across_2048_bits_consts = {
|
|
0x37ccd3e14069cabc, /* HI64_TERMS: (x^2111 mod G) * x^0 */
|
|
0xa043808c0f782663, /* LO64_TERMS: (x^2047 mod G) * x^0 */
|
|
},
|
|
.fold_across_1024_bits_consts = {
|
|
0xa1ca681e733f9c40, /* HI64_TERMS: (x^1087 mod G) * x^0 */
|
|
0x5f852fb61e8d92dc, /* LO64_TERMS: (x^1023 mod G) * x^0 */
|
|
},
|
|
.fold_across_512_bits_consts = {
|
|
0x0c32cdb31e18a84a, /* HI64_TERMS: (x^575 mod G) * x^0 */
|
|
0x62242240ace5045a, /* LO64_TERMS: (x^511 mod G) * x^0 */
|
|
},
|
|
.fold_across_256_bits_consts = {
|
|
0xb0bc2e589204f500, /* HI64_TERMS: (x^319 mod G) * x^0 */
|
|
0xe1e0bb9d45d7a44c, /* LO64_TERMS: (x^255 mod G) * x^0 */
|
|
},
|
|
.fold_across_128_bits_consts = {
|
|
0xeadc41fd2ba3d420, /* HI64_TERMS: (x^191 mod G) * x^0 */
|
|
0x21e9761e252621ac, /* LO64_TERMS: (x^127 mod G) * x^0 */
|
|
},
|
|
.shuf_table = {
|
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
},
|
|
.barrett_reduction_consts = {
|
|
0x27ecfa329aef9f77, /* HI64_TERMS: floor(x^127 / G) */
|
|
0x34d926535897936a, /* LO64_TERMS: (G - x^64 - x^0) / x */
|
|
},
|
|
};
|