Mirror of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git (synced 2025-08-04 16:25:34 +00:00)

In addition to keeping the kernel's copy of zstd up to date, this update
was requested by Intel to expose upstream's APIs that allow QAT to
accelerate the LZ match finding stage of Zstd.

This patch is imported from the upstream tag v1.5.7-kernel [0], which is
signed with upstream's signing key EF8FE99528B52FFD [1]. It was imported
from upstream using this command:

  export ZSTD=/path/to/repo/zstd/
  export LINUX=/path/to/repo/linux/
  cd "$ZSTD/contrib/linux-kernel"
  git checkout v1.5.7-kernel
  make import LINUX="$LINUX"

This patch has been tested on x86-64, and has been boot tested with a
zstd compressed kernel & initramfs on i386 and aarch64. I benchmarked the
patch on x86-64 with gcc-14.2.1 on an Intel i9-9900K by measuring the
performance of compressed filesystem reads and writes.

  Component, Level, Size delta, C. time delta, D. time delta
  Btrfs    ,     1,     +0.00%,         -6.1%,         +1.4%
  Btrfs    ,     3,     +0.00%,         -9.8%,         +3.0%
  Btrfs    ,     5,     +0.00%,         +1.7%,         +1.4%
  Btrfs    ,     7,     +0.00%,         -1.9%,         +2.7%
  Btrfs    ,     9,     +0.00%,         -3.4%,         +3.7%
  Btrfs    ,    15,     +0.00%,         -0.3%,         +3.6%
  SquashFS ,     1,     +0.00%,           N/A,         +1.9%

The major changes that impact the kernel use cases for each version are:

v1.5.7: https://github.com/facebook/zstd/releases/tag/v1.5.7
* Add zstd_compress_sequences_and_literals() for use by Intel's QAT driver
  to implement Zstd compression acceleration in the kernel.
* Fix an underflow bug in 32-bit builds that can cause data corruption when
  processing more than 4GB of data with a single `ZSTD_CCtx` object, when an
  input crosses the 4GB boundary. I don't believe this impacts any current
  kernel use cases, because the `ZSTD_CCtx` is typically reconstructed
  between compressions.
* Levels 1-4 see 5-10% compression speed improvements for inputs smaller
  than 128KB.

v1.5.6: https://github.com/facebook/zstd/releases/tag/v1.5.6
* Improved compression ratio for the highest compression levels. I don't
  expect these to see much use, however, due to their slow speeds.

v1.5.5: https://github.com/facebook/zstd/releases/tag/v1.5.5
* Fix a rare corruption bug that can trigger on levels 13 and above.
* Improve compression speed of levels 5-11 on incompressible data.

v1.5.4: https://github.com/facebook/zstd/releases/tag/v1.5.4
* Improve compression speed of levels 5-11 on ARM.
* Improve dictionary compression speed.

Signed-off-by: Nick Terrell <terrelln@fb.com>
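As a minimal sketch of the point above that the ZSTD_CCtx is typically reconstructed between compressions: in-kernel users generally allocate a workspace, build a fresh context in it, compress once, and discard the context. The example below illustrates that pattern and assumes the in-kernel wrappers declared in include/linux/zstd.h (zstd_get_params(), zstd_cctx_workspace_bound(), zstd_init_cctx(), zstd_compress_cctx(), zstd_is_error()); the helper name example_compress_once() is hypothetical, and exact signatures should be checked against that header.

#include <linux/vmalloc.h>
#include <linux/zstd.h>

/*
 * Illustrative sketch only: compress one buffer with a zstd_cctx that lives
 * just for this call, rather than feeding many gigabytes through a single
 * long-lived context. Returns 0 on allocation/init failure, otherwise the
 * zstd return code (callers check it with zstd_is_error()).
 */
static size_t example_compress_once(void *dst, size_t dst_capacity,
				    const void *src, size_t src_size, int level)
{
	zstd_parameters params = zstd_get_params(level, src_size);
	size_t const wksp_size = zstd_cctx_workspace_bound(&params.cParams);
	void *wksp = vmalloc(wksp_size);
	zstd_cctx *cctx;
	size_t ret = 0;

	if (!wksp)
		return 0;
	cctx = zstd_init_cctx(wksp, wksp_size);
	if (cctx)
		ret = zstd_compress_cctx(cctx, dst, dst_capacity,
					 src, src_size, &params);
	vfree(wksp);
	return ret;
}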
316 lines
12 KiB
C
// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
/* ******************************************************************
 * FSE : Finite State Entropy decoder
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * You can contact the author at :
 * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
 * - Public forum : https://groups.google.com/forum/#!forum/lz4c
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
****************************************************************** */


/* **************************************************************
*  Includes
****************************************************************/
#include "debug.h"      /* assert */
#include "bitstream.h"
#include "compiler.h"
#define FSE_STATIC_LINKING_ONLY
#include "fse.h"
#include "error_private.h"
#include "zstd_deps.h"  /* ZSTD_memcpy */
#include "bits.h"       /* ZSTD_highbit32 */


/* **************************************************************
*  Error Management
****************************************************************/
#define FSE_isError ERR_isError
#define FSE_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)   /* use only *after* variable declarations */


/* **************************************************************
*  Templates
****************************************************************/
/*
  designed to be included
  for type-specific functions (template emulation in C)
  Objective is to write these functions only once, for improved maintenance
*/

/* safety checks */
#ifndef FSE_FUNCTION_EXTENSION
#  error "FSE_FUNCTION_EXTENSION must be defined"
#endif
#ifndef FSE_FUNCTION_TYPE
#  error "FSE_FUNCTION_TYPE must be defined"
#endif

/* Function names */
#define FSE_CAT(X,Y) X##Y
#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)

static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
{
    void* const tdPtr = dt+1;   /* because *dt is unsigned, 32-bits aligned on 32-bits */
    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr);
    U16* symbolNext = (U16*)workSpace;
    BYTE* spread = (BYTE*)(symbolNext + maxSymbolValue + 1);

    U32 const maxSV1 = maxSymbolValue + 1;
    U32 const tableSize = 1 << tableLog;
    U32 highThreshold = tableSize-1;

    /* Sanity Checks */
    if (FSE_BUILD_DTABLE_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(maxSymbolValue_tooLarge);
    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);

    /* Init, lay down lowprob symbols */
    {   FSE_DTableHeader DTableH;
        DTableH.tableLog = (U16)tableLog;
        DTableH.fastMode = 1;
        {   S16 const largeLimit= (S16)(1 << (tableLog-1));
            U32 s;
            for (s=0; s<maxSV1; s++) {
                if (normalizedCounter[s]==-1) {
                    tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
                    symbolNext[s] = 1;
                } else {
                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
                    symbolNext[s] = (U16)normalizedCounter[s];
        }   }   }
        ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
    }

    /* Spread symbols */
    if (highThreshold == tableSize - 1) {
        size_t const tableMask = tableSize-1;
        size_t const step = FSE_TABLESTEP(tableSize);
        /* First lay down the symbols in order.
         * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
         * misses since small blocks generally have small table logs, so nearly
         * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
         * our buffer to handle the over-write.
         */
        {   U64 const add = 0x0101010101010101ull;
            size_t pos = 0;
            U64 sv = 0;
            U32 s;
            for (s=0; s<maxSV1; ++s, sv += add) {
                int i;
                int const n = normalizedCounter[s];
                MEM_write64(spread + pos, sv);
                for (i = 8; i < n; i += 8) {
                    MEM_write64(spread + pos + i, sv);
                }
                pos += (size_t)n;
        }   }
        /* Now we spread those positions across the table.
         * The benefit of doing it in two stages is that we avoid the
         * variable size inner loop, which caused lots of branch misses.
         * Now we can run through all the positions without any branch misses.
         * We unroll the loop twice, since that is what empirically worked best.
         */
        {
            size_t position = 0;
            size_t s;
            size_t const unroll = 2;
            assert(tableSize % unroll == 0);   /* FSE_MIN_TABLELOG is 5 */
            for (s = 0; s < (size_t)tableSize; s += unroll) {
                size_t u;
                for (u = 0; u < unroll; ++u) {
                    size_t const uPosition = (position + (u * step)) & tableMask;
                    tableDecode[uPosition].symbol = spread[s + u];
                }
                position = (position + (unroll * step)) & tableMask;
            }
            assert(position == 0);
        }
    } else {
        U32 const tableMask = tableSize-1;
        U32 const step = FSE_TABLESTEP(tableSize);
        U32 s, position = 0;
        for (s=0; s<maxSV1; s++) {
            int i;
            for (i=0; i<normalizedCounter[s]; i++) {
                tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
                position = (position + step) & tableMask;
                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
        }   }
        if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
    }

    /* Build Decoding table */
    {   U32 u;
        for (u=0; u<tableSize; u++) {
            FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
            U32 const nextState = symbolNext[symbol]++;
            tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
            tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
    }   }

    return 0;
}

size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
{
    return FSE_buildDTable_internal(dt, normalizedCounter, maxSymbolValue, tableLog, workSpace, wkspSize);
}
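/*
 * Usage note (illustrative, not part of the upstream file): the workSpace
 * passed to FSE_buildDTable_wksp() backs the U16 symbolNext[] array and the
 * byte spread[] buffer used above (the latter needs 8 bytes of slack for the
 * 8-byte over-writes). Callers size it with the FSE_BUILD_DTABLE_WKSP_SIZE()
 * macro used in the sanity check, roughly like this, where `dt`, `ncount`,
 * `maxSymbolValue` and `tableLog` come from the caller, and
 * FSE_BUILD_DTABLE_WKSP_SIZE_U32 is assumed to be the U32-unit counterpart of
 * the byte-size macro (see fse.h):
 *
 *     U32 wksp[FSE_BUILD_DTABLE_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)];
 *     size_t const err = FSE_buildDTable_wksp(dt, ncount, maxSymbolValue,
 *                                             tableLog, wksp, sizeof(wksp));
 *     if (FSE_isError(err)) { ...handle error... }
 */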


#ifndef FSE_COMMONDEFS_ONLY

/*-*******************************************************
*  Decompression (Byte symbols)
*********************************************************/

FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
          void* dst, size_t maxDstSize,
    const void* cSrc, size_t cSrcSize,
    const FSE_DTable* dt, const unsigned fast)
{
    BYTE* const ostart = (BYTE*) dst;
    BYTE* op = ostart;
    BYTE* const omax = op + maxDstSize;
    BYTE* const olimit = omax-3;

    BIT_DStream_t bitD;
    FSE_DState_t state1;
    FSE_DState_t state2;

    /* Init */
    CHECK_F(BIT_initDStream(&bitD, cSrc, cSrcSize));

    FSE_initDState(&state1, &bitD, dt);
    FSE_initDState(&state2, &bitD, dt);

    RETURN_ERROR_IF(BIT_reloadDStream(&bitD)==BIT_DStream_overflow, corruption_detected, "");

#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)

    /* 4 symbols per loop */
    for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) & (op<olimit) ; op+=4) {
        op[0] = FSE_GETSYMBOL(&state1);

        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
            BIT_reloadDStream(&bitD);

        op[1] = FSE_GETSYMBOL(&state2);

        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
            { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } }

        op[2] = FSE_GETSYMBOL(&state1);

        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
            BIT_reloadDStream(&bitD);

        op[3] = FSE_GETSYMBOL(&state2);
    }

    /* tail */
    /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
    while (1) {
        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
        *op++ = FSE_GETSYMBOL(&state1);
        if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
            *op++ = FSE_GETSYMBOL(&state2);
            break;
        }

        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
        *op++ = FSE_GETSYMBOL(&state2);
        if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
            *op++ = FSE_GETSYMBOL(&state1);
            break;
    }   }

    assert(op >= ostart);
    return (size_t)(op-ostart);
}

typedef struct {
    short ncount[FSE_MAX_SYMBOL_VALUE + 1];
} FSE_DecompressWksp;


FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body(
        void* dst, size_t dstCapacity,
        const void* cSrc, size_t cSrcSize,
        unsigned maxLog, void* workSpace, size_t wkspSize,
        int bmi2)
{
    const BYTE* const istart = (const BYTE*)cSrc;
    const BYTE* ip = istart;
    unsigned tableLog;
    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
    FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace;
    size_t const dtablePos = sizeof(FSE_DecompressWksp) / sizeof(FSE_DTable);
    FSE_DTable* const dtable = (FSE_DTable*)workSpace + dtablePos;

    FSE_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0);
    if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC);

    /* correct offset to dtable depends on this property */
    FSE_STATIC_ASSERT(sizeof(FSE_DecompressWksp) % sizeof(FSE_DTable) == 0);

    /* normal FSE decoding mode */
    {   size_t const NCountLength =
            FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2);
        if (FSE_isError(NCountLength)) return NCountLength;
        if (tableLog > maxLog) return ERROR(tableLog_tooLarge);
        assert(NCountLength <= cSrcSize);
        ip += NCountLength;
        cSrcSize -= NCountLength;
    }

    if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge);
    assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize);
    workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
    wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);

    CHECK_F( FSE_buildDTable_internal(dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) );

    {
        const void* ptr = dtable;
        const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
        const U32 fastMode = DTableH->fastMode;

        /* select fast mode (static) */
        if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1);
        return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0);
    }
}

/* Avoids the FORCE_INLINE of the _body() function. */
static size_t FSE_decompress_wksp_body_default(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize)
{
    return FSE_decompress_wksp_body(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, 0);
}

#if DYNAMIC_BMI2
BMI2_TARGET_ATTRIBUTE static size_t FSE_decompress_wksp_body_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize)
{
    return FSE_decompress_wksp_body(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, 1);
}
#endif

size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2)
{
#if DYNAMIC_BMI2
    if (bmi2) {
        return FSE_decompress_wksp_body_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize);
    }
#endif
    (void)bmi2;
    return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize);
}
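/*
 * Usage note (illustrative, not part of the upstream file):
 * FSE_decompress_wksp_bmi2() is the single entry point above. It parses the
 * NCount header, builds the DTable inside the caller-provided workspace, then
 * runs the fast or safe decode loop. A caller might drive it roughly as below,
 * with the workspace sized by the FSE_DECOMPRESS_WKSP_SIZE() macro checked
 * above (FSE_DECOMPRESS_WKSP_SIZE_U32 is assumed to be its U32-unit
 * counterpart; see fse.h), and the last argument selecting the BMI2 code path
 * when DYNAMIC_BMI2 is enabled:
 *
 *     U32 wksp[FSE_DECOMPRESS_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)];
 *     size_t const dSize = FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize,
 *                                                   FSE_MAX_TABLELOG, wksp, sizeof(wksp), 0);
 *     if (FSE_isError(dSize)) { ...handle corrupted input... }
 */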

#endif /* FSE_COMMONDEFS_ONLY */