/* ****************************************************************** * huff0 huffman decoder, * part of Finite State Entropy library * Copyright (c) Meta Platforms, Inc. and affiliates. * * You can contact the author at : * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses.
****************************************************************** */
/* These two optional macros force the use one way or another of the two
 * Huffman decompression implementations. You can't force in both directions
 * at the same time.
 */
#if defined(HUF_FORCE_DECOMPRESS_X1) && \
    defined(HUF_FORCE_DECOMPRESS_X2)
#error "Cannot force the use of the X1 and X2 decoders at the same time!"
#endif
/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is
 * supported at runtime, so we can add the BMI2 target attribute.
 * When it is disabled, we will still get BMI2 if it is enabled statically.
 */
#if DYNAMIC_BMI2
# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
#else
# define HUF_FAST_BMI2_ATTRS
#endif
/** * The input/output arguments to the Huffman fast decoding loop: * * ip [in/out] - The input pointers, must be updated to reflect what is consumed. * op [in/out] - The output pointers, must be updated to reflect what is written. * bits [in/out] - The bitstream containers, must be updated to reflect the current state. * dt [in] - The decoding table. * ilowest [in] - The beginning of the valid range of the input. Decoders may read * down to this pointer. It may be below iend[0]. * oend [in] - The end of the output stream. op[3] must not cross oend. * iend [in] - The end of each input stream. ip[i] may cross iend[i], * as long as it is above ilowest, but that indicates corruption.
*/ typedefstruct {
BYTE const* ip[4];
BYTE* op[4];
U64 bits[4]; voidconst* dt;
BYTE const* ilowest;
BYTE* oend;
BYTE const* iend[4];
} HUF_DecompressFastArgs;
/* NOTE(review): this span begins mid-function — it is the tail of the fast-args
 * initializer (presumably HUF_DecompressFastArgs_init); the signature and the
 * stream-pointer setup are outside this view. Code tokens below are unchanged;
 * only the line structure and comments were restored. */
    /* strict minimum : jump table + 1 byte per stream */
    if (srcSize < 10)
        return ERROR(corruption_detected);

    /* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers.
     * If table log is not correct at this point, fallback to the old decoder.
     * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
     */
    if (dtLog != HUF_DECODER_FAST_TABLELOG)
        return 0;

    /* No point to call the ASM loop for tiny outputs. */
    if (args->op[3] >= oend)
        return 0;

    /* bits[] is the bit container.
     * It is read from the MSB down to the LSB.
     * It is shifted left as it is read, and zeros are
     * shifted in. After the lowest valid bit a 1 is
     * set, so that CountTrailingZeros(bits[]) can be used
     * to count how many bits we've consumed.
     */
    args->bits[0] = HUF_initFastDStream(args->ip[0]);
    args->bits[1] = HUF_initFastDStream(args->ip[1]);
    args->bits[2] = HUF_initFastDStream(args->ip[2]);
    args->bits[3] = HUF_initFastDStream(args->ip[3]);

    /* The decoders must be sure to never read beyond ilowest.
     * This is lower than iend[0], but allowing decoders to read
     * down to ilowest can allow an extra iteration or two in the
     * fast loop.
     */
    args->ilowest = istart;
    args->oend = oend;
    args->dt = dt;

    /* 1 signals to the caller that the fast path may be used. */
    return 1;
}
/* NOTE(review): only the head of this function is visible in this chunk; the
 * remainder of the body (and the closing brace) lies outside this view. */
static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd)
{
    /* Validate that we haven't overwritten. */
    if (args->op[stream] > segmentEnd)
        return ERROR(corruption_detected);
    /* Validate that we haven't read beyond iend[].
     * Note that ip[] may be < iend[] because the MSB is
     * the next bit to read, and we may have consumed 100%
     * of the stream, so down to iend[i] - 8 is valid.
     */
    if (args->ip[stream] < args->iend[stream] - 8)
        return ERROR(corruption_detected);
/**
 * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at
 * a time.
 *
 * @param symbol  The decoded byte for the table entry.
 * @param nbBits  The number of bits consumed by the table entry.
 * @return A U64 with the 16-bit (symbol, nbBits) entry replicated into all
 *         four 16-bit lanes, laid out to match the in-memory HUF_DEltX1 order
 *         for the current endianness.
 */
static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits)
{
    U64 D4;
    /* Byte order of the 16-bit entry depends on endianness so that a raw
     * 64-bit store produces correctly laid-out HUF_DEltX1 structs. */
    if (MEM_isLittleEndian()) {
        D4 = (U64)((symbol << 8) + nbBits);
    } else {
        D4 = (U64)(symbol + (nbBits << 8));
    }
    assert(D4 < (1U << 16));
    /* Replicate the 16-bit entry into all four 16-bit lanes. */
    D4 *= 0x0001000100010001ULL;
    return D4;
}
/**
 * Increases the tableLog to targetTableLog and rescales the stats.
 * If tableLog > targetTableLog this is a no-op.
 *
 * @param huffWeight      Per-symbol weights; every non-zero weight is bumped by the scale.
 * @param rankVal         Count of symbols per weight; shifted up by the scale.
 * @param nbSymbols       Number of entries in huffWeight.
 * @param tableLog        Current table log.
 * @param targetTableLog  Desired table log.
 * @returns New tableLog (targetTableLog unless tableLog was already larger).
 */
static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog)
{
    if (tableLog > targetTableLog)
        return tableLog;
    if (tableLog < targetTableLog) {
        U32 const scale = targetTableLog - tableLog;
        U32 s;
        /* Increase the weight for all non-zero probability symbols by scale. */
        for (s = 0; s < nbSymbols; ++s) {
            huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale);
        }
        /* Update rankVal to reflect the new weights.
         * All weights except 0 get moved to weight + scale.
         * Weights [1, scale] are empty.
         * Iterate downwards so entries are read before they are overwritten.
         */
        for (s = targetTableLog; s > scale; --s) {
            rankVal[s] = rankVal[s - scale];
        }
        for (s = scale; s > 0; --s) {
            rankVal[s] = 0;
        }
    }
    return targetTableLog;
}
DEBUG_STATIC_ASSERT(HUF_DECOMPRESS_WORKSPACE_SIZE >= sizeof(*wksp)); if (sizeof(*wksp) > wkspSize) return ERROR(tableLog_tooLarge);
DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
/* Compute symbols and rankStart given rankVal: * * rankVal already contains the number of values of each weight. * * symbols contains the symbols ordered by weight. First are the rankVal[0] * weight 0 symbols, followed by the rankVal[1] weight 1 symbols, and so on. * symbols[0] is filled (but unused) to avoid a branch. * * rankStart contains the offset where each rank belongs in the DTable. * rankStart[0] is not filled because there are no entries in the table for * weight 0.
*/
{ int n;
U32 nextRankStart = 0; intconst unroll = 4; intconst nLimit = (int)nbSymbols - unroll + 1; for (n=0; n<(int)tableLog+1; n++) {
U32 const curr = nextRankStart;
nextRankStart += wksp->rankVal[n];
wksp->rankStart[n] = curr;
} for (n=0; n < nLimit; n += unroll) { int u; for (u=0; u < unroll; ++u) {
size_t const w = wksp->huffWeight[n+u];
wksp->symbols[wksp->rankStart[w]++] = (BYTE)(n+u);
}
} for (; n < (int)nbSymbols; ++n) {
size_t const w = wksp->huffWeight[n];
wksp->symbols[wksp->rankStart[w]++] = (BYTE)n;
}
}
/* fill DTable * We fill all entries of each weight in order. * That way length is a constant for each iteration of the outer loop. * We can switch based on the length to a different inner loop which is * optimized for that particular case.
*/
{ U32 w; int symbol = wksp->rankVal[0]; int rankStart = 0; for (w=1; w<tableLog+1; ++w) { intconst symbolCount = wksp->rankVal[w]; intconst length = (1 << w) >> 1; int uStart = rankStart;
BYTE const nbBits = (BYTE)(tableLog + 1 - w); int s; int u; switch (length) { case 1: for (s=0; s<symbolCount; ++s) {
HUF_DEltX1 D;
D.byte = wksp->symbols[symbol + s];
D.nbBits = nbBits;
dt[uStart] = D;
uStart += 1;
} break; case 2: for (s=0; s<symbolCount; ++s) {
HUF_DEltX1 D;
D.byte = wksp->symbols[symbol + s];
D.nbBits = nbBits;
dt[uStart+0] = D;
dt[uStart+1] = D;
uStart += 2;
} break; case 4: for (s=0; s<symbolCount; ++s) {
U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
MEM_write64(dt + uStart, D4);
uStart += 4;
} break; case 8: for (s=0; s<symbolCount; ++s) {
U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
MEM_write64(dt + uStart, D4);
MEM_write64(dt + uStart + 4, D4);
uStart += 8;
} break; default: for (s=0; s<symbolCount; ++s) {
U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits); for (u=0; u < length; u += 16) {
MEM_write64(dt + uStart + u + 0, D4);
MEM_write64(dt + uStart + u + 4, D4);
MEM_write64(dt + uStart + u + 8, D4);
MEM_write64(dt + uStart + u + 12, D4);
}
assert(u == length);
uStart += length;
} break;
}
symbol += symbolCount;
rankStart += symbolCount * length;
}
} return iSize;
}
/* NOTE(review): mid-function fragment — the tail of the 4-stream X1 decoder
 * body; the stream setup and main loop are outside this view. */
    /* check corruption */
    /* note : should not be necessary : op# advance in lock step, and we control op4.
     * but curiously, binary generated by gcc 7.2 & 7.3 with -mbmi2 runs faster when >=1 test is present */
    if (op1 > opStart2) return ERROR(corruption_detected);
    if (op2 > opStart3) return ERROR(corruption_detected);
    if (op3 > opStart4) return ERROR(corruption_detected);
    /* note : op4 supposed already verified within main loop */

    /* finish bitStreams one by one */
    HUF_decodeStreamX1(op1, &bitD1, opStart2, dt, dtLog);
    HUF_decodeStreamX1(op2, &bitD2, opStart3, dt, dtLog);
    HUF_decodeStreamX1(op3, &bitD3, opStart4, dt, dtLog);
    HUF_decodeStreamX1(op4, &bitD4, oend, dt, dtLog);
/* NOTE(review): mid-function fragment — the interior of the 4X1 fast C loop;
 * the enclosing for(;;) head, local declarations, and the _out label are
 * outside this view. */
    /* Assert loop preconditions */
#ifndef NDEBUG
    for (stream = 0; stream < 4; ++stream) {
        assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
        assert(ip[stream] >= ilowest);
    }
#endif
    /* Compute olimit */
    {
        /* Each iteration produces 5 output symbols per stream */
        size_t const oiters = (size_t)(oend - op[3]) / 5;
        /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
         * per stream.
         */
        size_t const iiters = (size_t)(ip[0] - ilowest) / 7;
        /* We can safely run iters iterations before running bounds checks */
        size_t const iters = MIN(oiters, iiters);
        size_t const symbols = iters * 5;

        /* We can simply check that op[3] < olimit, instead of checking all
         * of our bounds, since we can't hit the other bounds until we've run
         * iters iterations, which only happens when op[3] == olimit.
         */
        olimit = op[3] + symbols;

        /* Exit fast decoding loop once we reach the end. */
        if (op[3] == olimit) break;

        /* Exit the decoding loop if any input pointer has crossed the
         * previous one. This indicates corruption, and a precondition
         * to our loop is that ip[i] >= ip[0].
         */
        for (stream = 1; stream < 4; ++stream) {
            if (ip[stream] < ip[stream - 1]) goto _out;
        }
    }

    /* Manually unroll the loop because compilers don't consistently
     * unroll the inner loops, which destroys performance.
     */
    do {
        /* Decode 5 symbols in each of the 4 streams */
        HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0);
        HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1);
        HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2);
        HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3);
        HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4);

        /* Reload each of the 4 bitstreams */
        HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM);
    } while (op[3] < olimit);

    /* Save the final values of each of the state variables back to args. */
    ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
    ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
    ZSTD_memcpy(&args->op, &op, sizeof(op));
}
/**
 * @returns @p dstSize on success (>= 6)
 *          0 if the fallback implementation should be used
 *          An error if an error occurred
 *
 * NOTE(review): only the head of this function is visible in this chunk; the
 * loopFn invocation and stream-finishing logic are outside this view. Fix
 * applied: fused "constvoid"/"voidconst" tokens repaired.
 */
static HUF_FAST_BMI2_ATTRS
size_t
HUF_decompress4X1_usingDTable_internal_fast(
          void* dst, size_t dstSize,
    const void* cSrc, size_t cSrcSize,
    const HUF_DTable* DTable,
    HUF_DecompressFastLoopFn loopFn)
{
    void const* dt = DTable + 1;    /* skip the DTable header entry */
    BYTE const* const ilowest = (BYTE const*)cSrc;
    BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
    HUF_DecompressFastArgs args;
    {   size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
        FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
        /* ret == 0 means the fast path cannot be used; signal fallback. */
        if (ret == 0) return 0;
    }
/* NOTE(review): mid-function fragment — presumably the weight-filling loop of
 * HUF_fillDTableX2; its signature and local setup are outside this view.
 * Fixes applied: fused "intconst" tokens repaired to "int const", collapsed
 * line structure restored. */
    /* Fill DTable in order of weight. */
    for (w = 1; w < wEnd; ++w) {
        int const begin = (int)rankStart[w];
        int const end = (int)rankStart[w+1];
        U32 const nbBits = nbBitsBaseline - w;

        if (targetLog-nbBits >= minBits) {
            /* Enough room for a second symbol. */
            int start = rankVal[w];
            U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */);
            int minWeight = nbBits + scaleLog;
            int s;
            if (minWeight < 1) minWeight = 1;

            /* Fill the DTable for every symbol of weight w.
             * These symbols get at least 1 second symbol.
             */
            for (s = begin; s != end; ++s) {
                HUF_fillDTableX2Level2(
                    DTable + start, targetLog, nbBits,
                    rankValOrigin[nbBits], minWeight, wEnd,
                    sortedList, rankStart,
                    nbBitsBaseline, sortedList[s].symbol);
                start += length;
            }
        } else {
            /* Only a single symbol. */
            HUF_fillDTableX2ForWeight(
                DTable + rankVal[w],
                sortedList + begin, sortedList + end,
                nbBits, targetLog,
                /* baseSeq */ 0, /* level */ 1);
        }
    }
}
/* NOTE(review): mid-function fragment — presumably the interior of the X2
 * table reader (HUF_readDTableX2_wksp); its signature, the weight read, and
 * the DTable fill are outside this view. Tokens unchanged; line structure
 * and comments restored. */
    DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable));   /* if compiler fails here, assertion is wrong */
    if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
    /* ZSTD_memset(weightList, 0, sizeof(weightList)); */  /* is not necessary, even though some analyzer complain ... */

    /* check result */
    if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
    if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG;

    /* find maxWeight */
    for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */

    /* Get start index of each weight */
    {   U32 w, nextRankStart = 0;
        for (w=1; w<maxW+1; w++) {
            U32 curr = nextRankStart;
            nextRankStart += wksp->rankStats[w];
            rankStart[w] = curr;
        }
        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
        rankStart[maxW+1] = nextRankStart;
    }

    /* sort symbols by weight */
    {   U32 s;
        for (s=0; s<nbSymbols; s++) {
            U32 const w = wksp->weightList[s];
            U32 const r = rankStart[w]++;
            wksp->sortedSymbol[r].symbol = (BYTE)s;
        }
        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
    }
/* NOTE(review): mid-function fragment — presumably the interior of the X2
 * stream decoder (HUF_decodeStreamX2); its signature and return are outside
 * this view. Tokens unchanged; line structure restored. */
    /* up to 8 symbols at a time */
    if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) {
        if (dtLog <= 11 && MEM_64bits()) {
            /* up to 10 symbols at a time */
            while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) {
                HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
                HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
                HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
                HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
                HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
            }
        } else {
            /* up to 8 symbols at a time */
            while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
                HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
                HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
                HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
                HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
            }
        }
    } else {
        BIT_reloadDStream(bitDPtr);
    }

    /* closer to end : up to 2 symbols at a time */
    if ((size_t)(pEnd - p) >= 2) {
        while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
            HUF_DECODE_SYMBOLX2_0(p, bitDPtr);

        while (p <= pEnd-2)
            HUF_DECODE_SYMBOLX2_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
    }

    if (p < pEnd)
        p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog);
/* NOTE(review): mid-function fragment — the interior of the 4X2 fast C loop;
 * the enclosing loop head, local declarations, the _out label, and the
 * state save-back are outside this view. Tokens unchanged; line structure
 * restored and comment typo fixed. */
    /* Assert loop preconditions */
#ifndef NDEBUG
    for (stream = 0; stream < 4; ++stream) {
        assert(op[stream] <= oend[stream]);
        assert(ip[stream] >= ilowest);
    }
#endif
    /* Compute olimit */
    {
        /* Each loop does 5 table lookups for each of the 4 streams.
         * Each table lookup consumes up to 11 bits of input, and produces
         * up to 2 bytes of output.
         */
        /* We can consume up to 7 bytes of input per iteration per stream.
         * We also know that each input pointer is >= ip[0]. So we can run
         * iters loops before running out of input.
         */
        size_t iters = (size_t)(ip[0] - ilowest) / 7;
        /* Each iteration can produce up to 10 bytes of output per stream.
         * Each output stream may advance at different rates. So take the
         * minimum number of safe iterations among all the output streams.
         */
        for (stream = 0; stream < 4; ++stream) {
            size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
            iters = MIN(iters, oiters);
        }
        /* Each iteration produces at least 5 output symbols. So until
         * op[3] crosses olimit, we know we haven't executed iters
         * iterations yet. This saves us maintaining an iters counter,
         * at the expense of computing the remaining # of iterations
         * more frequently.
         */
        olimit = op[3] + (iters * 5);

        /* Exit the fast decoding loop once we reach the end. */
        if (op[3] == olimit) break;

        /* Exit the decoding loop if any input pointer has crossed the
         * previous one. This indicates corruption, and a precondition
         * to our loop is that ip[i] >= ip[0].
         */
        for (stream = 1; stream < 4; ++stream) {
            if (ip[stream] < ip[stream - 1]) goto _out;
        }
    }

    /* Manually unroll the loop because compilers don't consistently
     * unroll the inner loops, which destroys performance.
     */
    do {
        /* Decode 5 symbols from each of the first 3 streams.
         * The final stream will be decoded during the reload phase
         * to reduce register pressure.
         * (The second macro argument — presumably a "decode stream 3"
         * flag — is 0 here; confirm against the macro definition.)
         */
        HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
        HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
        HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
        HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
        HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);

        /* Decode one symbol from the final stream */
        HUF_4X2_DECODE_SYMBOL(3, 1);

        /* Decode 4 symbols from the final stream & reload bitstreams.
         * The final stream is reloaded last, meaning that all 5 symbols
         * are decoded from the final stream before it is reloaded.
         */
        HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM);
    } while (op[3] < olimit);
}
/* NOTE(review): the following German web-page boilerplate was appended to this
 * file by the extraction tool and is unrelated to the code. Preserved here,
 * translated to English and wrapped in a comment so the file remains valid C:
 * "The information on this website has been carefully compiled to the best of
 * our knowledge. However, no guarantee is given as to the completeness,
 * correctness, or quality of the information provided.
 * Note: the colored syntax highlighting and the measurement are still
 * experimental."
 */