/* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses.
*/
/* zstd_decompress_block :
* this module takes care of decompressing _compressed_ block */
/* These two optional macros force the use one way or another of the two * ZSTD_decompressSequences implementations. You can't force in both directions * at the same time.
*/ #ifdefined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) #error"Cannot force the use of the short and the long ZSTD_decompressSequences variants!" #endif
/* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */ staticvoid ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, const streaming_operation streaming, const size_t expectedWriteSize, constunsigned splitImmediately)
{
size_t const blockSizeMax = ZSTD_blockSizeMax(dctx);
assert(litSize <= blockSizeMax);
assert(dctx->isFrameDecompression || streaming == not_streaming);
assert(expectedWriteSize <= blockSizeMax); if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) { /* If we aren't streaming, we can just put the literals after the output * of the current block. We don't need to worry about overwriting the * extDict of our window, because it doesn't exist. * So if we have space after the end of the block, just put it there.
*/
dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH;
dctx->litBufferEnd = dctx->litBuffer + litSize;
dctx->litBufferLocation = ZSTD_in_dst;
} elseif (litSize <= ZSTD_LITBUFFEREXTRASIZE) { /* Literals fit entirely within the extra buffer, put them there to avoid * having to split the literals.
*/
dctx->litBuffer = dctx->litExtraBuffer;
dctx->litBufferEnd = dctx->litBuffer + litSize;
dctx->litBufferLocation = ZSTD_not_in_dst;
} else {
assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE); /* Literals must be split between the output block and the extra lit * buffer. We fill the extra lit buffer with the tail of the literals, * and put the rest of the literals at the end of the block, with * WILDCOPY_OVERLENGTH of buffer room to allow for overreads. * This MUST not write more than our maxBlockSize beyond dst, because in * streaming mode, that could overwrite part of our extDict window.
*/ if (splitImmediately) { /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
} else { /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */
dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
}
dctx->litBufferLocation = ZSTD_split;
assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize);
}
}
/*! ZSTD_decodeLiteralsBlock() : * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current * block will be output. Otherwise it will be stored at the end of the current dst blockspace, with a small portion being * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write. * * @return : nb of bytes read from src (< srcSize )
* note : symbol not declared but exposed for fullbench */ static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, constvoid* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ void* dst, size_t dstCapacity, const streaming_operation streaming)
{
DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");
/* Default FSE distribution tables. * These are pre-calculated FSE decoding tables using default distributions as defined in specification : * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions * They were generated programmatically with following method : * - start from default distributions, present in /lib/common/zstd_internal.h * - generate tables normally, using ZSTD_buildFSETable() * - printout the content of tables
* - pretify output, report below, test with fuzzer to ensure it's correct */
/* Spread symbols */
assert(tableSize <= 512); /* Specialized symbol spreading for the case when there are * no low probability (-1 count) symbols. When compressing * small blocks we avoid low probability symbols to hit this * case, since header decoding speed matters more.
*/ if (highThreshold == tableSize - 1) {
size_t const tableMask = tableSize-1;
size_t const step = FSE_TABLESTEP(tableSize); /* First lay down the symbols in order. * We use a uint64_t to lay down 8 bytes at a time. This reduces branch * misses since small blocks generally have small table logs, so nearly * all symbols have counts <= 8. We ensure we have 8 bytes at the end of * our buffer to handle the over-write.
*/
{
U64 const add = 0x0101010101010101ull;
size_t pos = 0;
U64 sv = 0;
U32 s; for (s=0; s<maxSV1; ++s, sv += add) { int i; intconst n = normalizedCounter[s];
MEM_write64(spread + pos, sv); for (i = 8; i < n; i += 8) {
MEM_write64(spread + pos + i, sv);
}
assert(n>=0);
pos += (size_t)n;
}
} /* Now we spread those positions across the table. * The benefit of doing it in two stages is that we avoid the * variable size inner loop, which caused lots of branch misses. * Now we can run through all the positions without any branch misses. * We unroll the loop twice, since that is what empirically worked best.
*/
{
size_t position = 0;
size_t s;
size_t const unroll = 2;
assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */ for (s = 0; s < (size_t)tableSize; s += unroll) {
size_t u; for (u = 0; u < unroll; ++u) {
size_t const uPosition = (position + (u * step)) & tableMask;
tableDecode[uPosition].baseValue = spread[s + u];
}
position = (position + (unroll * step)) & tableMask;
}
assert(position == 0);
}
} else {
U32 const tableMask = tableSize-1;
U32 const step = FSE_TABLESTEP(tableSize);
U32 s, position = 0; for (s=0; s<maxSV1; s++) { int i; intconst n = normalizedCounter[s]; for (i=0; i<n; i++) {
tableDecode[position].baseValue = s;
position = (position + step) & tableMask; while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */
} }
assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
}
/*! ZSTD_overlapCopy8() : * Copies 8 bytes from ip to op and updates op and ip where ip <= op. * If the offset is < 8 then the offset is spread to at least 8 bytes. * * Precondition: *ip <= *op * Postcondition: *op - *op >= 8
*/
HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
assert(*ip <= *op); if (offset < 8) { /* close range match, overlap */ staticconst U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */ staticconstint dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */ intconst sub2 = dec64table[offset];
(*op)[0] = (*ip)[0];
(*op)[1] = (*ip)[1];
(*op)[2] = (*ip)[2];
(*op)[3] = (*ip)[3];
*ip += dec32table[offset];
ZSTD_copy4(*op+4, *ip);
*ip -= sub2;
} else {
ZSTD_copy8(*op, *ip);
}
*ip += 8;
*op += 8;
assert(*op - *ip >= 8);
}
/*! ZSTD_safecopy() : * Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer * and write up to 16 bytes past oend_w (op >= oend_w is allowed). * This function is only called in the uncommon case where the sequence is near the end of the block. It * should be fast for a single long sequence, but can be slow for several short sequences. * * @param ovtype controls the overlap detection * - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart. * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart. * The src buffer must be before the dst buffer.
*/ staticvoid ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
ptrdiff_t const diff = op - ip;
BYTE* const oend = op + length;
if (length < 8) { /* Handle short lengths. */ while (op < oend) *op++ = *ip++; return;
} if (ovtype == ZSTD_overlap_src_before_dst) { /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
assert(length >= 8);
ZSTD_overlapCopy8(&op, &ip, diff);
length -= 8;
assert(op - ip >= 8);
assert(op <= oend);
}
if (oend <= oend_w) { /* No risk of overwrite. */
ZSTD_wildcopy(op, ip, length, ovtype); return;
} if (op <= oend_w) { /* Wildcopy until we get close to the end. */
assert(oend > oend_w);
ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
ip += oend_w - op;
op += oend_w - op;
} /* Handle the leftovers. */ while (op < oend) *op++ = *ip++;
}
/* ZSTD_safecopyDstBeforeSrc(): * This version allows overlap with dst before src, or handles the non-overlap case with dst after src
* Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ staticvoid ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) {
ptrdiff_t const diff = op - ip;
BYTE* const oend = op + length;
if (length < 8 || diff > -8) { /* Handle short lengths, close overlaps, and dst not before src. */ while (op < oend) *op++ = *ip++; return;
}
/* Handle the leftovers. */ while (op < oend) *op++ = *ip++;
}
/* ZSTD_execSequenceEnd(): * This version handles cases that are near the end of the output buffer. It requires * more careful checks to make sure there is no overflow. By separating out these hard * and unlikely cases, we can speed up the common cases. * * NOTE: This function needs to be fast for a single long sequence, but doesn't need * to be optimized for many small sequences, since those fall into ZSTD_execSequence().
*/
FORCE_NOINLINE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_execSequenceEnd(BYTE* op,
BYTE* const oend, seq_t sequence, const BYTE** litPtr, const BYTE* const litLimit, const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
BYTE* const oLitEnd = op + sequence.litLength;
size_t const sequenceLength = sequence.litLength + sequence.matchLength; const BYTE* const iLitEnd = *litPtr + sequence.litLength; const BYTE* match = oLitEnd - sequence.offset;
BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
/* bounds checks : careful of address space overflow in 32-bit mode */
RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
assert(op < op + sequenceLength);
assert(oLitEnd < op + sequenceLength);
/* ZSTD_execSequenceEndSplitLitBuffer(): * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case.
*/
FORCE_NOINLINE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
BYTE* const oend, const BYTE* const oend_w, seq_t sequence, const BYTE** litPtr, const BYTE* const litLimit, const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
BYTE* const oLitEnd = op + sequence.litLength;
size_t const sequenceLength = sequence.litLength + sequence.matchLength; const BYTE* const iLitEnd = *litPtr + sequence.litLength; const BYTE* match = oLitEnd - sequence.offset;
/* bounds checks : careful of address space overflow in 32-bit mode */
RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
assert(op < op + sequenceLength);
assert(oLitEnd < op + sequenceLength);
/* copy literals */
RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer");
ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength);
op = oLitEnd;
*litPtr = iLitEnd;
#ifdefined(__aarch64__) /* prefetch sequence starting from match that will be used for copy later */
PREFETCH_L1(match); #endif /* Handle edge cases in a slow path: * - Read beyond end of literals * - Match end is within WILDCOPY_OVERLIMIT of oend * - 32-bit mode and the match length overflows
*/ if (UNLIKELY(
iLitEnd > litLimit ||
oMatchEnd > oend_w ||
(MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
/* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
assert(op <= oLitEnd /* No overflow */);
assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
assert(oMatchEnd <= oend /* No underflow */);
assert(iLitEnd <= litLimit /* Literal length is in bounds */);
assert(oLitEnd <= oend_w /* Can wildcopy literals */);
assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
/* Copy Literals: * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. * We likely don't need the full 32-byte wildcopy.
*/
assert(WILDCOPY_OVERLENGTH >= 16);
ZSTD_copy16(op, (*litPtr)); if (UNLIKELY(sequence.litLength > 16)) {
ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap);
}
op = oLitEnd;
*litPtr = iLitEnd; /* update for next sequence */
/* Copy Match */ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { /* offset beyond prefix -> go into extDict */
RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
match = dictEnd + (match - prefixStart); if (match + sequence.matchLength <= dictEnd) {
ZSTD_memmove(oLitEnd, match, sequence.matchLength); return sequenceLength;
} /* span extDict & currentPrefixSegment */
{ size_t const length1 = dictEnd - match;
ZSTD_memmove(oLitEnd, match, length1);
op = oLitEnd + length1;
sequence.matchLength -= length1;
match = prefixStart;
}
} /* Match within prefix of 1 or more bytes */
assert(op <= oMatchEnd);
assert(oMatchEnd <= oend_w);
assert(match >= prefixStart);
assert(sequence.matchLength >= 1);
/* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy * without overlap checking.
*/ if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { /* We bet on a full wildcopy for matches, since we expect matches to be * longer than literals (in general). In silesia, ~10% of matches are longer * than 16 bytes.
*/
ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); return sequenceLength;
}
assert(sequence.offset < WILDCOPY_VECLEN);
/* Copy 8 bytes and spread the offset to be >= 8. */
ZSTD_overlapCopy8(&op, &match, sequence.offset);
/* If the match length is > 8 bytes, then continue with the wildcopy. */ if (sequence.matchLength > 8) {
assert(op < oMatchEnd);
ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst);
} return sequenceLength;
}
assert(op != NULL /* Precondition */);
assert(oend_w < oend /* No underflow */); /* Handle edge cases in a slow path: * - Read beyond end of literals * - Match end is within WILDCOPY_OVERLIMIT of oend * - 32-bit mode and the match length overflows
*/ if (UNLIKELY(
iLitEnd > litLimit ||
oMatchEnd > oend_w ||
(MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
/* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
assert(op <= oLitEnd /* No overflow */);
assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
assert(oMatchEnd <= oend /* No underflow */);
assert(iLitEnd <= litLimit /* Literal length is in bounds */);
assert(oLitEnd <= oend_w /* Can wildcopy literals */);
assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
/* Copy Literals: * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. * We likely don't need the full 32-byte wildcopy.
*/
assert(WILDCOPY_OVERLENGTH >= 16);
ZSTD_copy16(op, (*litPtr)); if (UNLIKELY(sequence.litLength > 16)) {
ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);
}
op = oLitEnd;
*litPtr = iLitEnd; /* update for next sequence */
/* Copy Match */ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { /* offset beyond prefix -> go into extDict */
RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
match = dictEnd + (match - prefixStart); if (match + sequence.matchLength <= dictEnd) {
ZSTD_memmove(oLitEnd, match, sequence.matchLength); return sequenceLength;
} /* span extDict & currentPrefixSegment */
{ size_t const length1 = dictEnd - match;
ZSTD_memmove(oLitEnd, match, length1);
op = oLitEnd + length1;
sequence.matchLength -= length1;
match = prefixStart;
} } /* Match within prefix of 1 or more bytes */
assert(op <= oMatchEnd);
assert(oMatchEnd <= oend_w);
assert(match >= prefixStart);
assert(sequence.matchLength >= 1);
/* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy * without overlap checking.
*/ if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { /* We bet on a full wildcopy for matches, since we expect matches to be * longer than literals (in general). In silesia, ~10% of matches are longer * than 16 bytes.
*/
ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); return sequenceLength;
}
assert(sequence.offset < WILDCOPY_VECLEN);
/* Copy 8 bytes and spread the offset to be >= 8. */
ZSTD_overlapCopy8(&op, &match, sequence.offset);
/* If the match length is > 8 bytes, then continue with the wildcopy. */ if (sequence.matchLength > 8) {
assert(op < oMatchEnd);
ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst);
} return sequenceLength;
}
/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32 * bits before reloading. This value is the maximum number of bytes we read * after reloading when we are decoding long offsets.
*/ #define LONG_OFFSETS_MAX_EXTRA_BITS_32 \
(ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32 \
? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32 \
: 0)
assert(llBits <= MaxLLBits);
assert(mlBits <= MaxMLBits);
assert(ofBits <= MaxOff); /* * As gcc has better branch and block analyzers, sometimes it is only * valuable to mark likeliness for clang, it gives around 3-4% of * performance.
*/
if (mlBits > 0)
seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
BIT_reloadDStream(&seqState->DStream); if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
BIT_reloadDStream(&seqState->DStream); /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
if (llBits > 0)
seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
if (MEM_32bits())
BIT_reloadDStream(&seqState->DStream);
/* decompress without overrunning litPtr begins */
{ seq_t sequence = {0,0,0}; /* some static analyzer believe that @sequence is not initialized (it necessarily is, since for(;;) loop as at least one iteration) */ /* Align the decompression loop to 32 + 16 bytes. * * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression * speed swings based on the alignment of the decompression loop. This * performance swing is caused by parts of the decompression loop falling * out of the DSB. The entire decompression loop should fit in the DSB, * when it can't we get much worse performance. You can measure if you've * hit the good case or the bad case with this perf command for some * compressed file test.zst: * * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.22 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.