// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */
/* Currently, we always use ZSTD_dtlm_full for filling CDict tables.
* Feel free to remove this assert if there's a good reason! */
assert(dtlm == ZSTD_dtlm_full);
/* Always insert every fastHashFillStep position into the hash table.
 * Insert the other positions if their hash entry is empty.
 */
for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
    U32 const curr = (U32)(ip - base);
    {   size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls);
        ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr);   }

    if (dtlm == ZSTD_dtlm_fast) continue;
    /* Only load extra positions for ZSTD_dtlm_full */
    {   U32 p;
        for (p = 1; p < fastHashFillStep; ++p) {
            size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls);
            if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {  /* not yet filled */
                ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p);
    }   }   }   }
}
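/* Note on the tagged entries written above: ZSTD_writeTaggedIndex() and
 * ZSTD_SHORT_CACHE_TAG_BITS are defined elsewhere in the codebase. The layout
 * assumed here (a sketch, not authoritative) is that each table slot packs the
 * position index into its upper bits and a small hash-derived tag into its low
 * ZSTD_SHORT_CACHE_TAG_BITS bits, roughly:
 *     hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] =
 *         (curr << ZSTD_SHORT_CACHE_TAG_BITS) | (hashAndTag & ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1));
 * which is why the emptiness check in the loop above shifts the tag bits away
 * before indexing the table. */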
/* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables.
* Feel free to remove this assert if there's a good reason! */
assert(dtlm == ZSTD_dtlm_fast);
/* Always insert every fastHashFillStep position into the hash table.
 * Insert the other positions if their hash entry is empty.
 */
for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
    U32 const curr = (U32)(ip - base);
    size_t const hash0 = ZSTD_hashPtr(ip, hBits, mls);
    hashTable[hash0] = curr;

    if (dtlm == ZSTD_dtlm_fast) continue;
    /* Only load extra positions for ZSTD_dtlm_full */
    {   U32 p;
        for (p = 1; p < fastHashFillStep; ++p) {
            size_t const hash = ZSTD_hashPtr(ip + p, hBits, mls);
            if (hashTable[hash] == 0) {  /* not yet filled */
                hashTable[hash] = curr + p;
    }   }   }   }
}
static int
ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit)
{
    /* Array of ~random data; should have a low probability of matching real data.
     * Load from here if the index is invalid.
     * Used to avoid unpredictable branches. */
    static const BYTE dummy[] = {0x12,0x34,0x56,0x78};

    /* currentIdx >= lowLimit is a (somewhat) unpredictable branch.
     * However, the expression below compiles into a conditional move.
     */
    const BYTE* mvalAddr = ZSTD_selectAddr(matchIdx, idxLowLimit, matchAddress, dummy);

    /* Note: this used to be written as `return test1 && test2;`.
     * Unfortunately, once inlined, these tests become branches,
     * in which case it becomes critical that they are executed in the right order (test1 then test2).
     * So the tests must be written in a specific manner to ensure their ordering.
     */
    if (MEM_read32(currentPtr) != MEM_read32(mvalAddr)) return 0;
    /* force ordering of these tests, which matters once the function is inlined, as they become branches */
    __asm__("");
    return matchIdx >= idxLowLimit;
}
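/* ZSTD_selectAddr() is defined elsewhere in the codebase. Below is a minimal
 * portable sketch of the semantics it is expected to provide here (return
 * `candidate` when index >= lowLimit, otherwise `backup`); the real helper may
 * use inline assembly to guarantee a conditional move. The name
 * example_selectAddr is hypothetical and for illustration only. */
static const BYTE* example_selectAddr(U32 index, U32 lowLimit,
                                      const BYTE* candidate, const BYTE* backup)
{
    /* written as a single ternary so that compilers can emit a cmov
     * rather than an unpredictable branch */
    return (index >= lowLimit) ? candidate : backup;
}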
static int
ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit)
{
    /* Use a branch instead of a cmov,
     * because it's faster in scenarios where matchIdx >= idxLowLimit is generally true,
     * i.e. almost all candidates are within range. */
    U32 mval;
    if (matchIdx >= idxLowLimit) {
        mval = MEM_read32(matchAddress);
    } else {
        mval = MEM_read32(currentPtr) ^ 1; /* guaranteed to not match. */
    }

    return (MEM_read32(currentPtr) == mval);
}
/*
 * If you squint hard enough (and ignore repcodes), the search operation at any
 * given position is broken into 4 stages:
 *
 * 1. Hash   (map position to hash value via input read)
 * 2. Lookup (map hash val to index via hashtable read)
 * 3. Load   (map index to value at that position via input read)
 * 4. Compare
 *
 * Each of these steps involves a memory read at an address which is computed
 * from the previous step. This means these steps must be sequenced and their
 * latencies are cumulative.
 *
 * Rather than do 1->2->3->4 sequentially for a single position before moving
 * onto the next, this implementation interleaves these operations across the
 * next few positions:
 *
 * R = Repcode Read & Compare
 * H = Hash
 * T = Table Lookup
 * M = Match Read & Compare
 *
 * Pos | Time -->
 * ----+-------------------
 * N   | ... M
 * N+1 | ...   TM
 * N+2 |       R H   T M
 * N+3 |         H       TM
 * N+4 |             R H   T M
 * N+5 |               H  ...
 * N+6 |             R ...
 *
 * This is very much analogous to the pipelining of execution in a CPU. And just
 * like a CPU, we have to dump the pipeline when we find a match (i.e., take a
 * branch).
 *
 * When this happens, we throw away our current state, and do the following prep
 * to re-enter the loop:
 *
 * Pos | Time -->
 * ----+-------------------
 * N   | H T
 * N+1 |   H
 *
 * This is also the work we do at the beginning to enter the loop initially.
 */
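/* In the implementation below, the stages above map roughly onto the following
 * expressions (a sketch based on the code in this function; the repcode read
 * shown for R is an assumption about code not excerpted here):
 *   H : hash0 = ZSTD_hashPtr(ip0, hlog, mls);  hash1 = ZSTD_hashPtr(ip1, hlog, mls);
 *   T : matchIdx = hashTable[hash0];
 *   M : matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)
 *   R : MEM_read32(ip2) == MEM_read32(ip2 - rep_offset1)
 */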
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_compressBlock_fast_noDict_generic(
ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
void const* src, size_t srcSize,
U32 const mls, int useCmov)
{
const ZSTD_compressionParameters* const cParams = &ms->cParams;
U32* const hashTable = ms->hashTable;
U32 const hlog = cParams->hashLog;
size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; /* min 2 */
const BYTE* const base = ms->window.base;
const BYTE* const istart = (const BYTE*)src;
const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
const U32 prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
const BYTE* const prefixStart = base + prefixStartIndex;
const BYTE* const iend = istart + srcSize;
const BYTE* const ilimit = iend - HASH_READ_SIZE;
size_t hash0; /* hash for ip0 */
size_t hash1; /* hash for ip1 */
U32 matchIdx; /* match idx for ip0 */
U32 offcode;
const BYTE* match0;
size_t mLength;
/* ip0 and ip1 are always adjacent. The targetLength skipping and
 * uncompressibility acceleration is applied to every other position,
 * matching the behavior of #1562. step therefore represents the gap
 * between pairs of positions, from ip0 to ip2 or ip1 to ip3. */
size_t step;
const BYTE* nextStep;
const size_t kStepIncr = (1 << (kSearchStrength - 1));
const ZSTD_match4Found matchFound = useCmov ? ZSTD_match4Found_cmov : ZSTD_match4Found_branch;
/* Write next hash table entry: it's already calculated.
 * This write is known to be safe because ip1 is before the
 * repcode (ip2). */
hashTable[hash1] = (U32)(ip1 - base);
goto _match;
}
if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) {
    /* Write next hash table entry (it's already calculated).
     * This write is known to be safe because ip1 == ip0 + 1,
     * so searching will resume after ip1. */
    hashTable[hash1] = (U32)(ip1 - base);

    goto _offset;
}

if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) {
    /* Write next hash table entry, since it's already calculated */
    if (step <= 4) {
        /* Avoid writing an index if it's >= position where search will resume.
         * The minimum possible match has length 4, so search can resume at ip0 + 4.
         */
        hashTable[hash1] = (U32)(ip1 - base);
    }
    goto _offset;
}
_cleanup:
/* Note that there are probably still a couple positions one could search.
 * However, it seems to be a meaningful performance hit to try to search
 * them. So let's not. */
/* When the repcodes are outside of the prefix, we set them to zero before the loop.
 * When the offsets are still zero, we need to restore them after the block to have a correct
 * repcode history. If only one offset was invalid, it is easy. The tricky case is when both
 * offsets were invalid. We need to figure out which offset to refill with.
 *     - If both offsets are zero they are in the same order.
 *     - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`.
 *     - If only one is zero, we need to decide which offset to restore.
 *         - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1.
 *         - It is impossible for rep_offset2 to be non-zero.
 *
 * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then
 * set rep[0] = rep_offset1 and rep[1] = offsetSaved1.
 */
offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2;
/* save reps for next block */
rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1;
rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2;
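/* Worked example with hypothetical values: suppose rep[0] == 8 was out of range
 * at block start, so offsetSaved1 == 8 and rep_offset1 was zeroed. If the block
 * then emits a regular match with offset 20, rep_offset1 == 20 and rep_offset2 == 0
 * at this point, so the lines above produce rep[0] == 20 and rep[1] == 8:
 * the old repcode is kept as the second candidate for the next block. */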
/* Return the last literals size */
return (size_t)(iend - anchor);
/* Fill table and check for immediate repcode. */
if (ip0 <= ilimit) {
    /* Fill Table */
assert(base+current0+2 > istart); /* check base overflow */
hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */
hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
/* if a dictionary is still attached, it necessarily means that
 * it is within window size. So we just check it. */
const U32 maxDistance = 1U << cParams->windowLog;
const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
assert(endIndex - prefixStartIndex <= maxDistance);
(void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */
(void)hasStep; /* not currently specialized on whether it's accelerated */
/* ensure there will be no underflow
* when translating a dict index into a local index */
assert(prefixStartIndex >= (U32)(dictEnd - dictBase));
/* switch to "regular" variant if extDict is invalidated due to maxDistance */ if (prefixStartIndex == dictStartIndex) return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);
_cleanup:
/* Note that there are probably still a couple positions we could search.
 * However, it seems to be a meaningful performance hit to try to search
 * them. So let's not. */
/* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
* rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
/* save reps for next block */
rep[0] = offset_1 ? offset_1 : offsetSaved1;
rep[1] = offset_2 ? offset_2 : offsetSaved2;
/* Return the last literals size */
return (size_t)(iend - anchor);
/* write next hash table entry */
if (ip1 < ip0) {
hashTable[hash1] = (U32)(ip1 - base);
}
/* Fill table and check for immediate repcode. */
if (ip0 <= ilimit) {
    /* Fill Table */
assert(base+current0+2 > istart); /* check base overflow */
hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */
hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);