// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses.
*/
/* * Returns true if we should use ncount=-1 else we should * use ncount=1 for low probability symbols instead.
*/ staticunsigned ZSTD_useLowProbCount(size_t const nbSeq)
{ /* Heuristic: This should cover most blocks <= 16K and * start to fade out after 16K to about 32K depending on * compressibility.
*/ return nbSeq >= 2048;
}
/* * Returns the cost in bytes of encoding the normalized count header. * Returns an error if any of the helper functions return an error.
*/ static size_t ZSTD_NCountCost(unsignedconst* count, unsignedconst max,
size_t const nbSeq, unsignedconst FSELog)
{
BYTE wksp[FSE_NCOUNTBOUND];
S16 norm[MaxSeq + 1]; const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max);
FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq, max, ZSTD_useLowProbCount(nbSeq)), ""); return FSE_writeNCount(wksp, sizeof(wksp), norm, max, tableLog);
}
/* * Returns the cost in bits of encoding the distribution described by count * using the entropy bound.
*/ static size_t ZSTD_entropyCost(unsignedconst* count, unsignedconst max, size_t const total)
{ unsigned cost = 0; unsigned s;
/* * Returns the cost in bits of encoding the distribution in count using ctable. * Returns an error if ctable cannot represent all the symbols in count.
*/
size_t ZSTD_fseBitCost(
FSE_CTable const* ctable, unsignedconst* count, unsignedconst max)
{ unsignedconst kAccuracyLog = 8;
size_t cost = 0; unsigned s;
FSE_CState_t cstate;
FSE_initCState(&cstate, ctable); if (ZSTD_getFSEMaxSymbolValue(ctable) < max) {
DEBUGLOG(5, "Repeat FSE_CTable has maxSymbolValue %u < %u",
ZSTD_getFSEMaxSymbolValue(ctable), max); return ERROR(GENERIC);
} for (s = 0; s <= max; ++s) { unsignedconst tableLog = cstate.stateLog; unsignedconst badCost = (tableLog + 1) << kAccuracyLog; unsignedconst bitCost = FSE_bitCost(cstate.symbolTT, tableLog, s, kAccuracyLog); if (count[s] == 0) continue; if (bitCost >= badCost) {
DEBUGLOG(5, "Repeat FSE_CTable has Prob[%u] == 0", s); return ERROR(GENERIC);
}
cost += (size_t)count[s] * bitCost;
} return cost >> kAccuracyLog;
}
/* * Returns the cost in bits of encoding the distribution in count using the * table described by norm. The max symbol support by norm is assumed >= max. * norm must be valid for every symbol with non-zero probability in count.
*/
size_t ZSTD_crossEntropyCost(shortconst* norm, unsigned accuracyLog, unsignedconst* count, unsignedconst max)
{ unsignedconst shift = 8 - accuracyLog;
size_t cost = 0; unsigned s;
assert(accuracyLog <= 8); for (s = 0; s <= max; ++s) { unsignedconst normAcc = (norm[s] != -1) ? (unsigned)norm[s] : 1; unsignedconst norm256 = normAcc << shift;
assert(norm256 > 0);
assert(norm256 < 256);
cost += count[s] * kInverseProbabilityLog256[norm256];
} return cost >> 8;
}
SymbolEncodingType_e
ZSTD_selectEncodingType(
FSE_repeat* repeatMode, unsignedconst* count, unsignedconst max,
size_t const mostFrequent, size_t nbSeq, unsignedconst FSELog,
FSE_CTable const* prevCTable, shortconst* defaultNorm, U32 defaultNormLog,
ZSTD_DefaultPolicy_e const isDefaultAllowed,
ZSTD_strategy const strategy)
{
ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0); if (mostFrequent == nbSeq) {
*repeatMode = FSE_repeat_none; if (isDefaultAllowed && nbSeq <= 2) { /* Prefer set_basic over set_rle when there are 2 or fewer symbols, * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. * If basic encoding isn't possible, always choose RLE.
*/
DEBUGLOG(5, "Selected set_basic"); return set_basic;
}
DEBUGLOG(5, "Selected set_rle"); return set_rle;
} if (strategy < ZSTD_lazy) { if (isDefaultAllowed) {
size_t const staticFse_nbSeq_max = 1000;
size_t const mult = 10 - strategy;
size_t const baseLog = 3;
size_t const dynamicFse_nbSeq_min = (((size_t)1 << defaultNormLog) * mult) >> baseLog; /* 28-36 for offset, 56-72 for lengths */
assert(defaultNormLog >= 5 && defaultNormLog <= 6); /* xx_DEFAULTNORMLOG */
assert(mult <= 9 && mult >= 7); if ( (*repeatMode == FSE_repeat_valid)
&& (nbSeq < staticFse_nbSeq_max) ) {
DEBUGLOG(5, "Selected set_repeat"); return set_repeat;
} if ( (nbSeq < dynamicFse_nbSeq_min)
|| (mostFrequent < (nbSeq >> (defaultNormLog-1))) ) {
DEBUGLOG(5, "Selected set_basic"); /* The format allows default tables to be repeated, but it isn't useful. * When using simple heuristics to select encoding type, we don't want * to confuse these tables with dictionaries. When running more careful * analysis, we don't need to waste time checking both repeating tables * and default tables.
*/
*repeatMode = FSE_repeat_none; return set_basic;
}
}
} else {
size_t const basicCost = isDefaultAllowed ? ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, count, max) : ERROR(GENERIC);
size_t const repeatCost = *repeatMode != FSE_repeat_none ? ZSTD_fseBitCost(prevCTable, count, max) : ERROR(GENERIC);
size_t const NCountCost = ZSTD_NCountCost(count, max, nbSeq, FSELog);
size_t const compressedCost = (NCountCost << 3) + ZSTD_entropyCost(count, max, nbSeq);
DEBUGLOG(6, "ZSTD_encodeSequences: flushing ML state with %u bits", stateMatchLength.stateLog);
FSE_flushCState(&blockStream, &stateMatchLength);
DEBUGLOG(6, "ZSTD_encodeSequences: flushing Off state with %u bits", stateOffsetBits.stateLog);
FSE_flushCState(&blockStream, &stateOffsetBits);
DEBUGLOG(6, "ZSTD_encodeSequences: flushing LL state with %u bits", stateLitLength.stateLog);
FSE_flushCState(&blockStream, &stateLitLength);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.