/** * Collation binary data reader.
*/ struct U_I18N_API CollationDataReader /* all static */ { // The following constants are also copied into source/common/ucol_swp.cpp. // Keep them in sync! enum { /** * Number of int32_t indexes. * * Can be 2 if there are only options. * Can be 7 or 8 if there are only options and a script reordering. * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.
*/
IX_INDEXES_LENGTH, // 0 /** * Bits 31..24: numericPrimary, for numeric collation * 23..16: fast Latin format version (0 = no fast Latin table) * 15.. 0: options bit set
*/
IX_OPTIONS,
IX_RESERVED2,
IX_RESERVED3,
/** Array offset to Jamo CE32s in ce32s[], or <0 if none. */
IX_JAMO_CE32S_START, // 4
// Byte offsets from the start of the data, after the generic header. // The indexes[] are at byte offset 0, other data follows. // Each data item is aligned properly. // The data items should be in descending order of unit size, // to minimize the need for padding. // Each item's byte length is given by the difference between its offset and // the next index/offset value. /** Byte offset to int32_t reorderCodes[]. */
IX_REORDER_CODES_OFFSET, /** * Byte offset to uint8_t reorderTable[]. * Empty table if <256 bytes (padding only). * Otherwise 256 bytes or more (with padding).
*/
IX_REORDER_TABLE_OFFSET, /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */
IX_TRIE_OFFSET,
IX_RESERVED8_OFFSET, // 8 /** Byte offset to int64_t ces[]. */
IX_CES_OFFSET,
IX_RESERVED10_OFFSET, /** Byte offset to uint32_t ce32s[]. */
IX_CE32S_OFFSET,
/** Byte offset to uint32_t rootElements[]. */
IX_ROOT_ELEMENTS_OFFSET, // 12 /** Byte offset to char16_t *contexts[]. */
IX_CONTEXTS_OFFSET, /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */
IX_UNSAFE_BWD_OFFSET, /** Byte offset to uint16_t fastLatinTable[]. */
IX_FAST_LATIN_TABLE_OFFSET,
/** Byte offset to uint16_t scripts[]. */
IX_SCRIPTS_OFFSET, // 16 /** * Byte offset to UBool compressibleBytes[]. * Empty table if <256 bytes (padding only). * Otherwise 256 bytes or more (with padding).
*/
IX_COMPRESSIBLE_BYTES_OFFSET,
IX_RESERVED18_OFFSET,
IX_TOTAL_SIZE
};
private:
CollationDataReader() = delete; // no constructor
};
/*
 * Format of collation data (ucadata.icu, binary data in coll/ *.res files).
 * Format version 5.
 *
 * The root collation data is stored in the ucadata.icu file.
 * Tailorings are stored inside .res resource bundle files, with a complete file header.
 *
 * Collation data begins with a standard ICU data file header
 * (DataHeader, see ucmndata.h and unicode/udata.h).
 * The UDataInfo.dataVersion field contains the UCA and other version numbers,
 * see the comments for CollationTailoring.version.
 *
 * After the header, the file contains the following parts.
 * Constants are defined as enum values of the CollationDataReader class.
 * See also the Collation class.
 *
 * int32_t indexes[indexesLength];
 *      The indexes array has variable length.
 *      Some tailorings only need the length and the options,
 *      others only add reorderCodes and the reorderTable,
 *      some need to store mappings.
 *      Only as many indexes are stored as needed to read all of the data.
 *
 *      Index 0: indexesLength
 *      Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS
 *      Index 2..3: Unused/reserved/0.
 *      Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo
 *               are stored in a short, contiguous part of the ce32s array.
 *
 *      Indexes 5..19 are byte offsets in ascending order.
 *      Each byte offset marks the start of the next part in the data file,
 *      and the end of the previous one.
 *      When two consecutive byte offsets are the same (or too short),
 *      then the corresponding part is empty.
 *      Byte offsets are offsets from after the header,
 *      that is, from the beginning of the indexes[].
 *      Each part starts at an offset with proper alignment for its data.
 *      If necessary, the previous part may include padding bytes to achieve this alignment.
 *      The last byte offset that is stored in the indexes indicates the total size of the data
 *      (starting with the indexes).
 *
 * int32_t reorderCodes[]; -- empty in root
 *      The list of script and reordering codes.
 *
 *      Beginning with format version 5, this array may optionally
 *      have trailing entries with a full list of reorder ranges
 *      as described for CollationSettings::reorderRanges.
 *
 *      Script or reorder codes are first and do not exceed 16-bit values.
 *      Range limits are stored in the upper 16 bits, and are never 0.
 *      Split this array into reorder codes and ranges at the first entry
 *      with non-zero upper 16 bits.
 *
 *      If the ranges are missing but needed for split-reordered primary lead bytes,
 *      then they are regenerated at load time.
 *
 * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes
 *      Primary-weight lead byte permutation table.
 *      Normally present when the reorderCodes are, but can be built at load time.
 *
 *      Beginning with format version 5, a 0 entry at a non-zero index
 *      (which is otherwise an illegal value)
 *      means that the primary lead byte is "split"
 *      (there are different offsets for primaries that share that lead byte)
 *      and the reordering offset must be determined via the reorder ranges
 *      that are either stored as part of the reorderCodes array
 *      or regenerated at load time.
 *
 * UTrie2 trie; -- see utrie2_impl.h and utrie2.h
 *      The trie holds the main collation data. Each code point is mapped to a 32-bit value.
 *      It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set,
 *      in which case it is a special CE32 and contains a 4-bit tag and further data.
 *      See the Collation class for details.
 *
 *      The trie has a value for each lead surrogate code unit with some bits encoding
 *      collective properties of the 1024 supplementary characters whose UTF-16 form starts with
 *      the lead surrogate. See Collation::LEAD_SURROGATE_TAG.
 *
 * int64_t ces[];
 *      64-bit CEs and expansions that cannot be stored in a more compact form.
 *
 * uint32_t ce32s[];
 *      CE32s for expansions in compact form, and for characters whose trie values
 *      contain special data.
 *
 * uint32_t rootElements[]; -- empty in all tailorings
 *      Compact storage for all of the CEs that occur in the root collation.
 *      See the CollationRootElements class.
 *
 * char16_t *contexts[];
 *      Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings.
 *
 * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize()
 *      Serialized form of characters that are unsafe when iterating backwards,
 *      and at the end of an identical string prefix.
 *      Back up to a safe character.
 *      Lead surrogates are "unsafe" when any of their corresponding supplementary
 *      code points are unsafe.
 *      Does not include [:^lccc=0:][:^tccc=0:].
 *      For each tailoring, the root unsafeBackwardSet is subtracted.
 *      (As a result, in many tailorings no set needs to be stored.)
 *
 * uint16_t fastLatinTable[];
 *      Optional optimization for Latin text.
 *      See the CollationFastLatin class.
 *
 * uint16_t scripts[]; -- empty in all tailorings
 *      Format version 5:
 *      uint16_t numScripts;
 *      uint16_t scriptsIndex[numScripts+16];
 *      uint16_t scriptStarts[];
 *      See CollationData::numScripts etc.
 *
 *      Format version 4:
 *      Table of the reordering groups with their first and last lead bytes,
 *      and their script and reordering codes.
 *      See CollationData::scripts.
 *
 * UBool compressibleBytes[]; -- empty in all tailorings
 *      Flag for getSortKey(), indicating primary weight lead bytes that are compressible.
 *
 * -----------------
 * Changes for formatVersion 5 (ICU 55)
 *
 * Reordering moves single scripts, not groups of scripts.
 * Reorder ranges are optionally appended to the reorderCodes,
 * and a 0 entry in the reorderTable indicates a split lead byte.
 * The scripts data has a new format.
 *
 * The rootElements may contain secondary and tertiary weights below common=05.
 * (Used for small Hiragana letters.)
 * Where this occurs, there is also an explicit unit with common secondary & tertiary weights.
 * There are no other data structure changes, but builder code needs to be able to handle such data.
 *
 * The collation element for the merge separator code point U+FFFE
 * does not necessarily have special, unique secondary/tertiary weights any more.
 */
/*
 * Website disclaimer (not part of the original source file):
 * The information on this website has been carefully compiled to the best of
 * our knowledge. However, no guarantee is given as to the completeness,
 * correctness, or quality of the information provided.
 *
 * Note:
 * The syntax highlighting and the measurement are still experimental.
 */