// Dump an aliases = [...] key for properties with aliases void dumpPropertyAliases(UProperty uproperty, FILE* f) { int i = U_LONG_PROPERTY_NAME + 1;
while(true) { // The API works by having extra names after U_LONG_PROPERTY_NAME, sequentially, // and returning null after that constchar* alias = u_getPropertyName(uproperty, static_cast<UPropertyNameChoice>(i)); if (!alias) { break;
} if (i == U_LONG_PROPERTY_NAME + 1) {
fprintf(f, "aliases = [\"%s\"", alias);
} else {
fprintf(f, ", \"%s\"", alias);
}
i++;
} if (i != U_LONG_PROPERTY_NAME + 1) {
fprintf(f, "]\n");
}
}
// If the value exists, dump an indented entry of the format // `" {discr = <discriminant>, long = <longname>, short = <shortname>, aliases = [<aliases>]},"` void dumpValueEntry(UProperty uproperty, int v, bool is_mask, FILE* f) { constchar* fullValueName = u_getPropertyValueName(uproperty, v, U_LONG_PROPERTY_NAME); constchar* shortValueName = u_getPropertyValueName(uproperty, v, U_SHORT_PROPERTY_NAME); if (!fullValueName) { return;
} if (is_mask) {
fprintf(f, " {discr = 0x%X", v);
} else {
fprintf(f, " {discr = %i", v);
}
fprintf(f, ", long = \"%s\"", fullValueName); if (shortValueName) {
fprintf(f, ", short = \"%s\"", shortValueName);
} int i = U_LONG_PROPERTY_NAME + 1; while(true) { // The API works by having extra names after U_LONG_PROPERTY_NAME, sequentially, // and returning null after that constchar* alias = u_getPropertyValueName(uproperty, v, static_cast<UPropertyNameChoice>(i)); if (!alias) { break;
} if (i == U_LONG_PROPERTY_NAME + 1) {
fprintf(f, ", aliases = [\"%s\"", alias);
} else {
fprintf(f, ", \"%s\"", alias);
}
i++;
} if (i != U_LONG_PROPERTY_NAME + 1) {
fprintf(f, "]");
}
fprintf(f, "},\n");
}
/* * Export Bidi_Mirroring_Glyph values (code points) in a similar way to how enumerated * properties are dumped to file. * Note: the data will store 0 for code points without a value defined for * Bidi_Mirroring_Glyph.
*/ void dumpBidiMirroringGlyph(FILE* f) {
UProperty uproperty = UCHAR_BIDI_MIRRORING_GLYPH;
IcuToolErrorCode status("icuexportdata: dumpBidiMirroringGlyph"); constchar* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME); constchar* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
handleError(status, __LINE__, fullPropName);
// Store 21-bit code point as is
UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_32;
// note: unlike dumpEnumeratedProperty, which can get inversion map data using // u_getIntPropertyMap(uproperty), the only reliable way to get Bidi_Mirroring_Glyph // is to use u_charMirror(cp) over the code point space.
LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status)); for(UChar32 c = UCHAR_MIN_VALUE; c <= UCHAR_MAX_VALUE; c++) {
UChar32 mirroringGlyph = u_charMirror(c); // The trie builder code throws an error when it cannot compress the data sufficiently. // Therefore, when the value is undefined for a code point, keep a 0 in the trie // instead of the ICU API behavior of returning the code point value. Using 0 // results in a relatively significant space savings by not including redundant data. if (c != mirroringGlyph) {
umutablecptrie_set(builder.getAlias(), c, mirroringGlyph, status);
}
}
// currently a trie and inversion map are the same (as relied upon in characterproperties.cpp) const UCPMap* umap = reinterpret_cast<UCPMap *>(utrie.getAlias());
// After printing property value `v`, print `mask` if and only if `mask` comes immediately // after the property in the listing void maybeDumpMaskValue(UProperty uproperty, uint32_t v, uint32_t mask, FILE* f) { if (U_MASK(v) < mask && U_MASK(v + 1) > mask)
dumpValueEntry(uproperty, mask, true, f);
}
fprintf(f, "values = [\n"); for (uint32_t v = minValue; v <= maxValue; v++) {
dumpValueEntry(uproperty, U_MASK(v), true, f);
// We want to dump these masks "in order", which means they // should come immediately after every property they contain
maybeDumpMaskValue(uproperty, v, U_GC_L_MASK, f);
maybeDumpMaskValue(uproperty, v, U_GC_LC_MASK, f);
maybeDumpMaskValue(uproperty, v, U_GC_M_MASK, f);
maybeDumpMaskValue(uproperty, v, U_GC_N_MASK, f);
maybeDumpMaskValue(uproperty, v, U_GC_Z_MASK, f);
maybeDumpMaskValue(uproperty, v, U_GC_C_MASK, f);
maybeDumpMaskValue(uproperty, v, U_GC_P_MASK, f);
maybeDumpMaskValue(uproperty, v, U_GC_S_MASK, f);
}
fprintf(f, "]\n");
}
// We want to use 16 bits for our exported trie of sc/scx data because we // need 12 bits to match the 12 bits of data stored for sc/scx in the trie // in the uprops.icu data file.
UCPTrieValueWidth scWidth = UCPTRIE_VALUE_BITS_16;
// The values for the output scx companion array. // Invariant is that all subvectors are distinct.
std::vector< std::vector<uint16_t> > outputDedupVec;
// The sc/scx companion array is an array of arrays (of script codes)
fputs("script_code_array = [\n", f); for(const UChar32 cp : scxCodePoints) { // Get the Script value
uint32_t scVal = umutablecptrie_get(builder.getAlias(), cp); // Get the Script_Extensions value (array of Script codes) const int32_t SCX_ARRAY_CAPACITY = 32;
UScriptCode scxValArray[SCX_ARRAY_CAPACITY];
int32_t numScripts = uscript_getScriptExtensions(cp, scxValArray, SCX_ARRAY_CAPACITY, status);
handleError(status, __LINE__, scxFullPropName);
// Convert the scx array into a vector
std::vector<uint16_t> scxValVec; for(int i = 0; i < numScripts; i++) {
scxValVec.push_back(scxValArray[i]);
} // Ensure that it is sorted
std::sort(scxValVec.begin(), scxValVec.end()); // Copy the Script value into the first position of the scx array only // if we have the "other" case (Script value is not Common nor Inherited). // This offers faster access when users want only the Script value. if (scVal != USCRIPT_COMMON && scVal != USCRIPT_INHERITED) {
scxValVec.insert(scxValVec.begin(), scVal);
}
// See if there is already an scx value array matching the newly built one. // If there is, then use its index. // If not, then append the new value array. bool isScxValUnique = true;
size_t outputIndex = 0; for (outputIndex = 0; outputIndex < outputDedupVec.size(); outputIndex++) { if (outputDedupVec[outputIndex] == scxValVec) {
isScxValUnique = false; break;
}
}
// We must update the value in the UCPTrie for the code point to contain: // 9..0 the Script code in the lower 10 bits when 11..10 is 0, else it is // the index into the companion array // 11..10 the same higher-order 2 bits in the trie in uprops.icu indicating whether // 3: other // 2: Script=Inherited // 1: Script=Common // 0: Script=value in 9..0 (N/A because we are in this loop to create the companion array for non-0 cases)
uint16_t mask = 0; if (scVal == USCRIPT_COMMON) {
mask = DATAEXPORT_SCRIPT_X_WITH_COMMON;
} elseif (scVal == USCRIPT_INHERITED) {
mask = DATAEXPORT_SCRIPT_X_WITH_INHERITED;
} else {
mask = DATAEXPORT_SCRIPT_X_WITH_OTHER;
}
// The new trie value is the index into the new array with the high order bits set
uint32_t newScVal = outputIndex | mask;
// Update the code point in the mutable trie builder with the trie value
umutablecptrie_set(builder.getAlias(), cp, newScVal, status);
handleError(status, __LINE__, scxFullPropName);
}
fputs("]\n\n", f); // Print the TOML close delimiter for the outer array.
// Convert from mutable trie builder to immutable trie.
LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
builder.getAlias(),
trieType,
scWidth,
status));
handleError(status, __LINE__, scxFullPropName);
// Zero is a magic number that means the character decomposes to itself.
LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
// Iterate backwards to insert lower code points in the trie first in case it matters // for trie block allocation. for (int32_t i = pendingTrieInsertions.size() - 1; i >= 0; --i) { const PendingDescriptor& pending = pendingTrieInsertions[i];
uint32_t additional = 0; if (!(pending.descriptor & 0xFFFC0000)) {
uint32_t offset = pending.descriptor & 0xFFF; if (!pending.supplementary) { if (offset >= baseSize16) { // This is a offset to supplementary 16-bit data. We have // 16-bit base data and 32-bit base data before. However, // the 16-bit base data length is already part of offset.
additional = baseSize32;
}
} else { if (offset >= baseSize32) { // This is an offset to supplementary 32-bit data. We have 16-bit // base data, 32-bit base data, and 16-bit supplementary data before. // However, the 32-bit base data length is already part // of offset.
additional = baseSize16 + supplementSize16;
} else { // This is an offset to 32-bit base data. We have 16-bit // base data before.
additional = baseSize16;
}
} if (offset + additional > 0xFFF) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
}
} // It turns out it's better to swap the halves compared to the initial // idea in order to put special marker values close to zero so that // an important marker value becomes 1, so it's efficient to compare // "1 or 0". Unfortunately, going through all the code to swap // things is too error prone, so let's do the swapping here in one // place.
uint32_t oldTrieValue = pending.descriptor + additional;
uint32_t swappedTrieValue = (oldTrieValue >> 16) | (oldTrieValue << 16);
umutablecptrie_set(builder.getAlias(), pending.scalar, swappedTrieValue, status);
}
LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
builder.getAlias(),
trieType,
UCPTRIE_VALUE_BITS_32,
status));
handleError(status, __LINE__, basename);
if (reference) { if (uset_contains(reference, 0xFF9E) || uset_contains(reference, 0xFF9F) || !uset_contains(reference, 0x0345)) { // NFD expectations don't hold. The set must not contain the half-width // kana voicing marks and must contain iota subscript.
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
}
USet* halfWidthCheck = uset_cloneAsThawed(uset);
uset_removeAll(halfWidthCheck, reference); if (uset_equals(halfWidthCheck, halfWidthVoicing)) {
flags |= 1;
} elseif (!uset_isEmpty(halfWidthCheck)) { // The result was neither empty nor contained exactly // the two half-width voicing marks. The ICU4X // normalizer doesn't know how to deal with this case.
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
}
uset_close(halfWidthCheck);
USet* iotaCheck = uset_cloneAsThawed(reference);
uset_removeAll(iotaCheck, uset); if (!(uset_equals(iotaCheck, iotaSubscript)) && !uset_isEmpty(iotaCheck)) { // The result was neither empty nor contained exactly // the iota subscript. The ICU4X normalizer doesn't // know how to deal with this case.
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
}
// Special marker for the NFKD form of U+FDFA const int32_t FDFA_MARKER = 3;
// Special marker for characters whose decomposition starts with a non-starter // and the decomposition isn't the character itself. const int32_t SPECIAL_NON_STARTER_DECOMPOSITION_MARKER = 2;
// Special marker for starters that decompose to themselves but that may // combine backwards under canonical composition const int32_t BACKWARD_COMBINING_STARTER_MARKER = 1;
/// Marker that a complex decomposition isn't round-trippable /// under re-composition. /// /// TODO: When taking a data format break, swap this around with /// `BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER`. const uint32_t NON_ROUND_TRIP_MARKER = 1;
/// Marker that a complex decomposition starts with a starter /// that can combine backwards. /// /// TODO: When taking a data format break, swap this around with /// `NON_ROUND_TRIP_MARKER` to use the same bit as with characters /// that decompose to self but can combine backwards. const uint32_t BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER = 2;
UBool permissibleBmpPair(UBool knownToRoundTrip, UChar32 c, UChar32 second) { if (knownToRoundTrip) { returntrue;
} // Nuktas, Hebrew presentation forms and polytonic Greek with oxia // are special-cased in ICU4X. if (c >= 0xFB1D && c <= 0xFB4E) { // Hebrew presentation forms returntrue;
} if (c >= 0x1F71 && c <= 0x1FFB) { // Polytonic Greek with oxia returntrue;
} if ((second & 0x7F) == 0x3C && second >= 0x0900 && second <= 0x0BFF) { // Nukta returntrue;
} // To avoid more branchiness, 4 characters that decompose to // a BMP starter followed by a BMP non-starter are excluded // from being encoded directly into the trie value and are // handled as complex decompositions instead. These are: // U+0F76 TIBETAN VOWEL SIGN VOCALIC R // U+0F78 TIBETAN VOWEL SIGN VOCALIC L // U+212B ANGSTROM SIGN // U+2ADC FORKING returnfalse;
}
// Find the slice `needle` within `storage` and return its index, failing which, // append all elements of `needle` to `storage` and return the index of it at the end. template<typename T>
size_t findOrAppend(std::vector<T>& storage, const UChar32* needle, size_t needleLen) { // Last index where we might find the start of the complete needle. // bounds check is `i + needleLen <= storage.size()` since the inner // loop will range from `i` to `i + needleLen - 1` (the `-1` is why we use `<=`) for (size_t i = 0; i + needleLen <= storage.size(); i++) { for (size_t j = 0;; j++) { if (j == needleLen) { return i; // found a match
} if (storage[i + j] != static_cast<uint32_t>(needle[j])) { break;
}
}
} // We didn't find anything. Append, keeping the append index in mind.
size_t index = storage.size(); for(size_t i = 0; i < needleLen; i++) {
storage.push_back(static_cast<T>(needle[i]));
}
// Max length as of Unicode 14 is 4 for NFD. For NFKD the max // is 18 (U+FDFA; special-cased), and the next longest is 8 (U+FDFB). const int32_t LONGEST_ENCODABLE_LENGTH_16 = 9; const int32_t LONGEST_ENCODABLE_LENGTH_32 = 8; const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
UChar32 utf32[DECOMPOSITION_BUFFER_SIZE]; const int32_t RAW_DECOMPOSITION_BUFFER_SIZE = 2;
UChar32 rawUtf32[RAW_DECOMPOSITION_BUFFER_SIZE];
// Iterate over all scalar values excluding Hangul syllables. // // We go backwards in order to better find overlapping decompositions. // // As of Unicode 14: // Iterate forward without overlap search: // nfd: 16 size: 896, 32 size: 173 // nfkd: 16 size: 3854, 32 size: 179 // // Iterate forward with overlap search: // nfd: 16 size: 888, 32 size: 173 // nfkd: 16 size: 3266, 32 size: 179 // // Iterate backward with overlap search: // nfd: 16 size: 776, 32 size: 173 // nfkd: 16 size: 2941, 32 size: 179 // // UChar32 is signed! for (UChar32 c = 0x10FFFF; c >= 0; --c) { if (c >= 0xAC00 && c <= 0xD7A3) { // Hangul syllable continue;
} if (c >= 0xD800 && c < 0xE000) { // Surrogate continue;
}
UnicodeString src;
UnicodeString dst; // True if we're building non-NFD or we're building NFD but // the `c` round trips to NFC. // False if we're building NFD and `c` does not round trip to NFC.
UBool nonNfdOrRoundTrips = true;
src.append(c); if (mainNormalizer != nfdNormalizer) {
UnicodeString inter;
mainNormalizer->normalize(src, inter, status);
nfdNormalizer->normalize(inter, dst, status);
} else {
nfdNormalizer->normalize(src, dst, status);
UnicodeString nfc;
nfcNormalizer->normalize(dst, nfc, status);
nonNfdOrRoundTrips = (src == nfc);
} if (uts46) { // Work around https://unicode-org.atlassian.net/browse/ICU-22658 // TODO: Remove the workaround after data corresponding to // https://www.unicode.org/L2/L2024/24061.htm#179-C36 lands // for Unicode 16. switch (c) { case 0x2F868:
dst.truncate(0);
dst.append(static_cast<UChar32>(0x36FC)); break; case 0x2F874:
dst.truncate(0);
dst.append(static_cast<UChar32>(0x5F53)); break; case 0x2F91F:
dst.truncate(0);
dst.append(static_cast<UChar32>(0x243AB)); break; case 0x2F95F:
dst.truncate(0);
dst.append(static_cast<UChar32>(0x7AEE)); break; case 0x2F9BF:
dst.truncate(0);
dst.append(static_cast<UChar32>(0x45D7)); break;
}
}
int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) { if (!uts46) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
}
} if (len > DECOMPOSITION_BUFFER_SIZE) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
}
uint8_t firstCombiningClass = u_getCombiningClass(utf32[0]); bool specialNonStarterDecomposition = false; bool startsWithBackwardCombiningStarter = false; if (firstCombiningClass) {
decompositionPassthroughBound = c;
compositionPassthroughBound = c;
uset_add(decompositionStartsWithNonStarter, c); if (src != dst) { if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F) {
specialNonStarterDecomposition = true;
} else { // A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X.
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
}
}
} elseif (uset_contains(backwardCombiningStarters, utf32[0])) {
compositionPassthroughBound = c;
startsWithBackwardCombiningStarter = true;
uset_add(decompositionStartsWithBackwardCombiningStarter, c);
} if (c != BACKWARD_COMBINING_STARTER_MARKER && len == 1 && utf32[0] == BACKWARD_COMBINING_STARTER_MARKER) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
} if (c != SPECIAL_NON_STARTER_DECOMPOSITION_MARKER && len == 1 && utf32[0] == SPECIAL_NON_STARTER_DECOMPOSITION_MARKER) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
} if (c != FDFA_MARKER && len == 1 && utf32[0] == FDFA_MARKER) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
} if (mainNormalizer != nfdNormalizer) {
UnicodeString nfd;
nfdNormalizer->normalize(src, nfd, status); if (dst == nfd) { continue;
}
decompositionPassthroughBound = c;
compositionPassthroughBound = c;
} elseif (firstCombiningClass) {
len = 1; if (specialNonStarterDecomposition) {
utf32[0] = SPECIAL_NON_STARTER_DECOMPOSITION_MARKER; // magic value
} else { // Use the surrogate range to store the canonical combining class
utf32[0] = 0xD800 | static_cast<UChar32>(firstCombiningClass);
}
} else { if (src == dst) { if (startsWithBackwardCombiningStarter) {
pendingTrieInsertions.push_back({c, BACKWARD_COMBINING_STARTER_MARKER << 16, false});
} continue;
}
decompositionPassthroughBound = c; // ICU4X hard-codes ANGSTROM SIGN if (c != 0x212B) {
UnicodeString raw; if (!nfdNormalizer->getRawDecomposition(c, raw)) { // We're always supposed to have a non-recursive decomposition // if we had a recursive one.
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
} // In addition to actual difference, put the whole range that contains characters // with oxia into the non-recursive trie in order to catch cases where characters // with oxia have singleton decompositions to corresponding characters with tonos. // This way, the run-time decision to fall through can be done on the range // without checking for individual characters inside the range. if (raw != dst || (c >= 0x1F71 && c <= 0x1FFB)) {
int32_t rawLen = raw.toUTF32(rawUtf32, RAW_DECOMPOSITION_BUFFER_SIZE, status); if (!rawLen) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
} if (rawLen == 1) { if (c >= 0xFFFF) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
}
umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, static_cast<uint32_t>(rawUtf32[0]), status);
} elseif (rawUtf32[0] <= 0xFFFF && rawUtf32[1] <= 0xFFFF) { if (!rawUtf32[0] || !rawUtf32[1]) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
} // Swapped for consistency with the primary trie
uint32_t bmpPair = static_cast<uint32_t>(rawUtf32[1]) << 16 | static_cast<uint32_t>(rawUtf32[0]);
umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, bmpPair, status);
} else { // Let's add 1 to index to make it always non-zero to distinguish // it from the default zero.
uint32_t index = nonRecursive32.size() + 1;
nonRecursive32.push_back(static_cast<uint32_t>(rawUtf32[0]));
nonRecursive32.push_back(static_cast<uint32_t>(rawUtf32[1])); if (index > 0xFFFF) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
}
umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, index << 16, status);
}
}
}
} if (!nonNfdOrRoundTrips) {
compositionPassthroughBound = c;
} if (!len) { if (!uts46) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
}
pendingTrieInsertions.push_back({c, 0xFFFFFFFF, false});
} elseif (len == 1 && ((utf32[0] >= 0x1161 && utf32[0] <= 0x1175) || (utf32[0] >= 0x11A8 && utf32[0] <= 0x11C2))) { // Singleton decompositions to conjoining jamo. if (mainNormalizer == nfdNormalizer) { // Not supposed to happen in NFD
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
}
pendingTrieInsertions.push_back({c, static_cast<uint32_t>(utf32[0]) << 16, false});
} elseif (!startsWithBackwardCombiningStarter && len == 1 && utf32[0] <= 0xFFFF) {
pendingTrieInsertions.push_back({c, static_cast<uint32_t>(utf32[0]) << 16, false});
} elseif (!startsWithBackwardCombiningStarter &&
len == 2 &&
utf32[0] <= 0xFFFF &&
utf32[1] <= 0xFFFF &&
!u_getCombiningClass(utf32[0]) &&
u_getCombiningClass(utf32[1]) &&
permissibleBmpPair(nonNfdOrRoundTrips, c, utf32[1])) { for (int32_t i = 0; i < len; ++i) { if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) { // Assert that iota subscript and half-width voicing marks never occur in these // expansions in the normalization forms where they are special.
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
}
}
pendingTrieInsertions.push_back({c, (static_cast<uint32_t>(utf32[0]) << 16) | static_cast<uint32_t>(utf32[1]), false});
} else {
UBool supplementary = false;
UBool nonInitialStarter = false; for (int32_t i = 0; i < len; ++i) { if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) { // Assert that iota subscript and half-width voicing marks never occur in these // expansions in the normalization forms where they are special.
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
}
if (utf32[i] > 0xFFFF) {
supplementary = true;
} if (utf32[i] == 0) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
} if (i != 0 && !u_getCombiningClass(utf32[i])) {
nonInitialStarter = true;
}
} if (len == 1) { // The format doesn't allow for length 1 for BMP, // so if these ever occur, they need to be promoted // to wider storage. As of Unicode 16 alpha, this // case does not arise.
supplementary = true;
} if (!supplementary) { if (len > LONGEST_ENCODABLE_LENGTH_16 || !len || len == 1) { if (len == 18 && c == 0xFDFA) { // Special marker for the one character whose decomposition // is too long.
pendingTrieInsertions.push_back({c, FDFA_MARKER << 16, supplementary}); continue;
} else {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
}
}
} elseif (len > LONGEST_ENCODABLE_LENGTH_32 || !len) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
} // Complex decomposition // Format for 16-bit value: // 15..13: length minus two for 16-bit case and length minus one for // the 32-bit case. Length 8 needs to fit in three bits in // the 16-bit case, and this way the value is future-proofed // up to 9 in the 16-bit case. Zero is unused and length one // in the 16-bit case goes directly into the trie. // 12: 1 if all trailing characters are guaranteed non-starters, // 0 if no guarantees about non-starterness. // Note: The bit choice is this way around to allow for // dynamically falling back to not having this but instead // having one more bit for length by merely choosing // different masks. // 11..0: Start offset in storage. The offset is to the logical // sequence of scalars16, scalars32, supplementary_scalars16, // supplementary_scalars32.
uint32_t descriptor = static_cast<uint32_t>(!nonInitialStarter) << 12; if (!supplementary) {
descriptor |= (static_cast<uint32_t>(len) - 2) << 13;
} else {
descriptor |= (static_cast<uint32_t>(len) - 1) << 13;
} if (descriptor & 0xFFF) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
}
size_t index = 0; if (!supplementary) {
index = findOrAppend(storage16, utf32, len);
} else {
index = findOrAppend(storage32, utf32, len);
} if (index > 0xFFF) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
}
descriptor |= static_cast<uint32_t>(index); if (!descriptor || descriptor > 0xFFFF) { // > 0xFFFF should never happen if the code above is correct. // == 0 should not happen due to the nature of the data.
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, basename);
}
uint32_t nonRoundTripMarker = 0; if (!nonNfdOrRoundTrips) {
nonRoundTripMarker = (NON_ROUND_TRIP_MARKER << 16);
}
uint32_t canCombineBackwardsMarker = 0; if (startsWithBackwardCombiningStarter) {
canCombineBackwardsMarker = (BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER << 16);
}
pendingTrieInsertions.push_back({c, descriptor | nonRoundTripMarker | canCombineBackwardsMarker, supplementary});
}
} if (storage16.size() + storage32.size() > 0xFFF) {
status.set(U_INTERNAL_PROGRAM_ERROR);
} if (f) {
usrc_writeArray(f, "scalars32 = [\n ", nonRecursive32.data(), 32, nonRecursive32.size(), " ", "\n]\n");
void printHelp(FILE* stdfile, constchar* program) {
fprintf(stdfile, "usage: %s -m mode [-options] [--all | properties...]\n" "\tdump Unicode property data to .toml files\n" "options:\n" "\t-h or -? or --help this usage text\n" "\t-V or --version show a version message\n" "\t-m or --mode mode: currently only 'uprops', 'ucase', and 'norm', but more may be added\n" "\t --trie-type set the trie type (small or fast, default small)\n" "\t-d or --destdir destination directory, followed by the path\n" "\t --all write out all properties known to icuexportdata\n" "\t --index write an _index.toml summarizing all data exported\n" "\t-c or --copyright include a copyright notice\n" "\t-v or --verbose Turn on verbose output\n" "\t-q or --quiet do not display warnings and progress\n",
program);
}
int exportUprops(int argc, char* argv[]) { // Load list of Unicode properties
std::vector<constchar*> propNames; for (int i=1; i<argc; i++) {
propNames.push_back(argv[i]);
} if (options[OPT_ALL].doesOccur) { int i = UCHAR_BINARY_START; while (true) { if (i == UCHAR_BINARY_LIMIT) {
i = UCHAR_INT_START;
} if (i == UCHAR_INT_LIMIT) {
i = UCHAR_GENERAL_CATEGORY_MASK;
} if (i == UCHAR_GENERAL_CATEGORY_MASK + 1) {
i = UCHAR_BIDI_MIRRORING_GLYPH;
} if (i == UCHAR_BIDI_MIRRORING_GLYPH + 1) {
i = UCHAR_SCRIPT_EXTENSIONS;
} if (i == UCHAR_SCRIPT_EXTENSIONS + 1) { break;
}
UProperty uprop = static_cast<UProperty>(i); constchar* propName = u_getPropertyName(uprop, U_SHORT_PROPERTY_NAME); if (propName == nullptr) {
propName = u_getPropertyName(uprop, U_LONG_PROPERTY_NAME); if (propName != nullptr && VERBOSE) {
std::cerr << "Note: falling back to long name for: " << propName << std::endl;
}
} if (propName != nullptr) {
propNames.push_back(propName);
} else {
std::cerr << "Warning: Could not find name for: " << uprop << std::endl;
}
i++;
}
}
if (propNames.empty()
|| options[OPT_HELP_H].doesOccur
|| options[OPT_HELP_QUESTION_MARK].doesOccur
|| !options[OPT_MODE].doesOccur) {
FILE *stdfile=argc<0 ? stderr : stdout;
fprintf(stdfile, "usage: %s -m uprops [-options] [--all | properties...]\n" "\tdump Unicode property data to .toml files\n" "options:\n" "\t-h or -? or --help this usage text\n" "\t-V or --version show a version message\n" "\t-m or --mode mode: currently only 'uprops', but more may be added\n" "\t --trie-type set the trie type (small or fast, default small)\n" "\t-d or --destdir destination directory, followed by the path\n" "\t --all write out all properties known to icuexportdata\n" "\t --index write an _index.toml summarizing all data exported\n" "\t-c or --copyright include a copyright notice\n" "\t-v or --verbose Turn on verbose output\n" "\t-q or --quiet do not display warnings and progress\n",
argv[0]); return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
}
constchar* mode = options[OPT_MODE].value; if (uprv_strcmp(mode, "uprops") != 0) {
fprintf(stderr, "Invalid option for --mode (must be uprops)\n"); return U_ILLEGAL_ARGUMENT_ERROR;
}
if (options[OPT_TRIE_TYPE].doesOccur) { if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) {
trieType = UCPTRIE_TYPE_FAST;
} elseif (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) {
trieType = UCPTRIE_TYPE_SMALL;
} else {
fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n"); return U_ILLEGAL_ARGUMENT_ERROR;
}
}
if (options[OPT_INDEX].doesOccur) {
FILE* f = prepareOutputFile("_index");
fprintf(f, "index = [\n"); for (constchar* propName : propNames) { // At this point, propName is a valid property name, so it should be alphanum ASCII
fprintf(f, " { filename=\"%s.toml\" },\n", propName);
}
fprintf(f, "]\n");
fclose(f);
}
// Note: the USets are not exported. They are only used to check that a new // Unicode version doesn't violate expectations that are hard-coded in ICU4X.
USet* nfdDecompositionStartsWithNonStarter = uset_openEmpty();
USet* nfdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
std::vector<PendingDescriptor> nfdPendingTrieInsertions;
UChar32 nfdBound = 0x10FFFF;
UChar32 nfcBound = 0x10FFFF;
computeDecompositions("nfd",
backwardCombiningStarters,
storage16,
storage32,
nfdDecompositionStartsWithNonStarter,
nfdDecompositionStartsWithBackwardCombiningStarter,
nfdPendingTrieInsertions,
nfdBound,
nfcBound); if (!(nfdBound == 0xC0 && nfcBound == 0x300)) { // Unexpected bounds for NFD/NFC.
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, __LINE__, "exportNorm");
}