/** * Build-time context and CE32 for a code point. * If a code point has contextual mappings, then the default (no-context) mapping * and all conditional mappings are stored in a singly-linked list * of ConditionalCE32, sorted by context strings. * * Context strings sort by prefix length, then by prefix, then by contraction suffix. * Context strings must be unique and in ascending order.
*/ struct ConditionalCE32 : public UMemory {
ConditionalCE32()
: context(),
ce32(0), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32),
next(-1) {}
ConditionalCE32(const UnicodeString &ct, uint32_t ce)
: context(ct),
ce32(ce), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32),
next(-1) {}
/** * "\0" for the first entry for any code point, with its default CE32. * * Otherwise one unit with the length of the prefix string, * then the prefix string, then the contraction suffix.
*/
UnicodeString context; /** * CE32 for the code point and its context. * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag).
*/
uint32_t ce32; /** * Default CE32 for all contexts with this same prefix. * Initially NO_CE32. Set only while building runtime data structures, * and only on one of the nodes of a sub-list with the same prefix.
*/
uint32_t defaultCE32; /** * CE32 for the built contexts. * When fetching CEs from the builder, the contexts are built into their runtime form * so that the normal collation implementation can process them. * The result is cached in the list head. It is reset when the contexts are modified. * All of these builtCE32 are invalidated by clearContexts(), * via incrementing the contextsEra.
*/
uint32_t builtCE32; /** * The "era" of building intermediate contexts when the above builtCE32 was set. * When the array of cached, temporary contexts overflows, then clearContexts() * removes them all and invalidates the builtCE32 that used to point to built tries.
*/
int32_t era = -1; /** * Index of the next ConditionalCE32. * Negative for the end of the list.
*/
int32_t next; // Note: We could create a separate class for all of the contextual mappings for // a code point, with the builtCE32, the era, and a list of the actual mappings. // The class that represents one mapping would then not need to // store those fields in each element.
};
/** * Build-time collation element and character iterator. * Uses the runtime CollationIterator for fetching CEs for a string * but reads from the builder's unfinished data structures. * In particular, this class reads from the unfinished trie * and has to avoid CollationIterator::nextCE() and redirect other * calls to data->getCE32() and data->getCE32FromSupplementary(). * * We do this so that we need not implement the collation algorithm * again for the builder and make it behave exactly like the runtime code. * That would be more difficult to test and maintain than this indirection. * * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data, * so the data accesses from those code paths need not be modified. * * This class iterates directly over whole code points * so that the CollationIterator does not need the finished trie * for handling the LEAD_SURROGATE_TAG.
*/ class DataBuilderCollationIterator : public CollationIterator { public:
DataBuilderCollationIterator(CollationDataBuilder &b);
// For a tailoring, the default is to fall back to the base. // For ICU4X, use the same value for fallback as for the default // to avoid having to have different blocks for the two.
trie = utrie2_open(Collation::FALLBACK_CE32, icu4xMode ? Collation::FALLBACK_CE32 : Collation::FFFD_CE32, &errorCode);
if (!icu4xMode) { // Set the Latin-1 letters block so that it is allocated first in the data array, // to try to improve locality of reference when sorting Latin-1 text. // Do not use utrie2_setRange32() since that will not actually allocate blocks // that are filled with the default value. // ASCII (0..7F) is already preallocated anyway. for(UChar32 c = 0xc0; c <= 0xff; ++c) {
utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode);
}
// Hangul syllables are not tailorable (except via tailoring Jamos). // Always set the Hangul tag to help performance. // Do this here, rather than in buildMappings(), // so that we see the HANGUL_TAG in various assertions.
uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, true, &errorCode);
// Copy the set contents but don't copy/clone the set as a whole because // that would copy the isFrozen state too.
unsafeBackwardSet.addAll(*b->unsafeBackwardSet);
}
if(U_FAILURE(errorCode)) { return; }
}
/**
 * Tries to map the whole code point range start..end to a single OFFSET_TAG CE32.
 * The range is accepted only if 2 <= step <= 0x7f and the range crosses enough
 * 32-code-point trie block boundaries (see the heuristic inside).
 *
 * @param start     first code point of the range; must be <= end
 * @param end       last code point of the range (inclusive)
 * @param primary   primary weight, stored in the upper 32 bits of the data CE
 * @param step      step value, stored in the low bits of the data CE
 * @param errorCode ICU in/out error code; set to U_BUFFER_OVERFLOW_ERROR
 *                  if the data CE index exceeds Collation::MAX_INDEX
 * @return true if the range was written to the trie;
 *         false if the range was rejected or an error occurred
 */
UBool
CollationDataBuilder::maybeSetPrimaryRange(UChar32 start, UChar32 end,
                                           uint32_t primary, int32_t step,
                                           UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return false; }
    U_ASSERT(start <= end);
    // TODO: Do we need to check what values are currently set for start..end?
    // An offset range is worth it only if we can achieve an overlap between
    // adjacent UTrie2 blocks of 32 code points each.
    // An offset CE is also a little more expensive to look up and compute
    // than a simple CE.
    // If the range spans at least three UTrie2 block boundaries (> 64 code points),
    // then we take it.
    // If the range spans one or two block boundaries and there are
    // at least 4 code points on either side, then we take it.
    // (We could additionally require a minimum range length of, say, 16.)
    int32_t blockDelta = (end >> 5) - (start >> 5);
    if(2 <= step && step <= 0x7f &&
            (blockDelta >= 3 ||
            (blockDelta > 0 && (start & 0x1f) <= 0x1c && (end & 0x1f) >= 3))) {
        // Data CE layout: primary in the upper 32 bits, then start << 8, then step.
        int64_t dataCE = (static_cast<int64_t>(primary) << 32) | (start << 8) | step;
        if(isCompressiblePrimary(primary)) { dataCE |= 0x80; }
        int32_t index = addCE(dataCE, errorCode);
        if(U_FAILURE(errorCode)) { return false; }
        if(index > Collation::MAX_INDEX) {
            errorCode = U_BUFFER_OVERFLOW_ERROR;
            return false;
        }
        uint32_t offsetCE32 = Collation::makeCE32FromTagAndIndex(Collation::OFFSET_TAG, index);
        utrie2_setRange32(trie, start, end, offsetCE32, true, &errorCode);
        modified = true;
        return true;
    } else {
        return false;
    }
}
if (icu4xMode) { if (base && c >= 0x1100 && c < 0x1200) { // Omit jamo tailorings. // TODO(https://github.com/unicode-org/icu4x/issues/1941).
} const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(errorCode);
UnicodeString sInNfd;
nfdNormalizer->normalize(s, sInNfd, errorCode); if (s != sInNfd) { // s is not in NFD, so it cannot match in ICU4X, since ICU4X only // does NFD lookups.
// As of Unicode 16 alpha, the cases that come here are: // // 1. The second character is a special decomposing Tibetan vowel // sign. These are OK to ignore in the precomposed form, since // the decomposed form is added also. // 2. Likewise for KIRAT RAI VOWEL SIGN AA followed by KIRAT RAI VOWEL SIGN AI // and other such cases. // For details see the normalization section of // https://www.unicode.org/review/pri497/pri497-background.html // 3. U+FDD1 followed by U+AC00 is a marker for the alphabetical // index feature of ICU4C, which at this time does not have // a counterpart in ICU4X. return;
}
if (!prefix.isEmpty()) {
UnicodeString prefixInNfd;
nfdNormalizer->normalize(prefix, prefixInNfd, errorCode); if (prefix != prefixInNfd) {
errorCode = U_UNSUPPORTED_ERROR; return;
}
int32_t count = prefix.countChar32(); if (count > 2) { // Prefix too long for ICU4X.
errorCode = U_UNSUPPORTED_ERROR; return;
}
UChar32 utf32[4];
int32_t len = prefix.toUTF32(utf32, 4, errorCode); if (len != count) {
errorCode = U_INVALID_STATE_ERROR; return;
}
UChar32 c = utf32[0]; if (u_getCombiningClass(c)) { // Prefix must start with as starter for ICU4X.
errorCode = U_UNSUPPORTED_ERROR; return;
} // XXX: Korean searchjl has jamo in prefix, so commenting out this // check for now. ICU4X currently ignores non-root jamo tables anyway. // searchjl was added in // https://unicode-org.atlassian.net/browse/CLDR-3560 // Contractions were changed to prefixes in // https://unicode-org.atlassian.net/browse/CLDR-6546 // // if ((c >= 0x1100 && c < 0x1200) || (c >= 0xAC00 && c < 0xD7A4)) { // errorCode = U_UNSUPPORTED_ERROR; // return; // } if ((len > 1) && !(utf32[1] == 0x3099 || utf32[1] == 0x309A)) { // Second character in prefix, if present, must be a kana voicing mark for ICU4X.
errorCode = U_UNSUPPORTED_ERROR; return;
}
}
if (s.length() > cLength) { // Check that there's no modern Hangul in contractions. for (int32_t i = 0; i < s.length(); ++i) {
char16_t c = s.charAt(i); if ((c >= 0x1100 && c < 0x1100 + 19) || (c >= 0x1161 && c < 0x1161 + 21) || (c >= 0x11A7 && c < 0x11A7 + 28) || (c >= 0xAC00 && c < 0xD7A4)) {
errorCode = U_UNSUPPORTED_ERROR; return;
}
}
}
}
if(oldCE32 == Collation::FALLBACK_CE32) { // First tailoring for c. // If c has contextual base mappings or if we add a contextual mapping, // then copy the base mappings. // Otherwise we just override the base mapping.
uint32_t baseCE32 = base->getFinalCE32(base->getCE32(c)); if(hasContext || Collation::ce32HasContext(baseCE32)) {
oldCE32 = copyFromBaseCE32(c, baseCE32, true, errorCode);
utrie2_set32(trie, c, oldCE32, &errorCode); if(U_FAILURE(errorCode)) { return; }
}
} if(!hasContext) { // No prefix, no contraction. if(!isBuilderContextCE32(oldCE32)) {
utrie2_set32(trie, c, ce32, &errorCode);
} else {
ConditionalCE32 *cond = getConditionalCE32ForCE32(oldCE32);
cond->builtCE32 = Collation::NO_CE32;
cond->ce32 = ce32;
}
} else {
ConditionalCE32 *cond; if(!isBuilderContextCE32(oldCE32)) { // Replace the simple oldCE32 with a builder context CE32 // pointing to a new ConditionalCE32 list head.
int32_t index = addConditionalCE32(UnicodeString(static_cast<char16_t>(0)), oldCE32, errorCode); if(U_FAILURE(errorCode)) { return; }
uint32_t contextCE32 = makeBuilderContextCE32(index);
utrie2_set32(trie, c, contextCE32, &errorCode);
contextChars.add(c);
cond = getConditionalCE32(index);
} else {
cond = getConditionalCE32ForCE32(oldCE32);
cond->builtCE32 = Collation::NO_CE32;
}
UnicodeString suffix(s, cLength);
UnicodeString context(static_cast<char16_t>(prefix.length()));
context.append(prefix).append(suffix);
unsafeBackwardSet.addAll(suffix); for(;;) { // invariant: context > cond->context
int32_t next = cond->next; if(next < 0) { // Append a new ConditionalCE32 after cond.
int32_t index = addConditionalCE32(context, ce32, errorCode); if(U_FAILURE(errorCode)) { return; }
cond->next = index; break;
}
ConditionalCE32 *nextCond = getConditionalCE32(next);
int8_t cmp = context.compare(nextCond->context); if(cmp < 0) { // Insert a new ConditionalCE32 between cond and nextCond.
int32_t index = addConditionalCE32(context, ce32, errorCode); if(U_FAILURE(errorCode)) { return; }
cond->next = index;
getConditionalCE32(index)->next = next; break;
} elseif(cmp == 0) { // Same context as before, overwrite its ce32.
nextCond->ce32 = ce32; break;
}
cond = nextCond;
}
}
modified = true;
}
/**
 * Tries to encode one 64-bit CE as a single CE32 in one of three compact forms.
 *
 * @param ce the collation element to encode
 * @return the CE32 in normal, long-primary or long-secondary form,
 *         or Collation::NO_CE32 if the CE fits none of these forms
 */
uint32_t
CollationDataBuilder::encodeOneCEAsCE32(int64_t ce) {
    uint32_t p = static_cast<uint32_t>(ce >> 32);
    uint32_t lower32 = static_cast<uint32_t>(ce);
    uint32_t t = static_cast<uint32_t>(ce & 0xffff);
    U_ASSERT((t & 0xc000) != 0xc000);  // Impossible case bits 11 mark special CE32s.
    if((ce & INT64_C(0xffff00ff00ff)) == 0) {
        // normal form ppppsstt
        return p | (lower32 >> 16) | (t >> 8);
    } else if((ce & INT64_C(0xffffffffff)) == Collation::COMMON_SEC_AND_TER_CE) {
        // long-primary form ppppppC1
        return Collation::makeLongPrimaryCE32(p);
    } else if(p == 0 && (t & 0xff) == 0) {
        // long-secondary form ssssttC2
        return Collation::makeLongSecondaryCE32(lower32);
    }
    return Collation::NO_CE32;
}
/**
 * Encodes a single CE as a CE32.
 * Uses a compact single-CE32 form when encodeOneCEAsCE32() finds one;
 * otherwise registers the CE via addCE() and returns an EXPANSION_TAG CE32
 * of length 1 that refers to the resulting index.
 *
 * @param ce        the collation element to encode
 * @param errorCode ICU in/out error code; set to U_BUFFER_OVERFLOW_ERROR
 *                  if the index exceeds Collation::MAX_INDEX
 * @return the CE32, or 0 on error
 */
uint32_t
CollationDataBuilder::encodeOneCE(int64_t ce, UErrorCode &errorCode) {
    const uint32_t compactCE32 = encodeOneCEAsCE32(ce);
    if(compactCE32 != Collation::NO_CE32) {
        return compactCE32;
    }

    // No compact form: fall back to a one-element expansion.
    const int32_t ceIndex = addCE(ce, errorCode);
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    if(ceIndex > Collation::MAX_INDEX) {
        errorCode = U_BUFFER_OVERFLOW_ERROR;
        return 0;
    }
    return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, ceIndex, 1);
}
/**
 * Encodes a sequence of CEs as one CE32, choosing the most compact representation.
 *
 * @param ces       the collation elements
 * @param cesLength number of CEs; must be in 0..Collation::MAX_EXPANSION_LENGTH,
 *                  otherwise U_ILLEGAL_ARGUMENT_ERROR is set
 * @param errorCode ICU in/out error code; set to U_INVALID_STATE_ERROR
 *                  if the trie is missing or frozen
 * @return the CE32 for the sequence, or 0 on error
 */
uint32_t
CollationDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength,
                                UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    if(cesLength < 0 || cesLength > Collation::MAX_EXPANSION_LENGTH) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    if(trie == nullptr || utrie2_isFrozen(trie)) {
        errorCode = U_INVALID_STATE_ERROR;
        return 0;
    }
    if(cesLength == 0) {
        // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE.
        // Do this here so that callers need not do it.
        return encodeOneCEAsCE32(0);
    } else if(cesLength == 1) {
        return encodeOneCE(ces[0], errorCode);
    } else if(cesLength == 2 && !icu4xMode) {
        // Try to encode two CEs as one CE32.
        // Turn this off for ICU4X, because without the canonical closure
        // these are so rare that it doesn't make sense to spend a branch
        // on checking this tag when using the data.
        int64_t ce0 = ces[0];
        int64_t ce1 = ces[1];
        uint32_t p0 = static_cast<uint32_t>(ce0 >> 32);
        if((ce0 & INT64_C(0xffffffffff00ff)) == Collation::COMMON_SECONDARY_CE &&
                (ce1 & INT64_C(0xffffffff00ffffff)) == Collation::COMMON_TERTIARY_CE &&
                p0 != 0) {
            // Latin mini expansion
            return
                p0 |
                ((static_cast<uint32_t>(ce0) & 0xff00u) << 8) |
                static_cast<uint32_t>(ce1 >> 16) |
                Collation::SPECIAL_CE32_LOW_BYTE |
                Collation::LATIN_EXPANSION_TAG;
        }
    }
    // Try to encode two or more CEs as CE32s.
    int32_t newCE32s[Collation::MAX_EXPANSION_LENGTH];
    for(int32_t i = 0;; ++i) {
        if(i == cesLength) {
            // Every CE had a single-CE32 form.
            return encodeExpansion32(newCE32s, cesLength, errorCode);
        }
        uint32_t ce32 = encodeOneCEAsCE32(ces[i]);
        if(ce32 == Collation::NO_CE32) { break; }
        newCE32s[i] = static_cast<int32_t>(ce32);
    }
    // At least one CE needs its full 64 bits.
    return encodeExpansion(ces, cesLength, errorCode);
}
/**
 * Returns an EXPANSION_TAG CE32 for a sequence of 64-bit CEs.
 * An identical sequence already present in ce64s is reused;
 * otherwise the sequence is appended to ce64s.
 *
 * @param ces       the collation elements
 * @param length    number of CEs in the sequence
 * @param errorCode ICU in/out error code; set to U_BUFFER_OVERFLOW_ERROR
 *                  if the start index exceeds Collation::MAX_INDEX
 * @return the expansion CE32, or 0 on error
 */
uint32_t
CollationDataBuilder::encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return 0;
    }

    // Look for an existing copy of this sequence.
    const int64_t leadCE = ces[0];
    const int32_t lastStart = ce64s.size() - length;
    for(int32_t start = 0; start <= lastStart; ++start) {
        if(ce64s.elementAti(start) != leadCE) {
            continue;
        }
        if(start > Collation::MAX_INDEX) {
            errorCode = U_BUFFER_OVERFLOW_ERROR;
            return 0;
        }
        // The first element matches; compare the remaining ones.
        int32_t matched = 1;
        while(matched != length && ce64s.elementAti(start + matched) == ces[matched]) {
            ++matched;
        }
        if(matched == length) {
            return Collation::makeCE32FromTagIndexAndLength(
                    Collation::EXPANSION_TAG, start, length);
        }
    }

    // Not found: append the sequence at the end.
    const int32_t newStart = ce64s.size();
    if(newStart > Collation::MAX_INDEX) {
        errorCode = U_BUFFER_OVERFLOW_ERROR;
        return 0;
    }
    for(int32_t k = 0; k < length; ++k) {
        ce64s.addElement(ces[k], errorCode);
    }
    return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, newStart, length);
}
/**
 * Returns an EXPANSION32_TAG CE32 for a sequence of CE32s.
 * An identical sequence already present in ce32s is reused;
 * otherwise the sequence is appended to ce32s.
 *
 * @param newCE32s  the CE32 values
 * @param length    number of CE32s in the sequence
 * @param errorCode ICU in/out error code; set to U_BUFFER_OVERFLOW_ERROR
 *                  if the start index exceeds Collation::MAX_INDEX
 * @return the expansion CE32, or 0 on error
 */
uint32_t
CollationDataBuilder::encodeExpansion32(const int32_t newCE32s[], int32_t length,
                                        UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return 0;
    }

    // Look for an existing copy of this sequence.
    const int32_t leadCE32 = newCE32s[0];
    const int32_t lastStart = ce32s.size() - length;
    for(int32_t start = 0; start <= lastStart; ++start) {
        if(ce32s.elementAti(start) != leadCE32) {
            continue;
        }
        if(start > Collation::MAX_INDEX) {
            errorCode = U_BUFFER_OVERFLOW_ERROR;
            return 0;
        }
        // The first element matches; compare the remaining ones.
        int32_t matched = 1;
        while(matched != length && ce32s.elementAti(start + matched) == newCE32s[matched]) {
            ++matched;
        }
        if(matched == length) {
            return Collation::makeCE32FromTagIndexAndLength(
                    Collation::EXPANSION32_TAG, start, length);
        }
    }

    // Not found: append the sequence at the end.
    const int32_t newStart = ce32s.size();
    if(newStart > Collation::MAX_INDEX) {
        errorCode = U_BUFFER_OVERFLOW_ERROR;
        return 0;
    }
    for(int32_t k = 0; k < length; ++k) {
        ce32s.addElement(newCE32s[k], errorCode);
    }
    return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION32_TAG, newStart, length);
}
/**
 * Copies a base CE32 for code point c into this builder's data structures.
 * Non-special CE32s and self-contained special forms are returned unchanged;
 * expansions are re-encoded into this builder; prefix/contraction data is either
 * flattened into a ConditionalCE32 list (withContext) or reduced to its default mapping.
 *
 * @param c           the code point whose mapping is copied
 * @param ce32        the base CE32; expected to be a "final" CE32
 *                    (see the default: case)
 * @param withContext if true, contextual mappings are copied as a builder context CE32;
 *                    if false, only the no-context default is copied
 * @param errorCode   ICU in/out error code; set to U_UNSUPPORTED_ERROR for HANGUL_TAG
 * @return the CE32 to use in this builder, or 0 on error
 */
uint32_t
CollationDataBuilder::copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext,
                                       UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    if(!Collation::isSpecialCE32(ce32)) { return ce32; }
    switch(Collation::tagFromCE32(ce32)) {
    case Collation::LONG_PRIMARY_TAG:
    case Collation::LONG_SECONDARY_TAG:
    case Collation::LATIN_EXPANSION_TAG:
        // copy as is
        break;
    case Collation::EXPANSION32_TAG: {
        const uint32_t *baseCE32s = base->ce32s + Collation::indexFromCE32(ce32);
        int32_t length = Collation::lengthFromCE32(ce32);
        ce32 = encodeExpansion32(
            reinterpret_cast<const int32_t *>(baseCE32s), length, errorCode);
        break;
    }
    case Collation::EXPANSION_TAG: {
        const int64_t *baseCEs = base->ces + Collation::indexFromCE32(ce32);
        int32_t length = Collation::lengthFromCE32(ce32);
        ce32 = encodeExpansion(baseCEs, length, errorCode);
        break;
    }
    case Collation::PREFIX_TAG: {
        // Flatten prefixes and nested suffixes (contractions)
        // into a linear list of ConditionalCE32.
        const char16_t *p = base->contexts + Collation::indexFromCE32(ce32);
        ce32 = CollationData::readCE32(p);  // Default if no prefix match.
        if(!withContext) {
            return copyFromBaseCE32(c, ce32, false, errorCode);
        }
        // Temporary list head; only its next index is used.
        ConditionalCE32 head;
        UnicodeString context(static_cast<char16_t>(0));
        int32_t index;
        if(Collation::isContractionCE32(ce32)) {
            index = copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode);
        } else {
            ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
            head.next = index = addConditionalCE32(context, ce32, errorCode);
        }
        if(U_FAILURE(errorCode)) { return 0; }
        ConditionalCE32 *cond = getConditionalCE32(index);  // the last ConditionalCE32 so far
        UCharsTrie::Iterator prefixes(p + 2, 0, errorCode);
        while(prefixes.next(errorCode)) {
            // Build the context string: length unit, then the reversed trie string.
            context = prefixes.getString();
            context.reverse();
            context.insert(0, static_cast<char16_t>(context.length()));
            ce32 = static_cast<uint32_t>(prefixes.getValue());
            if(Collation::isContractionCE32(ce32)) {
                index = copyContractionsFromBaseCE32(context, c, ce32, cond, errorCode);
            } else {
                ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
                cond->next = index = addConditionalCE32(context, ce32, errorCode);
            }
            if(U_FAILURE(errorCode)) { return 0; }
            cond = getConditionalCE32(index);
        }
        ce32 = makeBuilderContextCE32(head.next);
        contextChars.add(c);
        break;
    }
    case Collation::CONTRACTION_TAG: {
        if(!withContext) {
            const char16_t *p = base->contexts + Collation::indexFromCE32(ce32);
            ce32 = CollationData::readCE32(p);  // Default if no suffix match.
            return copyFromBaseCE32(c, ce32, false, errorCode);
        }
        // Temporary list head; only its next index is used.
        ConditionalCE32 head;
        UnicodeString context(static_cast<char16_t>(0));
        copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode);
        // On failure head.next may still be -1; do not build a context CE32 from it.
        if(U_FAILURE(errorCode)) { return 0; }
        ce32 = makeBuilderContextCE32(head.next);
        contextChars.add(c);
        break;
    }
    case Collation::HANGUL_TAG:
        errorCode = U_UNSUPPORTED_ERROR;  // We forbid tailoring of Hangul syllables.
        break;
    case Collation::OFFSET_TAG:
        ce32 = getCE32FromOffsetCE32(true, c, ce32);
        break;
    case Collation::IMPLICIT_TAG:
        ce32 = encodeOneCE(Collation::unassignedCEFromCodePoint(c), errorCode);
        break;
    default:
        UPRV_UNREACHABLE_EXIT;  // require ce32 == base->getFinalCE32(ce32)
    }
    return ce32;
}
/**
 * Copies the contraction mappings behind a base contraction CE32 into
 * ConditionalCE32 nodes that are appended after cond.
 *
 * @param context   on entry the length unit plus prefix; each suffix is appended
 *                  temporarily and truncated again after its node has been added
 * @param c         the code point whose mappings are copied
 * @param ce32      the base contraction CE32
 * @param cond      the node after which the new nodes are linked
 * @param errorCode ICU in/out error code
 * @return the index of the last ConditionalCE32 added, or 0 on error
 */
int32_t
CollationDataBuilder::copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32,
                                                   ConditionalCE32 *cond, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    const char16_t *contractionData = base->contexts + Collation::indexFromCE32(ce32);
    int32_t lastIndex = -1;

    const UBool singleCpNoMatch = (ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0;
    if(singleCpNoMatch) {
        // The single code point has no mapping of its own here:
        // we are underneath a prefix, and the default mapping is merely
        // a fallback to the mappings for a shorter prefix.
        U_ASSERT(context.length() > 1);
    } else {
        // Copy the default mapping that applies when no suffix matches.
        uint32_t defaultCE32 = CollationData::readCE32(contractionData);
        U_ASSERT(!Collation::isContractionCE32(defaultCE32));
        defaultCE32 = copyFromBaseCE32(c, defaultCE32, true, errorCode);
        lastIndex = addConditionalCE32(context, defaultCE32, errorCode);
        cond->next = lastIndex;
        if(U_FAILURE(errorCode)) {
            return 0;
        }
        cond = getConditionalCE32(lastIndex);
    }

    // One node per contraction suffix, in trie iteration order.
    const int32_t contextBaseLength = context.length();
    UCharsTrie::Iterator suffixes(contractionData + 2, 0, errorCode);
    while(suffixes.next(errorCode)) {
        context.append(suffixes.getString());
        const uint32_t suffixCE32 =
            copyFromBaseCE32(c, static_cast<uint32_t>(suffixes.getValue()), true, errorCode);
        lastIndex = addConditionalCE32(context, suffixCE32, errorCode);
        cond->next = lastIndex;
        if(U_FAILURE(errorCode)) {
            return 0;
        }
        // The unsafeBackwardSet needs no update:
        // the tailoring set is already a copy of the base set.
        cond = getConditionalCE32(lastIndex);
        context.truncate(contextBaseLength);
    }
    U_ASSERT(lastIndex >= 0);
    return lastIndex;
}
void
CollationDataBuilder::copyFrom(const CollationDataBuilder &src, const CEModifier &modifier,
UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return; } if(trie == nullptr || utrie2_isFrozen(trie)) {
errorCode = U_INVALID_STATE_ERROR; return;
}
CopyHelper helper(src, *this, modifier, errorCode);
utrie2_enum(src.trie, nullptr, enumRangeForCopy, &helper);
errorCode = helper.errorCode; // Update the contextChars and the unsafeBackwardSet while copying, // in case a character had conditional mappings in the source builder // and they were removed later.
modified |= src.modified;
}
uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH];
int32_t jamoIndex = -1; if(getJamoCE32s(jamoCE32s, errorCode)) {
jamoIndex = ce32s.size(); for(int32_t i = 0; i < CollationData::JAMO_CE32S_LENGTH; ++i) {
ce32s.addElement(static_cast<int32_t>(jamoCE32s[i]), errorCode);
} // Small optimization: Use a bit in the Hangul ce32 // to indicate that none of the Jamo CE32s are isSpecialCE32() // (as it should be in the root collator). // It allows CollationIterator to avoid recursive function calls and per-Jamo tests. // In order to still have good trie compression and keep this code simple, // we only set this flag if a whole block of 588 Hangul syllables starting with // a common leading consonant (Jamo L) has this property.
UBool isAnyJamoVTSpecial = false; for(int32_t i = Hangul::JAMO_L_COUNT; i < CollationData::JAMO_CE32S_LENGTH; ++i) { if(Collation::isSpecialCE32(jamoCE32s[i])) {
isAnyJamoVTSpecial = true; break;
}
}
uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
UChar32 c = Hangul::HANGUL_BASE; for(int32_t i = 0; i < Hangul::JAMO_L_COUNT; ++i) { // iterate over the Jamo L
uint32_t ce32 = hangulCE32; if(!isAnyJamoVTSpecial && !Collation::isSpecialCE32(jamoCE32s[i])) {
ce32 |= Collation::HANGUL_NO_SPECIAL_JAMO;
}
UChar32 limit = c + Hangul::JAMO_VT_COUNT;
utrie2_setRange32(trie, c, limit - 1, ce32, true, &errorCode);
c = limit;
}
} else { // Copy the Hangul CE32s from the base in blocks per Jamo L, // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks. for(UChar32 c = Hangul::HANGUL_BASE; c < Hangul::HANGUL_LIMIT;) {
uint32_t ce32 = base->getCE32(c);
U_ASSERT(Collation::hasCE32Tag(ce32, Collation::HANGUL_TAG));
UChar32 limit = c + Hangul::JAMO_VT_COUNT;
utrie2_setRange32(trie, c, limit - 1, ce32, true, &errorCode);
c = limit;
}
}
if (!icu4xMode) { // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
ce32s.setElementAt(static_cast<int32_t>(utrie2_get32(trie, 0)), 0);
utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode);
}
// Mark each lead surrogate as "unsafe" // if any of its 1024 associated supplementary code points is "unsafe".
UChar32 c = 0x10000; for(char16_t lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) {
unsafeBackwardSet.add(lead);
}
}
unsafeBackwardSet.freeze();
/**
 * Discards all built context data and starts a new contexts "era".
 */
void
CollationDataBuilder::clearContexts() {
    // Bumping the era invalidates every builtCE32 cached before this call,
    // which is simpler than finding and resetting each of those fields.
    ++contextsEra;
    contexts.remove();
}
/**
 * Builds the context data for every code point in contextChars and
 * stores the resulting CE32 for each of them in the trie.
 *
 * @param errorCode ICU in/out error code; set to U_INTERNAL_PROGRAM_ERROR
 *                  if a code point in contextChars has no builder context CE32
 */
void
CollationDataBuilder::buildContexts(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return;
    }
    // Start from scratch: drop abandoned lists and ignore any cached builtCE32.
    clearContexts();

    UnicodeSetIterator iter(contextChars);
    for(;;) {
        if(U_FAILURE(errorCode) || !iter.next()) {
            break;
        }
        U_ASSERT(!iter.isString());
        const UChar32 c = iter.getCodepoint();
        const uint32_t builderCE32 = utrie2_get32(trie, c);
        if(!isBuilderContextCE32(builderCE32)) {
            // Impossible: No context data for c in contextChars.
            errorCode = U_INTERNAL_PROGRAM_ERROR;
            return;
        }
        ConditionalCE32 *listHead = getConditionalCE32ForCE32(builderCE32);
        const uint32_t builtCE32 = buildContext(listHead, errorCode);
        utrie2_set32(trie, c, builtCE32, &errorCode);
    }
}
/**
 * Builds the context data for one code point from its ConditionalCE32 list.
 * For each prefix, the contraction suffixes are collected into a contraction trie;
 * the per-prefix results are then collected into a prefix trie.
 *
 * @param head      the list head; must have no context and must be followed
 *                  by at least one node with context
 * @param errorCode ICU in/out error code; set to U_BUFFER_OVERFLOW_ERROR
 *                  if a context trie index exceeds Collation::MAX_INDEX
 * @return a CONTRACTION_TAG CE32 if there are only contractions without prefixes,
 *         otherwise a PREFIX_TAG CE32; 0 on error
 */
uint32_t
CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    // The list head must have no context.
    U_ASSERT(!head->hasContext());
    // The list head must be followed by one or more nodes that all do have context.
    U_ASSERT(head->next >= 0);
    UCharsTrieBuilder prefixBuilder(errorCode);
    UCharsTrieBuilder contractionBuilder(errorCode);
    // This outer loop goes from each prefix to the next.
    // For each prefix it finds the one or more same-prefix entries (firstCond..lastCond).
    // If there are multiple suffixes for the same prefix,
    // then an inner loop builds a contraction trie for them.
    for(ConditionalCE32 *cond = head;; cond = getConditionalCE32(cond->next)) {
        if(U_FAILURE(errorCode)) { return 0; }  // early out for memory allocation errors
        // After the list head, the prefix or suffix can be empty, but not both.
        U_ASSERT(cond == head || cond->hasContext());
        int32_t prefixLength = cond->prefixLength();
        UnicodeString prefix(cond->context, 0, prefixLength + 1);
        // Collect all contraction suffixes for one prefix.
        ConditionalCE32 *firstCond = cond;
        ConditionalCE32 *lastCond;
        do {
            lastCond = cond;
            // Clear the defaultCE32 fields as we go.
            // They are left over from building a previous version of this list of contexts.
            //
            // One of the code paths below may copy a preceding defaultCE32
            // into its emptySuffixCE32.
            // If a new suffix has been inserted before what used to be
            // the firstCond for its prefix, then that previous firstCond could still
            // contain an outdated defaultCE32 from an earlier buildContext() and
            // result in an incorrect emptySuffixCE32.
            // So we reset all defaultCE32 before reading and setting new values.
            cond->defaultCE32 = Collation::NO_CE32;
        } while(cond->next >= 0 &&
                (cond = getConditionalCE32(cond->next))->context.startsWith(prefix));
        uint32_t ce32;
        int32_t suffixStart = prefixLength + 1;  // == prefix.length()
        if(lastCond->context.length() == suffixStart) {
            // One prefix without contraction suffix.
            U_ASSERT(firstCond == lastCond);
            ce32 = lastCond->ce32;
            cond = lastCond;
        } else {
            // Build the contractions trie.
            contractionBuilder.clear();
            // Entry for an empty suffix, to be stored before the trie.
            uint32_t emptySuffixCE32 = 0;
            uint32_t flags = 0;
            if(firstCond->context.length() == suffixStart) {
                // There is a mapping for the prefix and the single character c. (p|c)
                // If no other suffix matches, then we return this value.
                emptySuffixCE32 = firstCond->ce32;
                cond = getConditionalCE32(firstCond->next);
            } else {
                // There is no mapping for the prefix and just the single character.
                // (There is no p|c, only p|cd, p|ce etc.)
                flags |= Collation::CONTRACT_SINGLE_CP_NO_MATCH;
                // When the prefix matches but none of the prefix-specific suffixes,
                // then we fall back to the mappings with the next-longest prefix,
                // and ultimately to mappings with no prefix.
                // Each fallback might be another set of contractions.
                // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c,
                // then in text "pch" we find the ch contraction.
                for(cond = head;; cond = getConditionalCE32(cond->next)) {
                    int32_t length = cond->prefixLength();
                    if(length == prefixLength) { break; }
                    if(cond->defaultCE32 != Collation::NO_CE32 &&
                            (length==0 || prefix.endsWith(cond->context, 1, length))) {
                        emptySuffixCE32 = cond->defaultCE32;
                    }
                }
                cond = firstCond;
            }
            // Optimization: Set a flag when
            // the first character of every contraction suffix has lccc!=0.
            // Short-circuits contraction matching when a normal letter follows.
            flags |= Collation::CONTRACT_NEXT_CCC;
            // Add all of the non-empty suffixes into the contraction trie.
            for(;;) {
                UnicodeString suffix(cond->context, suffixStart);
                uint16_t fcd16 = nfcImpl.getFCD16(suffix.char32At(0));
                if(fcd16 <= 0xff) {
                    flags &= ~Collation::CONTRACT_NEXT_CCC;
                }
                fcd16 = nfcImpl.getFCD16(suffix.char32At(suffix.length() - 1));
                if(fcd16 > 0xff) {
                    // The last suffix character has lccc!=0, allowing for discontiguous contractions.
                    flags |= Collation::CONTRACT_TRAILING_CCC;
                }
                if (icu4xMode && (flags & Collation::CONTRACT_HAS_STARTER) == 0) {
                    // Set the flag as soon as any suffix code point has combining class 0.
                    for (int32_t i = 0; i < suffix.length();) {
                        UChar32 c = suffix.char32At(i);
                        if (!u_getCombiningClass(c)) {
                            flags |= Collation::CONTRACT_HAS_STARTER;
                            break;
                        }
                        // Advance by one code point: two UTF-16 units for supplementary ones.
                        if (c > 0xFFFF) {
                            i += 2;
                        } else {
                            ++i;
                        }
                    }
                }
                contractionBuilder.add(suffix, static_cast<int32_t>(cond->ce32), errorCode);
                if(cond == lastCond) { break; }
                cond = getConditionalCE32(cond->next);
            }
            int32_t index = addContextTrie(emptySuffixCE32, contractionBuilder, errorCode);
            if(U_FAILURE(errorCode)) { return 0; }
            if(index > Collation::MAX_INDEX) {
                errorCode = U_BUFFER_OVERFLOW_ERROR;
                return 0;
            }
            ce32 = Collation::makeCE32FromTagAndIndex(Collation::CONTRACTION_TAG, index) | flags;
        }
        U_ASSERT(cond == lastCond);
        firstCond->defaultCE32 = ce32;
        if(prefixLength == 0) {
            if(cond->next < 0) {
                // No non-empty prefixes, only contractions.
                return ce32;
            }
        } else {
            prefix.remove(0, 1);  // Remove the length unit.
            prefix.reverse();
            prefixBuilder.add(prefix, static_cast<int32_t>(ce32), errorCode);
            if(cond->next < 0) { break; }
        }
    }
    U_ASSERT(head->defaultCE32 != Collation::NO_CE32);
    int32_t index = addContextTrie(head->defaultCE32, prefixBuilder, errorCode);
    if(U_FAILURE(errorCode)) { return 0; }
    if(index > Collation::MAX_INDEX) {
        errorCode = U_BUFFER_OVERFLOW_ERROR;
        return 0;
    }
    return Collation::makeCE32FromTagAndIndex(Collation::PREFIX_TAG, index);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.