// These methods are here, rather than in rulebasedcollator.cpp, // for modularization: // Most code using Collator does not need to build a Collator from rules. // By moving these constructors and helper methods to a separate file, // most code will not have a static dependency on the builder code.
int64_t node = nodes.elementAti(index); // If the index is for a "weaker" node, // then skip backwards over this and further "weaker" nodes. while(strengthFromNode(node) > strength) {
index = previousIndexFromNode(node);
node = nodes.elementAti(index);
}
// Find or insert a node whose index we will put into a temporary CE. if(strengthFromNode(node) == strength && isTailoredNode(node)) { // Reset to just before this same-strength tailored node.
index = previousIndexFromNode(node);
} elseif(strength == UCOL_PRIMARY) { // root primary node (has no previous index)
uint32_t p = weight32FromNode(node); if(p == 0) {
errorCode = U_UNSUPPORTED_ERROR;
parserErrorReason = "reset primary-before ignorable not possible"; return;
} if(p <= rootElements.getFirstPrimary()) { // There is no primary gap between ignorables and the space-first-primary.
errorCode = U_UNSUPPORTED_ERROR;
parserErrorReason = "reset primary-before first non-ignorable not supported"; return;
} if(p == Collation::FIRST_TRAILING_PRIMARY) { // We do not support tailoring to an unassigned-implicit CE.
errorCode = U_UNSUPPORTED_ERROR;
parserErrorReason = "reset primary-before [first trailing] not supported"; return;
}
p = rootElements.getPrimaryBefore(p, baseData->isCompressiblePrimary(p));
index = findOrInsertNodeForPrimary(p, errorCode); // Go to the last node in this list: // Tailor after the last node between adjacent root nodes. for(;;) {
node = nodes.elementAti(index);
int32_t nextIndex = nextIndexFromNode(node); if(nextIndex == 0) { break; }
index = nextIndex;
}
} else { // &[before 2] or &[before 3]
index = findCommonNode(index, UCOL_SECONDARY); if(strength >= UCOL_TERTIARY) {
index = findCommonNode(index, UCOL_TERTIARY);
} // findCommonNode() stayed on the stronger node or moved to // an explicit common-weight node of the reset-before strength.
node = nodes.elementAti(index); if(strengthFromNode(node) == strength) { // Found a same-strength node with an explicit weight.
uint32_t weight16 = weight16FromNode(node); if(weight16 == 0) {
errorCode = U_UNSUPPORTED_ERROR; if(strength == UCOL_SECONDARY) {
parserErrorReason = "reset secondary-before secondary ignorable not possible";
} else {
parserErrorReason = "reset tertiary-before completely ignorable not possible";
} return;
}
U_ASSERT(weight16 > Collation::BEFORE_WEIGHT16); // Reset to just before this node. // Insert the preceding same-level explicit weight if it is not there already. // Which explicit weight immediately precedes this one?
weight16 = getWeight16Before(index, node, strength); // Does this preceding weight have a node?
uint32_t previousWeight16;
int32_t previousIndex = previousIndexFromNode(node); for(int32_t i = previousIndex;; i = previousIndexFromNode(node)) {
node = nodes.elementAti(i);
int32_t previousStrength = strengthFromNode(node); if(previousStrength < strength) {
U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16 || i == previousIndex); // Either the reset element has an above-common weight and // the parent node provides the implied common weight, // or the reset element has a weight<=common in the node // right after the parent, and we need to insert the preceding weight.
previousWeight16 = Collation::COMMON_WEIGHT16; break;
} elseif(previousStrength == strength && !isTailoredNode(node)) {
previousWeight16 = weight16FromNode(node); break;
} // Skip weaker nodes and same-level tailored nodes.
} if(previousWeight16 == weight16) { // The preceding weight has a node, // maybe with following weaker or tailored nodes. // Reset to the last of them.
index = previousIndex;
} else { // Insert a node with the preceding weight, reset to that.
node = nodeFromWeight16(weight16) | nodeFromStrength(strength);
index = insertNodeBetween(previousIndex, index, node, errorCode);
}
} else { // Found a stronger node with implied strength-common weight.
uint32_t weight16 = getWeight16Before(index, node, strength);
index = findOrInsertWeakNode(index, weight16, strength, errorCode);
} // Strength of the temporary CE = strength of its reset position. // Code above raises an error if the before-strength is stronger.
strength = ceStrength(ces[cesLength - 1]);
} if(U_FAILURE(errorCode)) {
parserErrorReason = "inserting reset position for &[before n]"; return;
}
ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength);
}
/**
 * Returns the secondary or tertiary weight that precedes the weight of the
 * given node at the requested level.
 *
 * The node chain is followed backwards (via previousIndexFromNode()) to collect
 * the tertiary, secondary and primary weights of the CE that the node belongs to.
 * If a tailored node is encountered on the way, Collation::BEFORE_WEIGHT16 is
 * returned instead of a root-element lookup.
 *
 * @param index index of the node; only used as the starting point of the walk
 * @param node  the node at that index
 * @param level UCOL_SECONDARY, or otherwise the tertiary weight is looked up
 * @return the preceding weight for the level, or Collation::BEFORE_WEIGHT16
 */
uint32_t
CollationBuilder::getWeight16Before(int32_t index, int64_t node, int32_t level) {
    U_ASSERT(strengthFromNode(node) < level || !isTailoredNode(node));

    // Tertiary weight: explicit on a tertiary node,
    // otherwise the stronger node implies the common weight.
    uint32_t tertiary = Collation::COMMON_WEIGHT16;
    if(strengthFromNode(node) == UCOL_TERTIARY) {
        tertiary = weight16FromNode(node);
    }
    // Step back to the nearest node that is secondary or stronger.
    for(; strengthFromNode(node) > UCOL_SECONDARY; node = nodes.elementAti(index)) {
        index = previousIndexFromNode(node);
    }
    if(isTailoredNode(node)) {
        return Collation::BEFORE_WEIGHT16;
    }

    // Secondary weight: explicit on a secondary node,
    // otherwise the stronger node implies the common weight.
    uint32_t secondary = Collation::COMMON_WEIGHT16;
    if(strengthFromNode(node) == UCOL_SECONDARY) {
        secondary = weight16FromNode(node);
    }
    // Step back to the primary node.
    for(; strengthFromNode(node) > UCOL_PRIMARY; node = nodes.elementAti(index)) {
        index = previousIndexFromNode(node);
    }
    if(isTailoredNode(node)) {
        return Collation::BEFORE_WEIGHT16;
    }

    // [primary, secondary, tertiary] is a root CE.
    // Ask the root elements for the weight preceding it at the requested level.
    const uint32_t primary = weight32FromNode(node);
    if(level == UCOL_SECONDARY) {
        return rootElements.getSecondaryBefore(primary, secondary);
    }
    const uint32_t before = rootElements.getTertiaryBefore(primary, secondary, tertiary);
    U_ASSERT((before & ~Collation::ONLY_TERTIARY_MASK) == 0);
    return before;
}
/**
 * Maps a special reset position such as [first variable] or [last regular]
 * to the CE that a following relation should be tailored after.
 *
 * @param str two code units; the second one is
 *        CollationRuleParser::POS_BASE plus the position constant
 * @param parserErrorReason set to a static message when the position
 *        is not supported
 * @param errorCode set to U_UNSUPPORTED_ERROR for [last implicit] and to
 *        U_ILLEGAL_ARGUMENT_ERROR for [last trailing]; failures from the
 *        node lookups are passed through
 * @return a root CE, or a temporary CE (tempCEFromIndexAndStrength())
 *         when a tailored node is the actual reset position;
 *         0 for the tertiary-ignorable positions and on failure
 */
int64_t
CollationBuilder::getSpecialResetPosition(const UnicodeString &str,
                                          const char *&parserErrorReason,
                                          UErrorCode &errorCode) {
    U_ASSERT(str.length() == 2);
    int64_t ce;
    int32_t strength = UCOL_PRIMARY;
    UBool isBoundary = false;
    UChar32 pos = str.charAt(1) - CollationRuleParser::POS_BASE;
    U_ASSERT(0 <= pos && pos <= CollationRuleParser::LAST_TRAILING);
    switch(pos) {
    case CollationRuleParser::FIRST_TERTIARY_IGNORABLE:
        // Quaternary CEs are not supported.
        // Non-zero quaternary weights are possible only on tertiary or stronger CEs.
        return 0;
    case CollationRuleParser::LAST_TERTIARY_IGNORABLE:
        return 0;
    case CollationRuleParser::FIRST_SECONDARY_IGNORABLE: {
        // Look for a tailored tertiary node after [0, 0, 0].
        int32_t index = findOrInsertNodeForRootCE(0, UCOL_TERTIARY, errorCode);
        if(U_FAILURE(errorCode)) { return 0; }
        int64_t node = nodes.elementAti(index);
        if((index = nextIndexFromNode(node)) != 0) {
            node = nodes.elementAti(index);
            U_ASSERT(strengthFromNode(node) <= UCOL_TERTIARY);
            if(isTailoredNode(node) && strengthFromNode(node) == UCOL_TERTIARY) {
                return tempCEFromIndexAndStrength(index, UCOL_TERTIARY);
            }
        }
        return rootElements.getFirstTertiaryCE();
        // No need to look for nodeHasAnyBefore() on a tertiary node.
    }
    case CollationRuleParser::LAST_SECONDARY_IGNORABLE:
        ce = rootElements.getLastTertiaryCE();
        strength = UCOL_TERTIARY;
        break;
    case CollationRuleParser::FIRST_PRIMARY_IGNORABLE: {
        // Look for a tailored secondary node after [0, 0, *].
        int32_t index = findOrInsertNodeForRootCE(0, UCOL_SECONDARY, errorCode);
        if(U_FAILURE(errorCode)) { return 0; }
        int64_t node = nodes.elementAti(index);
        while((index = nextIndexFromNode(node)) != 0) {
            node = nodes.elementAti(index);
            strength = strengthFromNode(node);
            if(strength < UCOL_SECONDARY) { break; }
            if(strength == UCOL_SECONDARY) {
                if(isTailoredNode(node)) {
                    if(nodeHasBefore3(node)) {
                        // Skip two nodes ahead; the assertion below expects
                        // to land on a tailored node.
                        index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
                        U_ASSERT(isTailoredNode(nodes.elementAti(index)));
                    }
                    return tempCEFromIndexAndStrength(index, UCOL_SECONDARY);
                } else {
                    break;
                }
            }
        }
        ce = rootElements.getFirstSecondaryCE();
        strength = UCOL_SECONDARY;
        break;
    }
    case CollationRuleParser::LAST_PRIMARY_IGNORABLE:
        ce = rootElements.getLastSecondaryCE();
        strength = UCOL_SECONDARY;
        break;
    case CollationRuleParser::FIRST_VARIABLE:
        ce = rootElements.getFirstPrimaryCE();
        isBoundary = true;  // FractionalUCA.txt: FDD1 00A0, SPACE first primary
        break;
    case CollationRuleParser::LAST_VARIABLE:
        ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1);
        break;
    case CollationRuleParser::FIRST_REGULAR:
        ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1);
        isBoundary = true;  // FractionalUCA.txt: FDD1 263A, SYMBOL first primary
        break;
    case CollationRuleParser::LAST_REGULAR:
        // Use the Hani-first-primary rather than the actual last "regular" CE before it,
        // for backward compatibility with behavior before the introduction of
        // script-first-primary CEs in the root collator.
        ce = rootElements.firstCEWithPrimaryAtLeast(
            baseData->getFirstPrimaryForGroup(USCRIPT_HAN));
        break;
    case CollationRuleParser::FIRST_IMPLICIT:
        ce = baseData->getSingleCE(0x4e00, errorCode);
        break;
    case CollationRuleParser::LAST_IMPLICIT:
        // We do not support tailoring to an unassigned-implicit CE.
        errorCode = U_UNSUPPORTED_ERROR;
        parserErrorReason = "reset to [last implicit] not supported";
        return 0;
    case CollationRuleParser::FIRST_TRAILING:
        ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY);
        isBoundary = true;  // trailing first primary (there is no mapping for it)
        break;
    case CollationRuleParser::LAST_TRAILING:
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        parserErrorReason = "LDML forbids tailoring to U+FFFF";
        return 0;
    default:
        UPRV_UNREACHABLE_EXIT;
    }

    int32_t index = findOrInsertNodeForRootCE(ce, strength, errorCode);
    if(U_FAILURE(errorCode)) { return 0; }
    int64_t node = nodes.elementAti(index);
    if((pos & 1) == 0) {
        // even pos = [first xyz]
        if(!nodeHasAnyBefore(node) && isBoundary) {
            // A <group> first primary boundary is artificially added to FractionalUCA.txt.
            // It is reachable via its special contraction, but is not normally used.
            // Find the first character tailored after the boundary CE,
            // or the first real root CE after it.
            if((index = nextIndexFromNode(node)) != 0) {
                // If there is a following node, then it must be tailored
                // because there are no root CEs with a boundary primary
                // and non-common secondary/tertiary weights.
                node = nodes.elementAti(index);
                U_ASSERT(isTailoredNode(node));
                ce = tempCEFromIndexAndStrength(index, strength);
            } else {
                U_ASSERT(strength == UCOL_PRIMARY);
                uint32_t p = static_cast<uint32_t>(ce >> 32);
                int32_t pIndex = rootElements.findPrimary(p);
                UBool isCompressible = baseData->isCompressiblePrimary(p);
                p = rootElements.getPrimaryAfter(p, pIndex, isCompressible);
                ce = Collation::makeCE(p);
                index = findOrInsertNodeForRootCE(ce, UCOL_PRIMARY, errorCode);
                if(U_FAILURE(errorCode)) { return 0; }
                node = nodes.elementAti(index);
            }
        }
        if(nodeHasAnyBefore(node)) {
            // Get the first node that was tailored before this one at a weaker strength.
            if(nodeHasBefore2(node)) {
                index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
                node = nodes.elementAti(index);
            }
            if(nodeHasBefore3(node)) {
                index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
            }
            U_ASSERT(isTailoredNode(nodes.elementAti(index)));
            ce = tempCEFromIndexAndStrength(index, strength);
        }
    } else {
        // odd pos = [last xyz]
        // Find the last node that was tailored after the [last xyz]
        // at a strength no greater than the position's strength.
        for(;;) {
            int32_t nextIndex = nextIndexFromNode(node);
            if(nextIndex == 0) { break; }
            int64_t nextNode = nodes.elementAti(nextIndex);
            if(strengthFromNode(nextNode) < strength) { break; }
            index = nextIndex;
            node = nextNode;
        }
        // Do not make a temporary CE for a root node.
        // This last node might be the node for the root CE itself,
        // or a node with a common secondary or tertiary weight.
        if(isTailoredNode(node)) {
            ce = tempCEFromIndexAndStrength(index, strength);
        }
    }
    return ce;
}
// The runtime code decomposes Hangul syllables on the fly, // with recursive processing but without making the Jamo pieces visible for matching. // It does not work with certain types of contextual mappings.
int32_t nfdLength = nfdString.length(); if(nfdLength >= 2) {
char16_t c = nfdString.charAt(0); if(Hangul::isJamoL(c) || Hangul::isJamoV(c)) { // While handling a Hangul syllable, contractions starting with Jamo L or V // would not see the following Jamo of that syllable.
errorCode = U_UNSUPPORTED_ERROR;
parserErrorReason = "contractions starting with conjoining Jamo L or V not supported"; return;
}
c = nfdString.charAt(nfdLength - 1); if(Hangul::isJamoL(c) ||
(Hangul::isJamoV(c) && Hangul::isJamoL(nfdString.charAt(nfdLength - 2)))) { // A contraction ending with Jamo L or L+V would require // generating Hangul syllables in addTailComposites() (588 for a Jamo L), // or decomposing a following Hangul syllable on the fly, during contraction matching.
errorCode = U_UNSUPPORTED_ERROR;
parserErrorReason = "contractions ending with conjoining Jamo L or L+V not supported"; return;
} // A Hangul syllable completely inside a contraction is ok.
} // Note: If there is a prefix, then the parser checked that // both the prefix and the string begin with NFC boundaries (not Jamo V or T). // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0)) // (While handling a Hangul syllable, prefixes on Jamo V or T // would not see the previous Jamo of that syllable.)
if(strength != UCOL_IDENTICAL) { // Find the node index after which we insert the new tailored node.
int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode);
U_ASSERT(cesLength > 0);
int64_t ce = ces[cesLength - 1]; if (strength == UCOL_PRIMARY && !isTempCE(ce) && static_cast<uint32_t>(ce >> 32) == 0) { // There is no primary gap between ignorables and the space-first-primary.
errorCode = U_UNSUPPORTED_ERROR;
parserErrorReason = "tailoring primary after ignorables not supported"; return;
} if(strength == UCOL_QUATERNARY && ce == 0) { // The CE data structure does not support non-zero quaternary weights // on tertiary ignorables.
errorCode = U_UNSUPPORTED_ERROR;
parserErrorReason = "tailoring quaternary after tertiary ignorables not supported"; return;
} // Insert the new tailored node.
index = insertTailoredNodeAfter(index, strength, errorCode); if(U_FAILURE(errorCode)) {
parserErrorReason = "modifying collation elements"; return;
} // Strength of the temporary CE: // The new relation may yield a stronger CE but not a weaker one.
int32_t tempStrength = ceStrength(ce); if(strength < tempStrength) { tempStrength = strength; }
ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength);
}
// Find the last CE that is at least as "strong" as the requested difference. // Note: Stronger is smaller (UCOL_PRIMARY=0).
int64_t ce; for(;; --cesLength) { if(cesLength == 0) {
ce = ces[0] = 0;
cesLength = 1; break;
} else {
ce = ces[cesLength - 1];
} if(ceStrength(ce) <= strength) { break; }
}
if(isTempCE(ce)) { // No need to findCommonNode() here for lower levels // because insertTailoredNodeAfter() will do that anyway. return indexFromTempCE(ce);
}
// root CE if (static_cast<uint8_t>(ce >> 56) == Collation::UNASSIGNED_IMPLICIT_BYTE) {
errorCode = U_UNSUPPORTED_ERROR;
parserErrorReason = "tailoring relative to an unassigned code point not supported"; return 0;
} return findOrInsertNodeForRootCE(ce, strength, errorCode);
}
// Find or insert the node for each of the root CE's weights, // down to the requested level/strength. // Root CEs must have common=zero quaternary weights (for which we never insert any nodes).
U_ASSERT((ce & 0xc0) == 0);
int32_t index = findOrInsertNodeForPrimary(static_cast<uint32_t>(ce >> 32), errorCode); if(strength >= UCOL_SECONDARY) {
uint32_t lower32 = static_cast<uint32_t>(ce);
index = findOrInsertWeakNode(index, lower32 >> 16, UCOL_SECONDARY, errorCode); if(strength >= UCOL_TERTIARY) {
index = findOrInsertWeakNode(index, lower32 & Collation::ONLY_TERTIARY_MASK,
UCOL_TERTIARY, errorCode);
}
} return index;
}
namespace {
/**
 * Binary search for a primary weight among the root primary nodes.
 * Like Java Collections.binarySearch(List, key, Comparator).
 *
 * Each entry of rootPrimaryIndexes is an index into nodes; the primary weight
 * of a node is stored in its upper 32 bits. The entries must be sorted by
 * ascending primary weight.
 *
 * @param rootPrimaryIndexes sorted indexes of root primary nodes
 * @param length number of entries in rootPrimaryIndexes
 * @param nodes the node array that the indexes refer to
 * @param p the primary weight to look for
 * @return the index>=0 where the primary was found,
 *         or the index<0 for inserting the primary at ~index in sorted order
 *         (index into rootPrimaryIndexes)
 */
int32_t
binarySearchForRootPrimaryNode(const int32_t *rootPrimaryIndexes, int32_t length,
                               const int64_t *nodes, uint32_t p) {
    if(length == 0) { return ~0; }
    int32_t start = 0;
    int32_t limit = length;
    for (;;) {
        // start + (limit - start) / 2 yields the same midpoint as (start + limit) / 2
        // but cannot overflow int32_t.
        int32_t i = start + (limit - start) / 2;
        int64_t node = nodes[rootPrimaryIndexes[i]];
        uint32_t nodePrimary = static_cast<uint32_t>(node >> 32);  // weight32FromNode(node)
        if (p == nodePrimary) {
            return i;
        } else if (p < nodePrimary) {
            if (i == start) {
                return ~start;  // insert p before i
            }
            limit = i;
        } else {
            if (i == start) {
                return ~(start + 1);  // insert p after i
            }
            start = i;
        }
    }
}
// If this will be the first below-common weight for the parent node, // then we will also need to insert a common weight after it.
int64_t node = nodes.elementAti(index);
U_ASSERT(strengthFromNode(node) < level); // parent node is stronger if(weight16 != 0 && weight16 < Collation::COMMON_WEIGHT16) {
int32_t hasThisLevelBefore = level == UCOL_SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3; if((node & hasThisLevelBefore) == 0) { // The parent node has an implied level-common weight.
int64_t commonNode =
nodeFromWeight16(Collation::COMMON_WEIGHT16) | nodeFromStrength(level); if(level == UCOL_SECONDARY) { // Move the HAS_BEFORE3 flag from the parent node // to the new secondary common node.
commonNode |= node & HAS_BEFORE3;
node &= ~static_cast<int64_t>(HAS_BEFORE3);
}
nodes.setElementAt(node | hasThisLevelBefore, index); // Insert below-common-weight node.
int32_t nextIndex = nextIndexFromNode(node);
node = nodeFromWeight16(weight16) | nodeFromStrength(level);
index = insertNodeBetween(index, nextIndex, node, errorCode); // Insert common-weight node.
insertNodeBetween(index, nextIndex, commonNode, errorCode); // Return index of below-common-weight node. return index;
}
}
// Find the root CE's weight for this level. // Postpone insertion if not found: // Insert the new root node before the next stronger node, // or before the next root node with the same strength and a larger weight.
int32_t nextIndex; while((nextIndex = nextIndexFromNode(node)) != 0) {
node = nodes.elementAti(nextIndex);
int32_t nextStrength = strengthFromNode(node); if(nextStrength <= level) { // Insert before a stronger node. if(nextStrength < level) { break; } // nextStrength == level if(!isTailoredNode(node)) {
uint32_t nextWeight16 = weight16FromNode(node); if(nextWeight16 == weight16) { // Found the node for the root CE up to this level. return nextIndex;
} // Insert before a node with a larger same-strength weight. if(nextWeight16 > weight16) { break; }
}
} // Skip the next node.
index = nextIndex;
}
node = nodeFromWeight16(weight16) | nodeFromStrength(level); return insertNodeBetween(index, nextIndex, node, errorCode);
}
/**
 * Inserts a new tailored node of the given strength into the node list,
 * somewhere after the node at the given index.
 *
 * For secondary and tertiary strengths the starting point is first moved
 * via findCommonNode(). Then following nodes that are weaker than the new
 * strength are skipped, so that the new node lands right before the next node
 * that is at least as strong (or at the end of the list).
 *
 * @param index index of the node to insert after
 * @param strength strength of the new tailored node
 * @param errorCode in/out error code; nothing is done if it is already a failure
 * @return the index of the new node as returned by insertNodeBetween(),
 *         or 0 if errorCode was already a failure
 */
int32_t
CollationBuilder::insertTailoredNodeAfter(int32_t index, int32_t strength, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    U_ASSERT(0 <= index && index < nodes.size());

    // Move to the explicit common-weight node of each weaker level, if any.
    if(strength >= UCOL_SECONDARY) {
        index = findCommonNode(index, UCOL_SECONDARY);
    }
    if(strength >= UCOL_TERTIARY) {
        index = findCommonNode(index, UCOL_TERTIARY);
    }

    // Postpone insertion: walk past every following node whose strength is
    // weaker (larger) than the new one.
    int32_t followingIndex = nextIndexFromNode(nodes.elementAti(index));
    while(followingIndex != 0) {
        const int64_t following = nodes.elementAti(followingIndex);
        if(strengthFromNode(following) <= strength) {
            // At least as strong as the new node: insert before it.
            break;
        }
        index = followingIndex;
        followingIndex = nextIndexFromNode(following);
    }

    const int64_t tailored = IS_TAILORED | nodeFromStrength(strength);
    return insertNodeBetween(index, followingIndex, tailored, errorCode);
}
/**
 * Finds the node that carries the common weight of the given strength
 * for the node at the given index.
 *
 * If the node is already of that strength or weaker, or if it has no
 * "before" flag for that strength, then the node itself is the answer.
 * Otherwise the list is followed forward to the explicit node of that
 * strength with Collation::COMMON_WEIGHT16.
 *
 * @param index index of the starting node
 * @param strength UCOL_SECONDARY or UCOL_TERTIARY
 * @return the index of the common node
 */
int32_t
CollationBuilder::findCommonNode(int32_t index, int32_t strength) const {
    U_ASSERT(UCOL_SECONDARY <= strength && strength <= UCOL_TERTIARY);
    int64_t current = nodes.elementAti(index);
    if(strengthFromNode(current) >= strength) {
        // The current node is no stronger.
        return index;
    }
    const UBool hasBelowCommon =
        (strength == UCOL_SECONDARY) ? nodeHasBefore2(current) : nodeHasBefore3(current);
    if(!hasBelowCommon) {
        // The current node implies the strength-common weight.
        return index;
    }

    // The next node is expected to be the below-common root node of this strength.
    index = nextIndexFromNode(current);
    current = nodes.elementAti(index);
    U_ASSERT(!isTailoredNode(current) && strengthFromNode(current) == strength &&
            weight16FromNode(current) < Collation::COMMON_WEIGHT16);

    // Skip to the explicit common node: keep going past tailored nodes,
    // weaker nodes, and nodes with below-common weights.
    for(;;) {
        index = nextIndexFromNode(current);
        current = nodes.elementAti(index);
        U_ASSERT(strengthFromNode(current) >= strength);
        if(!isTailoredNode(current) && strengthFromNode(current) <= strength &&
                weight16FromNode(current) >= Collation::COMMON_WEIGHT16) {
            break;
        }
    }
    U_ASSERT(weight16FromNode(current) == Collation::COMMON_WEIGHT16);
    return index;
}
void
CollationBuilder::setCaseBits(const UnicodeString &nfdString, constchar *&parserErrorReason, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return; }
int32_t numTailoredPrimaries = 0; for(int32_t i = 0; i < cesLength; ++i) { if(ceStrength(ces[i]) == UCOL_PRIMARY) { ++numTailoredPrimaries; }
} // We should not be able to get too many case bits because // cesLength<=31==MAX_EXPANSION_LENGTH. // 31 pairs of case bits fit into an int64_t without setting its sign bit.
U_ASSERT(numTailoredPrimaries <= 31);
uint32_t lastCase = 0;
int32_t numBasePrimaries = 0; for(int32_t i = 0; i < baseCEsLength; ++i) {
int64_t ce = baseCEs.getCE(i); if((ce >> 32) != 0) {
++numBasePrimaries;
uint32_t c = (static_cast<uint32_t>(ce) >> 14) & 3;
U_ASSERT(c == 0 || c == 2); // lowercase or uppercase, no mixed case in any base CE if(numBasePrimaries < numTailoredPrimaries) {
cases |= static_cast<int64_t>(c) << ((numBasePrimaries - 1) * 2);
} elseif(numBasePrimaries == numTailoredPrimaries) {
lastCase = c;
} elseif(c != lastCase) { // There are more base primary CEs than tailored primaries. // Set mixed case if the case bits of the remainder differ.
lastCase = 1; // Nothing more can change. break;
}
}
} if(numBasePrimaries >= numTailoredPrimaries) {
cases |= static_cast<int64_t>(lastCase) << ((numTailoredPrimaries - 1) * 2);
}
}
for(int32_t i = 0; i < cesLength; ++i) {
int64_t ce = ces[i] & INT64_C(0xffffffffffff3fff); // clear old case bits
int32_t strength = ceStrength(ce); if(strength == UCOL_PRIMARY) {
ce |= (cases & 3) << 14;
cases >>= 2;
} elseif(strength == UCOL_TERTIARY) { // Tertiary CEs must have uppercase bits. // See the LDML spec, and comments in class CollationCompare.
ce |= 0x8000;
} // Tertiary ignorable CEs must have 0 case bits. // We set 0 case bits for secondary CEs too // since currently only U+0345 is cased and maps to a secondary CE, // and it is lowercase. Other secondaries are uncased. // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight.
ces[i] = ce;
}
}
// ICU-22517
// This constant defines the limit at which addOnlyClosure returns an error,
// to avoid taking a long time for canonical closure expansion.
// Please let us know if you have a reasonable use case with a practical
// collation rule that needs this limit to be increased.
// This value is needed for compiling a rule with eight Hangul syllables such as
// "&a=b쫊쫊쫊쫊쫊쫊쫊" without error, which should be more than realistic
// usage.
static constexpr int32_t kClosureLoopLimit = 3000;
// Look for the last starter in the NFD string.
UChar32 lastStarter;
int32_t indexAfterLastStarter = nfdString.length(); for(;;) { if(indexAfterLastStarter == 0) { return; } // no starter at all
lastStarter = nfdString.char32At(indexAfterLastStarter - 1); if(nfd.getCombiningClass(lastStarter) == 0) { break; }
indexAfterLastStarter -= U16_LENGTH(lastStarter);
} // No closure to Hangul syllables since we decompose them on the fly. if(Hangul::isJamoL(lastStarter)) { return; }
// Are there any composites whose decomposition starts with the lastStarter? // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters. // We might find some more equivalent mappings here if it did.
UnicodeSet composites; if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; }
UnicodeString decomp;
UnicodeString newNFDString, newString;
int64_t newCEs[Collation::MAX_EXPANSION_LENGTH];
UnicodeSetIterator iter(composites); while(iter.next()) {
U_ASSERT(!iter.isString());
UChar32 composite = iter.getCodepoint();
nfd.getDecomposition(composite, decomp); if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp,
newNFDString, newString, errorCode)) { continue;
}
int32_t newCEsLength = dataBuilder->getCEs(nfdPrefix, newNFDString, newCEs, 0); if(newCEsLength > Collation::MAX_EXPANSION_LENGTH) { // Ignore mappings that we cannot store. continue;
} // Note: It is possible that the newCEs do not make use of the mapping // for which we are adding the tail composites, in which case we might be adding // unnecessary mappings. // For example, when we add tail composites for ae^ (^=combining circumflex), // UCA discontiguous-contraction matching does not find any matches // for ae_^ (_=any combining diacritic below) *unless* there is also // a contraction mapping for ae. // Thus, if there is no ae contraction, then the ae^ mapping is ignored // while fetching the newCEs for ae_^. // TODO: Try to detect this effectively. // (Alternatively, print a warning when prefix contractions are missing.)
// We do not need an explicit mapping for the NFD strings. // It is fine if the NFD input collates like this via a sequence of mappings. // It also saves a little bit of space, and may reduce the set of characters with contractions.
uint32_t ce32 = addIfDifferent(nfdPrefix, newString,
newCEs, newCEsLength, Collation::UNASSIGNED_CE32, errorCode); if(ce32 != Collation::UNASSIGNED_CE32) { // was different, was added
addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32, errorCode);
}
}
}
/**
 * Tries to merge a composite character into the tail of an NFD string:
 * the last starter of nfdString plus the combining marks after it are combined
 * with the composite's decomposition.
 *
 * On success, newNFDString receives the merged NFD form and newString receives
 * the form that contains the composite itself.
 *
 * @param nfdString the NFD input string
 * @param indexAfterLastStarter index in nfdString right after its last starter;
 *        that starter must equal the first code point of decomp
 * @param composite the composite character
 * @param decomp the decomposition of the composite
 * @param newNFDString output: merged string in NFD
 * @param newString output: merged string with the composite
 * @param errorCode in/out error code; false is returned if it is already a failure
 * @return true if the merge produced new strings; false if the decomposition is
 *         a singleton, if it equals the tail of nfdString, or if the merge
 *         is not possible
 */
UBool
CollationBuilder::mergeCompositeIntoString(const UnicodeString &nfdString,
                                           int32_t indexAfterLastStarter,
                                           UChar32 composite, const UnicodeString &decomp,
                                           UnicodeString &newNFDString, UnicodeString &newString,
                                           UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return false; }
    U_ASSERT(nfdString.char32At(indexAfterLastStarter - 1) == decomp.char32At(0));
    int32_t lastStarterLength = decomp.moveIndex32(0, 1);
    if(lastStarterLength == decomp.length()) {
        // Singleton decompositions should be found by addWithClosure()
        // and the CanonicalIterator, so we can ignore them here.
        return false;
    }
    if(nfdString.compare(indexAfterLastStarter, 0x7fffffff,
                         decomp, lastStarterLength, 0x7fffffff) == 0) {
        // same strings, nothing new to be found here
        return false;
    }

    // Make new FCD strings that combine a composite, or its decomposition,
    // into the nfdString's last starter and the combining marks following it.
    // Make an NFD version, and a version with the composite.
    newNFDString.setTo(nfdString, 0, indexAfterLastStarter);
    newString.setTo(nfdString, 0, indexAfterLastStarter - lastStarterLength).append(composite);

    // The following is related to discontiguous contraction matching,
    // but builds only FCD strings (or else returns false).
    int32_t sourceIndex = indexAfterLastStarter;
    int32_t decompIndex = lastStarterLength;
    // Small optimization: We keep the source character across loop iterations
    // because we do not always consume it,
    // and then need not fetch it again nor look up its combining class again.
    UChar32 sourceChar = U_SENTINEL;
    // The cc variables need to be declared before the loop so that at the end
    // they are set to the last combining classes seen.
    uint8_t sourceCC = 0;
    uint8_t decompCC = 0;
    for(;;) {
        if(sourceChar < 0) {
            if(sourceIndex >= nfdString.length()) { break; }
            sourceChar = nfdString.char32At(sourceIndex);
            sourceCC = nfd.getCombiningClass(sourceChar);
            U_ASSERT(sourceCC != 0);
        }
        // We consume a decomposition character in each iteration.
        if(decompIndex >= decomp.length()) { break; }
        UChar32 decompChar = decomp.char32At(decompIndex);
        decompCC = nfd.getCombiningClass(decompChar);
        // Compare the two characters and their combining classes.
        if(decompCC == 0) {
            // Unable to merge because the source contains a non-zero combining mark
            // but the composite's decomposition contains another starter.
            // The strings would not be equivalent.
            return false;
        } else if(sourceCC < decompCC) {
            // Composite + sourceChar would not be FCD.
            return false;
        } else if(decompCC < sourceCC) {
            newNFDString.append(decompChar);
            decompIndex += U16_LENGTH(decompChar);
        } else if(decompChar != sourceChar) {
            // Blocked because same combining class.
            return false;
        } else {  // match: decompChar == sourceChar
            newNFDString.append(decompChar);
            decompIndex += U16_LENGTH(decompChar);
            sourceIndex += U16_LENGTH(decompChar);
            sourceChar = U_SENTINEL;
        }
    }
    // We are at the end of at least one of the two inputs.
    if(sourceChar >= 0) {  // more characters from nfdString but not from decomp
        if(sourceCC < decompCC) {
            // Appending the next source character to the composite would not be FCD.
            return false;
        }
        newNFDString.append(nfdString, sourceIndex, 0x7fffffff);
        newString.append(nfdString, sourceIndex, 0x7fffffff);
    } else if(decompIndex < decomp.length()) {  // more characters from decomp, not from nfdString
        newNFDString.append(decomp, decompIndex, 0x7fffffff);
    }
    U_ASSERT(nfd.isNormalized(newNFDString, errorCode));
    U_ASSERT(fcd.isNormalized(newString, errorCode));
    U_ASSERT(nfd.normalize(newString, errorCode) == newNFDString);  // canonically equivalent
    return true;
}
/**
 * Tells whether a prefix should be left unmapped.
 *
 * @param s the prefix
 * @param errorCode in/out error code, passed on to isFCD()
 * @return true if the prefix is not FCD
 */
UBool
CollationBuilder::ignorePrefix(const UnicodeString &s, UErrorCode &errorCode) const {
    // Non-FCD prefixes are not mapped.
    const UBool prefixIsFCD = isFCD(s, errorCode);
    return !prefixIsFCD;
}
/**
 * Tells whether a string should be left unmapped.
 *
 * @param s the string
 * @param errorCode in/out error code, passed on to isFCD()
 * @return true if the string is not FCD or if its first code unit is a Hangul syllable
 */
UBool
CollationBuilder::ignoreString(const UnicodeString &s, UErrorCode &errorCode) const {
    // Non-FCD strings are not mapped.
    if(!isFCD(s, errorCode)) {
        return true;
    }
    // Strings that start with Hangul syllables are not mapped either:
    // We decompose those on the fly.
    return Hangul::isHangul(s.charAt(0));
}
/**
 * Walks over the code points in [:NFD_QC=N:] (minus the Hangul syllable range),
 * fetches the CEs for each one's decomposition, and hands the code point and
 * those CEs to addIfDifferent() with an empty prefix.
 *
 * @param errorCode in/out error code; returns early if building the set fails
 */
void
CollationBuilder::closeOverComposites(UErrorCode &errorCode) {
    UnicodeSet composites(UNICODE_STRING_SIMPLE("[:NFD_QC=N:]"), errorCode);  // Java: static final
    if(U_FAILURE(errorCode)) { return; }
    // Hangul is decomposed on the fly during collation.
    composites.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);

    UnicodeString emptyPrefix;
    UnicodeString decomposition;
    for(UnicodeSetIterator iter(composites); iter.next();) {
        U_ASSERT(!iter.isString());
        nfd.getDecomposition(iter.getCodepoint(), decomposition);
        cesLength = dataBuilder->getCEs(decomposition, ces, 0);
        // Too many CEs from the decomposition (unusual): ignore this composite.
        // We could add a capacity parameter to getCEs() and reallocate if necessary.
        // However, this can only really happen in contrived cases.
        if(cesLength <= Collation::MAX_EXPANSION_LENGTH) {
            const UnicodeString &composite(iter.getString());
            addIfDifferent(emptyPrefix, composite, ces, cesLength,
                           Collation::UNASSIGNED_CE32, errorCode);
        }
    }
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.