Quelle collationbuilder.cpp

Sprache: C

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationbuilder.cpp
*
* (replaced the former ucol_bld.cpp)
*
* created on: 2013may06
* created by: Markus W. Scherer
*/

#ifdef DEBUG_COLLATION_BUILDER
#include <stdio.h>
#endif

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/caniter.h"
#include "unicode/normalizer2.h"
#include "unicode/tblcoll.h"
#include "unicode/parseerr.h"
#include "unicode/uchar.h"
#include "unicode/ucol.h"
#include "unicode/unistr.h"
#include "unicode/usetiter.h"
#include "unicode/utf16.h"
#include "unicode/uversion.h"
#include "cmemory.h"
#include "collation.h"
#include "collationbuilder.h"
#include "collationdata.h"
#include "collationdatabuilder.h"
#include "collationfastlatin.h"
#include "collationroot.h"
#include "collationrootelements.h"
#include "collationruleparser.h"
#include "collationsettings.h"
#include "collationtailoring.h"
#include "collationweights.h"
#include "normalizer2impl.h"
#include "uassert.h"
#include "ucol_imp.h"
#include "utf16collationiterator.h"

U_NAMESPACE_BEGIN

namespace {

class BundleImporter : public CollationRuleParser::Importer {
public:
    BundleImporter() {}
    virtual ~BundleImporter();
    virtual void getRules(
            const char *localeID, const char *collationType,
            UnicodeString &rules,
            const char *&errorReason, UErrorCode &errorCode) override;
};

BundleImporter::~BundleImporter() {}

void
BundleImporter::getRules(
        const char *localeID, const char *collationType,
        UnicodeString &rules,
        const char *& /*errorReason*/, UErrorCode &errorCode) {
    CollationLoader::loadRules(localeID, collationType, rules, errorCode);
}

}  // namespace

// RuleBasedCollator implementation ---------------------------------------- ***

// These methods are here, rather than in rulebasedcollator.cpp,
// for modularization:
// Most code using Collator does not need to build a Collator from rules.
// By moving these constructors and helper methods to a separate file,
// most code will not have a static dependency on the builder code.

RuleBasedCollator::RuleBasedCollator()
        : data(nullptr),
          settings(nullptr),
          tailoring(nullptr),
          cacheEntry(nullptr),
          validLocale(""),
          explicitlySetAttributes(0),
          actualLocaleIsSameAsValid(false) {
}

RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, UErrorCode &errorCode)
        : data(nullptr),
          settings(nullptr),
          tailoring(nullptr),
          cacheEntry(nullptr),
          validLocale(""),
          explicitlySetAttributes(0),
          actualLocaleIsSameAsValid(false) {
    internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, nullptr, nullptr, errorCode);
}

RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, ECollationStrength strength,
                                     UErrorCode &errorCode)
        : data(nullptr),
          settings(nullptr),
          tailoring(nullptr),
          cacheEntry(nullptr),
          validLocale(""),
          explicitlySetAttributes(0),
          actualLocaleIsSameAsValid(false) {
    internalBuildTailoring(rules, strength, UCOL_DEFAULT, nullptr, nullptr, errorCode);
}

RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
                                     UColAttributeValue decompositionMode,
                                     UErrorCode &errorCode)
        : data(nullptr),
          settings(nullptr),
          tailoring(nullptr),
          cacheEntry(nullptr),
          validLocale(""),
          explicitlySetAttributes(0),
          actualLocaleIsSameAsValid(false) {
    internalBuildTailoring(rules, UCOL_DEFAULT, decompositionMode, nullptr, nullptr, errorCode);
}

RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
                                     ECollationStrength strength,
                                     UColAttributeValue decompositionMode,
                                     UErrorCode &errorCode)
        : data(nullptr),
          settings(nullptr),
          tailoring(nullptr),
          cacheEntry(nullptr),
          validLocale(""),
          explicitlySetAttributes(0),
          actualLocaleIsSameAsValid(false) {
    internalBuildTailoring(rules, strength, decompositionMode, nullptr, nullptr, errorCode);
}

RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
                                     UParseError &parseError, UnicodeString &reason,
                                     UErrorCode &errorCode)
        : data(nullptr),
          settings(nullptr),
          tailoring(nullptr),
          cacheEntry(nullptr),
          validLocale(""),
          explicitlySetAttributes(0),
          actualLocaleIsSameAsValid(false) {
    internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &reason, errorCode);
}

void
RuleBasedCollator::internalBuildTailoring(const UnicodeString &rules,
                                          int32_t strength,
                                          UColAttributeValue decompositionMode,
                                          UParseError *outParseError, UnicodeString *outReason,
                                          UErrorCode &errorCode) {
    const CollationTailoring *base = CollationRoot::getRoot(errorCode);
    if(U_FAILURE(errorCode)) { return; }
    if(outReason != nullptr) { outReason->remove(); }
    CollationBuilder builder(base, errorCode);
    UVersionInfo noVersion = { 0, 0, 0, 0 };
    BundleImporter importer;
    LocalPointer<CollationTailoring> t(builder.parseAndBuild(rules, noVersion,
                                                             &importer,
                                                             outParseError, errorCode));
    if(U_FAILURE(errorCode)) {
        const char *reason = builder.getErrorReason();
        if(reason != nullptr && outReason != nullptr) {
            *outReason = UnicodeString(reason, -1, US_INV);
        }
        return;
    }
    t->actualLocale.setToBogus();
    adoptTailoring(t.orphan(), errorCode);
    // Set attributes after building the collator,
    // to keep the default settings consistent with the rule string.
    if(strength != UCOL_DEFAULT) {
        setAttribute(UCOL_STRENGTH, static_cast<UColAttributeValue>(strength), errorCode);
    }
    if(decompositionMode != UCOL_DEFAULT) {
        setAttribute(UCOL_NORMALIZATION_MODE, decompositionMode, errorCode);
    }
}

// CollationBuilder implementation ----------------------------------------- ***

CollationBuilder::CollationBuilder(const CollationTailoring *b, UBool icu4xMode, UErrorCode &errorCode)
        : nfd(*Normalizer2::getNFDInstance(errorCode)),
          fcd(*Normalizer2Factory::getFCDInstance(errorCode)),
          nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),
          base(b),
          baseData(b->data),
          rootElements(b->data->rootElements, b->data->rootElementsLength),
          variableTop(0),
          dataBuilder(new CollationDataBuilder(icu4xMode, errorCode)), fastLatinEnabled(true),
          icu4xMode(icu4xMode),
          errorReason(nullptr),
          cesLength(0),
          rootPrimaryIndexes(errorCode), nodes(errorCode) {
    nfcImpl.ensureCanonIterData(errorCode);
    if(U_FAILURE(errorCode)) {
        errorReason = "CollationBuilder fields initialization failed";
        return;
    }
    if(dataBuilder == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    dataBuilder->initForTailoring(baseData, errorCode);
    if(U_FAILURE(errorCode)) {
        errorReason = "CollationBuilder initialization failed";
    }
}

CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode)
  : CollationBuilder(b, false, errorCode)
{}

CollationBuilder::~CollationBuilder() {
    delete dataBuilder;
}

CollationTailoring *
CollationBuilder::parseAndBuild(const UnicodeString &ruleString,
                                const UVersionInfo rulesVersion,
                                CollationRuleParser::Importer *importer,
                                UParseError *outParseError,
                                UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return nullptr; }
    if(baseData->rootElements == nullptr) {
        errorCode = U_MISSING_RESOURCE_ERROR;
        errorReason = "missing root elements data, tailoring not supported";
        return nullptr;
    }
    LocalPointer<CollationTailoring> tailoring(new CollationTailoring(base->settings));
    if(tailoring.isNull() || tailoring->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    CollationRuleParser parser(baseData, errorCode);
    if(U_FAILURE(errorCode)) { return nullptr; }
    // Note: This always bases &[last variable] and &[first regular]
    // on the root collator's maxVariable/variableTop.
    // If we wanted this to change after [maxVariable x], then we would keep
    // the tailoring.settings pointer here and read its variableTop when we need it.
    // See http://unicode.org/cldr/trac/ticket/6070
    variableTop = base->settings->variableTop;
    parser.setSink(this);
    parser.setImporter(importer);
    CollationSettings &ownedSettings = *SharedObject::copyOnWrite(tailoring->settings);
    parser.parse(ruleString, ownedSettings, outParseError, errorCode);
    errorReason = parser.getErrorReason();
    if(U_FAILURE(errorCode)) { return nullptr; }
    if(dataBuilder->hasMappings()) {
        makeTailoredCEs(errorCode);
        if (!icu4xMode) {
            closeOverComposites(errorCode);
        }
        finalizeCEs(errorCode);
        if (!icu4xMode) {
            // Copy all of ASCII, and Latin-1 letters, into each tailoring.
            optimizeSet.add(0, 0x7f);
            optimizeSet.add(0xc0, 0xff);
            // Hangul is decomposed on the fly during collation,
            // and the tailoring data is always built with HANGUL_TAG specials.
            optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
            dataBuilder->optimize(optimizeSet, errorCode);
        }
        tailoring->ensureOwnedData(errorCode);
        if(U_FAILURE(errorCode)) { return nullptr; }
        if(fastLatinEnabled) { dataBuilder->enableFastLatin(); }
        dataBuilder->build(*tailoring->ownedData, errorCode);
        tailoring->builder = dataBuilder;
        dataBuilder = nullptr;
    } else {
        tailoring->data = baseData;
    }
    if(U_FAILURE(errorCode)) { return nullptr; }
    ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
        tailoring->data, ownedSettings,
        ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries));
    tailoring->rules = ruleString;
    tailoring->rules.getTerminatedBuffer();  // ensure NUL-termination
    tailoring->setVersion(base->version, rulesVersion);
    return tailoring.orphan();
}

void
CollationBuilder::addReset(int32_t strength, const UnicodeString &str,
                           const char *&parserErrorReason, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    U_ASSERT(!str.isEmpty());
    if(str.charAt(0) == CollationRuleParser::POS_LEAD) {
        ces[0] = getSpecialResetPosition(str, parserErrorReason, errorCode);
        cesLength = 1;
        if(U_FAILURE(errorCode)) { return; }
        U_ASSERT((ces[0] & Collation::CASE_AND_QUATERNARY_MASK) == 0);
    } else {
        // normal reset to a character or string
        UnicodeString nfdString = nfd.normalize(str, errorCode);
        if(U_FAILURE(errorCode)) {
            parserErrorReason = "normalizing the reset position";
            return;
        }
        cesLength = dataBuilder->getCEs(nfdString, ces, 0);
        if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            parserErrorReason = "reset position maps to too many collation elements (more than 31)";
            return;
        }
    }
    if(strength == UCOL_IDENTICAL) { return; }  // simple reset-at-position

    // &[before strength]position
    U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_TERTIARY);
    int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode);
    if(U_FAILURE(errorCode)) { return; }

    int64_t node = nodes.elementAti(index);
    // If the index is for a "weaker" node,
    // then skip backwards over this and further "weaker" nodes.
    while(strengthFromNode(node) > strength) {
        index = previousIndexFromNode(node);
        node = nodes.elementAti(index);
    }

    // Find or insert a node whose index we will put into a temporary CE.
    if(strengthFromNode(node) == strength && isTailoredNode(node)) {
        // Reset to just before this same-strength tailored node.
        index = previousIndexFromNode(node);
    } else if(strength == UCOL_PRIMARY) {
        // root primary node (has no previous index)
        uint32_t p = weight32FromNode(node);
        if(p == 0) {
            errorCode = U_UNSUPPORTED_ERROR;
            parserErrorReason = "reset primary-before ignorable not possible";
            return;
        }
        if(p <= rootElements.getFirstPrimary()) {
            // There is no primary gap between ignorables and the space-first-primary.
            errorCode = U_UNSUPPORTED_ERROR;
            parserErrorReason = "reset primary-before first non-ignorable not supported";
            return;
        }
        if(p == Collation::FIRST_TRAILING_PRIMARY) {
            // We do not support tailoring to an unassigned-implicit CE.
            errorCode = U_UNSUPPORTED_ERROR;
            parserErrorReason = "reset primary-before [first trailing] not supported";
            return;
        }
        p = rootElements.getPrimaryBefore(p, baseData->isCompressiblePrimary(p));
        index = findOrInsertNodeForPrimary(p, errorCode);
        // Go to the last node in this list:
        // Tailor after the last node between adjacent root nodes.
        for(;;) {
            node = nodes.elementAti(index);
            int32_t nextIndex = nextIndexFromNode(node);
            if(nextIndex == 0) { break; }
            index = nextIndex;
        }
    } else {
        // &[before 2] or &[before 3]
        index = findCommonNode(index, UCOL_SECONDARY);
        if(strength >= UCOL_TERTIARY) {
            index = findCommonNode(index, UCOL_TERTIARY);
        }
        // findCommonNode() stayed on the stronger node or moved to
        // an explicit common-weight node of the reset-before strength.
        node = nodes.elementAti(index);
        if(strengthFromNode(node) == strength) {
            // Found a same-strength node with an explicit weight.
            uint32_t weight16 = weight16FromNode(node);
            if(weight16 == 0) {
                errorCode = U_UNSUPPORTED_ERROR;
                if(strength == UCOL_SECONDARY) {
                    parserErrorReason = "reset secondary-before secondary ignorable not possible";
                } else {
                    parserErrorReason = "reset tertiary-before completely ignorable not possible";
                }
                return;
            }
            U_ASSERT(weight16 > Collation::BEFORE_WEIGHT16);
            // Reset to just before this node.
            // Insert the preceding same-level explicit weight if it is not there already.
            // Which explicit weight immediately precedes this one?
            weight16 = getWeight16Before(index, node, strength);
            // Does this preceding weight have a node?
            uint32_t previousWeight16;
            int32_t previousIndex = previousIndexFromNode(node);
            for(int32_t i = previousIndex;; i = previousIndexFromNode(node)) {
                node = nodes.elementAti(i);
                int32_t previousStrength = strengthFromNode(node);
                if(previousStrength < strength) {
                    U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16 || i == previousIndex);
                    // Either the reset element has an above-common weight and
                    // the parent node provides the implied common weight,
                    // or the reset element has a weight<=common in the node
                    // right after the parent, and we need to insert the preceding weight.
                    previousWeight16 = Collation::COMMON_WEIGHT16;
                    break;
                } else if(previousStrength == strength && !isTailoredNode(node)) {
                    previousWeight16 = weight16FromNode(node);
                    break;
                }
                // Skip weaker nodes and same-level tailored nodes.
            }
            if(previousWeight16 == weight16) {
                // The preceding weight has a node,
                // maybe with following weaker or tailored nodes.
                // Reset to the last of them.
                index = previousIndex;
            } else {
                // Insert a node with the preceding weight, reset to that.
                node = nodeFromWeight16(weight16) | nodeFromStrength(strength);
                index = insertNodeBetween(previousIndex, index, node, errorCode);
            }
        } else {
            // Found a stronger node with implied strength-common weight.
            uint32_t weight16 = getWeight16Before(index, node, strength);
            index = findOrInsertWeakNode(index, weight16, strength, errorCode);
        }
        // Strength of the temporary CE = strength of its reset position.
        // Code above raises an error if the before-strength is stronger.
        strength = ceStrength(ces[cesLength - 1]);
    }
    if(U_FAILURE(errorCode)) {
        parserErrorReason = "inserting reset position for &[before n]";
        return;
    }
    ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength);
}

uint32_t
CollationBuilder::getWeight16Before(int32_t index, int64_t node, int32_t level) {
    U_ASSERT(strengthFromNode(node) < level || !isTailoredNode(node));
    // Collect the root CE weights if this node is for a root CE.
    // If it is not, then return the low non-primary boundary for a tailored CE.
    uint32_t t;
    if(strengthFromNode(node) == UCOL_TERTIARY) {
        t = weight16FromNode(node);
    } else {
        t = Collation::COMMON_WEIGHT16;  // Stronger node with implied common weight.
    }
    while(strengthFromNode(node) > UCOL_SECONDARY) {
        index = previousIndexFromNode(node);
        node = nodes.elementAti(index);
    }
    if(isTailoredNode(node)) {
        return Collation::BEFORE_WEIGHT16;
    }
    uint32_t s;
    if(strengthFromNode(node) == UCOL_SECONDARY) {
        s = weight16FromNode(node);
    } else {
        s = Collation::COMMON_WEIGHT16;  // Stronger node with implied common weight.
    }
    while(strengthFromNode(node) > UCOL_PRIMARY) {
        index = previousIndexFromNode(node);
        node = nodes.elementAti(index);
    }
    if(isTailoredNode(node)) {
        return Collation::BEFORE_WEIGHT16;
    }
    // [p, s, t] is a root CE. Return the preceding weight for the requested level.
    uint32_t p = weight32FromNode(node);
    uint32_t weight16;
    if(level == UCOL_SECONDARY) {
        weight16 = rootElements.getSecondaryBefore(p, s);
    } else {
        weight16 = rootElements.getTertiaryBefore(p, s, t);
        U_ASSERT((weight16 & ~Collation::ONLY_TERTIARY_MASK) == 0);
    }
    return weight16;
}

int64_t
CollationBuilder::getSpecialResetPosition(const UnicodeString &str,
                                          const char *&parserErrorReason, UErrorCode &errorCode) {
    U_ASSERT(str.length() == 2);
    int64_t ce;
    int32_t strength = UCOL_PRIMARY;
    UBool isBoundary = false;
    UChar32 pos = str.charAt(1) - CollationRuleParser::POS_BASE;
    U_ASSERT(0 <= pos && pos <= CollationRuleParser::LAST_TRAILING);
    switch(pos) {
    case CollationRuleParser::FIRST_TERTIARY_IGNORABLE:
        // Quaternary CEs are not supported.
        // Non-zero quaternary weights are possible only on tertiary or stronger CEs.
        return 0;
    case CollationRuleParser::LAST_TERTIARY_IGNORABLE:
        return 0;
    case CollationRuleParser::FIRST_SECONDARY_IGNORABLE: {
        // Look for a tailored tertiary node after [0, 0, 0].
        int32_t index = findOrInsertNodeForRootCE(0, UCOL_TERTIARY, errorCode);
        if(U_FAILURE(errorCode)) { return 0; }
        int64_t node = nodes.elementAti(index);
        if((index = nextIndexFromNode(node)) != 0) {
            node = nodes.elementAti(index);
            U_ASSERT(strengthFromNode(node) <= UCOL_TERTIARY);
            if(isTailoredNode(node) && strengthFromNode(node) == UCOL_TERTIARY) {
                return tempCEFromIndexAndStrength(index, UCOL_TERTIARY);
            }
        }
        return rootElements.getFirstTertiaryCE();
        // No need to look for nodeHasAnyBefore() on a tertiary node.
    }
    case CollationRuleParser::LAST_SECONDARY_IGNORABLE:
        ce = rootElements.getLastTertiaryCE();
        strength = UCOL_TERTIARY;
        break;
    case CollationRuleParser::FIRST_PRIMARY_IGNORABLE: {
        // Look for a tailored secondary node after [0, 0, *].
        int32_t index = findOrInsertNodeForRootCE(0, UCOL_SECONDARY, errorCode);
        if(U_FAILURE(errorCode)) { return 0; }
        int64_t node = nodes.elementAti(index);
        while((index = nextIndexFromNode(node)) != 0) {
            node = nodes.elementAti(index);
            strength = strengthFromNode(node);
            if(strength < UCOL_SECONDARY) { break; }
            if(strength == UCOL_SECONDARY) {
                if(isTailoredNode(node)) {
                    if(nodeHasBefore3(node)) {
                        index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
                        U_ASSERT(isTailoredNode(nodes.elementAti(index)));
                    }
                    return tempCEFromIndexAndStrength(index, UCOL_SECONDARY);
                } else {
                    break;
                }
            }
        }
        ce = rootElements.getFirstSecondaryCE();
        strength = UCOL_SECONDARY;
        break;
    }
    case CollationRuleParser::LAST_PRIMARY_IGNORABLE:
        ce = rootElements.getLastSecondaryCE();
        strength = UCOL_SECONDARY;
        break;
    case CollationRuleParser::FIRST_VARIABLE:
        ce = rootElements.getFirstPrimaryCE();
        isBoundary = true;  // FractionalUCA.txt: FDD1 00A0, SPACE first primary
        break;
    case CollationRuleParser::LAST_VARIABLE:
        ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1);
        break;
    case CollationRuleParser::FIRST_REGULAR:
        ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1);
        isBoundary = true;  // FractionalUCA.txt: FDD1 263A, SYMBOL first primary
        break;
    case CollationRuleParser::LAST_REGULAR:
        // Use the Hani-first-primary rather than the actual last "regular" CE before it,
        // for backward compatibility with behavior before the introduction of
        // script-first-primary CEs in the root collator.
        ce = rootElements.firstCEWithPrimaryAtLeast(
            baseData->getFirstPrimaryForGroup(USCRIPT_HAN));
        break;
    case CollationRuleParser::FIRST_IMPLICIT:
        ce = baseData->getSingleCE(0x4e00, errorCode);
        break;
    case CollationRuleParser::LAST_IMPLICIT:
        // We do not support tailoring to an unassigned-implicit CE.
        errorCode = U_UNSUPPORTED_ERROR;
        parserErrorReason = "reset to [last implicit] not supported";
        return 0;
    case CollationRuleParser::FIRST_TRAILING:
        ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY);
        isBoundary = true;  // trailing first primary (there is no mapping for it)
        break;
    case CollationRuleParser::LAST_TRAILING:
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        parserErrorReason = "LDML forbids tailoring to U+FFFF";
        return 0;
    default:
        UPRV_UNREACHABLE_EXIT;
    }

    int32_t index = findOrInsertNodeForRootCE(ce, strength, errorCode);
    if(U_FAILURE(errorCode)) { return 0; }
    int64_t node = nodes.elementAti(index);
    if((pos & 1) == 0) {
        // even pos = [first xyz]
        if(!nodeHasAnyBefore(node) && isBoundary) {
            // A <group> first primary boundary is artificially added to FractionalUCA.txt.
            // It is reachable via its special contraction, but is not normally used.
            // Find the first character tailored after the boundary CE,
            // or the first real root CE after it.
            if((index = nextIndexFromNode(node)) != 0) {
                // If there is a following node, then it must be tailored
                // because there are no root CEs with a boundary primary
                // and non-common secondary/tertiary weights.
                node = nodes.elementAti(index);
                U_ASSERT(isTailoredNode(node));
                ce = tempCEFromIndexAndStrength(index, strength);
            } else {
                U_ASSERT(strength == UCOL_PRIMARY);
                uint32_t p = static_cast<uint32_t>(ce >> 32);
                int32_t pIndex = rootElements.findPrimary(p);
                UBool isCompressible = baseData->isCompressiblePrimary(p);
                p = rootElements.getPrimaryAfter(p, pIndex, isCompressible);
                ce = Collation::makeCE(p);
                index = findOrInsertNodeForRootCE(ce, UCOL_PRIMARY, errorCode);
                if(U_FAILURE(errorCode)) { return 0; }
                node = nodes.elementAti(index);
            }
        }
        if(nodeHasAnyBefore(node)) {
            // Get the first node that was tailored before this one at a weaker strength.
            if(nodeHasBefore2(node)) {
                index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
                node = nodes.elementAti(index);
            }
            if(nodeHasBefore3(node)) {
                index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
            }
            U_ASSERT(isTailoredNode(nodes.elementAti(index)));
            ce = tempCEFromIndexAndStrength(index, strength);
        }
    } else {
        // odd pos = [last xyz]
        // Find the last node that was tailored after the [last xyz]
        // at a strength no greater than the position's strength.
        for(;;) {
            int32_t nextIndex = nextIndexFromNode(node);
            if(nextIndex == 0) { break; }
            int64_t nextNode = nodes.elementAti(nextIndex);
            if(strengthFromNode(nextNode) < strength) { break; }
            index = nextIndex;
            node = nextNode;
        }
        // Do not make a temporary CE for a root node.
        // This last node might be the node for the root CE itself,
        // or a node with a common secondary or tertiary weight.
        if(isTailoredNode(node)) {
            ce = tempCEFromIndexAndStrength(index, strength);
        }
    }
    return ce;
}

void
CollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix,
                              const UnicodeString &str, const UnicodeString &extension,
                              const char *&parserErrorReason, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    UnicodeString nfdPrefix;
    if(!prefix.isEmpty()) {
        nfd.normalize(prefix, nfdPrefix, errorCode);
        if(U_FAILURE(errorCode)) {
            parserErrorReason = "normalizing the relation prefix";
            return;
        }
    }
    UnicodeString nfdString = nfd.normalize(str, errorCode);
    if(U_FAILURE(errorCode)) {
        parserErrorReason = "normalizing the relation string";
        return;
    }

    // The runtime code decomposes Hangul syllables on the fly,
    // with recursive processing but without making the Jamo pieces visible for matching.
    // It does not work with certain types of contextual mappings.
    int32_t nfdLength = nfdString.length();
    if(nfdLength >= 2) {
        char16_t c = nfdString.charAt(0);
        if(Hangul::isJamoL(c) || Hangul::isJamoV(c)) {
            // While handling a Hangul syllable, contractions starting with Jamo L or V
            // would not see the following Jamo of that syllable.
            errorCode = U_UNSUPPORTED_ERROR;
            parserErrorReason = "contractions starting with conjoining Jamo L or V not supported";
            return;
        }
        c = nfdString.charAt(nfdLength - 1);
        if(Hangul::isJamoL(c) ||
                (Hangul::isJamoV(c) && Hangul::isJamoL(nfdString.charAt(nfdLength - 2)))) {
            // A contraction ending with Jamo L or L+V would require
            // generating Hangul syllables in addTailComposites() (588 for a Jamo L),
            // or decomposing a following Hangul syllable on the fly, during contraction matching.
            errorCode = U_UNSUPPORTED_ERROR;
            parserErrorReason = "contractions ending with conjoining Jamo L or L+V not supported";
            return;
        }
        // A Hangul syllable completely inside a contraction is ok.
    }
    // Note: If there is a prefix, then the parser checked that
    // both the prefix and the string begin with NFC boundaries (not Jamo V or T).
    // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0))
    // (While handling a Hangul syllable, prefixes on Jamo V or T
    // would not see the previous Jamo of that syllable.)

    if(strength != UCOL_IDENTICAL) {
        // Find the node index after which we insert the new tailored node.
        int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode);
        U_ASSERT(cesLength > 0);
        int64_t ce = ces[cesLength - 1];
        if (strength == UCOL_PRIMARY && !isTempCE(ce) && static_cast<uint32_t>(ce >> 32) == 0) {
            // There is no primary gap between ignorables and the space-first-primary.
            errorCode = U_UNSUPPORTED_ERROR;
            parserErrorReason = "tailoring primary after ignorables not supported";
            return;
        }
        if(strength == UCOL_QUATERNARY && ce == 0) {
            // The CE data structure does not support non-zero quaternary weights
            // on tertiary ignorables.
            errorCode = U_UNSUPPORTED_ERROR;
            parserErrorReason = "tailoring quaternary after tertiary ignorables not supported";
            return;
        }
        // Insert the new tailored node.
        index = insertTailoredNodeAfter(index, strength, errorCode);
        if(U_FAILURE(errorCode)) {
            parserErrorReason = "modifying collation elements";
            return;
        }
        // Strength of the temporary CE:
        // The new relation may yield a stronger CE but not a weaker one.
        int32_t tempStrength = ceStrength(ce);
        if(strength < tempStrength) { tempStrength = strength; }
        ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength);
    }

    setCaseBits(nfdString, parserErrorReason, errorCode);
    if(U_FAILURE(errorCode)) { return; }

    int32_t cesLengthBeforeExtension = cesLength;
    if(!extension.isEmpty()) {
        UnicodeString nfdExtension = nfd.normalize(extension, errorCode);
        if(U_FAILURE(errorCode)) {
            parserErrorReason = "normalizing the relation extension";
            return;
        }
        cesLength = dataBuilder->getCEs(nfdExtension, ces, cesLength);
        if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            parserErrorReason =
                "extension string adds too many collation elements (more than 31 total)";
            return;
        }
    }
    uint32_t ce32 = Collation::UNASSIGNED_CE32;
    if(!icu4xMode && (prefix != nfdPrefix || str != nfdString) &&
            !ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) {
        // Map from the original input to the CEs.
        // We do this in case the canonical closure is incomplete,
        // so that it is possible to explicitly provide the missing mappings.
        ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode);
    }
    if (!icu4xMode) {
        addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);
    } else {
        addIfDifferent(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);
    }
    if(U_FAILURE(errorCode)) {
        parserErrorReason = "writing collation elements";
        return;
    }
    cesLength = cesLengthBeforeExtension;
}

int32_t
CollationBuilder::findOrInsertNodeForCEs(int32_t strength, const char *&parserErrorReason,
                                         UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_QUATERNARY);

    // Find the last CE that is at least as "strong" as the requested difference.
    // Note: Stronger is smaller (UCOL_PRIMARY=0).
    int64_t ce;
    for(;; --cesLength) {
        if(cesLength == 0) {
            ce = ces[0] = 0;
            cesLength = 1;
            break;
        } else {
            ce = ces[cesLength - 1];
        }
        if(ceStrength(ce) <= strength) { break; }
    }

    if(isTempCE(ce)) {
        // No need to findCommonNode() here for lower levels
        // because insertTailoredNodeAfter() will do that anyway.
        return indexFromTempCE(ce);
    }

    // root CE
    if (static_cast<uint8_t>(ce >> 56) == Collation::UNASSIGNED_IMPLICIT_BYTE) {
        errorCode = U_UNSUPPORTED_ERROR;
        parserErrorReason = "tailoring relative to an unassigned code point not supported";
        return 0;
    }
    return findOrInsertNodeForRootCE(ce, strength, errorCode);
}

int32_t
CollationBuilder::findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    U_ASSERT((uint8_t)(ce >> 56) != Collation::UNASSIGNED_IMPLICIT_BYTE);

    // Find or insert the node for each of the root CE's weights,
    // down to the requested level/strength.
    // Root CEs must have common=zero quaternary weights (for which we never insert any nodes).
    U_ASSERT((ce & 0xc0) == 0);
    int32_t index = findOrInsertNodeForPrimary(static_cast<uint32_t>(ce >> 32), errorCode);
    if(strength >= UCOL_SECONDARY) {
        uint32_t lower32 = static_cast<uint32_t>(ce);
        index = findOrInsertWeakNode(index, lower32 >> 16, UCOL_SECONDARY, errorCode);
        if(strength >= UCOL_TERTIARY) {
            index = findOrInsertWeakNode(index, lower32 & Collation::ONLY_TERTIARY_MASK,
                                         UCOL_TERTIARY, errorCode);
        }
    }
    return index;
}

namespace {

/**
* Like Java Collections.binarySearch(List, key, Comparator).
*
* @return the index>=0 where the item was found,
*         or the index<0 for inserting the string at ~index in sorted order
*         (index into rootPrimaryIndexes)
*/
int32_t
binarySearchForRootPrimaryNode(const int32_t *rootPrimaryIndexes, int32_t length,
                               const int64_t *nodes, uint32_t p) {
    if(length == 0) { return ~0; }
    int32_t start = 0;
    int32_t limit = length;
    for (;;) {
        int32_t i = (start + limit) / 2;
        int64_t node = nodes[rootPrimaryIndexes[i]];
        uint32_t nodePrimary = static_cast<uint32_t>(node >> 32); // weight32FromNode(node)
        if (p == nodePrimary) {
            return i;
        } else if (p < nodePrimary) {
            if (i == start) {
                return ~start;  // insert s before i
            }
            limit = i;
        } else {
            if (i == start) {
                return ~(start + 1);  // insert s after i
            }
            start = i;
        }
    }
}

}  // namespace

int32_t
CollationBuilder::findOrInsertNodeForPrimary(uint32_t p, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }

    int32_t rootIndex = binarySearchForRootPrimaryNode(
        rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p);
    if(rootIndex >= 0) {
        return rootPrimaryIndexes.elementAti(rootIndex);
    } else {
        // Start a new list of nodes with this primary.
        int32_t index = nodes.size();
        nodes.addElement(nodeFromWeight32(p), errorCode);
        rootPrimaryIndexes.insertElementAt(index, ~rootIndex, errorCode);
        return index;
    }
}

int32_t
CollationBuilder::findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t level, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    U_ASSERT(0 <= index && index < nodes.size());
    U_ASSERT(UCOL_SECONDARY <= level && level <= UCOL_TERTIARY);

    if(weight16 == Collation::COMMON_WEIGHT16) {
        return findCommonNode(index, level);
    }

    // If this will be the first below-common weight for the parent node,
    // then we will also need to insert a common weight after it.
    int64_t node = nodes.elementAti(index);
    U_ASSERT(strengthFromNode(node) < level);  // parent node is stronger
    if(weight16 != 0 && weight16 < Collation::COMMON_WEIGHT16) {
        int32_t hasThisLevelBefore = level == UCOL_SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3;
        if((node & hasThisLevelBefore) == 0) {
            // The parent node has an implied level-common weight.
            int64_t commonNode =
                nodeFromWeight16(Collation::COMMON_WEIGHT16) | nodeFromStrength(level);
            if(level == UCOL_SECONDARY) {
                // Move the HAS_BEFORE3 flag from the parent node
                // to the new secondary common node.
                commonNode |= node & HAS_BEFORE3;
                node &= ~static_cast<int64_t>(HAS_BEFORE3);
            }
            nodes.setElementAt(node | hasThisLevelBefore, index);
            // Insert below-common-weight node.
            int32_t nextIndex = nextIndexFromNode(node);
            node = nodeFromWeight16(weight16) | nodeFromStrength(level);
            index = insertNodeBetween(index, nextIndex, node, errorCode);
            // Insert common-weight node.
            insertNodeBetween(index, nextIndex, commonNode, errorCode);
            // Return index of below-common-weight node.
            return index;
        }
    }

    // Find the root CE's weight for this level.
    // Postpone insertion if not found:
    // Insert the new root node before the next stronger node,
    // or before the next root node with the same strength and a larger weight.
    int32_t nextIndex;
    while((nextIndex = nextIndexFromNode(node)) != 0) {
        node = nodes.elementAti(nextIndex);
        int32_t nextStrength = strengthFromNode(node);
        if(nextStrength <= level) {
            // Insert before a stronger node.
            if(nextStrength < level) { break; }
            // nextStrength == level
            if(!isTailoredNode(node)) {
                uint32_t nextWeight16 = weight16FromNode(node);
                if(nextWeight16 == weight16) {
                    // Found the node for the root CE up to this level.
                    return nextIndex;
                }
                // Insert before a node with a larger same-strength weight.
                if(nextWeight16 > weight16) { break; }
            }
        }
        // Skip the next node.
        index = nextIndex;
    }
    node = nodeFromWeight16(weight16) | nodeFromStrength(level);
    return insertNodeBetween(index, nextIndex, node, errorCode);
}

int32_t
CollationBuilder::insertTailoredNodeAfter(int32_t index, int32_t strength, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    U_ASSERT(0 <= index && index < nodes.size());
    if(strength >= UCOL_SECONDARY) {
        index = findCommonNode(index, UCOL_SECONDARY);
        if(strength >= UCOL_TERTIARY) {
            index = findCommonNode(index, UCOL_TERTIARY);
        }
    }
    // Postpone insertion:
    // Insert the new node before the next one with a strength at least as strong.
    int64_t node = nodes.elementAti(index);
    int32_t nextIndex;
    while((nextIndex = nextIndexFromNode(node)) != 0) {
        node = nodes.elementAti(nextIndex);
        if(strengthFromNode(node) <= strength) { break; }
        // Skip the next node which has a weaker (larger) strength than the new one.
        index = nextIndex;
    }
    node = IS_TAILORED | nodeFromStrength(strength);
    return insertNodeBetween(index, nextIndex, node, errorCode);
}

int32_t
CollationBuilder::insertNodeBetween(int32_t index, int32_t nextIndex, int64_t node,
                                    UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    U_ASSERT(previousIndexFromNode(node) == 0);
    U_ASSERT(nextIndexFromNode(node) == 0);
    U_ASSERT(nextIndexFromNode(nodes.elementAti(index)) == nextIndex);
    // Append the new node and link it to the existing nodes.
    int32_t newIndex = nodes.size();
    node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex);
    nodes.addElement(node, errorCode);
    if(U_FAILURE(errorCode)) { return 0; }
    // nodes[index].nextIndex = newIndex
    node = nodes.elementAti(index);
    nodes.setElementAt(changeNodeNextIndex(node, newIndex), index);
    // nodes[nextIndex].previousIndex = newIndex
    if(nextIndex != 0) {
        node = nodes.elementAti(nextIndex);
        nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex);
    }
    return newIndex;
}

int32_t
CollationBuilder::findCommonNode(int32_t index, int32_t strength) const {
    U_ASSERT(UCOL_SECONDARY <= strength && strength <= UCOL_TERTIARY);
    int64_t node = nodes.elementAti(index);
    if(strengthFromNode(node) >= strength) {
        // The current node is no stronger.
        return index;
    }
    if(strength == UCOL_SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) {
        // The current node implies the strength-common weight.
        return index;
    }
    index = nextIndexFromNode(node);
    node = nodes.elementAti(index);
    U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength &&
            weight16FromNode(node) < Collation::COMMON_WEIGHT16);
    // Skip to the explicit common node.
    do {
        index = nextIndexFromNode(node);
        node = nodes.elementAti(index);
        U_ASSERT(strengthFromNode(node) >= strength);
    } while(isTailoredNode(node) || strengthFromNode(node) > strength ||
            weight16FromNode(node) < Collation::COMMON_WEIGHT16);
    U_ASSERT(weight16FromNode(node) == Collation::COMMON_WEIGHT16);
    return index;
}

void
CollationBuilder::setCaseBits(const UnicodeString &nfdString,
                              const char *&parserErrorReason, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    int32_t numTailoredPrimaries = 0;
    for(int32_t i = 0; i < cesLength; ++i) {
        if(ceStrength(ces[i]) == UCOL_PRIMARY) { ++numTailoredPrimaries; }
    }
    // We should not be able to get too many case bits because
    // cesLength<=31==MAX_EXPANSION_LENGTH.
    // 31 pairs of case bits fit into an int64_t without setting its sign bit.
    U_ASSERT(numTailoredPrimaries <= 31);

    int64_t cases = 0;
    if(numTailoredPrimaries > 0) {
        const char16_t *s = nfdString.getBuffer();
        UTF16CollationIterator baseCEs(baseData, false, s, s, s + nfdString.length());
        int32_t baseCEsLength = baseCEs.fetchCEs(errorCode) - 1;
        if(U_FAILURE(errorCode)) {
            parserErrorReason = "fetching root CEs for tailored string";
            return;
        }
        U_ASSERT(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation::NO_CE);

        uint32_t lastCase = 0;
        int32_t numBasePrimaries = 0;
        for(int32_t i = 0; i < baseCEsLength; ++i) {
            int64_t ce = baseCEs.getCE(i);
            if((ce >> 32) != 0) {
                ++numBasePrimaries;
                uint32_t c = (static_cast<uint32_t>(ce) >> 14) & 3;
                U_ASSERT(c == 0 || c == 2);  // lowercase or uppercase, no mixed case in any base CE
                if(numBasePrimaries < numTailoredPrimaries) {
                    cases |= static_cast<int64_t>(c) << ((numBasePrimaries - 1) * 2);
                } else if(numBasePrimaries == numTailoredPrimaries) {
                    lastCase = c;
                } else if(c != lastCase) {
                    // There are more base primary CEs than tailored primaries.
                    // Set mixed case if the case bits of the remainder differ.
                    lastCase = 1;
                    // Nothing more can change.
                    break;
                }
            }
        }
        if(numBasePrimaries >= numTailoredPrimaries) {
            cases |= static_cast<int64_t>(lastCase) << ((numTailoredPrimaries - 1) * 2);
        }
    }

    for(int32_t i = 0; i < cesLength; ++i) {
        int64_t ce = ces[i] & INT64_C(0xffffffffffff3fff);  // clear old case bits
        int32_t strength = ceStrength(ce);
        if(strength == UCOL_PRIMARY) {
            ce |= (cases & 3) << 14;
            cases >>= 2;
        } else if(strength == UCOL_TERTIARY) {
            // Tertiary CEs must have uppercase bits.
            // See the LDML spec, and comments in class CollationCompare.
            ce |= 0x8000;
        }
        // Tertiary ignorable CEs must have 0 case bits.
        // We set 0 case bits for secondary CEs too
        // since currently only U+0345 is cased and maps to a secondary CE,
        // and it is lowercase. Other secondaries are uncased.
        // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight.
        ces[i] = ce;
    }
}

void
CollationBuilder::suppressContractions(const UnicodeSet &set, const char *&parserErrorReason,
                                       UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    dataBuilder->suppressContractions(set, errorCode);
    if(U_FAILURE(errorCode)) {
        parserErrorReason = "application of [suppressContractions [set]] failed";
    }
}

void
CollationBuilder::optimize(const UnicodeSet &set, const char *& /* parserErrorReason */,
                           UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    optimizeSet.addAll(set);
}

uint32_t
CollationBuilder::addWithClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
                                 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
                                 UErrorCode &errorCode) {
    // Map from the NFD input to the CEs.
    ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode);
    ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode);
    addTailComposites(nfdPrefix, nfdString, errorCode);
    return ce32;
}

// ICU-22517
// This constant defines a limit for the addOnlyClosure to return
// error, to avoid taking a long time for canonical closure expansion.
// Please let us know if you have a reasonable use case that needed
// for a practical Collation rule that needs to increase this limit.
// This value is needed for compiling a rule with eight Hangul syllables such as
// "&a=b쫊쫊쫊쫊쫊쫊쫊" without error, which should be more than realistic
// usage.
static constexpr int32_t kClosureLoopLimit = 3000;

uint32_t
CollationBuilder::addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
                                 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
                                 UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return ce32; }

    int32_t loop = 0;
    // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.)
    if(nfdPrefix.isEmpty()) {
        CanonicalIterator stringIter(nfdString, errorCode);
        if(U_FAILURE(errorCode)) { return ce32; }
        UnicodeString prefix;
        for(;;) {
            UnicodeString str = stringIter.next();
            if(str.isBogus()) { break; }
            if (loop++ > kClosureLoopLimit) {
                // To avoid hang as in ICU-22517, return with error.
                errorCode = U_INPUT_TOO_LONG_ERROR;
                return ce32;
            }
            if(ignoreString(str, errorCode) || str == nfdString) { continue; }
            ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);
            if(U_FAILURE(errorCode)) { return ce32; }
        }
    } else {
        CanonicalIterator prefixIter(nfdPrefix, errorCode);
        CanonicalIterator stringIter(nfdString, errorCode);
        if(U_FAILURE(errorCode)) { return ce32; }
        for(;;) {
            UnicodeString prefix = prefixIter.next();
            if(prefix.isBogus()) { break; }
            if(ignorePrefix(prefix, errorCode)) { continue; }
            UBool samePrefix = prefix == nfdPrefix;
            for(;;) {
                UnicodeString str = stringIter.next();
                if(str.isBogus()) { break; }
                if (loop++ > kClosureLoopLimit) {
                    // To avoid hang as in ICU-22517, return with error.
                    errorCode = U_INPUT_TOO_LONG_ERROR;
                    return ce32;
                }
                if(ignoreString(str, errorCode) || (samePrefix && str == nfdString)) { continue; }
                ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);
                if(U_FAILURE(errorCode)) { return ce32; }
            }
            stringIter.reset();
        }
    }
    return ce32;
}

void
CollationBuilder::addTailComposites(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
                                    UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }

    // Look for the last starter in the NFD string.
    UChar32 lastStarter;
    int32_t indexAfterLastStarter = nfdString.length();
    for(;;) {
        if(indexAfterLastStarter == 0) { return; }  // no starter at all
        lastStarter = nfdString.char32At(indexAfterLastStarter - 1);
        if(nfd.getCombiningClass(lastStarter) == 0) { break; }
        indexAfterLastStarter -= U16_LENGTH(lastStarter);
    }
    // No closure to Hangul syllables since we decompose them on the fly.
    if(Hangul::isJamoL(lastStarter)) { return; }

    // Are there any composites whose decomposition starts with the lastStarter?
    // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters.
    // We might find some more equivalent mappings here if it did.
    UnicodeSet composites;
    if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; }

    UnicodeString decomp;
    UnicodeString newNFDString, newString;
    int64_t newCEs[Collation::MAX_EXPANSION_LENGTH];
    UnicodeSetIterator iter(composites);
    while(iter.next()) {
        U_ASSERT(!iter.isString());
        UChar32 composite = iter.getCodepoint();
        nfd.getDecomposition(composite, decomp);
        if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp,
                                     newNFDString, newString, errorCode)) {
            continue;
        }
        int32_t newCEsLength = dataBuilder->getCEs(nfdPrefix, newNFDString, newCEs, 0);
        if(newCEsLength > Collation::MAX_EXPANSION_LENGTH) {
            // Ignore mappings that we cannot store.
            continue;
        }
        // Note: It is possible that the newCEs do not make use of the mapping
        // for which we are adding the tail composites, in which case we might be adding
        // unnecessary mappings.
        // For example, when we add tail composites for ae^ (^=combining circumflex),
        // UCA discontiguous-contraction matching does not find any matches
        // for ae_^ (_=any combining diacritic below) *unless* there is also
        // a contraction mapping for ae.
        // Thus, if there is no ae contraction, then the ae^ mapping is ignored
        // while fetching the newCEs for ae_^.
        // TODO: Try to detect this effectively.
        // (Alternatively, print a warning when prefix contractions are missing.)

        // We do not need an explicit mapping for the NFD strings.
        // It is fine if the NFD input collates like this via a sequence of mappings.
        // It also saves a little bit of space, and may reduce the set of characters with contractions.
        uint32_t ce32 = addIfDifferent(nfdPrefix, newString,
                                       newCEs, newCEsLength, Collation::UNASSIGNED_CE32, errorCode);
        if(ce32 != Collation::UNASSIGNED_CE32) {
            // was different, was added
            addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32, errorCode);
        }
    }
}

UBool
CollationBuilder::mergeCompositeIntoString(const UnicodeString &nfdString,
                                           int32_t indexAfterLastStarter,
                                           UChar32 composite, const UnicodeString &decomp,
                                           UnicodeString &newNFDString, UnicodeString &newString,
                                           UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return false; }
    U_ASSERT(nfdString.char32At(indexAfterLastStarter - 1) == decomp.char32At(0));
    int32_t lastStarterLength = decomp.moveIndex32(0, 1);
    if(lastStarterLength == decomp.length()) {
        // Singleton decompositions should be found by addWithClosure()
        // and the CanonicalIterator, so we can ignore them here.
        return false;
    }
    if(nfdString.compare(indexAfterLastStarter, 0x7fffffff,
                         decomp, lastStarterLength, 0x7fffffff) == 0) {
        // same strings, nothing new to be found here
        return false;
    }

    // Make new FCD strings that combine a composite, or its decomposition,
    // into the nfdString's last starter and the combining marks following it.
    // Make an NFD version, and a version with the composite.
    newNFDString.setTo(nfdString, 0, indexAfterLastStarter);
    newString.setTo(nfdString, 0, indexAfterLastStarter - lastStarterLength).append(composite);

    // The following is related to discontiguous contraction matching,
    // but builds only FCD strings (or else returns false).
    int32_t sourceIndex = indexAfterLastStarter;
    int32_t decompIndex = lastStarterLength;
    // Small optimization: We keep the source character across loop iterations
    // because we do not always consume it,
    // and then need not fetch it again nor look up its combining class again.
    UChar32 sourceChar = U_SENTINEL;
    // The cc variables need to be declared before the loop so that at the end
    // they are set to the last combining classes seen.
    uint8_t sourceCC = 0;
    uint8_t decompCC = 0;
    for(;;) {
        if(sourceChar < 0) {
            if(sourceIndex >= nfdString.length()) { break; }
            sourceChar = nfdString.char32At(sourceIndex);
            sourceCC = nfd.getCombiningClass(sourceChar);
            U_ASSERT(sourceCC != 0);
        }
        // We consume a decomposition character in each iteration.
        if(decompIndex >= decomp.length()) { break; }
        UChar32 decompChar = decomp.char32At(decompIndex);
        decompCC = nfd.getCombiningClass(decompChar);
        // Compare the two characters and their combining classes.
        if(decompCC == 0) {
            // Unable to merge because the source contains a non-zero combining mark
            // but the composite's decomposition contains another starter.
            // The strings would not be equivalent.
            return false;
        } else if(sourceCC < decompCC) {
            // Composite + sourceChar would not be FCD.
            return false;
        } else if(decompCC < sourceCC) {
            newNFDString.append(decompChar);
            decompIndex += U16_LENGTH(decompChar);
        } else if(decompChar != sourceChar) {
            // Blocked because same combining class.
            return false;
        } else {  // match: decompChar == sourceChar
            newNFDString.append(decompChar);
            decompIndex += U16_LENGTH(decompChar);
            sourceIndex += U16_LENGTH(decompChar);
            sourceChar = U_SENTINEL;
        }
    }
    // We are at the end of at least one of the two inputs.
    if(sourceChar >= 0) {  // more characters from nfdString but not from decomp
        if(sourceCC < decompCC) {
            // Appending the next source character to the composite would not be FCD.
            return false;
        }
        newNFDString.append(nfdString, sourceIndex, 0x7fffffff);
        newString.append(nfdString, sourceIndex, 0x7fffffff);
    } else if(decompIndex < decomp.length()) {  // more characters from decomp, not from nfdString
        newNFDString.append(decomp, decompIndex, 0x7fffffff);
    }
    U_ASSERT(nfd.isNormalized(newNFDString, errorCode));
    U_ASSERT(fcd.isNormalized(newString, errorCode));
    U_ASSERT(nfd.normalize(newString, errorCode) == newNFDString);  // canonically equivalent
    return true;
}

UBool
CollationBuilder::ignorePrefix(const UnicodeString &s, UErrorCode &errorCode) const {
    // Do not map non-FCD prefixes.
    return !isFCD(s, errorCode);
}

UBool
CollationBuilder::ignoreString(const UnicodeString &s, UErrorCode &errorCode) const {
    // Do not map non-FCD strings.
    // Do not map strings that start with Hangul syllables: We decompose those on the fly.
    return !isFCD(s, errorCode) || Hangul::isHangul(s.charAt(0));
}

UBool
CollationBuilder::isFCD(const UnicodeString &s, UErrorCode &errorCode) const {
    return U_SUCCESS(errorCode) && fcd.isNormalized(s, errorCode);
}

void
CollationBuilder::closeOverComposites(UErrorCode &errorCode) {
    UnicodeSet composites(UNICODE_STRING_SIMPLE("[:NFD_QC=N:]"), errorCode);  // Java: static final
    if(U_FAILURE(errorCode)) { return; }
    // Hangul is decomposed on the fly during collation.
    composites.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
    UnicodeString prefix;  // empty
    UnicodeString nfdString;
    UnicodeSetIterator iter(composites);
    while(iter.next()) {
        U_ASSERT(!iter.isString());
        nfd.getDecomposition(iter.getCodepoint(), nfdString);
        cesLength = dataBuilder->getCEs(nfdString, ces, 0);
        if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
            // Too many CEs from the decomposition (unusual), ignore this composite.
            // We could add a capacity parameter to getCEs() and reallocate if necessary.
            // However, this can only really happen in contrived cases.
            continue;
        }
        const UnicodeString &composite(iter.getString());
        addIfDifferent(prefix, composite, ces, cesLength, Collation::UNASSIGNED_CE32, errorCode);
    }
}

uint32_t
CollationBuilder::addIfDifferent(const UnicodeString &prefix, const UnicodeString &str,
                                 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
                                 UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return ce32; }
    int64_t oldCEs[Collation::MAX_EXPANSION_LENGTH];
    int32_t oldCEsLength = dataBuilder->getCEs(prefix, str, oldCEs, 0);
    if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) {
        if(ce32 == Collation::UNASSIGNED_CE32) {
            ce32 = dataBuilder->encodeCEs(newCEs, newCEsLength, errorCode);
        }
        dataBuilder->addCE32(prefix, str, ce32, errorCode);
    }
    return ce32;
}

UBool
CollationBuilder::sameCEs(const int64_t ces1[], int32_t ces1Length,
                          const int64_t ces2[], int32_t ces2Length) {
    if(ces1Length != ces2Length) {
        return false;
    }
    U_ASSERT(ces1Length <= Collation::MAX_EXPANSION_LENGTH);
    for(int32_t i = 0; i < ces1Length; ++i) {
        if(ces1[i] != ces2[i]) { return false; }
    }
    return true;
}

#ifdef DEBUG_COLLATION_BUILDER

uint32_t
alignWeightRight(uint32_t w) {
    if(w != 0) {
        while((w & 0xff) == 0) { w >>= 8; }
    }
    return w;
}

#endif

void
CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }

    CollationWeights primaries, secondaries, tertiaries;
    int64_t *nodesArray = nodes.getBuffer();
#ifdef DEBUG_COLLATION_BUILDER
        puts("\nCollationBuilder::makeTailoredCEs()");
#endif

    for(int32_t rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) {
        int32_t i = rootPrimaryIndexes.elementAti(rpi);
        int64_t node = nodesArray[i];
        uint32_t p = weight32FromNode(node);
        uint32_t s = p == 0 ? 0 : Collation::COMMON_WEIGHT16;
        uint32_t t = s;
        uint32_t q = 0;
        UBool pIsTailored = false;
        UBool sIsTailored = false;
        UBool tIsTailored = false;
#ifdef DEBUG_COLLATION_BUILDER
        printf("\nprimary     %lx\n", (long)alignWeightRight(p));
#endif
        int32_t pIndex = p == 0 ? 0 : rootElements.findPrimary(p);
        int32_t nextIndex = nextIndexFromNode(node);
        while(nextIndex != 0) {
            i = nextIndex;
            node = nodesArray[i];
            nextIndex = nextIndexFromNode(node);
            int32_t strength = strengthFromNode(node);
            if(strength == UCOL_QUATERNARY) {
                U_ASSERT(isTailoredNode(node));
#ifdef DEBUG_COLLATION_BUILDER
                printf("      quat+     ");
#endif
                if(q == 3) {
                    errorCode = U_BUFFER_OVERFLOW_ERROR;
                    errorReason = "quaternary tailoring gap too small";
                    return;
                }
                ++q;
            } else {
                if(strength == UCOL_TERTIARY) {
                    if(isTailoredNode(node)) {
#ifdef DEBUG_COLLATION_BUILDER
                        printf("    ter+        ");
#endif
                        if(!tIsTailored) {
                            // First tailored tertiary node for [p, s].
                            int32_t tCount = countTailoredNodes(nodesArray, nextIndex,
                                                                UCOL_TERTIARY) + 1;
                            uint32_t tLimit;
                            if(t == 0) {
                                // Gap at the beginning of the tertiary CE range.
                                t = rootElements.getTertiaryBoundary() - 0x100;
                                tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK;
                            } else if(!pIsTailored && !sIsTailored) {
                                // p and s are root weights.
                                tLimit = rootElements.getTertiaryAfter(pIndex, s, t);
                            } else if(t == Collation::BEFORE_WEIGHT16) {
                                tLimit = Collation::COMMON_WEIGHT16;
                            } else {
                                // [p, s] is tailored.
                                U_ASSERT(t == Collation::COMMON_WEIGHT16);
                                tLimit = rootElements.getTertiaryBoundary();
                            }
                            U_ASSERT(tLimit == 0x4000 || (tLimit & ~Collation::ONLY_TERTIARY_MASK) == 0);
                            tertiaries.initForTertiary();
                            if(!tertiaries.allocWeights(t, tLimit, tCount)) {
                                errorCode = U_BUFFER_OVERFLOW_ERROR;
                                errorReason = "tertiary tailoring gap too small";
                                return;
                            }
                            tIsTailored = true;
                        }
                        t = tertiaries.nextWeight();
                        U_ASSERT(t != 0xffffffff);
                    } else {
                        t = weight16FromNode(node);
                        tIsTailored = false;
#ifdef DEBUG_COLLATION_BUILDER
                        printf("    ter     %lx\n", (long)alignWeightRight(t));
#endif
                    }
                } else {
                    if(strength == UCOL_SECONDARY) {
                        if(isTailoredNode(node)) {
#ifdef DEBUG_COLLATION_BUILDER
                            printf("  sec+          ");
#endif
                            if(!sIsTailored) {
                                // First tailored secondary node for p.
                                int32_t sCount = countTailoredNodes(nodesArray, nextIndex,
                                                                    UCOL_SECONDARY) + 1;
                                uint32_t sLimit;
                                if(s == 0) {
                                    // Gap at the beginning of the secondary CE range.
                                    s = rootElements.getSecondaryBoundary() - 0x100;
                                    sLimit = rootElements.getFirstSecondaryCE() >> 16;
                                } else if(!pIsTailored) {
                                    // p is a root primary.
                                    sLimit = rootElements.getSecondaryAfter(pIndex, s);
                                } else if(s == Collation::BEFORE_WEIGHT16) {
                                    sLimit = Collation::COMMON_WEIGHT16;
                                } else {
                                    // p is a tailored primary.
                                    U_ASSERT(s == Collation::COMMON_WEIGHT16);
                                    sLimit = rootElements.getSecondaryBoundary();
                                }
                                if(s == Collation::COMMON_WEIGHT16) {
                                    // Do not tailor into the getSortKey() range of
                                    // compressed common secondaries.
                                    s = rootElements.getLastCommonSecondary();
                                }
                                secondaries.initForSecondary();
                                if(!secondaries.allocWeights(s, sLimit, sCount)) {
                                    errorCode = U_BUFFER_OVERFLOW_ERROR;
                                    errorReason = "secondary tailoring gap too small";
#ifdef DEBUG_COLLATION_BUILDER
                                    printf("!secondaries.allocWeights(%lx, %lx, sCount=%ld)\n",
                                           (long)alignWeightRight(s), (long)alignWeightRight(sLimit),
                                           (long)alignWeightRight(sCount));
#endif
                                    return;
                                }
                                sIsTailored = true;
                            }
                            s = secondaries.nextWeight();
                            U_ASSERT(s != 0xffffffff);
                        } else {
                            s = weight16FromNode(node);
                            sIsTailored = false;
#ifdef DEBUG_COLLATION_BUILDER
                            printf("  sec       %lx\n", (long)alignWeightRight(s));
#endif
                        }
                    } else /* UCOL_PRIMARY */ {
                        U_ASSERT(isTailoredNode(node));
#ifdef DEBUG_COLLATION_BUILDER
                        printf("pri+            ");
#endif
                        if(!pIsTailored) {
                            // First tailored primary node in this list.
                            int32_t pCount = countTailoredNodes(nodesArray, nextIndex,
                                                                UCOL_PRIMARY) + 1;
                            UBool isCompressible = baseData->isCompressiblePrimary(p);
                            uint32_t pLimit =
                                rootElements.getPrimaryAfter(p, pIndex, isCompressible);
                            primaries.initForPrimary(isCompressible);
                            if(!primaries.allocWeights(p, pLimit, pCount)) {
                                errorCode = U_BUFFER_OVERFLOW_ERROR;  // TODO: introduce a more specific UErrorCode?
                                errorReason = "primary tailoring gap too small";
                                return;
                            }
                            pIsTailored = true;
                        }
                        p = primaries.nextWeight();
                        U_ASSERT(p != 0xffffffff);
                        s = Collation::COMMON_WEIGHT16;
                        sIsTailored = false;
                    }
                    t = s == 0 ? 0 : Collation::COMMON_WEIGHT16;
                    tIsTailored = false;
                }
                q = 0;
            }
            if(isTailoredNode(node)) {
                nodesArray[i] = Collation::makeCE(p, s, t, q);
#ifdef DEBUG_COLLATION_BUILDER
                printf("%016llx\n", (long long)nodesArray[i]);
#endif
            }
        }
    }
}

int32_t
CollationBuilder::countTailoredNodes(const int64_t *nodesArray, int32_t i, int32_t strength) {
    int32_t count = 0;
    for(;;) {
        if(i == 0) { break; }
        int64_t node = nodesArray[i];
        if(strengthFromNode(node) < strength) { break; }
        if(strengthFromNode(node) == strength) {
            if(isTailoredNode(node)) {
                ++count;
            } else {
                break;
            }
        }
        i = nextIndexFromNode(node);
    }
    return count;
}

class CEFinalizer : public CollationDataBuilder::CEModifier {
public:
    CEFinalizer(const int64_t *ces) : finalCEs(ces) {}
    virtual ~CEFinalizer();
    virtual int64_t modifyCE32(uint32_t ce32) const override {
        U_ASSERT(!Collation::isSpecialCE32(ce32));
        if(CollationBuilder::isTempCE32(ce32)) {
            // retain case bits
            return finalCEs[CollationBuilder::indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8);
        } else {
            return Collation::NO_CE;
        }
    }
    virtual int64_t modifyCE(int64_t ce) const override {
        if(CollationBuilder::isTempCE(ce)) {
            // retain case bits
            return finalCEs[CollationBuilder::indexFromTempCE(ce)] | (ce & 0xc000);
        } else {
            return Collation::NO_CE;
        }
    }

private:
    const int64_t *finalCEs;
};

CEFinalizer::~CEFinalizer() {}

void
CollationBuilder::finalizeCEs(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(icu4xMode, errorCode), errorCode);
    if(U_FAILURE(errorCode)) {
        return;
    }
    newBuilder->initForTailoring(baseData, errorCode);
    CEFinalizer finalizer(nodes.getBuffer());
    newBuilder->copyFrom(*dataBuilder, finalizer, errorCode);
    if(U_FAILURE(errorCode)) { return; }
    delete dataBuilder;
    dataBuilder = newBuilder.orphan();
}

int32_t
CollationBuilder::ceStrength(int64_t ce) {
    return
        isTempCE(ce) ? strengthFromTempCE(ce) :
        (ce & INT64_C(0xff00000000000000)) != 0 ? UCOL_PRIMARY :
        (static_cast<uint32_t>(ce) & 0xff000000) != 0 ? UCOL_SECONDARY :
        ce != 0 ? UCOL_TERTIARY :
        UCOL_IDENTICAL;
}

U_NAMESPACE_END

U_NAMESPACE_USE

U_CAPI UCollator * U_EXPORT2
ucol_openRules(const char16_t *rules, int32_t rulesLength,
               UColAttributeValue normalizationMode, UCollationStrength strength,
               UParseError *parseError, UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) { return nullptr; }
    if(rules == nullptr && rulesLength != 0) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    RuleBasedCollator *coll = new RuleBasedCollator();
    if(coll == nullptr) {
        *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    UnicodeString r(rulesLength < 0, rules, rulesLength);
    coll->internalBuildTailoring(r, strength, normalizationMode, parseError, nullptr, *pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        delete coll;
        return nullptr;
    }
    return coll->toUCollator();
}

static const int32_t internalBufferSize = 512;

// The @internal ucol_getUnsafeSet() was moved here from ucol_sit.cpp
// because it calls UnicodeSet "builder" code that depends on all Unicode properties,
// and the rest of the collation "runtime" code only depends on normalization.
// This function is not related to the collation builder,
// but it did not seem worth moving it into its own .cpp file,
// nor rewriting it to use lower-level UnicodeSet and Normalizer2Impl methods.
U_CAPI int32_t U_EXPORT2
ucol_getUnsafeSet( const UCollator *coll,
                  USet *unsafe,
                  UErrorCode *status)
{
    char16_t buffer[internalBufferSize];
    int32_t len = 0;

    uset_clear(unsafe);

    // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant
    static const char16_t cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d,
                                    0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 };

    // add chars that fail the fcd check
    uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status);

    // add lead/trail surrogates
    // (trail surrogates should need to be unsafe only if the caller tests for UTF-16 code *units*,
    // not when testing code *points*)
    uset_addRange(unsafe, 0xd800, 0xdfff);

    USet *contractions = uset_open(0,0);

    int32_t i = 0, j = 0;
    ucol_getContractionsAndExpansions(coll, contractions, nullptr, false, status);
    int32_t contsSize = uset_size(contractions);
    UChar32 c = 0;
    // Contraction set consists only of strings
    // to get unsafe code points, we need to
    // break the strings apart and add them to the unsafe set
    for(i = 0; i < contsSize; i++) {
        len = uset_getItem(contractions, i, nullptr, nullptr, buffer, internalBufferSize, status);
        if(len > 0) {
            j = 0;
            while(j < len) {
                U16_NEXT(buffer, j, len, c);
                if(j < len) {
                    uset_add(unsafe, c);
                }
            }
        }
    }

    uset_close(contractions);

    return uset_size(unsafe);
}

#endif  // !UCONFIG_NO_COLLATION

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.24 Sekunden (vorverarbeitet am 2026-04-26) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.