/** * Like Java Collections.binarySearch(List, String, Comparator). * * @return the index>=0 where the item was found, * or the index<0 for inserting the string at ~index in sorted order
*/
int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) { if (list.size() == 0) { return ~0; }
int32_t start = 0;
int32_t limit = list.size(); for (;;) {
int32_t i = (start + limit) / 2; const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i));
UErrorCode errorCode = U_ZERO_ERROR;
UCollationResult cmp = coll.compare(s, *si, errorCode); if (cmp == UCOL_EQUAL) { return i;
} elseif (cmp < 0) { if (i == start) { return ~start; // insert s before *si
}
limit = i;
} else { if (i == start) { return ~(start + 1); // insert s after *si
}
start = i;
}
}
}
} // namespace
// The BucketList is not in the anonymous namespace because only Clang // seems to support its use in other classes from there. // However, we also don't need U_I18N_API because it is not used from outside the i18n library. class BucketList : public UObject { public:
BucketList(UVector *bucketList, UVector *publicBucketList)
: bucketList_(bucketList), immutableVisibleList_(publicBucketList) {
int32_t displayIndex = 0; for (int32_t i = 0; i < publicBucketList->size(); ++i) {
getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++;
}
}
// The virtual destructor must not be inline. // See ticket #8454 for details. virtual ~BucketList();
AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return nullptr; } // In C++, the ImmutableIndex must own its copy of the BucketList, // even if it contains no records, for proper memory management. // We could clone the buckets_ if they are not nullptr, // but that would be worth it only if this method is called multiple times, // or called after using the old-style bucket iterator API.
LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode));
LocalPointer<RuleBasedCollator> coll(collatorPrimaryOnly_->clone()); if (immutableBucketList.isNull() || coll.isNull()) {
errorCode = U_MEMORY_ALLOCATION_ERROR; return nullptr;
}
ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias()); if (immIndex == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR; return nullptr;
} // The ImmutableIndex adopted its parameter objects.
immutableBucketList.orphan();
coll.orphan(); return immIndex;
}
// We make a sorted array of elements. // Some of the input may be redundant. // That is, we might have c, ch, d, where "ch" sorts just like "c", "h". // We filter out those cases.
UnicodeSetIterator iter(*initialLabels_); while (U_SUCCESS(errorCode) && iter.next()) { const UnicodeString *item = &iter.getString();
LocalPointer<UnicodeString> ownedItem;
UBool checkDistinct;
int32_t itemLength = item->length(); if (!item->hasMoreChar32Than(0, itemLength, 1)) {
checkDistinct = false;
} elseif(item->charAt(itemLength - 1) == 0x2a && // '*'
item->charAt(itemLength - 2) != 0x2a) { // Use a label if it is marked with one trailing star, // even if the label string sorts the same when all contractions are suppressed.
ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
item = ownedItem.getAlias(); if (item == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR; return;
}
checkDistinct = false;
} else {
checkDistinct = true;
} if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) { // Ignore a primary-ignorable or non-alphabetic index character.
} elseif (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) { // Ignore an index character that will land in the overflow bucket.
} elseif (checkDistinct &&
collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) { // Ignore a multi-code point index character that does not sort distinctly // from the sequence of its separate characters.
} else {
int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_); if (insertionPoint < 0) {
indexCharacters.insertElementAt(
ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
} else { const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint); if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
indexCharacters.setElementAt(
ownedString(*item, ownedItem, errorCode), insertionPoint);
}
}
}
} if (U_FAILURE(errorCode)) { return; }
// if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
int32_t size = indexCharacters.size() - 1; if (size > maxLabelCount_) {
int32_t count = 0;
int32_t old = -1; for (int32_t i = 0; i < indexCharacters.size();) {
++count;
int32_t bump = count * maxLabelCount_ / size; if (bump == old) {
indexCharacters.removeElementAt(i);
} else {
old = bump;
++i;
}
}
}
}
// fix up the list, adding underflow, additions, overflow // Insert inflow labels as needed.
int32_t scriptIndex = -1; const UnicodeString *scriptUpperBoundary = &emptyString_; for (int32_t i = 0; i < indexCharacters.size(); ++i) {
UnicodeString ¤t = *getString(indexCharacters, i); if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) { // We crossed the script boundary into a new script. const UnicodeString &inflowBoundary = *scriptUpperBoundary;
UBool skippedScript = false; for (;;) {
scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex); if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) { break;
}
skippedScript = true;
} if (skippedScript && bucketList->size() > 1) { // We are skipping one or more scripts, // and we are not just getting out of the underflow label.
bucket.adoptInsteadAndCheckErrorCode( new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW), errorCode);
bucketList->adoptElement(bucket.orphan(), errorCode); if (U_FAILURE(errorCode)) { return nullptr; }
}
} // Add a bucket with the current label.
bucket.adoptInsteadAndCheckErrorCode( new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL), errorCode);
bucketList->adoptElement(bucket.orphan(), errorCode); if (U_FAILURE(errorCode)) { return nullptr; } // Remember ASCII and Pinyin buckets for Pinyin redirects.
char16_t c; if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z
asciiBuckets[c - 0x41] = static_cast<Bucket*>(bucketList->lastElement());
} elseif (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) &&
0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {
pinyinBuckets[c - 0x41] = static_cast<Bucket*>(bucketList->lastElement());
hasPinyin = true;
} // Check for multiple primary weights. if (!current.startsWith(BASE, BASE_LENGTH) &&
hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, current,
ces, errorCode) &&
current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) { // "AE-ligature" or "Sch" etc. for (int32_t j = bucketList->size() - 2;; --j) {
Bucket *singleBucket = getBucket(*bucketList, j); if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) { // There is no single-character bucket since the last // underflow or inflow label. break;
} if (singleBucket->displayBucket_ == nullptr &&
!hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop,
singleBucket->lowerBoundary_,
ces, errorCode)) { // Add an invisible bucket that redirects strings greater than the expansion // to the previous single-character bucket. // For example, after ... Q R S Sch we add Sch\uFFFF->S // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
bucket.adoptInsteadAndCheckErrorCode(new Bucket(emptyString_,
UnicodeString(current).append(static_cast<char16_t>(0xFFFF)),
U_ALPHAINDEX_NORMAL),
errorCode); if (U_FAILURE(errorCode)) { return nullptr;
}
bucket->displayBucket_ = singleBucket;
bucketList->adoptElement(bucket.orphan(), errorCode); if (U_FAILURE(errorCode)) { return nullptr; }
hasInvisibleBuckets = true; break;
}
}
}
} if (U_FAILURE(errorCode)) { return nullptr; } if (bucketList->size() == 1) { // No real labels, show only the underflow label.
BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); if (bl == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR; return nullptr;
}
bucketList.orphan(); return bl;
} // overflow bucket
bucket.adoptInsteadAndCheckErrorCode( new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW), errorCode);
bucketList->adoptElement(bucket.orphan(), errorCode); // final if (U_FAILURE(errorCode)) { return nullptr; }
if (hasPinyin) { // Redirect Pinyin buckets.
Bucket *asciiBucket = nullptr; for (int32_t i = 0; i < 26; ++i) { if (asciiBuckets[i] != nullptr) {
asciiBucket = asciiBuckets[i];
} if (pinyinBuckets[i] != nullptr && asciiBucket != nullptr) {
pinyinBuckets[i]->displayBucket_ = asciiBucket;
hasInvisibleBuckets = true;
}
}
}
if (U_FAILURE(errorCode)) { return nullptr; } if (!hasInvisibleBuckets) {
BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); if (bl == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR; return nullptr;
}
bucketList.orphan(); return bl;
} // Merge inflow buckets that are visually adjacent. // Iterate backwards: Merge inflow into overflow rather than the other way around.
int32_t i = bucketList->size() - 1;
Bucket *nextBucket = getBucket(*bucketList, i); while (--i > 0) {
Bucket *bucket = getBucket(*bucketList, i); if (bucket->displayBucket_ != nullptr) { continue; // skip invisible buckets
} if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) { if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
bucket->displayBucket_ = nextBucket; continue;
}
}
nextBucket = bucket;
}
LocalPointer<UVector> publicBucketList(new UVector(errorCode), errorCode); if (U_FAILURE(errorCode)) { return nullptr;
} // Do not call publicBucketList->setDeleter(): // This vector shares its objects with the bucketList. for (int32_t j = 0; j < bucketList->size(); ++j) {
Bucket *bucket = getBucket(*bucketList, j); if (bucket->displayBucket_ == nullptr) {
publicBucketList->addElement(bucket, errorCode);
}
} if (U_FAILURE(errorCode)) { return nullptr; }
BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias()); if (bl == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR; return nullptr;
}
bucketList.orphan();
publicBucketList.orphan(); return bl;
}
/** * Creates an index, and buckets and sorts the list of records into the index.
*/ void AlphabeticIndex::initBuckets(UErrorCode &errorCode) { if (U_FAILURE(errorCode) || buckets_ != nullptr) { return;
}
buckets_ = createBucketList(errorCode); if (U_FAILURE(errorCode) || inputList_ == nullptr || inputList_->isEmpty()) { return;
}
// Sort the records by name. // Stable sort preserves input order of collation duplicates.
inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode);
// Now, we traverse all of the input, which is now sorted. // If the item doesn't go in the current bucket, we find the next bucket that contains it. // This makes the process order n*log(n), since we just sort the list and then do a linear process. // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so // we need to improve it for that case.
Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0);
int32_t bucketIndex = 1;
Bucket *nextBucket; const UnicodeString *upperBoundary; if (bucketIndex < buckets_->bucketList_->size()) {
nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
upperBoundary = &nextBucket->lowerBoundary_;
} else {
nextBucket = nullptr;
upperBoundary = nullptr;
} for (int32_t i = 0; i < inputList_->size(); ++i) {
Record *r = getRecord(*inputList_, i); // if the current bucket isn't the right one, find the one that is // We have a special flag for the last bucket so that we don't look any further while (upperBoundary != nullptr &&
collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) {
currentBucket = nextBucket; // now reset the boundary that we compare against if (bucketIndex < buckets_->bucketList_->size()) {
nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
upperBoundary = &nextBucket->lowerBoundary_;
} else {
upperBoundary = nullptr;
}
} // now put the record into the bucket.
Bucket *bucket = currentBucket; if (bucket->displayBucket_ != nullptr) {
bucket = bucket->displayBucket_;
} if (bucket->records_ == nullptr) {
LocalPointer<UVector> records(new UVector(errorCode), errorCode); if (U_FAILURE(errorCode)) { return;
}
bucket->records_ = records.orphan();
}
bucket->records_->addElement(r, errorCode);
}
}
UnicodeSet exemplars;
ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); if (U_SUCCESS(status)) {
initialLabels_->addAll(exemplars); return;
}
status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR
// The locale data did not include explicit Index characters. // Synthesize a set of them from the locale's standard exemplar characters.
ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); if (U_FAILURE(status)) { return;
}
// question: should we add auxiliary exemplars? if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.isEmpty()) {
exemplars.add(0x61, 0x7A);
} if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables // cut down to small list
exemplars.remove(0xAC00, 0xD7A3).
add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
add(0xD30C).add(0xD558);
} if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block // cut down to small list // make use of the fact that Ethiopic is allocated in 8's, where // the base is 0 mod 8.
UnicodeSet ethiopic(UnicodeString(u"[ሀለሐመሠረሰሸቀቈቐቘበቨተቸኀኈነኘአከኰኸዀወዐዘዠየደዸጀገጐጘጠጨጰጸፀፈፐፘ]"), status);
ethiopic.retainAll(exemplars);
exemplars.remove(u'ሀ', 0x137F).addAll(ethiopic);
}
// Upper-case any that aren't already so. // (We only do this for synthesized index characters.)
UnicodeSetIterator it(exemplars);
UnicodeString upperC; while (it.next()) { const UnicodeString &exemplarC = it.getString();
upperC = exemplarC;
upperC.toUpper(locale);
initialLabels_->add(upperC);
}
}
UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
UnicodeSet contractions;
collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCode); if (U_FAILURE(errorCode) || contractions.isEmpty()) { returnfalse; }
initialLabels_->addAll(contractions);
UnicodeSetIterator iter(contractions); while (iter.next()) { const UnicodeString &s = iter.getString();
U_ASSERT (s.startsWith(BASE, BASE_LENGTH));
char16_t c = s.charAt(s.length() - 1); if (0x41 <= c && c <= 0x5A) { // A-Z // There are Pinyin labels, add ASCII A-Z labels as well.
initialLabels_->add(0x41, 0x5A); // A-Z break;
}
} returntrue;
}
/* * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
*/ staticconst char16_t CGJ = 0x034F;
UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
UnicodeString result; if (item.length() == 0) { return result;
}
int32_t i = 0; for (;;) {
UChar32 cp = item.char32At(i);
result.append(cp);
i = item.moveIndex32(i, 1); if (i >= item.length()) { break;
}
result.append(CGJ);
} return result;
}
bool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const { returnfalse;
}
bool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { returnfalse;
}
if (collator_ == nullptr) {
Collator *coll = Collator::createInstance(*locale, status); if (U_FAILURE(status)) { delete coll; return;
} if (coll == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR; return;
}
collator_ = dynamic_cast<RuleBasedCollator *>(coll); if (collator_ == nullptr) { delete coll;
status = U_UNSUPPORTED_ERROR; return;
}
}
collatorPrimaryOnly_ = collator_->clone(); if (collatorPrimaryOnly_ == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR; return;
}
collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
firstCharsInScripts_ = firstStringsInScript(status); if (U_FAILURE(status)) { return; }
firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status); // Guard against a degenerate collator where // some script boundary strings are primary ignorable. for (;;) { if (U_FAILURE(status)) { return; } if (firstCharsInScripts_->isEmpty()) { // AlphabeticIndex requires some non-ignorable script boundary strings.
status = U_ILLEGAL_ARGUMENT_ERROR; return;
} if (collatorPrimaryOnly_->compare(
*static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)),
emptyString_, status) == UCOL_EQUAL) {
firstCharsInScripts_->removeElementAt(0);
} else { break;
}
}
// Chinese index characters, which are specific to each of the several Chinese tailorings, // take precedence over the single locale data exemplar set per language. if (!addChineseIndexCharacters(status) && locale != nullptr) {
addIndexExemplars(*locale, status);
}
}
UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { if (U_FAILURE(status)) { return nullptr;
}
LocalPointer<UVector> dest(new UVector(status), status); if (U_FAILURE(status)) { return nullptr;
}
dest->setDeleter(uprv_deleteUObject); // Fetch the script-first-primary contractions which are defined in the root collator. // They all start with U+FDD1.
UnicodeSet set;
collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status); if (U_FAILURE(status)) { return nullptr;
} if (set.isEmpty()) {
status = U_UNSUPPORTED_ERROR; return nullptr;
}
UnicodeSetIterator iter(set); while (iter.next()) { const UnicodeString &boundary = iter.getString();
uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1)); if ((gcMask & (U_GC_L_MASK | U_GC_CN_MASK)) == 0) { // Ignore boundaries for the special reordering groups. // Take only those for "real scripts" (where the sample character is a Letter, // and the one for unassigned implicit weights (Cn). continue;
}
LocalPointer<UnicodeString> s(new UnicodeString(boundary), status);
dest->adoptElement(s.orphan(), status); if (U_FAILURE(status)) { return nullptr;
}
} return dest.orphan();
}
namespace {
/** * Returns true if one index character string is "better" than the other. * Shorter NFKD is better, and otherwise NFKD-binary-less-than is * better, and otherwise binary-less-than is better.
*/
UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, const UnicodeString &one, const UnicodeString &other) { // This is called with primary-equal strings, but never with one.equals(other).
UErrorCode status = U_ZERO_ERROR;
UnicodeString n1 = nfkdNormalizer.normalize(one, status);
UnicodeString n2 = nfkdNormalizer.normalize(other, status); if (U_FAILURE(status)) { returnfalse; }
int32_t result = n1.countChar32() - n2.countChar32(); if (result != 0) { return result < 0;
}
result = n1.compareCodePointOrder(n2); if (result != 0) { return result < 0;
} return one.compareCodePointOrder(other) < 0;
}
} // namespace
// // Constructor & Destructor for AlphabeticIndex::Record // // Records are internal only, instances are not directly surfaced in the public API. // This class is mostly struct-like, with all public fields.
UBool AlphabeticIndex::nextRecord(UErrorCode &status) { if (U_FAILURE(status)) { returnfalse;
} if (currentBucket_ == nullptr) { // We are trying to iterate over the items in a bucket, but there is no // current bucket from the enumeration of buckets.
status = U_INVALID_STATE_ERROR; returnfalse;
} if (buckets_ == nullptr) {
status = U_ENUM_OUT_OF_SYNC_ERROR; returnfalse;
} if (currentBucket_->records_ == nullptr) { returnfalse;
}
++itemsIterIndex_; if (itemsIterIndex_ >= currentBucket_->records_->size()) {
itemsIterIndex_ = currentBucket_->records_->size(); returnfalse;
} returntrue;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.