/*
 * File coleitr.cpp
 *
 * Created by: Helena Shih
 *
 * Modification History:
 *
 *  Date       Name        Description
 *
 *  6/23/97    helena      Adding comments to make code more readable.
 *  08/03/98   erm         Synched with 1.2 version of CollationElementIterator.java
 *  12/10/99   aliu        Ported Thai collation support from Java.
 *  01/25/01   swquek      Modified to a C++ wrapper calling C APIs (ucoliter.h)
 *  02/19/01   swquek      Removed CollationElementIterator() since it is
 *                         private constructor and no calls are made to it
 *  2012-2014  markus      Rewritten in C++ again.
 */
/**
 * Reports the current offset of this iterator.
 *
 * While iterating backwards (dir_ < 0) with a non-empty offsets_ vector, the
 * answer is looked up in offsets_; in every other case it is whatever the
 * underlying iter_ reports.
 *
 * @return the entry read from offsets_, or iter_->getOffset().
 */
int32_t CollationElementIterator::getOffset() const
{
    const bool useRecordedOffsets =
        dir_ < 0 && offsets_ != nullptr && !offsets_->isEmpty();
    if (!useRecordedOffsets) {
        return iter_->getOffset();
    }

    // CollationIterator::previousCE() decrements the CEs length
    // while it pops CEs from its internal buffer.
    // When the other half of a 64-bit CE is still pending (otherHalf_ != 0),
    // look one slot further so that the trailing CE offset is returned.
    const int32_t index = iter_->getCEsLength() + (otherHalf_ != 0 ? 1 : 0);
    U_ASSERT(index < offsets_->size());
    return offsets_->elementAti(index);
}
/**
 * Get the ordering priority of the next character in the string.
 *
 * Each 64-bit CE from the underlying iterator is split into up to two
 * old-style 32-bit CEs. The first half is returned right away; a non-zero
 * second half is stored in otherHalf_ (tagged as a continuation CE) and
 * handed out by the following call.
 *
 * @param status the error code status. If it already indicates a failure the
 *               function returns immediately. It is set to
 *               U_INVALID_STATE_ERROR when the iterator is currently moving
 *               backwards (dir_ < 0).
 * @return the next character's ordering. Returns NULLORDER if an error has
 *         occurred or if the end of string has been reached
 */
int32_t CollationElementIterator::next(UErrorCode& status)
{
    if (U_FAILURE(status)) { return NULLORDER; }
    if (dir_ > 1) {
        // Continue forward iteration. Test this first.
        if (otherHalf_ != 0) {
            uint32_t oh = otherHalf_;
            otherHalf_ = 0;
            return oh;
        }
    } else if (dir_ == 1) {
        // next() after setOffset()
        dir_ = 2;
    } else if (dir_ == 0) {
        // The iter_ is already reset to the start of the text.
        dir_ = 2;
    } else /* dir_ < 0 */ {
        // illegal change of direction
        status = U_INVALID_STATE_ERROR;
        return NULLORDER;
    }
    // No need to keep all CEs in the buffer when we iterate.
    iter_->clearCEsIfNoneRemaining();
    int64_t ce = iter_->nextCE(status);
    if (ce == Collation::NO_CE) { return NULLORDER; }
    // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
    uint32_t p = static_cast<uint32_t>(ce >> 32);
    uint32_t lower32 = static_cast<uint32_t>(ce);
    uint32_t firstHalf = getFirstHalf(p, lower32);
    uint32_t secondHalf = getSecondHalf(p, lower32);
    if (secondHalf != 0) {
        otherHalf_ = secondHalf | 0xc0;  // continuation CE
    }
    return firstHalf;
}
/**
 * Get the ordering priority of the previous collation element in the string.
 *
 * Each 64-bit CE from the underlying iterator is split into up to two
 * old-style 32-bit CEs. Going backwards, the second half (tagged as a
 * continuation CE) is returned first and the first half is stored in
 * otherHalf_ for the following call.
 *
 * @param status the error code status. If it already indicates a failure the
 *               function returns immediately. It is set to
 *               U_INVALID_STATE_ERROR when the iterator is currently moving
 *               forwards (dir_ > 1), and to U_MEMORY_ALLOCATION_ERROR when
 *               the offsets vector cannot be allocated.
 * @return the previous element's ordering. Returns NULLORDER if an error has
 *         occurred or if the start of string has been reached.
 */
int32_t CollationElementIterator::previous(UErrorCode& status)
{
    if (U_FAILURE(status)) { return NULLORDER; }
    if (dir_ < 0) {
        // Continue backwards iteration. Test this first.
        if (otherHalf_ != 0) {
            uint32_t oh = otherHalf_;
            otherHalf_ = 0;
            return oh;
        }
    } else if (dir_ == 0) {
        // First call after reset()/setText(): start from the end of the text.
        iter_->resetToOffset(string_.length());
        dir_ = -1;
    } else if (dir_ == 1) {
        // previous() after setOffset()
        dir_ = -1;
    } else /* dir_ > 1 */ {
        // illegal change of direction
        status = U_INVALID_STATE_ERROR;
        return NULLORDER;
    }
    // The offsets vector is created lazily, on the first backward step.
    if (offsets_ == nullptr) {
        offsets_ = new UVector32(status);
        if (offsets_ == nullptr) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return NULLORDER;
        }
    }
    // If we already have expansion CEs, then we also have offsets.
    // Otherwise remember the trailing offset in case we need to
    // write offsets for an artificial expansion.
    int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0;
    int64_t ce = iter_->previousCE(*offsets_, status);
    if (ce == Collation::NO_CE) { return NULLORDER; }
    // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
    uint32_t p = static_cast<uint32_t>(ce >> 32);
    uint32_t lower32 = static_cast<uint32_t>(ce);
    uint32_t firstHalf = getFirstHalf(p, lower32);
    uint32_t secondHalf = getSecondHalf(p, lower32);
    if (secondHalf != 0) {
        if (offsets_->isEmpty()) {
            // When we convert a single 64-bit CE into two 32-bit CEs,
            // we need to make this artificial expansion behave like a normal expansion.
            // See CollationIterator::previousCE().
            offsets_->addElement(iter_->getOffset(), status);
            offsets_->addElement(limitOffset, status);
        }
        otherHalf_ = firstHalf;
        return secondHalf | 0xc0;  // continuation CE
    }
    return firstHalf;
}
/**
 * Puts the cursor back at the beginning of the string.
 *
 * The underlying iterator is rewound to offset 0, and both the direction
 * state and any pending second CE half are cleared.
 */
void CollationElementIterator::reset()
{
    iter_->resetToOffset(0);
    dir_ = 0;
    otherHalf_ = 0;
}
/**
 * Positions the iterator at the given text offset.
 *
 * When newOffset lies strictly inside the string and sits on an unsafe
 * character, the position actually used is the last safe offset that is
 * no greater than newOffset. Afterwards the pending CE half is cleared and
 * the direction state is set to 1 ("just after setOffset()").
 *
 * @param newOffset the requested offset into the text.
 * @param status    the error code status; nothing is done if it already
 *                  indicates a failure, and the function returns early if
 *                  fetching a CE fails while searching for the safe offset.
 */
void CollationElementIterator::setOffset(int32_t newOffset,
                                         UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return;
    }

    const bool strictlyInside = 0 < newOffset && newOffset < string_.length();
    if (strictlyInside) {
        // Walk backwards while the character at the candidate offset is unsafe.
        int32_t start = newOffset;
        for (;;) {
            const char16_t unit = string_.charAt(start);
            if (!rbc_->isUnsafe(unit)) {
                break;
            }
            if (U16_IS_LEAD(unit) && !rbc_->isUnsafe(string_.char32At(start))) {
                break;
            }
            // Back up to before this unsafe character.
            --start;
            if (start <= 0) {
                break;
            }
        }

        if (start < newOffset) {
            // We might have backed up more than necessary.
            // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
            // but for text "chu" setOffset(2) should remain at 2
            // although we initially back up to offset 0.
            // Find the last safe offset no greater than newOffset by iterating forward.
            int32_t lastSafe = start;
            int32_t reached = start;
            do {
                iter_->resetToOffset(lastSafe);
                // Consume CEs until the iterator actually moves past lastSafe.
                do {
                    iter_->nextCE(status);
                    if (U_FAILURE(status)) {
                        return;
                    }
                    reached = iter_->getOffset();
                } while (reached == lastSafe);
                if (reached <= newOffset) {
                    lastSafe = reached;
                }
            } while (reached < newOffset);
            newOffset = lastSafe;
        }
    }

    iter_->resetToOffset(newOffset);
    otherHalf_ = 0;
    dir_ = 1;
}
/** * Sets the source to the new source string.
*/ void CollationElementIterator::setText(const UnicodeString& source,
UErrorCode& status)
{ if (U_FAILURE(status)) { return;
}
string_ = source; const char16_t *s = string_.getBuffer();
CollationIterator *newIter;
UBool numeric = rbc_->settings->isNumeric(); if (rbc_->settings->dontCheckFCD()) {
newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
} else {
newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
} if (newIter == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR; return;
} delete iter_;
iter_ = newIter;
otherHalf_ = 0;
dir_ = 0;
}
// Sets the source to the new character iterator. void CollationElementIterator::setText(CharacterIterator& source,
UErrorCode& status)
{ if (U_FAILURE(status)) return;
int32_t CollationElementIterator::strengthOrder(int32_t order) const
{
UColAttributeValue s = static_cast<UColAttributeValue>(rbc_->settings->getStrength()); // Mask off the unwanted differences. if (s == UCOL_PRIMARY) {
order &= 0xffff0000;
} elseif (s == UCOL_SECONDARY) {
order &= 0xffffff00;
}
/** * This is the "real" constructor for this class; it constructs an iterator * over the source text using the specified collator
*/
CollationElementIterator::CollationElementIterator( const UnicodeString &source, const RuleBasedCollator *coll,
UErrorCode &status)
: iter_(nullptr), rbc_(coll), otherHalf_(0), dir_(0), offsets_(nullptr) {
setText(source, status);
}
/** * This is the "real" constructor for this class; it constructs an iterator over * the source text using the specified collator
*/
CollationElementIterator::CollationElementIterator( const CharacterIterator &source, const RuleBasedCollator *coll,
UErrorCode &status)
: iter_(nullptr), rbc_(coll), otherHalf_(0), dir_(0), offsets_(nullptr) { // We only call source.getText() which should be const anyway.
setText(const_cast<CharacterIterator &>(source), status);
}
class MaxExpSink : public ContractionsAndExpansions::CESink { public:
MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {} virtual ~MaxExpSink(); virtualvoid handleCE(int64_t /*ce*/) override {} virtualvoid handleExpansion(const int64_t ces[], int32_t length) override { if (length <= 1) { // We do not need to add single CEs into the map. return;
}
int32_t count = 0; // number of CE "halves" for (int32_t i = 0; i < length; ++i) {
count += ceNeedsTwoParts(ces[i]) ? 2 : 1;
} // last "half" of the last CE
int64_t ce = ces[length - 1];
uint32_t p = static_cast<uint32_t>(ce >> 32);
uint32_t lower32 = static_cast<uint32_t>(ce);
uint32_t lastHalf = getSecondHalf(p, lower32); if (lastHalf == 0) {
lastHalf = getFirstHalf(p, lower32);
U_ASSERT(lastHalf != 0);
} else {
lastHalf |= 0xc0; // old-style continuation CE
} if (count > uhash_igeti(maxExpansions, static_cast<int32_t>(lastHalf))) {
uhash_iputi(maxExpansions, static_cast<int32_t>(lastHalf), count, &errorCode);
}
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.