/* * List of offsets from the current position from where to try matching * a code point or a string. * Store offsets rather than indexes to simplify the code and use the same list * for both increments (in span()) and decrements (in spanBack()). * * Assumption: The maximum offset is limited, and the offsets that are stored * at any one time are relatively dense, that is, there are normally no gaps of * hundreds or thousands of offset values. * * The implementation uses a circular buffer of byte flags, * each indicating whether the corresponding offset is in the list. * This avoids inserting into a sorted list of offsets (or absolute indexes) and * physically moving part of the list. * * Note: In principle, the caller should setMaxLength() to the maximum of the * max string length and U16_LENGTH/U8_LENGTH to account for * "long" single code points. * However, this implementation uses at least a staticList with more than * U8_LENGTH entries anyway. * * Note: If maxLength were guaranteed to be no more than 32 or 64, * the list could be stored as bit flags in a single integer. * Rather than handling a circular buffer with a start list index, * the integer would simply be shifted when lower offsets are removed. * UnicodeSet does not have a limit on the lengths of strings.
*/ class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory. public:
OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
// Call exactly once if the list is to be used. void setMaxLength(int32_t maxLength) { if (maxLength <= static_cast<int32_t>(sizeof(staticList))) {
capacity = static_cast<int32_t>(sizeof(staticList));
} else {
UBool* l = static_cast<UBool*>(uprv_malloc(maxLength)); if(l!=nullptr) {
list=l;
capacity=maxLength;
}
}
uprv_memset(list, 0, capacity);
}
// Reduce all stored offsets by delta, used when the current position // moves by delta. // There must not be any offsets lower than delta. // If there is an offset equal to delta, it is removed. // delta=[1..maxLength] void shift(int32_t delta) {
int32_t i=start+delta; if(i>=capacity) {
i-=capacity;
} if(list[i]) {
list[i]=false;
--length;
}
start=i;
}
// Add an offset. The list must not contain it yet. // offset=[1..maxLength] void addOffset(int32_t offset) {
int32_t i=start+offset; if(i>=capacity) {
i-=capacity;
}
list[i]=true;
++length;
}
// Find the lowest stored offset from a non-empty list, remove it, // and reduce all other offsets by this minimum. // Returns [1..maxLength].
int32_t popMinimum() { // Look for the next offset in list[start+1..capacity-1].
int32_t i=start, result; while(++i<capacity) { if(list[i]) {
list[i]=false;
--length;
result=i-start;
start=i; return result;
}
} // i==capacity
// Wrap around and look for the next offset in list[0..start]. // Since the list is not empty, there will be one.
result=capacity-start;
i=0; while(!list[i]) {
++i;
}
list[i]=false;
--length;
start=i; return result+=i;
}
// Construct for all variants of span(), or only for any one variant. // Initialize as little as possible, for single use.
UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, const UVector &setStrings,
uint32_t which)
: spanSet(0, 0x10ffff), pSpanNotSet(nullptr), strings(setStrings),
utf8Lengths(nullptr), spanLengths(nullptr), utf8(nullptr),
utf8Length(0),
maxLength16(0), maxLength8(0),
all(static_cast<UBool>(which == ALL)) {
spanSet.retainAll(set); if(which&NOT_CONTAINED) { // Default to the same sets. // addToSpanNotSet() will create a separate set if necessary.
pSpanNotSet=&spanSet;
}
// Determine if the strings even need to be taken into account at all for span() etc. // If any string is relevant, then all strings need to be used for // span(longest match) but only the relevant ones for span(while contained). // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH // and do not store UTF-8 strings if !thisRelevant and CONTAINED. // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
int32_t stringsLength=strings.size();
// Freeze after checking for the need to use strings at all because freezing // a set takes some time and memory which are wasted if there are no relevant strings. if(all) {
spanSet.freeze();
}
// Compare 16-bit Unicode strings (which may be malformed UTF-16) // at code point boundaries. // That is, each edge of a match must not be in the middle of a surrogate pair. staticinline UBool
matches16CPB(const char16_t *s, int32_t start, int32_t limit, const char16_t *t, int32_t length) {
s+=start;
limit-=start; return matches16(s, t, length) &&
!(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
!(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
}
// Does the set contain the next code point? // If so, return its length; otherwise return its negative length. staticinline int32_t
spanOne(const UnicodeSet &set, const char16_t *s, int32_t length) {
char16_t c=*s, c2; if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) { return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
} return set.contains(c) ? 1 : -1;
}
/* * Note: In span() when spanLength==0 (after a string match, or at the beginning * after an empty code point span) and in spanNot() and spanNotUTF8(), * string matching could use a binary search * because all string matches are done from the same start index. * * For UTF-8, this would require a comparison function that returns UTF-16 order. * * This optimization should not be necessary for normal UnicodeSets because * most sets have no strings, and most sets with strings have * very few very short strings. * For cases with many strings, it might be better to use a different API * and implementation with a DFA (state machine).
*/
/* * Algorithm for span(USET_SPAN_CONTAINED) * * Theoretical algorithm: * - Iterate through the string, and at each code point boundary: * + If the code point there is in the set, then remember to continue after it. * + If a set string matches at the current position, then remember to continue after it. * + Either recursively span for each code point or string match, * or recursively span for all but the shortest one and * iteratively continue the span with the shortest local match. * + Remember the longest recursive span (the farthest end point). * + If there is no match at the current position, neither for the code point there * nor for any set string, then stop and return the longest recursive span length. * * Optimized implementation: * * (We assume that most sets will have very few very short strings. * A span using a string-less set is extremely fast.) * * Create and cache a spanSet which contains all of the single code points * of the original set but none of its strings. * * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). * - Loop: * + Try to match each set string at the end of the spanLength. * ~ Set strings that start with set-contained code points must be matched * with a partial overlap because the recursive algorithm would have tried * to match them at every position. * ~ Set strings that entirely consist of set-contained code points * are irrelevant for span(USET_SPAN_CONTAINED) because the * recursive algorithm would continue after them anyway * and find the longest recursive match from their end. * ~ Rather than recursing, note each end point of a set string match. * + If no set string matched after spanSet.span(), then return * with where the spanSet.span() ended. * + If at least one set string matched after spanSet.span(), then * pop the shortest string match end point and continue * the loop, trying to match all set strings from there. * + If at least one more set string matched after a previous string match, * then test if the code point after the previous string match is also * contained in the set. * Continue the loop with the shortest end point of either this code point * or a matching set string. * + If no more set string matched after a previous string match, * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). * Stop if spanLength==0, otherwise continue the loop. * * By noting each end point of a set string match, * the function visits each string position at most once and finishes * in linear time. * * The recursive algorithm may visit the same string position many times * if multiple paths lead to it and finishes in exponential time.
*/
/* * Algorithm for span(USET_SPAN_SIMPLE) * * Theoretical algorithm: * - Iterate through the string, and at each code point boundary: * + If the code point there is in the set, then remember to continue after it. * + If a set string matches at the current position, then remember to continue after it. * + Continue from the farthest match position and ignore all others. * + If there is no match at the current position, * then stop and return the current position. * * Optimized implementation: * * (Same assumption and spanSet as above.) * * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). * - Loop: * + Try to match each set string at the end of the spanLength. * ~ Set strings that start with set-contained code points must be matched * with a partial overlap because the standard algorithm would have tried * to match them earlier. * ~ Set strings that entirely consist of set-contained code points * must be matched with a full overlap because the longest-match algorithm * would hide set string matches that end earlier. * Such set strings need not be matched earlier inside the code point span * because the standard algorithm would then have continued after * the set string match anyway. * ~ Remember the longest set string match (farthest end point) from the earliest * starting point. * + If no set string matched after spanSet.span(), then return * with where the spanSet.span() ended. * + If at least one set string matched, then continue the loop after the * longest match from the earliest position. * + If no more set string matched after a previous string match, * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). * Stop if spanLength==0, otherwise continue the loop.
*/
// Consider strings; they may overlap with the span.
OffsetList offsets; if(spanCondition==USET_SPAN_CONTAINED) { // Use offset list to try all possibilities.
offsets.setMaxLength(maxLength16);
}
int32_t pos=spanLength, rest=length-pos;
int32_t i, stringsLength=strings.size(); for(;;) { if(spanCondition==USET_SPAN_CONTAINED) { for(i=0; i<stringsLength; ++i) {
int32_t overlap=spanLengths[i]; if(overlap==ALL_CP_CONTAINED) { continue; // Irrelevant string. (Also the empty string.)
} const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i)); const char16_t *s16=string.getBuffer();
int32_t length16=string.length();
U_ASSERT(length>0);
// Try to match this string at pos-overlap..pos. if(overlap>=LONG_SPAN) {
overlap=length16; // While contained: No point matching fully inside the code point span.
U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point.
} if(overlap>spanLength) {
overlap=spanLength;
}
int32_t inc=length16-overlap; // Keep overlap+inc==length16. for(;;) { if(inc>rest) { break;
} // Try to match if the increment is not listed already. if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) { if(inc==rest) { return length; // Reached the end of the string.
}
offsets.addOffset(inc);
} if(overlap==0) { break;
}
--overlap;
++inc;
}
}
} else/* USET_SPAN_SIMPLE */ {
int32_t maxInc=0, maxOverlap=0; for(i=0; i<stringsLength; ++i) {
int32_t overlap=spanLengths[i]; // For longest match, we do need to try to match even an all-contained string // to find the match from the earliest start.
// Try to match this string at pos-overlap..pos. if(overlap>=LONG_SPAN) {
overlap=length16; // Longest match: Need to match fully inside the code point span // to find the match from the earliest start.
} if(overlap>spanLength) {
overlap=spanLength;
}
int32_t inc=length16-overlap; // Keep overlap+inc==length16. for(;;) { if(inc>rest || overlap<maxOverlap) { break;
} // Try to match if the string is longer or starts earlier. if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
matches16CPB(s, pos-overlap, length, s16, length16)
) {
maxInc=inc; // Longest match from earliest start.
maxOverlap=overlap; break;
}
--overlap;
++inc;
}
}
if(maxInc!=0 || maxOverlap!=0) { // Longest-match algorithm, and there was a string match. // Simply continue after it.
pos+=maxInc;
rest-=maxInc; if(rest==0) { return length; // Reached the end of the string.
}
spanLength=0; // Match strings from after a string match. continue;
}
} // Finished trying to match all strings at pos.
if(spanLength!=0 || pos==0) { // The position is after an unlimited code point span (spanLength!=0), // not after a string match. // The only position where spanLength==0 after a span is pos==0. // Otherwise, an unlimited code point span is only tried again when no // strings match, and if such a non-initial span fails we stop. if(offsets.isEmpty()) { return pos; // No strings matched after a span.
} // Match strings from after the next string match.
} else { // The position is after a string match (or a single code point). if(offsets.isEmpty()) { // No more strings matched after a previous string match. // Try another code point span from after the last string match.
spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED); if( spanLength==rest || // Reached the end of the string, or
spanLength==0 // neither strings nor span progressed.
) { return pos+spanLength;
}
pos+=spanLength;
rest-=spanLength; continue; // spanLength>0: Match strings from after a span.
} else { // Try to match only one code point from after a string match if some // string matched beyond it, so that we try all possible positions // and don't overshoot.
spanLength=spanOne(spanSet, s+pos, rest); if(spanLength>0) { if(spanLength==rest) { return length; // Reached the end of the string.
} // Match strings after this code point. // There cannot be any increments below it because UnicodeSet strings // contain multiple code points.
pos+=spanLength;
rest-=spanLength;
offsets.shift(spanLength);
spanLength=0; continue; // Match strings from after a single code point.
} // Match strings from after the next string match.
}
}
int32_t minOffset=offsets.popMinimum();
pos+=minOffset;
rest-=minOffset;
spanLength=0; // Match strings from after a string match.
}
}
// Consider strings; they may overlap with the span.
OffsetList offsets; if(spanCondition==USET_SPAN_CONTAINED) { // Use offset list to try all possibilities.
offsets.setMaxLength(maxLength16);
}
int32_t i, stringsLength=strings.size();
uint8_t *spanBackLengths=spanLengths; if(all) {
spanBackLengths+=stringsLength;
} for(;;) { if(spanCondition==USET_SPAN_CONTAINED) { for(i=0; i<stringsLength; ++i) {
int32_t overlap=spanBackLengths[i]; if(overlap==ALL_CP_CONTAINED) { continue; // Irrelevant string. (Also the empty string.)
} const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i)); const char16_t *s16=string.getBuffer();
int32_t length16=string.length();
U_ASSERT(length>0);
// Try to match this string at pos-(length16-overlap)..pos-length16. if(overlap>=LONG_SPAN) {
overlap=length16; // While contained: No point matching fully inside the code point span.
int32_t len1=0;
U16_FWD_1(s16, len1, overlap);
overlap-=len1; // Length of the string minus the first code point.
} if(overlap>spanLength) {
overlap=spanLength;
}
int32_t dec=length16-overlap; // Keep dec+overlap==length16. for(;;) { if(dec>pos) { break;
} // Try to match if the decrement is not listed already. if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) { if(dec==pos) { return 0; // Reached the start of the string.
}
offsets.addOffset(dec);
} if(overlap==0) { break;
}
--overlap;
++dec;
}
}
} else/* USET_SPAN_SIMPLE */ {
int32_t maxDec=0, maxOverlap=0; for(i=0; i<stringsLength; ++i) {
int32_t overlap=spanBackLengths[i]; // For longest match, we do need to try to match even an all-contained string // to find the match from the latest end.
// Try to match this string at pos-(length16-overlap)..pos-length16. if(overlap>=LONG_SPAN) {
overlap=length16; // Longest match: Need to match fully inside the code point span // to find the match from the latest end.
} if(overlap>spanLength) {
overlap=spanLength;
}
int32_t dec=length16-overlap; // Keep dec+overlap==length16. for(;;) { if(dec>pos || overlap<maxOverlap) { break;
} // Try to match if the string is longer or ends later. if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
matches16CPB(s, pos-dec, length, s16, length16)
) {
maxDec=dec; // Longest match from latest end.
maxOverlap=overlap; break;
}
--overlap;
++dec;
}
}
if(maxDec!=0 || maxOverlap!=0) { // Longest-match algorithm, and there was a string match. // Simply continue before it.
pos-=maxDec; if(pos==0) { return 0; // Reached the start of the string.
}
spanLength=0; // Match strings from before a string match. continue;
}
} // Finished trying to match all strings at pos.
if(spanLength!=0 || pos==length) { // The position is before an unlimited code point span (spanLength!=0), // not before a string match. // The only position where spanLength==0 before a span is pos==length. // Otherwise, an unlimited code point span is only tried again when no // strings match, and if such a non-initial span fails we stop. if(offsets.isEmpty()) { return pos; // No strings matched before a span.
} // Match strings from before the next string match.
} else { // The position is before a string match (or a single code point). if(offsets.isEmpty()) { // No more strings matched before a previous string match. // Try another code point span from before the last string match.
int32_t oldPos=pos;
pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
spanLength=oldPos-pos; if( pos==0 || // Reached the start of the string, or
spanLength==0 // neither strings nor span progressed.
) { return pos;
} continue; // spanLength>0: Match strings from before a span.
} else { // Try to match only one code point from before a string match if some // string matched beyond it, so that we try all possible positions // and don't overshoot.
spanLength=spanOneBack(spanSet, s, pos); if(spanLength>0) { if(spanLength==pos) { return 0; // Reached the start of the string.
} // Match strings before this code point. // There cannot be any decrements below it because UnicodeSet strings // contain multiple code points.
pos-=spanLength;
offsets.shift(spanLength);
spanLength=0; continue; // Match strings from before a single code point.
} // Match strings from before the next string match.
}
}
pos-=offsets.popMinimum();
spanLength=0; // Match strings from before a string match.
}
}
// Consider strings; they may overlap with the span.
OffsetList offsets; if(spanCondition==USET_SPAN_CONTAINED) { // Use offset list to try all possibilities.
offsets.setMaxLength(maxLength8);
}
int32_t pos=spanLength, rest=length-pos;
int32_t i, stringsLength=strings.size();
uint8_t *spanUTF8Lengths=spanLengths; if(all) {
spanUTF8Lengths+=2*stringsLength;
} for(;;) { const uint8_t *s8=utf8;
int32_t length8; if(spanCondition==USET_SPAN_CONTAINED) { for(i=0; i<stringsLength; ++i) {
length8=utf8Lengths[i]; if(length8==0) { continue; // String not representable in UTF-8.
}
int32_t overlap=spanUTF8Lengths[i]; if(overlap==ALL_CP_CONTAINED) {
s8+=length8; continue; // Irrelevant string.
}
// Try to match this string at pos-overlap..pos. if(overlap>=LONG_SPAN) {
overlap=length8; // While contained: No point matching fully inside the code point span.
U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point.
} if(overlap>spanLength) {
overlap=spanLength;
}
int32_t inc=length8-overlap; // Keep overlap+inc==length8. for(;;) { if(inc>rest) { break;
} // Try to match if the increment is not listed already. // Match at code point boundaries. (The UTF-8 strings were converted // from UTF-16 and are guaranteed to be well-formed.) if(!U8_IS_TRAIL(s[pos-overlap]) &&
!offsets.containsOffset(inc) &&
matches8(s+pos-overlap, s8, length8)) { if(inc==rest) { return length; // Reached the end of the string.
}
offsets.addOffset(inc);
} if(overlap==0) { break;
}
--overlap;
++inc;
}
s8+=length8;
}
} else/* USET_SPAN_SIMPLE */ {
int32_t maxInc=0, maxOverlap=0; for(i=0; i<stringsLength; ++i) {
length8=utf8Lengths[i]; if(length8==0) { continue; // String not representable in UTF-8.
}
int32_t overlap=spanUTF8Lengths[i]; // For longest match, we do need to try to match even an all-contained string // to find the match from the earliest start.
// Try to match this string at pos-overlap..pos. if(overlap>=LONG_SPAN) {
overlap=length8; // Longest match: Need to match fully inside the code point span // to find the match from the earliest start.
} if(overlap>spanLength) {
overlap=spanLength;
}
int32_t inc=length8-overlap; // Keep overlap+inc==length8. for(;;) { if(inc>rest || overlap<maxOverlap) { break;
} // Try to match if the string is longer or starts earlier. // Match at code point boundaries. (The UTF-8 strings were converted // from UTF-16 and are guaranteed to be well-formed.) if(!U8_IS_TRAIL(s[pos-overlap]) &&
(overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
matches8(s+pos-overlap, s8, length8)) {
maxInc=inc; // Longest match from earliest start.
maxOverlap=overlap; break;
}
--overlap;
++inc;
}
s8+=length8;
}
if(maxInc!=0 || maxOverlap!=0) { // Longest-match algorithm, and there was a string match. // Simply continue after it.
pos+=maxInc;
rest-=maxInc; if(rest==0) { return length; // Reached the end of the string.
}
spanLength=0; // Match strings from after a string match. continue;
}
} // Finished trying to match all strings at pos.
if(spanLength!=0 || pos==0) { // The position is after an unlimited code point span (spanLength!=0), // not after a string match. // The only position where spanLength==0 after a span is pos==0. // Otherwise, an unlimited code point span is only tried again when no // strings match, and if such a non-initial span fails we stop. if(offsets.isEmpty()) { return pos; // No strings matched after a span.
} // Match strings from after the next string match.
} else { // The position is after a string match (or a single code point). if(offsets.isEmpty()) { // No more strings matched after a previous string match. // Try another code point span from after the last string match.
spanLength = spanSet.spanUTF8(reinterpret_cast<constchar*>(s) + pos, rest, USET_SPAN_CONTAINED); if( spanLength==rest || // Reached the end of the string, or
spanLength==0 // neither strings nor span progressed.
) { return pos+spanLength;
}
pos+=spanLength;
rest-=spanLength; continue; // spanLength>0: Match strings from after a span.
} else { // Try to match only one code point from after a string match if some // string matched beyond it, so that we try all possible positions // and don't overshoot.
spanLength=spanOneUTF8(spanSet, s+pos, rest); if(spanLength>0) { if(spanLength==rest) { return length; // Reached the end of the string.
} // Match strings after this code point. // There cannot be any increments below it because UnicodeSet strings // contain multiple code points.
pos+=spanLength;
rest-=spanLength;
offsets.shift(spanLength);
spanLength=0; continue; // Match strings from after a single code point.
} // Match strings from after the next string match.
}
}
int32_t minOffset=offsets.popMinimum();
pos+=minOffset;
rest-=minOffset;
spanLength=0; // Match strings from after a string match.
}
}
// Consider strings; they may overlap with the span.
OffsetList offsets; if(spanCondition==USET_SPAN_CONTAINED) { // Use offset list to try all possibilities.
offsets.setMaxLength(maxLength8);
}
int32_t i, stringsLength=strings.size();
uint8_t *spanBackUTF8Lengths=spanLengths; if(all) {
spanBackUTF8Lengths+=3*stringsLength;
} for(;;) { const uint8_t *s8=utf8;
int32_t length8; if(spanCondition==USET_SPAN_CONTAINED) { for(i=0; i<stringsLength; ++i) {
length8=utf8Lengths[i]; if(length8==0) { continue; // String not representable in UTF-8.
}
int32_t overlap=spanBackUTF8Lengths[i]; if(overlap==ALL_CP_CONTAINED) {
s8+=length8; continue; // Irrelevant string.
}
// Try to match this string at pos-(length8-overlap)..pos-length8. if(overlap>=LONG_SPAN) {
overlap=length8; // While contained: No point matching fully inside the code point span.
int32_t len1=0;
U8_FWD_1(s8, len1, overlap);
overlap-=len1; // Length of the string minus the first code point.
} if(overlap>spanLength) {
overlap=spanLength;
}
int32_t dec=length8-overlap; // Keep dec+overlap==length8. for(;;) { if(dec>pos) { break;
} // Try to match if the decrement is not listed already. // Match at code point boundaries. (The UTF-8 strings were converted // from UTF-16 and are guaranteed to be well-formed.) if( !U8_IS_TRAIL(s[pos-dec]) &&
!offsets.containsOffset(dec) &&
matches8(s+pos-dec, s8, length8)
) { if(dec==pos) { return 0; // Reached the start of the string.
}
offsets.addOffset(dec);
} if(overlap==0) { break;
}
--overlap;
++dec;
}
s8+=length8;
}
} else/* USET_SPAN_SIMPLE */ {
int32_t maxDec=0, maxOverlap=0; for(i=0; i<stringsLength; ++i) {
length8=utf8Lengths[i]; if(length8==0) { continue; // String not representable in UTF-8.
}
int32_t overlap=spanBackUTF8Lengths[i]; // For longest match, we do need to try to match even an all-contained string // to find the match from the latest end.
// Try to match this string at pos-(length8-overlap)..pos-length8. if(overlap>=LONG_SPAN) {
overlap=length8; // Longest match: Need to match fully inside the code point span // to find the match from the latest end.
} if(overlap>spanLength) {
overlap=spanLength;
}
int32_t dec=length8-overlap; // Keep dec+overlap==length8. for(;;) { if(dec>pos || overlap<maxOverlap) { break;
} // Try to match if the string is longer or ends later. // Match at code point boundaries. (The UTF-8 strings were converted // from UTF-16 and are guaranteed to be well-formed.) if( !U8_IS_TRAIL(s[pos-dec]) &&
(overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
matches8(s+pos-dec, s8, length8)
) {
maxDec=dec; // Longest match from latest end.
maxOverlap=overlap; break;
}
--overlap;
++dec;
}
s8+=length8;
}
if(maxDec!=0 || maxOverlap!=0) { // Longest-match algorithm, and there was a string match. // Simply continue before it.
pos-=maxDec; if(pos==0) { return 0; // Reached the start of the string.
}
spanLength=0; // Match strings from before a string match. continue;
}
} // Finished trying to match all strings at pos.
if(spanLength!=0 || pos==length) { // The position is before an unlimited code point span (spanLength!=0), // not before a string match. // The only position where spanLength==0 before a span is pos==length. // Otherwise, an unlimited code point span is only tried again when no // strings match, and if such a non-initial span fails we stop. if(offsets.isEmpty()) { return pos; // No strings matched before a span.
} // Match strings from before the next string match.
} else { // The position is before a string match (or a single code point). if(offsets.isEmpty()) { // No more strings matched before a previous string match. // Try another code point span from before the last string match.
int32_t oldPos=pos;
pos = spanSet.spanBackUTF8(reinterpret_cast<constchar*>(s), oldPos, USET_SPAN_CONTAINED);
spanLength=oldPos-pos; if( pos==0 || // Reached the start of the string, or
spanLength==0 // neither strings nor span progressed.
) { return pos;
} continue; // spanLength>0: Match strings from before a span.
} else { // Try to match only one code point from before a string match if some // string matched beyond it, so that we try all possible positions // and don't overshoot.
spanLength=spanOneBackUTF8(spanSet, s, pos); if(spanLength>0) { if(spanLength==pos) { return 0; // Reached the start of the string.
} // Match strings before this code point. // There cannot be any decrements below it because UnicodeSet strings // contain multiple code points.
pos-=spanLength;
offsets.shift(spanLength);
spanLength=0; continue; // Match strings from before a single code point.
} // Match strings from before the next string match.
}
}
pos-=offsets.popMinimum();
spanLength=0; // Match strings from before a string match.
}
}
/* * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED) * * Theoretical algorithm: * - Iterate through the string, and at each code point boundary: * + If the code point there is in the set, then return with the current position. * + If a set string matches at the current position, then return with the current position. * * Optimized implementation: * * (Same assumption as for span() above.) * * Create and cache a spanNotSet which contains all of the single code points * of the original set but none of its strings. * For each set string add its initial code point to the spanNotSet. * (Also add its final code point for spanNotBack().) * * - Loop: * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED). * + If the current code point is in the original set, then * return the current position. * + If any set string matches at the current position, then * return the current position. * + If there is no match at the current position, neither for the code point there * nor for any set string, then skip this code point and continue the loop. * This happens for set-string-initial code points that were added to spanNotSet * when there is not actually a match for such a set string.
*/
int32_t UnicodeSetStringSpan::spanNot(const char16_t *s, int32_t length) const {
int32_t pos=0, rest=length;
int32_t i, stringsLength=strings.size(); do { // Span until we find a code point from the set, // or a code point that starts or ends some string.
i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED); if(i==rest) { return length; // Reached the end of the string.
}
pos+=i;
rest-=i;
// Check whether the current code point is in the original set, // without the string starts and ends.
int32_t cpLength=spanOne(spanSet, s+pos, rest); if(cpLength>0) { return pos; // There is a set element at pos.
}
// Try to match the strings at pos. for(i=0; i<stringsLength; ++i) { if(spanLengths[i]==ALL_CP_CONTAINED) { continue; // Irrelevant string. (Also the empty string.)
} const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i)); const char16_t *s16=string.getBuffer();
int32_t length16=string.length();
U_ASSERT(length>0); if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) { return pos; // There is a set element at pos.
}
}
// The span(while not contained) ended on a string start/end which is // not in the original set. Skip this code point and continue. // cpLength<0
pos-=cpLength;
rest+=cpLength;
} while(rest!=0); return length; // Reached the end of the string.
}
int32_t UnicodeSetStringSpan::spanNotBack(const char16_t *s, int32_t length) const {
int32_t pos=length;
int32_t i, stringsLength=strings.size(); do { // Span until we find a code point from the set, // or a code point that starts or ends some string.
pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED); if(pos==0) { return 0; // Reached the start of the string.
}
// Check whether the current code point is in the original set, // without the string starts and ends.
int32_t cpLength=spanOneBack(spanSet, s, pos); if(cpLength>0) { return pos; // There is a set element at pos.
}
// Try to match the strings at pos. for(i=0; i<stringsLength; ++i) { // Use spanLengths rather than a spanBackLengths pointer because // it is easier and we only need to know whether the string is irrelevant // which is the same in either array. if(spanLengths[i]==ALL_CP_CONTAINED) { continue; // Irrelevant string. (Also the empty string.)
} const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i)); const char16_t *s16=string.getBuffer();
int32_t length16=string.length();
U_ASSERT(length>0); if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) { return pos; // There is a set element at pos.
}
}
// The span(while not contained) ended on a string start/end which is // not in the original set. Skip this code point and continue. // cpLength<0
pos+=cpLength;
} while(pos!=0); return 0; // Reached the start of the string.
}
int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
int32_t pos=0, rest=length;
int32_t i, stringsLength=strings.size();
uint8_t *spanUTF8Lengths=spanLengths; if(all) {
spanUTF8Lengths+=2*stringsLength;
} do { // Span until we find a code point from the set, // or a code point that starts or ends some string.
i = pSpanNotSet->spanUTF8(reinterpret_cast<constchar*>(s) + pos, rest, USET_SPAN_NOT_CONTAINED); if(i==rest) { return length; // Reached the end of the string.
}
pos+=i;
rest-=i;
// Check whether the current code point is in the original set, // without the string starts and ends.
int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest); if(cpLength>0) { return pos; // There is a set element at pos.
}
// Try to match the strings at pos. const uint8_t *s8=utf8;
int32_t length8; for(i=0; i<stringsLength; ++i) {
length8=utf8Lengths[i]; // ALL_CP_CONTAINED: Irrelevant string. if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) { return pos; // There is a set element at pos.
}
s8+=length8;
}
// The span(while not contained) ended on a string start/end which is // not in the original set. Skip this code point and continue. // cpLength<0
pos-=cpLength;
rest+=cpLength;
} while(rest!=0); return length; // Reached the end of the string.
}
int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
int32_t pos=length;
int32_t i, stringsLength=strings.size();
uint8_t *spanBackUTF8Lengths=spanLengths; if(all) {
spanBackUTF8Lengths+=3*stringsLength;
} do { // Span until we find a code point from the set, // or a code point that starts or ends some string.
pos = pSpanNotSet->spanBackUTF8(reinterpret_cast<constchar*>(s), pos, USET_SPAN_NOT_CONTAINED); if(pos==0) { return 0; // Reached the start of the string.
}
// Check whether the current code point is in the original set, // without the string starts and ends.
int32_t cpLength=spanOneBackUTF8(spanSet, s, pos); if(cpLength>0) { return pos; // There is a set element at pos.
}
// Try to match the strings at pos. const uint8_t *s8=utf8;
int32_t length8; for(i=0; i<stringsLength; ++i) {
length8=utf8Lengths[i]; // ALL_CP_CONTAINED: Irrelevant string. if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) { return pos; // There is a set element at pos.
}
s8+=length8;
}
// The span(while not contained) ended on a string start/end which is // not in the original set. Skip this code point and continue. // cpLength<0
pos+=cpLength;
} while(pos!=0); return 0; // Reached the start of the string.
}
U_NAMESPACE_END
Messung V0.5
¤ Dauer der Verarbeitung: 0.20 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.