/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* * This file is part of the LibreOffice project. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * This file incorporates work covered by the following license notice: * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed * with this work for additional information regarding copyright * ownership. The ASF licenses this file to you under the Apache * License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.apache.org/licenses/LICENSE-2.0 .
*/
namespace
{
// Reduce a flag set to the bits that matter for complex transliteration.
//
// IGNORE_KANA and FULLWIDTH_HALFWIDTH are simple transliterations, but they
// need to take effect in complex transliteration as well, so they are kept
// in addition to the COMPLEX_TRANS_MASK bits.
//
// @param n  transliteration flags to filter
// @return   n restricted to COMPLEX_TRANS_MASK, IGNORE_KANA and
//           FULLWIDTH_HALFWIDTH
TransliterationFlags maskComplexTrans( TransliterationFlags n )
{
    return n & (COMPLEX_TRANS_MASK                            // all set ignore bits
                | TransliterationFlags::IGNORE_KANA           // plus IGNORE_KANA bit
                | TransliterationFlags::FULLWIDTH_HALFWIDTH); // and the FULLWIDTH_HALFWIDTH value
}
// Tell whether n has at least one of the COMPLEX_TRANS_MASK bits set.
//
// @param n  transliteration flags to inspect
// @return   true if (n & COMPLEX_TRANS_MASK) is non-empty
bool isComplexTrans( TransliterationFlags n )
{
    return bool(n & COMPLEX_TRANS_MASK);
}
// Strip every COMPLEX_TRANS_MASK bit from n, leaving only the flags that are
// not part of the complex transliteration mask.
//
// @param n  transliteration flags to filter
// @return   n with all COMPLEX_TRANS_MASK bits cleared
TransliterationFlags maskSimpleTrans( TransliterationFlags n )
{
    const auto nEverythingButComplex = ~COMPLEX_TRANS_MASK;
    return n & nEverythingButComplex;
}
// Tell whether n contains any flag outside of COMPLEX_TRANS_MASK, i.e.
// whether maskSimpleTrans(n) leaves anything over.
//
// @param n  transliteration flags to inspect
// @return   true if maskSimpleTrans(n) is non-empty
bool isSimpleTrans( TransliterationFlags n )
{
    return bool(maskSimpleTrans(n));
}
// Simple-transliteration mask for regular expression searches.
//
// Regex patterns are case sensitive, so everything that would change or
// ignore case is removed from the flags: the IGNORE_CASE bit is cleared, and
// a non-ignore value of UPPERCASE_LOWERCASE or LOWERCASE_UPPERCASE is
// replaced by NONE. Bits in COMPLEX_TRANS_MASK are cleared as well.
//
// @param n  transliteration flags to filter
// @return   the filtered flag set as described above
TransliterationFlags maskSimpleRegexTrans( TransliterationFlags n )
{
    // Ignore-type bits, without case folding.
    const TransliterationFlags nIgnoreBits
        = (n & TransliterationFlags::IGNORE_MASK) & ~TransliterationFlags::IGNORE_CASE;

    // The non-ignore part is a value, not a bit set; drop pure case mappings.
    const TransliterationFlags nRequested = n & TransliterationFlags::NON_IGNORE_MASK;
    const bool bIsCaseMapping
        = nRequested == TransliterationFlags::UPPERCASE_LOWERCASE
          || nRequested == TransliterationFlags::LOWERCASE_UPPERCASE;
    const TransliterationFlags nValue
        = bIsCaseMapping ? TransliterationFlags::NONE : nRequested;

    return (nIgnoreBits | nValue) & ~COMPLEX_TRANS_MASK;
}
// Tell whether any flag is left once n has been filtered with
// maskSimpleRegexTrans().
//
// @param n  transliteration flags to inspect
// @return   true if maskSimpleRegexTrans(n) is non-empty
bool isSimpleRegexTrans( TransliterationFlags n )
{
    return bool(maskSimpleRegexTrans(n));
}
if ( bReplaceApostrophe )
sSrchStr = replacePunctuation(sSrchStr);
// Take the new SearchOptions2::AlgorithmType2 field and ignore // SearchOptions::algorithmType switch( aSrchPara.AlgorithmType2)
{ case SearchAlgorithms2::REGEXP:
fnForward = &TextSearch::RESrchFrwrd;
fnBackward = &TextSearch::RESrchBkwrd;
RESrchPrepare( aSrchPara); break;
case SearchAlgorithms2::APPROXIMATE:
fnForward = &TextSearch::ApproxSrchFrwrd;
fnBackward = &TextSearch::ApproxSrchBkwrd;
// Legacy entry point taking the old SearchOptions struct.
//
// Maps rOptions.algorithmType to the matching SearchAlgorithms2 constant,
// copies all SearchOptions members into a SearchOptions2 instance and hands
// that to setOptions2(). An algorithm type other than REGEXP, APPROXIMATE or
// ABSOLUTE is logged with SAL_WARN and treated as ABSOLUTE.
//
// @param rOptions  the search options to apply
void TextSearch::setOptions( const SearchOptions& rOptions )
{
    // ABSOLUTE doubles as the fallback for unexpected enum values.
    sal_Int16 nAlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
    switch (rOptions.algorithmType)
    {
        case SearchAlgorithms_ABSOLUTE:
            break;
        case SearchAlgorithms_REGEXP:
            nAlgorithmType2 = SearchAlgorithms2::REGEXP;
            break;
        case SearchAlgorithms_APPROXIMATE:
            nAlgorithmType2 = SearchAlgorithms2::APPROXIMATE;
            break;
        default:
            SAL_WARN("i18npool","TextSearch::setOptions - default what?");
            break;
    }

    // The derived struct offers no constructor taking an instance of its
    // base struct, so every member has to be passed on individually.
    SearchOptions2 aOptions2(
            rOptions.algorithmType,
            rOptions.searchFlag,
            rOptions.searchString,
            rOptions.replaceString,
            rOptions.Locale,
            rOptions.changedChars,
            rOptions.deletedChars,
            rOptions.insertedChars,
            rOptions.transliterateFlags,
            nAlgorithmType2,
            0   // no wildcard search, no escape character...
            );

    setOptions2( aOptions2);
}
// in non-regex mode, allow searching typographical apostrophe with the ASCII one // to avoid regression after using automatic conversion to U+2019 during typing in Writer bool bReplaceApostrophe = bSearchApostrophe && isReplacePunctuation(in_str);
bUsePrimarySrchStr = true;
if ( xTranslit.is() )
{ // apply normal transliteration (1<->1, 1<->0)
sal_Int32 nInStartPos = startPos; if (pRegexMatcher && startPos > 0)
{ // tdf#89665, tdf#75806: An optimization to avoid transliterating the whole string, yet // transliterate enough of the leading text to allow sensible look-behind assertions. // 100 is chosen arbitrarily in the hope that look-behind assertions would largely fit. // See http://userguide.icu-project.org/strings/regexp for look-behind assertion syntax. // When search regex doesn't start with an assertion, 3 is to allow startPos to be in // the middle of a surrogate pair, preceded by another surrogate pair. const sal_Int32 nMaxLeadingLen = aSrchPara.searchString.startsWith("(?") ? 100 : 3;
nInStartPos -= std::min(nMaxLeadingLen, startPos);
}
sal_Int32 nInEndPos = endPos; if (pRegexMatcher && endPos < searchStr.getLength())
{ // tdf#65038: ditto for look-ahead assertions const sal_Int32 nMaxTrailingLen = aSrchPara.searchString.endsWith(")") ? 100 : 3;
nInEndPos += std::min(nMaxTrailingLen, searchStr.getLength() - endPos);
}
if ( bReplaceApostrophe )
in_str = replacePunctuation(in_str);
// JP 20.6.2001: also the start and end positions must be corrected!
sal_Int32 newStartPos =
(startPos == 0) ? 0 : FindPosInSeq_Impl( offset, startPos );
// Map offsets back to untransliterated string. const sal_Int32 nOffsets = offset.getLength(); if (nOffsets)
{ auto sres_startOffsetRange = asNonConstRange(sres.startOffset); auto sres_endOffsetRange = asNonConstRange(sres.endOffset); // For regex nGroups is the number of groups+1 with group 0 being // the entire match. const sal_Int32 nGroups = sres.startOffset.getLength(); for ( sal_Int32 k = 0; k < nGroups; k++ )
{ const sal_Int32 nStart = sres.startOffset[k]; // Result offsets are negative (-1) if a group expression was // not matched. if (nStart >= 0)
sres_startOffsetRange[k] = (nStart < nOffsets ? offset[nStart] : (offset[nOffsets - 1] + 1)); // JP 20.6.2001: end is ever exclusive and then don't return // the position of the next character - return the // next position behind the last found character! // "a b c" find "b" must return 2,3 and not 2,4!!! const sal_Int32 nStop = sres.endOffset[k]; if (nStop >= 0)
{ if (nStop > 0)
sres_endOffsetRange[k] = offset[(nStop <= nOffsets ? nStop : nOffsets) - 1] + 1; else
sres_endOffsetRange[k] = offset[0];
}
}
}
} else
{ if ( bReplaceApostrophe )
in_str = in_str.replace(u'\u2019', '\'');
bUsePrimarySrchStr = false;
sres2 = (this->*fnForward)( g, in_str, startPos, endPos ); auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset); auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);
for ( int k = 0; k < sres2.startOffset.getLength(); k++ )
{ if (sres2.startOffset[k])
sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1] + 1; if (sres2.endOffset[k])
sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1] + 1;
}
// pick first and long one if ( sres.subRegExpressions == 0) return sres2; if ( sres2.subRegExpressions == 1)
{ if ( sres.startOffset[0] > sres2.startOffset[0]) return sres2; elseif ( sres.startOffset[0] == sres2.startOffset[0] &&
sres.endOffset[0] < sres2.endOffset[0]) return sres2;
}
}
// in non-regex mode, allow searching typographical apostrophe with the ASCII one // to avoid regression after using automatic conversion to U+2019 during typing in Writer bool bReplaceApostrophe = bSearchApostrophe && isReplacePunctuation(in_str);
bUsePrimarySrchStr = true;
if ( xTranslit.is() )
{ // apply only simple 1<->1 transliteration here
css::uno::Sequence<sal_Int32> offset(startPos - endPos);
in_str = xTranslit->transliterate( searchStr, endPos, startPos - endPos, offset );
if ( bReplaceApostrophe )
in_str = replacePunctuation(in_str);
// JP 20.6.2001: also the start and end positions must be corrected!
sal_Int32 const newStartPos = (startPos < searchStr.getLength())
? FindPosInSeq_Impl( offset, startPos )
: in_str.getLength();
// TODO: this would need nExtraOffset handling to avoid $ matching // if (pRegexMatcher && startPos < searchStr.getLength()) // but that appears to be impossible with ICU regex
// Map offsets back to untransliterated string. const sal_Int32 nOffsets = offset.getLength(); if (nOffsets)
{ auto sres_startOffsetRange = asNonConstRange(sres.startOffset); auto sres_endOffsetRange = asNonConstRange(sres.endOffset); // For regex nGroups is the number of groups+1 with group 0 being // the entire match. const sal_Int32 nGroups = sres.startOffset.getLength(); for ( sal_Int32 k = 0; k < nGroups; k++ )
{ const sal_Int32 nStart = sres.startOffset[k]; // Result offsets are negative (-1) if a group expression was // not matched. if (nStart >= 0)
{ if (nStart > 0)
sres_startOffsetRange[k] = offset[(nStart <= nOffsets ? nStart : nOffsets) - 1] + 1; else
sres_startOffsetRange[k] = offset[0];
} // JP 20.6.2001: end is ever exclusive and then don't return // the position of the next character - return the // next position behind the last found character! // "a b c" find "b" must return 2,3 and not 2,4!!! const sal_Int32 nStop = sres.endOffset[k]; if (nStop >= 0)
sres_endOffsetRange[k] = (nStop < nOffsets ? offset[nStop] : (offset[nOffsets - 1] + 1));
}
}
} else
{ if ( bReplaceApostrophe )
in_str = replacePunctuation(in_str);
bUsePrimarySrchStr = false;
sres2 = (this->*fnBackward)( g, in_str, startPos, endPos ); auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset); auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);
for( int k = 0; k < sres2.startOffset.getLength(); k++ )
{ if (sres2.startOffset[k])
sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1]+1; if (sres2.endOffset[k])
sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1]+1;
}
// pick last and long one if ( sres.subRegExpressions == 0 ) return sres2; if ( sres2.subRegExpressions == 1 )
{ if ( sres.startOffset[0] < sres2.startOffset[0] ) return sres2; if ( sres.startOffset[0] == sres2.startOffset[0] &&
sres.endOffset[0] > sres2.endOffset[0] ) return sres2;
}
}
// --------- helper methods for Boyer-Moore like text searching ---------- // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
void TextSearch::MakeForwardTab()
{ // create the jumptable for the search text
if( pJumpTable && bIsForwardTab )
{ return; // the jumpTable is ok
}
bIsForwardTab = true;
sal_Int32 n, nLen = sSrchStr.getLength();
pJumpTable.reset( new TextSearchJumpTable );
for( n = 0; n < nLen - 1; ++n )
{
sal_Unicode cCh = sSrchStr[n];
sal_Int32 nDiff = nLen - n - 1;
TextSearchJumpTable::value_type aEntry( cCh, nDiff );
void TextSearch::MakeForwardTab2()
{ // create the jumptable for the search text if( pJumpTable2 && bIsForwardTab )
{ return; // the jumpTable is ok
}
bIsForwardTab = true;
sal_Int32 n, nLen = sSrchStr2.getLength();
pJumpTable2.reset( new TextSearchJumpTable );
for( n = 0; n < nLen - 1; ++n )
{
sal_Unicode cCh = sSrchStr2[n];
sal_Int32 nDiff = nLen - n - 1;
void TextSearch::MakeBackwardTab()
{ // create the jumptable for the search text if( pJumpTable && !bIsForwardTab)
{ return; // the jumpTable is ok
}
bIsForwardTab = false;
sal_Int32 n, nLen = sSrchStr.getLength();
pJumpTable.reset( new TextSearchJumpTable );
for( n = nLen-1; n > 0; --n )
{
sal_Unicode cCh = sSrchStr[n];
TextSearchJumpTable::value_type aEntry( cCh, n );
::std::pair< TextSearchJumpTable::iterator, bool > aPair =
pJumpTable->insert( aEntry ); if ( !aPair.second )
(*(aPair.first)).second = n;
}
}
void TextSearch::MakeBackwardTab2()
{ // create the jumptable for the search text if( pJumpTable2 && !bIsForwardTab )
{ return; // the jumpTable is ok
}
bIsForwardTab = false;
sal_Int32 n, nLen = sSrchStr2.getLength();
pJumpTable2.reset( new TextSearchJumpTable );
for( n = nLen-1; n > 0; --n )
{
sal_Unicode cCh = sSrchStr2[n];
TextSearchJumpTable::value_type aEntry( cCh, n );
::std::pair< TextSearchJumpTable::iterator, bool > aPair =
pJumpTable2->insert( aEntry ); if ( !aPair.second )
(*(aPair.first)).second = n;
}
}
sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability // map css::util::SearchFlags to ICU uregex.h flags // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE // REG_NEWLINE is neither properly defined nor used anywhere => not implemented // REG_NOSUB is not used anywhere => not implemented // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute // LEV_RELAXED is only used for SearchAlgorithm==Approximate // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO // probably because the transliteration flag IGNORE_CASE handles it as well. if( (rOptions.searchFlag & css::util::SearchFlags::ALL_IGNORE_CASE) != 0
|| (transliterateFlags & TransliterationFlags::IGNORE_CASE))
nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
UErrorCode nIcuErr = U_ZERO_ERROR; // assumption: transliteration didn't mangle regexp control chars
icu::UnicodeString aIcuSearchPatStr( reinterpret_cast<const UChar*>(rPatternStr.getStr()), rPatternStr.getLength()); #ifndef DISABLE_WORDBOUND_EMULATION // for convenience specific syntax elements of the old regex engine are emulated // - by replacing \< with "word-break followed by a look-ahead word-char" staticconst icu::UnicodeString aChevronPatternB( "\\\\<", -1, icu::UnicodeString::kInvariant); staticconst icu::UnicodeString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, icu::UnicodeString::kInvariant); static icu::RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
aChevronMatcherB.reset( aIcuSearchPatStr);
aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
aChevronMatcherB.reset(); // - by replacing \> with "look-behind word-char followed by a word-break" staticconst icu::UnicodeString aChevronPatternE( "\\\\>", -1, icu::UnicodeString::kInvariant); staticconst icu::UnicodeString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, icu::UnicodeString::kInvariant); static icu::RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
aChevronMatcherE.reset( aIcuSearchPatStr);
aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
aChevronMatcherE.reset(); #endif
pRegexMatcher.reset( new icu::RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr) ); if (nIcuErr)
{
SAL_INFO( "i18npool", "TextSearch::RESrchPrepare UErrorCode " << nIcuErr);
pRegexMatcher.reset();
} else
{ // Pathological patterns may result in exponential run time making the // application appear to be frozen. Limit that. Documentation for this // call says // https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1RegexMatcher.html#a6ebcfcab4fe6a38678c0291643a03a00 // "The units of the limit are steps of the match engine. // Correspondence with actual processor time will depend on the speed // of the processor and the details of the specific pattern, but will // typically be on the order of milliseconds." // Just what is a good value? 42 is always an answer ... the 23 enigma // as well... which on the dev's machine is roughly 50 seconds with the // pattern of fdo#70627. /* TODO: make this a configuration settable value and possibly take * complexity of expression into account and maybe even length of text * to be matched; currently (2013-11-25) that is at most one 64k
* paragraph per RESrchFrwrd()/RESrchBkwrd() call. */
pRegexMatcher->setTimeLimit( 23*1000, nIcuErr);
}
}
staticbool lcl_findRegex(std::unique_ptr<icu::RegexMatcher> const& pRegexMatcher,
sal_Int32 nStartPos, sal_Int32 nEndPos, UErrorCode& rIcuErr)
{
pRegexMatcher->region(nStartPos, nEndPos, rIcuErr);
pRegexMatcher->useAnchoringBounds(false); // use whole text's anchoring bounds, not region's
pRegexMatcher->useTransparentBounds(true); // take text outside of the region into account for // look-ahead/behind assertions
if (!pRegexMatcher->find(rIcuErr))
{ /* TODO: future versions could pass the UErrorCode or translations * thereof to the caller, for example to inform the user of * U_REGEX_TIME_OUT. The strange thing though is that an error is set * only after the second call that returns immediately and not if
* timeout occurred on the first call?!? */
SAL_INFO( "i18npool", "lcl_findRegex UErrorCode " << rIcuErr); returnfalse;
} returntrue;
}
// use the ICU RegexMatcher to find the matches
UErrorCode nIcuErr = U_ZERO_ERROR; const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
searchStr.getLength());
pRegexMatcher->reset( aSearchTargetStr); // search until there is a valid match for(;;)
{ if (!lcl_findRegex( pRegexMatcher, startPos, endPos, nIcuErr)) return aRet;
// #i118887# ignore zero-length matches e.g. "a*" in "bc" int nStartOfs = pRegexMatcher->start( nIcuErr); int nEndOfs = pRegexMatcher->end( nIcuErr); if( nStartOfs < nEndOfs) break; // If the zero-length match is behind the string, do not match it again // and again until startPos reaches there. A match behind the string is // a "$" anchor. if (nStartOfs == endPos) break; // try at next position if there was a zero-length match if( ++startPos >= endPos) return aRet;
}
// extract the result of the search constint nGroupCount = pRegexMatcher->groupCount();
aRet.subRegExpressions = nGroupCount + 1;
aRet.startOffset.realloc( aRet.subRegExpressions); auto pstartOffset = aRet.startOffset.getArray();
aRet.endOffset.realloc( aRet.subRegExpressions); auto pendOffset = aRet.endOffset.getArray();
pstartOffset[0] = pRegexMatcher->start( nIcuErr);
pendOffset[0] = pRegexMatcher->end( nIcuErr); for( int i = 1; i <= nGroupCount; ++i) {
pstartOffset[i] = pRegexMatcher->start( i, nIcuErr);
pendOffset[i] = pRegexMatcher->end( i, nIcuErr);
}
// use the ICU RegexMatcher to find the matches // TODO: use ICU's backward searching once it becomes available // as its replacement using forward search is not as good as the real thing
UErrorCode nIcuErr = U_ZERO_ERROR; const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
searchStr.getLength());
pRegexMatcher->reset( aSearchTargetStr); if (!lcl_findRegex( pRegexMatcher, endPos, startPos, nIcuErr)) return aRet;
// find the last match int nLastPos = 0; int nFoundEnd = 0; int nGoodPos = 0, nGoodEnd = 0; bool bFirst = true; do {
nLastPos = pRegexMatcher->start( nIcuErr);
nFoundEnd = pRegexMatcher->end( nIcuErr); if (nLastPos < nFoundEnd)
{ // remember last non-zero-length match
nGoodPos = nLastPos;
nGoodEnd = nFoundEnd;
} if( nFoundEnd >= startPos) break;
bFirst = false; if( nFoundEnd == nLastPos)
++nFoundEnd;
} while( lcl_findRegex( pRegexMatcher, nFoundEnd, startPos, nIcuErr));
// Ignore all zero-length matches except "$" anchor on first match. if (nGoodPos == nGoodEnd)
{ if (bFirst && nLastPos == startPos)
nGoodPos = nLastPos; else return aRet;
}
// find last match again to get its details
lcl_findRegex( pRegexMatcher, nGoodPos, startPos, nIcuErr);
// fill in the details of the last match constint nGroupCount = pRegexMatcher->groupCount();
aRet.subRegExpressions = nGroupCount + 1;
aRet.startOffset.realloc( aRet.subRegExpressions); auto pstartOffset = aRet.startOffset.getArray();
aRet.endOffset.realloc( aRet.subRegExpressions); auto pendOffset = aRet.endOffset.getArray(); // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
pstartOffset[0] = pRegexMatcher->end( nIcuErr);
pendOffset[0] = pRegexMatcher->start( nIcuErr); for( int i = 1; i <= nGroupCount; ++i) {
pstartOffset[i] = pRegexMatcher->end( i, nIcuErr);
pendOffset[i] = pRegexMatcher->start( i, nIcuErr);
}
// Handle special cases empty pattern and/or string outside of the loop to // not add performance penalties there and simplify. if (nStartPos == nEndPos)
{
sal_Int32 i = 0; while (i < nPatternLen && rPattern[i] == '*')
++i; if (i == nPatternLen)
setWildcardMatch( aRes, nStartOffset, nEndOffset); return aRes;
}
// Empty pattern does not match any non-empty string. if (!nPatternLen) return aRes;
bool bRewind = false;
sal_uInt32 cPattern = 0;
sal_Int32 nPattern = 0;
sal_Int32 nAfterFakePattern = nPattern; if (mbWildcardAllowSubstring)
{ // Fake a leading '*' wildcard.
cPattern = '*';
bRewind = true; // Assume a non-'*' pattern character follows. If it is a '*' instead // that will be handled in the loop by setting nPat.
sal_uInt32 cu = rPattern.iterateCodePoints( &nAfterFakePattern); if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern < nPatternLen)
rPattern.iterateCodePoints( &nAfterFakePattern);
}
// The loop code tries to avoid costly calls to iterateCodePoints() when // possible.
do
{ if (bRewind)
{ // Reuse cPattern after '*', nPattern was correspondingly // incremented to point behind cPattern.
bRewind = false;
} elseif (nPattern < nPatternLen)
{ // nPattern will be incremented by iterateCodePoints().
cPattern = rPattern.iterateCodePoints( &nPattern); if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
{
bEscaped = true;
cPattern = rPattern.iterateCodePoints( &nPattern);
}
} else
{ // A trailing '*' is handled below. if (mbWildcardAllowSubstring)
{ // If the pattern is consumed and substring match allowed we're good.
setWildcardMatch( aRes, nStartOffset, nString); return aRes;
} elseif (nString < nEndPos && nLastAsterisk >= 0)
{ // If substring match is not allowed try a greedy '*' match.
nPattern = nLastAsterisk; continue; // do
} else return aRes;
}
if (cPattern == '*' && !bEscaped)
{ // '*' is one code unit, so not using iterateCodePoints() is ok. while (nPattern < nPatternLen && rPattern[nPattern] == '*')
++nPattern;
if (nPattern >= nPatternLen)
{ // Last pattern is '*', remaining string matches.
setWildcardMatch( aRes, nStartOffset, nEndOffset); return aRes;
}
nLastAsterisk = nPattern; // Remember last encountered '*'.
// cPattern will be the next non-'*' character, nPattern // incremented.
cPattern = rPattern.iterateCodePoints( &nPattern); if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
{
bEscaped = true;
cPattern = rPattern.iterateCodePoints( &nPattern);
}
cPatternAfterAsterisk = cPattern;
bEscapedAfterAsterisk = bEscaped;
nPat = nPattern; // Remember position of pattern behind '*', already incremented.
nStr = nString; // Remember the current string to be matched.
}
if (nString >= nEndPos) // Whatever follows in pattern, string will not match. return aRes;
// nString will be incremented by iterateCodePoints().
sal_uInt32 cString = searchStr.iterateCodePoints( &nString);
if ((cPattern != '?' || bEscaped) && cPattern != cString)
{ if (nPat == -1) // Non-match already without any '*' pattern. return aRes;
bRewind = true;
nPattern = nPat; // Rewind pattern to character behind '*', already incremented.
cPattern = cPatternAfterAsterisk;
bEscaped = bEscapedAfterAsterisk;
searchStr.iterateCodePoints( &nStr);
nString = nStr; // Restore incremented remembered string position. if (nPat == nAfterFakePattern)
{ // Next start offset will be the next character.
nStartOffset = nString;
}
} else
{ // An unescaped '?' pattern matched any character, or characters // matched. Reset only escaped state.
bEscaped = false;
}
} while (nString < nEndPos);
if (bRewind) return aRes;
// Eat trailing '*' pattern that matches anything, including nothing. // '*' is one code unit, so not using iterateCodePoints() is ok. while (nPattern < nPatternLen && rPattern[nPattern] == '*')
++nPattern;
// Handle special cases empty pattern and/or string outside of the loop to // not add performance penalties there and simplify. if (nStartPos == nEndPos)
{
sal_Int32 i = 0; while (i < nPatternLen && rPattern[i] == '*')
++i; if (i == nPatternLen)
setWildcardMatch( aRes, nStartOffset, nEndOffset); return aRes;
}
// Empty pattern does not match any non-empty string. if (!nPatternLen) return aRes;
// Reverse escaped patterns to ease the handling of escapes, keeping escape // and following character as one sequence in backward direction. if ((bUsePrimarySrchStr && maWildcardReversePattern.isEmpty()) ||
(!bUsePrimarySrchStr && maWildcardReversePattern2.isEmpty()))
{
OUStringBuffer aPatternBuf( rPattern);
sal_Int32 nIndex = 0; while (nIndex < nPatternLen)
{ const sal_Int32 nOld = nIndex; const sal_uInt32 cu = rPattern.iterateCodePoints( &nIndex); if (cu == mcWildcardEscapeChar)
{ if (nIndex < nPatternLen)
{ if (nIndex - nOld == 1)
{ // Simply move code units, we already memorized the one // in 'cu'. const sal_Int32 nOld2 = nIndex;
rPattern.iterateCodePoints( &nIndex); for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
aPatternBuf[nOld+i] = rPattern[nOld2+i];
aPatternBuf[nIndex-1] = static_cast<sal_Unicode>(cu);
} else
{ // Copy the escape character code units first in the // unlikely case that it would not be of BMP.
assert(nIndex - nOld == 2); // it's UTF-16, so...
sal_Unicode buf[2];
buf[0] = rPattern[nOld];
buf[1] = rPattern[nOld+1]; const sal_Int32 nOld2 = nIndex;
rPattern.iterateCodePoints( &nIndex); for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
aPatternBuf[nOld+i] = rPattern[nOld2+i];
aPatternBuf[nIndex-2] = buf[0];
aPatternBuf[nIndex-1] = buf[1];
}
} else
{ // Trailing escape would become leading escape, do what? // Eliminate.
aPatternBuf.remove( nOld, nIndex - nOld);
}
}
} if (bUsePrimarySrchStr)
maWildcardReversePattern = aPatternBuf.makeStringAndClear(); else
maWildcardReversePattern2 = aPatternBuf.makeStringAndClear();
} const OUString& rReversePattern = (bUsePrimarySrchStr ? maWildcardReversePattern : maWildcardReversePattern2);
nPatternLen = rReversePattern.getLength();
bool bRewind = false;
sal_uInt32 cPattern = 0;
sal_Int32 nPattern = nPatternLen;
sal_Int32 nAfterFakePattern = nPattern; if (mbWildcardAllowSubstring)
{ // Fake a trailing '*' wildcard.
cPattern = '*';
bRewind = true; // Assume a non-'*' pattern character follows. If it is a '*' instead // that will be handled in the loop by setting nPat.
sal_uInt32 cu = rReversePattern.iterateCodePoints( &nAfterFakePattern, -1); if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern > 0)
rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
}
// The loop code tries to avoid costly calls to iterateCodePoints() when // possible.
do
{ if (bRewind)
{ // Reuse cPattern after '*', nPattern was correspondingly // decremented to point before cPattern.
bRewind = false;
} elseif (nPattern > 0)
{ // nPattern will be decremented by iterateCodePoints().
cPattern = rReversePattern.iterateCodePoints( &nPattern, -1); if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
{
bEscaped = true;
cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
}
} else
{ // A trailing '*' is handled below. if (mbWildcardAllowSubstring)
{ // If the pattern is consumed and substring match allowed we're good.
setWildcardMatch( aRes, nStartOffset, nString); return aRes;
} elseif (nString > nEndPos && nLastAsterisk >= 0)
{ // If substring match is not allowed try a greedy '*' match.
nPattern = nLastAsterisk; continue; // do
} else return aRes;
}
if (cPattern == '*' && !bEscaped)
{ // '*' is one code unit, so not using iterateCodePoints() is ok. while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
--nPattern;
if (nPattern <= 0)
{ // First pattern is '*', remaining string matches.
setWildcardMatch( aRes, nStartOffset, nEndOffset); return aRes;
}
nLastAsterisk = nPattern; // Remember last encountered '*'.
// cPattern will be the previous non-'*' character, nPattern // decremented.
cPattern = rReversePattern.iterateCodePoints( &nPattern, -1); if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
{
bEscaped = true;
cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
}
cPatternAfterAsterisk = cPattern;
bEscapedAfterAsterisk = bEscaped;
nPat = nPattern; // Remember position of pattern before '*', already decremented.
nStr = nString; // Remember the current string to be matched.
}
if (nString <= nEndPos) // Whatever leads in pattern, string will not match. return aRes;
// nString will be decremented by iterateCodePoints().
sal_uInt32 cString = searchStr.iterateCodePoints( &nString, -1);
if ((cPattern != '?' || bEscaped) && cPattern != cString)
{ if (nPat == -1) // Non-match already without any '*' pattern. return aRes;
bRewind = true;
nPattern = nPat; // Rewind pattern to character before '*', already decremented.
cPattern = cPatternAfterAsterisk;
bEscaped = bEscapedAfterAsterisk;
searchStr.iterateCodePoints( &nStr, -1);
nString = nStr; // Restore decremented remembered string position. if (nPat == nAfterFakePattern)
{ // Next start offset will be this character (exclusive).
nStartOffset = nString;
}
} else
{ // An unescaped '?' pattern matched any character, or characters // matched. Reset only escaped state.
bEscaped = false;
}
} while (nString > nEndPos);
if (bRewind) return aRes;
// Eat leading '*' pattern that matches anything, including nothing. // '*' is one code unit, so not using iterateCodePoints() is ok. while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
--nPattern;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.