#include"regexcst.h"// Contains state table for the regex pattern parser. // generated by a Perl script. #include"regexst.h"
U_NAMESPACE_BEGIN
// "Rule Char" characters: those that carry special meaning in a regexp and
// must therefore be escaped in order to appear as literals.
constexpr const char16_t *gRuleSet_rule_chars = u"*?+[(){}^$|\\.";
//
//   Backslash escape characters that are handled by ICU's unescape() function.
//
constexpr const char16_t *gUnescapeChars = u"acefnrtuUx";
//
//   UnicodeSet pattern used for the Regular Expression \w class.
//
constexpr const char16_t *gIsWordPattern = u"[\\p{Alphabetic}\\p{M}\\p{Nd}\\p{Pc}\\u200c\\u200d]";
//
//   UnicodeSet pattern used for the Regular Expression \s class.
//
constexpr const char16_t *gIsSpacePattern = u"[\\p{WhiteSpace}]";
//
//  "Normal" is the set of characters that need no special handling
//  when finding grapheme cluster boundaries.
//
//  The set is complemented first and then trimmed: the range
//  0xac00..0xd7a4 and the CONTROL, L, V and T sets are taken out, and the
//  result is frozen.  The calls run in exactly that order.
//
//  NOTE(review): 0xac00..0xd7a4 is presumably meant to cover the precomposed
//  Hangul syllables; that Unicode block ends at U+D7A3, so confirm that the
//  end value 0xd7a4 is intended.
//
fPropSets[URX_GC_NORMAL]
    .complement()
    .remove(0xac00, 0xd7a4)
    .removeAll(fPropSets[URX_GC_CONTROL])
    .removeAll(fPropSets[URX_GC_L])
    .removeAll(fPropSets[URX_GC_V])
    .removeAll(fPropSets[URX_GC_T])
    .freeze();
// Initialize the 8-bit fast bit sets from the parallel full // UnicodeSets. // // TODO: 25 Oct 2019 are these fast 8-bit sets worth keeping? // Measured 3.5% gain on (non) matching with the pattern "x(?:\\S+)+x" // This runs in exponential time, making it easy to adjust the time for // convenient measuring. // // This 8 bit optimization dates from the early days of ICU, // with a less optimized UnicodeSet. At the time, the difference // was substantial.
for (int32_t i=0; i<URX_LAST_SET; i++) {
fPropSets8[i].init(&fPropSets[i]);
}
// Sets that are used while parsing rules but are not referenced from the
// parse state table.
//
// The rule-char entry is filled with the characters of gRuleSet_rule_chars,
// then complemented, then frozen.
fRuleSets[kRuleSet_rule_char - 128].addAll(UnicodeString(gRuleSet_rule_chars));
fRuleSets[kRuleSet_rule_char - 128].complement();
fRuleSets[kRuleSet_rule_char - 128].freeze();