#include"rbbirpt.h"// Contains state table for the rbbi rules parser. // generated by a Perl script. #include"rbbirb.h" #include"rbbinode.h" #include"rbbiscan.h" #include"rbbitblb.h"
#include"uassert.h"
//------------------------------------------------------------------------------
//
// Unicode Set init strings for each of the character classes needed for parsing a rule file.
//               (Initialized with hex values for portability to EBCDIC based machines.
//                Really ugly, but there's no good way to avoid it.)
//
//              The sets are referred to by name in the rbbirpt.txt, which is the
//              source form of the state transition table for the RBBI rule parser.
//
//------------------------------------------------------------------------------

// Characters that may appear as literals in patterns without escaping or quoting.
// The array is a NUL-terminated UnicodeSet pattern; the legend above each row
// spells out the character that every hex code unit stands for.
static const char16_t gRuleSet_rule_char_pattern[] = {
 //   [     ^     [     \     p     {     Z     }     \     u     0     0     2     0
    0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30,
 //   -     \     u     0     0     7     f     ]     -     [     \     p
    0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70,
 //   {     L     }     ]     -     [     \     p     {     N     }     ]     ]
    0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0};
// Characters that may appear in a name, spelled as a NUL-terminated
// UnicodeSet pattern in hex code units (see the legend row below).
static const char16_t gRuleSet_name_char_pattern[] = {
 //   [     _     \     p     {     L     }     \     p     {     N     }     ]
    0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0};
U_CDECL_BEGIN
// Deleter for one RBBISetTableEl stored in the scanner's set hash table
// (installed with uhash_setValueDeleter on fSetTable).
//
//   p   the table element to release; it is treated as an RBBISetTableEl.
//
// The element's key string is deleted and the element itself is released
// with uprv_free().
static void U_CALLCONV RBBISetTable_deleter(void *p) {
    icu::RBBISetTableEl *px = static_cast<icu::RBBISetTableEl *>(p);
    delete px->key;
    // Note:  px->val is owned by the linked list "fSetsListHead" in scanner.
    //        Don't delete the value nodes here.
    uprv_free(px);
}
U_CDECL_END
// Do not check status until after all critical fields are sufficiently initialized // that the destructor can run cleanly. if (U_FAILURE(*rb->fStatus)) { return;
}
// // Set up the constant Unicode Sets. // Note: These could be made static, lazily initialized, and shared among // all instances of RBBIRuleScanners. BUT this is quite a bit simpler, // and the time to build these few sets should be small compared to a // full break iterator build.
fRuleSets[kRuleSet_rule_char-128]
= UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern), *rb->fStatus); // fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:]
fRuleSets[kRuleSet_white_space-128].
add(9, 0xd).add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029);
fRuleSets[kRuleSet_name_char-128]
= UnicodeSet(UnicodeString(gRuleSet_name_char_pattern), *rb->fStatus);
fRuleSets[kRuleSet_name_start_char-128]
= UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), *rb->fStatus);
fRuleSets[kRuleSet_digit_char-128]
= UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern), *rb->fStatus); if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) { // This case happens if ICU's data is missing. UnicodeSet tries to look up property // names from the init string, can't find them, and claims an illegal argument. // Change the error so that the actual problem will be clearer to users.
*rb->fStatus = U_BRK_INIT_ERROR;
} if (U_FAILURE(*rb->fStatus)) { return;
}
fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus); if (fSymbolTable == nullptr) {
*rb->fStatus = U_MEMORY_ALLOCATION_ERROR; return;
}
fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, nullptr, rb->fStatus); if (U_FAILURE(*rb->fStatus)) { return;
}
uhash_setValueDeleter(fSetTable, RBBISetTable_deleter);
}
// Node Stack. // Normally has one entry, which is the entire parse tree for the rules. // If errors occurred, there may be additional subtrees left on the stack. while (fNodeStackPtr > 0) { delete fNodeStack[fNodeStackPtr];
fNodeStackPtr--;
}
}
//------------------------------------------------------------------------------ // // doParseAction Do some action during rule parsing. // Called by the parse state machine. // Actions build the parse tree and Unicode Sets, // and maintain the parse stack for nested expressions. // // TODO: unify EParseAction and RBBI_RuleParseAction enum types. // They represent exactly the same thing. They're separate // only to work around enum forward declaration restrictions // in some compilers, while at the same time avoiding multiple // definitions problems. I'm sure that there's a better way. // //------------------------------------------------------------------------------
UBool RBBIRuleScanner::doParseActions(int32_t action)
{
RBBINode *n = nullptr;
UBool returnVal = true;
switch (action) {
case doExprStart:
pushNewNode(RBBINode::opStart);
fRuleNum++; break;
case doNoChain: // Scanned a '^' while on the rule start state.
fNoChainInRule = true; break;
case doExprCatOperator: // concatenation operator. // For the implicit concatenation of adjacent terms in an expression that are // not separated by any other operator. Action is invoked between the // actions for the two terms.
{
fixOpStack(RBBINode::precOpCat);
RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
RBBINode *catNode = pushNewNode(RBBINode::opCat); if (U_FAILURE(*fRB->fStatus)) { break;
}
catNode->fLeftChild = operandNode;
operandNode->fParent = catNode;
} break;
case doLParen: // Open Paren. // The openParen node is a dummy operation type with a low precedence, // which has the affect of ensuring that any real binary op that // follows within the parens binds more tightly to the operands than // stuff outside of the parens.
pushNewNode(RBBINode::opLParen); break;
case doExprRParen:
fixOpStack(RBBINode::precLParen); break;
case doNOP: break;
case doStartAssign: // We've just scanned "$variable = " // The top of the node stack has the $variable ref node.
// Save the start position of the RHS text in the StartExpression node // that precedes the $variableReference node on the stack. // This will eventually be used when saving the full $variable replacement // text as a string.
n = fNodeStack[fNodeStackPtr-1];
n->fFirstPos = fNextIndex; // move past the '='
// Push a new start-of-expression node; needed to keep parse of the // RHS expression happy.
pushNewNode(RBBINode::opStart); break;
case doEndAssign:
{ // We have reached the end of an assignment statement. // Current scan char is the ';' that terminates the assignment.
// Terminate expression, leaves expression parse tree rooted in TOS node.
fixOpStack(RBBINode::precStart); if (U_FAILURE(*fRB->fStatus)) { break;
}
// Save original text of right side of assignment, excluding the terminating ';' // in the root of the node for the right-hand-side expression.
RHSExprNode->fFirstPos = startExprNode->fFirstPos;
RHSExprNode->fLastPos = fScanIndex;
fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText);
// Expression parse tree becomes l. child of the $variable reference node.
varRefNode->fLeftChild = RHSExprNode;
RHSExprNode->fParent = varRefNode;
// Make a symbol table entry for the $variableRef node.
fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus); if (U_FAILURE(*fRB->fStatus)) { // This is a round-about way to get the parse position set // so that duplicate symbols error messages include a line number.
UErrorCode t = *fRB->fStatus;
*fRB->fStatus = U_ZERO_ERROR;
error(t); // When adding $variableRef to the symbol table fail, Delete // both nodes because deleting varRefNode will not delete // RHSExprNode internally. delete RHSExprNode; delete varRefNode;
}
// Clean up the stack. delete startExprNode;
fNodeStackPtr-=3; break;
}
case doEndOfRule:
{
fixOpStack(RBBINode::precStart); // Terminate expression, leaves expression if (U_FAILURE(*fRB->fStatus)) { // parse tree rooted in TOS node. break;
} #ifdef RBBI_DEBUG if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");} #endif
U_ASSERT(fNodeStackPtr == 1);
RBBINode *thisRule = fNodeStack[fNodeStackPtr];
// If this rule includes a look-ahead '/', add a endMark node to the // expression tree. if (fLookAheadRule) {
RBBINode *endNode = pushNewNode(RBBINode::endMark);
RBBINode *catNode = pushNewNode(RBBINode::opCat); if (U_FAILURE(*fRB->fStatus)) { break;
}
fNodeStackPtr -= 2;
catNode->fLeftChild = thisRule;
catNode->fRightChild = endNode;
fNodeStack[fNodeStackPtr] = catNode;
endNode->fVal = fRuleNum;
endNode->fLookAheadEnd = true;
thisRule = catNode;
// TODO: Disable chaining out of look-ahead (hard break) rules. // The break on rule match is forced, so there is no point in building up // the state table to chain into another rule for a longer match.
}
// Mark this node as being the root of a rule.
thisRule->fRuleRoot = true;
// Flag if chaining into this rule is wanted. // if (fRB->fChainRules && // If rule chaining is enabled globally via !!chain
!fNoChainInRule) { // and no '^' chain-in inhibit was on this rule
thisRule->fChainIn = true;
}
// All rule expressions are ORed together. // The ';' that terminates an expression really just functions as a '|' with // a low operator prededence. // // Each of the four sets of rules are collected separately. // (forward, reverse, safe_forward, safe_reverse) // OR this rule into the appropriate group of them. //
RBBINode **destRules = (fReverseRule? &fRB->fSafeRevTree : fRB->fDefaultTree);
if (*destRules != nullptr) { // This is not the first rule encountered. // OR previous stuff (from *destRules) // with the current rule expression (on the Node Stack) // with the resulting OR expression going to *destRules //
thisRule = fNodeStack[fNodeStackPtr];
RBBINode *prevRules = *destRules;
RBBINode *orNode = pushNewNode(RBBINode::opOr); if (U_FAILURE(*fRB->fStatus)) { break;
}
orNode->fLeftChild = prevRules;
prevRules->fParent = orNode;
orNode->fRightChild = thisRule;
thisRule->fParent = orNode;
*destRules = orNode;
} else
{ // This is the first rule encountered (for this direction). // Just move its parse tree from the stack to *destRules.
*destRules = fNodeStack[fNodeStackPtr];
}
fReverseRule = false; // in preparation for the next rule.
fLookAheadRule = false;
fNoChainInRule = false;
fNodeStackPtr = 0;
} break;
case doRuleError:
error(U_BRK_RULE_SYNTAX);
returnVal = false; break;
case doVariableNameExpectedErr:
error(U_BRK_RULE_SYNTAX); break;
// // Unary operands + ? * // These all appear after the operand to which they apply. // When we hit one, the operand (may be a whole sub expression) // will be on the top of the stack. // Unary Operator becomes TOS, with the old TOS as its one child. case doUnaryOpPlus:
{
RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
RBBINode *plusNode = pushNewNode(RBBINode::opPlus); if (U_FAILURE(*fRB->fStatus)) { break;
}
plusNode->fLeftChild = operandNode;
operandNode->fParent = plusNode;
} break;
case doRuleChar: // A "Rule Character" is any single character that is a literal part // of the regular expression. Like a, b and c in the expression "(abc*) | [:L:]" // These are pretty uncommon in break rules; the terms are more commonly // sets. To keep things uniform, treat these characters like as // sets that just happen to contain only one character.
{
n = pushNewNode(RBBINode::setRef); if (U_FAILURE(*fRB->fStatus)) { break;
}
findSetFor(UnicodeString(fC.fChar), n);
n->fFirstPos = fScanIndex;
n->fLastPos = fNextIndex;
fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); break;
}
case doDotAny: // scanned a ".", meaning match any single character.
{
n = pushNewNode(RBBINode::setRef); if (U_FAILURE(*fRB->fStatus)) { break;
}
findSetFor(UnicodeString(true, kAny, 3), n);
n->fFirstPos = fScanIndex;
n->fLastPos = fNextIndex;
fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); break;
}
case doSlash: // Scanned a '/', which identifies a look-ahead break position in a rule.
n = pushNewNode(RBBINode::lookAhead); if (U_FAILURE(*fRB->fStatus)) { break;
}
n->fVal = fRuleNum;
n->fFirstPos = fScanIndex;
n->fLastPos = fNextIndex;
fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
fLookAheadRule = true; break;
case doStartTagValue: // Scanned a '{', the opening delimiter for a tag value within a rule.
n = pushNewNode(RBBINode::tag); if (U_FAILURE(*fRB->fStatus)) { break;
}
n->fVal = 0;
n->fFirstPos = fScanIndex;
n->fLastPos = fNextIndex; break;
case doTagDigit: // Just scanned a decimal digit that's part of a tag value
{
n = fNodeStack[fNodeStackPtr];
uint32_t v = u_charDigitValue(fC.fChar);
U_ASSERT(v < 10);
int64_t updated = static_cast<int64_t>(n->fVal)*10 + v; // Avoid overflow n->fVal if (updated > INT32_MAX) {
error(U_BRK_RULE_SYNTAX); break;
}
n->fVal = static_cast<int32_t>(updated); break;
}
case doTagValue:
n = fNodeStack[fNodeStackPtr];
n->fLastPos = fNextIndex;
fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); break;
case doTagExpectedError:
error(U_BRK_MALFORMED_RULE_TAG);
returnVal = false; break;
case doOptionStart: // Scanning a !!option. At the start of string.
fOptionStart = fScanIndex; break;
case doStartVariableName:
n = pushNewNode(RBBINode::varRef); if (U_FAILURE(*fRB->fStatus)) { break;
}
n->fFirstPos = fScanIndex; break;
case doEndVariableName:
n = fNodeStack[fNodeStackPtr]; if (n==nullptr || n->fType != RBBINode::varRef) {
error(U_BRK_INTERNAL_ERROR); break;
}
n->fLastPos = fScanIndex;
fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText); // Look the newly scanned name up in the symbol table // If there's an entry, set the l. child of the var ref to the replacement expression. // (We also pass through here when scanning assignments, but no harm is done, other // than a slight wasted effort that seems hard to avoid. Lookup will be null)
n->fLeftChild = fSymbolTable->lookupNode(n->fText); break;
case doCheckVarDef:
n = fNodeStack[fNodeStackPtr]; if (n->fLeftChild == nullptr) {
error(U_BRK_UNDEFINED_VARIABLE);
returnVal = false;
} break;
case doExprFinished: break;
case doRuleErrorAssignExpr:
error(U_BRK_ASSIGN_ERROR);
returnVal = false; break;
//------------------------------------------------------------------------------ // // Error Report a rule parse error. // Only report it if no previous error has been recorded. // //------------------------------------------------------------------------------ void RBBIRuleScanner::error(UErrorCode e) { if (U_SUCCESS(*fRB->fStatus)) {
*fRB->fStatus = e; if (fRB->fParseError) {
fRB->fParseError->line = fLineNum;
fRB->fParseError->offset = fCharNum;
fRB->fParseError->preContext[0] = 0;
fRB->fParseError->postContext[0] = 0;
}
}
}
//------------------------------------------------------------------------------ // // fixOpStack The parse stack holds partially assembled chunks of the parse tree. // An entry on the stack may be as small as a single setRef node, // or as large as the parse tree // for an entire expression (this will be the one item left on the stack // when the parsing of an RBBI rule completes. // // This function is called when a binary operator is encountered. // It looks back up the stack for operators that are not yet associated // with a right operand, and if the precedence of the stacked operator >= // the precedence of the current operator, binds the operand left, // to the previously encountered operator. // //------------------------------------------------------------------------------ void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) {
RBBINode *n; // printNodeStack("entering fixOpStack()"); for (;;) {
n = fNodeStack[fNodeStackPtr-1]; // an operator node if (n->fPrecedence == 0) {
RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node");
error(U_BRK_INTERNAL_ERROR); return;
}
if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) { // The most recent operand goes with the current operator, // not with the previously stacked one. break;
} // Stack operator is a binary op ( '|' or concatenation) // TOS operand becomes right child of this operator. // Resulting subexpression becomes the TOS operand.
n->fRightChild = fNodeStack[fNodeStackPtr];
fNodeStack[fNodeStackPtr]->fParent = n;
fNodeStackPtr--; // printNodeStack("looping in fixOpStack() ");
}
if (p <= RBBINode::precLParen) { // Scan is at a right paren or end of expression. // The scanned item must match the stack, or else there was an error. // Discard the left paren (or start expr) node from the stack, // leaving the completed (sub)expression as TOS. if (n->fPrecedence != p) { // Right paren encountered matched start of expression node, or // end of expression matched with a left paren node.
error(U_BRK_MISMATCHED_PAREN);
}
fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr];
fNodeStackPtr--; // Delete the now-discarded LParen or Start node. delete n;
} // printNodeStack("leaving fixOpStack()");
}
//------------------------------------------------------------------------------ // // findSetFor given a UnicodeString, // - find the corresponding Unicode Set (uset node) // (create one if necessary) // - Set fLeftChild of the caller's node (should be a setRef node) // to the uset node // Maintain a hash table of uset nodes, so the same one is always used // for the same string. // If a "to adopt" set is provided and we haven't seen this key before, // add the provided set to the hash table. // If the string is one (32 bit) char in length, the set contains // just one element which is the char in question. // If the string is "any", return a set containing all chars. // //------------------------------------------------------------------------------ void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) {
RBBISetTableEl *el;
// First check whether we've already cached a set for this string. // If so, just use the cached set in the new node. // delete any set provided by the caller, since we own it.
el = static_cast<RBBISetTableEl*>(uhash_get(fSetTable, &s)); if (el != nullptr) { delete setToAdopt;
node->fLeftChild = el->val;
U_ASSERT(node->fLeftChild->fType == RBBINode::uset); return;
}
// Haven't seen this set before. // If the caller didn't provide us with a prebuilt set, // create a new UnicodeSet now. if (setToAdopt == nullptr) { if (s.compare(kAny, -1) == 0) {
setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
} else {
UChar32 c;
c = s.char32At(0);
setToAdopt = new UnicodeSet(c, c);
}
}
// // Make a new uset node to refer to this UnicodeSet // This new uset node becomes the child of the caller's setReference node. //
RBBINode *usetNode = new RBBINode(RBBINode::uset); if (usetNode == nullptr) {
error(U_MEMORY_ALLOCATION_ERROR); delete setToAdopt; return;
}
usetNode->fInputSet = setToAdopt;
usetNode->fParent = node;
node->fLeftChild = usetNode;
usetNode->fText = s;
// // Add the new uset node to the list of all uset nodes. //
fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);
// // Add the new set to the set hash table. //
el = static_cast<RBBISetTableEl*>(uprv_malloc(sizeof(RBBISetTableEl)));
UnicodeString *tkey = new UnicodeString(s); if (tkey == nullptr || el == nullptr || setToAdopt == nullptr) { // Delete to avoid memory leak delete tkey;
tkey = nullptr;
uprv_free(el);
el = nullptr; delete setToAdopt;
setToAdopt = nullptr;
//------------------------------------------------------------------------------ // // nextCharLL Low Level Next Char from rule input source. // Get a char from the input character iterator, // keep track of input position for error reporting. // //------------------------------------------------------------------------------
UChar32 RBBIRuleScanner::nextCharLL() {
UChar32 ch;
if (ch == chCR ||
ch == chNEL ||
ch == chLS ||
(ch == chLF && fLastChar != chCR)) { // Character is starting a new line. Bump up the line number, and // reset the column to 0.
fLineNum++;
fCharNum=0; if (fQuoteMode) {
error(U_BRK_NEW_LINE_IN_QUOTED_STRING);
fQuoteMode = false;
}
} else { // Character is not starting a new line. Except in the case of a // LF following a CR, increment the column position. if (ch != chLF) {
fCharNum++;
}
}
fLastChar = ch; return ch;
}
//------------------------------------------------------------------------------ // // nextChar for rules scanning. At this level, we handle stripping // out comments and processing backslash character escapes. // The rest of the rules grammar is handled at the next level up. // //------------------------------------------------------------------------------ void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
// Unicode Character constants needed for the processing done by nextChar(), // in hex because literals wont work on EBCDIC machines.
// // check for '' sequence. // These are recognized in all contexts, whether in quoted text or not. // if (c.fChar == chApos) { if (fRB->fRules.char32At(fNextIndex) == chApos) {
c.fChar = nextCharLL(); // get nextChar officially so character counts
c.fEscaped = true; // stay correct.
} else
{ // Single quote, by itself. // Toggle quoting mode. // Return either '(' or ')', because quotes cause a grouping of the quoted text.
fQuoteMode = !fQuoteMode; if (fQuoteMode) {
c.fChar = chLParen;
} else {
c.fChar = chRParen;
}
c.fEscaped = false; // The paren that we return is not escaped. return;
}
}
if (c.fChar == static_cast<UChar32>(-1)) { return;
} if (fQuoteMode) {
c.fEscaped = true;
} else
{ // We are not in a 'quoted region' of the source. // if (c.fChar == chPound) { // Start of a comment. Consume the rest of it. // The new-line char that terminates the comment is always returned. // It will be treated as white-space, and serves to break up anything // that might otherwise incorrectly clump together with a comment in // the middle (a variable name, for example.)
int32_t commentStart = fScanIndex; for (;;) {
c.fChar = nextCharLL(); if (c.fChar == static_cast<UChar32>(-1) || // EOF
c.fChar == chCR ||
c.fChar == chLF ||
c.fChar == chNEL ||
c.fChar == chLS) {break;}
} for (int32_t i=commentStart; i<fNextIndex-1; ++i) {
fRB->fStrippedRules.setCharAt(i, u' ');
}
} if (c.fChar == static_cast<UChar32>(-1)) { return;
}
// // check for backslash escaped characters. // Use UnicodeString::unescapeAt() to handle them. // if (c.fChar == chBackSlash) {
c.fEscaped = true;
int32_t startX = fNextIndex;
c.fChar = fRB->fRules.unescapeAt(fNextIndex); if (fNextIndex == startX) {
error(U_BRK_HEX_DIGITS_EXPECTED);
}
fCharNum += fNextIndex-startX;
}
} // putc(c.fChar, stdout);
}
//------------------------------------------------------------------------------ // // Parse RBBI rules. The state machine for rules parsing is here. // The state tables are hand-written in the file rbbirpt.txt, // and converted to the form used here by a perl // script rbbicst.pl // //------------------------------------------------------------------------------ void RBBIRuleScanner::parse() {
uint16_t state; const RBBIRuleTableEl *tableEl;
if (U_FAILURE(*fRB->fStatus)) { return;
}
state = 1;
nextChar(fC); // // Main loop for the rule parsing state machine. // Runs once per state transition. // Each time through optionally performs, depending on the state table, // - an advance to the the next input char // - an action to be performed. // - pushing or popping a state to/from the local state return stack. // for (;;) { // Bail out if anything has gone wrong. // RBBI rule file parsing stops on the first error encountered. if (U_FAILURE(*fRB->fStatus)) { break;
}
// Quit if state == 0. This is the normal way to exit the state machine. // if (state == 0) { break;
}
// Find the state table element that matches the input char from the rule, or the // class of the input character. Start with the first table row for this // state, then linearly scan forward until we find a row that matches the // character. The last row for each state always matches all characters, so // the search will stop there, if not before. //
tableEl = &gRuleParseStateTable[state]; #ifdef RBBI_DEBUG if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ",
fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);
} #endif
for (;;) { #ifdef RBBI_DEBUG if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("."); fflush(stdout);} #endif if (tableEl->fCharClass < 127 && fC.fEscaped == false && tableEl->fCharClass == fC.fChar) { // Table row specified an individual character, not a set, and // the input character is not escaped, and // the input character matched it. break;
} if (tableEl->fCharClass == 255) { // Table row specified default, match anything character class. break;
} if (tableEl->fCharClass == 254 && fC.fEscaped) { // Table row specified "escaped" and the char was escaped. break;
} if (tableEl->fCharClass == 253 && fC.fEscaped &&
(fC.fChar == 0x50 || fC.fChar == 0x70 )) { // Table row specified "escaped P" and the char is either 'p' or 'P'. break;
} if (tableEl->fCharClass == 252 && fC.fChar == static_cast<UChar32>(-1)) { // Table row specified eof and we hit eof on the input. break;
}
if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
fC.fEscaped == false && // char is not escaped &&
fC.fChar != static_cast<UChar32>(-1)) { // char is not EOF
U_ASSERT((tableEl->fCharClass-128) < UPRV_LENGTHOF(fRuleSets)); if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { // Table row specified a character class, or set of characters, // and the current char matches it. break;
}
}
// No match on this row, advance to the next row for this state,
tableEl++;
} if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");}
// // We've found the row of the state table that matches the current input // character from the rules string. // Perform any action specified by this row in the state table. if (doParseActions(static_cast<int32_t>(tableEl->fAction)) == false) { // Break out of the state machine loop if the // the action signalled some kind of error, or // the action was to exit, occurs on normal end-of-rules-input. break;
}
if (tableEl->fPushState != 0) {
fStackPtr++; if (fStackPtr >= kStackSize) {
error(U_BRK_INTERNAL_ERROR);
RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");
fStackPtr--;
}
fStack[fStackPtr] = tableEl->fPushState;
}
if (tableEl->fNextChar) {
nextChar(fC);
}
// Get the next state from the table entry, or from the // state stack if the next state was specified as "pop". if (tableEl->fNextState != 255) {
state = tableEl->fNextState;
} else {
state = fStack[fStackPtr];
fStackPtr--; if (fStackPtr < 0) {
error(U_BRK_INTERNAL_ERROR);
RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");
fStackPtr++;
}
}
}
if (U_FAILURE(*fRB->fStatus)) { return;
}
// If there are no forward rules set an error. // if (fRB->fForwardTree == nullptr) {
error(U_BRK_RULE_SYNTAX); return;
}
// // Parsing of the input RBBI rules is complete. // We now have a parse tree for the rule expressions // and a list of all UnicodeSets that are referenced. // #ifdef RBBI_DEBUG if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();} if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) {
RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
RBBINode::printTree(fRB->fForwardTree, true);
RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
RBBINode::printTree(fRB->fReverseTree, true);
RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
RBBINode::printTree(fRB->fSafeFwdTree, true);
RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
RBBINode::printTree(fRB->fSafeRevTree, true);
} #endif
}
//------------------------------------------------------------------------------
//
//  printNodeStack    for debugging...
//
//                    Prints the given title, then dumps every entry of the
//                    node stack from the top (fNodeStackPtr) down to index 1.
//                    Only compiled when RBBI_DEBUG is defined.
//
//------------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void RBBIRuleScanner::printNodeStack(const char *title) {
    RBBIDebugPrintf("%s. Dumping node stack...\n", title);
    for (int i = fNodeStackPtr; i > 0; i--) {
        RBBINode::printTree(fNodeStack[i], true);
    }
}
#endif
//------------------------------------------------------------------------------
//
//  pushNewNode   create a new RBBINode of the specified type and push it
//                onto the stack of nodes.
//
//                Returns the new node, or nullptr when an error is already
//                recorded, the node stack is full, or the allocation failed.
//
//------------------------------------------------------------------------------
RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) {
    // Nothing is pushed once a failure has been recorded.
    if (U_FAILURE(*fRB->fStatus)) {
        return nullptr;
    }

    const bool stackIsFull = fNodeStackPtr >= kStackSize - 1;
    if (stackIsFull) {
        error(U_BRK_RULE_SYNTAX);
        RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow.");
        return nullptr;
    }

    // Claim the slot first, then fill it; the slot holds nullptr if allocation fails.
    ++fNodeStackPtr;
    RBBINode *freshNode = new RBBINode(t);
    fNodeStack[fNodeStackPtr] = freshNode;
    if (freshNode == nullptr) {
        *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR;
    }
    return freshNode;
}
//------------------------------------------------------------------------------ // // scanSet Construct a UnicodeSet from the text at the current scan // position. Advance the scan position to the first character // after the set. // // A new RBBI setref node referring to the set is pushed onto the node // stack. // // The scan position is normally under the control of the state machine // that controls rule parsing. UnicodeSets, however, are parsed by // the UnicodeSet constructor, not by the RBBI rule parser. // //------------------------------------------------------------------------------ void RBBIRuleScanner::scanSet() {
ParsePosition pos; int startPos; int i;
if (U_FAILURE(*fRB->fStatus)) { return;
}
pos.setIndex(fScanIndex);
startPos = fScanIndex;
UErrorCode localStatus = U_ZERO_ERROR;
LocalPointer<UnicodeSet> uset(new UnicodeSet(), localStatus); if (U_FAILURE(localStatus)) {
error(localStatus); return;
}
uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus); if (U_FAILURE(localStatus)) { // TODO: Get more accurate position of the error from UnicodeSet's return info. // UnicodeSet appears to not be reporting correctly at this time. #ifdef RBBI_DEBUG
RBBIDebugPrintf("UnicodeSet parse position.ErrorIndex = %d\n", pos.getIndex()); #endif
error(localStatus); return;
}
// Verify that the set contains at least one code point. //
U_ASSERT(uset.isValid());
UnicodeSet tempSet(*uset); // Use tempSet to handle the case that the UnicodeSet contains // only string element, such as [{ab}] and treat it as empty set.
tempSet.removeAllStrings(); if (tempSet.isEmpty()) { // This set is empty. // Make it an error, because it almost certainly is not what the user wanted. // Also, avoids having to think about corner cases in the tree manipulation code // that occurs later on.
error(U_BRK_RULE_EMPTY_SET); return;
}
// Advance the RBBI parse position over the UnicodeSet pattern. // Don't just set fScanIndex because the line/char positions maintained // for error reporting would be thrown off.
i = pos.getIndex(); for (;U_SUCCESS(*fRB->fStatus);) { if (fNextIndex >= i) { break;
}
nextCharLL();
}
if (U_SUCCESS(*fRB->fStatus)) {
RBBINode *n;
n = pushNewNode(RBBINode::setRef); if (U_FAILURE(*fRB->fStatus)) { return;
}
n->fFirstPos = startPos;
n->fLastPos = fNextIndex;
fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); // findSetFor() serves several purposes here: // - Adopts storage for the UnicodeSet, will be responsible for deleting. // - Maintains collection of all sets in use, needed later for establishing // character categories for run time engine. // - Eliminates mulitiple instances of the same set. // - Creates a new uset node if necessary (if this isn't a duplicate.)
findSetFor(n->fText, n, uset.orphan());
}
 Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.