icu51/source/common/rbbiscan.cpp - Issue 20882002: Check in the pristine copy of ICU 51.2

Unified Diff: icu51/source/common/rbbiscan.cpp

Issue 20882002: Check in the pristine copy of ICU 51.2 (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 7 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu51/source/common/rbbiscan.cpp

===================================================================

--- icu51/source/common/rbbiscan.cpp (revision 0)

+++ icu51/source/common/rbbiscan.cpp (revision 0)

@@ -0,0 +1,1213 @@

+//

+// file: rbbiscan.cpp

+//

+// This file contains the Rule Based Break Iterator Rule Builder functions for

+// scanning the rules and assembling a parse tree. This is the first phase

+// of compiling the rules.

+//

+// The overall of the rules is managed by class RBBIRuleBuilder, which will

+// create and use an instance of this class as part of the process.

+//

+#include "unicode/utypes.h"

+#if !UCONFIG_NO_BREAK_ITERATION

+#include "unicode/unistr.h"

+#include "unicode/uniset.h"

+#include "unicode/uchar.h"

+#include "unicode/uchriter.h"

+#include "unicode/parsepos.h"

+#include "unicode/parseerr.h"

+#include "cmemory.h"

+#include "cstring.h"

+#include "rbbirpt.h" // Contains state table for the rbbi rules parser.

+ // generated by a Perl script.

+#include "rbbirb.h"

+#include "rbbinode.h"

+#include "rbbiscan.h"

+#include "rbbitblb.h"

+#include "uassert.h"

+#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

+//------------------------------------------------------------------------------

+//

+// Unicode Set init strings for each of the character classes needed for parsing a rule file.

+// (Initialized with hex values for portability to EBCDIC based machines.

+// Really ugly, but there's no good way to avoid it.)

+//

+// The sets are referred to by name in the rbbirpt.txt, which is the

+// source form of the state transition table for the RBBI rule parser.

+//

+//------------------------------------------------------------------------------

+static const UChar gRuleSet_rule_char_pattern[] = {

+ // [ ^ [ \ p { Z } \ u 0 0 2 0

+ 0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30,

+ // - \ u 0 0 7 f ] - [ \ p

+ 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70,

+ // { L } ] - [ \ p { N } ] ]

+ 0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0};

+static const UChar gRuleSet_name_char_pattern[] = {

+// [ _ \ p { L } \ p { N } ]

+ 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0};

+static const UChar gRuleSet_digit_char_pattern[] = {

+// [ 0 - 9 ]

+ 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};

+static const UChar gRuleSet_name_start_char_pattern[] = {

+// [ _ \ p { L } ]

+ 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 };

+static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00}; // "any"

+U_CDECL_BEGIN

+static void U_CALLCONV RBBISetTable_deleter(void *p) {

+ icu::RBBISetTableEl *px = (icu::RBBISetTableEl *)p;

+ delete px->key;

+ // Note: px->val is owned by the linked list "fSetsListHead" in scanner.

+ // Don't delete the value nodes here.

+ uprv_free(px);

+U_CDECL_END

+U_NAMESPACE_BEGIN

+//------------------------------------------------------------------------------

+//

+// Constructor.

+//

+//------------------------------------------------------------------------------

+RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)

+ fRB = rb;

+ fStackPtr = 0;

+ fStack[fStackPtr] = 0;

+ fNodeStackPtr = 0;

+ fRuleNum = 0;

+ fNodeStack[0] = NULL;

+ fSymbolTable = NULL;

+ fSetTable = NULL;

+ fScanIndex = 0;

+ fNextIndex = 0;

+ fReverseRule = FALSE;

+ fLookAheadRule = FALSE;

+ fLineNum = 1;

+ fCharNum = 0;

+ fQuoteMode = FALSE;

+ // Do not check status until after all critical fields are sufficiently initialized

+ // that the destructor can run cleanly.

+ if (U_FAILURE(*rb->fStatus)) {

+ return;

+ }

+ //

+ // Set up the constant Unicode Sets.

+ // Note: These could be made static, lazily initialized, and shared among

+ // all instances of RBBIRuleScanners. BUT this is quite a bit simpler,

+ // and the time to build these few sets should be small compared to a

+ // full break iterator build.

+ fRuleSets[kRuleSet_rule_char-128]

+ = UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern), *rb->fStatus);

+ // fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:]

+ fRuleSets[kRuleSet_white_space-128].

+ add(9, 0xd).add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029);

+ fRuleSets[kRuleSet_name_char-128]

+ = UnicodeSet(UnicodeString(gRuleSet_name_char_pattern), *rb->fStatus);

+ fRuleSets[kRuleSet_name_start_char-128]

+ = UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), *rb->fStatus);

+ fRuleSets[kRuleSet_digit_char-128]

+ = UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern), *rb->fStatus);

+ if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) {

+ // This case happens if ICU's data is missing. UnicodeSet tries to look up property

+ // names from the init string, can't find them, and claims an illegal argument.

+ // Change the error so that the actual problem will be clearer to users.

+ *rb->fStatus = U_BRK_INIT_ERROR;

+ }

+ if (U_FAILURE(*rb->fStatus)) {

+ return;

+ }

+ fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus);

+ if (fSymbolTable == NULL) {

+ *rb->fStatus = U_MEMORY_ALLOCATION_ERROR;

+ return;

+ }

+ fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, rb->fStatus);

+ if (U_FAILURE(*rb->fStatus)) {

+ return;

+ }

+ uhash_setValueDeleter(fSetTable, RBBISetTable_deleter);

+//------------------------------------------------------------------------------

+//

+// Destructor

+//

+//------------------------------------------------------------------------------

+RBBIRuleScanner::~RBBIRuleScanner() {

+ delete fSymbolTable;

+ if (fSetTable != NULL) {

+ uhash_close(fSetTable);

+ fSetTable = NULL;

+ }

+ // Node Stack.

+ // Normally has one entry, which is the entire parse tree for the rules.

+ // If errors occured, there may be additional subtrees left on the stack.

+ while (fNodeStackPtr > 0) {

+ delete fNodeStack[fNodeStackPtr];

+ fNodeStackPtr--;

+ }

+//------------------------------------------------------------------------------

+//

+// doParseAction Do some action during rule parsing.

+// Called by the parse state machine.

+// Actions build the parse tree and Unicode Sets,

+// and maintain the parse stack for nested expressions.

+//

+// TODO: unify EParseAction and RBBI_RuleParseAction enum types.

+// They represent exactly the same thing. They're separate

+// only to work around enum forward declaration restrictions

+// in some compilers, while at the same time avoiding multiple

+// definitions problems. I'm sure that there's a better way.

+//

+//------------------------------------------------------------------------------

+UBool RBBIRuleScanner::doParseActions(int32_t action)

+ RBBINode *n = NULL;

+ UBool returnVal = TRUE;

+ switch (action) {

+ case doExprStart:

+ pushNewNode(RBBINode::opStart);

+ fRuleNum++;

+ break;

+ case doExprOrOperator:

+ {

+ fixOpStack(RBBINode::precOpCat);

+ RBBINode *operandNode = fNodeStack[fNodeStackPtr--];

+ RBBINode *orNode = pushNewNode(RBBINode::opOr);

+ orNode->fLeftChild = operandNode;

+ operandNode->fParent = orNode;

+ }

+ break;

+ case doExprCatOperator:

+ // concatenation operator.

+ // For the implicit concatenation of adjacent terms in an expression that are

+ // not separated by any other operator. Action is invoked between the

+ // actions for the two terms.

+ {

+ fixOpStack(RBBINode::precOpCat);

+ RBBINode *operandNode = fNodeStack[fNodeStackPtr--];

+ RBBINode *catNode = pushNewNode(RBBINode::opCat);

+ catNode->fLeftChild = operandNode;

+ operandNode->fParent = catNode;

+ }

+ break;

+ case doLParen:

+ // Open Paren.

+ // The openParen node is a dummy operation type with a low precedence,

+ // which has the affect of ensuring that any real binary op that

+ // follows within the parens binds more tightly to the operands than

+ // stuff outside of the parens.

+ pushNewNode(RBBINode::opLParen);

+ break;

+ case doExprRParen:

+ fixOpStack(RBBINode::precLParen);

+ break;

+ case doNOP:

+ break;

+ case doStartAssign:

+ // We've just scanned "$variable = "

+ // The top of the node stack has the $variable ref node.

+ // Save the start position of the RHS text in the StartExpression node

+ // that precedes the $variableReference node on the stack.

+ // This will eventually be used when saving the full $variable replacement

+ // text as a string.

+ n = fNodeStack[fNodeStackPtr-1];

+ n->fFirstPos = fNextIndex; // move past the '='

+ // Push a new start-of-expression node; needed to keep parse of the

+ // RHS expression happy.

+ pushNewNode(RBBINode::opStart);

+ break;

+ case doEndAssign:

+ {

+ // We have reached the end of an assignement statement.

+ // Current scan char is the ';' that terminates the assignment.

+ // Terminate expression, leaves expression parse tree rooted in TOS node.

+ fixOpStack(RBBINode::precStart);

+ RBBINode *startExprNode = fNodeStack[fNodeStackPtr-2];

+ RBBINode *varRefNode = fNodeStack[fNodeStackPtr-1];

+ RBBINode *RHSExprNode = fNodeStack[fNodeStackPtr];

+ // Save original text of right side of assignment, excluding the terminating ';'

+ // in the root of the node for the right-hand-side expression.

+ RHSExprNode->fFirstPos = startExprNode->fFirstPos;

+ RHSExprNode->fLastPos = fScanIndex;

+ fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText);

+ // Expression parse tree becomes l. child of the $variable reference node.

+ varRefNode->fLeftChild = RHSExprNode;

+ RHSExprNode->fParent = varRefNode;

+ // Make a symbol table entry for the $variableRef node.

+ fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus);

+ if (U_FAILURE(*fRB->fStatus)) {

+ // This is a round-about way to get the parse position set

+ // so that duplicate symbols error messages include a line number.

+ UErrorCode t = *fRB->fStatus;

+ *fRB->fStatus = U_ZERO_ERROR;

+ error(t);

+ }

+ // Clean up the stack.

+ delete startExprNode;

+ fNodeStackPtr-=3;

+ break;

+ }

+ case doEndOfRule:

+ {

+ fixOpStack(RBBINode::precStart); // Terminate expression, leaves expression

+ if (U_FAILURE(*fRB->fStatus)) { // parse tree rooted in TOS node.

+ break;

+ }

+#ifdef RBBI_DEBUG

+ if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}

+#endif

+ U_ASSERT(fNodeStackPtr == 1);

+ // If this rule includes a look-ahead '/', add a endMark node to the

+ // expression tree.

+ if (fLookAheadRule) {

+ RBBINode *thisRule = fNodeStack[fNodeStackPtr];

+ RBBINode *endNode = pushNewNode(RBBINode::endMark);

+ RBBINode *catNode = pushNewNode(RBBINode::opCat);

+ fNodeStackPtr -= 2;

+ catNode->fLeftChild = thisRule;

+ catNode->fRightChild = endNode;

+ fNodeStack[fNodeStackPtr] = catNode;

+ endNode->fVal = fRuleNum;

+ endNode->fLookAheadEnd = TRUE;

+ }

+ // All rule expressions are ORed together.

+ // The ';' that terminates an expression really just functions as a '|' with

+ // a low operator prededence.

+ //

+ // Each of the four sets of rules are collected separately.

+ // (forward, reverse, safe_forward, safe_reverse)

+ // OR this rule into the appropriate group of them.

+ //

+ RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree);

+ if (*destRules != NULL) {

+ // This is not the first rule encounted.

+ // OR previous stuff (from *destRules)

+ // with the current rule expression (on the Node Stack)

+ // with the resulting OR expression going to *destRules

+ //

+ RBBINode *thisRule = fNodeStack[fNodeStackPtr];

+ RBBINode *prevRules = *destRules;

+ RBBINode *orNode = pushNewNode(RBBINode::opOr);

+ orNode->fLeftChild = prevRules;

+ prevRules->fParent = orNode;

+ orNode->fRightChild = thisRule;

+ thisRule->fParent = orNode;

+ *destRules = orNode;

+ }

+ else

+ {

+ // This is the first rule encountered (for this direction).

+ // Just move its parse tree from the stack to *destRules.

+ *destRules = fNodeStack[fNodeStackPtr];

+ }

+ fReverseRule = FALSE; // in preparation for the next rule.

+ fLookAheadRule = FALSE;

+ fNodeStackPtr = 0;

+ }

+ break;

+ case doRuleError:

+ error(U_BRK_RULE_SYNTAX);

+ returnVal = FALSE;

+ break;

+ case doVariableNameExpectedErr:

+ error(U_BRK_RULE_SYNTAX);

+ break;

+ //

+ // Unary operands + ? *

+ // These all appear after the operand to which they apply.

+ // When we hit one, the operand (may be a whole sub expression)

+ // will be on the top of the stack.

+ // Unary Operator becomes TOS, with the old TOS as its one child.

+ case doUnaryOpPlus:

+ {

+ RBBINode *operandNode = fNodeStack[fNodeStackPtr--];

+ RBBINode *plusNode = pushNewNode(RBBINode::opPlus);

+ plusNode->fLeftChild = operandNode;

+ operandNode->fParent = plusNode;

+ }

+ break;

+ case doUnaryOpQuestion:

+ {

+ RBBINode *operandNode = fNodeStack[fNodeStackPtr--];

+ RBBINode *qNode = pushNewNode(RBBINode::opQuestion);

+ qNode->fLeftChild = operandNode;

+ operandNode->fParent = qNode;

+ }

+ break;

+ case doUnaryOpStar:

+ {

+ RBBINode *operandNode = fNodeStack[fNodeStackPtr--];

+ RBBINode *starNode = pushNewNode(RBBINode::opStar);

+ starNode->fLeftChild = operandNode;

+ operandNode->fParent = starNode;

+ }

+ break;

+ case doRuleChar:

+ // A "Rule Character" is any single character that is a literal part

+ // of the regular expression. Like a, b and c in the expression "(abc*) | [:L:]"

+ // These are pretty uncommon in break rules; the terms are more commonly

+ // sets. To keep things uniform, treat these characters like as

+ // sets that just happen to contain only one character.

+ {

+ n = pushNewNode(RBBINode::setRef);

+ findSetFor(UnicodeString(fC.fChar), n);

+ n->fFirstPos = fScanIndex;

+ n->fLastPos = fNextIndex;

+ fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);

+ break;

+ }

+ case doDotAny:

+ // scanned a ".", meaning match any single character.

+ {

+ n = pushNewNode(RBBINode::setRef);

+ findSetFor(UnicodeString(TRUE, kAny, 3), n);

+ n->fFirstPos = fScanIndex;

+ n->fLastPos = fNextIndex;

+ fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);

+ break;

+ }

+ case doSlash:

+ // Scanned a '/', which identifies a look-ahead break position in a rule.

+ n = pushNewNode(RBBINode::lookAhead);

+ n->fVal = fRuleNum;

+ n->fFirstPos = fScanIndex;

+ n->fLastPos = fNextIndex;

+ fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);

+ fLookAheadRule = TRUE;

+ break;

+ case doStartTagValue:

+ // Scanned a '{', the opening delimiter for a tag value within a rule.

+ n = pushNewNode(RBBINode::tag);

+ n->fVal = 0;

+ n->fFirstPos = fScanIndex;

+ n->fLastPos = fNextIndex;

+ break;

+ case doTagDigit:

+ // Just scanned a decimal digit that's part of a tag value

+ {

+ n = fNodeStack[fNodeStackPtr];

+ uint32_t v = u_charDigitValue(fC.fChar);

+ U_ASSERT(v < 10);

+ n->fVal = n->fVal*10 + v;

+ break;

+ }

+ case doTagValue:

+ n = fNodeStack[fNodeStackPtr];

+ n->fLastPos = fNextIndex;

+ fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);

+ break;

+ case doTagExpectedError:

+ error(U_BRK_MALFORMED_RULE_TAG);

+ returnVal = FALSE;

+ break;

+ case doOptionStart:

+ // Scanning a !!option. At the start of string.

+ fOptionStart = fScanIndex;

+ break;

+ case doOptionEnd:

+ {

+ UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);

+ if (opt == UNICODE_STRING("chain", 5)) {

+ fRB->fChainRules = TRUE;

+ } else if (opt == UNICODE_STRING("LBCMNoChain", 11)) {

+ fRB->fLBCMNoChain = TRUE;

+ } else if (opt == UNICODE_STRING("forward", 7)) {

+ fRB->fDefaultTree = &fRB->fForwardTree;

+ } else if (opt == UNICODE_STRING("reverse", 7)) {

+ fRB->fDefaultTree = &fRB->fReverseTree;

+ } else if (opt == UNICODE_STRING("safe_forward", 12)) {

+ fRB->fDefaultTree = &fRB->fSafeFwdTree;

+ } else if (opt == UNICODE_STRING("safe_reverse", 12)) {

+ fRB->fDefaultTree = &fRB->fSafeRevTree;

+ } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) {

+ fRB->fLookAheadHardBreak = TRUE;

+ } else {

+ error(U_BRK_UNRECOGNIZED_OPTION);

+ }

+ break;

+ case doReverseDir:

+ fReverseRule = TRUE;

+ break;

+ case doStartVariableName:

+ n = pushNewNode(RBBINode::varRef);

+ if (U_FAILURE(*fRB->fStatus)) {

+ break;

+ }

+ n->fFirstPos = fScanIndex;

+ break;

+ case doEndVariableName:

+ n = fNodeStack[fNodeStackPtr];

+ if (n==NULL || n->fType != RBBINode::varRef) {

+ error(U_BRK_INTERNAL_ERROR);

+ break;

+ }

+ n->fLastPos = fScanIndex;

+ fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText);

+ // Look the newly scanned name up in the symbol table

+ // If there's an entry, set the l. child of the var ref to the replacement expression.

+ // (We also pass through here when scanning assignments, but no harm is done, other

+ // than a slight wasted effort that seems hard to avoid. Lookup will be null)

+ n->fLeftChild = fSymbolTable->lookupNode(n->fText);

+ break;

+ case doCheckVarDef:

+ n = fNodeStack[fNodeStackPtr];

+ if (n->fLeftChild == NULL) {

+ error(U_BRK_UNDEFINED_VARIABLE);

+ returnVal = FALSE;

+ }

+ break;

+ case doExprFinished:

+ break;

+ case doRuleErrorAssignExpr:

+ error(U_BRK_ASSIGN_ERROR);

+ returnVal = FALSE;

+ break;

+ case doExit:

+ returnVal = FALSE;

+ break;

+ case doScanUnicodeSet:

+ scanSet();

+ break;

+ default:

+ error(U_BRK_INTERNAL_ERROR);

+ returnVal = FALSE;

+ break;

+ }

+ return returnVal;

+//------------------------------------------------------------------------------

+//

+// Error Report a rule parse error.

+// Only report it if no previous error has been recorded.

+//

+//------------------------------------------------------------------------------

+void RBBIRuleScanner::error(UErrorCode e) {

+ if (U_SUCCESS(*fRB->fStatus)) {

+ *fRB->fStatus = e;

+ if (fRB->fParseError) {

+ fRB->fParseError->line = fLineNum;

+ fRB->fParseError->offset = fCharNum;

+ fRB->fParseError->preContext[0] = 0;

+ }

+//------------------------------------------------------------------------------

+//

+// fixOpStack The parse stack holds partially assembled chunks of the parse tree.

+// An entry on the stack may be as small as a single setRef node,

+// or as large as the parse tree

+// for an entire expression (this will be the one item left on the stack

+// when the parsing of an RBBI rule completes.

+//

+// This function is called when a binary operator is encountered.

+// It looks back up the stack for operators that are not yet associated

+// with a right operand, and if the precedence of the stacked operator >=

+// the precedence of the current operator, binds the operand left,

+// to the previously encountered operator.

+//

+//------------------------------------------------------------------------------

+void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) {

+ RBBINode *n;

+ // printNodeStack("entering fixOpStack()");

+ for (;;) {

+ n = fNodeStack[fNodeStackPtr-1]; // an operator node

+ if (n->fPrecedence == 0) {

+ RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node");

+ error(U_BRK_INTERNAL_ERROR);

+ return;

+ }

+ if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) {

+ // The most recent operand goes with the current operator,

+ // not with the previously stacked one.

+ break;

+ }

+ // Stack operator is a binary op ( '|' or concatenation)

+ // TOS operand becomes right child of this operator.

+ // Resulting subexpression becomes the TOS operand.

+ n->fRightChild = fNodeStack[fNodeStackPtr];

+ fNodeStack[fNodeStackPtr]->fParent = n;

+ fNodeStackPtr--;

+ // printNodeStack("looping in fixOpStack() ");

+ }

+ if (p <= RBBINode::precLParen) {

+ // Scan is at a right paren or end of expression.

+ // The scanned item must match the stack, or else there was an error.

+ // Discard the left paren (or start expr) node from the stack,

+ // leaving the completed (sub)expression as TOS.

+ if (n->fPrecedence != p) {

+ // Right paren encountered matched start of expression node, or

+ // end of expression matched with a left paren node.

+ error(U_BRK_MISMATCHED_PAREN);

+ }

+ fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr];

+ fNodeStackPtr--;

+ // Delete the now-discarded LParen or Start node.

+ delete n;

+ }

+ // printNodeStack("leaving fixOpStack()");

+//------------------------------------------------------------------------------

+//

+// findSetFor given a UnicodeString,

+// - find the corresponding Unicode Set (uset node)

+// (create one if necessary)

+// - Set fLeftChild of the caller's node (should be a setRef node)

+// to the uset node

+// Maintain a hash table of uset nodes, so the same one is always used

+// for the same string.

+// If a "to adopt" set is provided and we haven't seen this key before,

+// add the provided set to the hash table.

+// If the string is one (32 bit) char in length, the set contains

+// just one element which is the char in question.

+// If the string is "any", return a set containing all chars.

+//

+//------------------------------------------------------------------------------

+void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) {

+ RBBISetTableEl *el;

+ // First check whether we've already cached a set for this string.

+ // If so, just use the cached set in the new node.

+ // delete any set provided by the caller, since we own it.

+ el = (RBBISetTableEl *)uhash_get(fSetTable, &s);

+ if (el != NULL) {

+ delete setToAdopt;

+ node->fLeftChild = el->val;

+ U_ASSERT(node->fLeftChild->fType == RBBINode::uset);

+ return;

+ }

+ // Haven't seen this set before.

+ // If the caller didn't provide us with a prebuilt set,

+ // create a new UnicodeSet now.

+ if (setToAdopt == NULL) {

+ if (s.compare(kAny, -1) == 0) {

+ setToAdopt = new UnicodeSet(0x000000, 0x10ffff);

+ } else {

+ UChar32 c;

+ c = s.char32At(0);

+ setToAdopt = new UnicodeSet(c, c);

+ }

+ //

+ // Make a new uset node to refer to this UnicodeSet

+ // This new uset node becomes the child of the caller's setReference node.

+ //

+ RBBINode *usetNode = new RBBINode(RBBINode::uset);

+ if (usetNode == NULL) {

+ error(U_MEMORY_ALLOCATION_ERROR);

+ return;

+ }

+ usetNode->fInputSet = setToAdopt;

+ usetNode->fParent = node;

+ node->fLeftChild = usetNode;

+ usetNode->fText = s;

+ //

+ // Add the new uset node to the list of all uset nodes.

+ //

+ fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);

+ //

+ // Add the new set to the set hash table.

+ //

+ el = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl));

+ UnicodeString *tkey = new UnicodeString(s);

+ if (tkey == NULL || el == NULL || setToAdopt == NULL) {

+ // Delete to avoid memory leak

+ delete tkey;

+ tkey = NULL;

+ uprv_free(el);

+ el = NULL;

+ delete setToAdopt;

+ setToAdopt = NULL;

+ error(U_MEMORY_ALLOCATION_ERROR);

+ return;

+ }

+ el->key = tkey;

+ el->val = usetNode;

+ uhash_put(fSetTable, el->key, el, fRB->fStatus);

+ return;

+//

+// Assorted Unicode character constants.

+// Numeric because there is no portable way to enter them as literals.

+// (Think EBCDIC).

+//

+static const UChar chCR = 0x0d; // New lines, for terminating comments.

+static const UChar chLF = 0x0a;

+static const UChar chNEL = 0x85; // NEL newline variant

+static const UChar chLS = 0x2028; // Unicode Line Separator

+static const UChar chApos = 0x27; // single quote, for quoted chars.

+static const UChar chPound = 0x23; // '#', introduces a comment.

+static const UChar chBackSlash = 0x5c; // '\' introduces a char escape

+static const UChar chLParen = 0x28;

+static const UChar chRParen = 0x29;

+//------------------------------------------------------------------------------

+//

+// stripRules Return a rules string without unnecessary

+// characters.

+//

+//------------------------------------------------------------------------------

+UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {

+ UnicodeString strippedRules;

+ int rulesLength = rules.length();

+ for (int idx = 0; idx < rulesLength; ) {

+ UChar ch = rules[idx++];

+ if (ch == chPound) {

+ while (idx < rulesLength

+ && ch != chCR && ch != chLF && ch != chNEL)

+ {

+ ch = rules[idx++];

+ }

+ if (!u_isISOControl(ch)) {

+ strippedRules.append(ch);

+ }

+ // strippedRules = strippedRules.unescape();

+ return strippedRules;

+//------------------------------------------------------------------------------

+//

+// nextCharLL Low Level Next Char from rule input source.

+// Get a char from the input character iterator,

+// keep track of input position for error reporting.

+//

+//------------------------------------------------------------------------------

+UChar32 RBBIRuleScanner::nextCharLL() {

+ UChar32 ch;

+ if (fNextIndex >= fRB->fRules.length()) {

+ return (UChar32)-1;

+ }

+ ch = fRB->fRules.char32At(fNextIndex);

+ fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);

+ if (ch == chCR ||

+ ch == chNEL ||

+ ch == chLS ||

+ (ch == chLF && fLastChar != chCR)) {

+ // Character is starting a new line. Bump up the line number, and

+ // reset the column to 0.

+ fLineNum++;

+ fCharNum=0;

+ if (fQuoteMode) {

+ error(U_BRK_NEW_LINE_IN_QUOTED_STRING);

+ fQuoteMode = FALSE;

+ }

+ else {

+ // Character is not starting a new line. Except in the case of a

+ // LF following a CR, increment the column position.

+ if (ch != chLF) {

+ fCharNum++;

+ }

+ fLastChar = ch;

+ return ch;

+//------------------------------------------------------------------------------

+//

+// nextChar for rules scanning. At this level, we handle stripping

+// out comments and processing backslash character escapes.

+// The rest of the rules grammar is handled at the next level up.

+//

+//------------------------------------------------------------------------------

+void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {

+ // Unicode Character constants needed for the processing done by nextChar(),

+ // in hex because literals wont work on EBCDIC machines.

+ fScanIndex = fNextIndex;

+ c.fChar = nextCharLL();

+ c.fEscaped = FALSE;

+ //

+ // check for '' sequence.

+ // These are recognized in all contexts, whether in quoted text or not.

+ //

+ if (c.fChar == chApos) {

+ if (fRB->fRules.char32At(fNextIndex) == chApos) {

+ c.fChar = nextCharLL(); // get nextChar officially so character counts

+ c.fEscaped = TRUE; // stay correct.

+ }

+ else

+ {

+ // Single quote, by itself.

+ // Toggle quoting mode.

+ // Return either '(' or ')', because quotes cause a grouping of the quoted text.

+ fQuoteMode = !fQuoteMode;

+ if (fQuoteMode == TRUE) {

+ c.fChar = chLParen;

+ } else {

+ c.fChar = chRParen;

+ }

+ c.fEscaped = FALSE; // The paren that we return is not escaped.

+ return;

+ }

+ if (fQuoteMode) {

+ c.fEscaped = TRUE;

+ }

+ else

+ {

+ // We are not in a 'quoted region' of the source.

+ //

+ if (c.fChar == chPound) {

+ // Start of a comment. Consume the rest of it.

+ // The new-line char that terminates the comment is always returned.

+ // It will be treated as white-space, and serves to break up anything

+ // that might otherwise incorrectly clump together with a comment in

+ // the middle (a variable name, for example.)

+ for (;;) {

+ c.fChar = nextCharLL();

+ if (c.fChar == (UChar32)-1 || // EOF

+ c.fChar == chCR ||

+ c.fChar == chLF ||

+ c.fChar == chNEL ||

+ c.fChar == chLS) {break;}

+ }

+ if (c.fChar == (UChar32)-1) {

+ return;

+ }

+ //

+ // check for backslash escaped characters.

+ // Use UnicodeString::unescapeAt() to handle them.

+ //

+ if (c.fChar == chBackSlash) {

+ c.fEscaped = TRUE;

+ int32_t startX = fNextIndex;

+ c.fChar = fRB->fRules.unescapeAt(fNextIndex);

+ if (fNextIndex == startX) {

+ error(U_BRK_HEX_DIGITS_EXPECTED);

+ }

+ fCharNum += fNextIndex-startX;

+ }

+ // putc(c.fChar, stdout);

+//------------------------------------------------------------------------------

+//

+// Parse RBBI rules. The state machine for rules parsing is here.

+// The state tables are hand-written in the file rbbirpt.txt,

+// and converted to the form used here by a perl

+// script rbbicst.pl

+//

+//------------------------------------------------------------------------------

+void RBBIRuleScanner::parse() {

+ uint16_t state;

+ const RBBIRuleTableEl *tableEl;

+ if (U_FAILURE(*fRB->fStatus)) {

+ return;

+ }

+ state = 1;

+ nextChar(fC);

+ //

+ // Main loop for the rule parsing state machine.

+ // Runs once per state transition.

+ // Each time through optionally performs, depending on the state table,

+ // - an advance to the the next input char

+ // - an action to be performed.

+ // - pushing or popping a state to/from the local state return stack.

+ //

+ for (;;) {

+ // Bail out if anything has gone wrong.

+ // RBBI rule file parsing stops on the first error encountered.

+ if (U_FAILURE(*fRB->fStatus)) {

+ break;

+ }

+ // Quit if state == 0. This is the normal way to exit the state machine.

+ //

+ if (state == 0) {

+ break;

+ }

+ // Find the state table element that matches the input char from the rule, or the

+ // class of the input character. Start with the first table row for this

+ // state, then linearly scan forward until we find a row that matches the

+ // character. The last row for each state always matches all characters, so

+ // the search will stop there, if not before.

+ //

+ tableEl = &gRuleParseStateTable[state];

+ #ifdef RBBI_DEBUG

+ if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {

+ RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ",

+ fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);

+ }

+ #endif

+ for (;;) {

+ #ifdef RBBI_DEBUG

+ if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf(".");}

+ #endif

+ if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE && tableEl->fCharClass == fC.fChar) {

+ // Table row specified an individual character, not a set, and

+ // the input character is not escaped, and

+ // the input character matched it.

+ break;

+ }

+ if (tableEl->fCharClass == 255) {

+ // Table row specified default, match anything character class.

+ break;

+ }

+ if (tableEl->fCharClass == 254 && fC.fEscaped) {

+ // Table row specified "escaped" and the char was escaped.

+ break;

+ }

+ if (tableEl->fCharClass == 253 && fC.fEscaped &&

+ (fC.fChar == 0x50 || fC.fChar == 0x70 )) {

+ // Table row specified "escaped P" and the char is either 'p' or 'P'.

+ break;

+ }

+ if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1) {

+ // Table row specified eof and we hit eof on the input.

+ break;

+ }

+ if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&

+ fC.fEscaped == FALSE && // char is not escaped &&

+ fC.fChar != (UChar32)-1) { // char is not EOF

+ U_ASSERT((tableEl->fCharClass-128) < LENGTHOF(fRuleSets));

+ if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {

+ // Table row specified a character class, or set of characters,

+ // and the current char matches it.

+ break;

+ }

+ // No match on this row, advance to the next row for this state,

+ tableEl++;

+ }

+ if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");}

+ //

+ // We've found the row of the state table that matches the current input

+ // character from the rules string.

+ // Perform any action specified by this row in the state table.

+ if (doParseActions((int32_t)tableEl->fAction) == FALSE) {

+ // Break out of the state machine loop if the

+ // the action signalled some kind of error, or

+ // the action was to exit, occurs on normal end-of-rules-input.

+ break;

+ }

+ if (tableEl->fPushState != 0) {

+ fStackPtr++;

+ if (fStackPtr >= kStackSize) {

+ error(U_BRK_INTERNAL_ERROR);

+ RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");

+ fStackPtr--;

+ }

+ fStack[fStackPtr] = tableEl->fPushState;

+ }

+ if (tableEl->fNextChar) {

+ nextChar(fC);

+ }

+ // Get the next state from the table entry, or from the

+ // state stack if the next state was specified as "pop".

+ if (tableEl->fNextState != 255) {

+ state = tableEl->fNextState;

+ } else {

+ state = fStack[fStackPtr];

+ fStackPtr--;

+ if (fStackPtr < 0) {

+ error(U_BRK_INTERNAL_ERROR);

+ RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");

+ fStackPtr++;

+ }

+ //

+ // If there were NO user specified reverse rules, set up the equivalent of ".*;"

+ //

+ if (fRB->fReverseTree == NULL) {

+ fRB->fReverseTree = pushNewNode(RBBINode::opStar);

+ RBBINode *operand = pushNewNode(RBBINode::setRef);

+ findSetFor(UnicodeString(TRUE, kAny, 3), operand);

+ fRB->fReverseTree->fLeftChild = operand;

+ operand->fParent = fRB->fReverseTree;

+ fNodeStackPtr -= 2;

+ }

+ //

+ // Parsing of the input RBBI rules is complete.

+ // We now have a parse tree for the rule expressions

+ // and a list of all UnicodeSets that are referenced.

+ //

+#ifdef RBBI_DEBUG

+ if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();}

+ if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree"))

+ {

+ RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");

+ fRB->fForwardTree->printTree(TRUE);

+ RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");

+ fRB->fReverseTree->printTree(TRUE);

+ RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");

+ fRB->fSafeFwdTree->printTree(TRUE);

+ RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");

+ fRB->fSafeRevTree->printTree(TRUE);

+ }

+#endif

+//------------------------------------------------------------------------------

+//

+// printNodeStack for debugging...

+//

+//------------------------------------------------------------------------------

+#ifdef RBBI_DEBUG

+void RBBIRuleScanner::printNodeStack(const char *title) {

+ int i;

+ RBBIDebugPrintf("%s. Dumping node stack...\n", title);

+ for (i=fNodeStackPtr; i>0; i--) {fNodeStack[i]->printTree(TRUE);}

+#endif

+//------------------------------------------------------------------------------

+//

+// pushNewNode create a new RBBINode of the specified type and push it

+// onto the stack of nodes.

+//

+//------------------------------------------------------------------------------

+RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) {

+ fNodeStackPtr++;

+ if (fNodeStackPtr >= kStackSize) {

+ error(U_BRK_INTERNAL_ERROR);

+ RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow.");

+ *fRB->fStatus = U_BRK_INTERNAL_ERROR;

+ return NULL;

+ }

+ fNodeStack[fNodeStackPtr] = new RBBINode(t);

+ if (fNodeStack[fNodeStackPtr] == NULL) {

+ *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR;

+ }

+ return fNodeStack[fNodeStackPtr];

+//------------------------------------------------------------------------------

+//

+// scanSet Construct a UnicodeSet from the text at the current scan

+// position. Advance the scan position to the first character

+// after the set.

+//

+// A new RBBI setref node referring to the set is pushed onto the node

+// stack.

+//

+// The scan position is normally under the control of the state machine

+// that controls rule parsing. UnicodeSets, however, are parsed by

+// the UnicodeSet constructor, not by the RBBI rule parser.

+//

+//------------------------------------------------------------------------------

+void RBBIRuleScanner::scanSet() {

+ UnicodeSet *uset;

+ ParsePosition pos;

+ int startPos;

+ int i;

+ if (U_FAILURE(*fRB->fStatus)) {

+ return;

+ }

+ pos.setIndex(fScanIndex);

+ startPos = fScanIndex;

+ UErrorCode localStatus = U_ZERO_ERROR;

+ uset = new UnicodeSet();

+ if (uset == NULL) {

+ localStatus = U_MEMORY_ALLOCATION_ERROR;

+ } else {

+ uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);

+ }

+ if (U_FAILURE(localStatus)) {

+ // TODO: Get more accurate position of the error from UnicodeSet's return info.

+ // UnicodeSet appears to not be reporting correctly at this time.

+ #ifdef RBBI_DEBUG

+ RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());

+ #endif

+ error(localStatus);

+ delete uset;

+ return;

+ }

+ // Verify that the set contains at least one code point.

+ //

+ U_ASSERT(uset!=NULL);

+ if (uset->isEmpty()) {

+ // This set is empty.

+ // Make it an error, because it almost certainly is not what the user wanted.

+ // Also, avoids having to think about corner cases in the tree manipulation code

+ // that occurs later on.

+ error(U_BRK_RULE_EMPTY_SET);

+ delete uset;

+ return;

+ }

+ // Advance the RBBI parse postion over the UnicodeSet pattern.

+ // Don't just set fScanIndex because the line/char positions maintained

+ // for error reporting would be thrown off.

+ i = pos.getIndex();

+ for (;;) {

+ if (fNextIndex >= i) {

+ break;

+ }

+ nextCharLL();

+ }

+ if (U_SUCCESS(*fRB->fStatus)) {

+ RBBINode *n;

+ n = pushNewNode(RBBINode::setRef);

+ n->fFirstPos = startPos;

+ n->fLastPos = fNextIndex;

+ fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);

+ // findSetFor() serves several purposes here:

+ // - Adopts storage for the UnicodeSet, will be responsible for deleting.

+ // - Mantains collection of all sets in use, needed later for establishing

+ // character categories for run time engine.

+ // - Eliminates mulitiple instances of the same set.

+ // - Creates a new uset node if necessary (if this isn't a duplicate.)

+ findSetFor(n->fText, n, uset);

+ }

+U_NAMESPACE_END

+#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

Property changes on: icu51/source/common/rbbiscan.cpp

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « icu51/source/common/rbbiscan.h ('k') | icu51/source/common/rbbisetb.h » ('j') | no next file with comments »