| 1 | // © 2016 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 | // | 
|---|
| 4 | //  file:  rbbiscan.cpp | 
|---|
| 5 | // | 
|---|
| 6 | //  Copyright (C) 2002-2016, International Business Machines Corporation and others. | 
|---|
| 7 | //  All Rights Reserved. | 
|---|
| 8 | // | 
|---|
| 9 | //  This file contains the Rule Based Break Iterator Rule Builder functions for | 
|---|
| 10 | //   scanning the rules and assembling a parse tree.  This is the first phase | 
|---|
| 11 | //   of compiling the rules. | 
|---|
| 12 | // | 
|---|
| 13 | //  The overall compilation of the rules is managed by class RBBIRuleBuilder, which will | 
|---|
| 14 | //  create and use an instance of this class as part of the process. | 
|---|
| 15 | // | 
|---|
| 16 |  | 
|---|
| 17 | #include "unicode/utypes.h" | 
|---|
| 18 |  | 
|---|
| 19 | #if !UCONFIG_NO_BREAK_ITERATION | 
|---|
| 20 |  | 
|---|
| 21 | #include "unicode/unistr.h" | 
|---|
| 22 | #include "unicode/uniset.h" | 
|---|
| 23 | #include "unicode/uchar.h" | 
|---|
| 24 | #include "unicode/uchriter.h" | 
|---|
| 25 | #include "unicode/parsepos.h" | 
|---|
| 26 | #include "unicode/parseerr.h" | 
|---|
| 27 | #include "cmemory.h" | 
|---|
| 28 | #include "cstring.h" | 
|---|
| 29 |  | 
|---|
| 30 | #include "rbbirpt.h"   // Contains state table for the rbbi rules parser. | 
|---|
| 31 | //   generated by a Perl script. | 
|---|
| 32 | #include "rbbirb.h" | 
|---|
| 33 | #include "rbbinode.h" | 
|---|
| 34 | #include "rbbiscan.h" | 
|---|
| 35 | #include "rbbitblb.h" | 
|---|
| 36 |  | 
|---|
| 37 | #include "uassert.h" | 
|---|
| 38 |  | 
|---|
| 39 | //------------------------------------------------------------------------------ | 
|---|
| 40 | // | 
|---|
| 41 | // Unicode Set init strings for each of the character classes needed for parsing a rule file. | 
|---|
| 42 | //               (Initialized with hex values for portability to EBCDIC based machines. | 
|---|
| 43 | //                Really ugly, but there's no good way to avoid it.) | 
|---|
| 44 | // | 
|---|
| 45 | //              The sets are referred to by name in the rbbirpt.txt, which is the | 
|---|
| 46 | //              source form of the state transition table for the RBBI rule parser. | 
|---|
| 47 | // | 
|---|
| 48 | //------------------------------------------------------------------------------ | 
|---|
| 49 | static const UChar gRuleSet_rule_char_pattern[]       = { | 
|---|
| 50 | // Characters that may appear as literals in patterns without escaping or quoting. | 
|---|
| 51 | //   [    ^      [    \     p     {      Z     }     \     u    0      0    2      0 | 
|---|
| 52 | 0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30, | 
|---|
| 53 | //   -    \      u    0     0     7      f     ]     -     [    \      p | 
|---|
| 54 | 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, | 
|---|
| 55 | //   {     L     }    ]     -     [      \     p     {     N    }      ]     ] | 
|---|
| 56 | 0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0}; | 
|---|
| 57 |  | 
|---|
| 58 | static const UChar gRuleSet_name_char_pattern[]       = { | 
|---|
| 59 | //    [    _      \    p     {     L      }     \     p     {    N      }     ] | 
|---|
| 60 | 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0}; | 
|---|
| 61 |  | 
|---|
| 62 | static const UChar gRuleSet_digit_char_pattern[] = { | 
|---|
| 63 | //    [    0      -    9     ] | 
|---|
| 64 | 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0}; | 
|---|
| 65 |  | 
|---|
| 66 | static const UChar gRuleSet_name_start_char_pattern[] = { | 
|---|
| 67 | //    [    _      \    p     {     L      }     ] | 
|---|
| 68 | 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 }; | 
|---|
| 69 |  | 
|---|
| 70 | static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00};  // "any" | 
|---|
| 71 |  | 
|---|
| 72 |  | 
|---|
| 73 | U_CDECL_BEGIN | 
|---|
| 74 | static void U_CALLCONV RBBISetTable_deleter(void *p) { | 
|---|
| 75 | icu::RBBISetTableEl *px = (icu::RBBISetTableEl *)p; | 
|---|
| 76 | delete px->key; | 
|---|
| 77 | // Note:  px->val is owned by the linked list "fSetsListHead" in scanner. | 
|---|
| 78 | //        Don't delete the value nodes here. | 
|---|
| 79 | uprv_free(px); | 
|---|
| 80 | } | 
|---|
| 81 | U_CDECL_END | 
|---|
| 82 |  | 
|---|
| 83 | U_NAMESPACE_BEGIN | 
|---|
| 84 |  | 
|---|
| 85 | //------------------------------------------------------------------------------ | 
|---|
| 86 | // | 
|---|
| 87 | //  Constructor. | 
|---|
| 88 | // | 
|---|
| 89 | //------------------------------------------------------------------------------ | 
|---|
| 90 | RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb) | 
|---|
| 91 | { | 
|---|
| 92 | fRB                 = rb; | 
|---|
| 93 | fScanIndex          = 0; | 
|---|
| 94 | fNextIndex          = 0; | 
|---|
| 95 | fQuoteMode          = FALSE; | 
|---|
| 96 | fLineNum            = 1; | 
|---|
| 97 | fCharNum            = 0; | 
|---|
| 98 | fLastChar           = 0; | 
|---|
| 99 |  | 
|---|
| 100 | fStateTable         = NULL; | 
|---|
| 101 | fStack[0]           = 0; | 
|---|
| 102 | fStackPtr           = 0; | 
|---|
| 103 | fNodeStack[0]       = NULL; | 
|---|
| 104 | fNodeStackPtr       = 0; | 
|---|
| 105 |  | 
|---|
| 106 | fReverseRule        = FALSE; | 
|---|
| 107 | fLookAheadRule      = FALSE; | 
|---|
| 108 | fNoChainInRule      = FALSE; | 
|---|
| 109 |  | 
|---|
| 110 | fSymbolTable        = NULL; | 
|---|
| 111 | fSetTable           = NULL; | 
|---|
| 112 | fRuleNum            = 0; | 
|---|
| 113 | fOptionStart        = 0; | 
|---|
| 114 |  | 
|---|
| 115 | // Do not check status until after all critical fields are sufficiently initialized | 
|---|
| 116 | //   that the destructor can run cleanly. | 
|---|
| 117 | if (U_FAILURE(*rb->fStatus)) { | 
|---|
| 118 | return; | 
|---|
| 119 | } | 
|---|
| 120 |  | 
|---|
| 121 | // | 
|---|
| 122 | //  Set up the constant Unicode Sets. | 
|---|
| 123 | //     Note:  These could be made static, lazily initialized, and shared among | 
|---|
| 124 | //            all instances of RBBIRuleScanners.  BUT this is quite a bit simpler, | 
|---|
| 125 | //            and the time to build these few sets should be small compared to a | 
|---|
| 126 | //            full break iterator build. | 
|---|
| 127 | fRuleSets[kRuleSet_rule_char-128] | 
|---|
| 128 | = UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern),       *rb->fStatus); | 
|---|
| 129 | // fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:] | 
|---|
| 130 | fRuleSets[kRuleSet_white_space-128]. | 
|---|
| 131 | add(9, 0xd).add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029); | 
|---|
| 132 | fRuleSets[kRuleSet_name_char-128] | 
|---|
| 133 | = UnicodeSet(UnicodeString(gRuleSet_name_char_pattern),       *rb->fStatus); | 
|---|
| 134 | fRuleSets[kRuleSet_name_start_char-128] | 
|---|
| 135 | = UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), *rb->fStatus); | 
|---|
| 136 | fRuleSets[kRuleSet_digit_char-128] | 
|---|
| 137 | = UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern),      *rb->fStatus); | 
|---|
| 138 | if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) { | 
|---|
| 139 | // This case happens if ICU's data is missing.  UnicodeSet tries to look up property | 
|---|
| 140 | //   names from the init string, can't find them, and claims an illegal argument. | 
|---|
| 141 | //   Change the error so that the actual problem will be clearer to users. | 
|---|
| 142 | *rb->fStatus = U_BRK_INIT_ERROR; | 
|---|
| 143 | } | 
|---|
| 144 | if (U_FAILURE(*rb->fStatus)) { | 
|---|
| 145 | return; | 
|---|
| 146 | } | 
|---|
| 147 |  | 
|---|
| 148 | fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus); | 
|---|
| 149 | if (fSymbolTable == NULL) { | 
|---|
| 150 | *rb->fStatus = U_MEMORY_ALLOCATION_ERROR; | 
|---|
| 151 | return; | 
|---|
| 152 | } | 
|---|
| 153 | fSetTable    = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, rb->fStatus); | 
|---|
| 154 | if (U_FAILURE(*rb->fStatus)) { | 
|---|
| 155 | return; | 
|---|
| 156 | } | 
|---|
| 157 | uhash_setValueDeleter(fSetTable, RBBISetTable_deleter); | 
|---|
| 158 | } | 
|---|
| 159 |  | 
|---|
| 160 |  | 
|---|
| 161 |  | 
|---|
| 162 | //------------------------------------------------------------------------------ | 
|---|
| 163 | // | 
|---|
| 164 | //  Destructor | 
|---|
| 165 | // | 
|---|
| 166 | //------------------------------------------------------------------------------ | 
|---|
| 167 | RBBIRuleScanner::~RBBIRuleScanner() { | 
|---|
| 168 | delete fSymbolTable; | 
|---|
| 169 | if (fSetTable != NULL) { | 
|---|
| 170 | uhash_close(fSetTable); | 
|---|
| 171 | fSetTable = NULL; | 
|---|
| 172 |  | 
|---|
| 173 | } | 
|---|
| 174 |  | 
|---|
| 175 |  | 
|---|
| 176 | // Node Stack. | 
|---|
| 177 | //   Normally has one entry, which is the entire parse tree for the rules. | 
|---|
| 178 | //   If errors occured, there may be additional subtrees left on the stack. | 
|---|
| 179 | while (fNodeStackPtr > 0) { | 
|---|
| 180 | delete fNodeStack[fNodeStackPtr]; | 
|---|
| 181 | fNodeStackPtr--; | 
|---|
| 182 | } | 
|---|
| 183 |  | 
|---|
| 184 | } | 
|---|
| 185 |  | 
|---|
| 186 | //------------------------------------------------------------------------------ | 
|---|
| 187 | // | 
|---|
| 188 | //  doParseAction        Do some action during rule parsing. | 
|---|
| 189 | //                       Called by the parse state machine. | 
|---|
| 190 | //                       Actions build the parse tree and Unicode Sets, | 
|---|
| 191 | //                       and maintain the parse stack for nested expressions. | 
|---|
| 192 | // | 
|---|
| 193 | //                       TODO:  unify EParseAction and RBBI_RuleParseAction enum types. | 
|---|
| 194 | //                              They represent exactly the same thing.  They're separate | 
|---|
| 195 | //                              only to work around enum forward declaration restrictions | 
|---|
| 196 | //                              in some compilers, while at the same time avoiding multiple | 
|---|
| 197 | //                              definitions problems.  I'm sure that there's a better way. | 
|---|
| 198 | // | 
|---|
| 199 | //------------------------------------------------------------------------------ | 
|---|
| 200 | UBool RBBIRuleScanner::doParseActions(int32_t action) | 
|---|
| 201 | { | 
|---|
| 202 | RBBINode *n       = NULL; | 
|---|
| 203 |  | 
|---|
| 204 | UBool   returnVal = TRUE; | 
|---|
| 205 |  | 
|---|
| 206 | switch (action) { | 
|---|
| 207 |  | 
|---|
| 208 | case doExprStart: | 
|---|
| 209 | pushNewNode(RBBINode::opStart); | 
|---|
| 210 | fRuleNum++; | 
|---|
| 211 | break; | 
|---|
| 212 |  | 
|---|
| 213 |  | 
|---|
| 214 | case doNoChain: | 
|---|
| 215 | // Scanned a '^' while on the rule start state. | 
|---|
| 216 | fNoChainInRule = TRUE; | 
|---|
| 217 | break; | 
|---|
| 218 |  | 
|---|
| 219 |  | 
|---|
| 220 | case doExprOrOperator: | 
|---|
| 221 | { | 
|---|
| 222 | fixOpStack(RBBINode::precOpCat); | 
|---|
| 223 | RBBINode  *operandNode = fNodeStack[fNodeStackPtr--]; | 
|---|
| 224 | RBBINode  *orNode      = pushNewNode(RBBINode::opOr); | 
|---|
| 225 | if (U_FAILURE(*fRB->fStatus)) { | 
|---|
| 226 | break; | 
|---|
| 227 | } | 
|---|
| 228 | orNode->fLeftChild     = operandNode; | 
|---|
| 229 | operandNode->fParent   = orNode; | 
|---|
| 230 | } | 
|---|
| 231 | break; | 
|---|
| 232 |  | 
|---|
| 233 | case doExprCatOperator: | 
|---|
| 234 | // concatenation operator. | 
|---|
| 235 | // For the implicit concatenation of adjacent terms in an expression that are | 
|---|
| 236 | //   not separated by any other operator.  Action is invoked between the | 
|---|
| 237 | //   actions for the two terms. | 
|---|
| 238 | { | 
|---|
| 239 | fixOpStack(RBBINode::precOpCat); | 
|---|
| 240 | RBBINode  *operandNode = fNodeStack[fNodeStackPtr--]; | 
|---|
| 241 | RBBINode  *catNode     = pushNewNode(RBBINode::opCat); | 
|---|
| 242 | if (U_FAILURE(*fRB->fStatus)) { | 
|---|
| 243 | break; | 
|---|
| 244 | } | 
|---|
| 245 | catNode->fLeftChild    = operandNode; | 
|---|
| 246 | operandNode->fParent   = catNode; | 
|---|
| 247 | } | 
|---|
| 248 | break; | 
|---|
| 249 |  | 
|---|
| 250 | case doLParen: | 
|---|
| 251 | // Open Paren. | 
|---|
| 252 | //   The openParen node is a dummy operation type with a low precedence, | 
|---|
| 253 | //     which has the affect of ensuring that any real binary op that | 
|---|
| 254 | //     follows within the parens binds more tightly to the operands than | 
|---|
| 255 | //     stuff outside of the parens. | 
|---|
| 256 | pushNewNode(RBBINode::opLParen); | 
|---|
| 257 | break; | 
|---|
| 258 |  | 
|---|
| 259 | case doExprRParen: | 
|---|
| 260 | fixOpStack(RBBINode::precLParen); | 
|---|
| 261 | break; | 
|---|
| 262 |  | 
|---|
| 263 | case doNOP: | 
|---|
| 264 | break; | 
|---|
| 265 |  | 
|---|
| 266 | case doStartAssign: | 
|---|
| 267 | // We've just scanned "$variable = " | 
|---|
| 268 | // The top of the node stack has the $variable ref node. | 
|---|
| 269 |  | 
|---|
| 270 | // Save the start position of the RHS text in the StartExpression node | 
|---|
| 271 | //   that precedes the $variableReference node on the stack. | 
|---|
| 272 | //   This will eventually be used when saving the full $variable replacement | 
|---|
| 273 | //   text as a string. | 
|---|
| 274 | n = fNodeStack[fNodeStackPtr-1]; | 
|---|
| 275 | n->fFirstPos = fNextIndex;              // move past the '=' | 
|---|
| 276 |  | 
|---|
| 277 | // Push a new start-of-expression node; needed to keep parse of the | 
|---|
| 278 | //   RHS expression happy. | 
|---|
| 279 | pushNewNode(RBBINode::opStart); | 
|---|
| 280 | break; | 
|---|
| 281 |  | 
|---|
| 282 |  | 
|---|
| 283 |  | 
|---|
| 284 |  | 
|---|
| 285 | case doEndAssign: | 
|---|
| 286 | { | 
|---|
| 287 | // We have reached the end of an assignement statement. | 
|---|
| 288 | //   Current scan char is the ';' that terminates the assignment. | 
|---|
| 289 |  | 
|---|
| 290 | // Terminate expression, leaves expression parse tree rooted in TOS node. | 
|---|
| 291 | fixOpStack(RBBINode::precStart); | 
|---|
| 292 |  | 
|---|
| 293 | RBBINode *startExprNode  = fNodeStack[fNodeStackPtr-2]; | 
|---|
| 294 | RBBINode *varRefNode     = fNodeStack[fNodeStackPtr-1]; | 
|---|
| 295 | RBBINode *RHSExprNode    = fNodeStack[fNodeStackPtr]; | 
|---|
| 296 |  | 
|---|
| 297 | // Save original text of right side of assignment, excluding the terminating ';' | 
|---|
| 298 | //  in the root of the node for the right-hand-side expression. | 
|---|
| 299 | RHSExprNode->fFirstPos = startExprNode->fFirstPos; | 
|---|
| 300 | RHSExprNode->fLastPos  = fScanIndex; | 
|---|
| 301 | fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText); | 
|---|
| 302 |  | 
|---|
| 303 | // Expression parse tree becomes l. child of the $variable reference node. | 
|---|
| 304 | varRefNode->fLeftChild = RHSExprNode; | 
|---|
| 305 | RHSExprNode->fParent   = varRefNode; | 
|---|
| 306 |  | 
|---|
| 307 | // Make a symbol table entry for the $variableRef node. | 
|---|
| 308 | fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus); | 
|---|
| 309 | if (U_FAILURE(*fRB->fStatus)) { | 
|---|
| 310 | // This is a round-about way to get the parse position set | 
|---|
| 311 | //  so that duplicate symbols error messages include a line number. | 
|---|
| 312 | UErrorCode t = *fRB->fStatus; | 
|---|
| 313 | *fRB->fStatus = U_ZERO_ERROR; | 
|---|
| 314 | error(t); | 
|---|
| 315 | } | 
|---|
| 316 |  | 
|---|
| 317 | // Clean up the stack. | 
|---|
| 318 | delete startExprNode; | 
|---|
| 319 | fNodeStackPtr-=3; | 
|---|
| 320 | break; | 
|---|
| 321 | } | 
|---|
| 322 |  | 
|---|
| 323 | case doEndOfRule: | 
|---|
| 324 | { | 
|---|
| 325 | fixOpStack(RBBINode::precStart);      // Terminate expression, leaves expression | 
|---|
| 326 | if (U_FAILURE(*fRB->fStatus)) {       //   parse tree rooted in TOS node. | 
|---|
| 327 | break; | 
|---|
| 328 | } | 
|---|
| 329 | #ifdef RBBI_DEBUG | 
|---|
| 330 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack( "end of rule");} | 
|---|
| 331 | #endif | 
|---|
| 332 | U_ASSERT(fNodeStackPtr == 1); | 
|---|
| 333 | RBBINode *thisRule = fNodeStack[fNodeStackPtr]; | 
|---|
| 334 |  | 
|---|
| 335 | // If this rule includes a look-ahead '/', add a endMark node to the | 
|---|
| 336 | //   expression tree. | 
|---|
| 337 | if (fLookAheadRule) { | 
|---|
| 338 | RBBINode  *endNode        = pushNewNode(RBBINode::endMark); | 
|---|
| 339 | RBBINode  *catNode        = pushNewNode(RBBINode::opCat); | 
|---|
| 340 | if (U_FAILURE(*fRB->fStatus)) { | 
|---|
| 341 | break; | 
|---|
| 342 | } | 
|---|
| 343 | fNodeStackPtr -= 2; | 
|---|
| 344 | catNode->fLeftChild       = thisRule; | 
|---|
| 345 | catNode->fRightChild      = endNode; | 
|---|
| 346 | fNodeStack[fNodeStackPtr] = catNode; | 
|---|
| 347 | endNode->fVal             = fRuleNum; | 
|---|
| 348 | endNode->fLookAheadEnd    = TRUE; | 
|---|
| 349 | thisRule                  = catNode; | 
|---|
| 350 |  | 
|---|
| 351 | // TODO: Disable chaining out of look-ahead (hard break) rules. | 
|---|
| 352 | //   The break on rule match is forced, so there is no point in building up | 
|---|
| 353 | //   the state table to chain into another rule for a longer match. | 
|---|
| 354 | } | 
|---|
| 355 |  | 
|---|
| 356 | // Mark this node as being the root of a rule. | 
|---|
| 357 | thisRule->fRuleRoot = TRUE; | 
|---|
| 358 |  | 
|---|
| 359 | // Flag if chaining into this rule is wanted. | 
|---|
| 360 | // | 
|---|
| 361 | if (fRB->fChainRules &&         // If rule chaining is enabled globally via !!chain | 
|---|
| 362 | !fNoChainInRule) {      //     and no '^' chain-in inhibit was on this rule | 
|---|
| 363 | thisRule->fChainIn = TRUE; | 
|---|
| 364 | } | 
|---|
| 365 |  | 
|---|
| 366 |  | 
|---|
| 367 | // All rule expressions are ORed together. | 
|---|
| 368 | // The ';' that terminates an expression really just functions as a '|' with | 
|---|
| 369 | //   a low operator prededence. | 
|---|
| 370 | // | 
|---|
| 371 | // Each of the four sets of rules are collected separately. | 
|---|
| 372 | //  (forward, reverse, safe_forward, safe_reverse) | 
|---|
| 373 | //  OR this rule into the appropriate group of them. | 
|---|
| 374 | // | 
|---|
| 375 | RBBINode **destRules = (fReverseRule? &fRB->fSafeRevTree : fRB->fDefaultTree); | 
|---|
| 376 |  | 
|---|
| 377 | if (*destRules != NULL) { | 
|---|
| 378 | // This is not the first rule encounted. | 
|---|
| 379 | // OR previous stuff  (from *destRules) | 
|---|
| 380 | // with the current rule expression (on the Node Stack) | 
|---|
| 381 | //  with the resulting OR expression going to *destRules | 
|---|
| 382 | // | 
|---|
| 383 | thisRule    = fNodeStack[fNodeStackPtr]; | 
|---|
| 384 | RBBINode  *prevRules   = *destRules; | 
|---|
| 385 | RBBINode  *orNode      = pushNewNode(RBBINode::opOr); | 
|---|
| 386 | if (U_FAILURE(*fRB->fStatus)) { | 
|---|
| 387 | break; | 
|---|
| 388 | } | 
|---|
| 389 | orNode->fLeftChild     = prevRules; | 
|---|
| 390 | prevRules->fParent     = orNode; | 
|---|
| 391 | orNode->fRightChild    = thisRule; | 
|---|
| 392 | thisRule->fParent      = orNode; | 
|---|
| 393 | *destRules             = orNode; | 
|---|
| 394 | } | 
|---|
| 395 | else | 
|---|
| 396 | { | 
|---|
| 397 | // This is the first rule encountered (for this direction). | 
|---|
| 398 | // Just move its parse tree from the stack to *destRules. | 
|---|
| 399 | *destRules = fNodeStack[fNodeStackPtr]; | 
|---|
| 400 | } | 
|---|
| 401 | fReverseRule   = FALSE;   // in preparation for the next rule. | 
|---|
| 402 | fLookAheadRule = FALSE; | 
|---|
| 403 | fNoChainInRule = FALSE; | 
|---|
| 404 | fNodeStackPtr  = 0; | 
|---|
| 405 | } | 
|---|
| 406 | break; | 
|---|
| 407 |  | 
|---|
| 408 |  | 
|---|
| 409 | case doRuleError: | 
|---|
| 410 | error(U_BRK_RULE_SYNTAX); | 
|---|
| 411 | returnVal = FALSE; | 
|---|
| 412 | break; | 
|---|
| 413 |  | 
|---|
| 414 |  | 
|---|
| 415 | case doVariableNameExpectedErr: | 
|---|
| 416 | error(U_BRK_RULE_SYNTAX); | 
|---|
| 417 | break; | 
|---|
| 418 |  | 
|---|
| 419 |  | 
|---|
| 420 | // | 
|---|
| 421 | //  Unary operands  + ? * | 
|---|
| 422 | //    These all appear after the operand to which they apply. | 
|---|
| 423 | //    When we hit one, the operand (may be a whole sub expression) | 
|---|
| 424 | //    will be on the top of the stack. | 
|---|
| 425 | //    Unary Operator becomes TOS, with the old TOS as its one child. | 
|---|
| 426 | case doUnaryOpPlus: | 
|---|
| 427 | { | 
|---|
| 428 | RBBINode  *operandNode = fNodeStack[fNodeStackPtr--]; | 
|---|
| 429 | RBBINode  *plusNode    = pushNewNode(RBBINode::opPlus); | 
|---|
| 430 | if (U_FAILURE(*fRB->fStatus)) { | 
|---|
| 431 | break; | 
|---|
| 432 | } | 
|---|
| 433 | plusNode->fLeftChild   = operandNode; | 
|---|
| 434 | operandNode->fParent   = plusNode; | 
|---|
| 435 | } | 
|---|
| 436 | break; | 
|---|
| 437 |  | 
|---|
| 438 | case doUnaryOpQuestion: | 
|---|
| 439 | { | 
|---|
| 440 | RBBINode  *operandNode = fNodeStack[fNodeStackPtr--]; | 
|---|
| 441 | RBBINode  *qNode       = pushNewNode(RBBINode::opQuestion); | 
|---|
| 442 | if (U_FAILURE(*fRB->fStatus)) { | 
|---|
| 443 | break; | 
|---|
| 444 | } | 
|---|
| 445 | qNode->fLeftChild      = operandNode; | 
|---|
| 446 | operandNode->fParent   = qNode; | 
|---|
| 447 | } | 
|---|
| 448 | break; | 
|---|
| 449 |  | 
|---|
| 450 | case doUnaryOpStar: | 
|---|
| 451 | { | 
|---|
| 452 | RBBINode  *operandNode = fNodeStack[fNodeStackPtr--]; | 
|---|
| 453 | RBBINode  *starNode    = pushNewNode(RBBINode::opStar); | 
|---|
| 454 | if (U_FAILURE(*fRB->fStatus)) { | 
|---|
| 455 | break; | 
|---|
| 456 | } | 
|---|
| 457 | starNode->fLeftChild   = operandNode; | 
|---|
| 458 | operandNode->fParent   = starNode; | 
|---|
| 459 | } | 
|---|
| 460 | break; | 
|---|
| 461 |  | 
|---|
| 462 | case doRuleChar: | 
|---|
| 463 | // A "Rule Character" is any single character that is a literal part | 
|---|
| 464 | // of the regular expression.  Like a, b and c in the expression "(abc*) | [:L:]" | 
|---|
| 465 | // These are pretty uncommon in break rules; the terms are more commonly | 
|---|
| 466 | //  sets.  To keep things uniform, treat these characters like as | 
|---|
| 467 | // sets that just happen to contain only one character. | 
|---|
| 468 | { | 
|---|
| 469 | n = pushNewNode(RBBINode::setRef); | 
|---|
| 470 | if (U_FAILURE(*fRB->fStatus)) { | 
|---|
| 471 | break; | 
|---|
| 472 | } | 
|---|
| 473 | findSetFor(UnicodeString(fC.fChar), n); | 
|---|
| 474 | n->fFirstPos = fScanIndex; | 
|---|
| 475 | n->fLastPos  = fNextIndex; | 
|---|
| 476 | fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); | 
|---|
| 477 | break; | 
|---|
| 478 | } | 
|---|
| 479 |  | 
|---|
| 480 | case doDotAny: | 
|---|
| 481 | // scanned a ".", meaning match any single character. | 
|---|
| 482 | { | 
|---|
| 483 | n = pushNewNode(RBBINode::setRef); | 
|---|
| 484 | if (U_FAILURE(*fRB->fStatus)) { | 
|---|
| 485 | break; | 
|---|
| 486 | } | 
|---|
| 487 | findSetFor(UnicodeString(TRUE, kAny, 3), n); | 
|---|
| 488 | n->fFirstPos = fScanIndex; | 
|---|
| 489 | n->fLastPos  = fNextIndex; | 
|---|
| 490 | fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); | 
|---|
| 491 | break; | 
|---|
| 492 | } | 
|---|
| 493 |  | 
|---|
| 494 | case doSlash: | 
|---|
| 495 | // Scanned a '/', which identifies a look-ahead break position in a rule. | 
|---|
| 496 | n = pushNewNode(RBBINode::lookAhead); | 
|---|
| 497 | if (U_FAILURE(*fRB->fStatus)) { | 
|---|
| 498 | break; | 
|---|
| 499 | } | 
|---|
| 500 | n->fVal      = fRuleNum; | 
|---|
| 501 | n->fFirstPos = fScanIndex; | 
|---|
| 502 | n->fLastPos  = fNextIndex; | 
|---|
| 503 | fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); | 
|---|
| 504 | fLookAheadRule = TRUE; | 
|---|
| 505 | break; | 
|---|
| 506 |  | 
|---|
| 507 |  | 
|---|
| 508 | case doStartTagValue: | 
|---|
| 509 | // Scanned a '{', the opening delimiter for a tag value within a rule. | 
|---|
| 510 | n = pushNewNode(RBBINode::tag); | 
|---|
| 511 | if (U_FAILURE(*fRB->fStatus)) { | 
|---|
| 512 | break; | 
|---|
| 513 | } | 
|---|
| 514 | n->fVal      = 0; | 
|---|
| 515 | n->fFirstPos = fScanIndex; | 
|---|
| 516 | n->fLastPos  = fNextIndex; | 
|---|
| 517 | break; | 
|---|
| 518 |  | 
|---|
| 519 | case doTagDigit: | 
|---|
| 520 | // Just scanned a decimal digit that's part of a tag value | 
|---|
| 521 | { | 
|---|
| 522 | n = fNodeStack[fNodeStackPtr]; | 
|---|
| 523 | uint32_t v = u_charDigitValue(fC.fChar); | 
|---|
| 524 | U_ASSERT(v < 10); | 
|---|
| 525 | n->fVal = n->fVal*10 + v; | 
|---|
| 526 | break; | 
|---|
| 527 | } | 
|---|
| 528 |  | 
|---|
| 529 | case doTagValue: | 
|---|
| 530 | n = fNodeStack[fNodeStackPtr]; | 
|---|
| 531 | n->fLastPos = fNextIndex; | 
|---|
| 532 | fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); | 
|---|
| 533 | break; | 
|---|
| 534 |  | 
|---|
| 535 | case doTagExpectedError: | 
|---|
| 536 | error(U_BRK_MALFORMED_RULE_TAG); | 
|---|
| 537 | returnVal = FALSE; | 
|---|
| 538 | break; | 
|---|
| 539 |  | 
|---|
| 540 | case doOptionStart: | 
|---|
| 541 | // Scanning a !!option.   At the start of string. | 
|---|
| 542 | fOptionStart = fScanIndex; | 
|---|
| 543 | break; | 
|---|
| 544 |  | 
|---|
| 545 | case doOptionEnd: | 
|---|
| 546 | { | 
|---|
| 547 | UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart); | 
|---|
| 548 | if (opt == UNICODE_STRING( "chain", 5)) { | 
|---|
| 549 | fRB->fChainRules = TRUE; | 
|---|
| 550 | } else if (opt == UNICODE_STRING( "LBCMNoChain", 11)) { | 
|---|
| 551 | fRB->fLBCMNoChain = TRUE; | 
|---|
| 552 | } else if (opt == UNICODE_STRING( "forward", 7)) { | 
|---|
| 553 | fRB->fDefaultTree   = &fRB->fForwardTree; | 
|---|
| 554 | } else if (opt == UNICODE_STRING( "reverse", 7)) { | 
|---|
| 555 | fRB->fDefaultTree   = &fRB->fReverseTree; | 
|---|
| 556 | } else if (opt == UNICODE_STRING( "safe_forward", 12)) { | 
|---|
| 557 | fRB->fDefaultTree   = &fRB->fSafeFwdTree; | 
|---|
| 558 | } else if (opt == UNICODE_STRING( "safe_reverse", 12)) { | 
|---|
| 559 | fRB->fDefaultTree   = &fRB->fSafeRevTree; | 
|---|
| 560 | } else if (opt == UNICODE_STRING( "lookAheadHardBreak", 18)) { | 
|---|
| 561 | fRB->fLookAheadHardBreak = TRUE; | 
|---|
| 562 | } else if (opt == UNICODE_STRING( "quoted_literals_only", 20)) { | 
|---|
| 563 | fRuleSets[kRuleSet_rule_char-128].clear(); | 
|---|
| 564 | } else if (opt == UNICODE_STRING( "unquoted_literals",  17)) { | 
|---|
| 565 | fRuleSets[kRuleSet_rule_char-128].applyPattern(UnicodeString(gRuleSet_rule_char_pattern), *fRB->fStatus); | 
|---|
| 566 | } else { | 
|---|
| 567 | error(U_BRK_UNRECOGNIZED_OPTION); | 
|---|
| 568 | } | 
|---|
| 569 | } | 
|---|
| 570 | break; | 
|---|
| 571 |  | 
|---|
| 572 | case doReverseDir: | 
|---|
| 573 | fReverseRule = TRUE; | 
|---|
| 574 | break; | 
|---|
| 575 |  | 
|---|
| 576 | case doStartVariableName: | 
|---|
| 577 | n = pushNewNode(RBBINode::varRef); | 
|---|
| 578 | if (U_FAILURE(*fRB->fStatus)) { | 
|---|
| 579 | break; | 
|---|
| 580 | } | 
|---|
| 581 | n->fFirstPos = fScanIndex; | 
|---|
| 582 | break; | 
|---|
| 583 |  | 
|---|
| 584 | case doEndVariableName: | 
|---|
| 585 | n = fNodeStack[fNodeStackPtr]; | 
|---|
| 586 | if (n==NULL || n->fType != RBBINode::varRef) { | 
|---|
| 587 | error(U_BRK_INTERNAL_ERROR); | 
|---|
| 588 | break; | 
|---|
| 589 | } | 
|---|
| 590 | n->fLastPos = fScanIndex; | 
|---|
| 591 | fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText); | 
|---|
| 592 | // Look the newly scanned name up in the symbol table | 
|---|
| 593 | //   If there's an entry, set the l. child of the var ref to the replacement expression. | 
|---|
| 594 | //   (We also pass through here when scanning assignments, but no harm is done, other | 
|---|
| 595 | //    than a slight wasted effort that seems hard to avoid.  Lookup will be null) | 
|---|
| 596 | n->fLeftChild = fSymbolTable->lookupNode(n->fText); | 
|---|
| 597 | break; | 
|---|
| 598 |  | 
|---|
| 599 | case doCheckVarDef: | 
|---|
| 600 | n = fNodeStack[fNodeStackPtr]; | 
|---|
| 601 | if (n->fLeftChild == NULL) { | 
|---|
| 602 | error(U_BRK_UNDEFINED_VARIABLE); | 
|---|
| 603 | returnVal = FALSE; | 
|---|
| 604 | } | 
|---|
| 605 | break; | 
|---|
| 606 |  | 
|---|
| 607 | case doExprFinished: | 
|---|
| 608 | break; | 
|---|
| 609 |  | 
|---|
| 610 | case doRuleErrorAssignExpr: | 
|---|
| 611 | error(U_BRK_ASSIGN_ERROR); | 
|---|
| 612 | returnVal = FALSE; | 
|---|
| 613 | break; | 
|---|
| 614 |  | 
|---|
| 615 | case doExit: | 
|---|
| 616 | returnVal = FALSE; | 
|---|
| 617 | break; | 
|---|
| 618 |  | 
|---|
| 619 | case doScanUnicodeSet: | 
|---|
| 620 | scanSet(); | 
|---|
| 621 | break; | 
|---|
| 622 |  | 
|---|
| 623 | default: | 
|---|
| 624 | error(U_BRK_INTERNAL_ERROR); | 
|---|
| 625 | returnVal = FALSE; | 
|---|
| 626 | break; | 
|---|
| 627 | } | 
|---|
| 628 | return returnVal && U_SUCCESS(*fRB->fStatus); | 
|---|
| 629 | } | 
|---|
| 630 |  | 
|---|
| 631 |  | 
|---|
| 632 |  | 
|---|
| 633 |  | 
|---|
| 634 | //------------------------------------------------------------------------------ | 
|---|
| 635 | // | 
|---|
| 636 | //  Error         Report a rule parse error. | 
|---|
| 637 | //                Only report it if no previous error has been recorded. | 
|---|
| 638 | // | 
|---|
| 639 | //------------------------------------------------------------------------------ | 
|---|
| 640 | void RBBIRuleScanner::error(UErrorCode e) { | 
|---|
| 641 | if (U_SUCCESS(*fRB->fStatus)) { | 
|---|
| 642 | *fRB->fStatus = e; | 
|---|
| 643 | if (fRB->fParseError) { | 
|---|
| 644 | fRB->fParseError->line  = fLineNum; | 
|---|
| 645 | fRB->fParseError->offset = fCharNum; | 
|---|
| 646 | fRB->fParseError->preContext[0] = 0; | 
|---|
| 647 | fRB->fParseError->postContext[0] = 0; | 
|---|
| 648 | } | 
|---|
| 649 | } | 
|---|
| 650 | } | 
|---|
| 651 |  | 
|---|
| 652 |  | 
|---|
| 653 |  | 
|---|
| 654 |  | 
|---|
| 655 | //------------------------------------------------------------------------------ | 
|---|
| 656 | // | 
|---|
| 657 | //  fixOpStack   The parse stack holds partially assembled chunks of the parse tree. | 
|---|
| 658 | //               An entry on the stack may be as small as a single setRef node, | 
|---|
| 659 | //               or as large as the parse tree | 
|---|
| 660 | //               for an entire expression (this will be the one item left on the stack | 
|---|
| 661 | //               when the parsing of an RBBI rule completes. | 
|---|
| 662 | // | 
|---|
| 663 | //               This function is called when a binary operator is encountered. | 
|---|
| 664 | //               It looks back up the stack for operators that are not yet associated | 
|---|
| 665 | //               with a right operand, and if the precedence of the stacked operator >= | 
|---|
| 666 | //               the precedence of the current operator, binds the operand left, | 
|---|
| 667 | //               to the previously encountered operator. | 
|---|
| 668 | // | 
|---|
| 669 | //------------------------------------------------------------------------------ | 
|---|
| 670 | void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) { | 
|---|
| 671 | RBBINode *n; | 
|---|
| 672 | // printNodeStack("entering fixOpStack()"); | 
|---|
| 673 | for (;;) { | 
|---|
| 674 | n = fNodeStack[fNodeStackPtr-1];   // an operator node | 
|---|
| 675 | if (n->fPrecedence == 0) { | 
|---|
| 676 | RBBIDebugPuts( "RBBIRuleScanner::fixOpStack, bad operator node"); | 
|---|
| 677 | error(U_BRK_INTERNAL_ERROR); | 
|---|
| 678 | return; | 
|---|
| 679 | } | 
|---|
| 680 |  | 
|---|
| 681 | if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) { | 
|---|
| 682 | // The most recent operand goes with the current operator, | 
|---|
| 683 | //   not with the previously stacked one. | 
|---|
| 684 | break; | 
|---|
| 685 | } | 
|---|
| 686 | // Stack operator is a binary op  ( '|' or concatenation) | 
|---|
| 687 | //   TOS operand becomes right child of this operator. | 
|---|
| 688 | //   Resulting subexpression becomes the TOS operand. | 
|---|
| 689 | n->fRightChild = fNodeStack[fNodeStackPtr]; | 
|---|
| 690 | fNodeStack[fNodeStackPtr]->fParent = n; | 
|---|
| 691 | fNodeStackPtr--; | 
|---|
| 692 | // printNodeStack("looping in fixOpStack()   "); | 
|---|
| 693 | } | 
|---|
| 694 |  | 
|---|
| 695 | if (p <= RBBINode::precLParen) { | 
|---|
| 696 | // Scan is at a right paren or end of expression. | 
|---|
| 697 | //  The scanned item must match the stack, or else there was an error. | 
|---|
| 698 | //  Discard the left paren (or start expr) node from the stack, | 
|---|
| 699 | //  leaving the completed (sub)expression as TOS. | 
|---|
| 700 | if (n->fPrecedence != p) { | 
|---|
| 701 | // Right paren encountered matched start of expression node, or | 
|---|
| 702 | // end of expression matched with a left paren node. | 
|---|
| 703 | error(U_BRK_MISMATCHED_PAREN); | 
|---|
| 704 | } | 
|---|
| 705 | fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr]; | 
|---|
| 706 | fNodeStackPtr--; | 
|---|
| 707 | // Delete the now-discarded LParen or Start node. | 
|---|
| 708 | delete n; | 
|---|
| 709 | } | 
|---|
| 710 | // printNodeStack("leaving fixOpStack()"); | 
|---|
| 711 | } | 
|---|
| 712 |  | 
|---|
| 713 |  | 
|---|
| 714 |  | 
|---|
| 715 |  | 
|---|
| 716 | //------------------------------------------------------------------------------ | 
|---|
| 717 | // | 
|---|
| 718 | //   findSetFor    given a UnicodeString, | 
|---|
| 719 | //                  - find the corresponding Unicode Set  (uset node) | 
|---|
| 720 | //                         (create one if necessary) | 
|---|
| 721 | //                  - Set fLeftChild of the caller's node (should be a setRef node) | 
|---|
| 722 | //                         to the uset node | 
|---|
| 723 | //                 Maintain a hash table of uset nodes, so the same one is always used | 
|---|
| 724 | //                    for the same string. | 
|---|
| 725 | //                 If a "to adopt" set is provided and we haven't seen this key before, | 
|---|
| 726 | //                    add the provided set to the hash table. | 
|---|
| 727 | //                 If the string is one (32 bit) char in length, the set contains | 
|---|
| 728 | //                    just one element which is the char in question. | 
|---|
| 729 | //                 If the string is "any", return a set containing all chars. | 
|---|
| 730 | // | 
|---|
| 731 | //------------------------------------------------------------------------------ | 
|---|
| 732 | void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) { | 
|---|
| 733 |  | 
|---|
| 734 | RBBISetTableEl   *el; | 
|---|
| 735 |  | 
|---|
| 736 | // First check whether we've already cached a set for this string. | 
|---|
| 737 | // If so, just use the cached set in the new node. | 
|---|
| 738 | //   delete any set provided by the caller, since we own it. | 
|---|
| 739 | el = (RBBISetTableEl *)uhash_get(fSetTable, &s); | 
|---|
| 740 | if (el != NULL) { | 
|---|
| 741 | delete setToAdopt; | 
|---|
| 742 | node->fLeftChild = el->val; | 
|---|
| 743 | U_ASSERT(node->fLeftChild->fType == RBBINode::uset); | 
|---|
| 744 | return; | 
|---|
| 745 | } | 
|---|
| 746 |  | 
|---|
| 747 | // Haven't seen this set before. | 
|---|
| 748 | // If the caller didn't provide us with a prebuilt set, | 
|---|
| 749 | //   create a new UnicodeSet now. | 
|---|
| 750 | if (setToAdopt == NULL) { | 
|---|
| 751 | if (s.compare(kAny, -1) == 0) { | 
|---|
| 752 | setToAdopt = new UnicodeSet(0x000000, 0x10ffff); | 
|---|
| 753 | } else { | 
|---|
| 754 | UChar32 c; | 
|---|
| 755 | c = s.char32At(0); | 
|---|
| 756 | setToAdopt = new UnicodeSet(c, c); | 
|---|
| 757 | } | 
|---|
| 758 | } | 
|---|
| 759 |  | 
|---|
| 760 | // | 
|---|
| 761 | // Make a new uset node to refer to this UnicodeSet | 
|---|
| 762 | // This new uset node becomes the child of the caller's setReference node. | 
|---|
| 763 | // | 
|---|
| 764 | RBBINode *usetNode    = new RBBINode(RBBINode::uset); | 
|---|
| 765 | if (usetNode == NULL) { | 
|---|
| 766 | error(U_MEMORY_ALLOCATION_ERROR); | 
|---|
| 767 | return; | 
|---|
| 768 | } | 
|---|
| 769 | usetNode->fInputSet   = setToAdopt; | 
|---|
| 770 | usetNode->fParent     = node; | 
|---|
| 771 | node->fLeftChild      = usetNode; | 
|---|
| 772 | usetNode->fText = s; | 
|---|
| 773 |  | 
|---|
| 774 |  | 
|---|
| 775 | // | 
|---|
| 776 | // Add the new uset node to the list of all uset nodes. | 
|---|
| 777 | // | 
|---|
| 778 | fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus); | 
|---|
| 779 |  | 
|---|
| 780 |  | 
|---|
| 781 | // | 
|---|
| 782 | // Add the new set to the set hash table. | 
|---|
| 783 | // | 
|---|
| 784 | el      = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl)); | 
|---|
| 785 | UnicodeString *tkey = new UnicodeString(s); | 
|---|
| 786 | if (tkey == NULL || el == NULL || setToAdopt == NULL) { | 
|---|
| 787 | // Delete to avoid memory leak | 
|---|
| 788 | delete tkey; | 
|---|
| 789 | tkey = NULL; | 
|---|
| 790 | uprv_free(el); | 
|---|
| 791 | el = NULL; | 
|---|
| 792 | delete setToAdopt; | 
|---|
| 793 | setToAdopt = NULL; | 
|---|
| 794 |  | 
|---|
| 795 | error(U_MEMORY_ALLOCATION_ERROR); | 
|---|
| 796 | return; | 
|---|
| 797 | } | 
|---|
| 798 | el->key = tkey; | 
|---|
| 799 | el->val = usetNode; | 
|---|
| 800 | uhash_put(fSetTable, el->key, el, fRB->fStatus); | 
|---|
| 801 |  | 
|---|
| 802 | return; | 
|---|
| 803 | } | 
|---|
| 804 |  | 
|---|
| 805 |  | 
|---|
| 806 |  | 
|---|
| 807 | // | 
|---|
| 808 | //  Assorted Unicode character constants. | 
|---|
| 809 | //     Numeric because there is no portable way to enter them as literals. | 
|---|
| 810 | //     (Think EBCDIC). | 
|---|
| 811 | // | 
|---|
| 812 | static const UChar      chCR        = 0x0d;      // New lines, for terminating comments. | 
|---|
| 813 | static const UChar      chLF        = 0x0a; | 
|---|
| 814 | static const UChar      chNEL       = 0x85;      //    NEL newline variant | 
|---|
| 815 | static const UChar      chLS        = 0x2028;    //    Unicode Line Separator | 
|---|
| 816 | static const UChar      chApos      = 0x27;      //  single quote, for quoted chars. | 
|---|
| 817 | static const UChar      chPound     = 0x23;      // '#', introduces a comment. | 
|---|
| 818 | static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape | 
|---|
| 819 | static const UChar      chLParen    = 0x28; | 
|---|
| 820 | static const UChar      chRParen    = 0x29; | 
|---|
| 821 |  | 
|---|
| 822 |  | 
|---|
| 823 | //------------------------------------------------------------------------------ | 
|---|
| 824 | // | 
|---|
| 825 | //  stripRules    Return a rules string without extra spaces. | 
|---|
| 826 | //                (Comments are removed separately, during rule parsing.) | 
|---|
| 827 | // | 
|---|
| 828 | //------------------------------------------------------------------------------ | 
|---|
| 829 | UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) { | 
|---|
| 830 | UnicodeString strippedRules; | 
|---|
| 831 | int32_t rulesLength = rules.length(); | 
|---|
| 832 | bool skippingSpaces = false; | 
|---|
| 833 |  | 
|---|
| 834 | for (int32_t idx=0; idx<rulesLength; idx = rules.moveIndex32(idx, 1)) { | 
|---|
| 835 | UChar32 cp = rules.char32At(idx); | 
|---|
| 836 | bool whiteSpace = u_hasBinaryProperty(cp, UCHAR_PATTERN_WHITE_SPACE); | 
|---|
| 837 | if (skippingSpaces && whiteSpace) { | 
|---|
| 838 | continue; | 
|---|
| 839 | } | 
|---|
| 840 | strippedRules.append(cp); | 
|---|
| 841 | skippingSpaces = whiteSpace; | 
|---|
| 842 | } | 
|---|
| 843 | return strippedRules; | 
|---|
| 844 | } | 
|---|
| 845 |  | 
|---|
| 846 |  | 
|---|
| 847 | //------------------------------------------------------------------------------ | 
|---|
| 848 | // | 
|---|
| 849 | //  nextCharLL    Low Level Next Char from rule input source. | 
|---|
| 850 | //                Get a char from the input character iterator, | 
|---|
| 851 | //                keep track of input position for error reporting. | 
|---|
| 852 | // | 
|---|
| 853 | //------------------------------------------------------------------------------ | 
|---|
| 854 | UChar32  RBBIRuleScanner::nextCharLL() { | 
|---|
| 855 | UChar32  ch; | 
|---|
| 856 |  | 
|---|
| 857 | if (fNextIndex >= fRB->fRules.length()) { | 
|---|
| 858 | return (UChar32)-1; | 
|---|
| 859 | } | 
|---|
| 860 | ch         = fRB->fRules.char32At(fNextIndex); | 
|---|
| 861 | fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1); | 
|---|
| 862 |  | 
|---|
| 863 | if (ch == chCR || | 
|---|
| 864 | ch == chNEL || | 
|---|
| 865 | ch == chLS   || | 
|---|
| 866 | (ch == chLF && fLastChar != chCR)) { | 
|---|
| 867 | // Character is starting a new line.  Bump up the line number, and | 
|---|
| 868 | //  reset the column to 0. | 
|---|
| 869 | fLineNum++; | 
|---|
| 870 | fCharNum=0; | 
|---|
| 871 | if (fQuoteMode) { | 
|---|
| 872 | error(U_BRK_NEW_LINE_IN_QUOTED_STRING); | 
|---|
| 873 | fQuoteMode = FALSE; | 
|---|
| 874 | } | 
|---|
| 875 | } | 
|---|
| 876 | else { | 
|---|
| 877 | // Character is not starting a new line.  Except in the case of a | 
|---|
| 878 | //   LF following a CR, increment the column position. | 
|---|
| 879 | if (ch != chLF) { | 
|---|
| 880 | fCharNum++; | 
|---|
| 881 | } | 
|---|
| 882 | } | 
|---|
| 883 | fLastChar = ch; | 
|---|
| 884 | return ch; | 
|---|
| 885 | } | 
|---|
| 886 |  | 
|---|
| 887 |  | 
|---|
| 888 | //------------------------------------------------------------------------------ | 
|---|
| 889 | // | 
|---|
| 890 | //   nextChar     for rules scanning.  At this level, we handle stripping | 
|---|
| 891 | //                out comments and processing backslash character escapes. | 
|---|
| 892 | //                The rest of the rules grammar is handled at the next level up. | 
|---|
| 893 | // | 
|---|
| 894 | //------------------------------------------------------------------------------ | 
|---|
| 895 | void RBBIRuleScanner::nextChar(RBBIRuleChar &c) { | 
|---|
| 896 |  | 
|---|
| 897 | // Unicode Character constants needed for the processing done by nextChar(), | 
|---|
| 898 | //   in hex because literals wont work on EBCDIC machines. | 
|---|
| 899 |  | 
|---|
| 900 | fScanIndex = fNextIndex; | 
|---|
| 901 | c.fChar    = nextCharLL(); | 
|---|
| 902 | c.fEscaped = FALSE; | 
|---|
| 903 |  | 
|---|
| 904 | // | 
|---|
| 905 | //  check for '' sequence. | 
|---|
| 906 | //  These are recognized in all contexts, whether in quoted text or not. | 
|---|
| 907 | // | 
|---|
| 908 | if (c.fChar == chApos) { | 
|---|
| 909 | if (fRB->fRules.char32At(fNextIndex) == chApos) { | 
|---|
| 910 | c.fChar    = nextCharLL();        // get nextChar officially so character counts | 
|---|
| 911 | c.fEscaped = TRUE;                //   stay correct. | 
|---|
| 912 | } | 
|---|
| 913 | else | 
|---|
| 914 | { | 
|---|
| 915 | // Single quote, by itself. | 
|---|
| 916 | //   Toggle quoting mode. | 
|---|
| 917 | //   Return either '('  or ')', because quotes cause a grouping of the quoted text. | 
|---|
| 918 | fQuoteMode = !fQuoteMode; | 
|---|
| 919 | if (fQuoteMode == TRUE) { | 
|---|
| 920 | c.fChar = chLParen; | 
|---|
| 921 | } else { | 
|---|
| 922 | c.fChar = chRParen; | 
|---|
| 923 | } | 
|---|
| 924 | c.fEscaped = FALSE;      // The paren that we return is not escaped. | 
|---|
| 925 | return; | 
|---|
| 926 | } | 
|---|
| 927 | } | 
|---|
| 928 |  | 
|---|
| 929 | if (fQuoteMode) { | 
|---|
| 930 | c.fEscaped = TRUE; | 
|---|
| 931 | } | 
|---|
| 932 | else | 
|---|
| 933 | { | 
|---|
| 934 | // We are not in a 'quoted region' of the source. | 
|---|
| 935 | // | 
|---|
| 936 | if (c.fChar == chPound) { | 
|---|
| 937 | // Start of a comment.  Consume the rest of it. | 
|---|
| 938 | //  The new-line char that terminates the comment is always returned. | 
|---|
| 939 | //  It will be treated as white-space, and serves to break up anything | 
|---|
| 940 | //    that might otherwise incorrectly clump together with a comment in | 
|---|
| 941 | //    the middle (a variable name, for example.) | 
|---|
| 942 | int32_t  = fScanIndex; | 
|---|
| 943 | for (;;) { | 
|---|
| 944 | c.fChar = nextCharLL(); | 
|---|
| 945 | if (c.fChar == (UChar32)-1 ||  // EOF | 
|---|
| 946 | c.fChar == chCR     || | 
|---|
| 947 | c.fChar == chLF     || | 
|---|
| 948 | c.fChar == chNEL    || | 
|---|
| 949 | c.fChar == chLS)       {break;} | 
|---|
| 950 | } | 
|---|
| 951 | for (int32_t i=commentStart; i<fNextIndex-1; ++i) { | 
|---|
| 952 | fRB->fStrippedRules.setCharAt(i, u' '); | 
|---|
| 953 | } | 
|---|
| 954 | } | 
|---|
| 955 | if (c.fChar == (UChar32)-1) { | 
|---|
| 956 | return; | 
|---|
| 957 | } | 
|---|
| 958 |  | 
|---|
| 959 | // | 
|---|
| 960 | //  check for backslash escaped characters. | 
|---|
| 961 | //  Use UnicodeString::unescapeAt() to handle them. | 
|---|
| 962 | // | 
|---|
| 963 | if (c.fChar == chBackSlash) { | 
|---|
| 964 | c.fEscaped = TRUE; | 
|---|
| 965 | int32_t startX = fNextIndex; | 
|---|
| 966 | c.fChar = fRB->fRules.unescapeAt(fNextIndex); | 
|---|
| 967 | if (fNextIndex == startX) { | 
|---|
| 968 | error(U_BRK_HEX_DIGITS_EXPECTED); | 
|---|
| 969 | } | 
|---|
| 970 | fCharNum += fNextIndex-startX; | 
|---|
| 971 | } | 
|---|
| 972 | } | 
|---|
| 973 | // putc(c.fChar, stdout); | 
|---|
| 974 | } | 
|---|
| 975 |  | 
|---|
| 976 | //------------------------------------------------------------------------------ | 
|---|
| 977 | // | 
|---|
| 978 | //  Parse RBBI rules.   The state machine for rules parsing is here. | 
|---|
| 979 | //                      The state tables are hand-written in the file rbbirpt.txt, | 
|---|
| 980 | //                      and converted to the form used here by a perl | 
|---|
| 981 | //                      script rbbicst.pl | 
|---|
| 982 | // | 
|---|
| 983 | //------------------------------------------------------------------------------ | 
|---|
| 984 | void RBBIRuleScanner::parse() { | 
|---|
| 985 | uint16_t                state; | 
|---|
| 986 | const RBBIRuleTableEl  *tableEl; | 
|---|
| 987 |  | 
|---|
| 988 | if (U_FAILURE(*fRB->fStatus)) { | 
|---|
| 989 | return; | 
|---|
| 990 | } | 
|---|
| 991 |  | 
|---|
| 992 | state = 1; | 
|---|
| 993 | nextChar(fC); | 
|---|
| 994 | // | 
|---|
| 995 | // Main loop for the rule parsing state machine. | 
|---|
| 996 | //   Runs once per state transition. | 
|---|
| 997 | //   Each time through optionally performs, depending on the state table, | 
|---|
| 998 | //      - an advance to the the next input char | 
|---|
| 999 | //      - an action to be performed. | 
|---|
| 1000 | //      - pushing or popping a state to/from the local state return stack. | 
|---|
| 1001 | // | 
|---|
| 1002 | for (;;) { | 
|---|
| 1003 | //  Bail out if anything has gone wrong. | 
|---|
| 1004 | //  RBBI rule file parsing stops on the first error encountered. | 
|---|
| 1005 | if (U_FAILURE(*fRB->fStatus)) { | 
|---|
| 1006 | break; | 
|---|
| 1007 | } | 
|---|
| 1008 |  | 
|---|
| 1009 | // Quit if state == 0.  This is the normal way to exit the state machine. | 
|---|
| 1010 | // | 
|---|
| 1011 | if (state == 0) { | 
|---|
| 1012 | break; | 
|---|
| 1013 | } | 
|---|
| 1014 |  | 
|---|
| 1015 | // Find the state table element that matches the input char from the rule, or the | 
|---|
| 1016 | //    class of the input character.  Start with the first table row for this | 
|---|
| 1017 | //    state, then linearly scan forward until we find a row that matches the | 
|---|
| 1018 | //    character.  The last row for each state always matches all characters, so | 
|---|
| 1019 | //    the search will stop there, if not before. | 
|---|
| 1020 | // | 
|---|
| 1021 | tableEl = &gRuleParseStateTable[state]; | 
|---|
| 1022 | #ifdef RBBI_DEBUG | 
|---|
| 1023 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { | 
|---|
| 1024 | RBBIDebugPrintf( "char, line, col = (\'%c\', %d, %d)    state=%s ", | 
|---|
| 1025 | fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]); | 
|---|
| 1026 | } | 
|---|
| 1027 | #endif | 
|---|
| 1028 |  | 
|---|
| 1029 | for (;;) { | 
|---|
| 1030 | #ifdef RBBI_DEBUG | 
|---|
| 1031 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf( "."); fflush(stdout);} | 
|---|
| 1032 | #endif | 
|---|
| 1033 | if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE &&   tableEl->fCharClass == fC.fChar) { | 
|---|
| 1034 | // Table row specified an individual character, not a set, and | 
|---|
| 1035 | //   the input character is not escaped, and | 
|---|
| 1036 | //   the input character matched it. | 
|---|
| 1037 | break; | 
|---|
| 1038 | } | 
|---|
| 1039 | if (tableEl->fCharClass == 255) { | 
|---|
| 1040 | // Table row specified default, match anything character class. | 
|---|
| 1041 | break; | 
|---|
| 1042 | } | 
|---|
| 1043 | if (tableEl->fCharClass == 254 && fC.fEscaped)  { | 
|---|
| 1044 | // Table row specified "escaped" and the char was escaped. | 
|---|
| 1045 | break; | 
|---|
| 1046 | } | 
|---|
| 1047 | if (tableEl->fCharClass == 253 && fC.fEscaped && | 
|---|
| 1048 | (fC.fChar == 0x50 || fC.fChar == 0x70 ))  { | 
|---|
| 1049 | // Table row specified "escaped P" and the char is either 'p' or 'P'. | 
|---|
| 1050 | break; | 
|---|
| 1051 | } | 
|---|
| 1052 | if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1)  { | 
|---|
| 1053 | // Table row specified eof and we hit eof on the input. | 
|---|
| 1054 | break; | 
|---|
| 1055 | } | 
|---|
| 1056 |  | 
|---|
| 1057 | if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class && | 
|---|
| 1058 | fC.fEscaped == FALSE &&                                      //   char is not escaped && | 
|---|
| 1059 | fC.fChar != (UChar32)-1) {                                   //   char is not EOF | 
|---|
| 1060 | U_ASSERT((tableEl->fCharClass-128) < UPRV_LENGTHOF(fRuleSets)); | 
|---|
| 1061 | if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { | 
|---|
| 1062 | // Table row specified a character class, or set of characters, | 
|---|
| 1063 | //   and the current char matches it. | 
|---|
| 1064 | break; | 
|---|
| 1065 | } | 
|---|
| 1066 | } | 
|---|
| 1067 |  | 
|---|
| 1068 | // No match on this row, advance to the next  row for this state, | 
|---|
| 1069 | tableEl++; | 
|---|
| 1070 | } | 
|---|
| 1071 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts( "");} | 
|---|
| 1072 |  | 
|---|
| 1073 | // | 
|---|
| 1074 | // We've found the row of the state table that matches the current input | 
|---|
| 1075 | //   character from the rules string. | 
|---|
| 1076 | // Perform any action specified  by this row in the state table. | 
|---|
| 1077 | if (doParseActions((int32_t)tableEl->fAction) == FALSE) { | 
|---|
| 1078 | // Break out of the state machine loop if the | 
|---|
| 1079 | //   the action signalled some kind of error, or | 
|---|
| 1080 | //   the action was to exit, occurs on normal end-of-rules-input. | 
|---|
| 1081 | break; | 
|---|
| 1082 | } | 
|---|
| 1083 |  | 
|---|
| 1084 | if (tableEl->fPushState != 0) { | 
|---|
| 1085 | fStackPtr++; | 
|---|
| 1086 | if (fStackPtr >= kStackSize) { | 
|---|
| 1087 | error(U_BRK_INTERNAL_ERROR); | 
|---|
| 1088 | RBBIDebugPuts( "RBBIRuleScanner::parse() - state stack overflow."); | 
|---|
| 1089 | fStackPtr--; | 
|---|
| 1090 | } | 
|---|
| 1091 | fStack[fStackPtr] = tableEl->fPushState; | 
|---|
| 1092 | } | 
|---|
| 1093 |  | 
|---|
| 1094 | if (tableEl->fNextChar) { | 
|---|
| 1095 | nextChar(fC); | 
|---|
| 1096 | } | 
|---|
| 1097 |  | 
|---|
| 1098 | // Get the next state from the table entry, or from the | 
|---|
| 1099 | //   state stack if the next state was specified as "pop". | 
|---|
| 1100 | if (tableEl->fNextState != 255) { | 
|---|
| 1101 | state = tableEl->fNextState; | 
|---|
| 1102 | } else { | 
|---|
| 1103 | state = fStack[fStackPtr]; | 
|---|
| 1104 | fStackPtr--; | 
|---|
| 1105 | if (fStackPtr < 0) { | 
|---|
| 1106 | error(U_BRK_INTERNAL_ERROR); | 
|---|
| 1107 | RBBIDebugPuts( "RBBIRuleScanner::parse() - state stack underflow."); | 
|---|
| 1108 | fStackPtr++; | 
|---|
| 1109 | } | 
|---|
| 1110 | } | 
|---|
| 1111 |  | 
|---|
| 1112 | } | 
|---|
| 1113 |  | 
|---|
| 1114 | if (U_FAILURE(*fRB->fStatus)) { | 
|---|
| 1115 | return; | 
|---|
| 1116 | } | 
|---|
| 1117 |  | 
|---|
| 1118 | // If there are no forward rules set an error. | 
|---|
| 1119 | // | 
|---|
| 1120 | if (fRB->fForwardTree == NULL) { | 
|---|
| 1121 | error(U_BRK_RULE_SYNTAX); | 
|---|
| 1122 | return; | 
|---|
| 1123 | } | 
|---|
| 1124 |  | 
|---|
| 1125 | // | 
|---|
| 1126 | // Parsing of the input RBBI rules is complete. | 
|---|
| 1127 | // We now have a parse tree for the rule expressions | 
|---|
| 1128 | // and a list of all UnicodeSets that are referenced. | 
|---|
| 1129 | // | 
|---|
| 1130 | #ifdef RBBI_DEBUG | 
|---|
| 1131 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();} | 
|---|
| 1132 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) { | 
|---|
| 1133 | RBBIDebugPrintf( "Completed Forward Rules Parse Tree...\n"); | 
|---|
| 1134 | RBBINode::printTree(fRB->fForwardTree, TRUE); | 
|---|
| 1135 | RBBIDebugPrintf( "\nCompleted Reverse Rules Parse Tree...\n"); | 
|---|
| 1136 | RBBINode::printTree(fRB->fReverseTree, TRUE); | 
|---|
| 1137 | RBBIDebugPrintf( "\nCompleted Safe Point Forward Rules Parse Tree...\n"); | 
|---|
| 1138 | RBBINode::printTree(fRB->fSafeFwdTree, TRUE); | 
|---|
| 1139 | RBBIDebugPrintf( "\nCompleted Safe Point Reverse Rules Parse Tree...\n"); | 
|---|
| 1140 | RBBINode::printTree(fRB->fSafeRevTree, TRUE); | 
|---|
| 1141 | } | 
|---|
| 1142 | #endif | 
|---|
| 1143 | } | 
|---|
| 1144 |  | 
|---|
| 1145 |  | 
|---|
//------------------------------------------------------------------------------
//
//  printNodeStack     Debugging aid, compiled only when RBBI_DEBUG is defined.
//                     Prints the title, then the tree for each node stack
//                     entry from the top of the stack down to index 1.
//
//------------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void RBBIRuleScanner::printNodeStack(const char *title) {
    RBBIDebugPrintf( "%s.  Dumping node stack...\n", title);
    int level = fNodeStackPtr;
    while (level > 0) {
        RBBINode::printTree(fNodeStack[level], TRUE);
        --level;
    }
}
#endif
|---|
| 1158 |  | 
|---|
| 1159 |  | 
|---|
| 1160 |  | 
|---|
| 1161 |  | 
|---|
| 1162 | //------------------------------------------------------------------------------ | 
|---|
| 1163 | // | 
|---|
| 1164 | //  pushNewNode   create a new RBBINode of the specified type and push it | 
|---|
| 1165 | //                onto the stack of nodes. | 
|---|
| 1166 | // | 
|---|
| 1167 | //------------------------------------------------------------------------------ | 
|---|
| 1168 | RBBINode  *RBBIRuleScanner::pushNewNode(RBBINode::NodeType  t) { | 
|---|
| 1169 | if (U_FAILURE(*fRB->fStatus)) { | 
|---|
| 1170 | return NULL; | 
|---|
| 1171 | } | 
|---|
| 1172 | if (fNodeStackPtr >= kStackSize - 1) { | 
|---|
| 1173 | error(U_BRK_RULE_SYNTAX); | 
|---|
| 1174 | RBBIDebugPuts( "RBBIRuleScanner::pushNewNode - stack overflow."); | 
|---|
| 1175 | return NULL; | 
|---|
| 1176 | } | 
|---|
| 1177 | fNodeStackPtr++; | 
|---|
| 1178 | fNodeStack[fNodeStackPtr] = new RBBINode(t); | 
|---|
| 1179 | if (fNodeStack[fNodeStackPtr] == NULL) { | 
|---|
| 1180 | *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR; | 
|---|
| 1181 | } | 
|---|
| 1182 | return fNodeStack[fNodeStackPtr]; | 
|---|
| 1183 | } | 
|---|
| 1184 |  | 
|---|
| 1185 |  | 
|---|
| 1186 |  | 
|---|
| 1187 | //------------------------------------------------------------------------------ | 
|---|
| 1188 | // | 
|---|
| 1189 | //  scanSet    Construct a UnicodeSet from the text at the current scan | 
|---|
| 1190 | //             position.  Advance the scan position to the first character | 
|---|
| 1191 | //             after the set. | 
|---|
| 1192 | // | 
|---|
| 1193 | //             A new RBBI setref node referring to the set is pushed onto the node | 
|---|
| 1194 | //             stack. | 
|---|
| 1195 | // | 
|---|
| 1196 | //             The scan position is normally under the control of the state machine | 
|---|
| 1197 | //             that controls rule parsing.  UnicodeSets, however, are parsed by | 
|---|
| 1198 | //             the UnicodeSet constructor, not by the RBBI rule parser. | 
|---|
| 1199 | // | 
|---|
| 1200 | //------------------------------------------------------------------------------ | 
|---|
| 1201 | void RBBIRuleScanner::scanSet() { | 
|---|
| 1202 | UnicodeSet    *uset; | 
|---|
| 1203 | ParsePosition  pos; | 
|---|
| 1204 | int            startPos; | 
|---|
| 1205 | int            i; | 
|---|
| 1206 |  | 
|---|
| 1207 | if (U_FAILURE(*fRB->fStatus)) { | 
|---|
| 1208 | return; | 
|---|
| 1209 | } | 
|---|
| 1210 |  | 
|---|
| 1211 | pos.setIndex(fScanIndex); | 
|---|
| 1212 | startPos = fScanIndex; | 
|---|
| 1213 | UErrorCode localStatus = U_ZERO_ERROR; | 
|---|
| 1214 | uset = new UnicodeSet(); | 
|---|
| 1215 | if (uset == NULL) { | 
|---|
| 1216 | localStatus = U_MEMORY_ALLOCATION_ERROR; | 
|---|
| 1217 | } else { | 
|---|
| 1218 | uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus); | 
|---|
| 1219 | } | 
|---|
| 1220 | if (U_FAILURE(localStatus)) { | 
|---|
| 1221 | //  TODO:  Get more accurate position of the error from UnicodeSet's return info. | 
|---|
| 1222 | //         UnicodeSet appears to not be reporting correctly at this time. | 
|---|
| 1223 | #ifdef RBBI_DEBUG | 
|---|
| 1224 | RBBIDebugPrintf( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex()); | 
|---|
| 1225 | #endif | 
|---|
| 1226 | error(localStatus); | 
|---|
| 1227 | delete uset; | 
|---|
| 1228 | return; | 
|---|
| 1229 | } | 
|---|
| 1230 |  | 
|---|
| 1231 | // Verify that the set contains at least one code point. | 
|---|
| 1232 | // | 
|---|
| 1233 | U_ASSERT(uset!=NULL); | 
|---|
| 1234 | if (uset->isEmpty()) { | 
|---|
| 1235 | // This set is empty. | 
|---|
| 1236 | //  Make it an error, because it almost certainly is not what the user wanted. | 
|---|
| 1237 | //  Also, avoids having to think about corner cases in the tree manipulation code | 
|---|
| 1238 | //   that occurs later on. | 
|---|
| 1239 | error(U_BRK_RULE_EMPTY_SET); | 
|---|
| 1240 | delete uset; | 
|---|
| 1241 | return; | 
|---|
| 1242 | } | 
|---|
| 1243 |  | 
|---|
| 1244 |  | 
|---|
| 1245 | // Advance the RBBI parse postion over the UnicodeSet pattern. | 
|---|
| 1246 | //   Don't just set fScanIndex because the line/char positions maintained | 
|---|
| 1247 | //   for error reporting would be thrown off. | 
|---|
| 1248 | i = pos.getIndex(); | 
|---|
| 1249 | for (;;) { | 
|---|
| 1250 | if (fNextIndex >= i) { | 
|---|
| 1251 | break; | 
|---|
| 1252 | } | 
|---|
| 1253 | nextCharLL(); | 
|---|
| 1254 | } | 
|---|
| 1255 |  | 
|---|
| 1256 | if (U_SUCCESS(*fRB->fStatus)) { | 
|---|
| 1257 | RBBINode         *n; | 
|---|
| 1258 |  | 
|---|
| 1259 | n = pushNewNode(RBBINode::setRef); | 
|---|
| 1260 | if (U_FAILURE(*fRB->fStatus)) { | 
|---|
| 1261 | return; | 
|---|
| 1262 | } | 
|---|
| 1263 | n->fFirstPos = startPos; | 
|---|
| 1264 | n->fLastPos  = fNextIndex; | 
|---|
| 1265 | fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); | 
|---|
| 1266 | //  findSetFor() serves several purposes here: | 
|---|
| 1267 | //     - Adopts storage for the UnicodeSet, will be responsible for deleting. | 
|---|
| 1268 | //     - Mantains collection of all sets in use, needed later for establishing | 
|---|
| 1269 | //          character categories for run time engine. | 
|---|
| 1270 | //     - Eliminates mulitiple instances of the same set. | 
|---|
| 1271 | //     - Creates a new uset node if necessary (if this isn't a duplicate.) | 
|---|
| 1272 | findSetFor(n->fText, n, uset); | 
|---|
| 1273 | } | 
|---|
| 1274 |  | 
|---|
| 1275 | } | 
|---|
| 1276 |  | 
|---|
| 1277 | U_NAMESPACE_END | 
|---|
| 1278 |  | 
|---|
| 1279 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | 
|---|
| 1280 |  | 
|---|