1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5* Copyright (c) 2002-2016, International Business Machines
6* Corporation and others. All Rights Reserved.
7**********************************************************************
8*/
9//
10// rbbitblb.cpp
11//
12
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_BREAK_ITERATION
17
18#include "unicode/unistr.h"
19#include "rbbitblb.h"
20#include "rbbirb.h"
21#include "rbbiscan.h"
22#include "rbbisetb.h"
23#include "rbbidata.h"
24#include "cstring.h"
25#include "uassert.h"
26#include "uvectr32.h"
27#include "cmemory.h"
28
29U_NAMESPACE_BEGIN
30
31RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
32 fRB(rb),
33 fTree(*rootNode),
34 fStatus(&status),
35 fDStates(nullptr),
36 fSafeTable(nullptr) {
37 if (U_FAILURE(status)) {
38 return;
39 }
40 // fDStates is UVector<RBBIStateDescriptor *>
41 fDStates = new UVector(status);
42 if (U_SUCCESS(status) && fDStates == nullptr ) {
43 status = U_MEMORY_ALLOCATION_ERROR;
44 }
45}
46
47
48
49RBBITableBuilder::~RBBITableBuilder() {
50 int i;
51 for (i=0; i<fDStates->size(); i++) {
52 delete (RBBIStateDescriptor *)fDStates->elementAt(i);
53 }
54 delete fDStates;
55 delete fSafeTable;
56 delete fLookAheadRuleMap;
57}
58
59
60//-----------------------------------------------------------------------------
61//
62// RBBITableBuilder::buildForwardTable - This is the main function for building
63// the DFA state transition table from the RBBI rules parse tree.
64//
65//-----------------------------------------------------------------------------
66void RBBITableBuilder::buildForwardTable() {
67
68 if (U_FAILURE(*fStatus)) {
69 return;
70 }
71
72 // If there were no rules, just return. This situation can easily arise
73 // for the reverse rules.
74 if (fTree==NULL) {
75 return;
76 }
77
78 //
79 // Walk through the tree, replacing any references to $variables with a copy of the
80 // parse tree for the substition expression.
81 //
82 fTree = fTree->flattenVariables();
83#ifdef RBBI_DEBUG
84 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
85 RBBIDebugPuts("\nParse tree after flattening variable references.");
86 RBBINode::printTree(fTree, TRUE);
87 }
88#endif
89
90 //
91 // If the rules contained any references to {bof}
92 // add a {bof} <cat> <former root of tree> to the
93 // tree. Means that all matches must start out with the
94 // {bof} fake character.
95 //
96 if (fRB->fSetBuilder->sawBOF()) {
97 RBBINode *bofTop = new RBBINode(RBBINode::opCat);
98 RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar);
99 // Delete and exit if memory allocation failed.
100 if (bofTop == NULL || bofLeaf == NULL) {
101 *fStatus = U_MEMORY_ALLOCATION_ERROR;
102 delete bofTop;
103 delete bofLeaf;
104 return;
105 }
106 bofTop->fLeftChild = bofLeaf;
107 bofTop->fRightChild = fTree;
108 bofLeaf->fParent = bofTop;
109 bofLeaf->fVal = 2; // Reserved value for {bof}.
110 fTree = bofTop;
111 }
112
113 //
114 // Add a unique right-end marker to the expression.
115 // Appears as a cat-node, left child being the original tree,
116 // right child being the end marker.
117 //
118 RBBINode *cn = new RBBINode(RBBINode::opCat);
119 // Exit if memory allocation failed.
120 if (cn == NULL) {
121 *fStatus = U_MEMORY_ALLOCATION_ERROR;
122 return;
123 }
124 cn->fLeftChild = fTree;
125 fTree->fParent = cn;
126 RBBINode *endMarkerNode = cn->fRightChild = new RBBINode(RBBINode::endMark);
127 // Delete and exit if memory allocation failed.
128 if (cn->fRightChild == NULL) {
129 *fStatus = U_MEMORY_ALLOCATION_ERROR;
130 delete cn;
131 return;
132 }
133 cn->fRightChild->fParent = cn;
134 fTree = cn;
135
136 //
137 // Replace all references to UnicodeSets with the tree for the equivalent
138 // expression.
139 //
140 fTree->flattenSets();
141#ifdef RBBI_DEBUG
142 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
143 RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
144 RBBINode::printTree(fTree, TRUE);
145 }
146#endif
147
148
149 //
150 // calculate the functions nullable, firstpos, lastpos and followpos on
151 // nodes in the parse tree.
152 // See the alogrithm description in Aho.
153 // Understanding how this works by looking at the code alone will be
154 // nearly impossible.
155 //
156 calcNullable(fTree);
157 calcFirstPos(fTree);
158 calcLastPos(fTree);
159 calcFollowPos(fTree);
160 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
161 RBBIDebugPuts("\n");
162 printPosSets(fTree);
163 }
164
165 //
166 // For "chained" rules, modify the followPos sets
167 //
168 if (fRB->fChainRules) {
169 calcChainedFollowPos(fTree, endMarkerNode);
170 }
171
172 //
173 // BOF (start of input) test fixup.
174 //
175 if (fRB->fSetBuilder->sawBOF()) {
176 bofFixup();
177 }
178
179 //
180 // Build the DFA state transition tables.
181 //
182 buildStateTable();
183 mapLookAheadRules();
184 flagAcceptingStates();
185 flagLookAheadStates();
186 flagTaggedStates();
187
188 //
189 // Update the global table of rule status {tag} values
190 // The rule builder has a global vector of status values that are common
191 // for all tables. Merge the ones from this table into the global set.
192 //
193 mergeRuleStatusVals();
194}
195
196
197
198//-----------------------------------------------------------------------------
199//
200// calcNullable. Impossible to explain succinctly. See Aho, section 3.9
201//
202//-----------------------------------------------------------------------------
203void RBBITableBuilder::calcNullable(RBBINode *n) {
204 if (n == NULL) {
205 return;
206 }
207 if (n->fType == RBBINode::setRef ||
208 n->fType == RBBINode::endMark ) {
209 // These are non-empty leaf node types.
210 n->fNullable = FALSE;
211 return;
212 }
213
214 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
215 // Lookahead marker node. It's a leaf, so no recursion on children.
216 // It's nullable because it does not match any literal text from the input stream.
217 n->fNullable = TRUE;
218 return;
219 }
220
221
222 // The node is not a leaf.
223 // Calculate nullable on its children.
224 calcNullable(n->fLeftChild);
225 calcNullable(n->fRightChild);
226
227 // Apply functions from table 3.40 in Aho
228 if (n->fType == RBBINode::opOr) {
229 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
230 }
231 else if (n->fType == RBBINode::opCat) {
232 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
233 }
234 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
235 n->fNullable = TRUE;
236 }
237 else {
238 n->fNullable = FALSE;
239 }
240}
241
242
243
244
245//-----------------------------------------------------------------------------
246//
247// calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
248//
249//-----------------------------------------------------------------------------
250void RBBITableBuilder::calcFirstPos(RBBINode *n) {
251 if (n == NULL) {
252 return;
253 }
254 if (n->fType == RBBINode::leafChar ||
255 n->fType == RBBINode::endMark ||
256 n->fType == RBBINode::lookAhead ||
257 n->fType == RBBINode::tag) {
258 // These are non-empty leaf node types.
259 // Note: In order to maintain the sort invariant on the set,
260 // this function should only be called on a node whose set is
261 // empty to start with.
262 n->fFirstPosSet->addElement(n, *fStatus);
263 return;
264 }
265
266 // The node is not a leaf.
267 // Calculate firstPos on its children.
268 calcFirstPos(n->fLeftChild);
269 calcFirstPos(n->fRightChild);
270
271 // Apply functions from table 3.40 in Aho
272 if (n->fType == RBBINode::opOr) {
273 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
274 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
275 }
276 else if (n->fType == RBBINode::opCat) {
277 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
278 if (n->fLeftChild->fNullable) {
279 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
280 }
281 }
282 else if (n->fType == RBBINode::opStar ||
283 n->fType == RBBINode::opQuestion ||
284 n->fType == RBBINode::opPlus) {
285 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
286 }
287}
288
289
290
291//-----------------------------------------------------------------------------
292//
293// calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
294//
295//-----------------------------------------------------------------------------
296void RBBITableBuilder::calcLastPos(RBBINode *n) {
297 if (n == NULL) {
298 return;
299 }
300 if (n->fType == RBBINode::leafChar ||
301 n->fType == RBBINode::endMark ||
302 n->fType == RBBINode::lookAhead ||
303 n->fType == RBBINode::tag) {
304 // These are non-empty leaf node types.
305 // Note: In order to maintain the sort invariant on the set,
306 // this function should only be called on a node whose set is
307 // empty to start with.
308 n->fLastPosSet->addElement(n, *fStatus);
309 return;
310 }
311
312 // The node is not a leaf.
313 // Calculate lastPos on its children.
314 calcLastPos(n->fLeftChild);
315 calcLastPos(n->fRightChild);
316
317 // Apply functions from table 3.40 in Aho
318 if (n->fType == RBBINode::opOr) {
319 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
320 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
321 }
322 else if (n->fType == RBBINode::opCat) {
323 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
324 if (n->fRightChild->fNullable) {
325 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
326 }
327 }
328 else if (n->fType == RBBINode::opStar ||
329 n->fType == RBBINode::opQuestion ||
330 n->fType == RBBINode::opPlus) {
331 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
332 }
333}
334
335
336
337//-----------------------------------------------------------------------------
338//
339// calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
340//
341//-----------------------------------------------------------------------------
342void RBBITableBuilder::calcFollowPos(RBBINode *n) {
343 if (n == NULL ||
344 n->fType == RBBINode::leafChar ||
345 n->fType == RBBINode::endMark) {
346 return;
347 }
348
349 calcFollowPos(n->fLeftChild);
350 calcFollowPos(n->fRightChild);
351
352 // Aho rule #1
353 if (n->fType == RBBINode::opCat) {
354 RBBINode *i; // is 'i' in Aho's description
355 uint32_t ix;
356
357 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
358
359 for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
360 i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
361 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
362 }
363 }
364
365 // Aho rule #2
366 if (n->fType == RBBINode::opStar ||
367 n->fType == RBBINode::opPlus) {
368 RBBINode *i; // again, n and i are the names from Aho's description.
369 uint32_t ix;
370
371 for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
372 i = (RBBINode *)n->fLastPosSet->elementAt(ix);
373 setAdd(i->fFollowPos, n->fFirstPosSet);
374 }
375 }
376
377
378
379}
380
381//-----------------------------------------------------------------------------
382//
383// addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
384// as roots of a rule to a destination vector.
385//
386//-----------------------------------------------------------------------------
387void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
388 if (node == NULL || U_FAILURE(*fStatus)) {
389 return;
390 }
391 if (node->fRuleRoot) {
392 dest->addElement(node, *fStatus);
393 // Note: rules cannot nest. If we found a rule start node,
394 // no child node can also be a start node.
395 return;
396 }
397 addRuleRootNodes(dest, node->fLeftChild);
398 addRuleRootNodes(dest, node->fRightChild);
399}
400
401//-----------------------------------------------------------------------------
402//
403// calcChainedFollowPos. Modify the previously calculated followPos sets
404// to implement rule chaining. NOT described by Aho
405//
406//-----------------------------------------------------------------------------
407void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree, RBBINode *endMarkNode) {
408
409 UVector leafNodes(*fStatus);
410 if (U_FAILURE(*fStatus)) {
411 return;
412 }
413
414 // get a list all leaf nodes
415 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
416 if (U_FAILURE(*fStatus)) {
417 return;
418 }
419
420 // Collect all leaf nodes that can start matches for rules
421 // with inbound chaining enabled, which is the union of the
422 // firstPosition sets from each of the rule root nodes.
423
424 UVector ruleRootNodes(*fStatus);
425 addRuleRootNodes(&ruleRootNodes, tree);
426
427 UVector matchStartNodes(*fStatus);
428 for (int j=0; j<ruleRootNodes.size(); ++j) {
429 RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(j));
430 if (node->fChainIn) {
431 setAdd(&matchStartNodes, node->fFirstPosSet);
432 }
433 }
434 if (U_FAILURE(*fStatus)) {
435 return;
436 }
437
438 int32_t endNodeIx;
439 int32_t startNodeIx;
440
441 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
442 RBBINode *endNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
443
444 // Identify leaf nodes that correspond to overall rule match positions.
445 // These include the endMarkNode in their followPos sets.
446 //
447 // Note: do not consider other end marker nodes, those that are added to
448 // look-ahead rules. These can't chain; a match immediately stops
449 // further matching. This leaves exactly one end marker node, the one
450 // at the end of the complete tree.
451
452 if (!endNode->fFollowPos->contains(endMarkNode)) {
453 continue;
454 }
455
456 // We've got a node that can end a match.
457
458 // !!LBCMNoChain implementation: If this node's val correspond to
459 // the Line Break $CM char class, don't chain from it.
460 // TODO: Remove this. !!LBCMNoChain is deprecated, and is not used
461 // by any of the standard ICU rules.
462 if (fRB->fLBCMNoChain) {
463 UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
464 if (c != -1) {
465 // c == -1 occurs with sets containing only the {eof} marker string.
466 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
467 if (cLBProp == U_LB_COMBINING_MARK) {
468 continue;
469 }
470 }
471 }
472
473 // Now iterate over the nodes that can start a match, looking for ones
474 // with the same char class as our ending node.
475 RBBINode *startNode;
476 for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
477 startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
478 if (startNode->fType != RBBINode::leafChar) {
479 continue;
480 }
481
482 if (endNode->fVal == startNode->fVal) {
483 // The end val (character class) of one possible match is the
484 // same as the start of another.
485
486 // Add all nodes from the followPos of the start node to the
487 // followPos set of the end node, which will have the effect of
488 // letting matches transition from a match state at endNode
489 // to the second char of a match starting with startNode.
490 setAdd(endNode->fFollowPos, startNode->fFollowPos);
491 }
492 }
493 }
494}
495
496
497//-----------------------------------------------------------------------------
498//
499// bofFixup. Fixup for state tables that include {bof} beginning of input testing.
500// Do an swizzle similar to chaining, modifying the followPos set of
501// the bofNode to include the followPos nodes from other {bot} nodes
502// scattered through the tree.
503//
504// This function has much in common with calcChainedFollowPos().
505//
506//-----------------------------------------------------------------------------
507void RBBITableBuilder::bofFixup() {
508
509 if (U_FAILURE(*fStatus)) {
510 return;
511 }
512
513 // The parse tree looks like this ...
514 // fTree root ---> <cat>
515 // / \ .
516 // <cat> <#end node>
517 // / \ .
518 // <bofNode> rest
519 // of tree
520 //
521 // We will be adding things to the followPos set of the <bofNode>
522 //
523 RBBINode *bofNode = fTree->fLeftChild->fLeftChild;
524 U_ASSERT(bofNode->fType == RBBINode::leafChar);
525 U_ASSERT(bofNode->fVal == 2);
526
527 // Get all nodes that can be the start a match of the user-written rules
528 // (excluding the fake bofNode)
529 // We want the nodes that can start a match in the
530 // part labeled "rest of tree"
531 //
532 UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
533
534 RBBINode *startNode;
535 int startNodeIx;
536 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
537 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
538 if (startNode->fType != RBBINode::leafChar) {
539 continue;
540 }
541
542 if (startNode->fVal == bofNode->fVal) {
543 // We found a leaf node corresponding to a {bof} that was
544 // explicitly written into a rule.
545 // Add everything from the followPos set of this node to the
546 // followPos set of the fake bofNode at the start of the tree.
547 //
548 setAdd(bofNode->fFollowPos, startNode->fFollowPos);
549 }
550 }
551}
552
553//-----------------------------------------------------------------------------
554//
555// buildStateTable() Determine the set of runtime DFA states and the
556// transition tables for these states, by the algorithm
557// of fig. 3.44 in Aho.
558//
559// Most of the comments are quotes of Aho's psuedo-code.
560//
561//-----------------------------------------------------------------------------
562void RBBITableBuilder::buildStateTable() {
563 if (U_FAILURE(*fStatus)) {
564 return;
565 }
566 RBBIStateDescriptor *failState;
567 // Set it to NULL to avoid uninitialized warning
568 RBBIStateDescriptor *initialState = NULL;
569 //
570 // Add a dummy state 0 - the stop state. Not from Aho.
571 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
572 failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
573 if (failState == NULL) {
574 *fStatus = U_MEMORY_ALLOCATION_ERROR;
575 goto ExitBuildSTdeleteall;
576 }
577 failState->fPositions = new UVector(*fStatus);
578 if (failState->fPositions == NULL) {
579 *fStatus = U_MEMORY_ALLOCATION_ERROR;
580 }
581 if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
582 goto ExitBuildSTdeleteall;
583 }
584 fDStates->addElement(failState, *fStatus);
585 if (U_FAILURE(*fStatus)) {
586 goto ExitBuildSTdeleteall;
587 }
588
589 // initially, the only unmarked state in Dstates is firstpos(root),
590 // where toot is the root of the syntax tree for (r)#;
591 initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
592 if (initialState == NULL) {
593 *fStatus = U_MEMORY_ALLOCATION_ERROR;
594 }
595 if (U_FAILURE(*fStatus)) {
596 goto ExitBuildSTdeleteall;
597 }
598 initialState->fPositions = new UVector(*fStatus);
599 if (initialState->fPositions == NULL) {
600 *fStatus = U_MEMORY_ALLOCATION_ERROR;
601 }
602 if (U_FAILURE(*fStatus)) {
603 goto ExitBuildSTdeleteall;
604 }
605 setAdd(initialState->fPositions, fTree->fFirstPosSet);
606 fDStates->addElement(initialState, *fStatus);
607 if (U_FAILURE(*fStatus)) {
608 goto ExitBuildSTdeleteall;
609 }
610
611 // while there is an unmarked state T in Dstates do begin
612 for (;;) {
613 RBBIStateDescriptor *T = NULL;
614 int32_t tx;
615 for (tx=1; tx<fDStates->size(); tx++) {
616 RBBIStateDescriptor *temp;
617 temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
618 if (temp->fMarked == FALSE) {
619 T = temp;
620 break;
621 }
622 }
623 if (T == NULL) {
624 break;
625 }
626
627 // mark T;
628 T->fMarked = TRUE;
629
630 // for each input symbol a do begin
631 int32_t a;
632 for (a = 1; a<=lastInputSymbol; a++) {
633 // let U be the set of positions that are in followpos(p)
634 // for some position p in T
635 // such that the symbol at position p is a;
636 UVector *U = NULL;
637 RBBINode *p;
638 int32_t px;
639 for (px=0; px<T->fPositions->size(); px++) {
640 p = (RBBINode *)T->fPositions->elementAt(px);
641 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) {
642 if (U == NULL) {
643 U = new UVector(*fStatus);
644 if (U == NULL) {
645 *fStatus = U_MEMORY_ALLOCATION_ERROR;
646 goto ExitBuildSTdeleteall;
647 }
648 }
649 setAdd(U, p->fFollowPos);
650 }
651 }
652
653 // if U is not empty and not in DStates then
654 int32_t ux = 0;
655 UBool UinDstates = FALSE;
656 if (U != NULL) {
657 U_ASSERT(U->size() > 0);
658 int ix;
659 for (ix=0; ix<fDStates->size(); ix++) {
660 RBBIStateDescriptor *temp2;
661 temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
662 if (setEquals(U, temp2->fPositions)) {
663 delete U;
664 U = temp2->fPositions;
665 ux = ix;
666 UinDstates = TRUE;
667 break;
668 }
669 }
670
671 // Add U as an unmarked state to Dstates
672 if (!UinDstates)
673 {
674 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
675 if (newState == NULL) {
676 *fStatus = U_MEMORY_ALLOCATION_ERROR;
677 }
678 if (U_FAILURE(*fStatus)) {
679 goto ExitBuildSTdeleteall;
680 }
681 newState->fPositions = U;
682 fDStates->addElement(newState, *fStatus);
683 if (U_FAILURE(*fStatus)) {
684 return;
685 }
686 ux = fDStates->size()-1;
687 }
688
689 // Dtran[T, a] := U;
690 T->fDtran->setElementAt(ux, a);
691 }
692 }
693 }
694 return;
695 // delete local pointers only if error occured.
696ExitBuildSTdeleteall:
697 delete initialState;
698 delete failState;
699}
700
701
702/**
703 * mapLookAheadRules
704 *
705 */
706void RBBITableBuilder::mapLookAheadRules() {
707 fLookAheadRuleMap = new UVector32(fRB->fScanner->numRules() + 1, *fStatus);
708 if (fLookAheadRuleMap == nullptr) {
709 *fStatus = U_MEMORY_ALLOCATION_ERROR;
710 }
711 if (U_FAILURE(*fStatus)) {
712 return;
713 }
714 fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1);
715 int32_t laSlotsInUse = 0;
716
717 for (int32_t n=0; n<fDStates->size(); n++) {
718 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
719 int32_t laSlotForState = 0;
720
721 // Establish the look-ahead slot for this state, if the state covers
722 // any look-ahead nodes - corresponding to the '/' in look-ahead rules.
723
724 // If any of the look-ahead nodes already have a slot assigned, use it,
725 // otherwise assign a new one.
726
727 bool sawLookAheadNode = false;
728 for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
729 RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
730 if (node->fType != RBBINode::NodeType::lookAhead) {
731 continue;
732 }
733 sawLookAheadNode = true;
734 int32_t ruleNum = node->fVal; // Set when rule was originally parsed.
735 U_ASSERT(ruleNum < fLookAheadRuleMap->size());
736 U_ASSERT(ruleNum > 0);
737 int32_t laSlot = fLookAheadRuleMap->elementAti(ruleNum);
738 if (laSlot != 0) {
739 if (laSlotForState == 0) {
740 laSlotForState = laSlot;
741 } else {
742 // TODO: figure out if this can fail, change to setting an error code if so.
743 U_ASSERT(laSlot == laSlotForState);
744 }
745 }
746 }
747 if (!sawLookAheadNode) {
748 continue;
749 }
750
751 if (laSlotForState == 0) {
752 laSlotForState = ++laSlotsInUse;
753 }
754
755 // For each look ahead node covered by this state,
756 // set the mapping from the node's rule number to the look ahead slot.
757 // There can be multiple nodes/rule numbers going to the same la slot.
758
759 for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
760 RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
761 if (node->fType != RBBINode::NodeType::lookAhead) {
762 continue;
763 }
764 int32_t ruleNum = node->fVal; // Set when rule was originally parsed.
765 int32_t existingVal = fLookAheadRuleMap->elementAti(ruleNum);
766 (void)existingVal;
767 U_ASSERT(existingVal == 0 || existingVal == laSlotForState);
768 fLookAheadRuleMap->setElementAt(laSlotForState, ruleNum);
769 }
770 }
771
772}
773
774//-----------------------------------------------------------------------------
775//
776// flagAcceptingStates Identify accepting states.
777// First get a list of all of the end marker nodes.
778// Then, for each state s,
779// if s contains one of the end marker nodes in its list of tree positions then
780// s is an accepting state.
781//
782//-----------------------------------------------------------------------------
783void RBBITableBuilder::flagAcceptingStates() {
784 if (U_FAILURE(*fStatus)) {
785 return;
786 }
787 UVector endMarkerNodes(*fStatus);
788 RBBINode *endMarker;
789 int32_t i;
790 int32_t n;
791
792 if (U_FAILURE(*fStatus)) {
793 return;
794 }
795
796 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
797 if (U_FAILURE(*fStatus)) {
798 return;
799 }
800
801 for (i=0; i<endMarkerNodes.size(); i++) {
802 endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
803 for (n=0; n<fDStates->size(); n++) {
804 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
805 if (sd->fPositions->indexOf(endMarker) >= 0) {
806 // Any non-zero value for fAccepting means this is an accepting node.
807 // The value is what will be returned to the user as the break status.
808 // If no other value was specified, force it to -1.
809
810 if (sd->fAccepting==0) {
811 // State hasn't been marked as accepting yet. Do it now.
812 sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
813 if (sd->fAccepting == 0) {
814 sd->fAccepting = -1;
815 }
816 }
817 if (sd->fAccepting==-1 && endMarker->fVal != 0) {
818 // Both lookahead and non-lookahead accepting for this state.
819 // Favor the look-ahead, because a look-ahead match needs to
820 // immediately stop the run-time engine. First match, not longest.
821 sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
822 }
823 // implicit else:
824 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
825 }
826 }
827 }
828}
829
830
831//-----------------------------------------------------------------------------
832//
833// flagLookAheadStates Very similar to flagAcceptingStates, above.
834//
835//-----------------------------------------------------------------------------
836void RBBITableBuilder::flagLookAheadStates() {
837 if (U_FAILURE(*fStatus)) {
838 return;
839 }
840 UVector lookAheadNodes(*fStatus);
841 RBBINode *lookAheadNode;
842 int32_t i;
843 int32_t n;
844
845 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
846 if (U_FAILURE(*fStatus)) {
847 return;
848 }
849 for (i=0; i<lookAheadNodes.size(); i++) {
850 lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
851 U_ASSERT(lookAheadNode->fType == RBBINode::NodeType::lookAhead);
852
853 for (n=0; n<fDStates->size(); n++) {
854 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
855 int32_t positionsIdx = sd->fPositions->indexOf(lookAheadNode);
856 if (positionsIdx >= 0) {
857 U_ASSERT(lookAheadNode == sd->fPositions->elementAt(positionsIdx));
858 int32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal);
859 U_ASSERT(sd->fLookAhead == 0 || sd->fLookAhead == lookaheadSlot);
860 // if (sd->fLookAhead != 0 && sd->fLookAhead != lookaheadSlot) {
861 // printf("%s:%d Bingo. sd->fLookAhead:%d lookaheadSlot:%d\n",
862 // __FILE__, __LINE__, sd->fLookAhead, lookaheadSlot);
863 // }
864 sd->fLookAhead = lookaheadSlot;
865 }
866 }
867 }
868}
869
870
871
872
873//-----------------------------------------------------------------------------
874//
875// flagTaggedStates
876//
877//-----------------------------------------------------------------------------
878void RBBITableBuilder::flagTaggedStates() {
879 if (U_FAILURE(*fStatus)) {
880 return;
881 }
882 UVector tagNodes(*fStatus);
883 RBBINode *tagNode;
884 int32_t i;
885 int32_t n;
886
887 if (U_FAILURE(*fStatus)) {
888 return;
889 }
890 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
891 if (U_FAILURE(*fStatus)) {
892 return;
893 }
894 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
895 tagNode = (RBBINode *)tagNodes.elementAt(i);
896
897 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
898 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
899 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
900 sortedAdd(&sd->fTagVals, tagNode->fVal);
901 }
902 }
903 }
904}
905
906
907
908
909//-----------------------------------------------------------------------------
910//
911// mergeRuleStatusVals
912//
913// Update the global table of rule status {tag} values
914// The rule builder has a global vector of status values that are common
915// for all tables. Merge the ones from this table into the global set.
916//
917//-----------------------------------------------------------------------------
918void RBBITableBuilder::mergeRuleStatusVals() {
919 //
920 // The basic outline of what happens here is this...
921 //
922 // for each state in this state table
923 // if the status tag list for this state is in the global statuses list
924 // record where and
925 // continue with the next state
926 // else
927 // add the tag list for this state to the global list.
928 //
929 int i;
930 int n;
931
932 // Pre-set a single tag of {0} into the table.
933 // We will need this as a default, for rule sets with no explicit tagging.
934 if (fRB->fRuleStatusVals->size() == 0) {
935 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group
936 fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero
937 }
938
939 // For each state
940 for (n=0; n<fDStates->size(); n++) {
941 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
942 UVector *thisStatesTagValues = sd->fTagVals;
943 if (thisStatesTagValues == NULL) {
944 // No tag values are explicitly associated with this state.
945 // Set the default tag value.
946 sd->fTagsIdx = 0;
947 continue;
948 }
949
950 // There are tag(s) associated with this state.
951 // fTagsIdx will be the index into the global tag list for this state's tag values.
952 // Initial value of -1 flags that we haven't got it set yet.
953 sd->fTagsIdx = -1;
954 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list
955 int32_t nextTagGroupStart = 0;
956
957 // Loop runs once per group of tags in the global list
958 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
959 thisTagGroupStart = nextTagGroupStart;
960 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
961 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
962 // The number of tags for this state is different from
963 // the number of tags in this group from the global list.
964 // Continue with the next group from the global list.
965 continue;
966 }
967 // The lengths match, go ahead and compare the actual tag values
968 // between this state and the group from the global list.
969 for (i=0; i<thisStatesTagValues->size(); i++) {
970 if (thisStatesTagValues->elementAti(i) !=
971 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
972 // Mismatch.
973 break;
974 }
975 }
976
977 if (i == thisStatesTagValues->size()) {
978 // We found a set of tag values in the global list that match
979 // those for this state. Use them.
980 sd->fTagsIdx = thisTagGroupStart;
981 break;
982 }
983 }
984
985 if (sd->fTagsIdx == -1) {
986 // No suitable entry in the global tag list already. Add one
987 sd->fTagsIdx = fRB->fRuleStatusVals->size();
988 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
989 for (i=0; i<thisStatesTagValues->size(); i++) {
990 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
991 }
992 }
993 }
994}
995
996
997
998
999
1000
1001
1002//-----------------------------------------------------------------------------
1003//
1004// sortedAdd Add a value to a vector of sorted values (ints).
1005// Do not replicate entries; if the value is already there, do not
1006// add a second one.
1007// Lazily create the vector if it does not already exist.
1008//
1009//-----------------------------------------------------------------------------
1010void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
1011 int32_t i;
1012
1013 if (*vector == NULL) {
1014 *vector = new UVector(*fStatus);
1015 }
1016 if (*vector == NULL || U_FAILURE(*fStatus)) {
1017 return;
1018 }
1019 UVector *vec = *vector;
1020 int32_t vSize = vec->size();
1021 for (i=0; i<vSize; i++) {
1022 int32_t valAtI = vec->elementAti(i);
1023 if (valAtI == val) {
1024 // The value is already in the vector. Don't add it again.
1025 return;
1026 }
1027 if (valAtI > val) {
1028 break;
1029 }
1030 }
1031 vec->insertElementAt(val, i, *fStatus);
1032}
1033
1034
1035
1036//-----------------------------------------------------------------------------
1037//
1038// setAdd Set operation on UVector
1039// dest = dest union source
1040// Elements may only appear once and must be sorted.
1041//
1042//-----------------------------------------------------------------------------
1043void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
1044 int32_t destOriginalSize = dest->size();
1045 int32_t sourceSize = source->size();
1046 int32_t di = 0;
1047 MaybeStackArray<void *, 16> destArray, sourceArray; // Handle small cases without malloc
1048 void **destPtr, **sourcePtr;
1049 void **destLim, **sourceLim;
1050
1051 if (destOriginalSize > destArray.getCapacity()) {
1052 if (destArray.resize(destOriginalSize) == NULL) {
1053 return;
1054 }
1055 }
1056 destPtr = destArray.getAlias();
1057 destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()?
1058
1059 if (sourceSize > sourceArray.getCapacity()) {
1060 if (sourceArray.resize(sourceSize) == NULL) {
1061 return;
1062 }
1063 }
1064 sourcePtr = sourceArray.getAlias();
1065 sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()?
1066
1067 // Avoid multiple "get element" calls by getting the contents into arrays
1068 (void) dest->toArray(destPtr);
1069 (void) source->toArray(sourcePtr);
1070
1071 dest->setSize(sourceSize+destOriginalSize, *fStatus);
1072
1073 while (sourcePtr < sourceLim && destPtr < destLim) {
1074 if (*destPtr == *sourcePtr) {
1075 dest->setElementAt(*sourcePtr++, di++);
1076 destPtr++;
1077 }
1078 // This check is required for machines with segmented memory, like i5/OS.
1079 // Direct pointer comparison is not recommended.
1080 else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1081 dest->setElementAt(*destPtr++, di++);
1082 }
1083 else { /* *sourcePtr < *destPtr */
1084 dest->setElementAt(*sourcePtr++, di++);
1085 }
1086 }
1087
1088 // At most one of these two cleanup loops will execute
1089 while (destPtr < destLim) {
1090 dest->setElementAt(*destPtr++, di++);
1091 }
1092 while (sourcePtr < sourceLim) {
1093 dest->setElementAt(*sourcePtr++, di++);
1094 }
1095
1096 dest->setSize(di, *fStatus);
1097}
1098
1099
1100
1101//-----------------------------------------------------------------------------
1102//
1103// setEqual Set operation on UVector.
1104// Compare for equality.
1105// Elements must be sorted.
1106//
1107//-----------------------------------------------------------------------------
1108UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
1109 return a->equals(*b);
1110}
1111
1112
1113//-----------------------------------------------------------------------------
1114//
1115// printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
1116// for each node in the tree.
1117//
1118//-----------------------------------------------------------------------------
1119#ifdef RBBI_DEBUG
1120void RBBITableBuilder::printPosSets(RBBINode *n) {
1121 if (n==NULL) {
1122 return;
1123 }
1124 printf("\n");
1125 RBBINode::printNodeHeader();
1126 RBBINode::printNode(n);
1127 RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE");
1128
1129 RBBIDebugPrintf(" firstpos: ");
1130 printSet(n->fFirstPosSet);
1131
1132 RBBIDebugPrintf(" lastpos: ");
1133 printSet(n->fLastPosSet);
1134
1135 RBBIDebugPrintf(" followpos: ");
1136 printSet(n->fFollowPos);
1137
1138 printPosSets(n->fLeftChild);
1139 printPosSets(n->fRightChild);
1140}
1141#endif
1142
1143//
1144// findDuplCharClassFrom()
1145//
1146bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
1147 int32_t numStates = fDStates->size();
1148 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1149
1150 for (; categories->first < numCols-1; categories->first++) {
1151 for (categories->second=categories->first+1; categories->second < numCols; categories->second++) {
1152 // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates).
1153 uint16_t table_base = 0;
1154 uint16_t table_dupl = 1;
1155 for (int32_t state=0; state<numStates; state++) {
1156 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1157 table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
1158 table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
1159 if (table_base != table_dupl) {
1160 break;
1161 }
1162 }
1163 if (table_base == table_dupl) {
1164 return true;
1165 }
1166 }
1167 }
1168 return false;
1169}
1170
1171
1172//
1173// removeColumn()
1174//
1175void RBBITableBuilder::removeColumn(int32_t column) {
1176 int32_t numStates = fDStates->size();
1177 for (int32_t state=0; state<numStates; state++) {
1178 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1179 U_ASSERT(column < sd->fDtran->size());
1180 sd->fDtran->removeElementAt(column);
1181 }
1182}
1183
1184/*
1185 * findDuplicateState
1186 */
1187bool RBBITableBuilder::findDuplicateState(IntPair *states) {
1188 int32_t numStates = fDStates->size();
1189 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1190
1191 for (; states->first<numStates-1; states->first++) {
1192 RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
1193 for (states->second=states->first+1; states->second<numStates; states->second++) {
1194 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
1195 if (firstSD->fAccepting != duplSD->fAccepting ||
1196 firstSD->fLookAhead != duplSD->fLookAhead ||
1197 firstSD->fTagsIdx != duplSD->fTagsIdx) {
1198 continue;
1199 }
1200 bool rowsMatch = true;
1201 for (int32_t col=0; col < numCols; ++col) {
1202 int32_t firstVal = firstSD->fDtran->elementAti(col);
1203 int32_t duplVal = duplSD->fDtran->elementAti(col);
1204 if (!((firstVal == duplVal) ||
1205 ((firstVal == states->first || firstVal == states->second) &&
1206 (duplVal == states->first || duplVal == states->second)))) {
1207 rowsMatch = false;
1208 break;
1209 }
1210 }
1211 if (rowsMatch) {
1212 return true;
1213 }
1214 }
1215 }
1216 return false;
1217}
1218
1219
1220bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
1221 int32_t numStates = fSafeTable->size();
1222
1223 for (; states->first<numStates-1; states->first++) {
1224 UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
1225 for (states->second=states->first+1; states->second<numStates; states->second++) {
1226 UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
1227 bool rowsMatch = true;
1228 int32_t numCols = firstRow->length();
1229 for (int32_t col=0; col < numCols; ++col) {
1230 int32_t firstVal = firstRow->charAt(col);
1231 int32_t duplVal = duplRow->charAt(col);
1232 if (!((firstVal == duplVal) ||
1233 ((firstVal == states->first || firstVal == states->second) &&
1234 (duplVal == states->first || duplVal == states->second)))) {
1235 rowsMatch = false;
1236 break;
1237 }
1238 }
1239 if (rowsMatch) {
1240 return true;
1241 }
1242 }
1243 }
1244 return false;
1245}
1246
1247
1248void RBBITableBuilder::removeState(IntPair duplStates) {
1249 const int32_t keepState = duplStates.first;
1250 const int32_t duplState = duplStates.second;
1251 U_ASSERT(keepState < duplState);
1252 U_ASSERT(duplState < fDStates->size());
1253
1254 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
1255 fDStates->removeElementAt(duplState);
1256 delete duplSD;
1257
1258 int32_t numStates = fDStates->size();
1259 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1260 for (int32_t state=0; state<numStates; ++state) {
1261 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1262 for (int32_t col=0; col<numCols; col++) {
1263 int32_t existingVal = sd->fDtran->elementAti(col);
1264 int32_t newVal = existingVal;
1265 if (existingVal == duplState) {
1266 newVal = keepState;
1267 } else if (existingVal > duplState) {
1268 newVal = existingVal - 1;
1269 }
1270 sd->fDtran->setElementAt(newVal, col);
1271 }
1272 }
1273}
1274
1275void RBBITableBuilder::removeSafeState(IntPair duplStates) {
1276 const int32_t keepState = duplStates.first;
1277 const int32_t duplState = duplStates.second;
1278 U_ASSERT(keepState < duplState);
1279 U_ASSERT(duplState < fSafeTable->size());
1280
1281 fSafeTable->removeElementAt(duplState); // Note that fSafeTable has a deleter function
1282 // and will auto-delete the removed element.
1283 int32_t numStates = fSafeTable->size();
1284 for (int32_t state=0; state<numStates; ++state) {
1285 UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
1286 int32_t numCols = sd->length();
1287 for (int32_t col=0; col<numCols; col++) {
1288 int32_t existingVal = sd->charAt(col);
1289 int32_t newVal = existingVal;
1290 if (existingVal == duplState) {
1291 newVal = keepState;
1292 } else if (existingVal > duplState) {
1293 newVal = existingVal - 1;
1294 }
1295 sd->setCharAt(col, static_cast<char16_t>(newVal));
1296 }
1297 }
1298}
1299
1300
1301/*
1302 * RemoveDuplicateStates
1303 */
1304int32_t RBBITableBuilder::removeDuplicateStates() {
1305 IntPair dupls = {3, 0};
1306 int32_t numStatesRemoved = 0;
1307
1308 while (findDuplicateState(&dupls)) {
1309 // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
1310 removeState(dupls);
1311 ++numStatesRemoved;
1312 }
1313 return numStatesRemoved;
1314}
1315
1316
1317//-----------------------------------------------------------------------------
1318//
1319// getTableSize() Calculate the size of the runtime form of this
1320// state transition table.
1321//
1322//-----------------------------------------------------------------------------
1323int32_t RBBITableBuilder::getTableSize() const {
1324 int32_t size = 0;
1325 int32_t numRows;
1326 int32_t numCols;
1327 int32_t rowSize;
1328
1329 if (fTree == NULL) {
1330 return 0;
1331 }
1332
1333 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
1334
1335 numRows = fDStates->size();
1336 numCols = fRB->fSetBuilder->getNumCharCategories();
1337
1338 rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
1339 size += numRows * rowSize;
1340 return size;
1341}
1342
1343
1344//-----------------------------------------------------------------------------
1345//
1346// exportTable() export the state transition table in the format required
1347// by the runtime engine. getTableSize() bytes of memory
1348// must be available at the output address "where".
1349//
1350//-----------------------------------------------------------------------------
1351void RBBITableBuilder::exportTable(void *where) {
1352 RBBIStateTable *table = (RBBIStateTable *)where;
1353 uint32_t state;
1354 int col;
1355
1356 if (U_FAILURE(*fStatus) || fTree == NULL) {
1357 return;
1358 }
1359
1360 int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1361 if (catCount > 0x7fff ||
1362 fDStates->size() > 0x7fff) {
1363 *fStatus = U_BRK_INTERNAL_ERROR;
1364 return;
1365 }
1366
1367 table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
1368 table->fNumStates = fDStates->size();
1369 table->fFlags = 0;
1370 if (fRB->fLookAheadHardBreak) {
1371 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
1372 }
1373 if (fRB->fSetBuilder->sawBOF()) {
1374 table->fFlags |= RBBI_BOF_REQUIRED;
1375 }
1376 table->fReserved = 0;
1377
1378 for (state=0; state<table->fNumStates; state++) {
1379 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1380 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1381 U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
1382 U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
1383 row->fAccepting = (int16_t)sd->fAccepting;
1384 row->fLookAhead = (int16_t)sd->fLookAhead;
1385 row->fTagIdx = (int16_t)sd->fTagsIdx;
1386 for (col=0; col<catCount; col++) {
1387 row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
1388 }
1389 }
1390}
1391
1392
1393/**
1394 * Synthesize a safe state table from the main state table.
1395 */
1396void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
1397 // The safe table creation has three steps:
1398
1399 // 1. Identifiy pairs of character classes that are "safe." Safe means that boundaries
1400 // following the pair do not depend on context or state before the pair. To test
1401 // whether a pair is safe, run it through the main forward state table, starting
1402 // from each state. If the the final state is the same, no matter what the starting state,
1403 // the pair is safe.
1404 //
1405 // 2. Build a state table that recognizes the safe pairs. It's similar to their
1406 // forward table, with a column for each input character [class], and a row for
1407 // each state. Row 1 is the start state, and row 0 is the stop state. Initially
1408 // create an additional state for each input character category; being in
1409 // one of these states means that the character has been seen, and is potentially
1410 // the first of a pair. In each of these rows, the entry for the second character
1411 // of a safe pair is set to the stop state (0), indicating that a match was found.
1412 // All other table entries are set to the state corresponding the current input
1413 // character, allowing that charcter to be the of a start following pair.
1414 //
1415 // Because the safe rules are to be run in reverse, moving backwards in the text,
1416 // the first and second pair categories are swapped when building the table.
1417 //
1418 // 3. Compress the table. There are typically many rows (states) that are
1419 // equivalent - that have zeroes (match completed) in the same columns -
1420 // and can be folded together.
1421
1422 // Each safe pair is stored as two UChars in the safePair string.
1423 UnicodeString safePairs;
1424
1425 int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
1426 int32_t numStates = fDStates->size();
1427
1428 for (int32_t c1=0; c1<numCharClasses; ++c1) {
1429 for (int32_t c2=0; c2 < numCharClasses; ++c2) {
1430 int32_t wantedEndState = -1;
1431 int32_t endState = 0;
1432 for (int32_t startState = 1; startState < numStates; ++startState) {
1433 RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
1434 int32_t s2 = startStateD->fDtran->elementAti(c1);
1435 RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
1436 endState = s2StateD->fDtran->elementAti(c2);
1437 if (wantedEndState < 0) {
1438 wantedEndState = endState;
1439 } else {
1440 if (wantedEndState != endState) {
1441 break;
1442 }
1443 }
1444 }
1445 if (wantedEndState == endState) {
1446 safePairs.append((char16_t)c1);
1447 safePairs.append((char16_t)c2);
1448 // printf("(%d, %d) ", c1, c2);
1449 }
1450 }
1451 // printf("\n");
1452 }
1453
1454 // Populate the initial safe table.
1455 // The table as a whole is UVector<UnicodeString>
1456 // Each row is represented by a UnicodeString, being used as a Vector<int16>.
1457 // Row 0 is the stop state.
1458 // Row 1 is the start sate.
1459 // Row 2 and beyond are other states, initially one per char class, but
1460 // after initial construction, many of the states will be combined, compacting the table.
1461 // The String holds the nextState data only. The four leading fields of a row, fAccepting,
1462 // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
1463
1464 U_ASSERT(fSafeTable == nullptr);
1465 fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
1466 for (int32_t row=0; row<numCharClasses + 2; ++row) {
1467 fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
1468 }
1469
1470 // From the start state, each input char class transitions to the state for that input.
1471 UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
1472 for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
1473 // Note: +2 for the start & stop state.
1474 startState.setCharAt(charClass, static_cast<char16_t>(charClass+2));
1475 }
1476
1477 // Initially make every other state table row look like the start state row,
1478 for (int32_t row=2; row<numCharClasses+2; ++row) {
1479 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
1480 rowState = startState; // UnicodeString assignment, copies contents.
1481 }
1482
1483 // Run through the safe pairs, set the next state to zero when pair has been seen.
1484 // Zero being the stop state, meaning we found a safe point.
1485 for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
1486 int32_t c1 = safePairs.charAt(pairIdx);
1487 int32_t c2 = safePairs.charAt(pairIdx + 1);
1488
1489 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
1490 rowState.setCharAt(c1, 0);
1491 }
1492
1493 // Remove duplicate or redundant rows from the table.
1494 IntPair states = {1, 0};
1495 while (findDuplicateSafeState(&states)) {
1496 // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
1497 removeSafeState(states);
1498 }
1499}
1500
1501
1502//-----------------------------------------------------------------------------
1503//
1504// getSafeTableSize() Calculate the size of the runtime form of this
1505// safe state table.
1506//
1507//-----------------------------------------------------------------------------
1508int32_t RBBITableBuilder::getSafeTableSize() const {
1509 int32_t size = 0;
1510 int32_t numRows;
1511 int32_t numCols;
1512 int32_t rowSize;
1513
1514 if (fSafeTable == nullptr) {
1515 return 0;
1516 }
1517
1518 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
1519
1520 numRows = fSafeTable->size();
1521 numCols = fRB->fSetBuilder->getNumCharCategories();
1522
1523 rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
1524 size += numRows * rowSize;
1525 return size;
1526}
1527
1528
1529//-----------------------------------------------------------------------------
1530//
1531// exportSafeTable() export the state transition table in the format required
1532// by the runtime engine. getTableSize() bytes of memory
1533// must be available at the output address "where".
1534//
1535//-----------------------------------------------------------------------------
1536void RBBITableBuilder::exportSafeTable(void *where) {
1537 RBBIStateTable *table = (RBBIStateTable *)where;
1538 uint32_t state;
1539 int col;
1540
1541 if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
1542 return;
1543 }
1544
1545 int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1546 if (catCount > 0x7fff ||
1547 fSafeTable->size() > 0x7fff) {
1548 *fStatus = U_BRK_INTERNAL_ERROR;
1549 return;
1550 }
1551
1552 table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
1553 table->fNumStates = fSafeTable->size();
1554 table->fFlags = 0;
1555 table->fReserved = 0;
1556
1557 for (state=0; state<table->fNumStates; state++) {
1558 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
1559 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1560 row->fAccepting = 0;
1561 row->fLookAhead = 0;
1562 row->fTagIdx = 0;
1563 row->fReserved = 0;
1564 for (col=0; col<catCount; col++) {
1565 row->fNextState[col] = rowString->charAt(col);
1566 }
1567 }
1568}
1569
1570
1571
1572
1573//-----------------------------------------------------------------------------
1574//
1575// printSet Debug function. Print the contents of a UVector
1576//
1577//-----------------------------------------------------------------------------
1578#ifdef RBBI_DEBUG
1579void RBBITableBuilder::printSet(UVector *s) {
1580 int32_t i;
1581 for (i=0; i<s->size(); i++) {
1582 const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
1583 RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum);
1584 }
1585 RBBIDebugPrintf("\n");
1586}
1587#endif
1588
1589
1590//-----------------------------------------------------------------------------
1591//
1592// printStates Debug Function. Dump the fully constructed state transition table.
1593//
1594//-----------------------------------------------------------------------------
1595#ifdef RBBI_DEBUG
1596void RBBITableBuilder::printStates() {
1597 int c; // input "character"
1598 int n; // state number
1599
1600 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1601 RBBIDebugPrintf(" | Acc LA Tag");
1602 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1603 RBBIDebugPrintf(" %2d", c);
1604 }
1605 RBBIDebugPrintf("\n");
1606 RBBIDebugPrintf(" |---------------");
1607 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1608 RBBIDebugPrintf("---");
1609 }
1610 RBBIDebugPrintf("\n");
1611
1612 for (n=0; n<fDStates->size(); n++) {
1613 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1614 RBBIDebugPrintf(" %3d | " , n);
1615 RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
1616 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1617 RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
1618 }
1619 RBBIDebugPrintf("\n");
1620 }
1621 RBBIDebugPrintf("\n\n");
1622}
1623#endif
1624
1625
1626//-----------------------------------------------------------------------------
1627//
1628// printSafeTable Debug Function. Dump the fully constructed safe table.
1629//
1630//-----------------------------------------------------------------------------
1631#ifdef RBBI_DEBUG
1632void RBBITableBuilder::printReverseTable() {
1633 int c; // input "character"
1634 int n; // state number
1635
1636 RBBIDebugPrintf(" Safe Reverse Table \n");
1637 if (fSafeTable == nullptr) {
1638 RBBIDebugPrintf(" --- nullptr ---\n");
1639 return;
1640 }
1641 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1642 RBBIDebugPrintf(" | Acc LA Tag");
1643 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1644 RBBIDebugPrintf(" %2d", c);
1645 }
1646 RBBIDebugPrintf("\n");
1647 RBBIDebugPrintf(" |---------------");
1648 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1649 RBBIDebugPrintf("---");
1650 }
1651 RBBIDebugPrintf("\n");
1652
1653 for (n=0; n<fSafeTable->size(); n++) {
1654 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
1655 RBBIDebugPrintf(" %3d | " , n);
1656 RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags
1657 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1658 RBBIDebugPrintf(" %2d", rowString->charAt(c));
1659 }
1660 RBBIDebugPrintf("\n");
1661 }
1662 RBBIDebugPrintf("\n\n");
1663}
1664#endif
1665
1666
1667
1668//-----------------------------------------------------------------------------
1669//
1670// printRuleStatusTable Debug Function. Dump the common rule status table
1671//
1672//-----------------------------------------------------------------------------
1673#ifdef RBBI_DEBUG
1674void RBBITableBuilder::printRuleStatusTable() {
1675 int32_t thisRecord = 0;
1676 int32_t nextRecord = 0;
1677 int i;
1678 UVector *tbl = fRB->fRuleStatusVals;
1679
1680 RBBIDebugPrintf("index | tags \n");
1681 RBBIDebugPrintf("-------------------\n");
1682
1683 while (nextRecord < tbl->size()) {
1684 thisRecord = nextRecord;
1685 nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1686 RBBIDebugPrintf("%4d ", thisRecord);
1687 for (i=thisRecord+1; i<nextRecord; i++) {
1688 RBBIDebugPrintf(" %5d", tbl->elementAti(i));
1689 }
1690 RBBIDebugPrintf("\n");
1691 }
1692 RBBIDebugPrintf("\n\n");
1693}
1694#endif
1695
1696
1697//-----------------------------------------------------------------------------
1698//
1699// RBBIStateDescriptor Methods. This is a very struct-like class
1700// Most access is directly to the fields.
1701//
1702//-----------------------------------------------------------------------------
1703
1704RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1705 fMarked = FALSE;
1706 fAccepting = 0;
1707 fLookAhead = 0;
1708 fTagsIdx = 0;
1709 fTagVals = NULL;
1710 fPositions = NULL;
1711 fDtran = NULL;
1712
1713 fDtran = new UVector32(lastInputSymbol+1, *fStatus);
1714 if (U_FAILURE(*fStatus)) {
1715 return;
1716 }
1717 if (fDtran == NULL) {
1718 *fStatus = U_MEMORY_ALLOCATION_ERROR;
1719 return;
1720 }
1721 fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized.
1722 // It is indexed by input symbols, and will
1723 // hold the next state number for each
1724 // symbol.
1725}
1726
1727
1728RBBIStateDescriptor::~RBBIStateDescriptor() {
1729 delete fPositions;
1730 delete fDtran;
1731 delete fTagVals;
1732 fPositions = NULL;
1733 fDtran = NULL;
1734 fTagVals = NULL;
1735}
1736
1737U_NAMESPACE_END
1738
1739#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1740