1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3//
4// rbbisetb.cpp
5//
6/*
7***************************************************************************
8* Copyright (C) 2002-2008 International Business Machines Corporation *
9* and others. All rights reserved. *
10***************************************************************************
11*/
12//
13// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
14// (part of the rule building process.)
15//
16// Starting with the rules parse tree from the scanner,
17//
18// - Enumerate the set of UnicodeSets that are referenced
19// by the RBBI rules.
20// - compute a set of non-overlapping character ranges
21// with all characters within a range belonging to the same
22// set of input uniocde sets.
23// - Derive a set of non-overlapping UnicodeSet (like things)
24// that will correspond to columns in the state table for
25// the RBBI execution engine. All characters within one
26// of these sets belong to the same set of the original
27// UnicodeSets from the user's rules.
28// - construct the trie table that maps input characters
29// to the index of the matching non-overlapping set of set from
30// the previous step.
31//
32
33#include "unicode/utypes.h"
34
35#if !UCONFIG_NO_BREAK_ITERATION
36
37#include "unicode/uniset.h"
38#include "utrie2.h"
39#include "uvector.h"
40#include "uassert.h"
41#include "cmemory.h"
42#include "cstring.h"
43
44#include "rbbisetb.h"
45#include "rbbinode.h"
46
47U_NAMESPACE_BEGIN
48
49//------------------------------------------------------------------------
50//
51// Constructor
52//
53//------------------------------------------------------------------------
54RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
55{
56 fRB = rb;
57 fStatus = rb->fStatus;
58 fRangeList = 0;
59 fTrie = 0;
60 fTrieSize = 0;
61 fGroupCount = 0;
62 fSawBOF = FALSE;
63}
64
65
66//------------------------------------------------------------------------
67//
68// Destructor
69//
70//------------------------------------------------------------------------
71RBBISetBuilder::~RBBISetBuilder()
72{
73 RangeDescriptor *nextRangeDesc;
74
75 // Walk through & delete the linked list of RangeDescriptors
76 for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) {
77 RangeDescriptor *r = nextRangeDesc;
78 nextRangeDesc = r->fNext;
79 delete r;
80 }
81
82 utrie2_close(fTrie);
83}
84
85
86
87
88//------------------------------------------------------------------------
89//
90// build Build the list of non-overlapping character ranges
91// from the Unicode Sets.
92//
93//------------------------------------------------------------------------
94void RBBISetBuilder::buildRanges() {
95 RBBINode *usetNode;
96 RangeDescriptor *rlRange;
97
98 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();}
99
100 //
101 // Initialize the process by creating a single range encompassing all characters
102 // that is in no sets.
103 //
104 fRangeList = new RangeDescriptor(*fStatus); // will check for status here
105 if (fRangeList == NULL) {
106 *fStatus = U_MEMORY_ALLOCATION_ERROR;
107 return;
108 }
109 fRangeList->fStartChar = 0;
110 fRangeList->fEndChar = 0x10ffff;
111
112 if (U_FAILURE(*fStatus)) {
113 return;
114 }
115
116 //
117 // Find the set of non-overlapping ranges of characters
118 //
119 int ni;
120 for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
121 usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
122 if (usetNode==NULL) {
123 break;
124 }
125
126 UnicodeSet *inputSet = usetNode->fInputSet;
127 int32_t inputSetRangeCount = inputSet->getRangeCount();
128 int inputSetRangeIndex = 0;
129 rlRange = fRangeList;
130
131 for (;;) {
132 if (inputSetRangeIndex >= inputSetRangeCount) {
133 break;
134 }
135 UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex);
136 UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex);
137
138 // skip over ranges from the range list that are completely
139 // below the current range from the input unicode set.
140 while (rlRange->fEndChar < inputSetRangeBegin) {
141 rlRange = rlRange->fNext;
142 }
143
144 // If the start of the range from the range list is before with
145 // the start of the range from the unicode set, split the range list range
146 // in two, with one part being before (wholly outside of) the unicode set
147 // and the other containing the rest.
148 // Then continue the loop; the post-split current range will then be skipped
149 // over
150 if (rlRange->fStartChar < inputSetRangeBegin) {
151 rlRange->split(inputSetRangeBegin, *fStatus);
152 if (U_FAILURE(*fStatus)) {
153 return;
154 }
155 continue;
156 }
157
158 // Same thing at the end of the ranges...
159 // If the end of the range from the range list doesn't coincide with
160 // the end of the range from the unicode set, split the range list
161 // range in two. The first part of the split range will be
162 // wholly inside the Unicode set.
163 if (rlRange->fEndChar > inputSetRangeEnd) {
164 rlRange->split(inputSetRangeEnd+1, *fStatus);
165 if (U_FAILURE(*fStatus)) {
166 return;
167 }
168 }
169
170 // The current rlRange is now entirely within the UnicodeSet range.
171 // Add this unicode set to the list of sets for this rlRange
172 if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
173 rlRange->fIncludesSets->addElement(usetNode, *fStatus);
174 if (U_FAILURE(*fStatus)) {
175 return;
176 }
177 }
178
179 // Advance over ranges that we are finished with.
180 if (inputSetRangeEnd == rlRange->fEndChar) {
181 inputSetRangeIndex++;
182 }
183 rlRange = rlRange->fNext;
184 }
185 }
186
187 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();}
188
189 //
190 // Group the above ranges, with each group consisting of one or more
191 // ranges that are in exactly the same set of original UnicodeSets.
192 // The groups are numbered, and these group numbers are the set of
193 // input symbols recognized by the run-time state machine.
194 //
195 // Numbering: # 0 (state table column 0) is unused.
196 // # 1 is reserved - table column 1 is for end-of-input
197 // # 2 is reserved - table column 2 is for beginning-in-input
198 // # 3 is the first range list.
199 //
200 RangeDescriptor *rlSearchRange;
201 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
202 for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
203 if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
204 rlRange->fNum = rlSearchRange->fNum;
205 break;
206 }
207 }
208 if (rlRange->fNum == 0) {
209 fGroupCount ++;
210 rlRange->fNum = fGroupCount+2;
211 rlRange->setDictionaryFlag();
212 addValToSets(rlRange->fIncludesSets, fGroupCount+2);
213 }
214 }
215
216 // Handle input sets that contain the special string {eof}.
217 // Column 1 of the state table is reserved for EOF on input.
218 // Column 2 is reserved for before-the-start-input.
219 // (This column can be optimized away later if there are no rule
220 // references to {bof}.)
221 // Add this column value (1 or 2) to the equivalent expression
222 // subtree for each UnicodeSet that contains the string {eof}
223 // Because {bof} and {eof} are not a characters in the normal sense,
224 // they doesn't affect the computation of ranges or TRIE.
225 static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0};
226 static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0};
227
228 UnicodeString eofString(eofUString);
229 UnicodeString bofString(bofUString);
230 for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
231 usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
232 if (usetNode==NULL) {
233 break;
234 }
235 UnicodeSet *inputSet = usetNode->fInputSet;
236 if (inputSet->contains(eofString)) {
237 addValToSet(usetNode, 1);
238 }
239 if (inputSet->contains(bofString)) {
240 addValToSet(usetNode, 2);
241 fSawBOF = TRUE;
242 }
243 }
244
245
246 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
247 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
248}
249
250
251//
252// Build the Trie table for mapping UChar32 values to the corresponding
253// range group number.
254//
255void RBBISetBuilder::buildTrie() {
256 RangeDescriptor *rlRange;
257
258 fTrie = utrie2_open(0, // Initial value for all code points.
259 0, // Error value for out-of-range input.
260 fStatus);
261
262 for (rlRange = fRangeList; rlRange!=0 && U_SUCCESS(*fStatus); rlRange=rlRange->fNext) {
263 utrie2_setRange32(fTrie,
264 rlRange->fStartChar, // Range start
265 rlRange->fEndChar, // Range end (inclusive)
266 rlRange->fNum, // value for range
267 TRUE, // Overwrite previously written values
268 fStatus);
269 }
270}
271
272
273void RBBISetBuilder::mergeCategories(IntPair categories) {
274 U_ASSERT(categories.first >= 1);
275 U_ASSERT(categories.second > categories.first);
276 for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) {
277 int32_t rangeNum = rd->fNum & ~DICT_BIT;
278 int32_t rangeDict = rd->fNum & DICT_BIT;
279 if (rangeNum == categories.second) {
280 rd->fNum = categories.first | rangeDict;
281 } else if (rangeNum > categories.second) {
282 rd->fNum--;
283 }
284 }
285 --fGroupCount;
286}
287
288
289//-----------------------------------------------------------------------------------
290//
291// getTrieSize() Return the size that will be required to serialize the Trie.
292//
293//-----------------------------------------------------------------------------------
294int32_t RBBISetBuilder::getTrieSize() {
295 if (U_FAILURE(*fStatus)) {
296 return 0;
297 }
298 utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus);
299 fTrieSize = utrie2_serialize(fTrie,
300 NULL, // Buffer
301 0, // Capacity
302 fStatus);
303 if (*fStatus == U_BUFFER_OVERFLOW_ERROR) {
304 *fStatus = U_ZERO_ERROR;
305 }
306 // RBBIDebugPrintf("Trie table size is %d\n", trieSize);
307 return fTrieSize;
308}
309
310
311//-----------------------------------------------------------------------------------
312//
313// serializeTrie() Put the serialized trie at the specified address.
314// Trust the caller to have given us enough memory.
315// getTrieSize() MUST be called first.
316//
317//-----------------------------------------------------------------------------------
318void RBBISetBuilder::serializeTrie(uint8_t *where) {
319 utrie2_serialize(fTrie,
320 where, // Buffer
321 fTrieSize, // Capacity
322 fStatus);
323}
324
325//------------------------------------------------------------------------
326//
327// addValToSets Add a runtime-mapped input value to each uset from a
328// list of uset nodes. (val corresponds to a state table column.)
329// For each of the original Unicode sets - which correspond
330// directly to uset nodes - a logically equivalent expression
331// is constructed in terms of the remapped runtime input
332// symbol set. This function adds one runtime input symbol to
333// a list of sets.
334//
335// The "logically equivalent expression" is the tree for an
336// or-ing together of all of the symbols that go into the set.
337//
338//------------------------------------------------------------------------
339void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
340 int32_t ix;
341
342 for (ix=0; ix<sets->size(); ix++) {
343 RBBINode *usetNode = (RBBINode *)sets->elementAt(ix);
344 addValToSet(usetNode, val);
345 }
346}
347
348void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) {
349 RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
350 if (leafNode == NULL) {
351 *fStatus = U_MEMORY_ALLOCATION_ERROR;
352 return;
353 }
354 leafNode->fVal = (unsigned short)val;
355 if (usetNode->fLeftChild == NULL) {
356 usetNode->fLeftChild = leafNode;
357 leafNode->fParent = usetNode;
358 } else {
359 // There are already input symbols present for this set.
360 // Set up an OR node, with the previous stuff as the left child
361 // and the new value as the right child.
362 RBBINode *orNode = new RBBINode(RBBINode::opOr);
363 if (orNode == NULL) {
364 *fStatus = U_MEMORY_ALLOCATION_ERROR;
365 return;
366 }
367 orNode->fLeftChild = usetNode->fLeftChild;
368 orNode->fRightChild = leafNode;
369 orNode->fLeftChild->fParent = orNode;
370 orNode->fRightChild->fParent = orNode;
371 usetNode->fLeftChild = orNode;
372 orNode->fParent = usetNode;
373 }
374}
375
376
377//------------------------------------------------------------------------
378//
379// getNumCharCategories
380//
381//------------------------------------------------------------------------
382int32_t RBBISetBuilder::getNumCharCategories() const {
383 return fGroupCount + 3;
384}
385
386
387//------------------------------------------------------------------------
388//
389// sawBOF
390//
391//------------------------------------------------------------------------
392UBool RBBISetBuilder::sawBOF() const {
393 return fSawBOF;
394}
395
396
397//------------------------------------------------------------------------
398//
399// getFirstChar Given a runtime RBBI character category, find
400// the first UChar32 that is in the set of chars
401// in the category.
402//------------------------------------------------------------------------
403UChar32 RBBISetBuilder::getFirstChar(int32_t category) const {
404 RangeDescriptor *rlRange;
405 UChar32 retVal = (UChar32)-1;
406 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
407 if (rlRange->fNum == category) {
408 retVal = rlRange->fStartChar;
409 break;
410 }
411 }
412 return retVal;
413}
414
415
416
417//------------------------------------------------------------------------
418//
419// printRanges A debugging function.
420// dump out all of the range definitions.
421//
422//------------------------------------------------------------------------
423#ifdef RBBI_DEBUG
424void RBBISetBuilder::printRanges() {
425 RangeDescriptor *rlRange;
426 int i;
427
428 RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
429 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
430 RBBIDebugPrintf("%2i %4x-%4x ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar);
431
432 for (i=0; i<rlRange->fIncludesSets->size(); i++) {
433 RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
434 UnicodeString setName = UNICODE_STRING("anon", 4);
435 RBBINode *setRef = usetNode->fParent;
436 if (setRef != NULL) {
437 RBBINode *varRef = setRef->fParent;
438 if (varRef != NULL && varRef->fType == RBBINode::varRef) {
439 setName = varRef->fText;
440 }
441 }
442 RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
443 }
444 RBBIDebugPrintf("\n");
445 }
446}
447#endif
448
449
450//------------------------------------------------------------------------
451//
452// printRangeGroups A debugging function.
453// dump out all of the range groups.
454//
455//------------------------------------------------------------------------
456#ifdef RBBI_DEBUG
457void RBBISetBuilder::printRangeGroups() {
458 RangeDescriptor *rlRange;
459 RangeDescriptor *tRange;
460 int i;
461 int lastPrintedGroupNum = 0;
462
463 RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
464 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
465 int groupNum = rlRange->fNum & 0xbfff;
466 if (groupNum > lastPrintedGroupNum) {
467 lastPrintedGroupNum = groupNum;
468 RBBIDebugPrintf("%2i ", groupNum);
469
470 if (rlRange->fNum & DICT_BIT) { RBBIDebugPrintf(" <DICT> ");}
471
472 for (i=0; i<rlRange->fIncludesSets->size(); i++) {
473 RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
474 UnicodeString setName = UNICODE_STRING("anon", 4);
475 RBBINode *setRef = usetNode->fParent;
476 if (setRef != NULL) {
477 RBBINode *varRef = setRef->fParent;
478 if (varRef != NULL && varRef->fType == RBBINode::varRef) {
479 setName = varRef->fText;
480 }
481 }
482 RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
483 }
484
485 i = 0;
486 for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) {
487 if (tRange->fNum == rlRange->fNum) {
488 if (i++ % 5 == 0) {
489 RBBIDebugPrintf("\n ");
490 }
491 RBBIDebugPrintf(" %05x-%05x", tRange->fStartChar, tRange->fEndChar);
492 }
493 }
494 RBBIDebugPrintf("\n");
495 }
496 }
497 RBBIDebugPrintf("\n");
498}
499#endif
500
501
502//------------------------------------------------------------------------
503//
504// printSets A debugging function.
505// dump out all of the set definitions.
506//
507//------------------------------------------------------------------------
508#ifdef RBBI_DEBUG
509void RBBISetBuilder::printSets() {
510 int i;
511
512 RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");
513 for (i=0; ; i++) {
514 RBBINode *usetNode;
515 RBBINode *setRef;
516 RBBINode *varRef;
517 UnicodeString setName;
518
519 usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i);
520 if (usetNode == NULL) {
521 break;
522 }
523
524 RBBIDebugPrintf("%3d ", i);
525 setName = UNICODE_STRING("anonymous", 9);
526 setRef = usetNode->fParent;
527 if (setRef != NULL) {
528 varRef = setRef->fParent;
529 if (varRef != NULL && varRef->fType == RBBINode::varRef) {
530 setName = varRef->fText;
531 }
532 }
533 RBBI_DEBUG_printUnicodeString(setName);
534 RBBIDebugPrintf(" ");
535 RBBI_DEBUG_printUnicodeString(usetNode->fText);
536 RBBIDebugPrintf("\n");
537 if (usetNode->fLeftChild != NULL) {
538 RBBINode::printTree(usetNode->fLeftChild, TRUE);
539 }
540 }
541 RBBIDebugPrintf("\n");
542}
543#endif
544
545
546
547//-------------------------------------------------------------------------------------
548//
549// RangeDescriptor copy constructor
550//
551//-------------------------------------------------------------------------------------
552
553RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) {
554 int i;
555
556 this->fStartChar = other.fStartChar;
557 this->fEndChar = other.fEndChar;
558 this->fNum = other.fNum;
559 this->fNext = NULL;
560 UErrorCode oldstatus = status;
561 this->fIncludesSets = new UVector(status);
562 if (U_FAILURE(oldstatus)) {
563 status = oldstatus;
564 }
565 if (U_FAILURE(status)) {
566 return;
567 }
568 /* test for NULL */
569 if (this->fIncludesSets == 0) {
570 status = U_MEMORY_ALLOCATION_ERROR;
571 return;
572 }
573
574 for (i=0; i<other.fIncludesSets->size(); i++) {
575 this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status);
576 }
577}
578
579
580//-------------------------------------------------------------------------------------
581//
582// RangeDesriptor default constructor
583//
584//-------------------------------------------------------------------------------------
585RangeDescriptor::RangeDescriptor(UErrorCode &status) {
586 this->fStartChar = 0;
587 this->fEndChar = 0;
588 this->fNum = 0;
589 this->fNext = NULL;
590 UErrorCode oldstatus = status;
591 this->fIncludesSets = new UVector(status);
592 if (U_FAILURE(oldstatus)) {
593 status = oldstatus;
594 }
595 if (U_FAILURE(status)) {
596 return;
597 }
598 /* test for NULL */
599 if(this->fIncludesSets == 0) {
600 status = U_MEMORY_ALLOCATION_ERROR;
601 return;
602 }
603
604}
605
606
607//-------------------------------------------------------------------------------------
608//
609// RangeDesriptor Destructor
610//
611//-------------------------------------------------------------------------------------
612RangeDescriptor::~RangeDescriptor() {
613 delete fIncludesSets;
614 fIncludesSets = NULL;
615}
616
617//-------------------------------------------------------------------------------------
618//
619// RangeDesriptor::split()
620//
621//-------------------------------------------------------------------------------------
622void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
623 U_ASSERT(where>fStartChar && where<=fEndChar);
624 RangeDescriptor *nr = new RangeDescriptor(*this, status);
625 if(nr == 0) {
626 status = U_MEMORY_ALLOCATION_ERROR;
627 return;
628 }
629 if (U_FAILURE(status)) {
630 delete nr;
631 return;
632 }
633 // RangeDescriptor copy constructor copies all fields.
634 // Only need to update those that are different after the split.
635 nr->fStartChar = where;
636 this->fEndChar = where-1;
637 nr->fNext = this->fNext;
638 this->fNext = nr;
639}
640
641
642//-------------------------------------------------------------------------------------
643//
644// RangeDescriptor::setDictionaryFlag
645//
646// Character Category Numbers that include characters from
647// the original Unicode Set named "dictionary" have bit 14
648// set to 1. The RBBI runtime engine uses this to trigger
649// use of the word dictionary.
650//
651// This function looks through the Unicode Sets that it
652// (the range) includes, and sets the bit in fNum when
653// "dictionary" is among them.
654//
655// TODO: a faster way would be to find the set node for
656// "dictionary" just once, rather than looking it
657// up by name every time.
658//
659//-------------------------------------------------------------------------------------
660void RangeDescriptor::setDictionaryFlag() {
661 int i;
662
663 static const char16_t *dictionary = u"dictionary";
664 for (i=0; i<fIncludesSets->size(); i++) {
665 RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i);
666 RBBINode *setRef = usetNode->fParent;
667 if (setRef != nullptr) {
668 RBBINode *varRef = setRef->fParent;
669 if (varRef && varRef->fType == RBBINode::varRef) {
670 const UnicodeString *setName = &varRef->fText;
671 if (setName->compare(dictionary, -1) == 0) {
672 fNum |= RBBISetBuilder::DICT_BIT;
673 break;
674 }
675 }
676 }
677 }
678}
679
680
681
682U_NAMESPACE_END
683
684#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
685