1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | // |
4 | // rbbisetb.cpp |
5 | // |
6 | /* |
7 | *************************************************************************** |
8 | * Copyright (C) 2002-2008 International Business Machines Corporation * |
9 | * and others. All rights reserved. * |
10 | *************************************************************************** |
11 | */ |
12 | // |
13 | // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules |
14 | // (part of the rule building process.) |
15 | // |
16 | // Starting with the rules parse tree from the scanner, |
17 | // |
18 | // - Enumerate the set of UnicodeSets that are referenced |
19 | // by the RBBI rules. |
20 | // - compute a set of non-overlapping character ranges |
21 | // with all characters within a range belonging to the same |
22 | // set of input uniocde sets. |
23 | // - Derive a set of non-overlapping UnicodeSet (like things) |
24 | // that will correspond to columns in the state table for |
25 | // the RBBI execution engine. All characters within one |
26 | // of these sets belong to the same set of the original |
27 | // UnicodeSets from the user's rules. |
28 | // - construct the trie table that maps input characters |
29 | // to the index of the matching non-overlapping set of set from |
30 | // the previous step. |
31 | // |
32 | |
33 | #include "unicode/utypes.h" |
34 | |
35 | #if !UCONFIG_NO_BREAK_ITERATION |
36 | |
37 | #include "unicode/uniset.h" |
38 | #include "utrie2.h" |
39 | #include "uvector.h" |
40 | #include "uassert.h" |
41 | #include "cmemory.h" |
42 | #include "cstring.h" |
43 | |
44 | #include "rbbisetb.h" |
45 | #include "rbbinode.h" |
46 | |
47 | U_NAMESPACE_BEGIN |
48 | |
49 | //------------------------------------------------------------------------ |
50 | // |
51 | // Constructor |
52 | // |
53 | //------------------------------------------------------------------------ |
54 | RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb) |
55 | { |
56 | fRB = rb; |
57 | fStatus = rb->fStatus; |
58 | fRangeList = 0; |
59 | fTrie = 0; |
60 | fTrieSize = 0; |
61 | fGroupCount = 0; |
62 | fSawBOF = FALSE; |
63 | } |
64 | |
65 | |
66 | //------------------------------------------------------------------------ |
67 | // |
68 | // Destructor |
69 | // |
70 | //------------------------------------------------------------------------ |
71 | RBBISetBuilder::~RBBISetBuilder() |
72 | { |
73 | RangeDescriptor *; |
74 | |
75 | // Walk through & delete the linked list of RangeDescriptors |
76 | for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) { |
77 | RangeDescriptor *r = nextRangeDesc; |
78 | nextRangeDesc = r->fNext; |
79 | delete r; |
80 | } |
81 | |
82 | utrie2_close(fTrie); |
83 | } |
84 | |
85 | |
86 | |
87 | |
88 | //------------------------------------------------------------------------ |
89 | // |
90 | // build Build the list of non-overlapping character ranges |
91 | // from the Unicode Sets. |
92 | // |
93 | //------------------------------------------------------------------------ |
94 | void RBBISetBuilder::buildRanges() { |
95 | RBBINode *usetNode; |
96 | RangeDescriptor *rlRange; |
97 | |
98 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets" )) {printSets();} |
99 | |
100 | // |
101 | // Initialize the process by creating a single range encompassing all characters |
102 | // that is in no sets. |
103 | // |
104 | fRangeList = new RangeDescriptor(*fStatus); // will check for status here |
105 | if (fRangeList == NULL) { |
106 | *fStatus = U_MEMORY_ALLOCATION_ERROR; |
107 | return; |
108 | } |
109 | fRangeList->fStartChar = 0; |
110 | fRangeList->fEndChar = 0x10ffff; |
111 | |
112 | if (U_FAILURE(*fStatus)) { |
113 | return; |
114 | } |
115 | |
116 | // |
117 | // Find the set of non-overlapping ranges of characters |
118 | // |
119 | int ni; |
120 | for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules |
121 | usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); |
122 | if (usetNode==NULL) { |
123 | break; |
124 | } |
125 | |
126 | UnicodeSet *inputSet = usetNode->fInputSet; |
127 | int32_t inputSetRangeCount = inputSet->getRangeCount(); |
128 | int inputSetRangeIndex = 0; |
129 | rlRange = fRangeList; |
130 | |
131 | for (;;) { |
132 | if (inputSetRangeIndex >= inputSetRangeCount) { |
133 | break; |
134 | } |
135 | UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex); |
136 | UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex); |
137 | |
138 | // skip over ranges from the range list that are completely |
139 | // below the current range from the input unicode set. |
140 | while (rlRange->fEndChar < inputSetRangeBegin) { |
141 | rlRange = rlRange->fNext; |
142 | } |
143 | |
144 | // If the start of the range from the range list is before with |
145 | // the start of the range from the unicode set, split the range list range |
146 | // in two, with one part being before (wholly outside of) the unicode set |
147 | // and the other containing the rest. |
148 | // Then continue the loop; the post-split current range will then be skipped |
149 | // over |
150 | if (rlRange->fStartChar < inputSetRangeBegin) { |
151 | rlRange->split(inputSetRangeBegin, *fStatus); |
152 | if (U_FAILURE(*fStatus)) { |
153 | return; |
154 | } |
155 | continue; |
156 | } |
157 | |
158 | // Same thing at the end of the ranges... |
159 | // If the end of the range from the range list doesn't coincide with |
160 | // the end of the range from the unicode set, split the range list |
161 | // range in two. The first part of the split range will be |
162 | // wholly inside the Unicode set. |
163 | if (rlRange->fEndChar > inputSetRangeEnd) { |
164 | rlRange->split(inputSetRangeEnd+1, *fStatus); |
165 | if (U_FAILURE(*fStatus)) { |
166 | return; |
167 | } |
168 | } |
169 | |
170 | // The current rlRange is now entirely within the UnicodeSet range. |
171 | // Add this unicode set to the list of sets for this rlRange |
172 | if (rlRange->fIncludesSets->indexOf(usetNode) == -1) { |
173 | rlRange->fIncludesSets->addElement(usetNode, *fStatus); |
174 | if (U_FAILURE(*fStatus)) { |
175 | return; |
176 | } |
177 | } |
178 | |
179 | // Advance over ranges that we are finished with. |
180 | if (inputSetRangeEnd == rlRange->fEndChar) { |
181 | inputSetRangeIndex++; |
182 | } |
183 | rlRange = rlRange->fNext; |
184 | } |
185 | } |
186 | |
187 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range" )) { printRanges();} |
188 | |
189 | // |
190 | // Group the above ranges, with each group consisting of one or more |
191 | // ranges that are in exactly the same set of original UnicodeSets. |
192 | // The groups are numbered, and these group numbers are the set of |
193 | // input symbols recognized by the run-time state machine. |
194 | // |
195 | // Numbering: # 0 (state table column 0) is unused. |
196 | // # 1 is reserved - table column 1 is for end-of-input |
197 | // # 2 is reserved - table column 2 is for beginning-in-input |
198 | // # 3 is the first range list. |
199 | // |
200 | RangeDescriptor *rlSearchRange; |
201 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { |
202 | for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) { |
203 | if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) { |
204 | rlRange->fNum = rlSearchRange->fNum; |
205 | break; |
206 | } |
207 | } |
208 | if (rlRange->fNum == 0) { |
209 | fGroupCount ++; |
210 | rlRange->fNum = fGroupCount+2; |
211 | rlRange->setDictionaryFlag(); |
212 | addValToSets(rlRange->fIncludesSets, fGroupCount+2); |
213 | } |
214 | } |
215 | |
216 | // Handle input sets that contain the special string {eof}. |
217 | // Column 1 of the state table is reserved for EOF on input. |
218 | // Column 2 is reserved for before-the-start-input. |
219 | // (This column can be optimized away later if there are no rule |
220 | // references to {bof}.) |
221 | // Add this column value (1 or 2) to the equivalent expression |
222 | // subtree for each UnicodeSet that contains the string {eof} |
223 | // Because {bof} and {eof} are not a characters in the normal sense, |
224 | // they doesn't affect the computation of ranges or TRIE. |
225 | static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0}; |
226 | static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0}; |
227 | |
228 | UnicodeString eofString(eofUString); |
229 | UnicodeString bofString(bofUString); |
230 | for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules |
231 | usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); |
232 | if (usetNode==NULL) { |
233 | break; |
234 | } |
235 | UnicodeSet *inputSet = usetNode->fInputSet; |
236 | if (inputSet->contains(eofString)) { |
237 | addValToSet(usetNode, 1); |
238 | } |
239 | if (inputSet->contains(bofString)) { |
240 | addValToSet(usetNode, 2); |
241 | fSawBOF = TRUE; |
242 | } |
243 | } |
244 | |
245 | |
246 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup" )) {printRangeGroups();} |
247 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets" )) {printSets();} |
248 | } |
249 | |
250 | |
251 | // |
252 | // Build the Trie table for mapping UChar32 values to the corresponding |
253 | // range group number. |
254 | // |
255 | void RBBISetBuilder::buildTrie() { |
256 | RangeDescriptor *rlRange; |
257 | |
258 | fTrie = utrie2_open(0, // Initial value for all code points. |
259 | 0, // Error value for out-of-range input. |
260 | fStatus); |
261 | |
262 | for (rlRange = fRangeList; rlRange!=0 && U_SUCCESS(*fStatus); rlRange=rlRange->fNext) { |
263 | utrie2_setRange32(fTrie, |
264 | rlRange->fStartChar, // Range start |
265 | rlRange->fEndChar, // Range end (inclusive) |
266 | rlRange->fNum, // value for range |
267 | TRUE, // Overwrite previously written values |
268 | fStatus); |
269 | } |
270 | } |
271 | |
272 | |
273 | void RBBISetBuilder::mergeCategories(IntPair categories) { |
274 | U_ASSERT(categories.first >= 1); |
275 | U_ASSERT(categories.second > categories.first); |
276 | for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) { |
277 | int32_t rangeNum = rd->fNum & ~DICT_BIT; |
278 | int32_t rangeDict = rd->fNum & DICT_BIT; |
279 | if (rangeNum == categories.second) { |
280 | rd->fNum = categories.first | rangeDict; |
281 | } else if (rangeNum > categories.second) { |
282 | rd->fNum--; |
283 | } |
284 | } |
285 | --fGroupCount; |
286 | } |
287 | |
288 | |
289 | //----------------------------------------------------------------------------------- |
290 | // |
291 | // getTrieSize() Return the size that will be required to serialize the Trie. |
292 | // |
293 | //----------------------------------------------------------------------------------- |
294 | int32_t RBBISetBuilder::getTrieSize() { |
295 | if (U_FAILURE(*fStatus)) { |
296 | return 0; |
297 | } |
298 | utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus); |
299 | fTrieSize = utrie2_serialize(fTrie, |
300 | NULL, // Buffer |
301 | 0, // Capacity |
302 | fStatus); |
303 | if (*fStatus == U_BUFFER_OVERFLOW_ERROR) { |
304 | *fStatus = U_ZERO_ERROR; |
305 | } |
306 | // RBBIDebugPrintf("Trie table size is %d\n", trieSize); |
307 | return fTrieSize; |
308 | } |
309 | |
310 | |
311 | //----------------------------------------------------------------------------------- |
312 | // |
313 | // serializeTrie() Put the serialized trie at the specified address. |
314 | // Trust the caller to have given us enough memory. |
315 | // getTrieSize() MUST be called first. |
316 | // |
317 | //----------------------------------------------------------------------------------- |
318 | void RBBISetBuilder::serializeTrie(uint8_t *where) { |
319 | utrie2_serialize(fTrie, |
320 | where, // Buffer |
321 | fTrieSize, // Capacity |
322 | fStatus); |
323 | } |
324 | |
325 | //------------------------------------------------------------------------ |
326 | // |
327 | // addValToSets Add a runtime-mapped input value to each uset from a |
328 | // list of uset nodes. (val corresponds to a state table column.) |
329 | // For each of the original Unicode sets - which correspond |
330 | // directly to uset nodes - a logically equivalent expression |
331 | // is constructed in terms of the remapped runtime input |
332 | // symbol set. This function adds one runtime input symbol to |
333 | // a list of sets. |
334 | // |
335 | // The "logically equivalent expression" is the tree for an |
336 | // or-ing together of all of the symbols that go into the set. |
337 | // |
338 | //------------------------------------------------------------------------ |
339 | void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) { |
340 | int32_t ix; |
341 | |
342 | for (ix=0; ix<sets->size(); ix++) { |
343 | RBBINode *usetNode = (RBBINode *)sets->elementAt(ix); |
344 | addValToSet(usetNode, val); |
345 | } |
346 | } |
347 | |
348 | void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) { |
349 | RBBINode *leafNode = new RBBINode(RBBINode::leafChar); |
350 | if (leafNode == NULL) { |
351 | *fStatus = U_MEMORY_ALLOCATION_ERROR; |
352 | return; |
353 | } |
354 | leafNode->fVal = (unsigned short)val; |
355 | if (usetNode->fLeftChild == NULL) { |
356 | usetNode->fLeftChild = leafNode; |
357 | leafNode->fParent = usetNode; |
358 | } else { |
359 | // There are already input symbols present for this set. |
360 | // Set up an OR node, with the previous stuff as the left child |
361 | // and the new value as the right child. |
362 | RBBINode *orNode = new RBBINode(RBBINode::opOr); |
363 | if (orNode == NULL) { |
364 | *fStatus = U_MEMORY_ALLOCATION_ERROR; |
365 | return; |
366 | } |
367 | orNode->fLeftChild = usetNode->fLeftChild; |
368 | orNode->fRightChild = leafNode; |
369 | orNode->fLeftChild->fParent = orNode; |
370 | orNode->fRightChild->fParent = orNode; |
371 | usetNode->fLeftChild = orNode; |
372 | orNode->fParent = usetNode; |
373 | } |
374 | } |
375 | |
376 | |
377 | //------------------------------------------------------------------------ |
378 | // |
379 | // getNumCharCategories |
380 | // |
381 | //------------------------------------------------------------------------ |
382 | int32_t RBBISetBuilder::getNumCharCategories() const { |
383 | return fGroupCount + 3; |
384 | } |
385 | |
386 | |
387 | //------------------------------------------------------------------------ |
388 | // |
389 | // sawBOF |
390 | // |
391 | //------------------------------------------------------------------------ |
392 | UBool RBBISetBuilder::sawBOF() const { |
393 | return fSawBOF; |
394 | } |
395 | |
396 | |
397 | //------------------------------------------------------------------------ |
398 | // |
399 | // getFirstChar Given a runtime RBBI character category, find |
400 | // the first UChar32 that is in the set of chars |
401 | // in the category. |
402 | //------------------------------------------------------------------------ |
403 | UChar32 RBBISetBuilder::getFirstChar(int32_t category) const { |
404 | RangeDescriptor *rlRange; |
405 | UChar32 retVal = (UChar32)-1; |
406 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { |
407 | if (rlRange->fNum == category) { |
408 | retVal = rlRange->fStartChar; |
409 | break; |
410 | } |
411 | } |
412 | return retVal; |
413 | } |
414 | |
415 | |
416 | |
417 | //------------------------------------------------------------------------ |
418 | // |
419 | // printRanges A debugging function. |
420 | // dump out all of the range definitions. |
421 | // |
422 | //------------------------------------------------------------------------ |
423 | #ifdef RBBI_DEBUG |
424 | void RBBISetBuilder::printRanges() { |
425 | RangeDescriptor *rlRange; |
426 | int i; |
427 | |
428 | RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n" ); |
429 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { |
430 | RBBIDebugPrintf("%2i %4x-%4x " , rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar); |
431 | |
432 | for (i=0; i<rlRange->fIncludesSets->size(); i++) { |
433 | RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); |
434 | UnicodeString setName = UNICODE_STRING("anon" , 4); |
435 | RBBINode *setRef = usetNode->fParent; |
436 | if (setRef != NULL) { |
437 | RBBINode *varRef = setRef->fParent; |
438 | if (varRef != NULL && varRef->fType == RBBINode::varRef) { |
439 | setName = varRef->fText; |
440 | } |
441 | } |
442 | RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" " ); |
443 | } |
444 | RBBIDebugPrintf("\n" ); |
445 | } |
446 | } |
447 | #endif |
448 | |
449 | |
450 | //------------------------------------------------------------------------ |
451 | // |
452 | // printRangeGroups A debugging function. |
453 | // dump out all of the range groups. |
454 | // |
455 | //------------------------------------------------------------------------ |
456 | #ifdef RBBI_DEBUG |
457 | void RBBISetBuilder::printRangeGroups() { |
458 | RangeDescriptor *rlRange; |
459 | RangeDescriptor *tRange; |
460 | int i; |
461 | int lastPrintedGroupNum = 0; |
462 | |
463 | RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n" ); |
464 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { |
465 | int groupNum = rlRange->fNum & 0xbfff; |
466 | if (groupNum > lastPrintedGroupNum) { |
467 | lastPrintedGroupNum = groupNum; |
468 | RBBIDebugPrintf("%2i " , groupNum); |
469 | |
470 | if (rlRange->fNum & DICT_BIT) { RBBIDebugPrintf(" <DICT> " );} |
471 | |
472 | for (i=0; i<rlRange->fIncludesSets->size(); i++) { |
473 | RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); |
474 | UnicodeString setName = UNICODE_STRING("anon" , 4); |
475 | RBBINode *setRef = usetNode->fParent; |
476 | if (setRef != NULL) { |
477 | RBBINode *varRef = setRef->fParent; |
478 | if (varRef != NULL && varRef->fType == RBBINode::varRef) { |
479 | setName = varRef->fText; |
480 | } |
481 | } |
482 | RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" " ); |
483 | } |
484 | |
485 | i = 0; |
486 | for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) { |
487 | if (tRange->fNum == rlRange->fNum) { |
488 | if (i++ % 5 == 0) { |
489 | RBBIDebugPrintf("\n " ); |
490 | } |
491 | RBBIDebugPrintf(" %05x-%05x" , tRange->fStartChar, tRange->fEndChar); |
492 | } |
493 | } |
494 | RBBIDebugPrintf("\n" ); |
495 | } |
496 | } |
497 | RBBIDebugPrintf("\n" ); |
498 | } |
499 | #endif |
500 | |
501 | |
502 | //------------------------------------------------------------------------ |
503 | // |
504 | // printSets A debugging function. |
505 | // dump out all of the set definitions. |
506 | // |
507 | //------------------------------------------------------------------------ |
508 | #ifdef RBBI_DEBUG |
509 | void RBBISetBuilder::printSets() { |
510 | int i; |
511 | |
512 | RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n" ); |
513 | for (i=0; ; i++) { |
514 | RBBINode *usetNode; |
515 | RBBINode *setRef; |
516 | RBBINode *varRef; |
517 | UnicodeString setName; |
518 | |
519 | usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i); |
520 | if (usetNode == NULL) { |
521 | break; |
522 | } |
523 | |
524 | RBBIDebugPrintf("%3d " , i); |
525 | setName = UNICODE_STRING("anonymous" , 9); |
526 | setRef = usetNode->fParent; |
527 | if (setRef != NULL) { |
528 | varRef = setRef->fParent; |
529 | if (varRef != NULL && varRef->fType == RBBINode::varRef) { |
530 | setName = varRef->fText; |
531 | } |
532 | } |
533 | RBBI_DEBUG_printUnicodeString(setName); |
534 | RBBIDebugPrintf(" " ); |
535 | RBBI_DEBUG_printUnicodeString(usetNode->fText); |
536 | RBBIDebugPrintf("\n" ); |
537 | if (usetNode->fLeftChild != NULL) { |
538 | RBBINode::printTree(usetNode->fLeftChild, TRUE); |
539 | } |
540 | } |
541 | RBBIDebugPrintf("\n" ); |
542 | } |
543 | #endif |
544 | |
545 | |
546 | |
547 | //------------------------------------------------------------------------------------- |
548 | // |
549 | // RangeDescriptor copy constructor |
550 | // |
551 | //------------------------------------------------------------------------------------- |
552 | |
553 | RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) { |
554 | int i; |
555 | |
556 | this->fStartChar = other.fStartChar; |
557 | this->fEndChar = other.fEndChar; |
558 | this->fNum = other.fNum; |
559 | this->fNext = NULL; |
560 | UErrorCode oldstatus = status; |
561 | this->fIncludesSets = new UVector(status); |
562 | if (U_FAILURE(oldstatus)) { |
563 | status = oldstatus; |
564 | } |
565 | if (U_FAILURE(status)) { |
566 | return; |
567 | } |
568 | /* test for NULL */ |
569 | if (this->fIncludesSets == 0) { |
570 | status = U_MEMORY_ALLOCATION_ERROR; |
571 | return; |
572 | } |
573 | |
574 | for (i=0; i<other.fIncludesSets->size(); i++) { |
575 | this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status); |
576 | } |
577 | } |
578 | |
579 | |
580 | //------------------------------------------------------------------------------------- |
581 | // |
582 | // RangeDesriptor default constructor |
583 | // |
584 | //------------------------------------------------------------------------------------- |
585 | RangeDescriptor::RangeDescriptor(UErrorCode &status) { |
586 | this->fStartChar = 0; |
587 | this->fEndChar = 0; |
588 | this->fNum = 0; |
589 | this->fNext = NULL; |
590 | UErrorCode oldstatus = status; |
591 | this->fIncludesSets = new UVector(status); |
592 | if (U_FAILURE(oldstatus)) { |
593 | status = oldstatus; |
594 | } |
595 | if (U_FAILURE(status)) { |
596 | return; |
597 | } |
598 | /* test for NULL */ |
599 | if(this->fIncludesSets == 0) { |
600 | status = U_MEMORY_ALLOCATION_ERROR; |
601 | return; |
602 | } |
603 | |
604 | } |
605 | |
606 | |
607 | //------------------------------------------------------------------------------------- |
608 | // |
609 | // RangeDesriptor Destructor |
610 | // |
611 | //------------------------------------------------------------------------------------- |
612 | RangeDescriptor::~RangeDescriptor() { |
613 | delete fIncludesSets; |
614 | fIncludesSets = NULL; |
615 | } |
616 | |
617 | //------------------------------------------------------------------------------------- |
618 | // |
619 | // RangeDesriptor::split() |
620 | // |
621 | //------------------------------------------------------------------------------------- |
622 | void RangeDescriptor::split(UChar32 where, UErrorCode &status) { |
623 | U_ASSERT(where>fStartChar && where<=fEndChar); |
624 | RangeDescriptor *nr = new RangeDescriptor(*this, status); |
625 | if(nr == 0) { |
626 | status = U_MEMORY_ALLOCATION_ERROR; |
627 | return; |
628 | } |
629 | if (U_FAILURE(status)) { |
630 | delete nr; |
631 | return; |
632 | } |
633 | // RangeDescriptor copy constructor copies all fields. |
634 | // Only need to update those that are different after the split. |
635 | nr->fStartChar = where; |
636 | this->fEndChar = where-1; |
637 | nr->fNext = this->fNext; |
638 | this->fNext = nr; |
639 | } |
640 | |
641 | |
642 | //------------------------------------------------------------------------------------- |
643 | // |
644 | // RangeDescriptor::setDictionaryFlag |
645 | // |
646 | // Character Category Numbers that include characters from |
647 | // the original Unicode Set named "dictionary" have bit 14 |
648 | // set to 1. The RBBI runtime engine uses this to trigger |
649 | // use of the word dictionary. |
650 | // |
651 | // This function looks through the Unicode Sets that it |
652 | // (the range) includes, and sets the bit in fNum when |
653 | // "dictionary" is among them. |
654 | // |
655 | // TODO: a faster way would be to find the set node for |
656 | // "dictionary" just once, rather than looking it |
657 | // up by name every time. |
658 | // |
659 | //------------------------------------------------------------------------------------- |
660 | void RangeDescriptor::setDictionaryFlag() { |
661 | int i; |
662 | |
663 | static const char16_t *dictionary = u"dictionary" ; |
664 | for (i=0; i<fIncludesSets->size(); i++) { |
665 | RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i); |
666 | RBBINode *setRef = usetNode->fParent; |
667 | if (setRef != nullptr) { |
668 | RBBINode *varRef = setRef->fParent; |
669 | if (varRef && varRef->fType == RBBINode::varRef) { |
670 | const UnicodeString *setName = &varRef->fText; |
671 | if (setName->compare(dictionary, -1) == 0) { |
672 | fNum |= RBBISetBuilder::DICT_BIT; |
673 | break; |
674 | } |
675 | } |
676 | } |
677 | } |
678 | } |
679 | |
680 | |
681 | |
682 | U_NAMESPACE_END |
683 | |
684 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
685 | |