1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | // |
4 | // rbbisetb.cpp |
5 | // |
6 | /* |
7 | *************************************************************************** |
8 | * Copyright (C) 2002-2008 International Business Machines Corporation * |
9 | * and others. All rights reserved. * |
10 | *************************************************************************** |
11 | */ |
12 | // |
13 | // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules |
14 | // (part of the rule building process.) |
15 | // |
16 | // Starting with the rules parse tree from the scanner, |
17 | // |
18 | // - Enumerate the set of UnicodeSets that are referenced |
19 | // by the RBBI rules. |
20 | // - compute a set of non-overlapping character ranges |
21 | // with all characters within a range belonging to the same |
22 | // set of input unicode sets. |
23 | // - Derive a set of non-overlapping UnicodeSet (like things) |
24 | // that will correspond to columns in the state table for |
25 | // the RBBI execution engine. All characters within one |
26 | // of these sets belong to the same set of the original |
27 | // UnicodeSets from the user's rules. |
28 | // - construct the trie table that maps input characters |
29 | // to the index of the matching non-overlapping set of sets from |
30 | // the previous step. |
31 | // |
32 | |
33 | #include "unicode/utypes.h" |
34 | |
35 | #if !UCONFIG_NO_BREAK_ITERATION |
36 | |
37 | #include "unicode/uniset.h" |
38 | #include "uvector.h" |
39 | #include "uassert.h" |
40 | #include "cmemory.h" |
41 | #include "cstring.h" |
42 | |
43 | #include "rbbisetb.h" |
44 | #include "rbbinode.h" |
45 | |
46 | U_NAMESPACE_BEGIN |
47 | |
// Largest number of character categories for which getTrieSize() still builds
// the trie with 8-bit values; above this it switches to 16-bit values.
constexpr int32_t kMaxCharCategoriesFor8BitsTrie{255};
49 | //------------------------------------------------------------------------ |
50 | // |
51 | // Constructor |
52 | // |
53 | //------------------------------------------------------------------------ |
54 | RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb) |
55 | { |
56 | fRB = rb; |
57 | fStatus = rb->fStatus; |
58 | fRangeList = nullptr; |
59 | fMutableTrie = nullptr; |
60 | fTrie = nullptr; |
61 | fTrieSize = 0; |
62 | fGroupCount = 0; |
63 | fSawBOF = false; |
64 | } |
65 | |
66 | |
67 | //------------------------------------------------------------------------ |
68 | // |
69 | // Destructor |
70 | // |
71 | //------------------------------------------------------------------------ |
72 | RBBISetBuilder::~RBBISetBuilder() |
73 | { |
74 | RangeDescriptor *; |
75 | |
76 | // Walk through & delete the linked list of RangeDescriptors |
77 | for (nextRangeDesc = fRangeList; nextRangeDesc!=nullptr;) { |
78 | RangeDescriptor *r = nextRangeDesc; |
79 | nextRangeDesc = r->fNext; |
80 | delete r; |
81 | } |
82 | |
83 | ucptrie_close(fTrie); |
84 | umutablecptrie_close(fMutableTrie); |
85 | } |
86 | |
87 | |
88 | |
89 | |
90 | //------------------------------------------------------------------------ |
91 | // |
92 | // buildRanges Build the list of non-overlapping character ranges |
93 | // from the Unicode Sets. |
94 | // |
95 | //------------------------------------------------------------------------ |
96 | void RBBISetBuilder::buildRanges() { |
97 | RBBINode *usetNode; |
98 | RangeDescriptor *rlRange; |
99 | |
100 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets" )) {printSets();} |
101 | |
102 | // |
103 | // Initialize the process by creating a single range encompassing all characters |
104 | // that is in no sets. |
105 | // |
106 | fRangeList = new RangeDescriptor(*fStatus); // will check for status here |
107 | if (fRangeList == nullptr) { |
108 | *fStatus = U_MEMORY_ALLOCATION_ERROR; |
109 | return; |
110 | } |
111 | fRangeList->fStartChar = 0; |
112 | fRangeList->fEndChar = 0x10ffff; |
113 | |
114 | if (U_FAILURE(*fStatus)) { |
115 | return; |
116 | } |
117 | |
118 | // |
119 | // Find the set of non-overlapping ranges of characters |
120 | // |
121 | int ni; |
122 | for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules |
123 | usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); |
124 | if (usetNode==nullptr) { |
125 | break; |
126 | } |
127 | |
128 | UnicodeSet *inputSet = usetNode->fInputSet; |
129 | int32_t inputSetRangeCount = inputSet->getRangeCount(); |
130 | int inputSetRangeIndex = 0; |
131 | rlRange = fRangeList; |
132 | |
133 | for (;;) { |
134 | if (inputSetRangeIndex >= inputSetRangeCount) { |
135 | break; |
136 | } |
137 | UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex); |
138 | UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex); |
139 | |
140 | // skip over ranges from the range list that are completely |
141 | // below the current range from the input unicode set. |
142 | while (rlRange->fEndChar < inputSetRangeBegin) { |
143 | rlRange = rlRange->fNext; |
144 | } |
145 | |
146 | // If the start of the range from the range list is before with |
147 | // the start of the range from the unicode set, split the range list range |
148 | // in two, with one part being before (wholly outside of) the unicode set |
149 | // and the other containing the rest. |
150 | // Then continue the loop; the post-split current range will then be skipped |
151 | // over |
152 | if (rlRange->fStartChar < inputSetRangeBegin) { |
153 | rlRange->split(inputSetRangeBegin, *fStatus); |
154 | if (U_FAILURE(*fStatus)) { |
155 | return; |
156 | } |
157 | continue; |
158 | } |
159 | |
160 | // Same thing at the end of the ranges... |
161 | // If the end of the range from the range list doesn't coincide with |
162 | // the end of the range from the unicode set, split the range list |
163 | // range in two. The first part of the split range will be |
164 | // wholly inside the Unicode set. |
165 | if (rlRange->fEndChar > inputSetRangeEnd) { |
166 | rlRange->split(inputSetRangeEnd+1, *fStatus); |
167 | if (U_FAILURE(*fStatus)) { |
168 | return; |
169 | } |
170 | } |
171 | |
172 | // The current rlRange is now entirely within the UnicodeSet range. |
173 | // Add this unicode set to the list of sets for this rlRange |
174 | if (rlRange->fIncludesSets->indexOf(usetNode) == -1) { |
175 | rlRange->fIncludesSets->addElement(usetNode, *fStatus); |
176 | if (U_FAILURE(*fStatus)) { |
177 | return; |
178 | } |
179 | } |
180 | |
181 | // Advance over ranges that we are finished with. |
182 | if (inputSetRangeEnd == rlRange->fEndChar) { |
183 | inputSetRangeIndex++; |
184 | } |
185 | rlRange = rlRange->fNext; |
186 | } |
187 | } |
188 | |
189 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range" )) { printRanges();} |
190 | |
191 | // |
192 | // Group the above ranges, with each group consisting of one or more |
193 | // ranges that are in exactly the same set of original UnicodeSets. |
194 | // The groups are numbered, and these group numbers are the set of |
195 | // input symbols recognized by the run-time state machine. |
196 | // |
197 | // Numbering: # 0 (state table column 0) is unused. |
198 | // # 1 is reserved - table column 1 is for end-of-input |
199 | // # 2 is reserved - table column 2 is for beginning-of-input |
200 | // # 3 is the first range list. |
201 | // |
202 | RangeDescriptor *rlSearchRange; |
203 | int32_t dictGroupCount = 0; |
204 | |
205 | for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) { |
206 | for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) { |
207 | if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) { |
208 | rlRange->fNum = rlSearchRange->fNum; |
209 | rlRange->fIncludesDict = rlSearchRange->fIncludesDict; |
210 | break; |
211 | } |
212 | } |
213 | if (rlRange->fNum == 0) { |
214 | rlRange->fFirstInGroup = true; |
215 | if (rlRange->isDictionaryRange()) { |
216 | rlRange->fNum = ++dictGroupCount; |
217 | rlRange->fIncludesDict = true; |
218 | } else { |
219 | fGroupCount++; |
220 | rlRange->fNum = fGroupCount+2; |
221 | addValToSets(rlRange->fIncludesSets, rlRange->fNum); |
222 | } |
223 | } |
224 | } |
225 | |
226 | // Move the character category numbers for any dictionary ranges up, so that they |
227 | // immediately follow the non-dictionary ranges. |
228 | |
229 | fDictCategoriesStart = fGroupCount + 3; |
230 | for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) { |
231 | if (rlRange->fIncludesDict) { |
232 | rlRange->fNum += fDictCategoriesStart - 1; |
233 | if (rlRange->fFirstInGroup) { |
234 | addValToSets(rlRange->fIncludesSets, rlRange->fNum); |
235 | } |
236 | } |
237 | } |
238 | fGroupCount += dictGroupCount; |
239 | |
240 | |
241 | // Handle input sets that contain the special string {eof}. |
242 | // Column 1 of the state table is reserved for EOF on input. |
243 | // Column 2 is reserved for before-the-start-input. |
244 | // (This column can be optimized away later if there are no rule |
245 | // references to {bof}.) |
246 | // Add this column value (1 or 2) to the equivalent expression |
247 | // subtree for each UnicodeSet that contains the string {eof} |
248 | // Because {bof} and {eof} are not characters in the normal sense, |
249 | // they don't affect the computation of the ranges or TRIE. |
250 | |
251 | UnicodeString eofString(u"eof" ); |
252 | UnicodeString bofString(u"bof" ); |
253 | for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules |
254 | usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); |
255 | if (usetNode==nullptr) { |
256 | break; |
257 | } |
258 | UnicodeSet *inputSet = usetNode->fInputSet; |
259 | if (inputSet->contains(eofString)) { |
260 | addValToSet(usetNode, 1); |
261 | } |
262 | if (inputSet->contains(bofString)) { |
263 | addValToSet(usetNode, 2); |
264 | fSawBOF = true; |
265 | } |
266 | } |
267 | |
268 | |
269 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup" )) {printRangeGroups();} |
270 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets" )) {printSets();} |
271 | } |
272 | |
273 | |
274 | // |
275 | // Build the Trie table for mapping UChar32 values to the corresponding |
276 | // range group number. |
277 | // |
278 | void RBBISetBuilder::buildTrie() { |
279 | fMutableTrie = umutablecptrie_open( |
280 | 0, // Initial value for all code points. |
281 | 0, // Error value for out-of-range input. |
282 | fStatus); |
283 | |
284 | for (RangeDescriptor *range = fRangeList; range!=nullptr && U_SUCCESS(*fStatus); range=range->fNext) { |
285 | umutablecptrie_setRange(fMutableTrie, |
286 | range->fStartChar, // Range start |
287 | range->fEndChar, // Range end (inclusive) |
288 | range->fNum, // value for range |
289 | fStatus); |
290 | } |
291 | } |
292 | |
293 | |
294 | void RBBISetBuilder::mergeCategories(IntPair categories) { |
295 | U_ASSERT(categories.first >= 1); |
296 | U_ASSERT(categories.second > categories.first); |
297 | U_ASSERT((categories.first < fDictCategoriesStart && categories.second < fDictCategoriesStart) || |
298 | (categories.first >= fDictCategoriesStart && categories.second >= fDictCategoriesStart)); |
299 | |
300 | for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) { |
301 | int32_t rangeNum = rd->fNum; |
302 | if (rangeNum == categories.second) { |
303 | rd->fNum = categories.first; |
304 | } else if (rangeNum > categories.second) { |
305 | rd->fNum--; |
306 | } |
307 | } |
308 | --fGroupCount; |
309 | if (categories.second <= fDictCategoriesStart) { |
310 | --fDictCategoriesStart; |
311 | } |
312 | } |
313 | |
314 | |
315 | //----------------------------------------------------------------------------------- |
316 | // |
317 | // getTrieSize() Return the size that will be required to serialize the Trie. |
318 | // |
319 | //----------------------------------------------------------------------------------- |
320 | int32_t RBBISetBuilder::getTrieSize() { |
321 | if (U_FAILURE(*fStatus)) { |
322 | return 0; |
323 | } |
324 | if (fTrie == nullptr) { |
325 | bool use8Bits = getNumCharCategories() <= kMaxCharCategoriesFor8BitsTrie; |
326 | fTrie = umutablecptrie_buildImmutable( |
327 | fMutableTrie, |
328 | UCPTRIE_TYPE_FAST, |
329 | use8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16, |
330 | fStatus); |
331 | fTrieSize = ucptrie_toBinary(fTrie, nullptr, 0, fStatus); |
332 | if (*fStatus == U_BUFFER_OVERFLOW_ERROR) { |
333 | *fStatus = U_ZERO_ERROR; |
334 | } |
335 | } |
336 | return fTrieSize; |
337 | } |
338 | |
339 | |
340 | //----------------------------------------------------------------------------------- |
341 | // |
342 | // serializeTrie() Put the serialized trie at the specified address. |
343 | // Trust the caller to have given us enough memory. |
344 | // getTrieSize() MUST be called first. |
345 | // |
346 | //----------------------------------------------------------------------------------- |
347 | void RBBISetBuilder::serializeTrie(uint8_t *where) { |
348 | ucptrie_toBinary(fTrie, |
349 | where, // Buffer |
350 | fTrieSize, // Capacity |
351 | fStatus); |
352 | } |
353 | |
354 | //------------------------------------------------------------------------ |
355 | // |
356 | // addValToSets Add a runtime-mapped input value to each uset from a |
357 | // list of uset nodes. (val corresponds to a state table column.) |
358 | // For each of the original Unicode sets - which correspond |
359 | // directly to uset nodes - a logically equivalent expression |
360 | // is constructed in terms of the remapped runtime input |
361 | // symbol set. This function adds one runtime input symbol to |
362 | // a list of sets. |
363 | // |
364 | // The "logically equivalent expression" is the tree for an |
365 | // or-ing together of all of the symbols that go into the set. |
366 | // |
367 | //------------------------------------------------------------------------ |
368 | void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) { |
369 | int32_t ix; |
370 | |
371 | for (ix=0; ix<sets->size(); ix++) { |
372 | RBBINode *usetNode = (RBBINode *)sets->elementAt(ix); |
373 | addValToSet(usetNode, val); |
374 | } |
375 | } |
376 | |
377 | void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) { |
378 | RBBINode *leafNode = new RBBINode(RBBINode::leafChar); |
379 | if (leafNode == nullptr) { |
380 | *fStatus = U_MEMORY_ALLOCATION_ERROR; |
381 | return; |
382 | } |
383 | leafNode->fVal = (unsigned short)val; |
384 | if (usetNode->fLeftChild == nullptr) { |
385 | usetNode->fLeftChild = leafNode; |
386 | leafNode->fParent = usetNode; |
387 | } else { |
388 | // There are already input symbols present for this set. |
389 | // Set up an OR node, with the previous stuff as the left child |
390 | // and the new value as the right child. |
391 | RBBINode *orNode = new RBBINode(RBBINode::opOr); |
392 | if (orNode == nullptr) { |
393 | *fStatus = U_MEMORY_ALLOCATION_ERROR; |
394 | return; |
395 | } |
396 | orNode->fLeftChild = usetNode->fLeftChild; |
397 | orNode->fRightChild = leafNode; |
398 | orNode->fLeftChild->fParent = orNode; |
399 | orNode->fRightChild->fParent = orNode; |
400 | usetNode->fLeftChild = orNode; |
401 | orNode->fParent = usetNode; |
402 | } |
403 | } |
404 | |
405 | |
406 | //------------------------------------------------------------------------ |
407 | // |
408 | // getNumCharCategories |
409 | // |
410 | //------------------------------------------------------------------------ |
411 | int32_t RBBISetBuilder::getNumCharCategories() const { |
412 | return fGroupCount + 3; |
413 | } |
414 | |
415 | |
416 | //------------------------------------------------------------------------ |
417 | // |
418 | // getDictCategoriesStart |
419 | // |
420 | //------------------------------------------------------------------------ |
421 | int32_t RBBISetBuilder::getDictCategoriesStart() const { |
422 | return fDictCategoriesStart; |
423 | } |
424 | |
425 | |
426 | //------------------------------------------------------------------------ |
427 | // |
428 | // sawBOF |
429 | // |
430 | //------------------------------------------------------------------------ |
431 | UBool RBBISetBuilder::sawBOF() const { |
432 | return fSawBOF; |
433 | } |
434 | |
435 | |
436 | //------------------------------------------------------------------------ |
437 | // |
438 | // getFirstChar Given a runtime RBBI character category, find |
439 | // the first UChar32 that is in the set of chars |
440 | // in the category. |
441 | //------------------------------------------------------------------------ |
442 | UChar32 RBBISetBuilder::getFirstChar(int32_t category) const { |
443 | RangeDescriptor *rlRange; |
444 | UChar32 retVal = (UChar32)-1; |
445 | for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) { |
446 | if (rlRange->fNum == category) { |
447 | retVal = rlRange->fStartChar; |
448 | break; |
449 | } |
450 | } |
451 | return retVal; |
452 | } |
453 | |
454 | |
455 | //------------------------------------------------------------------------ |
456 | // |
457 | // printRanges A debugging function. |
458 | // dump out all of the range definitions. |
459 | // |
460 | //------------------------------------------------------------------------ |
461 | #ifdef RBBI_DEBUG |
462 | void RBBISetBuilder::printRanges() { |
463 | RangeDescriptor *rlRange; |
464 | int i; |
465 | |
466 | RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n" ); |
467 | for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) { |
468 | RBBIDebugPrintf("%4x-%4x " , rlRange->fStartChar, rlRange->fEndChar); |
469 | |
470 | for (i=0; i<rlRange->fIncludesSets->size(); i++) { |
471 | RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); |
472 | UnicodeString setName {u"anon" }; |
473 | RBBINode *setRef = usetNode->fParent; |
474 | if (setRef != nullptr) { |
475 | RBBINode *varRef = setRef->fParent; |
476 | if (varRef != nullptr && varRef->fType == RBBINode::varRef) { |
477 | setName = varRef->fText; |
478 | } |
479 | } |
480 | RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" " ); |
481 | } |
482 | RBBIDebugPrintf("\n" ); |
483 | } |
484 | } |
485 | #endif |
486 | |
487 | |
488 | //------------------------------------------------------------------------ |
489 | // |
490 | // printRangeGroups A debugging function. |
491 | // dump out all of the range groups. |
492 | // |
493 | //------------------------------------------------------------------------ |
494 | #ifdef RBBI_DEBUG |
495 | void RBBISetBuilder::printRangeGroups() { |
496 | int i; |
497 | |
498 | RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n" ); |
499 | for (RangeDescriptor *rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) { |
500 | if (rlRange->fFirstInGroup) { |
501 | int groupNum = rlRange->fNum; |
502 | RBBIDebugPrintf("%2i " , groupNum); |
503 | |
504 | if (groupNum >= fDictCategoriesStart) { RBBIDebugPrintf(" <DICT> " );} |
505 | |
506 | for (i=0; i<rlRange->fIncludesSets->size(); i++) { |
507 | RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); |
508 | UnicodeString setName = UNICODE_STRING("anon" , 4); |
509 | RBBINode *setRef = usetNode->fParent; |
510 | if (setRef != nullptr) { |
511 | RBBINode *varRef = setRef->fParent; |
512 | if (varRef != nullptr && varRef->fType == RBBINode::varRef) { |
513 | setName = varRef->fText; |
514 | } |
515 | } |
516 | RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" " ); |
517 | } |
518 | |
519 | i = 0; |
520 | for (RangeDescriptor *tRange = rlRange; tRange != nullptr; tRange = tRange->fNext) { |
521 | if (tRange->fNum == rlRange->fNum) { |
522 | if (i++ % 5 == 0) { |
523 | RBBIDebugPrintf("\n " ); |
524 | } |
525 | RBBIDebugPrintf(" %05x-%05x" , tRange->fStartChar, tRange->fEndChar); |
526 | } |
527 | } |
528 | RBBIDebugPrintf("\n" ); |
529 | } |
530 | } |
531 | RBBIDebugPrintf("\n" ); |
532 | } |
533 | #endif |
534 | |
535 | |
536 | //------------------------------------------------------------------------ |
537 | // |
538 | // printSets A debugging function. |
539 | // dump out all of the set definitions. |
540 | // |
541 | //------------------------------------------------------------------------ |
542 | #ifdef RBBI_DEBUG |
543 | void RBBISetBuilder::printSets() { |
544 | int i; |
545 | |
546 | RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n" ); |
547 | for (i=0; ; i++) { |
548 | RBBINode *usetNode; |
549 | RBBINode *setRef; |
550 | RBBINode *varRef; |
551 | UnicodeString setName; |
552 | |
553 | usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i); |
554 | if (usetNode == nullptr) { |
555 | break; |
556 | } |
557 | |
558 | RBBIDebugPrintf("%3d " , i); |
559 | setName = UNICODE_STRING("anonymous" , 9); |
560 | setRef = usetNode->fParent; |
561 | if (setRef != nullptr) { |
562 | varRef = setRef->fParent; |
563 | if (varRef != nullptr && varRef->fType == RBBINode::varRef) { |
564 | setName = varRef->fText; |
565 | } |
566 | } |
567 | RBBI_DEBUG_printUnicodeString(setName); |
568 | RBBIDebugPrintf(" " ); |
569 | RBBI_DEBUG_printUnicodeString(usetNode->fText); |
570 | RBBIDebugPrintf("\n" ); |
571 | if (usetNode->fLeftChild != nullptr) { |
572 | RBBINode::printTree(usetNode->fLeftChild, true); |
573 | } |
574 | } |
575 | RBBIDebugPrintf("\n" ); |
576 | } |
577 | #endif |
578 | |
579 | |
580 | |
581 | //------------------------------------------------------------------------------------- |
582 | // |
583 | // RangeDescriptor copy constructor |
584 | // |
585 | //------------------------------------------------------------------------------------- |
586 | |
587 | RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) : |
588 | fStartChar(other.fStartChar), fEndChar {other.fEndChar}, fNum {other.fNum}, |
589 | fIncludesDict{other.fIncludesDict}, fFirstInGroup{other.fFirstInGroup} { |
590 | |
591 | if (U_FAILURE(status)) { |
592 | return; |
593 | } |
594 | fIncludesSets = new UVector(status); |
595 | if (this->fIncludesSets == nullptr) { |
596 | status = U_MEMORY_ALLOCATION_ERROR; |
597 | } |
598 | if (U_FAILURE(status)) { |
599 | return; |
600 | } |
601 | |
602 | for (int32_t i=0; i<other.fIncludesSets->size(); i++) { |
603 | this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status); |
604 | } |
605 | } |
606 | |
607 | |
608 | //------------------------------------------------------------------------------------- |
609 | // |
610 | // RangeDescriptor default constructor |
611 | // |
612 | //------------------------------------------------------------------------------------- |
613 | RangeDescriptor::RangeDescriptor(UErrorCode &status) { |
614 | if (U_FAILURE(status)) { |
615 | return; |
616 | } |
617 | fIncludesSets = new UVector(status); |
618 | if (fIncludesSets == nullptr) { |
619 | status = U_MEMORY_ALLOCATION_ERROR; |
620 | } |
621 | } |
622 | |
623 | |
624 | //------------------------------------------------------------------------------------- |
625 | // |
626 | // RangeDescriptor Destructor |
627 | // |
628 | //------------------------------------------------------------------------------------- |
629 | RangeDescriptor::~RangeDescriptor() { |
630 | delete fIncludesSets; |
631 | fIncludesSets = nullptr; |
632 | } |
633 | |
634 | //------------------------------------------------------------------------------------- |
635 | // |
636 | // RangeDescriptor::split() |
637 | // |
638 | //------------------------------------------------------------------------------------- |
639 | void RangeDescriptor::split(UChar32 where, UErrorCode &status) { |
640 | U_ASSERT(where>fStartChar && where<=fEndChar); |
641 | RangeDescriptor *nr = new RangeDescriptor(*this, status); |
642 | if(nr == nullptr) { |
643 | status = U_MEMORY_ALLOCATION_ERROR; |
644 | return; |
645 | } |
646 | if (U_FAILURE(status)) { |
647 | delete nr; |
648 | return; |
649 | } |
650 | // RangeDescriptor copy constructor copies all fields. |
651 | // Only need to update those that are different after the split. |
652 | nr->fStartChar = where; |
653 | this->fEndChar = where-1; |
654 | nr->fNext = this->fNext; |
655 | this->fNext = nr; |
656 | } |
657 | |
658 | |
659 | //------------------------------------------------------------------------------------- |
660 | // |
661 | // RangeDescriptor::isDictionaryRange |
662 | // |
663 | // Test whether this range includes characters from |
664 | // the original Unicode Set named "dictionary". |
665 | // |
666 | // This function looks through the Unicode Sets that |
667 | // the range includes, checking for one named "dictionary" |
668 | // |
669 | // TODO: a faster way would be to find the set node for |
670 | // "dictionary" just once, rather than looking it |
671 | // up by name every time. |
672 | // |
673 | //------------------------------------------------------------------------------------- |
674 | bool RangeDescriptor::isDictionaryRange() { |
675 | static const char16_t *dictionary = u"dictionary" ; |
676 | for (int32_t i=0; i<fIncludesSets->size(); i++) { |
677 | RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i); |
678 | RBBINode *setRef = usetNode->fParent; |
679 | if (setRef != nullptr) { |
680 | RBBINode *varRef = setRef->fParent; |
681 | if (varRef && varRef->fType == RBBINode::varRef) { |
682 | const UnicodeString *setName = &varRef->fText; |
683 | if (setName->compare(dictionary, -1) == 0) { |
684 | return true; |
685 | } |
686 | } |
687 | } |
688 | } |
689 | return false; |
690 | } |
691 | |
692 | U_NAMESPACE_END |
693 | |
694 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
695 | |