1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2010-2012, International Business Machines
6* Corporation and others. All Rights Reserved.
7*******************************************************************************
8* file name: stringtriebuilder.cpp
9* encoding: UTF-8
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2010dec24
14* created by: Markus W. Scherer
15*/
16
17#include "utypeinfo.h" // for 'typeid' to work
18#include "unicode/utypes.h"
19#include "unicode/stringtriebuilder.h"
20#include "uassert.h"
21#include "uhash.h"
22
23U_CDECL_BEGIN
24
25static int32_t U_CALLCONV
26hashStringTrieNode(const UHashTok key) {
27 return icu::StringTrieBuilder::hashNode(key.pointer);
28}
29
30static UBool U_CALLCONV
31equalStringTrieNodes(const UHashTok key1, const UHashTok key2) {
32 return icu::StringTrieBuilder::equalNodes(key1.pointer, key2.pointer);
33}
34
35U_CDECL_END
36
37U_NAMESPACE_BEGIN
38
39StringTrieBuilder::StringTrieBuilder() : nodes(nullptr) {}
40
41StringTrieBuilder::~StringTrieBuilder() {
42 deleteCompactBuilder();
43}
44
45void
46StringTrieBuilder::createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode) {
47 if(U_FAILURE(errorCode)) {
48 return;
49 }
50 nodes=uhash_openSize(hashStringTrieNode, equalStringTrieNodes, nullptr,
51 sizeGuess, &errorCode);
52 if(U_SUCCESS(errorCode)) {
53 if(nodes==nullptr) {
54 errorCode=U_MEMORY_ALLOCATION_ERROR;
55 } else {
56 uhash_setKeyDeleter(nodes, uprv_deleteUObject);
57 }
58 }
59}
60
61void
62StringTrieBuilder::deleteCompactBuilder() {
63 uhash_close(nodes);
64 nodes=nullptr;
65}
66
67void
68StringTrieBuilder::build(UStringTrieBuildOption buildOption, int32_t elementsLength,
69 UErrorCode &errorCode) {
70 if(buildOption==USTRINGTRIE_BUILD_FAST) {
71 writeNode(0, elementsLength, 0);
72 } else /* USTRINGTRIE_BUILD_SMALL */ {
73 createCompactBuilder(2*elementsLength, errorCode);
74 Node *root=makeNode(0, elementsLength, 0, errorCode);
75 if(U_SUCCESS(errorCode)) {
76 root->markRightEdgesFirst(-1);
77 root->write(*this);
78 }
79 deleteCompactBuilder();
80 }
81}
82
83// Requires start<limit,
84// and all strings of the [start..limit[ elements must be sorted and
85// have a common prefix of length unitIndex.
86int32_t
87StringTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) {
88 UBool hasValue=false;
89 int32_t value=0;
90 int32_t type;
91 if(unitIndex==getElementStringLength(start)) {
92 // An intermediate or final value.
93 value=getElementValue(start++);
94 if(start==limit) {
95 return writeValueAndFinal(value, true); // final-value node
96 }
97 hasValue=true;
98 }
99 // Now all [start..limit[ strings are longer than unitIndex.
100 int32_t minUnit=getElementUnit(start, unitIndex);
101 int32_t maxUnit=getElementUnit(limit-1, unitIndex);
102 if(minUnit==maxUnit) {
103 // Linear-match node: All strings have the same character at unitIndex.
104 int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
105 writeNode(start, limit, lastUnitIndex);
106 // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
107 int32_t length=lastUnitIndex-unitIndex;
108 int32_t maxLinearMatchLength=getMaxLinearMatchLength();
109 while(length>maxLinearMatchLength) {
110 lastUnitIndex-=maxLinearMatchLength;
111 length-=maxLinearMatchLength;
112 writeElementUnits(start, lastUnitIndex, maxLinearMatchLength);
113 write(getMinLinearMatch()+maxLinearMatchLength-1);
114 }
115 writeElementUnits(start, unitIndex, length);
116 type=getMinLinearMatch()+length-1;
117 } else {
118 // Branch node.
119 int32_t length=countElementUnits(start, limit, unitIndex);
120 // length>=2 because minUnit!=maxUnit.
121 writeBranchSubNode(start, limit, unitIndex, length);
122 if(--length<getMinLinearMatch()) {
123 type=length;
124 } else {
125 write(length);
126 type=0;
127 }
128 }
129 return writeValueAndType(hasValue, value, type);
130}
131
132// start<limit && all strings longer than unitIndex &&
133// length different units at unitIndex
134int32_t
135StringTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) {
136 char16_t middleUnits[kMaxSplitBranchLevels];
137 int32_t lessThan[kMaxSplitBranchLevels];
138 int32_t ltLength=0;
139 while(length>getMaxBranchLinearSubNodeLength()) {
140 // Branch on the middle unit.
141 // First, find the middle unit.
142 int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
143 // Encode the less-than branch first.
144 middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit
145 lessThan[ltLength]=writeBranchSubNode(start, i, unitIndex, length/2);
146 ++ltLength;
147 // Continue for the greater-or-equal branch.
148 start=i;
149 length=length-length/2;
150 }
151 // For each unit, find its elements array start and whether it has a final value.
152 int32_t starts[kMaxBranchLinearSubNodeLength];
153 UBool isFinal[kMaxBranchLinearSubNodeLength-1];
154 int32_t unitNumber=0;
155 do {
156 int32_t i=starts[unitNumber]=start;
157 char16_t unit=getElementUnit(i++, unitIndex);
158 i=indexOfElementWithNextUnit(i, unitIndex, unit);
159 isFinal[unitNumber]= start==i-1 && unitIndex+1==getElementStringLength(start);
160 start=i;
161 } while(++unitNumber<length-1);
162 // unitNumber==length-1, and the maxUnit elements range is [start..limit[
163 starts[unitNumber]=start;
164
165 // Write the sub-nodes in reverse order: The jump lengths are deltas from
166 // after their own positions, so if we wrote the minUnit sub-node first,
167 // then its jump delta would be larger.
168 // Instead we write the minUnit sub-node last, for a shorter delta.
169 int32_t jumpTargets[kMaxBranchLinearSubNodeLength-1];
170 do {
171 --unitNumber;
172 if(!isFinal[unitNumber]) {
173 jumpTargets[unitNumber]=writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1);
174 }
175 } while(unitNumber>0);
176 // The maxUnit sub-node is written as the very last one because we do
177 // not jump for it at all.
178 unitNumber=length-1;
179 writeNode(start, limit, unitIndex+1);
180 int32_t offset=write(getElementUnit(start, unitIndex));
181 // Write the rest of this node's unit-value pairs.
182 while(--unitNumber>=0) {
183 start=starts[unitNumber];
184 int32_t value;
185 if(isFinal[unitNumber]) {
186 // Write the final value for the one string ending with this unit.
187 value=getElementValue(start);
188 } else {
189 // Write the delta to the start position of the sub-node.
190 value=offset-jumpTargets[unitNumber];
191 }
192 writeValueAndFinal(value, isFinal[unitNumber]);
193 offset=write(getElementUnit(start, unitIndex));
194 }
195 // Write the split-branch nodes.
196 while(ltLength>0) {
197 --ltLength;
198 writeDeltaTo(lessThan[ltLength]);
199 offset=write(middleUnits[ltLength]);
200 }
201 return offset;
202}
203
204// Requires start<limit,
205// and all strings of the [start..limit[ elements must be sorted and
206// have a common prefix of length unitIndex.
207StringTrieBuilder::Node *
208StringTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) {
209 if(U_FAILURE(errorCode)) {
210 return nullptr;
211 }
212 UBool hasValue=false;
213 int32_t value=0;
214 if(unitIndex==getElementStringLength(start)) {
215 // An intermediate or final value.
216 value=getElementValue(start++);
217 if(start==limit) {
218 return registerFinalValue(value, errorCode);
219 }
220 hasValue=true;
221 }
222 Node *node;
223 // Now all [start..limit[ strings are longer than unitIndex.
224 int32_t minUnit=getElementUnit(start, unitIndex);
225 int32_t maxUnit=getElementUnit(limit-1, unitIndex);
226 if(minUnit==maxUnit) {
227 // Linear-match node: All strings have the same character at unitIndex.
228 int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
229 Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode);
230 // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
231 int32_t length=lastUnitIndex-unitIndex;
232 int32_t maxLinearMatchLength=getMaxLinearMatchLength();
233 while(length>maxLinearMatchLength) {
234 lastUnitIndex-=maxLinearMatchLength;
235 length-=maxLinearMatchLength;
236 node=createLinearMatchNode(start, lastUnitIndex, maxLinearMatchLength, nextNode);
237 nextNode=registerNode(node, errorCode);
238 }
239 node=createLinearMatchNode(start, unitIndex, length, nextNode);
240 } else {
241 // Branch node.
242 int32_t length=countElementUnits(start, limit, unitIndex);
243 // length>=2 because minUnit!=maxUnit.
244 Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode);
245 node=new BranchHeadNode(length, subNode);
246 }
247 if(hasValue && node!=nullptr) {
248 if(matchNodesCanHaveValues()) {
249 ((ValueNode *)node)->setValue(value);
250 } else {
251 node=new IntermediateValueNode(value, registerNode(node, errorCode));
252 }
253 }
254 return registerNode(node, errorCode);
255}
256
257// start<limit && all strings longer than unitIndex &&
258// length different units at unitIndex
259StringTrieBuilder::Node *
260StringTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
261 int32_t length, UErrorCode &errorCode) {
262 if(U_FAILURE(errorCode)) {
263 return nullptr;
264 }
265 char16_t middleUnits[kMaxSplitBranchLevels];
266 Node *lessThan[kMaxSplitBranchLevels];
267 int32_t ltLength=0;
268 while(length>getMaxBranchLinearSubNodeLength()) {
269 // Branch on the middle unit.
270 // First, find the middle unit.
271 int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
272 // Create the less-than branch.
273 middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit
274 lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode);
275 ++ltLength;
276 // Continue for the greater-or-equal branch.
277 start=i;
278 length=length-length/2;
279 }
280 if(U_FAILURE(errorCode)) {
281 return nullptr;
282 }
283 ListBranchNode *listNode=new ListBranchNode();
284 if(listNode==nullptr) {
285 errorCode=U_MEMORY_ALLOCATION_ERROR;
286 return nullptr;
287 }
288 // For each unit, find its elements array start and whether it has a final value.
289 int32_t unitNumber=0;
290 do {
291 int32_t i=start;
292 char16_t unit=getElementUnit(i++, unitIndex);
293 i=indexOfElementWithNextUnit(i, unitIndex, unit);
294 if(start==i-1 && unitIndex+1==getElementStringLength(start)) {
295 listNode->add(unit, getElementValue(start));
296 } else {
297 listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode));
298 }
299 start=i;
300 } while(++unitNumber<length-1);
301 // unitNumber==length-1, and the maxUnit elements range is [start..limit[
302 char16_t unit=getElementUnit(start, unitIndex);
303 if(start==limit-1 && unitIndex+1==getElementStringLength(start)) {
304 listNode->add(unit, getElementValue(start));
305 } else {
306 listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode));
307 }
308 Node *node=registerNode(listNode, errorCode);
309 // Create the split-branch nodes.
310 while(ltLength>0) {
311 --ltLength;
312 node=registerNode(
313 new SplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode);
314 }
315 return node;
316}
317
318StringTrieBuilder::Node *
319StringTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) {
320 if(U_FAILURE(errorCode)) {
321 delete newNode;
322 return nullptr;
323 }
324 if(newNode==nullptr) {
325 errorCode=U_MEMORY_ALLOCATION_ERROR;
326 return nullptr;
327 }
328 const UHashElement *old=uhash_find(nodes, newNode);
329 if(old!=nullptr) {
330 delete newNode;
331 return (Node *)old->key.pointer;
332 }
333 // If uhash_puti() returns a non-zero value from an equivalent, previously
334 // registered node, then uhash_find() failed to find that and we will leak newNode.
335#if U_DEBUG
336 int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue.
337#endif
338 uhash_puti(nodes, newNode, 1, &errorCode);
339 U_ASSERT(oldValue==0);
340 if(U_FAILURE(errorCode)) {
341 delete newNode;
342 return nullptr;
343 }
344 return newNode;
345}
346
347StringTrieBuilder::Node *
348StringTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) {
349 if(U_FAILURE(errorCode)) {
350 return nullptr;
351 }
352 FinalValueNode key(value);
353 const UHashElement *old=uhash_find(nodes, &key);
354 if(old!=nullptr) {
355 return (Node *)old->key.pointer;
356 }
357 Node *newNode=new FinalValueNode(value);
358 if(newNode==nullptr) {
359 errorCode=U_MEMORY_ALLOCATION_ERROR;
360 return nullptr;
361 }
362 // If uhash_puti() returns a non-zero value from an equivalent, previously
363 // registered node, then uhash_find() failed to find that and we will leak newNode.
364#if U_DEBUG
365 int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue.
366#endif
367 uhash_puti(nodes, newNode, 1, &errorCode);
368 U_ASSERT(oldValue==0);
369 if(U_FAILURE(errorCode)) {
370 delete newNode;
371 return nullptr;
372 }
373 return newNode;
374}
375
376int32_t
377StringTrieBuilder::hashNode(const void *node) {
378 return ((const Node *)node)->hashCode();
379}
380
381UBool
382StringTrieBuilder::equalNodes(const void *left, const void *right) {
383 return *(const Node *)left==*(const Node *)right;
384}
385
386bool
387StringTrieBuilder::Node::operator==(const Node &other) const {
388 return this==&other || (typeid(*this)==typeid(other) && hash==other.hash);
389}
390
391int32_t
392StringTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) {
393 if(offset==0) {
394 offset=edgeNumber;
395 }
396 return edgeNumber;
397}
398
399bool
400StringTrieBuilder::FinalValueNode::operator==(const Node &other) const {
401 if(this==&other) {
402 return true;
403 }
404 if(!Node::operator==(other)) {
405 return false;
406 }
407 const FinalValueNode &o=static_cast<const FinalValueNode &>(other);
408 return value==o.value;
409}
410
411void
412StringTrieBuilder::FinalValueNode::write(StringTrieBuilder &builder) {
413 offset=builder.writeValueAndFinal(value, true);
414}
415
416bool
417StringTrieBuilder::ValueNode::operator==(const Node &other) const {
418 if(this==&other) {
419 return true;
420 }
421 if(!Node::operator==(other)) {
422 return false;
423 }
424 const ValueNode &o=static_cast<const ValueNode &>(other);
425 return hasValue==o.hasValue && (!hasValue || value==o.value);
426}
427
428bool
429StringTrieBuilder::IntermediateValueNode::operator==(const Node &other) const {
430 if(this==&other) {
431 return true;
432 }
433 if(!ValueNode::operator==(other)) {
434 return false;
435 }
436 const IntermediateValueNode &o=static_cast<const IntermediateValueNode &>(other);
437 return next==o.next;
438}
439
440int32_t
441StringTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber) {
442 if(offset==0) {
443 offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
444 }
445 return edgeNumber;
446}
447
448void
449StringTrieBuilder::IntermediateValueNode::write(StringTrieBuilder &builder) {
450 next->write(builder);
451 offset=builder.writeValueAndFinal(value, false);
452}
453
454bool
455StringTrieBuilder::LinearMatchNode::operator==(const Node &other) const {
456 if(this==&other) {
457 return true;
458 }
459 if(!ValueNode::operator==(other)) {
460 return false;
461 }
462 const LinearMatchNode &o=static_cast<const LinearMatchNode &>(other);
463 return length==o.length && next==o.next;
464}
465
466int32_t
467StringTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) {
468 if(offset==0) {
469 offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
470 }
471 return edgeNumber;
472}
473
474bool
475StringTrieBuilder::ListBranchNode::operator==(const Node &other) const {
476 if(this==&other) {
477 return true;
478 }
479 if(!Node::operator==(other)) {
480 return false;
481 }
482 const ListBranchNode &o=static_cast<const ListBranchNode &>(other);
483 for(int32_t i=0; i<length; ++i) {
484 if(units[i]!=o.units[i] || values[i]!=o.values[i] || equal[i]!=o.equal[i]) {
485 return false;
486 }
487 }
488 return true;
489}
490
491int32_t
492StringTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
493 if(offset==0) {
494 firstEdgeNumber=edgeNumber;
495 int32_t step=0;
496 int32_t i=length;
497 do {
498 Node *edge=equal[--i];
499 if(edge!=nullptr) {
500 edgeNumber=edge->markRightEdgesFirst(edgeNumber-step);
501 }
502 // For all but the rightmost edge, decrement the edge number.
503 step=1;
504 } while(i>0);
505 offset=edgeNumber;
506 }
507 return edgeNumber;
508}
509
510void
511StringTrieBuilder::ListBranchNode::write(StringTrieBuilder &builder) {
512 // Write the sub-nodes in reverse order: The jump lengths are deltas from
513 // after their own positions, so if we wrote the minUnit sub-node first,
514 // then its jump delta would be larger.
515 // Instead we write the minUnit sub-node last, for a shorter delta.
516 int32_t unitNumber=length-1;
517 Node *rightEdge=equal[unitNumber];
518 int32_t rightEdgeNumber= rightEdge==nullptr ? firstEdgeNumber : rightEdge->getOffset();
519 do {
520 --unitNumber;
521 if(equal[unitNumber]!=nullptr) {
522 equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
523 }
524 } while(unitNumber>0);
525 // The maxUnit sub-node is written as the very last one because we do
526 // not jump for it at all.
527 unitNumber=length-1;
528 if(rightEdge==nullptr) {
529 builder.writeValueAndFinal(values[unitNumber], true);
530 } else {
531 rightEdge->write(builder);
532 }
533 offset=builder.write(units[unitNumber]);
534 // Write the rest of this node's unit-value pairs.
535 while(--unitNumber>=0) {
536 int32_t value;
537 UBool isFinal;
538 if(equal[unitNumber]==nullptr) {
539 // Write the final value for the one string ending with this unit.
540 value=values[unitNumber];
541 isFinal=true;
542 } else {
543 // Write the delta to the start position of the sub-node.
544 U_ASSERT(equal[unitNumber]->getOffset()>0);
545 value=offset-equal[unitNumber]->getOffset();
546 isFinal=false;
547 }
548 builder.writeValueAndFinal(value, isFinal);
549 offset=builder.write(units[unitNumber]);
550 }
551}
552
553bool
554StringTrieBuilder::SplitBranchNode::operator==(const Node &other) const {
555 if(this==&other) {
556 return true;
557 }
558 if(!Node::operator==(other)) {
559 return false;
560 }
561 const SplitBranchNode &o=static_cast<const SplitBranchNode &>(other);
562 return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual;
563}
564
565int32_t
566StringTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
567 if(offset==0) {
568 firstEdgeNumber=edgeNumber;
569 edgeNumber=greaterOrEqual->markRightEdgesFirst(edgeNumber);
570 offset=edgeNumber=lessThan->markRightEdgesFirst(edgeNumber-1);
571 }
572 return edgeNumber;
573}
574
575void
576StringTrieBuilder::SplitBranchNode::write(StringTrieBuilder &builder) {
577 // Encode the less-than branch first.
578 lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder);
579 // Encode the greater-or-equal branch last because we do not jump for it at all.
580 greaterOrEqual->write(builder);
581 // Write this node.
582 U_ASSERT(lessThan->getOffset()>0);
583 builder.writeDeltaTo(lessThan->getOffset()); // less-than
584 offset=builder.write(unit);
585}
586
587bool
588StringTrieBuilder::BranchHeadNode::operator==(const Node &other) const {
589 if(this==&other) {
590 return true;
591 }
592 if(!ValueNode::operator==(other)) {
593 return false;
594 }
595 const BranchHeadNode &o=static_cast<const BranchHeadNode &>(other);
596 return length==o.length && next==o.next;
597}
598
599int32_t
600StringTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) {
601 if(offset==0) {
602 offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
603 }
604 return edgeNumber;
605}
606
607void
608StringTrieBuilder::BranchHeadNode::write(StringTrieBuilder &builder) {
609 next->write(builder);
610 if(length<=builder.getMinLinearMatch()) {
611 offset=builder.writeValueAndType(hasValue, value, length-1);
612 } else {
613 builder.write(length-1);
614 offset=builder.writeValueAndType(hasValue, value, 0);
615 }
616}
617
618U_NAMESPACE_END
619