1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2010-2012, International Business Machines
6* Corporation and others. All Rights Reserved.
7*******************************************************************************
8* file name: bytestriebuilder.cpp
9* encoding: UTF-8
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2010sep25
14* created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18#include "unicode/bytestrie.h"
19#include "unicode/bytestriebuilder.h"
20#include "unicode/stringpiece.h"
21#include "charstr.h"
22#include "cmemory.h"
23#include "uhash.h"
24#include "uarrsort.h"
25#include "uassert.h"
26#include "ustr_imp.h"
27
28U_NAMESPACE_BEGIN
29
30/*
31 * Note: This builder implementation stores (bytes, value) pairs with full copies
32 * of the byte sequences, until the BytesTrie is built.
33 * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
34 */
35
36class BytesTrieElement : public UMemory {
37public:
38 // Use compiler's default constructor, initializes nothing.
39
40 void setTo(StringPiece s, int32_t val, CharString &strings, UErrorCode &errorCode);
41
42 StringPiece getString(const CharString &strings) const {
43 int32_t offset=stringOffset;
44 int32_t length;
45 if(offset>=0) {
46 length=(uint8_t)strings[offset++];
47 } else {
48 offset=~offset;
49 length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
50 offset+=2;
51 }
52 return StringPiece(strings.data()+offset, length);
53 }
54 int32_t getStringLength(const CharString &strings) const {
55 int32_t offset=stringOffset;
56 if(offset>=0) {
57 return (uint8_t)strings[offset];
58 } else {
59 offset=~offset;
60 return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
61 }
62 }
63
64 char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
65
66 int32_t getValue() const { return value; }
67
68 int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
69
70private:
71 const char *data(const CharString &strings) const {
72 int32_t offset=stringOffset;
73 if(offset>=0) {
74 ++offset;
75 } else {
76 offset=~offset+2;
77 }
78 return strings.data()+offset;
79 }
80
81 // If the stringOffset is non-negative, then the first strings byte contains
82 // the string length.
83 // If the stringOffset is negative, then the first two strings bytes contain
84 // the string length (big-endian), and the offset needs to be bit-inverted.
85 // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
86 int32_t stringOffset;
87 int32_t value;
88};
89
90void
91BytesTrieElement::setTo(StringPiece s, int32_t val,
92 CharString &strings, UErrorCode &errorCode) {
93 if(U_FAILURE(errorCode)) {
94 return;
95 }
96 int32_t length=s.length();
97 if(length>0xffff) {
98 // Too long: We store the length in 1 or 2 bytes.
99 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
100 return;
101 }
102 int32_t offset=strings.length();
103 if(length>0xff) {
104 offset=~offset;
105 strings.append((char)(length>>8), errorCode);
106 }
107 strings.append((char)length, errorCode);
108 stringOffset=offset;
109 value=val;
110 strings.append(s, errorCode);
111}
112
113int32_t
114BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
115 // TODO: add StringPiece::compare(), see ticket #8187
116 StringPiece thisString=getString(strings);
117 StringPiece otherString=other.getString(strings);
118 int32_t lengthDiff=thisString.length()-otherString.length();
119 int32_t commonLength;
120 if(lengthDiff<=0) {
121 commonLength=thisString.length();
122 } else {
123 commonLength=otherString.length();
124 }
125 int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
126 return diff!=0 ? diff : lengthDiff;
127}
128
129BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
130 : strings(nullptr), elements(nullptr), elementsCapacity(0), elementsLength(0),
131 bytes(nullptr), bytesCapacity(0), bytesLength(0) {
132 if(U_FAILURE(errorCode)) {
133 return;
134 }
135 strings=new CharString();
136 if(strings==nullptr) {
137 errorCode=U_MEMORY_ALLOCATION_ERROR;
138 }
139}
140
141BytesTrieBuilder::~BytesTrieBuilder() {
142 delete strings;
143 delete[] elements;
144 uprv_free(bytes);
145}
146
147BytesTrieBuilder &
148BytesTrieBuilder::add(StringPiece s, int32_t value, UErrorCode &errorCode) {
149 if(U_FAILURE(errorCode)) {
150 return *this;
151 }
152 if(bytesLength>0) {
153 // Cannot add elements after building.
154 errorCode=U_NO_WRITE_PERMISSION;
155 return *this;
156 }
157 if(elementsLength==elementsCapacity) {
158 int32_t newCapacity;
159 if(elementsCapacity==0) {
160 newCapacity=1024;
161 } else {
162 newCapacity=4*elementsCapacity;
163 }
164 BytesTrieElement *newElements=new BytesTrieElement[newCapacity];
165 if(newElements==nullptr) {
166 errorCode=U_MEMORY_ALLOCATION_ERROR;
167 return *this; // error instead of dereferencing null
168 }
169 if(elementsLength>0) {
170 uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(BytesTrieElement));
171 }
172 delete[] elements;
173 elements=newElements;
174 elementsCapacity=newCapacity;
175 }
176 elements[elementsLength++].setTo(s, value, *strings, errorCode);
177 return *this;
178}
179
180U_CDECL_BEGIN
181
182static int32_t U_CALLCONV
183compareElementStrings(const void *context, const void *left, const void *right) {
184 const CharString *strings=static_cast<const CharString *>(context);
185 const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left);
186 const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right);
187 return leftElement->compareStringTo(*rightElement, *strings);
188}
189
190U_CDECL_END
191
192BytesTrie *
193BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
194 buildBytes(buildOption, errorCode);
195 BytesTrie *newTrie=nullptr;
196 if(U_SUCCESS(errorCode)) {
197 newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
198 if(newTrie==nullptr) {
199 errorCode=U_MEMORY_ALLOCATION_ERROR;
200 } else {
201 bytes=nullptr; // The new trie now owns the array.
202 bytesCapacity=0;
203 }
204 }
205 return newTrie;
206}
207
208StringPiece
209BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
210 buildBytes(buildOption, errorCode);
211 StringPiece result;
212 if(U_SUCCESS(errorCode)) {
213 result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
214 }
215 return result;
216}
217
218void
219BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
220 if(U_FAILURE(errorCode)) {
221 return;
222 }
223 if(bytes!=nullptr && bytesLength>0) {
224 // Already built.
225 return;
226 }
227 if(bytesLength==0) {
228 if(elementsLength==0) {
229 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
230 return;
231 }
232 uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
233 compareElementStrings, strings,
234 false, // need not be a stable sort
235 &errorCode);
236 if(U_FAILURE(errorCode)) {
237 return;
238 }
239 // Duplicate strings are not allowed.
240 StringPiece prev=elements[0].getString(*strings);
241 for(int32_t i=1; i<elementsLength; ++i) {
242 StringPiece current=elements[i].getString(*strings);
243 if(prev==current) {
244 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
245 return;
246 }
247 prev=current;
248 }
249 }
250 // Create and byte-serialize the trie for the elements.
251 bytesLength=0;
252 int32_t capacity=strings->length();
253 if(capacity<1024) {
254 capacity=1024;
255 }
256 if(bytesCapacity<capacity) {
257 uprv_free(bytes);
258 bytes=static_cast<char *>(uprv_malloc(capacity));
259 if(bytes==nullptr) {
260 errorCode=U_MEMORY_ALLOCATION_ERROR;
261 bytesCapacity=0;
262 return;
263 }
264 bytesCapacity=capacity;
265 }
266 StringTrieBuilder::build(buildOption, elementsLength, errorCode);
267 if(bytes==nullptr) {
268 errorCode=U_MEMORY_ALLOCATION_ERROR;
269 }
270}
271
272BytesTrieBuilder &
273BytesTrieBuilder::clear() {
274 strings->clear();
275 elementsLength=0;
276 bytesLength=0;
277 return *this;
278}
279
280int32_t
281BytesTrieBuilder::getElementStringLength(int32_t i) const {
282 return elements[i].getStringLength(*strings);
283}
284
285char16_t
286BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
287 return (uint8_t)elements[i].charAt(byteIndex, *strings);
288}
289
290int32_t
291BytesTrieBuilder::getElementValue(int32_t i) const {
292 return elements[i].getValue();
293}
294
295int32_t
296BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
297 const BytesTrieElement &firstElement=elements[first];
298 const BytesTrieElement &lastElement=elements[last];
299 int32_t minStringLength=firstElement.getStringLength(*strings);
300 while(++byteIndex<minStringLength &&
301 firstElement.charAt(byteIndex, *strings)==
302 lastElement.charAt(byteIndex, *strings)) {}
303 return byteIndex;
304}
305
306int32_t
307BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
308 int32_t length=0; // Number of different bytes at byteIndex.
309 int32_t i=start;
310 do {
311 char byte=elements[i++].charAt(byteIndex, *strings);
312 while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
313 ++i;
314 }
315 ++length;
316 } while(i<limit);
317 return length;
318}
319
320int32_t
321BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
322 do {
323 char byte=elements[i++].charAt(byteIndex, *strings);
324 while(byte==elements[i].charAt(byteIndex, *strings)) {
325 ++i;
326 }
327 } while(--count>0);
328 return i;
329}
330
331int32_t
332BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, char16_t byte) const {
333 char b=(char)byte;
334 while(b==elements[i].charAt(byteIndex, *strings)) {
335 ++i;
336 }
337 return i;
338}
339
340BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
341 : LinearMatchNode(len, nextNode), s(bytes) {
342 hash=static_cast<int32_t>(
343 static_cast<uint32_t>(hash)*37u + static_cast<uint32_t>(ustr_hashCharsN(bytes, len)));
344}
345
346bool
347BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
348 if(this==&other) {
349 return true;
350 }
351 if(!LinearMatchNode::operator==(other)) {
352 return false;
353 }
354 const BTLinearMatchNode &o=static_cast<const BTLinearMatchNode &>(other);
355 return 0==uprv_memcmp(s, o.s, length);
356}
357
358void
359BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
360 BytesTrieBuilder &b=static_cast<BytesTrieBuilder &>(builder);
361 next->write(builder);
362 b.write(s, length);
363 offset=b.write(b.getMinLinearMatch()+length-1);
364}
365
366StringTrieBuilder::Node *
367BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
368 Node *nextNode) const {
369 return new BTLinearMatchNode(
370 elements[i].getString(*strings).data()+byteIndex,
371 length,
372 nextNode);
373}
374
375UBool
376BytesTrieBuilder::ensureCapacity(int32_t length) {
377 if(bytes==nullptr) {
378 return false; // previous memory allocation had failed
379 }
380 if(length>bytesCapacity) {
381 int32_t newCapacity=bytesCapacity;
382 do {
383 newCapacity*=2;
384 } while(newCapacity<=length);
385 char *newBytes=static_cast<char *>(uprv_malloc(newCapacity));
386 if(newBytes==nullptr) {
387 // unable to allocate memory
388 uprv_free(bytes);
389 bytes=nullptr;
390 bytesCapacity=0;
391 return false;
392 }
393 uprv_memcpy(newBytes+(newCapacity-bytesLength),
394 bytes+(bytesCapacity-bytesLength), bytesLength);
395 uprv_free(bytes);
396 bytes=newBytes;
397 bytesCapacity=newCapacity;
398 }
399 return true;
400}
401
402int32_t
403BytesTrieBuilder::write(int32_t byte) {
404 int32_t newLength=bytesLength+1;
405 if(ensureCapacity(newLength)) {
406 bytesLength=newLength;
407 bytes[bytesCapacity-bytesLength]=(char)byte;
408 }
409 return bytesLength;
410}
411
412int32_t
413BytesTrieBuilder::write(const char *b, int32_t length) {
414 int32_t newLength=bytesLength+length;
415 if(ensureCapacity(newLength)) {
416 bytesLength=newLength;
417 uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
418 }
419 return bytesLength;
420}
421
422int32_t
423BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
424 return write(elements[i].getString(*strings).data()+byteIndex, length);
425}
426
427int32_t
428BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
429 if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
430 return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
431 }
432 char intBytes[5];
433 int32_t length=1;
434 if(i<0 || i>0xffffff) {
435 intBytes[0]=(char)BytesTrie::kFiveByteValueLead;
436 intBytes[1]=(char)((uint32_t)i>>24);
437 intBytes[2]=(char)((uint32_t)i>>16);
438 intBytes[3]=(char)((uint32_t)i>>8);
439 intBytes[4]=(char)i;
440 length=5;
441 // } else if(i<=BytesTrie::kMaxOneByteValue) {
442 // intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
443 } else {
444 if(i<=BytesTrie::kMaxTwoByteValue) {
445 intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8));
446 } else {
447 if(i<=BytesTrie::kMaxThreeByteValue) {
448 intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16));
449 } else {
450 intBytes[0]=(char)BytesTrie::kFourByteValueLead;
451 intBytes[1]=(char)(i>>16);
452 length=2;
453 }
454 intBytes[length++]=(char)(i>>8);
455 }
456 intBytes[length++]=(char)i;
457 }
458 intBytes[0]=(char)((intBytes[0]<<1)|isFinal);
459 return write(intBytes, length);
460}
461
462int32_t
463BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
464 int32_t offset=write(node);
465 if(hasValue) {
466 offset=writeValueAndFinal(value, false);
467 }
468 return offset;
469}
470
471int32_t
472BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
473 int32_t i=bytesLength-jumpTarget;
474 U_ASSERT(i>=0);
475 if(i<=BytesTrie::kMaxOneByteDelta) {
476 return write(i);
477 } else {
478 char intBytes[5];
479 return write(intBytes, internalEncodeDelta(i, intBytes));
480 }
481}
482
483int32_t
484BytesTrieBuilder::internalEncodeDelta(int32_t i, char intBytes[]) {
485 U_ASSERT(i>=0);
486 if(i<=BytesTrie::kMaxOneByteDelta) {
487 intBytes[0]=(char)i;
488 return 1;
489 }
490 int32_t length=1;
491 if(i<=BytesTrie::kMaxTwoByteDelta) {
492 intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
493 } else {
494 if(i<=BytesTrie::kMaxThreeByteDelta) {
495 intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
496 } else {
497 if(i<=0xffffff) {
498 intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
499 } else {
500 intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
501 intBytes[1]=(char)(i>>24);
502 length=2;
503 }
504 intBytes[length++]=(char)(i>>16);
505 }
506 intBytes[length++]=(char)(i>>8);
507 }
508 intBytes[length++]=(char)i;
509 return length;
510}
511
512U_NAMESPACE_END
513