1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2010-2012,2014, International Business Machines
6* Corporation and others. All Rights Reserved.
7*******************************************************************************
8* file name: stringtriebuilder.h
9* encoding: UTF-8
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2010dec24
14* created by: Markus W. Scherer
15*/
16
17#ifndef __STRINGTRIEBUILDER_H__
18#define __STRINGTRIEBUILDER_H__
19
20#include "unicode/utypes.h"
21
22#if U_SHOW_CPLUSPLUS_API
23
24#include "unicode/uobject.h"
25
26/**
27 * \file
28 * \brief C++ API: Builder API for trie builders
29 */
30
31// Forward declaration.
32/// \cond
33struct UHashtable;
34typedef struct UHashtable UHashtable;
35/// \endcond
36
37/**
38 * Build options for BytesTrieBuilder and CharsTrieBuilder.
39 * @stable ICU 4.8
40 */
41enum UStringTrieBuildOption {
42 /**
43 * Builds a trie quickly.
44 * @stable ICU 4.8
45 */
46 USTRINGTRIE_BUILD_FAST,
47 /**
48 * Builds a trie more slowly, attempting to generate
49 * a shorter but equivalent serialization.
50 * This build option also uses more memory.
51 *
52 * This option can be effective when many integer values are the same
53 * and string/byte sequence suffixes can be shared.
54 * Runtime speed is not expected to improve.
55 * @stable ICU 4.8
56 */
57 USTRINGTRIE_BUILD_SMALL
58};
59
60U_NAMESPACE_BEGIN
61
62/**
63 * Base class for string trie builder classes.
64 *
65 * This class is not intended for public subclassing.
66 * @stable ICU 4.8
67 */
68class U_COMMON_API StringTrieBuilder : public UObject {
69public:
70#ifndef U_HIDE_INTERNAL_API
71 /** @internal */
72 static int32_t hashNode(const void *node);
73 /** @internal */
74 static UBool equalNodes(const void *left, const void *right);
75#endif /* U_HIDE_INTERNAL_API */
76
77protected:
78 // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
79 // or else the compiler will create a public default constructor.
80 /** @internal */
81 StringTrieBuilder();
82 /** @internal */
83 virtual ~StringTrieBuilder();
84
85#ifndef U_HIDE_INTERNAL_API
86 /** @internal */
87 void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
88 /** @internal */
89 void deleteCompactBuilder();
90
91 /** @internal */
92 void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
93
94 /** @internal */
95 int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
96 /** @internal */
97 int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
98#endif /* U_HIDE_INTERNAL_API */
99
100 class Node;
101
102#ifndef U_HIDE_INTERNAL_API
103 /** @internal */
104 Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
105 /** @internal */
106 Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
107 int32_t length, UErrorCode &errorCode);
108#endif /* U_HIDE_INTERNAL_API */
109
110 /** @internal */
111 virtual int32_t getElementStringLength(int32_t i) const = 0;
112 /** @internal */
113 virtual char16_t getElementUnit(int32_t i, int32_t unitIndex) const = 0;
114 /** @internal */
115 virtual int32_t getElementValue(int32_t i) const = 0;
116
117 // Finds the first unit index after this one where
118 // the first and last element have different units again.
119 /** @internal */
120 virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
121
122 // Number of different units at unitIndex.
123 /** @internal */
124 virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
125 /** @internal */
126 virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
127 /** @internal */
128 virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const = 0;
129
130 /** @internal */
131 virtual UBool matchNodesCanHaveValues() const = 0;
132
133 /** @internal */
134 virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
135 /** @internal */
136 virtual int32_t getMinLinearMatch() const = 0;
137 /** @internal */
138 virtual int32_t getMaxLinearMatchLength() const = 0;
139
140#ifndef U_HIDE_INTERNAL_API
141 // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
142 /** @internal */
143 static const int32_t kMaxBranchLinearSubNodeLength=5;
144
145 // Maximum number of nested split-branch levels for a branch on all 2^16 possible char16_t units.
146 // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
147 /** @internal */
148 static const int32_t kMaxSplitBranchLevels=14;
149
150 /**
151 * Makes sure that there is only one unique node registered that is
152 * equivalent to newNode.
153 * @param newNode Input node. The builder takes ownership.
154 * @param errorCode ICU in/out UErrorCode.
155 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==nullptr.
156 * @return newNode if it is the first of its kind, or
157 * an equivalent node if newNode is a duplicate.
158 * @internal
159 */
160 Node *registerNode(Node *newNode, UErrorCode &errorCode);
161 /**
162 * Makes sure that there is only one unique FinalValueNode registered
163 * with this value.
164 * Avoids creating a node if the value is a duplicate.
165 * @param value A final value.
166 * @param errorCode ICU in/out UErrorCode.
167 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==nullptr.
168 * @return A FinalValueNode with the given value.
169 * @internal
170 */
171 Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
172#endif /* U_HIDE_INTERNAL_API */
173
174 /*
175 * C++ note:
176 * registerNode() and registerFinalValue() take ownership of their input nodes,
177 * and only return owned nodes.
178 * If they see a failure UErrorCode, they will delete the input node.
179 * If they get a nullptr pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
180 * If there is a failure, they return nullptr.
181 *
182 * nullptr Node pointers can be safely passed into other Nodes because
183 * they call the static Node::hashCode() which checks for a nullptr pointer first.
184 *
185 * Therefore, as long as builder functions register a new node,
186 * they need to check for failures only before explicitly dereferencing
187 * a Node pointer, or before setting a new UErrorCode.
188 */
189
190 // Hash set of nodes, maps from nodes to integer 1.
191 /** @internal */
192 UHashtable *nodes;
193
194 // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
195 // it is needed for layout of other objects.
196 /**
197 * @internal
198 * \cond
199 */
200 class Node : public UObject {
201 public:
202 Node(int32_t initialHash) : hash(initialHash), offset(0) {}
203 inline int32_t hashCode() const { return hash; }
204 // Handles node==nullptr.
205 static inline int32_t hashCode(const Node *node) { return node==nullptr ? 0 : node->hashCode(); }
206 // Base class operator==() compares the actual class types.
207 virtual bool operator==(const Node &other) const;
208 inline bool operator!=(const Node &other) const { return !operator==(other); }
209 /**
210 * Traverses the Node graph and numbers branch edges, with rightmost edges first.
211 * This is to avoid writing a duplicate node twice.
212 *
213 * Branch nodes in this trie data structure are not symmetric.
214 * Most branch edges "jump" to other nodes but the rightmost branch edges
215 * just continue without a jump.
216 * Therefore, write() must write the rightmost branch edge last
217 * (trie units are written backwards), and must write it at that point even if
218 * it is a duplicate of a node previously written elsewhere.
219 *
220 * This function visits and marks right branch edges first.
221 * Edges are numbered with increasingly negative values because we share the
222 * offset field which gets positive values when nodes are written.
223 * A branch edge also remembers the first number for any of its edges.
224 *
225 * When a further-left branch edge has a number in the range of the rightmost
226 * edge's numbers, then it will be written as part of the required right edge
227 * and we can avoid writing it first.
228 *
229 * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
230 * edge numbers.
231 *
232 * @param edgeNumber The first edge number for this node and its sub-nodes.
233 * @return An edge number that is at least the maximum-negative
234 * of the input edge number and the numbers of this node and all of its sub-nodes.
235 */
236 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
237 // write() must set the offset to a positive value.
238 virtual void write(StringTrieBuilder &builder) = 0;
239 // See markRightEdgesFirst.
240 inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
241 StringTrieBuilder &builder) {
242 // Note: Edge numbers are negative, lastRight<=firstRight.
243 // If offset>0 then this node and its sub-nodes have been written already
244 // and we need not write them again.
245 // If this node is part of the unwritten right branch edge,
246 // then we wait until that is written.
247 if(offset<0 && (offset<lastRight || firstRight<offset)) {
248 write(builder);
249 }
250 }
251 inline int32_t getOffset() const { return offset; }
252 protected:
253 int32_t hash;
254 int32_t offset;
255 };
256
257#ifndef U_HIDE_INTERNAL_API
258 // This class should not be overridden because
259 // registerFinalValue() compares a stack-allocated FinalValueNode
260 // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
261 // with the input node, and the
262 // !Node::operator==(other) used inside FinalValueNode::operator==(other)
263 // will be false if the typeid's are different.
264 /** @internal */
265 class FinalValueNode : public Node {
266 public:
267 FinalValueNode(int32_t v) : Node(0x111111u*37u+v), value(v) {}
268 virtual bool operator==(const Node &other) const override;
269 virtual void write(StringTrieBuilder &builder) override;
270 protected:
271 int32_t value;
272 };
273#endif /* U_HIDE_INTERNAL_API */
274
275 // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
276 // it is needed for layout of other objects.
277 /**
278 * @internal
279 */
280 class ValueNode : public Node {
281 public:
282 ValueNode(int32_t initialHash) : Node(initialHash), hasValue(false), value(0) {}
283 virtual bool operator==(const Node &other) const override;
284 void setValue(int32_t v) {
285 hasValue=true;
286 value=v;
287 hash=hash*37u+v;
288 }
289 protected:
290 UBool hasValue;
291 int32_t value;
292 };
293
294#ifndef U_HIDE_INTERNAL_API
295 /**
296 * @internal
297 */
298 class IntermediateValueNode : public ValueNode {
299 public:
300 IntermediateValueNode(int32_t v, Node *nextNode)
301 : ValueNode(0x222222u*37u+hashCode(nextNode)), next(nextNode) { setValue(v); }
302 virtual bool operator==(const Node &other) const override;
303 virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
304 virtual void write(StringTrieBuilder &builder) override;
305 protected:
306 Node *next;
307 };
308#endif /* U_HIDE_INTERNAL_API */
309
310 // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
311 // it is needed for layout of other objects.
312 /**
313 * @internal
314 */
315 class LinearMatchNode : public ValueNode {
316 public:
317 LinearMatchNode(int32_t len, Node *nextNode)
318 : ValueNode((0x333333u*37u+len)*37u+hashCode(nextNode)),
319 length(len), next(nextNode) {}
320 virtual bool operator==(const Node &other) const override;
321 virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
322 protected:
323 int32_t length;
324 Node *next;
325 };
326
327#ifndef U_HIDE_INTERNAL_API
328 /**
329 * @internal
330 */
331 class BranchNode : public Node {
332 public:
333 BranchNode(int32_t initialHash) : Node(initialHash) {}
334 protected:
335 int32_t firstEdgeNumber;
336 };
337
338 /**
339 * @internal
340 */
341 class ListBranchNode : public BranchNode {
342 public:
343 ListBranchNode() : BranchNode(0x444444), length(0) {}
344 virtual bool operator==(const Node &other) const override;
345 virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
346 virtual void write(StringTrieBuilder &builder) override;
347 // Adds a unit with a final value.
348 void add(int32_t c, int32_t value) {
349 units[length]=(char16_t)c;
350 equal[length]=nullptr;
351 values[length]=value;
352 ++length;
353 hash=(hash*37u+c)*37u+value;
354 }
355 // Adds a unit which leads to another match node.
356 void add(int32_t c, Node *node) {
357 units[length]=(char16_t)c;
358 equal[length]=node;
359 values[length]=0;
360 ++length;
361 hash=(hash*37u+c)*37u+hashCode(node);
362 }
363 protected:
364 Node *equal[kMaxBranchLinearSubNodeLength]; // nullptr means "has final value".
365 int32_t length;
366 int32_t values[kMaxBranchLinearSubNodeLength];
367 char16_t units[kMaxBranchLinearSubNodeLength];
368 };
369
370 /**
371 * @internal
372 */
373 class SplitBranchNode : public BranchNode {
374 public:
375 SplitBranchNode(char16_t middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
376 : BranchNode(((0x555555u*37u+middleUnit)*37u+
377 hashCode(lessThanNode))*37u+hashCode(greaterOrEqualNode)),
378 unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
379 virtual bool operator==(const Node &other) const override;
380 virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
381 virtual void write(StringTrieBuilder &builder) override;
382 protected:
383 char16_t unit;
384 Node *lessThan;
385 Node *greaterOrEqual;
386 };
387
388 // Branch head node, for writing the actual node lead unit.
389 /** @internal */
390 class BranchHeadNode : public ValueNode {
391 public:
392 BranchHeadNode(int32_t len, Node *subNode)
393 : ValueNode((0x666666u*37u+len)*37u+hashCode(subNode)),
394 length(len), next(subNode) {}
395 virtual bool operator==(const Node &other) const override;
396 virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
397 virtual void write(StringTrieBuilder &builder) override;
398 protected:
399 int32_t length;
400 Node *next; // A branch sub-node.
401 };
402
403#endif /* U_HIDE_INTERNAL_API */
404 /// \endcond
405
406 /** @internal */
407 virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
408 Node *nextNode) const = 0;
409
410 /** @internal */
411 virtual int32_t write(int32_t unit) = 0;
412 /** @internal */
413 virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
414 /** @internal */
415 virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
416 /** @internal */
417 virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
418 /** @internal */
419 virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
420};
421
422U_NAMESPACE_END
423
424#endif /* U_SHOW_CPLUSPLUS_API */
425
426#endif // __STRINGTRIEBUILDER_H__
427