1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2010-2011, International Business Machines
6* Corporation and others. All Rights Reserved.
7*******************************************************************************
8* file name: ucharstrie.h
9* encoding: UTF-8
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2010nov14
14* created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18#include "unicode/appendable.h"
19#include "unicode/ucharstrie.h"
20#include "unicode/uobject.h"
21#include "unicode/utf16.h"
22#include "cmemory.h"
23#include "uassert.h"
24
25U_NAMESPACE_BEGIN
26
27UCharsTrie::~UCharsTrie() {
28 uprv_free(ownedArray_);
29}
30
31UStringTrieResult
32UCharsTrie::current() const {
33 const char16_t *pos=pos_;
34 if(pos==nullptr) {
35 return USTRINGTRIE_NO_MATCH;
36 } else {
37 int32_t node;
38 return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ?
39 valueResult(node) : USTRINGTRIE_NO_VALUE;
40 }
41}
42
43UStringTrieResult
44UCharsTrie::firstForCodePoint(UChar32 cp) {
45 return cp<=0xffff ?
46 first(cp) :
47 (USTRINGTRIE_HAS_NEXT(first(U16_LEAD(cp))) ?
48 next(U16_TRAIL(cp)) :
49 USTRINGTRIE_NO_MATCH);
50}
51
52UStringTrieResult
53UCharsTrie::nextForCodePoint(UChar32 cp) {
54 return cp<=0xffff ?
55 next(cp) :
56 (USTRINGTRIE_HAS_NEXT(next(U16_LEAD(cp))) ?
57 next(U16_TRAIL(cp)) :
58 USTRINGTRIE_NO_MATCH);
59}
60
61UStringTrieResult
62UCharsTrie::branchNext(const char16_t *pos, int32_t length, int32_t uchar) {
63 // Branch according to the current unit.
64 if(length==0) {
65 length=*pos++;
66 }
67 ++length;
68 // The length of the branch is the number of units to select from.
69 // The data structure encodes a binary search.
70 while(length>kMaxBranchLinearSubNodeLength) {
71 if(uchar<*pos++) {
72 length>>=1;
73 pos=jumpByDelta(pos);
74 } else {
75 length=length-(length>>1);
76 pos=skipDelta(pos);
77 }
78 }
79 // Drop down to linear search for the last few units.
80 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
81 // and divides length by 2.
82 do {
83 if(uchar==*pos++) {
84 UStringTrieResult result;
85 int32_t node=*pos;
86 if(node&kValueIsFinal) {
87 // Leave the final value for getValue() to read.
88 result=USTRINGTRIE_FINAL_VALUE;
89 } else {
90 // Use the non-final value as the jump delta.
91 ++pos;
92 // int32_t delta=readValue(pos, node);
93 int32_t delta;
94 if(node<kMinTwoUnitValueLead) {
95 delta=node;
96 } else if(node<kThreeUnitValueLead) {
97 delta=((node-kMinTwoUnitValueLead)<<16)|*pos++;
98 } else {
99 delta=(pos[0]<<16)|pos[1];
100 pos+=2;
101 }
102 // end readValue()
103 pos+=delta;
104 node=*pos;
105 result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
106 }
107 pos_=pos;
108 return result;
109 }
110 --length;
111 pos=skipValue(pos);
112 } while(length>1);
113 if(uchar==*pos++) {
114 pos_=pos;
115 int32_t node=*pos;
116 return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
117 } else {
118 stop();
119 return USTRINGTRIE_NO_MATCH;
120 }
121}
122
123UStringTrieResult
124UCharsTrie::nextImpl(const char16_t *pos, int32_t uchar) {
125 int32_t node=*pos++;
126 for(;;) {
127 if(node<kMinLinearMatch) {
128 return branchNext(pos, node, uchar);
129 } else if(node<kMinValueLead) {
130 // Match the first of length+1 units.
131 int32_t length=node-kMinLinearMatch; // Actual match length minus 1.
132 if(uchar==*pos++) {
133 remainingMatchLength_=--length;
134 pos_=pos;
135 return (length<0 && (node=*pos)>=kMinValueLead) ?
136 valueResult(node) : USTRINGTRIE_NO_VALUE;
137 } else {
138 // No match.
139 break;
140 }
141 } else if(node&kValueIsFinal) {
142 // No further matching units.
143 break;
144 } else {
145 // Skip intermediate value.
146 pos=skipNodeValue(pos, node);
147 node&=kNodeTypeMask;
148 }
149 }
150 stop();
151 return USTRINGTRIE_NO_MATCH;
152}
153
154UStringTrieResult
155UCharsTrie::next(int32_t uchar) {
156 const char16_t *pos=pos_;
157 if(pos==nullptr) {
158 return USTRINGTRIE_NO_MATCH;
159 }
160 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
161 if(length>=0) {
162 // Remaining part of a linear-match node.
163 if(uchar==*pos++) {
164 remainingMatchLength_=--length;
165 pos_=pos;
166 int32_t node;
167 return (length<0 && (node=*pos)>=kMinValueLead) ?
168 valueResult(node) : USTRINGTRIE_NO_VALUE;
169 } else {
170 stop();
171 return USTRINGTRIE_NO_MATCH;
172 }
173 }
174 return nextImpl(pos, uchar);
175}
176
177UStringTrieResult
178UCharsTrie::next(ConstChar16Ptr ptr, int32_t sLength) {
179 const char16_t *s=ptr;
180 if(sLength<0 ? *s==0 : sLength==0) {
181 // Empty input.
182 return current();
183 }
184 const char16_t *pos=pos_;
185 if(pos==nullptr) {
186 return USTRINGTRIE_NO_MATCH;
187 }
188 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
189 for(;;) {
190 // Fetch the next input unit, if there is one.
191 // Continue a linear-match node without rechecking sLength<0.
192 int32_t uchar;
193 if(sLength<0) {
194 for(;;) {
195 if((uchar=*s++)==0) {
196 remainingMatchLength_=length;
197 pos_=pos;
198 int32_t node;
199 return (length<0 && (node=*pos)>=kMinValueLead) ?
200 valueResult(node) : USTRINGTRIE_NO_VALUE;
201 }
202 if(length<0) {
203 remainingMatchLength_=length;
204 break;
205 }
206 if(uchar!=*pos) {
207 stop();
208 return USTRINGTRIE_NO_MATCH;
209 }
210 ++pos;
211 --length;
212 }
213 } else {
214 for(;;) {
215 if(sLength==0) {
216 remainingMatchLength_=length;
217 pos_=pos;
218 int32_t node;
219 return (length<0 && (node=*pos)>=kMinValueLead) ?
220 valueResult(node) : USTRINGTRIE_NO_VALUE;
221 }
222 uchar=*s++;
223 --sLength;
224 if(length<0) {
225 remainingMatchLength_=length;
226 break;
227 }
228 if(uchar!=*pos) {
229 stop();
230 return USTRINGTRIE_NO_MATCH;
231 }
232 ++pos;
233 --length;
234 }
235 }
236 int32_t node=*pos++;
237 for(;;) {
238 if(node<kMinLinearMatch) {
239 UStringTrieResult result=branchNext(pos, node, uchar);
240 if(result==USTRINGTRIE_NO_MATCH) {
241 return USTRINGTRIE_NO_MATCH;
242 }
243 // Fetch the next input unit, if there is one.
244 if(sLength<0) {
245 if((uchar=*s++)==0) {
246 return result;
247 }
248 } else {
249 if(sLength==0) {
250 return result;
251 }
252 uchar=*s++;
253 --sLength;
254 }
255 if(result==USTRINGTRIE_FINAL_VALUE) {
256 // No further matching units.
257 stop();
258 return USTRINGTRIE_NO_MATCH;
259 }
260 pos=pos_; // branchNext() advanced pos and wrote it to pos_ .
261 node=*pos++;
262 } else if(node<kMinValueLead) {
263 // Match length+1 units.
264 length=node-kMinLinearMatch; // Actual match length minus 1.
265 if(uchar!=*pos) {
266 stop();
267 return USTRINGTRIE_NO_MATCH;
268 }
269 ++pos;
270 --length;
271 break;
272 } else if(node&kValueIsFinal) {
273 // No further matching units.
274 stop();
275 return USTRINGTRIE_NO_MATCH;
276 } else {
277 // Skip intermediate value.
278 pos=skipNodeValue(pos, node);
279 node&=kNodeTypeMask;
280 }
281 }
282 }
283}
284
285const char16_t *
286UCharsTrie::findUniqueValueFromBranch(const char16_t *pos, int32_t length,
287 UBool haveUniqueValue, int32_t &uniqueValue) {
288 while(length>kMaxBranchLinearSubNodeLength) {
289 ++pos; // ignore the comparison unit
290 if(nullptr==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
291 return nullptr;
292 }
293 length=length-(length>>1);
294 pos=skipDelta(pos);
295 }
296 do {
297 ++pos; // ignore a comparison unit
298 // handle its value
299 int32_t node=*pos++;
300 UBool isFinal=(UBool)(node>>15);
301 node&=0x7fff;
302 int32_t value=readValue(pos, node);
303 pos=skipValue(pos, node);
304 if(isFinal) {
305 if(haveUniqueValue) {
306 if(value!=uniqueValue) {
307 return nullptr;
308 }
309 } else {
310 uniqueValue=value;
311 haveUniqueValue=true;
312 }
313 } else {
314 if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
315 return nullptr;
316 }
317 haveUniqueValue=true;
318 }
319 } while(--length>1);
320 return pos+1; // ignore the last comparison unit
321}
322
323UBool
324UCharsTrie::findUniqueValue(const char16_t *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
325 int32_t node=*pos++;
326 for(;;) {
327 if(node<kMinLinearMatch) {
328 if(node==0) {
329 node=*pos++;
330 }
331 pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
332 if(pos==nullptr) {
333 return false;
334 }
335 haveUniqueValue=true;
336 node=*pos++;
337 } else if(node<kMinValueLead) {
338 // linear-match node
339 pos+=node-kMinLinearMatch+1; // Ignore the match units.
340 node=*pos++;
341 } else {
342 UBool isFinal=(UBool)(node>>15);
343 int32_t value;
344 if(isFinal) {
345 value=readValue(pos, node&0x7fff);
346 } else {
347 value=readNodeValue(pos, node);
348 }
349 if(haveUniqueValue) {
350 if(value!=uniqueValue) {
351 return false;
352 }
353 } else {
354 uniqueValue=value;
355 haveUniqueValue=true;
356 }
357 if(isFinal) {
358 return true;
359 }
360 pos=skipNodeValue(pos, node);
361 node&=kNodeTypeMask;
362 }
363 }
364}
365
366int32_t
367UCharsTrie::getNextUChars(Appendable &out) const {
368 const char16_t *pos=pos_;
369 if(pos==nullptr) {
370 return 0;
371 }
372 if(remainingMatchLength_>=0) {
373 out.appendCodeUnit(*pos); // Next unit of a pending linear-match node.
374 return 1;
375 }
376 int32_t node=*pos++;
377 if(node>=kMinValueLead) {
378 if(node&kValueIsFinal) {
379 return 0;
380 } else {
381 pos=skipNodeValue(pos, node);
382 node&=kNodeTypeMask;
383 }
384 }
385 if(node<kMinLinearMatch) {
386 if(node==0) {
387 node=*pos++;
388 }
389 out.reserveAppendCapacity(++node);
390 getNextBranchUChars(pos, node, out);
391 return node;
392 } else {
393 // First unit of the linear-match node.
394 out.appendCodeUnit(*pos);
395 return 1;
396 }
397}
398
399void
400UCharsTrie::getNextBranchUChars(const char16_t *pos, int32_t length, Appendable &out) {
401 while(length>kMaxBranchLinearSubNodeLength) {
402 ++pos; // ignore the comparison unit
403 getNextBranchUChars(jumpByDelta(pos), length>>1, out);
404 length=length-(length>>1);
405 pos=skipDelta(pos);
406 }
407 do {
408 out.appendCodeUnit(*pos++);
409 pos=skipValue(pos);
410 } while(--length>1);
411 out.appendCodeUnit(*pos);
412}
413
414U_NAMESPACE_END
415