1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ******************************************************************************* |
5 | * Copyright (C) 2013-2014, International Business Machines |
6 | * Corporation and others. All Rights Reserved. |
7 | ******************************************************************************* |
8 | * collationrootelements.cpp |
9 | * |
10 | * created on: 2013mar05 |
11 | * created by: Markus W. Scherer |
12 | */ |
13 | |
14 | #include "unicode/utypes.h" |
15 | |
16 | #if !UCONFIG_NO_COLLATION |
17 | |
18 | #include "collation.h" |
19 | #include "collationrootelements.h" |
20 | #include "uassert.h" |
21 | |
22 | U_NAMESPACE_BEGIN |
23 | |
24 | int64_t |
25 | CollationRootElements::lastCEWithPrimaryBefore(uint32_t p) const { |
26 | if(p == 0) { return 0; } |
27 | U_ASSERT(p > elements[elements[IX_FIRST_PRIMARY_INDEX]]); |
28 | int32_t index = findP(p); |
29 | uint32_t q = elements[index]; |
30 | uint32_t secTer; |
31 | if(p == (q & 0xffffff00)) { |
32 | // p == elements[index] is a root primary. Find the CE before it. |
33 | // We must not be in a primary range. |
34 | U_ASSERT((q & PRIMARY_STEP_MASK) == 0); |
35 | secTer = elements[index - 1]; |
36 | if((secTer & SEC_TER_DELTA_FLAG) == 0) { |
37 | // Primary CE just before p. |
38 | p = secTer & 0xffffff00; |
39 | secTer = Collation::COMMON_SEC_AND_TER_CE; |
40 | } else { |
41 | // secTer = last secondary & tertiary for the previous primary |
42 | index -= 2; |
43 | for(;;) { |
44 | p = elements[index]; |
45 | if((p & SEC_TER_DELTA_FLAG) == 0) { |
46 | p &= 0xffffff00; |
47 | break; |
48 | } |
49 | --index; |
50 | } |
51 | } |
52 | } else { |
53 | // p > elements[index] which is the previous primary. |
54 | // Find the last secondary & tertiary weights for it. |
55 | p = q & 0xffffff00; |
56 | secTer = Collation::COMMON_SEC_AND_TER_CE; |
57 | for(;;) { |
58 | q = elements[++index]; |
59 | if((q & SEC_TER_DELTA_FLAG) == 0) { |
60 | // We must not be in a primary range. |
61 | U_ASSERT((q & PRIMARY_STEP_MASK) == 0); |
62 | break; |
63 | } |
64 | secTer = q; |
65 | } |
66 | } |
67 | return ((int64_t)p << 32) | (secTer & ~SEC_TER_DELTA_FLAG); |
68 | } |
69 | |
70 | int64_t |
71 | CollationRootElements::firstCEWithPrimaryAtLeast(uint32_t p) const { |
72 | if(p == 0) { return 0; } |
73 | int32_t index = findP(p); |
74 | if(p != (elements[index] & 0xffffff00)) { |
75 | for(;;) { |
76 | p = elements[++index]; |
77 | if((p & SEC_TER_DELTA_FLAG) == 0) { |
78 | // First primary after p. We must not be in a primary range. |
79 | U_ASSERT((p & PRIMARY_STEP_MASK) == 0); |
80 | break; |
81 | } |
82 | } |
83 | } |
84 | // The code above guarantees that p has at most 3 bytes: (p & 0xff) == 0. |
85 | return ((int64_t)p << 32) | Collation::COMMON_SEC_AND_TER_CE; |
86 | } |
87 | |
88 | uint32_t |
89 | CollationRootElements::getPrimaryBefore(uint32_t p, UBool isCompressible) const { |
90 | int32_t index = findPrimary(p); |
91 | int32_t step; |
92 | uint32_t q = elements[index]; |
93 | if(p == (q & 0xffffff00)) { |
94 | // Found p itself. Return the previous primary. |
95 | // See if p is at the end of a previous range. |
96 | step = (int32_t)q & PRIMARY_STEP_MASK; |
97 | if(step == 0) { |
98 | // p is not at the end of a range. Look for the previous primary. |
99 | do { |
100 | p = elements[--index]; |
101 | } while((p & SEC_TER_DELTA_FLAG) != 0); |
102 | return p & 0xffffff00; |
103 | } |
104 | } else { |
105 | // p is in a range, and not at the start. |
106 | uint32_t nextElement = elements[index + 1]; |
107 | U_ASSERT(isEndOfPrimaryRange(nextElement)); |
108 | step = (int32_t)nextElement & PRIMARY_STEP_MASK; |
109 | } |
110 | // Return the previous range primary. |
111 | if((p & 0xffff) == 0) { |
112 | return Collation::decTwoBytePrimaryByOneStep(p, isCompressible, step); |
113 | } else { |
114 | return Collation::decThreeBytePrimaryByOneStep(p, isCompressible, step); |
115 | } |
116 | } |
117 | |
118 | uint32_t |
119 | CollationRootElements::getSecondaryBefore(uint32_t p, uint32_t s) const { |
120 | int32_t index; |
121 | uint32_t previousSec, sec; |
122 | if(p == 0) { |
123 | index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX]; |
124 | // Gap at the beginning of the secondary CE range. |
125 | previousSec = 0; |
126 | sec = elements[index] >> 16; |
127 | } else { |
128 | index = findPrimary(p) + 1; |
129 | previousSec = Collation::BEFORE_WEIGHT16; |
130 | sec = getFirstSecTerForPrimary(index) >> 16; |
131 | } |
132 | U_ASSERT(s >= sec); |
133 | while(s > sec) { |
134 | previousSec = sec; |
135 | U_ASSERT((elements[index] & SEC_TER_DELTA_FLAG) != 0); |
136 | sec = elements[index++] >> 16; |
137 | } |
138 | U_ASSERT(sec == s); |
139 | return previousSec; |
140 | } |
141 | |
142 | uint32_t |
143 | CollationRootElements::getTertiaryBefore(uint32_t p, uint32_t s, uint32_t t) const { |
144 | U_ASSERT((t & ~Collation::ONLY_TERTIARY_MASK) == 0); |
145 | int32_t index; |
146 | uint32_t previousTer, secTer; |
147 | if(p == 0) { |
148 | if(s == 0) { |
149 | index = (int32_t)elements[IX_FIRST_TERTIARY_INDEX]; |
150 | // Gap at the beginning of the tertiary CE range. |
151 | previousTer = 0; |
152 | } else { |
153 | index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX]; |
154 | previousTer = Collation::BEFORE_WEIGHT16; |
155 | } |
156 | secTer = elements[index] & ~SEC_TER_DELTA_FLAG; |
157 | } else { |
158 | index = findPrimary(p) + 1; |
159 | previousTer = Collation::BEFORE_WEIGHT16; |
160 | secTer = getFirstSecTerForPrimary(index); |
161 | } |
162 | uint32_t st = (s << 16) | t; |
163 | while(st > secTer) { |
164 | if((secTer >> 16) == s) { previousTer = secTer; } |
165 | U_ASSERT((elements[index] & SEC_TER_DELTA_FLAG) != 0); |
166 | secTer = elements[index++] & ~SEC_TER_DELTA_FLAG; |
167 | } |
168 | U_ASSERT(secTer == st); |
169 | return previousTer & 0xffff; |
170 | } |
171 | |
172 | uint32_t |
173 | CollationRootElements::getPrimaryAfter(uint32_t p, int32_t index, UBool isCompressible) const { |
174 | U_ASSERT(p == (elements[index] & 0xffffff00) || isEndOfPrimaryRange(elements[index + 1])); |
175 | uint32_t q = elements[++index]; |
176 | int32_t step; |
177 | if((q & SEC_TER_DELTA_FLAG) == 0 && (step = (int32_t)q & PRIMARY_STEP_MASK) != 0) { |
178 | // Return the next primary in this range. |
179 | if((p & 0xffff) == 0) { |
180 | return Collation::incTwoBytePrimaryByOffset(p, isCompressible, step); |
181 | } else { |
182 | return Collation::incThreeBytePrimaryByOffset(p, isCompressible, step); |
183 | } |
184 | } else { |
185 | // Return the next primary in the list. |
186 | while((q & SEC_TER_DELTA_FLAG) != 0) { |
187 | q = elements[++index]; |
188 | } |
189 | U_ASSERT((q & PRIMARY_STEP_MASK) == 0); |
190 | return q; |
191 | } |
192 | } |
193 | |
194 | uint32_t |
195 | CollationRootElements::getSecondaryAfter(int32_t index, uint32_t s) const { |
196 | uint32_t secTer; |
197 | uint32_t secLimit; |
198 | if(index == 0) { |
199 | // primary = 0 |
200 | U_ASSERT(s != 0); |
201 | index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX]; |
202 | secTer = elements[index]; |
203 | // Gap at the end of the secondary CE range. |
204 | secLimit = 0x10000; |
205 | } else { |
206 | U_ASSERT(index >= (int32_t)elements[IX_FIRST_PRIMARY_INDEX]); |
207 | secTer = getFirstSecTerForPrimary(index + 1); |
208 | // If this is an explicit sec/ter unit, then it will be read once more. |
209 | // Gap for secondaries of primary CEs. |
210 | secLimit = getSecondaryBoundary(); |
211 | } |
212 | for(;;) { |
213 | uint32_t sec = secTer >> 16; |
214 | if(sec > s) { return sec; } |
215 | secTer = elements[++index]; |
216 | if((secTer & SEC_TER_DELTA_FLAG) == 0) { return secLimit; } |
217 | } |
218 | } |
219 | |
220 | uint32_t |
221 | CollationRootElements::getTertiaryAfter(int32_t index, uint32_t s, uint32_t t) const { |
222 | uint32_t secTer; |
223 | uint32_t terLimit; |
224 | if(index == 0) { |
225 | // primary = 0 |
226 | if(s == 0) { |
227 | U_ASSERT(t != 0); |
228 | index = (int32_t)elements[IX_FIRST_TERTIARY_INDEX]; |
229 | // Gap at the end of the tertiary CE range. |
230 | terLimit = 0x4000; |
231 | } else { |
232 | index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX]; |
233 | // Gap for tertiaries of primary/secondary CEs. |
234 | terLimit = getTertiaryBoundary(); |
235 | } |
236 | secTer = elements[index] & ~SEC_TER_DELTA_FLAG; |
237 | } else { |
238 | U_ASSERT(index >= (int32_t)elements[IX_FIRST_PRIMARY_INDEX]); |
239 | secTer = getFirstSecTerForPrimary(index + 1); |
240 | // If this is an explicit sec/ter unit, then it will be read once more. |
241 | terLimit = getTertiaryBoundary(); |
242 | } |
243 | uint32_t st = (s << 16) | t; |
244 | for(;;) { |
245 | if(secTer > st) { |
246 | U_ASSERT((secTer >> 16) == s); |
247 | return secTer & 0xffff; |
248 | } |
249 | secTer = elements[++index]; |
250 | // No tertiary greater than t for this primary+secondary. |
251 | if((secTer & SEC_TER_DELTA_FLAG) == 0 || (secTer >> 16) > s) { return terLimit; } |
252 | secTer &= ~SEC_TER_DELTA_FLAG; |
253 | } |
254 | } |
255 | |
256 | uint32_t |
257 | CollationRootElements::getFirstSecTerForPrimary(int32_t index) const { |
258 | uint32_t secTer = elements[index]; |
259 | if((secTer & SEC_TER_DELTA_FLAG) == 0) { |
260 | // No sec/ter delta. |
261 | return Collation::COMMON_SEC_AND_TER_CE; |
262 | } |
263 | secTer &= ~SEC_TER_DELTA_FLAG; |
264 | if(secTer > Collation::COMMON_SEC_AND_TER_CE) { |
265 | // Implied sec/ter. |
266 | return Collation::COMMON_SEC_AND_TER_CE; |
267 | } |
268 | // Explicit sec/ter below common/common. |
269 | return secTer; |
270 | } |
271 | |
272 | int32_t |
273 | CollationRootElements::findPrimary(uint32_t p) const { |
274 | // Requirement: p must occur as a root primary. |
275 | U_ASSERT((p & 0xff) == 0); // at most a 3-byte primary |
276 | int32_t index = findP(p); |
277 | // If p is in a range, then we just assume that p is an actual primary in this range. |
278 | // (Too cumbersome/expensive to check.) |
279 | // Otherwise, it must be an exact match. |
280 | U_ASSERT(isEndOfPrimaryRange(elements[index + 1]) || p == (elements[index] & 0xffffff00)); |
281 | return index; |
282 | } |
283 | |
284 | int32_t |
285 | CollationRootElements::findP(uint32_t p) const { |
286 | // p need not occur as a root primary. |
287 | // For example, it might be a reordering group boundary. |
288 | U_ASSERT((p >> 24) != Collation::UNASSIGNED_IMPLICIT_BYTE); |
289 | // modified binary search |
290 | int32_t start = (int32_t)elements[IX_FIRST_PRIMARY_INDEX]; |
291 | U_ASSERT(p >= elements[start]); |
292 | int32_t limit = length - 1; |
293 | U_ASSERT(elements[limit] >= PRIMARY_SENTINEL); |
294 | U_ASSERT(p < elements[limit]); |
295 | while((start + 1) < limit) { |
296 | // Invariant: elements[start] and elements[limit] are primaries, |
297 | // and elements[start]<=p<=elements[limit]. |
298 | int32_t i = (start + limit) / 2; |
299 | uint32_t q = elements[i]; |
300 | if((q & SEC_TER_DELTA_FLAG) != 0) { |
301 | // Find the next primary. |
302 | int32_t j = i + 1; |
303 | for(;;) { |
304 | if(j == limit) { break; } |
305 | q = elements[j]; |
306 | if((q & SEC_TER_DELTA_FLAG) == 0) { |
307 | i = j; |
308 | break; |
309 | } |
310 | ++j; |
311 | } |
312 | if((q & SEC_TER_DELTA_FLAG) != 0) { |
313 | // Find the preceding primary. |
314 | j = i - 1; |
315 | for(;;) { |
316 | if(j == start) { break; } |
317 | q = elements[j]; |
318 | if((q & SEC_TER_DELTA_FLAG) == 0) { |
319 | i = j; |
320 | break; |
321 | } |
322 | --j; |
323 | } |
324 | if((q & SEC_TER_DELTA_FLAG) != 0) { |
325 | // No primary between start and limit. |
326 | break; |
327 | } |
328 | } |
329 | } |
330 | if(p < (q & 0xffffff00)) { // Reset the "step" bits of a range end primary. |
331 | limit = i; |
332 | } else { |
333 | start = i; |
334 | } |
335 | } |
336 | return start; |
337 | } |
338 | |
339 | U_NAMESPACE_END |
340 | |
341 | #endif // !UCONFIG_NO_COLLATION |
342 | |