1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ******************************************************************************* |
5 | * |
6 | * Copyright (C) 2002-2011, International Business Machines |
7 | * Corporation and others. All Rights Reserved. |
8 | * |
9 | ******************************************************************************* |
10 | * file name: propsvec.c |
11 | * encoding: UTF-8 |
12 | * tab size: 8 (not used) |
13 | * indentation:4 |
14 | * |
15 | * created on: 2002feb22 |
16 | * created by: Markus W. Scherer |
17 | * |
18 | * Store bits (Unicode character properties) in bit set vectors. |
19 | */ |
20 | |
21 | #include <stdlib.h> |
22 | #include "unicode/utypes.h" |
23 | #include "cmemory.h" |
24 | #include "utrie.h" |
25 | #include "utrie2.h" |
26 | #include "uarrsort.h" |
27 | #include "propsvec.h" |
28 | #include "uassert.h" |
29 | |
30 | struct UPropsVectors { |
31 | uint32_t *v; |
32 | int32_t columns; /* number of columns, plus two for start & limit values */ |
33 | int32_t maxRows; |
34 | int32_t rows; |
35 | int32_t prevRow; /* search optimization: remember last row seen */ |
36 | UBool isCompacted; |
37 | }; |
38 | |
39 | #define UPVEC_INITIAL_ROWS (1<<12) |
40 | #define UPVEC_MEDIUM_ROWS ((int32_t)1<<16) |
41 | #define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1) |
42 | |
43 | U_CAPI UPropsVectors * U_EXPORT2 |
44 | upvec_open(int32_t columns, UErrorCode *pErrorCode) { |
45 | UPropsVectors *pv; |
46 | uint32_t *v, *row; |
47 | uint32_t cp; |
48 | |
49 | if(U_FAILURE(*pErrorCode)) { |
50 | return NULL; |
51 | } |
52 | if(columns<1) { |
53 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
54 | return NULL; |
55 | } |
56 | columns+=2; /* count range start and limit columns */ |
57 | |
58 | pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors)); |
59 | v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4); |
60 | if(pv==NULL || v==NULL) { |
61 | uprv_free(pv); |
62 | uprv_free(v); |
63 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
64 | return NULL; |
65 | } |
66 | uprv_memset(pv, 0, sizeof(UPropsVectors)); |
67 | pv->v=v; |
68 | pv->columns=columns; |
69 | pv->maxRows=UPVEC_INITIAL_ROWS; |
70 | pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP); |
71 | |
72 | /* set the all-Unicode row and the special-value rows */ |
73 | row=pv->v; |
74 | uprv_memset(row, 0, pv->rows*columns*4); |
75 | row[0]=0; |
76 | row[1]=0x110000; |
77 | row+=columns; |
78 | for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) { |
79 | row[0]=cp; |
80 | row[1]=cp+1; |
81 | row+=columns; |
82 | } |
83 | return pv; |
84 | } |
85 | |
86 | U_CAPI void U_EXPORT2 |
87 | upvec_close(UPropsVectors *pv) { |
88 | if(pv!=NULL) { |
89 | uprv_free(pv->v); |
90 | uprv_free(pv); |
91 | } |
92 | } |
93 | |
94 | static uint32_t * |
95 | _findRow(UPropsVectors *pv, UChar32 rangeStart) { |
96 | uint32_t *row; |
97 | int32_t columns, i, start, limit, prevRow; |
98 | |
99 | columns=pv->columns; |
100 | limit=pv->rows; |
101 | prevRow=pv->prevRow; |
102 | |
103 | /* check the vicinity of the last-seen row (start searching with an unrolled loop) */ |
104 | row=pv->v+prevRow*columns; |
105 | if(rangeStart>=(UChar32)row[0]) { |
106 | if(rangeStart<(UChar32)row[1]) { |
107 | /* same row as last seen */ |
108 | return row; |
109 | } else if(rangeStart<(UChar32)(row+=columns)[1]) { |
110 | /* next row after the last one */ |
111 | pv->prevRow=prevRow+1; |
112 | return row; |
113 | } else if(rangeStart<(UChar32)(row+=columns)[1]) { |
114 | /* second row after the last one */ |
115 | pv->prevRow=prevRow+2; |
116 | return row; |
117 | } else if((rangeStart-(UChar32)row[1])<10) { |
118 | /* we are close, continue looping */ |
119 | prevRow+=2; |
120 | do { |
121 | ++prevRow; |
122 | row+=columns; |
123 | } while(rangeStart>=(UChar32)row[1]); |
124 | pv->prevRow=prevRow; |
125 | return row; |
126 | } |
127 | } else if(rangeStart<(UChar32)pv->v[1]) { |
128 | /* the very first row */ |
129 | pv->prevRow=0; |
130 | return pv->v; |
131 | } |
132 | |
133 | /* do a binary search for the start of the range */ |
134 | start=0; |
135 | while(start<limit-1) { |
136 | i=(start+limit)/2; |
137 | row=pv->v+i*columns; |
138 | if(rangeStart<(UChar32)row[0]) { |
139 | limit=i; |
140 | } else if(rangeStart<(UChar32)row[1]) { |
141 | pv->prevRow=i; |
142 | return row; |
143 | } else { |
144 | start=i; |
145 | } |
146 | } |
147 | |
148 | /* must be found because all ranges together always cover all of Unicode */ |
149 | pv->prevRow=start; |
150 | return pv->v+start*columns; |
151 | } |
152 | |
153 | U_CAPI void U_EXPORT2 |
154 | upvec_setValue(UPropsVectors *pv, |
155 | UChar32 start, UChar32 end, |
156 | int32_t column, |
157 | uint32_t value, uint32_t mask, |
158 | UErrorCode *pErrorCode) { |
159 | uint32_t *firstRow, *lastRow; |
160 | int32_t columns; |
161 | UChar32 limit; |
162 | UBool splitFirstRow, splitLastRow; |
163 | |
164 | /* argument checking */ |
165 | if(U_FAILURE(*pErrorCode)) { |
166 | return; |
167 | } |
168 | if( pv==NULL || |
169 | start<0 || start>end || end>UPVEC_MAX_CP || |
170 | column<0 || column>=(pv->columns-2) |
171 | ) { |
172 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
173 | return; |
174 | } |
175 | if(pv->isCompacted) { |
176 | *pErrorCode=U_NO_WRITE_PERMISSION; |
177 | return; |
178 | } |
179 | limit=end+1; |
180 | |
181 | /* initialize */ |
182 | columns=pv->columns; |
183 | column+=2; /* skip range start and limit columns */ |
184 | value&=mask; |
185 | |
186 | /* find the rows whose ranges overlap with the input range */ |
187 | |
188 | /* find the first and last rows, always successful */ |
189 | firstRow=_findRow(pv, start); |
190 | lastRow=_findRow(pv, end); |
191 | |
192 | /* |
193 | * Rows need to be split if they partially overlap with the |
194 | * input range (only possible for the first and last rows) |
195 | * and if their value differs from the input value. |
196 | */ |
197 | splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask)); |
198 | splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask)); |
199 | |
200 | /* split first/last rows if necessary */ |
201 | if(splitFirstRow || splitLastRow) { |
202 | int32_t count, rows; |
203 | |
204 | rows=pv->rows; |
205 | if((rows+splitFirstRow+splitLastRow)>pv->maxRows) { |
206 | uint32_t *newVectors; |
207 | int32_t newMaxRows; |
208 | |
209 | if(pv->maxRows<UPVEC_MEDIUM_ROWS) { |
210 | newMaxRows=UPVEC_MEDIUM_ROWS; |
211 | } else if(pv->maxRows<UPVEC_MAX_ROWS) { |
212 | newMaxRows=UPVEC_MAX_ROWS; |
213 | } else { |
214 | /* Implementation bug, or UPVEC_MAX_ROWS too low. */ |
215 | *pErrorCode=U_INTERNAL_PROGRAM_ERROR; |
216 | return; |
217 | } |
218 | newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4); |
219 | if(newVectors==NULL) { |
220 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
221 | return; |
222 | } |
223 | uprv_memcpy(newVectors, pv->v, (size_t)rows*columns*4); |
224 | firstRow=newVectors+(firstRow-pv->v); |
225 | lastRow=newVectors+(lastRow-pv->v); |
226 | uprv_free(pv->v); |
227 | pv->v=newVectors; |
228 | pv->maxRows=newMaxRows; |
229 | } |
230 | |
231 | /* count the number of row cells to move after the last row, and move them */ |
232 | count = (int32_t)((pv->v+rows*columns)-(lastRow+columns)); |
233 | if(count>0) { |
234 | uprv_memmove( |
235 | lastRow+(1+splitFirstRow+splitLastRow)*columns, |
236 | lastRow+columns, |
237 | count*4); |
238 | } |
239 | pv->rows=rows+splitFirstRow+splitLastRow; |
240 | |
241 | /* split the first row, and move the firstRow pointer to the second part */ |
242 | if(splitFirstRow) { |
243 | /* copy all affected rows up one and move the lastRow pointer */ |
244 | count = (int32_t)((lastRow-firstRow)+columns); |
245 | uprv_memmove(firstRow+columns, firstRow, (size_t)count*4); |
246 | lastRow+=columns; |
247 | |
248 | /* split the range and move the firstRow pointer */ |
249 | firstRow[1]=firstRow[columns]=(uint32_t)start; |
250 | firstRow+=columns; |
251 | } |
252 | |
253 | /* split the last row */ |
254 | if(splitLastRow) { |
255 | /* copy the last row data */ |
256 | uprv_memcpy(lastRow+columns, lastRow, (size_t)columns*4); |
257 | |
258 | /* split the range and move the firstRow pointer */ |
259 | lastRow[1]=lastRow[columns]=(uint32_t)limit; |
260 | } |
261 | } |
262 | |
263 | /* set the "row last seen" to the last row for the range */ |
264 | pv->prevRow=(int32_t)((lastRow-(pv->v))/columns); |
265 | |
266 | /* set the input value in all remaining rows */ |
267 | firstRow+=column; |
268 | lastRow+=column; |
269 | mask=~mask; |
270 | for(;;) { |
271 | *firstRow=(*firstRow&mask)|value; |
272 | if(firstRow==lastRow) { |
273 | break; |
274 | } |
275 | firstRow+=columns; |
276 | } |
277 | } |
278 | |
279 | U_CAPI uint32_t U_EXPORT2 |
280 | upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) { |
281 | uint32_t *row; |
282 | UPropsVectors *ncpv; |
283 | |
284 | if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) { |
285 | return 0; |
286 | } |
287 | ncpv=(UPropsVectors *)pv; |
288 | row=_findRow(ncpv, c); |
289 | return row[2+column]; |
290 | } |
291 | |
292 | U_CAPI uint32_t * U_EXPORT2 |
293 | upvec_getRow(const UPropsVectors *pv, int32_t rowIndex, |
294 | UChar32 *pRangeStart, UChar32 *pRangeEnd) { |
295 | uint32_t *row; |
296 | int32_t columns; |
297 | |
298 | if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) { |
299 | return NULL; |
300 | } |
301 | |
302 | columns=pv->columns; |
303 | row=pv->v+rowIndex*columns; |
304 | if(pRangeStart!=NULL) { |
305 | *pRangeStart=(UChar32)row[0]; |
306 | } |
307 | if(pRangeEnd!=NULL) { |
308 | *pRangeEnd=(UChar32)row[1]-1; |
309 | } |
310 | return row+2; |
311 | } |
312 | |
313 | static int32_t U_CALLCONV |
314 | upvec_compareRows(const void *context, const void *l, const void *r) { |
315 | const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r; |
316 | const UPropsVectors *pv=(const UPropsVectors *)context; |
317 | int32_t i, count, columns; |
318 | |
319 | count=columns=pv->columns; /* includes start/limit columns */ |
320 | |
321 | /* start comparing after start/limit but wrap around to them */ |
322 | i=2; |
323 | do { |
324 | if(left[i]!=right[i]) { |
325 | return left[i]<right[i] ? -1 : 1; |
326 | } |
327 | if(++i==columns) { |
328 | i=0; |
329 | } |
330 | } while(--count>0); |
331 | |
332 | return 0; |
333 | } |
334 | |
335 | U_CAPI void U_EXPORT2 |
336 | upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) { |
337 | uint32_t *row; |
338 | int32_t i, columns, valueColumns, rows, count; |
339 | UChar32 start, limit; |
340 | |
341 | /* argument checking */ |
342 | if(U_FAILURE(*pErrorCode)) { |
343 | return; |
344 | } |
345 | if(handler==NULL) { |
346 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
347 | return; |
348 | } |
349 | if(pv->isCompacted) { |
350 | return; |
351 | } |
352 | |
353 | /* Set the flag now: Sorting and compacting destroys the builder data structure. */ |
354 | pv->isCompacted=TRUE; |
355 | |
356 | rows=pv->rows; |
357 | columns=pv->columns; |
358 | U_ASSERT(columns>=3); /* upvec_open asserts this */ |
359 | valueColumns=columns-2; /* not counting start & limit */ |
360 | |
361 | /* sort the properties vectors to find unique vector values */ |
362 | uprv_sortArray(pv->v, rows, columns*4, |
363 | upvec_compareRows, pv, FALSE, pErrorCode); |
364 | if(U_FAILURE(*pErrorCode)) { |
365 | return; |
366 | } |
367 | |
368 | /* |
369 | * Find and set the special values. |
370 | * This has to do almost the same work as the compaction below, |
371 | * to find the indexes where the special-value rows will move. |
372 | */ |
373 | row=pv->v; |
374 | count=-valueColumns; |
375 | for(i=0; i<rows; ++i) { |
376 | start=(UChar32)row[0]; |
377 | |
378 | /* count a new values vector if it is different from the current one */ |
379 | if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) { |
380 | count+=valueColumns; |
381 | } |
382 | |
383 | if(start>=UPVEC_FIRST_SPECIAL_CP) { |
384 | handler(context, start, start, count, row+2, valueColumns, pErrorCode); |
385 | if(U_FAILURE(*pErrorCode)) { |
386 | return; |
387 | } |
388 | } |
389 | |
390 | row+=columns; |
391 | } |
392 | |
393 | /* count is at the beginning of the last vector, add valueColumns to include that last vector */ |
394 | count+=valueColumns; |
395 | |
396 | /* Call the handler once more to signal the start of delivering real values. */ |
397 | handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP, |
398 | count, row-valueColumns, valueColumns, pErrorCode); |
399 | if(U_FAILURE(*pErrorCode)) { |
400 | return; |
401 | } |
402 | |
403 | /* |
404 | * Move vector contents up to a contiguous array with only unique |
405 | * vector values, and call the handler function for each vector. |
406 | * |
407 | * This destroys the Properties Vector structure and replaces it |
408 | * with an array of just vector values. |
409 | */ |
410 | row=pv->v; |
411 | count=-valueColumns; |
412 | for(i=0; i<rows; ++i) { |
413 | /* fetch these first before memmove() may overwrite them */ |
414 | start=(UChar32)row[0]; |
415 | limit=(UChar32)row[1]; |
416 | |
417 | /* add a new values vector if it is different from the current one */ |
418 | if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) { |
419 | count+=valueColumns; |
420 | uprv_memmove(pv->v+count, row+2, (size_t)valueColumns*4); |
421 | } |
422 | |
423 | if(start<UPVEC_FIRST_SPECIAL_CP) { |
424 | handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode); |
425 | if(U_FAILURE(*pErrorCode)) { |
426 | return; |
427 | } |
428 | } |
429 | |
430 | row+=columns; |
431 | } |
432 | |
433 | /* count is at the beginning of the last vector, add one to include that last vector */ |
434 | pv->rows=count/valueColumns+1; |
435 | } |
436 | |
437 | U_CAPI const uint32_t * U_EXPORT2 |
438 | upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) { |
439 | if(!pv->isCompacted) { |
440 | return NULL; |
441 | } |
442 | if(pRows!=NULL) { |
443 | *pRows=pv->rows; |
444 | } |
445 | if(pColumns!=NULL) { |
446 | *pColumns=pv->columns-2; |
447 | } |
448 | return pv->v; |
449 | } |
450 | |
451 | U_CAPI uint32_t * U_EXPORT2 |
452 | upvec_cloneArray(const UPropsVectors *pv, |
453 | int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) { |
454 | uint32_t *clonedArray; |
455 | int32_t byteLength; |
456 | |
457 | if(U_FAILURE(*pErrorCode)) { |
458 | return NULL; |
459 | } |
460 | if(!pv->isCompacted) { |
461 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
462 | return NULL; |
463 | } |
464 | byteLength=pv->rows*(pv->columns-2)*4; |
465 | clonedArray=(uint32_t *)uprv_malloc(byteLength); |
466 | if(clonedArray==NULL) { |
467 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
468 | return NULL; |
469 | } |
470 | uprv_memcpy(clonedArray, pv->v, byteLength); |
471 | if(pRows!=NULL) { |
472 | *pRows=pv->rows; |
473 | } |
474 | if(pColumns!=NULL) { |
475 | *pColumns=pv->columns-2; |
476 | } |
477 | return clonedArray; |
478 | } |
479 | |
480 | U_CAPI UTrie2 * U_EXPORT2 |
481 | upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) { |
482 | UPVecToUTrie2Context toUTrie2={ NULL, 0, 0, 0 }; |
483 | upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode); |
484 | utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode); |
485 | if(U_FAILURE(*pErrorCode)) { |
486 | utrie2_close(toUTrie2.trie); |
487 | toUTrie2.trie=NULL; |
488 | } |
489 | return toUTrie2.trie; |
490 | } |
491 | |
492 | /* |
493 | * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts |
494 | * some 16-bit field and builds and returns a UTrie2. |
495 | */ |
496 | |
497 | U_CAPI void U_CALLCONV |
498 | upvec_compactToUTrie2Handler(void *context, |
499 | UChar32 start, UChar32 end, |
500 | int32_t rowIndex, uint32_t *row, int32_t columns, |
501 | UErrorCode *pErrorCode) { |
502 | (void)row; |
503 | (void)columns; |
504 | UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context; |
505 | if(start<UPVEC_FIRST_SPECIAL_CP) { |
506 | utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode); |
507 | } else { |
508 | switch(start) { |
509 | case UPVEC_INITIAL_VALUE_CP: |
510 | toUTrie2->initialValue=rowIndex; |
511 | break; |
512 | case UPVEC_ERROR_VALUE_CP: |
513 | toUTrie2->errorValue=rowIndex; |
514 | break; |
515 | case UPVEC_START_REAL_VALUES_CP: |
516 | toUTrie2->maxValue=rowIndex; |
517 | if(rowIndex>0xffff) { |
518 | /* too many rows for a 16-bit trie */ |
519 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
520 | } else { |
521 | toUTrie2->trie=utrie2_open(toUTrie2->initialValue, |
522 | toUTrie2->errorValue, pErrorCode); |
523 | } |
524 | break; |
525 | default: |
526 | break; |
527 | } |
528 | } |
529 | } |
530 | |