1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2002-2011, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: propsvec.c
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2002feb22
16* created by: Markus W. Scherer
17*
18* Store bits (Unicode character properties) in bit set vectors.
19*/
20
21#include <stdlib.h>
22#include "unicode/utypes.h"
23#include "cmemory.h"
24#include "utrie.h"
25#include "utrie2.h"
26#include "uarrsort.h"
27#include "propsvec.h"
28#include "uassert.h"
29
30struct UPropsVectors {
31 uint32_t *v;
32 int32_t columns; /* number of columns, plus two for start & limit values */
33 int32_t maxRows;
34 int32_t rows;
35 int32_t prevRow; /* search optimization: remember last row seen */
36 UBool isCompacted;
37};
38
39#define UPVEC_INITIAL_ROWS (1<<12)
40#define UPVEC_MEDIUM_ROWS ((int32_t)1<<16)
41#define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1)
42
43U_CAPI UPropsVectors * U_EXPORT2
44upvec_open(int32_t columns, UErrorCode *pErrorCode) {
45 UPropsVectors *pv;
46 uint32_t *v, *row;
47 uint32_t cp;
48
49 if(U_FAILURE(*pErrorCode)) {
50 return nullptr;
51 }
52 if(columns<1) {
53 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
54 return nullptr;
55 }
56 columns+=2; /* count range start and limit columns */
57
58 pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors));
59 v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4);
60 if(pv==nullptr || v==nullptr) {
61 uprv_free(pv);
62 uprv_free(v);
63 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
64 return nullptr;
65 }
66 uprv_memset(pv, 0, sizeof(UPropsVectors));
67 pv->v=v;
68 pv->columns=columns;
69 pv->maxRows=UPVEC_INITIAL_ROWS;
70 pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP);
71
72 /* set the all-Unicode row and the special-value rows */
73 row=pv->v;
74 uprv_memset(row, 0, pv->rows*columns*4);
75 row[0]=0;
76 row[1]=0x110000;
77 row+=columns;
78 for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) {
79 row[0]=cp;
80 row[1]=cp+1;
81 row+=columns;
82 }
83 return pv;
84}
85
86U_CAPI void U_EXPORT2
87upvec_close(UPropsVectors *pv) {
88 if(pv!=nullptr) {
89 uprv_free(pv->v);
90 uprv_free(pv);
91 }
92}
93
94static uint32_t *
95_findRow(UPropsVectors *pv, UChar32 rangeStart) {
96 uint32_t *row;
97 int32_t columns, i, start, limit, prevRow;
98
99 columns=pv->columns;
100 limit=pv->rows;
101 prevRow=pv->prevRow;
102
103 /* check the vicinity of the last-seen row (start searching with an unrolled loop) */
104 row=pv->v+prevRow*columns;
105 if(rangeStart>=(UChar32)row[0]) {
106 if(rangeStart<(UChar32)row[1]) {
107 /* same row as last seen */
108 return row;
109 } else if(rangeStart<(UChar32)(row+=columns)[1]) {
110 /* next row after the last one */
111 pv->prevRow=prevRow+1;
112 return row;
113 } else if(rangeStart<(UChar32)(row+=columns)[1]) {
114 /* second row after the last one */
115 pv->prevRow=prevRow+2;
116 return row;
117 } else if((rangeStart-(UChar32)row[1])<10) {
118 /* we are close, continue looping */
119 prevRow+=2;
120 do {
121 ++prevRow;
122 row+=columns;
123 } while(rangeStart>=(UChar32)row[1]);
124 pv->prevRow=prevRow;
125 return row;
126 }
127 } else if(rangeStart<(UChar32)pv->v[1]) {
128 /* the very first row */
129 pv->prevRow=0;
130 return pv->v;
131 }
132
133 /* do a binary search for the start of the range */
134 start=0;
135 while(start<limit-1) {
136 i=(start+limit)/2;
137 row=pv->v+i*columns;
138 if(rangeStart<(UChar32)row[0]) {
139 limit=i;
140 } else if(rangeStart<(UChar32)row[1]) {
141 pv->prevRow=i;
142 return row;
143 } else {
144 start=i;
145 }
146 }
147
148 /* must be found because all ranges together always cover all of Unicode */
149 pv->prevRow=start;
150 return pv->v+start*columns;
151}
152
153U_CAPI void U_EXPORT2
154upvec_setValue(UPropsVectors *pv,
155 UChar32 start, UChar32 end,
156 int32_t column,
157 uint32_t value, uint32_t mask,
158 UErrorCode *pErrorCode) {
159 uint32_t *firstRow, *lastRow;
160 int32_t columns;
161 UChar32 limit;
162 UBool splitFirstRow, splitLastRow;
163
164 /* argument checking */
165 if(U_FAILURE(*pErrorCode)) {
166 return;
167 }
168 if( pv==nullptr ||
169 start<0 || start>end || end>UPVEC_MAX_CP ||
170 column<0 || column>=(pv->columns-2)
171 ) {
172 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
173 return;
174 }
175 if(pv->isCompacted) {
176 *pErrorCode=U_NO_WRITE_PERMISSION;
177 return;
178 }
179 limit=end+1;
180
181 /* initialize */
182 columns=pv->columns;
183 column+=2; /* skip range start and limit columns */
184 value&=mask;
185
186 /* find the rows whose ranges overlap with the input range */
187
188 /* find the first and last rows, always successful */
189 firstRow=_findRow(pv, start);
190 lastRow=_findRow(pv, end);
191
192 /*
193 * Rows need to be split if they partially overlap with the
194 * input range (only possible for the first and last rows)
195 * and if their value differs from the input value.
196 */
197 splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask));
198 splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask));
199
200 /* split first/last rows if necessary */
201 if(splitFirstRow || splitLastRow) {
202 int32_t count, rows;
203
204 rows=pv->rows;
205 if((rows+splitFirstRow+splitLastRow)>pv->maxRows) {
206 uint32_t *newVectors;
207 int32_t newMaxRows;
208
209 if(pv->maxRows<UPVEC_MEDIUM_ROWS) {
210 newMaxRows=UPVEC_MEDIUM_ROWS;
211 } else if(pv->maxRows<UPVEC_MAX_ROWS) {
212 newMaxRows=UPVEC_MAX_ROWS;
213 } else {
214 /* Implementation bug, or UPVEC_MAX_ROWS too low. */
215 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
216 return;
217 }
218 newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4);
219 if(newVectors==nullptr) {
220 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
221 return;
222 }
223 uprv_memcpy(newVectors, pv->v, (size_t)rows*columns*4);
224 firstRow=newVectors+(firstRow-pv->v);
225 lastRow=newVectors+(lastRow-pv->v);
226 uprv_free(pv->v);
227 pv->v=newVectors;
228 pv->maxRows=newMaxRows;
229 }
230
231 /* count the number of row cells to move after the last row, and move them */
232 count = (int32_t)((pv->v+rows*columns)-(lastRow+columns));
233 if(count>0) {
234 uprv_memmove(
235 lastRow+(1+splitFirstRow+splitLastRow)*columns,
236 lastRow+columns,
237 count*4);
238 }
239 pv->rows=rows+splitFirstRow+splitLastRow;
240
241 /* split the first row, and move the firstRow pointer to the second part */
242 if(splitFirstRow) {
243 /* copy all affected rows up one and move the lastRow pointer */
244 count = (int32_t)((lastRow-firstRow)+columns);
245 uprv_memmove(firstRow+columns, firstRow, (size_t)count*4);
246 lastRow+=columns;
247
248 /* split the range and move the firstRow pointer */
249 firstRow[1]=firstRow[columns]=(uint32_t)start;
250 firstRow+=columns;
251 }
252
253 /* split the last row */
254 if(splitLastRow) {
255 /* copy the last row data */
256 uprv_memcpy(lastRow+columns, lastRow, (size_t)columns*4);
257
258 /* split the range and move the firstRow pointer */
259 lastRow[1]=lastRow[columns]=(uint32_t)limit;
260 }
261 }
262
263 /* set the "row last seen" to the last row for the range */
264 pv->prevRow=(int32_t)((lastRow-(pv->v))/columns);
265
266 /* set the input value in all remaining rows */
267 firstRow+=column;
268 lastRow+=column;
269 mask=~mask;
270 for(;;) {
271 *firstRow=(*firstRow&mask)|value;
272 if(firstRow==lastRow) {
273 break;
274 }
275 firstRow+=columns;
276 }
277}
278
279U_CAPI uint32_t U_EXPORT2
280upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) {
281 uint32_t *row;
282 UPropsVectors *ncpv;
283
284 if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) {
285 return 0;
286 }
287 ncpv=(UPropsVectors *)pv;
288 row=_findRow(ncpv, c);
289 return row[2+column];
290}
291
292U_CAPI uint32_t * U_EXPORT2
293upvec_getRow(const UPropsVectors *pv, int32_t rowIndex,
294 UChar32 *pRangeStart, UChar32 *pRangeEnd) {
295 uint32_t *row;
296 int32_t columns;
297
298 if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) {
299 return nullptr;
300 }
301
302 columns=pv->columns;
303 row=pv->v+rowIndex*columns;
304 if(pRangeStart!=nullptr) {
305 *pRangeStart=(UChar32)row[0];
306 }
307 if(pRangeEnd!=nullptr) {
308 *pRangeEnd=(UChar32)row[1]-1;
309 }
310 return row+2;
311}
312
313static int32_t U_CALLCONV
314upvec_compareRows(const void *context, const void *l, const void *r) {
315 const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r;
316 const UPropsVectors *pv=(const UPropsVectors *)context;
317 int32_t i, count, columns;
318
319 count=columns=pv->columns; /* includes start/limit columns */
320
321 /* start comparing after start/limit but wrap around to them */
322 i=2;
323 do {
324 if(left[i]!=right[i]) {
325 return left[i]<right[i] ? -1 : 1;
326 }
327 if(++i==columns) {
328 i=0;
329 }
330 } while(--count>0);
331
332 return 0;
333}
334
335U_CAPI void U_EXPORT2
336upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) {
337 uint32_t *row;
338 int32_t i, columns, valueColumns, rows, count;
339 UChar32 start, limit;
340
341 /* argument checking */
342 if(U_FAILURE(*pErrorCode)) {
343 return;
344 }
345 if(handler==nullptr) {
346 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
347 return;
348 }
349 if(pv->isCompacted) {
350 return;
351 }
352
353 /* Set the flag now: Sorting and compacting destroys the builder data structure. */
354 pv->isCompacted=true;
355
356 rows=pv->rows;
357 columns=pv->columns;
358 U_ASSERT(columns>=3); /* upvec_open asserts this */
359 valueColumns=columns-2; /* not counting start & limit */
360
361 /* sort the properties vectors to find unique vector values */
362 uprv_sortArray(pv->v, rows, columns*4,
363 upvec_compareRows, pv, false, pErrorCode);
364 if(U_FAILURE(*pErrorCode)) {
365 return;
366 }
367
368 /*
369 * Find and set the special values.
370 * This has to do almost the same work as the compaction below,
371 * to find the indexes where the special-value rows will move.
372 */
373 row=pv->v;
374 count=-valueColumns;
375 for(i=0; i<rows; ++i) {
376 start=(UChar32)row[0];
377
378 /* count a new values vector if it is different from the current one */
379 if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) {
380 count+=valueColumns;
381 }
382
383 if(start>=UPVEC_FIRST_SPECIAL_CP) {
384 handler(context, start, start, count, row+2, valueColumns, pErrorCode);
385 if(U_FAILURE(*pErrorCode)) {
386 return;
387 }
388 }
389
390 row+=columns;
391 }
392
393 /* count is at the beginning of the last vector, add valueColumns to include that last vector */
394 count+=valueColumns;
395
396 /* Call the handler once more to signal the start of delivering real values. */
397 handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP,
398 count, row-valueColumns, valueColumns, pErrorCode);
399 if(U_FAILURE(*pErrorCode)) {
400 return;
401 }
402
403 /*
404 * Move vector contents up to a contiguous array with only unique
405 * vector values, and call the handler function for each vector.
406 *
407 * This destroys the Properties Vector structure and replaces it
408 * with an array of just vector values.
409 */
410 row=pv->v;
411 count=-valueColumns;
412 for(i=0; i<rows; ++i) {
413 /* fetch these first before memmove() may overwrite them */
414 start=(UChar32)row[0];
415 limit=(UChar32)row[1];
416
417 /* add a new values vector if it is different from the current one */
418 if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) {
419 count+=valueColumns;
420 uprv_memmove(pv->v+count, row+2, (size_t)valueColumns*4);
421 }
422
423 if(start<UPVEC_FIRST_SPECIAL_CP) {
424 handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode);
425 if(U_FAILURE(*pErrorCode)) {
426 return;
427 }
428 }
429
430 row+=columns;
431 }
432
433 /* count is at the beginning of the last vector, add one to include that last vector */
434 pv->rows=count/valueColumns+1;
435}
436
437U_CAPI const uint32_t * U_EXPORT2
438upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) {
439 if(!pv->isCompacted) {
440 return nullptr;
441 }
442 if(pRows!=nullptr) {
443 *pRows=pv->rows;
444 }
445 if(pColumns!=nullptr) {
446 *pColumns=pv->columns-2;
447 }
448 return pv->v;
449}
450
451U_CAPI uint32_t * U_EXPORT2
452upvec_cloneArray(const UPropsVectors *pv,
453 int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) {
454 uint32_t *clonedArray;
455 int32_t byteLength;
456
457 if(U_FAILURE(*pErrorCode)) {
458 return nullptr;
459 }
460 if(!pv->isCompacted) {
461 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
462 return nullptr;
463 }
464 byteLength=pv->rows*(pv->columns-2)*4;
465 clonedArray=(uint32_t *)uprv_malloc(byteLength);
466 if(clonedArray==nullptr) {
467 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
468 return nullptr;
469 }
470 uprv_memcpy(clonedArray, pv->v, byteLength);
471 if(pRows!=nullptr) {
472 *pRows=pv->rows;
473 }
474 if(pColumns!=nullptr) {
475 *pColumns=pv->columns-2;
476 }
477 return clonedArray;
478}
479
480U_CAPI UTrie2 * U_EXPORT2
481upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) {
482 UPVecToUTrie2Context toUTrie2={ nullptr, 0, 0, 0 };
483 upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode);
484 utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode);
485 if(U_FAILURE(*pErrorCode)) {
486 utrie2_close(toUTrie2.trie);
487 toUTrie2.trie=nullptr;
488 }
489 return toUTrie2.trie;
490}
491
492/*
493 * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts
494 * some 16-bit field and builds and returns a UTrie2.
495 */
496
497U_CAPI void U_CALLCONV
498upvec_compactToUTrie2Handler(void *context,
499 UChar32 start, UChar32 end,
500 int32_t rowIndex, uint32_t *row, int32_t columns,
501 UErrorCode *pErrorCode) {
502 (void)row;
503 (void)columns;
504 UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context;
505 if(start<UPVEC_FIRST_SPECIAL_CP) {
506 utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, true, pErrorCode);
507 } else {
508 switch(start) {
509 case UPVEC_INITIAL_VALUE_CP:
510 toUTrie2->initialValue=rowIndex;
511 break;
512 case UPVEC_ERROR_VALUE_CP:
513 toUTrie2->errorValue=rowIndex;
514 break;
515 case UPVEC_START_REAL_VALUES_CP:
516 toUTrie2->maxValue=rowIndex;
517 if(rowIndex>0xffff) {
518 /* too many rows for a 16-bit trie */
519 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
520 } else {
521 toUTrie2->trie=utrie2_open(toUTrie2->initialValue,
522 toUTrie2->errorValue, pErrorCode);
523 }
524 break;
525 default:
526 break;
527 }
528 }
529}
530