1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2002-2011, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: uset.cpp
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2002mar07
16* created by: Markus W. Scherer
17*
18* There are functions to efficiently serialize a USet into an array of uint16_t
19* and functions to use such a serialized form efficiently without
20* instantiating a new USet.
21*/
22
23#include "unicode/utypes.h"
24#include "unicode/uobject.h"
25#include "unicode/uset.h"
26#include "unicode/uniset.h"
27#include "cmemory.h"
28#include "unicode/ustring.h"
29#include "unicode/parsepos.h"
30
31U_NAMESPACE_USE
32
33U_CAPI USet* U_EXPORT2
34uset_openEmpty() {
35 return (USet*) new UnicodeSet();
36}
37
38U_CAPI USet* U_EXPORT2
39uset_open(UChar32 start, UChar32 end) {
40 return (USet*) new UnicodeSet(start, end);
41}
42
43U_CAPI void U_EXPORT2
44uset_close(USet* set) {
45 delete (UnicodeSet*) set;
46}
47
48U_CAPI USet * U_EXPORT2
49uset_clone(const USet *set) {
50 return (USet*) (((UnicodeSet*) set)->UnicodeSet::clone());
51}
52
53U_CAPI UBool U_EXPORT2
54uset_isFrozen(const USet *set) {
55 return ((UnicodeSet*) set)->UnicodeSet::isFrozen();
56}
57
58U_CAPI void U_EXPORT2
59uset_freeze(USet *set) {
60 ((UnicodeSet*) set)->UnicodeSet::freeze();
61}
62
63U_CAPI USet * U_EXPORT2
64uset_cloneAsThawed(const USet *set) {
65 return (USet*) (((UnicodeSet*) set)->UnicodeSet::cloneAsThawed());
66}
67
68U_CAPI void U_EXPORT2
69uset_set(USet* set,
70 UChar32 start, UChar32 end) {
71 ((UnicodeSet*) set)->UnicodeSet::set(start, end);
72}
73
74U_CAPI void U_EXPORT2
75uset_addAll(USet* set, const USet *additionalSet) {
76 ((UnicodeSet*) set)->UnicodeSet::addAll(*((const UnicodeSet*)additionalSet));
77}
78
79U_CAPI void U_EXPORT2
80uset_add(USet* set, UChar32 c) {
81 ((UnicodeSet*) set)->UnicodeSet::add(c);
82}
83
84U_CAPI void U_EXPORT2
85uset_addRange(USet* set, UChar32 start, UChar32 end) {
86 ((UnicodeSet*) set)->UnicodeSet::add(start, end);
87}
88
89U_CAPI void U_EXPORT2
90uset_addString(USet* set, const char16_t* str, int32_t strLen) {
91 // UnicodeString handles -1 for strLen
92 UnicodeString s(strLen<0, str, strLen);
93 ((UnicodeSet*) set)->UnicodeSet::add(s);
94}
95
96U_CAPI void U_EXPORT2
97uset_addAllCodePoints(USet* set, const char16_t *str, int32_t strLen) {
98 // UnicodeString handles -1 for strLen
99 UnicodeString s(str, strLen);
100 ((UnicodeSet*) set)->UnicodeSet::addAll(s);
101}
102
103U_CAPI void U_EXPORT2
104uset_remove(USet* set, UChar32 c) {
105 ((UnicodeSet*) set)->UnicodeSet::remove(c);
106}
107
108U_CAPI void U_EXPORT2
109uset_removeRange(USet* set, UChar32 start, UChar32 end) {
110 ((UnicodeSet*) set)->UnicodeSet::remove(start, end);
111}
112
113U_CAPI void U_EXPORT2
114uset_removeString(USet* set, const char16_t* str, int32_t strLen) {
115 UnicodeString s(strLen==-1, str, strLen);
116 ((UnicodeSet*) set)->UnicodeSet::remove(s);
117}
118
119U_CAPI void U_EXPORT2
120uset_removeAllCodePoints(USet *set, const char16_t *str, int32_t length) {
121 UnicodeString s(length==-1, str, length);
122 ((UnicodeSet*) set)->UnicodeSet::removeAll(s);
123}
124
125U_CAPI void U_EXPORT2
126uset_removeAll(USet* set, const USet* remove) {
127 ((UnicodeSet*) set)->UnicodeSet::removeAll(*(const UnicodeSet*)remove);
128}
129
130U_CAPI void U_EXPORT2
131uset_retain(USet* set, UChar32 start, UChar32 end) {
132 ((UnicodeSet*) set)->UnicodeSet::retain(start, end);
133}
134
135U_CAPI void U_EXPORT2
136uset_retainString(USet *set, const char16_t *str, int32_t length) {
137 UnicodeString s(length==-1, str, length);
138 ((UnicodeSet*) set)->UnicodeSet::retain(s);
139}
140
141U_CAPI void U_EXPORT2
142uset_retainAllCodePoints(USet *set, const char16_t *str, int32_t length) {
143 UnicodeString s(length==-1, str, length);
144 ((UnicodeSet*) set)->UnicodeSet::retainAll(s);
145}
146
147U_CAPI void U_EXPORT2
148uset_retainAll(USet* set, const USet* retain) {
149 ((UnicodeSet*) set)->UnicodeSet::retainAll(*(const UnicodeSet*)retain);
150}
151
152U_CAPI void U_EXPORT2
153uset_compact(USet* set) {
154 ((UnicodeSet*) set)->UnicodeSet::compact();
155}
156
157U_CAPI void U_EXPORT2
158uset_complement(USet* set) {
159 ((UnicodeSet*) set)->UnicodeSet::complement();
160}
161
162U_CAPI void U_EXPORT2
163uset_complementRange(USet *set, UChar32 start, UChar32 end) {
164 ((UnicodeSet*) set)->UnicodeSet::complement(start, end);
165}
166
167U_CAPI void U_EXPORT2
168uset_complementString(USet *set, const char16_t *str, int32_t length) {
169 UnicodeString s(length==-1, str, length);
170 ((UnicodeSet*) set)->UnicodeSet::complement(s);
171}
172
173U_CAPI void U_EXPORT2
174uset_complementAllCodePoints(USet *set, const char16_t *str, int32_t length) {
175 UnicodeString s(length==-1, str, length);
176 ((UnicodeSet*) set)->UnicodeSet::complementAll(s);
177}
178
179U_CAPI void U_EXPORT2
180uset_complementAll(USet* set, const USet* complement) {
181 ((UnicodeSet*) set)->UnicodeSet::complementAll(*(const UnicodeSet*)complement);
182}
183
184U_CAPI void U_EXPORT2
185uset_clear(USet* set) {
186 ((UnicodeSet*) set)->UnicodeSet::clear();
187}
188
189U_CAPI void U_EXPORT2
190uset_removeAllStrings(USet* set) {
191 ((UnicodeSet*) set)->UnicodeSet::removeAllStrings();
192}
193
194U_CAPI UBool U_EXPORT2
195uset_isEmpty(const USet* set) {
196 return ((const UnicodeSet*) set)->UnicodeSet::isEmpty();
197}
198
199U_CAPI UBool U_EXPORT2
200uset_hasStrings(const USet* set) {
201 return ((const UnicodeSet*) set)->UnicodeSet::hasStrings();
202}
203
204U_CAPI UBool U_EXPORT2
205uset_contains(const USet* set, UChar32 c) {
206 return ((const UnicodeSet*) set)->UnicodeSet::contains(c);
207}
208
209U_CAPI UBool U_EXPORT2
210uset_containsRange(const USet* set, UChar32 start, UChar32 end) {
211 return ((const UnicodeSet*) set)->UnicodeSet::contains(start, end);
212}
213
214U_CAPI UBool U_EXPORT2
215uset_containsString(const USet* set, const char16_t* str, int32_t strLen) {
216 UnicodeString s(strLen==-1, str, strLen);
217 return ((const UnicodeSet*) set)->UnicodeSet::contains(s);
218}
219
220U_CAPI UBool U_EXPORT2
221uset_containsAll(const USet* set1, const USet* set2) {
222 return ((const UnicodeSet*) set1)->UnicodeSet::containsAll(* (const UnicodeSet*) set2);
223}
224
225U_CAPI UBool U_EXPORT2
226uset_containsAllCodePoints(const USet* set, const char16_t *str, int32_t strLen) {
227 // Create a string alias, since nothing is being added to the set.
228 UnicodeString s(strLen==-1, str, strLen);
229 return ((const UnicodeSet*) set)->UnicodeSet::containsAll(s);
230}
231
232U_CAPI UBool U_EXPORT2
233uset_containsNone(const USet* set1, const USet* set2) {
234 return ((const UnicodeSet*) set1)->UnicodeSet::containsNone(* (const UnicodeSet*) set2);
235}
236
237U_CAPI UBool U_EXPORT2
238uset_containsSome(const USet* set1, const USet* set2) {
239 return ((const UnicodeSet*) set1)->UnicodeSet::containsSome(* (const UnicodeSet*) set2);
240}
241
242U_CAPI int32_t U_EXPORT2
243uset_span(const USet *set, const char16_t *s, int32_t length, USetSpanCondition spanCondition) {
244 return ((UnicodeSet*) set)->UnicodeSet::span(s, length, spanCondition);
245}
246
247U_CAPI int32_t U_EXPORT2
248uset_spanBack(const USet *set, const char16_t *s, int32_t length, USetSpanCondition spanCondition) {
249 return ((UnicodeSet*) set)->UnicodeSet::spanBack(s, length, spanCondition);
250}
251
252U_CAPI int32_t U_EXPORT2
253uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) {
254 return ((UnicodeSet*) set)->UnicodeSet::spanUTF8(s, length, spanCondition);
255}
256
257U_CAPI int32_t U_EXPORT2
258uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) {
259 return ((UnicodeSet*) set)->UnicodeSet::spanBackUTF8(s, length, spanCondition);
260}
261
262U_CAPI UBool U_EXPORT2
263uset_equals(const USet* set1, const USet* set2) {
264 return *(const UnicodeSet*)set1 == *(const UnicodeSet*)set2;
265}
266
267U_CAPI int32_t U_EXPORT2
268uset_indexOf(const USet* set, UChar32 c) {
269 return ((UnicodeSet*) set)->UnicodeSet::indexOf(c);
270}
271
272U_CAPI UChar32 U_EXPORT2
273uset_charAt(const USet* set, int32_t index) {
274 return ((UnicodeSet*) set)->UnicodeSet::charAt(index);
275}
276
277U_CAPI int32_t U_EXPORT2
278uset_size(const USet* set) {
279 return ((const UnicodeSet*) set)->UnicodeSet::size();
280}
281
282U_NAMESPACE_BEGIN
283/**
284 * This class only exists to provide access to the UnicodeSet private
285 * USet support API. Declaring a class a friend is more portable than
286 * trying to declare extern "C" functions as friends.
287 */
288class USetAccess /* not : public UObject because all methods are static */ {
289public:
290 /* Try to have the compiler inline these*/
291 inline static int32_t getStringCount(const UnicodeSet& set) {
292 return set.stringsSize();
293 }
294 inline static const UnicodeString* getString(const UnicodeSet& set,
295 int32_t i) {
296 return set.getString(i);
297 }
298private:
299 /* do not instantiate*/
300 USetAccess();
301};
302U_NAMESPACE_END
303
304U_CAPI int32_t U_EXPORT2
305uset_getRangeCount(const USet *set) {
306 return ((const UnicodeSet *)set)->UnicodeSet::getRangeCount();
307}
308
309U_CAPI int32_t U_EXPORT2
310uset_getItemCount(const USet* uset) {
311 const UnicodeSet& set = *(const UnicodeSet*)uset;
312 return set.getRangeCount() + USetAccess::getStringCount(set);
313}
314
315U_CAPI int32_t U_EXPORT2
316uset_getItem(const USet* uset, int32_t itemIndex,
317 UChar32* start, UChar32* end,
318 char16_t* str, int32_t strCapacity,
319 UErrorCode* ec) {
320 if (U_FAILURE(*ec)) return 0;
321 const UnicodeSet& set = *(const UnicodeSet*)uset;
322 int32_t rangeCount;
323
324 if (itemIndex < 0) {
325 *ec = U_ILLEGAL_ARGUMENT_ERROR;
326 return -1;
327 } else if (itemIndex < (rangeCount = set.getRangeCount())) {
328 *start = set.getRangeStart(itemIndex);
329 *end = set.getRangeEnd(itemIndex);
330 return 0;
331 } else {
332 itemIndex -= rangeCount;
333 if (itemIndex < USetAccess::getStringCount(set)) {
334 const UnicodeString* s = USetAccess::getString(set, itemIndex);
335 return s->extract(str, strCapacity, *ec);
336 } else {
337 *ec = U_INDEX_OUTOFBOUNDS_ERROR;
338 return -1;
339 }
340 }
341}
342
343//U_CAPI UBool U_EXPORT2
344//uset_getRange(const USet* set, int32_t rangeIndex,
345// UChar32* pStart, UChar32* pEnd) {
346// if ((uint32_t) rangeIndex >= (uint32_t) uset_getRangeCount(set)) {
347// return false;
348// }
349// const UnicodeSet* us = (const UnicodeSet*) set;
350// *pStart = us->getRangeStart(rangeIndex);
351// *pEnd = us->getRangeEnd(rangeIndex);
352// return true;
353//}
354
355/*
356 * Serialize a USet into 16-bit units.
357 * Store BMP code points as themselves with one 16-bit unit each.
358 *
359 * Important: the code points in the array are in ascending order,
360 * therefore all BMP code points precede all supplementary code points.
361 *
362 * Store each supplementary code point in 2 16-bit units,
363 * simply with higher-then-lower 16-bit halves.
364 *
365 * Precede the entire list with the length.
366 * If there are supplementary code points, then set bit 15 in the length
367 * and add the bmpLength between it and the array.
368 *
369 * In other words:
370 * - all BMP: (length=bmpLength) BMP, .., BMP
371 * - some supplementary: (length|0x8000) (bmpLength<length) BMP, .., BMP, supp-high, supp-low, ..
372 */
373U_CAPI int32_t U_EXPORT2
374uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* ec) {
375 if (ec==nullptr || U_FAILURE(*ec)) {
376 return 0;
377 }
378
379 return ((const UnicodeSet*) set)->UnicodeSet::serialize(dest, destCapacity,* ec);
380}
381
382U_CAPI UBool U_EXPORT2
383uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength) {
384 int32_t length;
385
386 if(fillSet==nullptr) {
387 return false;
388 }
389 if(src==nullptr || srcLength<=0) {
390 fillSet->length=fillSet->bmpLength=0;
391 return false;
392 }
393
394 length=*src++;
395 if(length&0x8000) {
396 /* there are supplementary values */
397 length&=0x7fff;
398 if(srcLength<(2+length)) {
399 fillSet->length=fillSet->bmpLength=0;
400 return false;
401 }
402 fillSet->bmpLength=*src++;
403 } else {
404 /* only BMP values */
405 if(srcLength<(1+length)) {
406 fillSet->length=fillSet->bmpLength=0;
407 return false;
408 }
409 fillSet->bmpLength=length;
410 }
411 fillSet->array=src;
412 fillSet->length=length;
413 return true;
414}
415
416U_CAPI void U_EXPORT2
417uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c) {
418 if(fillSet==nullptr || (uint32_t)c>0x10ffff) {
419 return;
420 }
421
422 fillSet->array=fillSet->staticArray;
423 if(c<0xffff) {
424 fillSet->bmpLength=fillSet->length=2;
425 fillSet->staticArray[0]=(uint16_t)c;
426 fillSet->staticArray[1]=(uint16_t)c+1;
427 } else if(c==0xffff) {
428 fillSet->bmpLength=1;
429 fillSet->length=3;
430 fillSet->staticArray[0]=0xffff;
431 fillSet->staticArray[1]=1;
432 fillSet->staticArray[2]=0;
433 } else if(c<0x10ffff) {
434 fillSet->bmpLength=0;
435 fillSet->length=4;
436 fillSet->staticArray[0]=(uint16_t)(c>>16);
437 fillSet->staticArray[1]=(uint16_t)c;
438 ++c;
439 fillSet->staticArray[2]=(uint16_t)(c>>16);
440 fillSet->staticArray[3]=(uint16_t)c;
441 } else /* c==0x10ffff */ {
442 fillSet->bmpLength=0;
443 fillSet->length=2;
444 fillSet->staticArray[0]=0x10;
445 fillSet->staticArray[1]=0xffff;
446 }
447}
448
449U_CAPI UBool U_EXPORT2
450uset_serializedContains(const USerializedSet* set, UChar32 c) {
451 const uint16_t* array;
452
453 if(set==nullptr || (uint32_t)c>0x10ffff) {
454 return false;
455 }
456
457 array=set->array;
458 if(c<=0xffff) {
459 /* find c in the BMP part */
460 int32_t lo = 0;
461 int32_t hi = set->bmpLength-1;
462 if (c < array[0]) {
463 hi = 0;
464 } else if (c < array[hi]) {
465 for(;;) {
466 int32_t i = (lo + hi) >> 1;
467 if (i == lo) {
468 break; // Done!
469 } else if (c < array[i]) {
470 hi = i;
471 } else {
472 lo = i;
473 }
474 }
475 } else {
476 hi += 1;
477 }
478 return (UBool)(hi&1);
479 } else {
480 /* find c in the supplementary part */
481 uint16_t high=(uint16_t)(c>>16), low=(uint16_t)c;
482 int32_t base = set->bmpLength;
483 int32_t lo = 0;
484 int32_t hi = set->length - 2 - base;
485 if (high < array[base] || (high==array[base] && low<array[base+1])) {
486 hi = 0;
487 } else if (high < array[base+hi] || (high==array[base+hi] && low<array[base+hi+1])) {
488 for (;;) {
489 int32_t i = ((lo + hi) >> 1) & ~1; // Guarantee even result
490 int32_t iabs = i + base;
491 if (i == lo) {
492 break; // Done!
493 } else if (high < array[iabs] || (high==array[iabs] && low<array[iabs+1])) {
494 hi = i;
495 } else {
496 lo = i;
497 }
498 }
499 } else {
500 hi += 2;
501 }
502 /* count pairs of 16-bit units even per BMP and check if the number of pairs is odd */
503 return (UBool)(((hi+(base<<1))&2)!=0);
504 }
505}
506
507U_CAPI int32_t U_EXPORT2
508uset_getSerializedRangeCount(const USerializedSet* set) {
509 if(set==nullptr) {
510 return 0;
511 }
512
513 return (set->bmpLength+(set->length-set->bmpLength)/2+1)/2;
514}
515
516U_CAPI UBool U_EXPORT2
517uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
518 UChar32* pStart, UChar32* pEnd) {
519 const uint16_t* array;
520 int32_t bmpLength, length;
521
522 if(set==nullptr || rangeIndex<0 || pStart==nullptr || pEnd==nullptr) {
523 return false;
524 }
525
526 array=set->array;
527 length=set->length;
528 bmpLength=set->bmpLength;
529
530 rangeIndex*=2; /* address start/limit pairs */
531 if(rangeIndex<bmpLength) {
532 *pStart=array[rangeIndex++];
533 if(rangeIndex<bmpLength) {
534 *pEnd=array[rangeIndex]-1;
535 } else if(rangeIndex<length) {
536 *pEnd=((((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1])-1;
537 } else {
538 *pEnd=0x10ffff;
539 }
540 return true;
541 } else {
542 rangeIndex-=bmpLength;
543 rangeIndex*=2; /* address pairs of pairs of units */
544 length-=bmpLength;
545 if(rangeIndex<length) {
546 array+=bmpLength;
547 *pStart=(((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1];
548 rangeIndex+=2;
549 if(rangeIndex<length) {
550 *pEnd=((((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1])-1;
551 } else {
552 *pEnd=0x10ffff;
553 }
554 return true;
555 } else {
556 return false;
557 }
558 }
559}
560
561// TODO The old, internal uset.c had an efficient uset_containsOne function.
562// Returned the one and only code point, or else -1 or something.
563// Consider adding such a function to both C and C++ UnicodeSet/uset.
564// See tools/gennorm/store.c for usage, now usetContainsOne there.
565
566// TODO Investigate incorporating this code into UnicodeSet to improve
567// efficiency.
568// ---
569// #define USET_GROW_DELTA 20
570//
571// static int32_t
572// findChar(const UChar32* array, int32_t length, UChar32 c) {
573// int32_t i;
574//
575// /* check the last range limit first for more efficient appending */
576// if(length>0) {
577// if(c>=array[length-1]) {
578// return length;
579// }
580//
581// /* do not check the last range limit again in the loop below */
582// --length;
583// }
584//
585// for(i=0; i<length && c>=array[i]; ++i) {}
586// return i;
587// }
588//
589// static UBool
590// addRemove(USet* set, UChar32 c, int32_t doRemove) {
591// int32_t i, length, more;
592//
593// if(set==nullptr || (uint32_t)c>0x10ffff) {
594// return false;
595// }
596//
597// length=set->length;
598// i=findChar(set->array, length, c);
599// if((i&1)^doRemove) {
600// /* c is already in the set */
601// return true;
602// }
603//
604// /* how many more array items do we need? */
605// if(i<length && (c+1)==set->array[i]) {
606// /* c is just before the following range, extend that in-place by one */
607// set->array[i]=c;
608// if(i>0) {
609// --i;
610// if(c==set->array[i]) {
611// /* the previous range collapsed, remove it */
612// set->length=length-=2;
613// if(i<length) {
614// uprv_memmove(set->array+i, set->array+i+2, (length-i)*4);
615// }
616// }
617// }
618// return true;
619// } else if(i>0 && c==set->array[i-1]) {
620// /* c is just after the previous range, extend that in-place by one */
621// if(++c<=0x10ffff) {
622// set->array[i-1]=c;
623// if(i<length && c==set->array[i]) {
624// /* the following range collapsed, remove it */
625// --i;
626// set->length=length-=2;
627// if(i<length) {
628// uprv_memmove(set->array+i, set->array+i+2, (length-i)*4);
629// }
630// }
631// } else {
632// /* extend the previous range (had limit 0x10ffff) to the end of Unicode */
633// set->length=i-1;
634// }
635// return true;
636// } else if(i==length && c==0x10ffff) {
637// /* insert one range limit c */
638// more=1;
639// } else {
640// /* insert two range limits c, c+1 */
641// more=2;
642// }
643//
644// /* insert <more> range limits */
645// if(length+more>set->capacity) {
646// /* reallocate */
647// int32_t newCapacity=set->capacity+set->capacity/2+USET_GROW_DELTA;
648// UChar32* newArray=(UChar32* )uprv_malloc(newCapacity*4);
649// if(newArray==nullptr) {
650// return false;
651// }
652// set->capacity=newCapacity;
653// uprv_memcpy(newArray, set->array, length*4);
654//
655// if(set->array!=set->staticBuffer) {
656// uprv_free(set->array);
657// }
658// set->array=newArray;
659// }
660//
661// if(i<length) {
662// uprv_memmove(set->array+i+more, set->array+i, (length-i)*4);
663// }
664// set->array[i]=c;
665// if(more==2) {
666// set->array[i+1]=c+1;
667// }
668// set->length+=more;
669//
670// return true;
671// }
672//
673// U_CAPI UBool U_EXPORT2
674// uset_add(USet* set, UChar32 c) {
675// return addRemove(set, c, 0);
676// }
677//
678// U_CAPI void U_EXPORT2
679// uset_remove(USet* set, UChar32 c) {
680// addRemove(set, c, 1);
681// }
682