1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2002-2014, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: uset.h
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2002mar07
16* created by: Markus W. Scherer
17*
18* C version of UnicodeSet.
19*/
20
21
22/**
23 * \file
24 * \brief C API: Unicode Set
25 *
26 * <p>This is a C wrapper around the C++ UnicodeSet class.</p>
27 */
28
29#ifndef __USET_H__
30#define __USET_H__
31
32#include "unicode/utypes.h"
33#include "unicode/uchar.h"
34
35#if U_SHOW_CPLUSPLUS_API
36#include "unicode/localpointer.h"
37#endif // U_SHOW_CPLUSPLUS_API
38
39#ifndef USET_DEFINED
40
41#ifndef U_IN_DOXYGEN
42#define USET_DEFINED
43#endif
44/**
45 * USet is the C API type corresponding to C++ class UnicodeSet.
46 * It is declared here only as an incomplete struct type; manipulate it through the uset_* API.
47 * Create with uset_open*, and destroy with uset_close.
48 * @stable ICU 2.4
49 */
50typedef struct USet USet;
51#endif
52
53/**
54 * Bitmask values to be passed to uset_openPatternOptions() or
55 * uset_applyPattern() taking an option parameter.
56 *
57 * Use at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
58 * These case options are mutually exclusive.
59 *
60 * Undefined option bits are ignored, and reserved for future use.
61 *
62 * @stable ICU 2.4
63 */
64enum {
65 /**
66 * Ignore white space within patterns unless quoted or escaped.
67 * @stable ICU 2.4
68 */
69 USET_IGNORE_SPACE = 1,
70
71 /**
72 * Enable case insensitive matching. E.g., "[ab]" with this flag
73 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
74 * match all except 'a', 'A', 'b', and 'B'. This performs a full
75 * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'.
76 *
77 * The resulting set is a superset of the input for the code points but
78 * not for the strings.
79 * It performs a case mapping closure of the code points and adds
80 * full case folding strings for the code points, and reduces strings of
81 * the original set to their full case folding equivalents.
82 *
83 * This is designed for case-insensitive matches, for example
84 * in regular expressions. The full code point case closure allows checking of
85 * an input character directly against the closure set.
86 * Strings are matched by comparing the case-folded form from the closure
87 * set with an incremental case folding of the string in question.
88 *
89 * The closure set will also contain single code points if the original
90 * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
91 * This is not necessary (that is, redundant) for the above matching method
92 * but results in the same closure sets regardless of whether the original
93 * set contained the code point or a string.
94 *
95 * @stable ICU 2.4
96 */
97 USET_CASE_INSENSITIVE = 2,
98
99 /**
100 * Adds all case mappings for each element in the set.
101 * This adds the full lower-, title-, and uppercase mappings as well as the full case folding
102 * of each existing element in the set.
103 *
104 * Unlike the “case insensitive” options, this does not perform a closure.
105 * For example, it does not add 'ſ' (U+017F long s) for 's',
106 * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions.
107 *
108 * @stable ICU 3.2
109 */
110 USET_ADD_CASE_MAPPINGS = 4,
111
112#ifndef U_HIDE_DRAFT_API
113 /**
114 * Enable case insensitive matching.
115 * Same as USET_CASE_INSENSITIVE but using only Simple_Case_Folding (scf) mappings,
116 * which map each code point to one code point,
117 * not full Case_Folding (cf) mappings, which map some code points to multiple code points.
118 *
119 * This is designed for case-insensitive matches, for example in certain
120 * regular expression implementations where only Simple_Case_Folding mappings are used,
121 * such as in ECMAScript (JavaScript) regular expressions.
122 * Note: the value 6 is the same as (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS), not a separate bit.
123 * @draft ICU 73
124 */
125 USET_SIMPLE_CASE_INSENSITIVE = 6
126#endif // U_HIDE_DRAFT_API
127};
128
129/**
130 * Argument values for whether span() and similar functions continue while
131 * the current character is contained vs. not contained in the set.
132 *
133 * The functionality is straightforward for sets with only single code points,
134 * without strings (which is the common case):
135 * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same.
136 * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED.
137 * - span() and spanBack() partition any string the same way when
138 * alternating between span(USET_SPAN_NOT_CONTAINED) and
139 * span(either "contained" condition).
140 * - Using a complemented (inverted) set and the opposite span conditions
141 * yields the same results.
142 *
143 * When a set contains multi-code point strings, then these statements may not
144 * be true, depending on the strings in the set (for example, whether they
145 * overlap with each other) and the string that is processed.
146 * For a set with strings:
147 * - The complement of the set contains the opposite set of code points,
148 * but the same set of strings.
149 * Therefore, complementing both the set and the span conditions
150 * may yield different results.
151 * - When starting spans at different positions in a string
152 * (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
153 * because a set string may start before the later position.
154 * - span(USET_SPAN_SIMPLE) may be shorter than
155 * span(USET_SPAN_CONTAINED) because it will not recursively try
156 * all possible paths.
157 * For example, with a set which contains the three strings "xy", "xya" and "ax",
158 * span("xyax", USET_SPAN_CONTAINED) will return 4 but
159 * span("xyax", USET_SPAN_SIMPLE) will return 3.
160 * span(USET_SPAN_SIMPLE) will never be longer than
161 * span(USET_SPAN_CONTAINED).
162 * - With either "contained" condition, span() and spanBack() may partition
163 * a string in different ways.
164 * For example, with a set which contains the two strings "ab" and "ba",
165 * and when processing the string "aba",
166 * span() will yield contained/not-contained boundaries of { 0, 2, 3 }
167 * while spanBack() will yield boundaries of { 0, 1, 3 }.
168 *
169 * Note: If it is important to get the same boundaries whether iterating forward
170 * or backward through a string, then either only span() should be used and
171 * the boundaries cached for backward operation, or an ICU BreakIterator
172 * could be used.
173 *
174 * Note: Unpaired surrogates are treated like surrogate code points.
175 * Similarly, set strings match only on code point boundaries,
176 * never in the middle of a surrogate pair.
177 * Illegal UTF-8 sequences are treated like U+FFFD.
178 * When processing UTF-8 strings, malformed set strings
179 * (strings with unpaired surrogates which cannot be converted to UTF-8)
180 * are ignored.
181 *
182 * @stable ICU 3.8
183 */
184typedef enum USetSpanCondition {
185 /**
186 * Continues a span() while there is no set element at the current position.
187 * Increments by one code point at a time.
188 * Stops before the first set element (character or string).
189 * (For code points only, this is like while contains(current)==false).
190 *
191 * When span() returns, the substring between where it started and the position
192 * it returned consists only of characters that are not in the set,
193 * and none of its strings overlap with the span.
194 *
195 * @stable ICU 3.8
196 */
197 USET_SPAN_NOT_CONTAINED = 0,
198 /**
199 * Spans the longest substring that is a concatenation of set elements (characters or strings).
200 * (For characters only, this is like while contains(current)==true).
201 *
202 * When span() returns, the substring between where it started and the position
203 * it returned consists only of set elements (characters or strings) that are in the set.
204 *
205 * If a set contains strings, then the span will be the longest substring for which there
206 * exists at least one non-overlapping concatenation of set elements (characters or strings).
207 * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
208 * (Java/ICU/Perl regex stops at the first match of an OR.)
209 *
210 * @stable ICU 3.8
211 */
212 USET_SPAN_CONTAINED = 1,
213 /**
214 * Continues a span() while there is a set element at the current position.
215 * Increments by the longest matching element at each position.
216 * (For characters only, this is like while contains(current)==true).
217 *
218 * When span() returns, the substring between where it started and the position
219 * it returned consists only of set elements (characters or strings) that are in the set.
220 *
221 * If a set only contains single characters, then this is the same
222 * as USET_SPAN_CONTAINED.
223 *
224 * If a set contains strings, then the span will be the longest substring
225 * with a match at each position with the longest single set element (character or string).
226 *
227 * Use this span condition together with other longest-match algorithms,
228 * such as ICU converters (ucnv_getUnicodeSet()).
229 *
230 * @stable ICU 3.8
231 */
232 USET_SPAN_SIMPLE = 2,
233#ifndef U_HIDE_DEPRECATED_API
234 /**
235 * One more than the last span condition (currently 3, as it directly follows USET_SPAN_SIMPLE = 2).
236 * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
237 */
238 USET_SPAN_CONDITION_COUNT
239#endif // U_HIDE_DEPRECATED_API
240} USetSpanCondition;
241
242enum {
243 /**
244 * Capacity of USerializedSet::staticArray, counted in uint16_t units.
245 * Enough for any single-code point set.
246 * Also provides padding for nice sizeof(USerializedSet).
247 * @stable ICU 2.4
248 */
249 USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8
250};
251
252/**
253 * A serialized form of a Unicode set. Limited manipulations are
254 * possible directly on a serialized set. See below.
255 * @stable ICU 2.4
256 */
257typedef struct USerializedSet {
258 /**
259 * Pointer to the serialized Unicode Set, stored as read-only 16-bit units.
260 * @stable ICU 2.4
261 */
262 const uint16_t *array;
263 /**
264 * The length of the array that contains BMP characters.
265 * @stable ICU 2.4
266 */
267 int32_t bmpLength;
268 /**
269 * The total length of the array.
270 * @stable ICU 2.4
271 */
272 int32_t length;
273 /**
274 * A small buffer (USET_SERIALIZED_STATIC_ARRAY_CAPACITY 16-bit units) for the array to reduce memory allocations.
275 * @stable ICU 2.4
276 */
277 uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY];
278} USerializedSet;
279
280/*********************************************************************
281 * USet API
282 *********************************************************************/
283
284/**
285 * Create an empty USet object.
286 * Equivalent to uset_open(1, 0), that is, opening a range with start > end.
287 * @return a newly created USet. The caller must call uset_close() on
288 * it when done.
289 * @stable ICU 4.2
290 */
291U_CAPI USet* U_EXPORT2
292uset_openEmpty(void);
293
294/**
295 * Creates a USet object that contains the range of characters
296 * start..end, inclusive. If <code>start > end</code>
297 * then an empty set is created (same as using uset_openEmpty()).
298 * @param start first character of the range, inclusive
299 * @param end last character of the range, inclusive
300 * @return a newly created USet. The caller must call uset_close() on it when done.
301 * @see uset_close
302 * @stable ICU 2.4
303 */
304U_CAPI USet* U_EXPORT2
305uset_open(UChar32 start, UChar32 end);
306
307/**
308 * Creates a set from the given pattern. See the UnicodeSet class
309 * description for the syntax of the pattern language.
310 * @param pattern a string specifying what characters are in the set
311 * @param patternLength the length of the pattern, or -1 if NUL-terminated
312 * @param ec the error code
313 * @return the USet created from the pattern; destroy it with uset_close()
314 * @stable ICU 2.4
315 */
316U_CAPI USet* U_EXPORT2
317uset_openPattern(const UChar* pattern, int32_t patternLength,
318 UErrorCode* ec);
319
320/**
321 * Creates a set from the given pattern. See the UnicodeSet class
322 * description for the syntax of the pattern language.
323 * @param pattern a string specifying what characters are in the set
324 * @param patternLength the length of the pattern, or -1 if NUL-terminated
325 * @param options bitmask for options to apply to the pattern.
326 * Valid options are USET_IGNORE_SPACE and
327 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
328 * These case options are mutually exclusive.
329 * @param ec the error code
330 * @return the USet created from the pattern; destroy it with uset_close()
331 * @stable ICU 2.4
332 */
333U_CAPI USet* U_EXPORT2
334uset_openPatternOptions(const UChar* pattern, int32_t patternLength,
335 uint32_t options,
336 UErrorCode* ec);
337
338/**
339 * Disposes of the storage used by a USet object. This function should
340 * be called exactly once for each object returned by a uset_open*() function.
341 * @param set the object to dispose of
342 * @stable ICU 2.4
343 */
344U_CAPI void U_EXPORT2
345uset_close(USet* set);
346
347#if U_SHOW_CPLUSPLUS_API
348
349U_NAMESPACE_BEGIN
350
351/**
352 * \class LocalUSetPointer
353 * "Smart pointer" class, closes a USet via uset_close().
354 * For most methods see the LocalPointerBase base class.
355 * Only declared when U_SHOW_CPLUSPLUS_API is enabled.
356 * @see LocalPointerBase
357 * @see LocalPointer
358 * @stable ICU 4.4
359 */
360U_DEFINE_LOCAL_OPEN_POINTER(LocalUSetPointer, USet, uset_close);
361
362U_NAMESPACE_END
363
364#endif // U_SHOW_CPLUSPLUS_API
365
366/**
367 * Returns a copy of the given set.
368 * If the set is frozen, then the clone will be frozen as well.
369 * Use uset_cloneAsThawed() for a mutable clone of a frozen set.
370 * @param set the original set
371 * @return the newly allocated copy of the set
372 * @see uset_cloneAsThawed
373 * @stable ICU 3.8
374 */
375U_CAPI USet * U_EXPORT2
376uset_clone(const USet *set);
377
378/**
379 * Determines whether the set has been frozen (made immutable) or not.
380 * See the ICU4J Freezable interface for details.
381 * @param set the set
382 * @return true if the set has been frozen, false otherwise
383 * @see uset_freeze
384 * @see uset_cloneAsThawed
385 * @stable ICU 3.8
386 */
387U_CAPI UBool U_EXPORT2
388uset_isFrozen(const USet *set);
389
390/**
391 * Freeze the set (make it immutable).
392 * Once frozen, it cannot be unfrozen and is therefore thread-safe
393 * until it is deleted.
394 * See the ICU4J Freezable interface for details.
395 * Freezing the set may also make some operations faster, for example
396 * uset_contains() and uset_span().
397 * A frozen set will not be modified. (It remains frozen.)
398 * This function returns no value; the set passed in is frozen in place.
399 * @param set the set
400 * @see uset_isFrozen
401 * @see uset_cloneAsThawed
402 * @stable ICU 3.8
403 */
404U_CAPI void U_EXPORT2
405uset_freeze(USet *set);
406
407/**
408 * Clone the set and make the clone mutable, even if the original set is frozen.
409 * See the ICU4J Freezable interface for details.
410 * @param set the set
411 * @return the mutable clone
412 * @see uset_freeze
413 * @see uset_isFrozen
414 * @see uset_clone
415 * @stable ICU 3.8
416 */
417U_CAPI USet * U_EXPORT2
418uset_cloneAsThawed(const USet *set);
419
420/**
421 * Causes the USet object to represent the range <code>start..end</code>, inclusive.
422 * If <code>start > end</code> then this USet is set to an empty range.
423 * A frozen set will not be modified.
424 * @param set the object to set to the given range
425 * @param start first character in the set, inclusive
426 * @param end last character in the set, inclusive
427 * @stable ICU 3.2
428 */
429U_CAPI void U_EXPORT2
430uset_set(USet* set,
431 UChar32 start, UChar32 end);
432
433/**
434 * Modifies the set to represent the set specified by the given
435 * pattern. See the UnicodeSet class description for the syntax of
436 * the pattern language. See also the User Guide chapter about UnicodeSet.
437 * <em>Empties the set passed before applying the pattern.</em>
438 * A frozen set will not be modified.
439 * @param set The set to which the pattern is to be applied.
440 * @param pattern A pointer to UChar string specifying what characters are in the set.
441 * The character at pattern[0] must be a '['.
442 * @param patternLength The length of the UChar string. -1 if NUL terminated.
443 * @param options A bitmask for options to apply to the pattern.
444 * Valid options are USET_IGNORE_SPACE and
445 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS,
446 * USET_SIMPLE_CASE_INSENSITIVE.
447 * These case options are mutually exclusive.
448 * @param status Returns an error if the pattern cannot be parsed.
449 * @return Upon successful parse, the value is
450 * the index of the character after the closing ']'
451 * of the parsed pattern.
452 * If the status code indicates failure, then the return value
453 * is the index of the error in the source.
454 *
455 * @stable ICU 2.8
456 */
457U_CAPI int32_t U_EXPORT2
458uset_applyPattern(USet *set,
459 const UChar *pattern, int32_t patternLength,
460 uint32_t options,
461 UErrorCode *status);
462
463/**
464 * Modifies the set to contain those code points which have the given value
465 * for the given binary or enumerated property, as returned by
466 * u_getIntPropertyValue(). Prior contents of this set are lost.
467 * A frozen set will not be modified.
468 *
469 * @param set the object to contain the code points defined by the property
470 *
471 * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
472 * or UCHAR_INT_START..UCHAR_INT_LIMIT-1
473 * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
474 *
475 * @param value a value in the range u_getIntPropertyMinValue(prop)..
476 * u_getIntPropertyMaxValue(prop), with one exception. If prop is
477 * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
478 * rather a mask value produced by U_GET_GC_MASK(). This allows grouped
479 * categories such as [:L:] to be represented.
480 *
481 * @param ec error code input/output parameter
482 *
483 * @stable ICU 3.2
484 */
485U_CAPI void U_EXPORT2
486uset_applyIntPropertyValue(USet* set,
487 UProperty prop, int32_t value, UErrorCode* ec);
488
489/**
490 * Modifies the set to contain those code points which have the
491 * given value for the given property. Prior contents of this
492 * set are lost.
493 * A frozen set will not be modified.
494 *
495 * @param set the object to contain the code points defined by the given
496 * property and value alias
497 *
498 * @param prop a string specifying a property alias, either short or long.
499 * The name is matched loosely. See PropertyAliases.txt for names and a
500 * description of loose matching. If the value string is empty, then this
501 * string is interpreted as either a General_Category value alias, a Script
502 * value alias, a binary property alias, or a special ID. Special IDs are
503 * matched loosely and correspond to the following sets:
504 *
505 * "ANY" = [\\u0000-\\U0010FFFF],
506 * "ASCII" = [\\u0000-\\u007F],
507 * "Assigned" = [:^Cn:].
508 *
509 * @param propLength the length of the prop, or -1 if NUL-terminated
510 *
511 * @param value a string specifying a value alias, either short or long.
512 * The name is matched loosely. See PropertyValueAliases.txt for names
513 * and a description of loose matching. In addition to aliases listed,
514 * numeric values and canonical combining classes may be expressed
515 * numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string
516 * may also be empty.
517 *
518 * @param valueLength the length of the value, or -1 if NUL-terminated
519 *
520 * @param ec error code input/output parameter
521 *
522 * @stable ICU 3.2
523 */
524U_CAPI void U_EXPORT2
525uset_applyPropertyAlias(USet* set,
526 const UChar *prop, int32_t propLength,
527 const UChar *value, int32_t valueLength,
528 UErrorCode* ec);
529
530/**
531 * Return true if the given position, in the given pattern, appears
532 * to be the start of a UnicodeSet pattern.
533 * @param pattern a string specifying the pattern
534 * @param patternLength the length of the pattern, or -1 if NUL-terminated
535 * @param pos the given position
536 * @return true if the text at pos appears to be the start of a UnicodeSet pattern
537 * @stable ICU 3.2
538 */
539U_CAPI UBool U_EXPORT2
540uset_resemblesPattern(const UChar *pattern, int32_t patternLength,
541 int32_t pos);
542
543/**
544 * Returns a string representation of this set. If the result of
545 * calling this function is passed to uset_openPattern(), it
546 * will produce another set that is equal to this one.
547 * @param set the set
548 * @param result the string to receive the pattern, may be NULL
549 * @param resultCapacity the capacity of result, may be 0 if result is NULL
550 * @param escapeUnprintable if true then convert unprintable
551 * characters to their hex escape representations, \\uxxxx or
552 * \\Uxxxxxxxx. Unprintable characters are those other than
553 * U+000A, U+0020..U+007E.
554 * @param ec error code.
555 * @return length of string, possibly larger than resultCapacity
556 * @stable ICU 2.4
557 */
558U_CAPI int32_t U_EXPORT2
559uset_toPattern(const USet* set,
560 UChar* result, int32_t resultCapacity,
561 UBool escapeUnprintable,
562 UErrorCode* ec);
563
564/**
565 * Adds the given character to the given USet. After this call,
566 * uset_contains(set, c) will return true.
567 * A frozen set will not be modified.
568 * @param set the object to which to add the character
569 * @param c the character (code point) to add
570 * @stable ICU 2.4
571 */
572U_CAPI void U_EXPORT2
573uset_add(USet* set, UChar32 c);
574
575/**
576 * Adds all of the elements in the specified set to this set if
577 * they're not already present. This operation effectively
578 * modifies this set so that its value is the <i>union</i> of the two
579 * sets. The behavior of this operation is unspecified if the specified
580 * set is modified while the operation is in progress.
581 * A frozen set will not be modified.
582 *
583 * @param set the object to which to add the set
584 * @param additionalSet the source set whose elements are to be added to this set.
585 * @stable ICU 2.6
586 */
587U_CAPI void U_EXPORT2
588uset_addAll(USet* set, const USet *additionalSet);
589
590/**
591 * Adds the given range of characters to the given USet. After this call,
592 * uset_containsRange(set, start, end) will return true.
593 * A frozen set will not be modified.
594 * @param set the object to which to add the range
595 * @param start the first character of the range to add, inclusive
596 * @param end the last character of the range to add, inclusive
597 * @stable ICU 2.2
598 */
599U_CAPI void U_EXPORT2
600uset_addRange(USet* set, UChar32 start, UChar32 end);
601
602/**
603 * Adds the given string to the given USet. After this call,
604 * uset_containsString(set, str, strLen) will return true.
605 * A frozen set will not be modified.
606 * @param set the object to which to add the string
607 * @param str the string to add
608 * @param strLen the length of the string or -1 if NUL-terminated.
609 * @stable ICU 2.4
610 */
611U_CAPI void U_EXPORT2
612uset_addString(USet* set, const UChar* str, int32_t strLen);
613
614/**
615 * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"}
616 * If this set already contains any particular character, it has no effect on that character.
617 * A frozen set will not be modified.
618 * @param set the object to which to add the characters
619 * @param str the source string
620 * @param strLen the length of the string or -1 if NUL-terminated.
621 * @stable ICU 3.4
622 */
623U_CAPI void U_EXPORT2
624uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen);
625
626/**
627 * Removes the given character from the given USet. After this call,
628 * uset_contains(set, c) will return false.
629 * A frozen set will not be modified.
630 * @param set the object from which to remove the character
631 * @param c the character (code point) to remove
632 * @stable ICU 2.4
633 */
634U_CAPI void U_EXPORT2
635uset_remove(USet* set, UChar32 c);
636
637/**
638 * Removes the given range of characters from the given USet. After this call,
639 * uset_contains(set, c) will return false for every c in start..end.
640 * A frozen set will not be modified.
641 * @param set the object from which to remove the range
642 * @param start the first character of the range to remove, inclusive
643 * @param end the last character of the range to remove, inclusive
644 * @stable ICU 2.2
645 */
646U_CAPI void U_EXPORT2
647uset_removeRange(USet* set, UChar32 start, UChar32 end);
648
649/**
650 * Removes the given string from the given USet. After this call,
651 * uset_containsString(set, str, strLen) will return false.
652 * A frozen set will not be modified.
653 * @param set the object from which to remove the string
654 * @param str the string to remove
655 * @param strLen the length of the string or -1 if NUL-terminated.
656 * @stable ICU 2.4
657 */
658U_CAPI void U_EXPORT2
659uset_removeString(USet* set, const UChar* str, int32_t strLen);
660
661/**
662 * Removes EACH of the characters in this string from the set. Note: "ch" == {"c", "h"}
663 * A frozen set will not be modified.
664 *
665 * @param set the object to be modified
666 * @param str the string
667 * @param length the length of the string, or -1 if NUL-terminated
668 * @stable ICU 69
669 */
670U_CAPI void U_EXPORT2
671uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length);
672
673/**
674 * Removes from this set all of its elements that are contained in the
675 * specified set. This operation effectively modifies this
676 * set so that its value is the <i>asymmetric set difference</i> of
677 * the two sets (this set minus removeSet).
678 * A frozen set will not be modified.
679 * @param set the object from which the elements are to be removed
680 * @param removeSet the object that defines which elements will be
681 * removed from this set
682 * @stable ICU 3.2
683 */
684U_CAPI void U_EXPORT2
685uset_removeAll(USet* set, const USet* removeSet);
686
687/**
688 * Retains only the elements in this set that are contained in the
689 * specified range. If <code>start > end</code> then an empty range is
690 * retained, leaving the set empty. This is equivalent to
691 * a boolean logic AND, or a set INTERSECTION.
692 * A frozen set will not be modified.
693 *
694 * @param set the object for which to retain only the specified range
695 * @param start first character, inclusive, of range
696 * @param end last character, inclusive, of range
697 * @stable ICU 3.2
698 */
699U_CAPI void U_EXPORT2
700uset_retain(USet* set, UChar32 start, UChar32 end);
701
702/**
703 * Retains only the specified string from this set if it is present.
704 * Upon return this set will be empty if it did not contain str, or
705 * will only contain str if it did contain str.
706 * A frozen set will not be modified.
707 *
708 * @param set the object to be modified
709 * @param str the string
710 * @param length the length of the string, or -1 if NUL-terminated
711 * @stable ICU 69
712 */
713U_CAPI void U_EXPORT2
714uset_retainString(USet *set, const UChar *str, int32_t length);
715
716/**
717 * Retains EACH of the characters in this string. Note: "ch" is treated as {"c", "h"}
718 * A frozen set will not be modified.
719 *
720 * @param set the object to be modified
721 * @param str the string
722 * @param length the length of the string, or -1 if NUL-terminated
723 * @stable ICU 69
724 */
725U_CAPI void U_EXPORT2
726uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length);
727
728/**
729 * Retains only the elements in this set that are contained in the
730 * specified set. In other words, removes from this set all of
731 * its elements that are not contained in the specified set. This
732 * operation effectively modifies this set so that its value is
733 * the <i>intersection</i> of the two sets.
734 * A frozen set will not be modified.
735 *
736 * @param set the object on which to perform the retain
737 * @param retain the set that defines which elements this set will retain
738 * @stable ICU 3.2
739 */
740U_CAPI void U_EXPORT2
741uset_retainAll(USet* set, const USet* retain);
742
743/**
744 * Reallocate this object's internal structures to take up the least
745 * possible space, without changing this object's value.
746 * A frozen set will not be modified.
747 *
748 * @param set the object on which to perform the compact
749 * @stable ICU 3.2
750 */
751U_CAPI void U_EXPORT2
752uset_compact(USet* set);
753
754/**
755 * This is equivalent to
756 * <code>uset_complementRange(set, 0, 0x10FFFF)</code>.
757 *
758 * <strong>Note:</strong> This performs a symmetric difference with all code points
759 * <em>and thus retains all multicharacter strings</em>.
760 * In order to achieve a “code point complement” (all code points minus this set),
761 * the easiest way is to call <code>uset_complement(set); uset_removeAllStrings(set);</code>.
762 *
763 * A frozen set will not be modified.
764 * @param set the set
765 * @stable ICU 2.4
766 */
767U_CAPI void U_EXPORT2
768uset_complement(USet* set);
769
770/**
771 * Complements the specified range in this set. Any character in
772 * the range will be removed if it is in this set, or will be
773 * added if it is not in this set. If <code>start > end</code>
774 * then an empty range is complemented, leaving the set unchanged.
775 * This is equivalent to a boolean logic XOR of this set with the range.
776 * A frozen set will not be modified.
777 *
778 * @param set the object to be modified
779 * @param start first character, inclusive, of range
780 * @param end last character, inclusive, of range
781 * @stable ICU 69
782 */
783U_CAPI void U_EXPORT2
784uset_complementRange(USet *set, UChar32 start, UChar32 end);
785
786/**
787 * Complements the specified string in this set.
788 * The string will be removed if it is in this set, or will be added if it is not in this set.
789 * A frozen set will not be modified.
790 *
791 * @param set the object to be modified
792 * @param str the string to complement
793 * @param length the length of the string, or -1 if NUL-terminated
794 * @stable ICU 69
795 */
796U_CAPI void U_EXPORT2
797uset_complementString(USet *set, const UChar *str, int32_t length);
798
799/**
800 * Complements EACH of the characters in this string. Note: "ch" is treated as {"c", "h"}
801 * A frozen set will not be modified.
802 *
803 * @param set the object to be modified
804 * @param str the string
805 * @param length the length of the string, or -1 if NUL-terminated
806 * @stable ICU 69
807 */
808U_CAPI void U_EXPORT2
809uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length);
810
811/**
812 * Complements in this set all elements contained in the specified
813 * set. Any character in the other set will be removed if it is
814 * in this set, or will be added if it is not in this set.
815 * A frozen set will not be modified.
816 *
817 * @param set the set to be modified
818 * @param complement set that defines which elements will be xor'ed
819 * with this set.
820 * @stable ICU 3.2
821 */
822U_CAPI void U_EXPORT2
823uset_complementAll(USet* set, const USet* complement);
824
825/**
826 * Removes all of the elements from this set. This set will be
827 * empty after this call returns.
828 * A frozen set will not be modified.
829 * @param set the set to clear
830 * @stable ICU 2.4
831 */
832U_CAPI void U_EXPORT2
833uset_clear(USet* set);
834
835/**
836 * Close this set over the given attribute. For the attribute
837 * USET_CASE_INSENSITIVE, the result is to modify this set so that:
838 *
839 * 1. For each character or string 'a' in this set, all strings or
840 * characters 'b' such that foldCase(a) == foldCase(b) are added
841 * to this set.
842 *
843 * 2. For each string 'e' in the resulting set, if e !=
844 * foldCase(e), 'e' will be removed.
845 *
846 * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
847 *
848 * (Here foldCase(x) refers to the operation u_strFoldCase, and a
849 * == b denotes that the contents are the same, not pointer
850 * comparison.)
851 *
852 * A frozen set will not be modified.
853 *
854 * @param set the set
855 *
856 * @param attributes bitmask for attributes to close over.
857 * Valid options:
858 * At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
859 * These case options are mutually exclusive.
860 * Unrelated option bits are ignored.
861 * @stable ICU 4.2
862 */
863U_CAPI void U_EXPORT2
864uset_closeOver(USet* set, int32_t attributes);
865
866/**
867 * Removes all strings from this set.
868 *
869 * @param set the set to be modified
870 * @stable ICU 4.2
871 */
872U_CAPI void U_EXPORT2
873uset_removeAllStrings(USet* set);
874
875/**
876 * Returns true if the given USet contains no characters and no
877 * strings.
878 * @param set the set to be tested
879 * @return true if set contains neither characters nor strings
880 * @stable ICU 2.4
881 */
882U_CAPI UBool U_EXPORT2
883uset_isEmpty(const USet* set);
884
885/**
886 * @param set the set to be tested
887 * @return true if this set contains multi-character strings or the empty string.
888 * @stable ICU 70
889 */
890U_CAPI UBool U_EXPORT2
891uset_hasStrings(const USet *set);
892
893/**
894 * Returns true if the given USet contains the given character.
895 * This function works faster with a frozen set.
896 * @param set the set
897 * @param c the code point to check for within the set
898 * @return true if set contains c
899 * @stable ICU 2.4
900 */
901U_CAPI UBool U_EXPORT2
902uset_contains(const USet* set, UChar32 c);
903
904/**
905 * Returns true if the given USet contains all characters c
906 * where start <= c && c <= end.
907 * @param set the set
908 * @param start the first character of the range to test, inclusive
909 * @param end the last character of the range to test, inclusive
910 * @return true if set contains every character of the range
911 * @stable ICU 2.2
912 */
913U_CAPI UBool U_EXPORT2
914uset_containsRange(const USet* set, UChar32 start, UChar32 end);
915
916/**
917 * Returns true if the given USet contains the given string.
918 * @param set the set
919 * @param str the string
920 * @param strLen the length of the string, or -1 if NUL-terminated.
921 * @return true if set contains str
922 * @stable ICU 2.4
923 */
924U_CAPI UBool U_EXPORT2
925uset_containsString(const USet* set, const UChar* str, int32_t strLen);
926
927/**
928 * Returns the index of the given character within this set, where
929 * the set is ordered by ascending code point. If the character
930 * is not in this set, return -1. The inverse of this function is
931 * <code>uset_charAt()</code>.
932 * @param set the set
933 * @param c the character to obtain the index for
934 * @return an index from 0..uset_size(set)-1, or -1
935 * @stable ICU 3.2
936 */
937U_CAPI int32_t U_EXPORT2
938uset_indexOf(const USet* set, UChar32 c);
939
940/**
941 * Returns the character at the given index within this set, where
942 * the set is ordered by ascending code point. If the index is
943 * out of range for characters, returns (UChar32)-1.
944 * The inverse of this function is <code>uset_indexOf()</code>.
945 *
946 * For iteration, this is slower than uset_getRangeCount()/uset_getItemCount()
947 * with uset_getItem(), because for each call it skips linearly over <code>charIndex</code>
948 * characters in the ranges.
949 *
950 * @param set the set
951 * @param charIndex an index from 0..uset_size(set)-1 to obtain the char for
952 * @return the character at the given index, or (UChar32)-1.
953 * @stable ICU 3.2
954 */
955U_CAPI UChar32 U_EXPORT2
956uset_charAt(const USet* set, int32_t charIndex);
957
958/**
959 * Returns the number of characters and strings contained in this set.
960 * When iterating with uset_getItem(), the last (uset_getItemCount() - uset_getRangeCount()) items are strings.
961 *
962 * This is slower than uset_getRangeCount() and uset_getItemCount() because
963 * it counts the code points of all ranges.
964 *
965 * @param set the set
966 * @return a non-negative integer counting the characters and strings
967 * contained in set
968 * @stable ICU 2.4
969 * @see uset_getRangeCount
970 */
971U_CAPI int32_t U_EXPORT2
972uset_size(const USet* set);
973
974/**
975 * @param set the set
976 * @return the number of character ranges in this set; strings are not counted.
977 * @stable ICU 70
978 * @see uset_getItemCount
979 * @see uset_getItem
980 * @see uset_size
981 */
982U_CAPI int32_t U_EXPORT2
983uset_getRangeCount(const USet *set);
984
985/**
986 * Returns the number of items in this set. An item is either a range of
987 * characters or a single multicharacter string (which can be the empty string).
988 * @param set the set
989 * @return a non-negative integer counting the character ranges
990 * and/or strings contained in set
991 * @stable ICU 2.4
992 */
993U_CAPI int32_t U_EXPORT2
994uset_getItemCount(const USet* set);
995
996/**
997 * Returns an item of this set. An item is either a range of
998 * characters or a single multicharacter string (which can be the empty string).
999 *
1000 * If <code>itemIndex</code> is less than uset_getRangeCount(), then this function returns 0,
1001 * and the range is <code>*start</code>..<code>*end</code>.
1002 *
1003 * If <code>itemIndex</code> is at least uset_getRangeCount() and less than uset_getItemCount(), then
1004 * this function copies the string into <code>str[strCapacity]</code> and
1005 * returns the length of the string (0 for the empty string).
1006 *
1007 * If <code>itemIndex</code> is out of range, then this function returns -1.
1008 *
1009 * Note that 0 is returned for each range as well as for the empty string; compare itemIndex with uset_getRangeCount() to tell them apart.
1010 *
1011 * @param set the set
1012 * @param itemIndex a non-negative integer in the range 0..uset_getItemCount(set)-1
1013 * @param start pointer to variable to receive first character in range, inclusive;
1014 * can be NULL for a string item
1015 * @param end pointer to variable to receive last character in range, inclusive;
1016 * can be NULL for a string item
1017 * @param str buffer to receive the string, may be NULL
1018 * @param strCapacity capacity of str, or 0 if str is NULL
1019 * @param ec error code; U_INDEX_OUTOFBOUNDS_ERROR if the itemIndex is out of range
1020 * @return the length of the string (0 or >= 2), or 0 if the item is a range,
1021 * or -1 if the itemIndex is out of range
1022 * @stable ICU 2.4
1023 */
1024U_CAPI int32_t U_EXPORT2
1025uset_getItem(const USet* set, int32_t itemIndex,
1026 UChar32* start, UChar32* end,
1027 UChar* str, int32_t strCapacity,
1028 UErrorCode* ec);
1029
1030/**
1031 * Returns true if set1 contains all the characters and strings
1032 * of set2. It answers the question, 'Is set1 a superset of set2?'
1033 * @param set1 the candidate superset
1034 * @param set2 the set whose characters and strings are looked up in set1
1035 * @return true if the test condition is met
1036 * @stable ICU 3.2
1037 */
1038U_CAPI UBool U_EXPORT2
1039uset_containsAll(const USet* set1, const USet* set2);
1040
1041/**
1042 * Returns true if this set contains all the characters
1043 * of the given string. Unlike uset_containsString, this does not
1044 * check containment of grapheme clusters.
1045 * @param set set of characters to be checked for containment
1046 * @param str string containing code points to be checked for containment
1047 * @param strLen the length of the string, or -1 if NUL-terminated.
1048 * @return true if the test condition is met
1049 * @stable ICU 3.4
1050 */
1051U_CAPI UBool U_EXPORT2
1052uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen);
1053
1054/**
1055 * Returns true if set1 contains none of the characters and strings
1056 * of set2. It answers the question, 'Are set1 and set2 disjoint?'
1057 * @param set1 the set that is searched
1058 * @param set2 the set whose characters and strings are looked up in set1
1059 * @return true if the test condition is met
1060 * @stable ICU 3.2
1061 */
1062U_CAPI UBool U_EXPORT2
1063uset_containsNone(const USet* set1, const USet* set2);
1064
1065/**
1066 * Returns true if set1 contains some of the characters and strings
1067 * of set2. It answers the question, 'Do set1 and set2 have an intersection?'
1068 * @param set1 the set that is searched
1069 * @param set2 the set whose characters and strings are looked up in set1
1070 * @return true if the test condition is met
1071 * @stable ICU 3.2
1072 */
1073U_CAPI UBool U_EXPORT2
1074uset_containsSome(const USet* set1, const USet* set2);
1075
1076/**
1077 * Returns the length of the initial substring of the input string which
1078 * consists only of characters and strings that are contained in this set
1079 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
1080 * or only of characters and strings that are not contained
1081 * in this set (USET_SPAN_NOT_CONTAINED).
1082 * See USetSpanCondition for details.
1083 * Similar to the strspn() C library function.
1084 * Unpaired surrogates are treated according to uset_contains() of their surrogate code points.
1085 * This function works faster with a frozen set and with a non-negative string length argument.
1086 * @param set the set
1087 * @param s start of the string
1088 * @param length the length of the string; can be -1 for NUL-terminated
1089 * @param spanCondition specifies the containment condition
1090 * @return the length of the initial substring according to the spanCondition;
1091 * 0 if the start of the string does not fit the spanCondition
1092 * @stable ICU 3.8
1093 * @see USetSpanCondition
1094 */
1095U_CAPI int32_t U_EXPORT2
1096uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
1097
1098/**
1099 * Returns the start of the trailing substring of the input string which
1100 * consists only of characters and strings that are contained in this set
1101 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
1102 * or only of characters and strings that are not contained
1103 * in this set (USET_SPAN_NOT_CONTAINED).
1104 * See USetSpanCondition for details.
1105 * Unpaired surrogates are treated according to uset_contains() of their surrogate code points.
1106 * This function works faster with a frozen set and with a non-negative string length argument.
1107 * @param set the set
1108 * @param s start of the string
1109 * @param length the length of the string; can be -1 for NUL-terminated
1110 * @param spanCondition specifies the containment condition
1111 * @return the start of the trailing substring according to the spanCondition;
1112 * the string length if the end of the string does not fit the spanCondition
1113 * @stable ICU 3.8
1114 * @see USetSpanCondition
1115 */
1116U_CAPI int32_t U_EXPORT2
1117uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
1118
1119/**
1120 * Returns the length of the initial substring of the input string which
1121 * consists only of characters and strings that are contained in this set
1122 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
1123 * or only of characters and strings that are not contained
1124 * in this set (USET_SPAN_NOT_CONTAINED).
1125 * See USetSpanCondition for details.
1126 * Similar to the strspn() C library function.
1127 * Malformed byte sequences are treated according to uset_contains(set, 0xfffd).
1128 * This function works faster with a frozen set and with a non-negative string length argument.
1129 * @param set the set
1130 * @param s start of the string (UTF-8)
1131 * @param length the length of the string; can be -1 for NUL-terminated
1132 * @param spanCondition specifies the containment condition
1133 * @return the length of the initial substring according to the spanCondition;
1134 * 0 if the start of the string does not fit the spanCondition
1135 * @stable ICU 3.8
1136 * @see USetSpanCondition
1137 */
1138U_CAPI int32_t U_EXPORT2
1139uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
1140
1141/**
1142 * Returns the start of the trailing substring of the input string which
1143 * consists only of characters and strings that are contained in this set
1144 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
1145 * or only of characters and strings that are not contained
1146 * in this set (USET_SPAN_NOT_CONTAINED).
1147 * See USetSpanCondition for details.
1148 * Malformed byte sequences are treated according to uset_contains(set, 0xfffd).
1149 * This function works faster with a frozen set and with a non-negative string length argument.
1150 * @param set the set
1151 * @param s start of the string (UTF-8)
1152 * @param length the length of the string; can be -1 for NUL-terminated
1153 * @param spanCondition specifies the containment condition
1154 * @return the start of the trailing substring according to the spanCondition;
1155 * the string length if the end of the string does not fit the spanCondition
1156 * @stable ICU 3.8
1157 * @see USetSpanCondition
1158 */
1159U_CAPI int32_t U_EXPORT2
1160uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
1161
1162/**
1163 * Returns true if set1 contains all of the characters and strings
1164 * of set2, and vice versa. It answers the question, 'Is set1 equal to set2?'
1165 * @param set1 the first set to be compared
1166 * @param set2 the second set to be compared
1167 * @return true if the test condition is met
1168 * @stable ICU 3.2
1169 */
1170U_CAPI UBool U_EXPORT2
1171uset_equals(const USet* set1, const USet* set2);
1172
1173/*********************************************************************
1174 * Serialized set API
1175 *********************************************************************/
1176
1177/**
1178 * Serializes this set into an array of 16-bit integers. Serialization
1179 * (currently) only records the characters in the set; multicharacter
1180 * strings are ignored.
1181 *
1182 * The array
1183 * has the following format (each line is one 16-bit integer):
1184 *
1185 *   length = (n+2*m) | (m!=0?0x8000:0)
1186 *   bmpLength = n; present if m!=0
1187 *   bmp[0]
1188 *   bmp[1]
1189 *   ...
1190 *   bmp[n-1]
1191 *   supp-high[0]
1192 *   supp-low[0]
1193 *   supp-high[1]
1194 *   supp-low[1]
1195 *   ...
1196 *   supp-high[m-1]
1197 *   supp-low[m-1]
1198 *
1199 * The array starts with a header. After the header are n BMP
1200 * code points, then m supplementary code points. Either n or m
1201 * or both may be zero. n+2*m is always <= 0x7FFF.
1202 *
1203 * If there are no supplementary characters (if m==0) then the
1204 * header is one 16-bit integer, 'length', with value n.
1205 *
1206 * If there are supplementary characters (if m!=0) then the header
1207 * is two 16-bit integers. The first, 'length', has value
1208 * (n+2*m)|0x8000. The second, 'bmpLength', has value n.
1209 *
1210 * After the header the code points are stored in ascending order.
1211 * Supplementary code points are stored as most significant 16
1212 * bits followed by least significant 16 bits.
1213 *
1214 * @param set the set
1215 * @param dest pointer to buffer of destCapacity 16-bit integers.
1216 * May be NULL only if destCapacity is zero.
1217 * @param destCapacity size of dest, or zero. Must not be negative.
1218 * @param pErrorCode pointer to the error code. Will be set to
1219 * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to
1220 * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity.
1221 * @return the total length of the serialized format, including
1222 * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
1223 * than U_BUFFER_OVERFLOW_ERROR.
1224 * @stable ICU 2.4
1225 */
1226U_CAPI int32_t U_EXPORT2
1227uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode);
1228
1229/**
1230 * Given a serialized array, fill in the given serialized set object.
1231 * @param fillSet pointer to the USerializedSet that receives the result
1232 * @param src pointer to start of the serialized array
1233 * @param srcLength length of the serialized array
1234 * @return true if the given array is valid, otherwise false
1235 * @stable ICU 2.4
1236 */
1237U_CAPI UBool U_EXPORT2
1238uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength);
1239
1240/**
1241 * Set the USerializedSet to contain the given character (and nothing
1242 * else).
1243 * @param fillSet pointer to the USerializedSet that receives the result
1244 * @param c the code point to set
1245 * @stable ICU 2.4
1246 */
1247U_CAPI void U_EXPORT2
1248uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c);
1249
1250/**
1251 * Returns true if the given USerializedSet contains the given
1252 * character.
1253 * @param set the serialized set
1254 * @param c the code point to check for within the set
1255 * @return true if set contains c
1256 * @stable ICU 2.4
1257 */
1258U_CAPI UBool U_EXPORT2
1259uset_serializedContains(const USerializedSet* set, UChar32 c);
1260
1261/**
1262 * Returns the number of disjoint ranges of characters contained in
1263 * the given serialized set. Ignores any strings contained in the
1264 * set.
1265 * @param set the serialized set
1266 * @return a non-negative integer counting the disjoint character ranges
1267 * contained in set
1268 * @stable ICU 2.4
1269 */
1270U_CAPI int32_t U_EXPORT2
1271uset_getSerializedRangeCount(const USerializedSet* set);
1272
1273/**
1274 * Returns a range of characters contained in the given serialized
1275 * set.
1276 * @param set the serialized set
1277 * @param rangeIndex a non-negative integer in the range 0..
1278 * uset_getSerializedRangeCount(set)-1
1279 * @param pStart pointer to variable to receive the first character
1280 * in the range, inclusive
1281 * @param pEnd pointer to variable to receive the last character in the range,
1282 * inclusive
1283 * @return true if rangeIndex is valid, otherwise false
1284 * @stable ICU 2.4
1285 */
1286U_CAPI UBool U_EXPORT2
1287uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
1288 UChar32* pStart, UChar32* pEnd);
1289
1290#endif
1291