1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ******************************************************************************* |
5 | * |
6 | * Copyright (C) 2002-2014, International Business Machines |
7 | * Corporation and others. All Rights Reserved. |
8 | * |
9 | ******************************************************************************* |
10 | * file name: uset.h |
11 | * encoding: UTF-8 |
12 | * tab size: 8 (not used) |
13 | * indentation:4 |
14 | * |
15 | * created on: 2002mar07 |
16 | * created by: Markus W. Scherer |
17 | * |
18 | * C version of UnicodeSet. |
19 | */ |
20 | |
21 | |
22 | /** |
23 | * \file |
24 | * \brief C API: Unicode Set |
25 | * |
26 | * <p>This is a C wrapper around the C++ UnicodeSet class.</p> |
27 | */ |
28 | |
29 | #ifndef __USET_H__ |
30 | #define __USET_H__ |
31 | |
32 | #include "unicode/utypes.h" |
33 | #include "unicode/uchar.h" |
34 | #include "unicode/localpointer.h" |
35 | |
36 | #ifndef USET_DEFINED |
37 | |
38 | #ifndef U_IN_DOXYGEN |
39 | #define USET_DEFINED |
40 | #endif |
41 | /** |
42 | * USet is the C API type corresponding to C++ class UnicodeSet. |
43 | * Use the uset_* API to manipulate. Create with |
44 | * uset_open*, and destroy with uset_close. |
45 | * @stable ICU 2.4 |
46 | */ |
47 | typedef struct USet USet; |
48 | #endif |
49 | |
50 | /** |
51 | * Bitmask values to be passed to uset_openPatternOptions() or |
52 | * uset_applyPattern() taking an option parameter. |
53 | * @stable ICU 2.4 |
54 | */ |
55 | enum { |
56 | /** |
57 | * Ignore white space within patterns unless quoted or escaped. |
58 | * @stable ICU 2.4 |
59 | */ |
60 | USET_IGNORE_SPACE = 1, |
61 | |
62 | /** |
63 | * Enable case insensitive matching. E.g., "[ab]" with this flag |
64 | * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will |
65 | * match all except 'a', 'A', 'b', and 'B'. This performs a full |
66 | * closure over case mappings, e.g. U+017F for s. |
67 | * |
68 | * The resulting set is a superset of the input for the code points but |
69 | * not for the strings. |
70 | * It performs a case mapping closure of the code points and adds |
71 | * full case folding strings for the code points, and reduces strings of |
72 | * the original set to their full case folding equivalents. |
73 | * |
74 | * This is designed for case-insensitive matches, for example |
75 | * in regular expressions. The full code point case closure allows checking of |
76 | * an input character directly against the closure set. |
77 | * Strings are matched by comparing the case-folded form from the closure |
78 | * set with an incremental case folding of the string in question. |
79 | * |
80 | * The closure set will also contain single code points if the original |
81 | * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). |
82 | * This is not necessary (that is, redundant) for the above matching method |
83 | * but results in the same closure sets regardless of whether the original |
84 | * set contained the code point or a string. |
85 | * |
86 | * @stable ICU 2.4 |
87 | */ |
88 | USET_CASE_INSENSITIVE = 2, |
89 | |
90 | /** |
91 | * Enable case insensitive matching. E.g., "[ab]" with this flag |
92 | * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will |
93 | * match all except 'a', 'A', 'b', and 'B'. This adds the lower-, |
94 | * title-, and uppercase mappings as well as the case folding |
95 | * of each existing element in the set. |
96 | * @stable ICU 3.2 |
97 | */ |
98 | USET_ADD_CASE_MAPPINGS = 4 |
99 | }; |
100 | |
101 | /** |
102 | * Argument values for whether span() and similar functions continue while |
103 | * the current character is contained vs. not contained in the set. |
104 | * |
105 | * The functionality is straightforward for sets with only single code points, |
106 | * without strings (which is the common case): |
107 | * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same. |
108 | * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED. |
109 | * - span() and spanBack() partition any string the same way when |
110 | * alternating between span(USET_SPAN_NOT_CONTAINED) and |
111 | * span(either "contained" condition). |
112 | * - Using a complemented (inverted) set and the opposite span conditions |
113 | * yields the same results. |
114 | * |
115 | * When a set contains multi-code point strings, then these statements may not |
116 | * be true, depending on the strings in the set (for example, whether they |
117 | * overlap with each other) and the string that is processed. |
118 | * For a set with strings: |
119 | * - The complement of the set contains the opposite set of code points, |
120 | * but the same set of strings. |
121 | * Therefore, complementing both the set and the span conditions |
122 | * may yield different results. |
123 | * - When starting spans at different positions in a string |
124 | * (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different |
125 | * because a set string may start before the later position. |
126 | * - span(USET_SPAN_SIMPLE) may be shorter than |
127 | * span(USET_SPAN_CONTAINED) because it will not recursively try |
128 | * all possible paths. |
129 | * For example, with a set which contains the three strings "xy", "xya" and "ax", |
130 | * span("xyax", USET_SPAN_CONTAINED) will return 4 but |
131 | * span("xyax", USET_SPAN_SIMPLE) will return 3. |
132 | * span(USET_SPAN_SIMPLE) will never be longer than |
133 | * span(USET_SPAN_CONTAINED). |
134 | * - With either "contained" condition, span() and spanBack() may partition |
135 | * a string in different ways. |
136 | * For example, with a set which contains the two strings "ab" and "ba", |
137 | * and when processing the string "aba", |
138 | * span() will yield contained/not-contained boundaries of { 0, 2, 3 } |
139 | * while spanBack() will yield boundaries of { 0, 1, 3 }. |
140 | * |
141 | * Note: If it is important to get the same boundaries whether iterating forward |
142 | * or backward through a string, then either only span() should be used and |
143 | * the boundaries cached for backward operation, or an ICU BreakIterator |
144 | * could be used. |
145 | * |
146 | * Note: Unpaired surrogates are treated like surrogate code points. |
147 | * Similarly, set strings match only on code point boundaries, |
148 | * never in the middle of a surrogate pair. |
149 | * Illegal UTF-8 sequences are treated like U+FFFD. |
150 | * When processing UTF-8 strings, malformed set strings |
151 | * (strings with unpaired surrogates which cannot be converted to UTF-8) |
152 | * are ignored. |
153 | * |
154 | * @stable ICU 3.8 |
155 | */ |
156 | typedef enum USetSpanCondition { |
157 | /** |
158 | * Continues a span() while there is no set element at the current position. |
159 | * Increments by one code point at a time. |
160 | * Stops before the first set element (character or string). |
161 | * (For code points only, this is like while contains(current)==FALSE). |
162 | * |
163 | * When span() returns, the substring between where it started and the position |
164 | * it returned consists only of characters that are not in the set, |
165 | * and none of its strings overlap with the span. |
166 | * |
167 | * @stable ICU 3.8 |
168 | */ |
169 | USET_SPAN_NOT_CONTAINED = 0, |
170 | /** |
171 | * Spans the longest substring that is a concatenation of set elements (characters or strings). |
172 | * (For characters only, this is like while contains(current)==TRUE). |
173 | * |
174 | * When span() returns, the substring between where it started and the position |
175 | * it returned consists only of set elements (characters or strings) that are in the set. |
176 | * |
177 | * If a set contains strings, then the span will be the longest substring for which there |
178 | * exists at least one non-overlapping concatenation of set elements (characters or strings). |
179 | * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>. |
180 | * (Java/ICU/Perl regex stops at the first match of an OR.) |
181 | * |
182 | * @stable ICU 3.8 |
183 | */ |
184 | USET_SPAN_CONTAINED = 1, |
185 | /** |
186 | * Continues a span() while there is a set element at the current position. |
187 | * Increments by the longest matching element at each position. |
188 | * (For characters only, this is like while contains(current)==TRUE). |
189 | * |
190 | * When span() returns, the substring between where it started and the position |
191 | * it returned consists only of set elements (characters or strings) that are in the set. |
192 | * |
193 | * If a set only contains single characters, then this is the same |
194 | * as USET_SPAN_CONTAINED. |
195 | * |
196 | * If a set contains strings, then the span will be the longest substring |
197 | * with a match at each position with the longest single set element (character or string). |
198 | * |
199 | * Use this span condition together with other longest-match algorithms, |
200 | * such as ICU converters (ucnv_getUnicodeSet()). |
201 | * |
202 | * @stable ICU 3.8 |
203 | */ |
204 | USET_SPAN_SIMPLE = 2, |
205 | #ifndef U_HIDE_DEPRECATED_API |
206 | /** |
207 | * One more than the last span condition. |
208 | * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. |
209 | */ |
210 | USET_SPAN_CONDITION_COUNT |
211 | #endif // U_HIDE_DEPRECATED_API |
212 | } USetSpanCondition; |
213 | |
214 | enum { |
215 | /** |
216 | * Capacity of USerializedSet::staticArray. |
217 | * Enough for any single-code point set. |
218 | * Also provides padding for nice sizeof(USerializedSet). |
219 | * @stable ICU 2.4 |
220 | */ |
221 | USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8 |
222 | }; |
223 | |
224 | /** |
225 | * A serialized form of a Unicode set. Limited manipulations are |
226 | * possible directly on a serialized set. See below. |
227 | * @stable ICU 2.4 |
228 | */ |
229 | typedef struct USerializedSet { |
230 | /** |
231 | * The serialized Unicode Set. |
232 | * @stable ICU 2.4 |
233 | */ |
234 | const uint16_t *array; |
235 | /** |
236 | * The length of the array that contains BMP characters. |
237 | * @stable ICU 2.4 |
238 | */ |
239 | int32_t bmpLength; |
240 | /** |
241 | * The total length of the array. |
242 | * @stable ICU 2.4 |
243 | */ |
244 | int32_t length; |
245 | /** |
246 | * A small buffer for the array to reduce memory allocations. |
247 | * @stable ICU 2.4 |
248 | */ |
249 | uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY]; |
250 | } USerializedSet; |
251 | |
252 | /********************************************************************* |
253 | * USet API |
254 | *********************************************************************/ |
255 | |
256 | /** |
257 | * Create an empty USet object. |
258 | * Equivalent to uset_open(1, 0). |
259 | * @return a newly created USet. The caller must call uset_close() on |
260 | * it when done. |
261 | * @stable ICU 4.2 |
262 | */ |
263 | U_STABLE USet* U_EXPORT2 |
264 | uset_openEmpty(void); |
265 | |
266 | /** |
267 | * Creates a USet object that contains the range of characters |
268 | * start..end, inclusive. If <code>start > end</code> |
269 | * then an empty set is created (same as using uset_openEmpty()). |
270 | * @param start first character of the range, inclusive |
271 | * @param end last character of the range, inclusive |
272 | * @return a newly created USet. The caller must call uset_close() on |
273 | * it when done. |
274 | * @stable ICU 2.4 |
275 | */ |
276 | U_STABLE USet* U_EXPORT2 |
277 | uset_open(UChar32 start, UChar32 end); |
278 | |
279 | /** |
280 | * Creates a set from the given pattern. See the UnicodeSet class |
281 | * description for the syntax of the pattern language. |
282 | * @param pattern a string specifying what characters are in the set |
283 | * @param patternLength the length of the pattern, or -1 if null |
284 | * terminated |
285 | * @param ec the error code |
286 | * @stable ICU 2.4 |
287 | */ |
288 | U_STABLE USet* U_EXPORT2 |
289 | uset_openPattern(const UChar* pattern, int32_t patternLength, |
290 | UErrorCode* ec); |
291 | |
292 | /** |
293 | * Creates a set from the given pattern. See the UnicodeSet class |
294 | * description for the syntax of the pattern language. |
295 | * @param pattern a string specifying what characters are in the set |
296 | * @param patternLength the length of the pattern, or -1 if null |
297 | * terminated |
298 | * @param options bitmask for options to apply to the pattern. |
299 | * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. |
300 | * @param ec the error code |
301 | * @stable ICU 2.4 |
302 | */ |
303 | U_STABLE USet* U_EXPORT2 |
304 | uset_openPatternOptions(const UChar* pattern, int32_t patternLength, |
305 | uint32_t options, |
306 | UErrorCode* ec); |
307 | |
308 | /** |
309 | * Disposes of the storage used by a USet object. This function should |
310 | * be called exactly once for each object returned by a uset_open*() function. |
311 | * @param set the object to dispose of |
312 | * @stable ICU 2.4 |
313 | */ |
314 | U_STABLE void U_EXPORT2 |
315 | uset_close(USet* set); |
316 | |
317 | #if U_SHOW_CPLUSPLUS_API |
318 | |
319 | U_NAMESPACE_BEGIN |
320 | |
321 | /** |
322 | * \class LocalUSetPointer |
323 | * "Smart pointer" class, closes a USet via uset_close(). |
324 | * For most methods see the LocalPointerBase base class. |
325 | * |
326 | * @see LocalPointerBase |
327 | * @see LocalPointer |
328 | * @stable ICU 4.4 |
329 | */ |
330 | U_DEFINE_LOCAL_OPEN_POINTER(LocalUSetPointer, USet, uset_close); |
331 | |
332 | U_NAMESPACE_END |
333 | |
334 | #endif |
335 | |
336 | /** |
337 | * Returns a copy of this object. |
338 | * If this set is frozen, then the clone will be frozen as well. |
339 | * Use uset_cloneAsThawed() for a mutable clone of a frozen set. |
340 | * @param set the original set |
341 | * @return the newly allocated copy of the set |
342 | * @see uset_cloneAsThawed |
343 | * @stable ICU 3.8 |
344 | */ |
345 | U_STABLE USet * U_EXPORT2 |
346 | uset_clone(const USet *set); |
347 | |
348 | /** |
349 | * Determines whether the set has been frozen (made immutable) or not. |
350 | * See the ICU4J Freezable interface for details. |
351 | * @param set the set |
352 | * @return TRUE/FALSE for whether the set has been frozen |
353 | * @see uset_freeze |
354 | * @see uset_cloneAsThawed |
355 | * @stable ICU 3.8 |
356 | */ |
357 | U_STABLE UBool U_EXPORT2 |
358 | uset_isFrozen(const USet *set); |
359 | |
360 | /** |
361 | * Freeze the set (make it immutable). |
362 | * Once frozen, it cannot be unfrozen and is therefore thread-safe |
363 | * until it is deleted. |
364 | * See the ICU4J Freezable interface for details. |
365 | * Freezing the set may also make some operations faster, for example |
366 | * uset_contains() and uset_span(). |
367 | * A frozen set will not be modified. (It remains frozen.) |
368 | * The set is frozen in place; this function does not return a value. |
369 | * @param set the set |
370 | * @see uset_isFrozen |
371 | * @see uset_cloneAsThawed |
372 | * @stable ICU 3.8 |
373 | */ |
374 | U_STABLE void U_EXPORT2 |
375 | uset_freeze(USet *set); |
376 | |
377 | /** |
378 | * Clone the set and make the clone mutable. |
379 | * See the ICU4J Freezable interface for details. |
380 | * @param set the set |
381 | * @return the mutable clone |
382 | * @see uset_freeze |
383 | * @see uset_isFrozen |
384 | * @see uset_clone |
385 | * @stable ICU 3.8 |
386 | */ |
387 | U_STABLE USet * U_EXPORT2 |
388 | uset_cloneAsThawed(const USet *set); |
389 | |
390 | /** |
391 | * Causes the USet object to represent the range <code>start - end</code>. |
392 | * If <code>start > end</code> then this USet is set to an empty range. |
393 | * A frozen set will not be modified. |
394 | * @param set the object to set to the given range |
395 | * @param start first character in the set, inclusive |
396 | * @param end last character in the set, inclusive |
397 | * @stable ICU 3.2 |
398 | */ |
399 | U_STABLE void U_EXPORT2 |
400 | uset_set(USet* set, |
401 | UChar32 start, UChar32 end); |
402 | |
403 | /** |
404 | * Modifies the set to represent the set specified by the given |
405 | * pattern. See the UnicodeSet class description for the syntax of |
406 | * the pattern language. See also the User Guide chapter about UnicodeSet. |
407 | * <em>Empties the set passed before applying the pattern.</em> |
408 | * A frozen set will not be modified. |
409 | * @param set The set to which the pattern is to be applied. |
410 | * @param pattern A pointer to UChar string specifying what characters are in the set. |
411 | * The character at pattern[0] must be a '['. |
412 | * @param patternLength The length of the UChar string. -1 if NUL terminated. |
413 | * @param options A bitmask for options to apply to the pattern. |
414 | * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. |
415 | * @param status Returns an error if the pattern cannot be parsed. |
416 | * @return Upon successful parse, the value is |
417 | * the index of the character after the closing ']' |
418 | * of the parsed pattern. |
419 | * If the status code indicates failure, then the return value |
420 | * is the index of the error in the source. |
421 | * |
422 | * @stable ICU 2.8 |
423 | */ |
424 | U_STABLE int32_t U_EXPORT2 |
425 | uset_applyPattern(USet *set, |
426 | const UChar *pattern, int32_t patternLength, |
427 | uint32_t options, |
428 | UErrorCode *status); |
429 | |
430 | /** |
431 | * Modifies the set to contain those code points which have the given value |
432 | * for the given binary or enumerated property, as returned by |
433 | * u_getIntPropertyValue. Prior contents of this set are lost. |
434 | * A frozen set will not be modified. |
435 | * |
436 | * @param set the object to contain the code points defined by the property |
437 | * |
438 | * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 |
439 | * or UCHAR_INT_START..UCHAR_INT_LIMIT-1 |
440 | * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1. |
441 | * |
442 | * @param value a value in the range u_getIntPropertyMinValue(prop).. |
443 | * u_getIntPropertyMaxValue(prop), with one exception. If prop is |
444 | * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but |
445 | * rather a mask value produced by U_GET_GC_MASK(). This allows grouped |
446 | * categories such as [:L:] to be represented. |
447 | * |
448 | * @param ec error code input/output parameter |
449 | * |
450 | * @stable ICU 3.2 |
451 | */ |
452 | U_STABLE void U_EXPORT2 |
453 | uset_applyIntPropertyValue(USet* set, |
454 | UProperty prop, int32_t value, UErrorCode* ec); |
455 | |
456 | /** |
457 | * Modifies the set to contain those code points which have the |
458 | * given value for the given property. Prior contents of this |
459 | * set are lost. |
460 | * A frozen set will not be modified. |
461 | * |
462 | * @param set the object to contain the code points defined by the given |
463 | * property and value alias |
464 | * |
465 | * @param prop a string specifying a property alias, either short or long. |
466 | * The name is matched loosely. See PropertyAliases.txt for names and a |
467 | * description of loose matching. If the value string is empty, then this |
468 | * string is interpreted as either a General_Category value alias, a Script |
469 | * value alias, a binary property alias, or a special ID. Special IDs are |
470 | * matched loosely and correspond to the following sets: |
471 | * |
472 | * "ANY" = [\\u0000-\\U0010FFFF], |
473 | * "ASCII" = [\\u0000-\\u007F], |
474 | * "Assigned" = [:^Cn:]. |
475 | * |
476 | * @param propLength the length of the prop, or -1 if NUL-terminated |
477 | * |
478 | * @param value a string specifying a value alias, either short or long. |
479 | * The name is matched loosely. See PropertyValueAliases.txt for names |
480 | * and a description of loose matching. In addition to aliases listed, |
481 | * numeric values and canonical combining classes may be expressed |
482 | * numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string |
483 | * may also be empty. |
484 | * |
485 | * @param valueLength the length of the value, or -1 if NUL-terminated |
486 | * |
487 | * @param ec error code input/output parameter |
488 | * |
489 | * @stable ICU 3.2 |
490 | */ |
491 | U_STABLE void U_EXPORT2 |
492 | uset_applyPropertyAlias(USet* set, |
493 | const UChar *prop, int32_t propLength, |
494 | const UChar *value, int32_t valueLength, |
495 | UErrorCode* ec); |
496 | |
497 | /** |
498 | * Return true if the given position, in the given pattern, appears |
499 | * to be the start of a UnicodeSet pattern. |
500 | * |
501 | * @param pattern a string specifying the pattern |
502 | * @param patternLength the length of the pattern, or -1 if NUL-terminated |
503 | * @param pos the given position |
504 | * @stable ICU 3.2 |
505 | */ |
506 | U_STABLE UBool U_EXPORT2 |
507 | uset_resemblesPattern(const UChar *pattern, int32_t patternLength, |
508 | int32_t pos); |
509 | |
510 | /** |
511 | * Returns a string representation of this set. If the result of |
512 | * calling this function is passed to a uset_openPattern(), it |
513 | * will produce another set that is equal to this one. |
514 | * @param set the set |
515 | * @param result the string to receive the rules, may be NULL |
516 | * @param resultCapacity the capacity of result, may be 0 if result is NULL |
517 | * @param escapeUnprintable if TRUE then convert unprintable |
518 | * characters to their hex escape representations, \\uxxxx or |
519 | * \\Uxxxxxxxx. Unprintable characters are those other than |
520 | * U+000A, U+0020..U+007E. |
521 | * @param ec error code. |
522 | * @return length of string, possibly larger than resultCapacity |
523 | * @stable ICU 2.4 |
524 | */ |
525 | U_STABLE int32_t U_EXPORT2 |
526 | uset_toPattern(const USet* set, |
527 | UChar* result, int32_t resultCapacity, |
528 | UBool escapeUnprintable, |
529 | UErrorCode* ec); |
530 | |
531 | /** |
532 | * Adds the given character to the given USet. After this call, |
533 | * uset_contains(set, c) will return TRUE. |
534 | * A frozen set will not be modified. |
535 | * @param set the object to which to add the character |
536 | * @param c the character to add |
537 | * @stable ICU 2.4 |
538 | */ |
539 | U_STABLE void U_EXPORT2 |
540 | uset_add(USet* set, UChar32 c); |
541 | |
542 | /** |
543 | * Adds all of the elements in the specified set to this set if |
544 | * they're not already present. This operation effectively |
545 | * modifies this set so that its value is the <i>union</i> of the two |
546 | * sets. The behavior of this operation is unspecified if the specified |
547 | * collection is modified while the operation is in progress. |
548 | * A frozen set will not be modified. |
549 | * |
550 | * @param set the object to which to add the set |
551 | * @param additionalSet the source set whose elements are to be added to this set. |
552 | * @stable ICU 2.6 |
553 | */ |
554 | U_STABLE void U_EXPORT2 |
555 | uset_addAll(USet* set, const USet *additionalSet); |
556 | |
557 | /** |
558 | * Adds the given range of characters to the given USet. After this call, |
559 | * uset_containsRange(set, start, end) will return TRUE. |
560 | * A frozen set will not be modified. |
561 | * @param set the object to which to add the range |
562 | * @param start the first character of the range to add, inclusive |
563 | * @param end the last character of the range to add, inclusive |
564 | * @stable ICU 2.2 |
565 | */ |
566 | U_STABLE void U_EXPORT2 |
567 | uset_addRange(USet* set, UChar32 start, UChar32 end); |
568 | |
569 | /** |
570 | * Adds the given string to the given USet. After this call, |
571 | * uset_containsString(set, str, strLen) will return TRUE. |
572 | * A frozen set will not be modified. |
573 | * @param set the object to which to add the string |
574 | * @param str the string to add |
575 | * @param strLen the length of the string or -1 if null terminated. |
576 | * @stable ICU 2.4 |
577 | */ |
578 | U_STABLE void U_EXPORT2 |
579 | uset_addString(USet* set, const UChar* str, int32_t strLen); |
580 | |
581 | /** |
582 | * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} |
583 | * If this set already contains any particular character, it has no effect on that character. |
584 | * A frozen set will not be modified. |
585 | * @param set the object to which to add the characters |
586 | * @param str the source string |
587 | * @param strLen the length of the string or -1 if null terminated. |
588 | * @stable ICU 3.4 |
589 | */ |
590 | U_STABLE void U_EXPORT2 |
591 | uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen); |
592 | |
593 | /** |
594 | * Removes the given character from the given USet. After this call, |
595 | * uset_contains(set, c) will return FALSE. |
596 | * A frozen set will not be modified. |
597 | * @param set the object from which to remove the character |
598 | * @param c the character to remove |
599 | * @stable ICU 2.4 |
600 | */ |
601 | U_STABLE void U_EXPORT2 |
602 | uset_remove(USet* set, UChar32 c); |
603 | |
604 | /** |
605 | * Removes the given range of characters from the given USet. After this call, |
606 | * uset_contains(set, c) will return FALSE for every c in start..end. |
607 | * A frozen set will not be modified. |
608 | * @param set the object from which to remove the range |
609 | * @param start the first character of the range to remove, inclusive |
610 | * @param end the last character of the range to remove, inclusive |
611 | * @stable ICU 2.2 |
612 | */ |
613 | U_STABLE void U_EXPORT2 |
614 | uset_removeRange(USet* set, UChar32 start, UChar32 end); |
615 | |
616 | /** |
617 | * Removes the given string from the given USet. After this call, |
618 | * uset_containsString(set, str, strLen) will return FALSE. |
619 | * A frozen set will not be modified. |
620 | * @param set the object from which to remove the string |
621 | * @param str the string to remove |
622 | * @param strLen the length of the string or -1 if null terminated. |
623 | * @stable ICU 2.4 |
624 | */ |
625 | U_STABLE void U_EXPORT2 |
626 | uset_removeString(USet* set, const UChar* str, int32_t strLen); |
627 | |
628 | /** |
629 | * Removes from this set all of its elements that are contained in the |
630 | * specified set. This operation effectively modifies this |
631 | * set so that its value is the <i>asymmetric set difference</i> of |
632 | * the two sets. |
633 | * A frozen set will not be modified. |
634 | * @param set the object from which the elements are to be removed |
635 | * @param removeSet the object that defines which elements will be |
636 | * removed from this set |
637 | * @stable ICU 3.2 |
638 | */ |
639 | U_STABLE void U_EXPORT2 |
640 | uset_removeAll(USet* set, const USet* removeSet); |
641 | |
642 | /** |
643 | * Retain only the elements in this set that are contained in the |
644 | * specified range. If <code>start > end</code> then an empty range is |
645 | * retained, leaving the set empty. This is equivalent to |
646 | * a boolean logic AND, or a set INTERSECTION. |
647 | * A frozen set will not be modified. |
648 | * |
649 | * @param set the object for which to retain only the specified range |
650 | * @param start first character, inclusive, of range to be retained |
651 | * to this set. |
652 | * @param end last character, inclusive, of range to be retained |
653 | * to this set. |
654 | * @stable ICU 3.2 |
655 | */ |
656 | U_STABLE void U_EXPORT2 |
657 | uset_retain(USet* set, UChar32 start, UChar32 end); |
658 | |
659 | /** |
660 | * Retains only the elements in this set that are contained in the |
661 | * specified set. In other words, removes from this set all of |
662 | * its elements that are not contained in the specified set. This |
663 | * operation effectively modifies this set so that its value is |
664 | * the <i>intersection</i> of the two sets. |
665 | * A frozen set will not be modified. |
666 | * |
667 | * @param set the object on which to perform the retain |
668 | * @param retain set that defines which elements this set will retain |
669 | * @stable ICU 3.2 |
670 | */ |
671 | U_STABLE void U_EXPORT2 |
672 | uset_retainAll(USet* set, const USet* retain); |
673 | |
674 | /** |
675 | * Reallocate this object's internal structures to take up the least |
676 | * possible space, without changing this object's value. |
677 | * A frozen set will not be modified. |
678 | * |
679 | * @param set the object on which to perform the compact |
680 | * @stable ICU 3.2 |
681 | */ |
682 | U_STABLE void U_EXPORT2 |
683 | uset_compact(USet* set); |
684 | |
685 | /** |
686 | * Inverts this set. This operation modifies this set so that |
687 | * its value is its complement. This operation does not affect |
688 | * the multicharacter strings, if any. |
689 | * A frozen set will not be modified. |
690 | * @param set the set |
691 | * @stable ICU 2.4 |
692 | */ |
693 | U_STABLE void U_EXPORT2 |
694 | uset_complement(USet* set); |
695 | |
696 | /** |
697 | * Complements in this set all elements contained in the specified |
698 | * set. Any character in the other set will be removed if it is |
699 | * in this set, or will be added if it is not in this set. |
700 | * A frozen set will not be modified. |
701 | * |
702 | * @param set the set with which to complement |
703 | * @param complement set that defines which elements will be xor'ed |
704 | * from this set. |
705 | * @stable ICU 3.2 |
706 | */ |
707 | U_STABLE void U_EXPORT2 |
708 | uset_complementAll(USet* set, const USet* complement); |
709 | |
710 | /** |
711 | * Removes all of the elements from this set. This set will be |
712 | * empty after this call returns. |
713 | * A frozen set will not be modified. |
714 | * @param set the set |
715 | * @stable ICU 2.4 |
716 | */ |
717 | U_STABLE void U_EXPORT2 |
718 | uset_clear(USet* set); |
719 | |
720 | /** |
721 | * Close this set over the given attribute. For the attribute |
722 | * USET_CASE_INSENSITIVE, the result is to modify this set so that: |
723 | * |
724 | * 1. For each character or string 'a' in this set, all strings or |
725 | * characters 'b' such that foldCase(a) == foldCase(b) are added |
726 | * to this set. |
727 | * |
728 | * 2. For each string 'e' in the resulting set, if e != |
729 | * foldCase(e), 'e' will be removed. |
730 | * |
731 | * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}] |
732 | * |
733 | * (Here foldCase(x) refers to the operation u_strFoldCase, and a |
734 | * == b denotes that the contents are the same, not pointer |
735 | * comparison.) |
736 | * |
737 | * A frozen set will not be modified. |
738 | * |
739 | * @param set the set |
740 | * |
741 | * @param attributes bitmask for attributes to close over. |
742 | * Currently only the USET_CASE_INSENSITIVE bit is supported. Any undefined bits |
743 | * are ignored. |
744 | * @stable ICU 4.2 |
745 | */ |
746 | U_STABLE void U_EXPORT2 |
747 | uset_closeOver(USet* set, int32_t attributes); |
748 | |
749 | /** |
750 | * Remove all strings from this set. |
751 | * |
752 | * @param set the set |
753 | * @stable ICU 4.2 |
754 | */ |
755 | U_STABLE void U_EXPORT2 |
756 | uset_removeAllStrings(USet* set); |
757 | |
758 | /** |
759 | * Returns TRUE if the given USet contains no characters and no |
760 | * strings. |
761 | * @param set the set |
762 | * @return true if set is empty |
763 | * @stable ICU 2.4 |
764 | */ |
765 | U_STABLE UBool U_EXPORT2 |
766 | uset_isEmpty(const USet* set); |
767 | |
768 | /** |
769 | * Returns TRUE if the given USet contains the given character. |
770 | * This function works faster with a frozen set. |
771 | * @param set the set |
772 | * @param c The codepoint to check for within the set |
773 | * @return true if set contains c |
774 | * @stable ICU 2.4 |
775 | */ |
776 | U_STABLE UBool U_EXPORT2 |
777 | uset_contains(const USet* set, UChar32 c); |
778 | |
779 | /** |
780 | * Returns TRUE if the given USet contains every character c |
781 | * where start <= c && c <= end. |
782 | * @param set the set to search |
783 | * @param start the first character of the range to test, inclusive |
784 | * @param end the last character of the range to test, inclusive |
785 | * @return TRUE if the set contains the whole range |
786 | * @stable ICU 2.2 |
787 | */ |
788 | U_STABLE UBool U_EXPORT2 |
789 | uset_containsRange(const USet* set, UChar32 start, UChar32 end); |
790 | |
791 | /** |
792 | * Returns TRUE if the given USet contains the given string. |
793 | * @param set the set to search |
794 | * @param str the string to look for |
795 | * @param strLen the length of the string, or -1 if it is NUL-terminated |
796 | * @return TRUE if the set contains str |
797 | * @stable ICU 2.4 |
798 | */ |
799 | U_STABLE UBool U_EXPORT2 |
800 | uset_containsString(const USet* set, const UChar* str, int32_t strLen); |
801 | |
802 | /** |
803 | * Returns the index of the given character within the given set, where |
804 | * the set is ordered by ascending code point. If the character |
805 | * is not in the set, returns -1. The inverse of this function is |
806 | * <code>uset_charAt()</code>. |
807 | * @param set the set |
808 | * @param c the character to obtain the index for |
809 | * @return an index from 0..size()-1, or -1 if c is not in the set |
810 | * @stable ICU 3.2 |
811 | */ |
812 | U_STABLE int32_t U_EXPORT2 |
813 | uset_indexOf(const USet* set, UChar32 c); |
814 | |
815 | /** |
816 | * Returns the character at the given index within the given set, where |
817 | * the set is ordered by ascending code point. If the index is |
818 | * out of range, returns (UChar32)-1. The inverse of this function is |
819 | * <code>uset_indexOf()</code>. |
820 | * @param set the set |
821 | * @param charIndex an index from 0..size()-1 to obtain the char for |
822 | * @return the character at the given index, or (UChar32)-1 if the index is out of range |
823 | * @stable ICU 3.2 |
824 | */ |
825 | U_STABLE UChar32 U_EXPORT2 |
826 | uset_charAt(const USet* set, int32_t charIndex); |
827 | |
828 | /** |
829 | * Returns the number of characters and strings contained in the given |
830 | * USet. Compare uset_getItemCount(), which counts character ranges rather than individual characters. |
831 | * @param set the set |
832 | * @return a non-negative integer counting the characters and strings |
833 | * contained in set |
834 | * @stable ICU 2.4 |
835 | */ |
836 | U_STABLE int32_t U_EXPORT2 |
837 | uset_size(const USet* set); |
838 | |
839 | /** |
840 | * Returns the number of items in the given set. An item is either a range |
841 | * of characters or a single multicharacter string. Items are retrieved with uset_getItem(). |
842 | * @param set the set |
843 | * @return a non-negative integer counting the character ranges |
844 | * and/or strings contained in set |
845 | * @stable ICU 2.4 |
846 | */ |
847 | U_STABLE int32_t U_EXPORT2 |
848 | uset_getItemCount(const USet* set); |
849 | |
850 | /** |
851 | * Returns an item of the given set. An item is either a range of |
852 | * characters or a single multicharacter string. |
853 | * @param set the set |
854 | * @param itemIndex a non-negative integer in the range 0.. |
855 | * uset_getItemCount(set)-1 |
856 | * @param start pointer to variable to receive the first character |
857 | * in the range, inclusive |
858 | * @param end pointer to variable to receive the last character in the range, |
859 | * inclusive |
860 | * @param str buffer to receive the string, may be NULL |
861 | * @param strCapacity capacity of str, or 0 if str is NULL |
862 | * @param ec pointer to the error code |
863 | * @return the length of the string (>= 2), or 0 if the item is a |
864 | * range, in which case it is the range *start..*end, or -1 if |
865 | * itemIndex is out of range |
866 | * @stable ICU 2.4 |
867 | */ |
868 | U_STABLE int32_t U_EXPORT2 |
869 | uset_getItem(const USet* set, int32_t itemIndex, |
870 | UChar32* start, UChar32* end, |
871 | UChar* str, int32_t strCapacity, |
872 | UErrorCode* ec); |
873 | |
874 | /** |
875 | * Returns true if set1 contains all the characters and strings |
876 | * of set2. It answers the question, 'Is set1 a superset of set2?' |
877 | * @param set1 the candidate superset |
878 | * @param set2 the set whose characters and strings are looked for in set1 |
879 | * @return true if the test condition is met |
880 | * @stable ICU 3.2 |
881 | */ |
882 | U_STABLE UBool U_EXPORT2 |
883 | uset_containsAll(const USet* set1, const USet* set2); |
884 | |
885 | /** |
886 | * Returns true if the given set contains all the characters |
887 | * of the given string. This does not check containment of grapheme |
888 | * clusters, like uset_containsString. |
889 | * @param set set of characters to be checked for containment |
890 | * @param str string containing codepoints to be checked for containment |
891 | * @param strLen the length of the string, or -1 if it is NUL-terminated |
892 | * @return true if the test condition is met |
893 | * @stable ICU 3.4 |
894 | */ |
895 | U_STABLE UBool U_EXPORT2 |
896 | uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen); |
897 | |
898 | /** |
899 | * Returns true if set1 contains none of the characters and strings |
900 | * of set2. It answers the question, 'Are set1 and set2 disjoint?' |
901 | * @param set1 the set to search |
902 | * @param set2 the set whose characters and strings are looked for in set1 |
903 | * @return true if the test condition is met |
904 | * @stable ICU 3.2 |
905 | */ |
906 | U_STABLE UBool U_EXPORT2 |
907 | uset_containsNone(const USet* set1, const USet* set2); |
908 | |
909 | /** |
910 | * Returns true if set1 contains some of the characters and strings |
911 | * of set2. It answers the question, 'Do set1 and set2 have an intersection?' |
912 | * @param set1 the set to search |
913 | * @param set2 the set whose characters and strings are looked for in set1 |
914 | * @return true if the test condition is met |
915 | * @stable ICU 3.2 |
916 | */ |
917 | U_STABLE UBool U_EXPORT2 |
918 | uset_containsSome(const USet* set1, const USet* set2); |
919 | |
920 | /** |
921 | * Returns the length of the initial substring of the input string which |
922 | * consists only of characters and strings that are contained in this set |
923 | * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), |
924 | * or only of characters and strings that are not contained |
925 | * in this set (USET_SPAN_NOT_CONTAINED). |
926 | * See USetSpanCondition for details. |
927 | * Similar to the strspn() C library function. |
928 | * Unpaired surrogates are treated according to contains() of their surrogate code points. |
929 | * This function works faster with a frozen set and with a non-negative string length argument. |
930 | * @param set the set |
931 | * @param s start of the string |
932 | * @param length the length of the string; can be -1 for NUL-terminated |
933 | * @param spanCondition specifies the containment condition |
934 | * @return the length of the initial substring according to the spanCondition; |
935 | * 0 if the start of the string does not fit the spanCondition |
936 | * @stable ICU 3.8 |
937 | * @see USetSpanCondition |
938 | */ |
939 | U_STABLE int32_t U_EXPORT2 |
940 | uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); |
941 | |
942 | /** |
943 | * Returns the start of the trailing substring of the input string which |
944 | * consists only of characters and strings that are contained in this set |
945 | * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), |
946 | * or only of characters and strings that are not contained |
947 | * in this set (USET_SPAN_NOT_CONTAINED). |
948 | * See USetSpanCondition for details. |
949 | * Unpaired surrogates are treated according to contains() of their surrogate code points. |
950 | * This function works faster with a frozen set and with a non-negative string length argument. |
951 | * @param set the set |
952 | * @param s start of the string |
953 | * @param length the length of the string; can be -1 for NUL-terminated |
954 | * @param spanCondition specifies the containment condition |
955 | * @return the start of the trailing substring according to the spanCondition; |
956 | * the string length if the end of the string does not fit the spanCondition |
957 | * @stable ICU 3.8 |
958 | * @see USetSpanCondition |
959 | */ |
960 | U_STABLE int32_t U_EXPORT2 |
961 | uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); |
962 | |
963 | /** |
964 | * Returns the length of the initial substring of the input string which |
965 | * consists only of characters and strings that are contained in this set |
966 | * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), |
967 | * or only of characters and strings that are not contained |
968 | * in this set (USET_SPAN_NOT_CONTAINED). |
969 | * See USetSpanCondition for details. |
970 | * Similar to the strspn() C library function. |
971 | * Malformed byte sequences are treated according to contains(0xfffd). |
972 | * This function works faster with a frozen set and with a non-negative string length argument. |
973 | * @param set the set |
974 | * @param s start of the string (UTF-8) |
975 | * @param length the length of the string; can be -1 for NUL-terminated |
976 | * @param spanCondition specifies the containment condition |
977 | * @return the length of the initial substring according to the spanCondition; |
978 | * 0 if the start of the string does not fit the spanCondition |
979 | * @stable ICU 3.8 |
980 | * @see USetSpanCondition |
981 | */ |
982 | U_STABLE int32_t U_EXPORT2 |
983 | uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); |
984 | |
985 | /** |
986 | * Returns the start of the trailing substring of the input string which |
987 | * consists only of characters and strings that are contained in this set |
988 | * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), |
989 | * or only of characters and strings that are not contained |
990 | * in this set (USET_SPAN_NOT_CONTAINED). |
991 | * See USetSpanCondition for details. |
992 | * Malformed byte sequences are treated according to contains(0xfffd). |
993 | * This function works faster with a frozen set and with a non-negative string length argument. |
994 | * @param set the set |
995 | * @param s start of the string (UTF-8) |
996 | * @param length the length of the string; can be -1 for NUL-terminated |
997 | * @param spanCondition specifies the containment condition |
998 | * @return the start of the trailing substring according to the spanCondition; |
999 | * the string length if the end of the string does not fit the spanCondition |
1000 | * @stable ICU 3.8 |
1001 | * @see USetSpanCondition |
1002 | */ |
1003 | U_STABLE int32_t U_EXPORT2 |
1004 | uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); |
1005 | |
1006 | /** |
1007 | * Returns true if set1 contains all of the characters and strings |
1008 | * of set2, and vice versa. It answers the question, 'Is set1 equal to set2?' |
1009 | * @param set1 the first set to compare |
1010 | * @param set2 the second set to compare |
1011 | * @return true if the test condition is met |
1012 | * @stable ICU 3.2 |
1013 | */ |
1014 | U_STABLE UBool U_EXPORT2 |
1015 | uset_equals(const USet* set1, const USet* set2); |
1016 | |
1017 | /********************************************************************* |
1018 | * Serialized set API |
1019 | *********************************************************************/ |
1020 | |
1021 | /** |
1022 | * Serializes the given set into an array of 16-bit integers. Serialization |
1023 | * (currently) only records the characters in the set; multicharacter |
1024 | * strings are ignored. |
1025 | * |
1026 | * The array |
1027 | * has the following format (each line is one 16-bit integer): |
1028 | * |
1029 | * length = (n+2*m) | (m!=0?0x8000:0) |
1030 | * bmpLength = n; present if m!=0 |
1031 | * bmp[0] |
1032 | * bmp[1] |
1033 | * ... |
1034 | * bmp[n-1] |
1035 | * supp-high[0] |
1036 | * supp-low[0] |
1037 | * supp-high[1] |
1038 | * supp-low[1] |
1039 | * ... |
1040 | * supp-high[m-1] |
1041 | * supp-low[m-1] |
1042 | * |
1043 | * The array starts with a header. After the header are n BMP |
1044 | * code points, then m supplementary code points. Either n or m |
1045 | * or both may be zero. n+2*m is always <= 0x7FFF. |
1046 | * |
1047 | * If there are no supplementary characters (if m==0) then the |
1048 | * header is one 16-bit integer, 'length', with value n. |
1049 | * |
1050 | * If there are supplementary characters (if m!=0) then the header |
1051 | * is two 16-bit integers. The first, 'length', has value |
1052 | * (n+2*m)|0x8000. The second, 'bmpLength', has value n. |
1053 | * |
1054 | * After the header the code points are stored in ascending order. |
1055 | * Supplementary code points are stored as most significant 16 |
1056 | * bits followed by least significant 16 bits. |
1057 | * |
1058 | * @param set the set |
1059 | * @param dest pointer to buffer of destCapacity 16-bit integers. |
1060 | * May be NULL only if destCapacity is zero. |
1061 | * @param destCapacity size of dest, or zero. Must not be negative. |
1062 | * @param pErrorCode pointer to the error code. Will be set to |
1063 | * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to |
1064 | * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity. |
1065 | * @return the total length of the serialized format, including |
1066 | * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other |
1067 | * than U_BUFFER_OVERFLOW_ERROR. |
1068 | * @stable ICU 2.4 |
1069 | */ |
1070 | U_STABLE int32_t U_EXPORT2 |
1071 | uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode); |
1072 | |
1073 | /** |
1074 | * Given a serialized array, fills in the given serialized set object. |
1075 | * @param fillSet pointer to the USerializedSet that receives the result |
1076 | * @param src pointer to the start of the array |
1077 | * @param srcLength length of the array |
1078 | * @return true if the given array is valid, otherwise false |
1079 | * @stable ICU 2.4 |
1080 | */ |
1081 | U_STABLE UBool U_EXPORT2 |
1082 | uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength); |
1083 | |
1084 | /** |
1085 | * Sets the USerializedSet to contain the given character (and nothing |
1086 | * else). |
1087 | * @param fillSet pointer to the USerializedSet that receives the result |
1088 | * @param c the code point to set |
1089 | * @stable ICU 2.4 |
1090 | */ |
1091 | U_STABLE void U_EXPORT2 |
1092 | uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c); |
1093 | |
1094 | /** |
1095 | * Returns TRUE if the given USerializedSet contains the given |
1096 | * character. |
1097 | * @param set the serialized set to search |
1098 | * @param c the code point to check for within the set |
1099 | * @return TRUE if the set contains c |
1100 | * @stable ICU 2.4 |
1101 | */ |
1102 | U_STABLE UBool U_EXPORT2 |
1103 | uset_serializedContains(const USerializedSet* set, UChar32 c); |
1104 | |
1105 | /** |
1106 | * Returns the number of disjoint ranges of characters contained in |
1107 | * the given serialized set. Ignores any strings contained in the |
1108 | * set. Ranges are retrieved with uset_getSerializedRange(). |
1109 | * @param set the serialized set |
1110 | * @return a non-negative integer counting the character ranges |
1111 | * contained in set |
1112 | * @stable ICU 2.4 |
1113 | */ |
1114 | U_STABLE int32_t U_EXPORT2 |
1115 | uset_getSerializedRangeCount(const USerializedSet* set); |
1116 | |
1117 | /** |
1118 | * Returns a range of characters contained in the given serialized |
1119 | * set. |
1120 | * @param set the serialized set |
1121 | * @param rangeIndex a non-negative integer in the range 0.. |
1122 | * uset_getSerializedRangeCount(set)-1 |
1123 | * @param pStart pointer to variable to receive the first character |
1124 | * in the range, inclusive |
1125 | * @param pEnd pointer to variable to receive the last character in the range, |
1126 | * inclusive |
1127 | * @return true if rangeIndex is valid, otherwise false |
1128 | * @stable ICU 2.4 |
1129 | */ |
1130 | U_STABLE UBool U_EXPORT2 |
1131 | uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, |
1132 | UChar32* pStart, UChar32* pEnd); |
1133 | |
1134 | #endif |
1135 | |