1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4***************************************************************************
5* Copyright (C) 1999-2016, International Business Machines Corporation
6* and others. All Rights Reserved.
7***************************************************************************
8* Date Name Description
9* 10/20/99 alan Creation.
10***************************************************************************
11*/
12
13#ifndef UNICODESET_H
14#define UNICODESET_H
15
16#include "unicode/utypes.h"
17
18#if U_SHOW_CPLUSPLUS_API
19
20#include "unicode/ucpmap.h"
21#include "unicode/unifilt.h"
22#include "unicode/unistr.h"
23#include "unicode/uset.h"
24
25/**
26 * \file
27 * \brief C++ API: Unicode Set
28 */
29
30U_NAMESPACE_BEGIN
31
32// Forward Declarations.
33class BMPSet;
34class ParsePosition;
35class RBBIRuleScanner;
36class SymbolTable;
37class UnicodeSetStringSpan;
38class UVector;
39class RuleCharacterIterator;
40
41/**
42 * A mutable set of Unicode characters and multicharacter strings. Objects of this class
43 * represent <em>character classes</em> used in regular expressions.
44 * A character class specifies a subset of Unicode code points. Legal
45 * code points are U+0000 to U+10FFFF, inclusive.
46 *
47 * <p>The UnicodeSet class is not designed to be subclassed.
48 *
49 * <p><code>UnicodeSet</code> supports two APIs. The first is the
50 * <em>operand</em> API that allows the caller to modify the value of
51 * a <code>UnicodeSet</code> object. It conforms to Java 2's
52 * <code>java.util.Set</code> interface, although
53 * <code>UnicodeSet</code> does not actually implement that
54 * interface. All methods of <code>Set</code> are supported, with the
55 * modification that they take a character range or single character
56 * instead of an <code>Object</code>, and they take a
57 * <code>UnicodeSet</code> instead of a <code>Collection</code>. The
58 * operand API may be thought of in terms of boolean logic: a boolean
59 * OR is implemented by <code>add</code>, a boolean AND is implemented
60 * by <code>retain</code>, a boolean XOR is implemented by
61 * <code>complement</code> taking an argument, and a boolean NOT is
62 * implemented by <code>complement</code> with no argument. In terms
63 * of traditional set theory function names, <code>add</code> is a
64 * union, <code>retain</code> is an intersection, <code>remove</code>
65 * is an asymmetric difference, and <code>complement</code> with no
66 * argument is a set complement with respect to the superset range
67 * <code>MIN_VALUE-MAX_VALUE</code>.
68 *
69 * <p>The second API is the
70 * <code>applyPattern()</code>/<code>toPattern()</code> API from the
71 * <code>java.text.Format</code>-derived classes. Unlike the
72 * methods that add characters, add categories, and control the logic
73 * of the set, the method <code>applyPattern()</code> sets all
74 * attributes of a <code>UnicodeSet</code> at once, based on a
75 * string pattern.
76 *
77 * <p><b>Pattern syntax</b></p>
78 *
79 * Patterns are accepted by the constructors and the
80 * <code>applyPattern()</code> methods and returned by the
81 * <code>toPattern()</code> method. These patterns follow a syntax
82 * similar to that employed by version 8 regular expression character
83 * classes. Here are some simple examples:
84 *
85 * \htmlonly<blockquote>\endhtmlonly
86 * <table>
87 * <tr align="top">
88 * <td nowrap valign="top" align="left"><code>[]</code></td>
89 * <td valign="top">No characters</td>
90 * </tr><tr align="top">
91 * <td nowrap valign="top" align="left"><code>[a]</code></td>
92 * <td valign="top">The character 'a'</td>
93 * </tr><tr align="top">
94 * <td nowrap valign="top" align="left"><code>[ae]</code></td>
95 * <td valign="top">The characters 'a' and 'e'</td>
96 * </tr>
97 * <tr>
98 * <td nowrap valign="top" align="left"><code>[a-e]</code></td>
99 * <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
100 * point order</td>
101 * </tr>
102 * <tr>
103 * <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
104 * <td valign="top">The character U+4E01</td>
105 * </tr>
106 * <tr>
107 * <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
108 * <td valign="top">The character 'a' and the multicharacter strings &quot;ab&quot; and
109 * &quot;ac&quot;</td>
110 * </tr>
111 * <tr>
112 * <td nowrap valign="top" align="left"><code>[\\p{Lu}]</code></td>
113 * <td valign="top">All characters in the general category Uppercase Letter</td>
114 * </tr>
115 * </table>
116 * \htmlonly</blockquote>\endhtmlonly
117 *
118 * Any character may be preceded by a backslash in order to remove any special
119 * meaning. White space characters, as defined by UCharacter.isWhitespace(), are
120 * ignored, unless they are escaped.
121 *
122 * <p>Property patterns specify a set of characters having a certain
123 * property as defined by the Unicode standard. Both the POSIX-like
124 * "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized. For a
125 * complete list of supported property patterns, see the User's Guide
126 * for UnicodeSet at
127 * <a href="https://unicode-org.github.io/icu/userguide/strings/unicodeset">
128 * https://unicode-org.github.io/icu/userguide/strings/unicodeset</a>.
129 * Actual determination of property data is defined by the underlying
130 * Unicode database as implemented by UCharacter.
131 *
132 * <p>Patterns specify individual characters, ranges of characters, and
133 * Unicode property sets. When elements are concatenated, they
134 * specify their union. To complement a set, place a '^' immediately
135 * after the opening '['. Property patterns are inverted by modifying
136 * their delimiters; "[:^foo:]" and "\\P{foo}". In any other location,
137 * '^' has no special meaning.
138 *
139 * <p>Since ICU 70, "[^...]", "[:^foo:]", "\\P{foo}", and "[:binaryProperty=No:]"
140 * perform a “code point complement” (all code points minus the original set),
141 * removing all multicharacter strings,
142 * equivalent to <code>.complement().removeAllStrings()</code>.
143 * The complement() API function continues to perform a
144 * symmetric difference with all code points and thus retains all multicharacter strings.
145 *
146 * <p>Ranges are indicated by placing a '-' between two
147 * characters, as in "a-z". This specifies the range of all
148 * characters from the left to the right, in Unicode order. If the
149 * left character is greater than or equal to the
150 * right character it is a syntax error. If a '-' occurs as the first
151 * character after the opening '[' or '[^', or if it occurs as the
152 * last character before the closing ']', then it is taken as a
153 * literal. Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same
154 * set of three characters, 'a', 'b', and '-'.
155 *
156 * <p>Sets may be intersected using the '&' operator or the asymmetric
157 * set difference may be taken using the '-' operator, for example,
158 * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
159 * with values less than 4096. Operators ('&' and '-') have equal
160 * precedence and bind left-to-right. Thus
161 * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
162 * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for
163 * difference; intersection is commutative.
164 *
165 * <table>
166 * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
167 * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
168 * through 'z' and all letters in between, in Unicode order
169 * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
170 * all characters but 'a' through 'z',
171 * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
172 * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
173 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
174 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
175 * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
176 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
177 * <td>The asymmetric difference of sets specified by <em>pat1</em> and
178 * <em>pat2</em>
179 * <tr valign=top><td nowrap><code>[:Lu:] or \\p{Lu}</code>
180 * <td>The set of characters having the specified
181 * Unicode property; in
182 * this case, Unicode uppercase letters
183 * <tr valign=top><td nowrap><code>[:^Lu:] or \\P{Lu}</code>
184 * <td>The set of characters <em>not</em> having the given
185 * Unicode property
186 * </table>
187 *
188 * <p><b>Formal syntax</b></p>
189 *
190 * \htmlonly<blockquote>\endhtmlonly
191 * <table>
192 * <tr align="top">
193 * <td nowrap valign="top" align="right"><code>pattern :=&nbsp; </code></td>
194 * <td valign="top"><code>('[' '^'? item* ']') |
195 * property</code></td>
196 * </tr>
197 * <tr align="top">
198 * <td nowrap valign="top" align="right"><code>item :=&nbsp; </code></td>
199 * <td valign="top"><code>char | (char '-' char) | pattern-expr<br>
200 * </code></td>
201 * </tr>
202 * <tr align="top">
203 * <td nowrap valign="top" align="right"><code>pattern-expr :=&nbsp; </code></td>
204 * <td valign="top"><code>pattern | pattern-expr pattern |
205 * pattern-expr op pattern<br>
206 * </code></td>
207 * </tr>
208 * <tr align="top">
209 * <td nowrap valign="top" align="right"><code>op :=&nbsp; </code></td>
210 * <td valign="top"><code>'&amp;' | '-'<br>
211 * </code></td>
212 * </tr>
213 * <tr align="top">
214 * <td nowrap valign="top" align="right"><code>special :=&nbsp; </code></td>
215 * <td valign="top"><code>'[' | ']' | '-'<br>
216 * </code></td>
217 * </tr>
218 * <tr align="top">
219 * <td nowrap valign="top" align="right"><code>char :=&nbsp; </code></td>
220 * <td valign="top"><em>any character that is not</em><code> special<br>
221 * | ('\' </code><em>any character</em><code>)<br>
222 * | ('\\u' hex hex hex hex)<br>
223 * </code></td>
224 * </tr>
225 * <tr align="top">
226 * <td nowrap valign="top" align="right"><code>hex :=&nbsp; </code></td>
227 * <td valign="top"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br>
228 * &nbsp;&nbsp;&nbsp;&nbsp;'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td>
229 * </tr>
230 * <tr>
231 * <td nowrap valign="top" align="right"><code>property :=&nbsp; </code></td>
232 * <td valign="top"><em>a Unicode property set pattern</em></td>
233 * </tr>
234 * </table>
235 * <br>
236 * <table border="1">
237 * <tr>
238 * <td>Legend: <table>
239 * <tr>
240 * <td nowrap valign="top"><code>a := b</code></td>
241 * <td width="20" valign="top">&nbsp; </td>
242 * <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
243 * </tr>
244 * <tr>
245 * <td nowrap valign="top"><code>a?</code></td>
246 * <td valign="top"></td>
247 * <td valign="top">zero or one instance of <code>a</code><br>
248 * </td>
249 * </tr>
250 * <tr>
251 * <td nowrap valign="top"><code>a*</code></td>
252 * <td valign="top"></td>
253 * <td valign="top">zero or more instances of <code>a</code><br>
254 * </td>
255 * </tr>
256 * <tr>
257 * <td nowrap valign="top"><code>a | b</code></td>
258 * <td valign="top"></td>
259 * <td valign="top">either <code>a</code> or <code>b</code><br>
260 * </td>
261 * </tr>
262 * <tr>
263 * <td nowrap valign="top"><code>'a'</code></td>
264 * <td valign="top"></td>
265 * <td valign="top">the literal string between the quotes </td>
266 * </tr>
267 * </table>
268 * </td>
269 * </tr>
270 * </table>
271 * \htmlonly</blockquote>\endhtmlonly
272 *
273 * <p>Note:
274 * - Most UnicodeSet methods do not take a UErrorCode parameter because
275 * there are usually very few opportunities for failure other than a shortage
276 * of memory, error codes in low-level C++ string methods would be inconvenient,
277 * and the error code as the last parameter (ICU convention) would prevent
278 * the use of default parameter values.
279 * Instead, such methods set the UnicodeSet into a "bogus" state
280 * (see isBogus()) if an error occurs.
281 *
282 * @author Alan Liu
283 * @stable ICU 2.0
284 */
285class U_COMMON_API UnicodeSet final : public UnicodeFilter {
286private:
287 /**
288 * Enough for sets with few ranges.
289 * For example, White_Space has 10 ranges, list length 21.
290 */
291 static constexpr int32_t INITIAL_CAPACITY = 25;
292 // fFlags constant
293 static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. not valid)
294
295 UChar32* list = stackList; // MUST be terminated with HIGH
296 int32_t capacity = INITIAL_CAPACITY; // capacity of list
297 int32_t len = 1; // length of list used; 1 <= len <= capacity
298 uint8_t fFlags = 0; // Bit flag (see constants above)
299
300 BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not nullptr.
301 UChar32* buffer = nullptr; // internal buffer, may be nullptr
302 int32_t bufferCapacity = 0; // capacity of buffer
303
304 /**
305 * The pattern representation of this set. This may not be the
306 * most economical pattern. It is the pattern supplied to
307 * applyPattern(), with variables substituted and whitespace
308 * removed. For sets constructed without applyPattern(), or
309 * modified using the non-pattern API, this string will be empty,
310 * indicating that toPattern() must generate a pattern
311 * representation from the inversion list.
312 */
313 char16_t *pat = nullptr;
314 int32_t patLen = 0;
315
316 UVector* strings = nullptr; // maintained in sorted order
317 UnicodeSetStringSpan *stringSpan = nullptr;
318
319 /**
320 * Initial list array.
321 * Avoids some heap allocations, and list is never nullptr.
322 * Increases the object size a bit.
323 */
324 UChar32 stackList[INITIAL_CAPACITY];
325
326public:
327 /**
328 * Determine if this object contains a valid set.
329 * A bogus set has no value. It is different from an empty set.
330 * It can be used to indicate that no set value is available.
331 *
332 * @return true if the set is bogus/invalid, false otherwise
333 * @see setToBogus()
334 * @stable ICU 4.0
335 */
336 inline UBool isBogus(void) const;
337
338 /**
339 * Make this UnicodeSet object invalid.
340 * The set will test true with isBogus().
341 *
342 * A bogus set has no value. It is different from an empty set.
343 * It can be used to indicate that no set value is available.
344 *
345 * This utility function is used throughout the UnicodeSet
346 * implementation to indicate that a UnicodeSet operation failed,
347 * and may be used in other functions,
348 * especially but not exclusively when such functions do not
349 * take a UErrorCode for simplicity.
350 *
351 * @see isBogus()
352 * @stable ICU 4.0
353 */
354 void setToBogus();
355
356public:
357
358 enum {
359 /**
360 * Minimum value that can be stored in a UnicodeSet.
361 * @stable ICU 2.4
362 */
363 MIN_VALUE = 0,
364
365 /**
366 * Maximum value that can be stored in a UnicodeSet.
367 * @stable ICU 2.4
368 */
369 MAX_VALUE = 0x10ffff
370 };
371
372 //----------------------------------------------------------------
373 // Constructors &c
374 //----------------------------------------------------------------
375
376public:
377
378 /**
379 * Constructs an empty set.
380 * @stable ICU 2.0
381 */
382 UnicodeSet();
383
384 /**
385 * Constructs a set containing the given range. If <code>end <
386 * start</code> then an empty set is created.
387 *
388 * @param start first character, inclusive, of range
389 * @param end last character, inclusive, of range
390 * @stable ICU 2.4
391 */
392 UnicodeSet(UChar32 start, UChar32 end);
393
394#ifndef U_HIDE_INTERNAL_API
395 /**
396 * @internal
397 */
398 enum ESerialization {
399 kSerialized /* result of serialize() */
400 };
401
402 /**
403 * Constructs a set from the output of serialize().
404 *
405 * @param buffer the 16 bit array
406 * @param bufferLen the original length returned from serialize()
407 * @param serialization the value 'kSerialized'
408 * @param status error code
409 *
410 * @internal
411 */
412 UnicodeSet(const uint16_t buffer[], int32_t bufferLen,
413 ESerialization serialization, UErrorCode &status);
414#endif /* U_HIDE_INTERNAL_API */
415
416 /**
417 * Constructs a set from the given pattern. See the class
418 * description for the syntax of the pattern language.
419 * @param pattern a string specifying what characters are in the set
420 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
421 * contains a syntax error.
422 * @stable ICU 2.0
423 */
424 UnicodeSet(const UnicodeString& pattern,
425 UErrorCode& status);
426
427#ifndef U_HIDE_INTERNAL_API
428 /**
429 * Constructs a set from the given pattern. See the class
430 * description for the syntax of the pattern language.
431 * @param pattern a string specifying what characters are in the set
432 * @param options bitmask for options to apply to the pattern.
433 * Valid options are USET_IGNORE_SPACE and
434 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
435 * These case options are mutually exclusive.
436 * @param symbols a symbol table mapping variable names to values
437 * and stand-in characters to UnicodeSets; may be nullptr
438 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
439 * contains a syntax error.
440 * @internal
441 */
442 UnicodeSet(const UnicodeString& pattern,
443 uint32_t options,
444 const SymbolTable* symbols,
445 UErrorCode& status);
446#endif /* U_HIDE_INTERNAL_API */
447
448 /**
449 * Constructs a set from the given pattern. See the class description
450 * for the syntax of the pattern language.
451 * @param pattern a string specifying what characters are in the set
452 * @param pos on input, the position in pattern at which to start parsing.
453 * On output, the position after the last character parsed.
454 * @param options bitmask for options to apply to the pattern.
455 * Valid options are USET_IGNORE_SPACE and
456 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
457 * These case options are mutually exclusive.
458 * @param symbols a symbol table mapping variable names to values
459 * and stand-in characters to UnicodeSets; may be nullptr
460 * @param status input-output error code
461 * @stable ICU 2.8
462 */
463 UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
464 uint32_t options,
465 const SymbolTable* symbols,
466 UErrorCode& status);
467
468 /**
469 * Constructs a set that is identical to the given UnicodeSet.
470 * @stable ICU 2.0
471 */
472 UnicodeSet(const UnicodeSet& o);
473
474 /**
475 * Destructs the set.
476 * @stable ICU 2.0
477 */
478 virtual ~UnicodeSet();
479
480 /**
481 * Assigns this object to be a copy of another.
482 * A frozen set will not be modified.
483 * @stable ICU 2.0
484 */
485 UnicodeSet& operator=(const UnicodeSet& o);
486
487 /**
488 * Compares the specified object with this set for equality. Returns
489 * <tt>true</tt> if the two sets
490 * have the same size, and every member of the specified set is
491 * contained in this set (or equivalently, every member of this set is
492 * contained in the specified set).
493 *
494 * @param o set to be compared for equality with this set.
495 * @return <tt>true</tt> if the specified set is equal to this set.
496 * @stable ICU 2.0
497 */
498 virtual bool operator==(const UnicodeSet& o) const;
499
500 /**
501 * Compares the specified object with this set for inequality. Returns
502 * <tt>true</tt> if the specified set is not equal to this set.
503 * @stable ICU 2.0
504 */
505 inline bool operator!=(const UnicodeSet& o) const;
506
507 /**
508 * Returns a copy of this object. All UnicodeFunctor objects have
509 * to support cloning in order to allow classes using
510 * UnicodeFunctors, such as Transliterator, to implement cloning.
511 * If this set is frozen, then the clone will be frozen as well.
512 * Use cloneAsThawed() for a mutable clone of a frozen set.
513 * @see cloneAsThawed
514 * @stable ICU 2.0
515 */
516 virtual UnicodeSet* clone() const override;
517
518 /**
519 * Returns the hash code value for this set.
520 *
521 * @return the hash code value for this set.
522 * @see Object#hashCode()
523 * @stable ICU 2.0
524 */
525 virtual int32_t hashCode(void) const;
526
527 /**
528 * Get a UnicodeSet pointer from a USet
529 *
530 * @param uset a USet (the ICU plain C type for UnicodeSet)
531 * @return the corresponding UnicodeSet pointer.
532 *
533 * @stable ICU 4.2
534 */
535 inline static UnicodeSet *fromUSet(USet *uset);
536
537 /**
538 * Get a UnicodeSet pointer from a const USet
539 *
540 * @param uset a const USet (the ICU plain C type for UnicodeSet)
541 * @return the corresponding UnicodeSet pointer.
542 *
543 * @stable ICU 4.2
544 */
545 inline static const UnicodeSet *fromUSet(const USet *uset);
546
547 /**
548 * Produce a USet * pointer for this UnicodeSet.
549 * USet is the plain C type for UnicodeSet
550 *
551 * @return a USet pointer for this UnicodeSet
552 * @stable ICU 4.2
553 */
554 inline USet *toUSet();
555
556
557 /**
558 * Produce a const USet * pointer for this UnicodeSet.
559 * USet is the plain C type for UnicodeSet
560 *
561 * @return a const USet pointer for this UnicodeSet
562 * @stable ICU 4.2
563 */
564 inline const USet * toUSet() const;
565
566
567 //----------------------------------------------------------------
568 // Freezable API
569 //----------------------------------------------------------------
570
571 /**
572 * Determines whether the set has been frozen (made immutable) or not.
573 * See the ICU4J Freezable interface for details.
574 * @return true/false for whether the set has been frozen
575 * @see freeze
576 * @see cloneAsThawed
577 * @stable ICU 3.8
578 */
579 inline UBool isFrozen() const;
580
581 /**
582 * Freeze the set (make it immutable).
583 * Once frozen, it cannot be unfrozen and is therefore thread-safe
584 * until it is deleted.
585 * See the ICU4J Freezable interface for details.
586 * Freezing the set may also make some operations faster, for example
587 * contains() and span().
588 * A frozen set will not be modified. (It remains frozen.)
589 * @return this set.
590 * @see isFrozen
591 * @see cloneAsThawed
592 * @stable ICU 3.8
593 */
594 UnicodeSet *freeze();
595
596 /**
597 * Clone the set and make the clone mutable.
598 * See the ICU4J Freezable interface for details.
599 * @return the mutable clone
600 * @see freeze
601 * @see isFrozen
602 * @stable ICU 3.8
603 */
604 UnicodeSet *cloneAsThawed() const;
605
606 //----------------------------------------------------------------
607 // Public API
608 //----------------------------------------------------------------
609
610 /**
611 * Make this object represent the range `start - end`.
612 * If `start > end` then this object is set to an empty range.
613 * A frozen set will not be modified.
614 *
615 * @param start first character in the set, inclusive
616 * @param end last character in the set, inclusive
617 * @stable ICU 2.4
618 */
619 UnicodeSet& set(UChar32 start, UChar32 end);
620
621 /**
622 * Return true if the given position, in the given pattern, appears
623 * to be the start of a UnicodeSet pattern.
624 * @stable ICU 2.4
625 */
626 static UBool resemblesPattern(const UnicodeString& pattern,
627 int32_t pos);
628
629 /**
630 * Modifies this set to represent the set specified by the given
631 * pattern, ignoring Unicode Pattern_White_Space characters.
632 * See the class description for the syntax of the pattern language.
633 * A frozen set will not be modified.
634 * @param pattern a string specifying what characters are in the set
635 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
636 * contains a syntax error.
637 * <em> Empties the set passed before applying the pattern.</em>
638 * @return a reference to this
639 * @stable ICU 2.0
640 */
641 UnicodeSet& applyPattern(const UnicodeString& pattern,
642 UErrorCode& status);
643
644#ifndef U_HIDE_INTERNAL_API
645 /**
646 * Modifies this set to represent the set specified by the given
647 * pattern, optionally ignoring Unicode Pattern_White_Space characters.
648 * See the class description for the syntax of the pattern language.
649 * A frozen set will not be modified.
650 * @param pattern a string specifying what characters are in the set
651 * @param options bitmask for options to apply to the pattern.
652 * Valid options are USET_IGNORE_SPACE and
653 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
654 * These case options are mutually exclusive.
655 * @param symbols a symbol table mapping variable names to
656 * values and stand-ins to UnicodeSets; may be nullptr
657 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
658 * contains a syntax error.
659 *<em> Empties the set passed before applying the pattern.</em>
660 * @return a reference to this
661 * @internal
662 */
663 UnicodeSet& applyPattern(const UnicodeString& pattern,
664 uint32_t options,
665 const SymbolTable* symbols,
666 UErrorCode& status);
667#endif /* U_HIDE_INTERNAL_API */
668
669 /**
670 * Parses the given pattern, starting at the given position. The
671 * character at pattern.charAt(pos.getIndex()) must be '[', or the
672 * parse fails. Parsing continues until the corresponding closing
673 * ']'. If a syntax error is encountered between the opening and
674 * closing bracket, the parse fails. Upon return from a successful
675 * parse, the ParsePosition is updated to point to the character
676 * following the closing ']', and a reference to this set is
677 * returned. This method calls
678 * itself recursively to parse embedded subpatterns.
679 *<em> Empties the set passed before applying the pattern.</em>
680 * A frozen set will not be modified.
681 *
682 * @param pattern the string containing the pattern to be parsed.
683 * The portion of the string from pos.getIndex(), which must be a
684 * '[', to the corresponding closing ']', is parsed.
685 * @param pos upon entry, the position at which to begin parsing.
686 * The character at pattern.charAt(pos.getIndex()) must be a '['.
687 * Upon return from a successful parse, pos.getIndex() is either
688 * the character after the closing ']' of the parsed pattern, or
689 * pattern.length() if the closing ']' is the last character of
690 * the pattern string.
691 * @param options bitmask for options to apply to the pattern.
692 * Valid options are USET_IGNORE_SPACE and
693 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
694 * These case options are mutually exclusive.
695 * @param symbols a symbol table mapping variable names to
696 * values and stand-ins to UnicodeSets; may be nullptr
697 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
698 * contains a syntax error.
699 * @return a reference to this
700 * @stable ICU 2.8
701 */
702 UnicodeSet& applyPattern(const UnicodeString& pattern,
703 ParsePosition& pos,
704 uint32_t options,
705 const SymbolTable* symbols,
706 UErrorCode& status);
707
708 /**
709 * Returns a string representation of this set. If the result of
710 * calling this function is passed to a UnicodeSet constructor, it
711 * will produce another set that is equal to this one.
712 * A frozen set will not be modified.
713 * @param result the string to receive the rules. Previous
714 * contents will be deleted.
715 * @param escapeUnprintable if true then convert unprintable
716 * characters to their hex escape representations, \\uxxxx or
717 * \\Uxxxxxxxx. Unprintable characters are those other than
718 * U+000A, U+0020..U+007E.
719 * @stable ICU 2.0
720 */
721 virtual UnicodeString& toPattern(UnicodeString& result,
722 UBool escapeUnprintable = false) const override;
723
724 /**
725 * Modifies this set to contain those code points which have the given value
726 * for the given binary or enumerated property, as returned by
727 * u_getIntPropertyValue. Prior contents of this set are lost.
728 * A frozen set will not be modified.
729 *
730 * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
731 * or UCHAR_INT_START..UCHAR_INT_LIMIT-1
732 * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
733 *
734 * @param value a value in the range u_getIntPropertyMinValue(prop)..
735 * u_getIntPropertyMaxValue(prop), with one exception. If prop is
736 * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
737 * rather a mask value produced by U_GET_GC_MASK(). This allows grouped
738 * categories such as [:L:] to be represented.
739 *
740 * @param ec error code input/output parameter
741 *
742 * @return a reference to this set
743 *
744 * @stable ICU 2.4
745 */
746 UnicodeSet& applyIntPropertyValue(UProperty prop,
747 int32_t value,
748 UErrorCode& ec);
749
750 /**
751 * Modifies this set to contain those code points which have the
752 * given value for the given property. Prior contents of this
753 * set are lost.
754 * A frozen set will not be modified.
755 *
756 * @param prop a property alias, either short or long. The name is matched
757 * loosely. See PropertyAliases.txt for names and a description of loose
758 * matching. If the value string is empty, then this string is interpreted
759 * as either a General_Category value alias, a Script value alias, a binary
760 * property alias, or a special ID. Special IDs are matched loosely and
761 * correspond to the following sets:
762 *
763 * "ANY" = [\\u0000-\\U0010FFFF],
764 * "ASCII" = [\\u0000-\\u007F],
765 * "Assigned" = [:^Cn:].
766 *
767 * @param value a value alias, either short or long. The name is matched
768 * loosely. See PropertyValueAliases.txt for names and a description of
769 * loose matching. In addition to aliases listed, numeric values and
770 * canonical combining classes may be expressed numerically, e.g., ("nv",
771 * "0.5") or ("ccc", "220"). The value string may also be empty.
772 *
773 * @param ec error code input/output parameter
774 *
775 * @return a reference to this set
776 *
777 * @stable ICU 2.4
778 */
779 UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
780 const UnicodeString& value,
781 UErrorCode& ec);
782
783 /**
784 * Returns the number of elements in this set (its cardinality).
785 * Note that the elements of a set may include both individual
786 * codepoints and strings.
787 *
788 * This is slower than getRangeCount() because
789 * it counts the code points of all ranges.
790 *
791 * @return the number of elements in this set (its cardinality).
792 * @stable ICU 2.0
793 * @see getRangeCount
794 */
795 virtual int32_t size(void) const;
796
797 /**
798 * Returns <tt>true</tt> if this set contains no elements.
799 *
800 * @return <tt>true</tt> if this set contains no elements.
801 * @stable ICU 2.0
802 */
803 virtual UBool isEmpty(void) const;
804
805 /**
806 * @return true if this set contains multi-character strings or the empty string.
807 * @stable ICU 70
808 */
809 UBool hasStrings() const;
810
811 /**
812 * Returns true if this set contains the given character.
813 * This function works faster with a frozen set.
814 * @param c character to be checked for containment
815 * @return true if the test condition is met
816 * @stable ICU 2.0
817 */
818 virtual UBool contains(UChar32 c) const override;
819
820 /**
821 * Returns true if this set contains every character
822 * of the given range.
823 * @param start first character, inclusive, of the range
824 * @param end last character, inclusive, of the range
825 * @return true if the test condition is met
826 * @stable ICU 2.0
827 */
828 virtual UBool contains(UChar32 start, UChar32 end) const;
829
830 /**
831 * Returns <tt>true</tt> if this set contains the given
832 * multicharacter string.
833 * @param s string to be checked for containment
834 * @return <tt>true</tt> if this set contains the specified string
835 * @stable ICU 2.4
836 */
837 UBool contains(const UnicodeString& s) const;
838
839 /**
840 * Returns true if this set contains all the characters and strings
841 * of the given set.
842 * @param c set to be checked for containment
843 * @return true if the test condition is met
844 * @stable ICU 2.4
845 */
846 virtual UBool containsAll(const UnicodeSet& c) const;
847
848 /**
849 * Returns true if this set contains all the characters
850 * of the given string.
851 * @param s string containing characters to be checked for containment
852 * @return true if the test condition is met
853 * @stable ICU 2.4
854 */
855 UBool containsAll(const UnicodeString& s) const;
856
857 /**
858 * Returns true if this set contains none of the characters
859 * of the given range.
860 * @param start first character, inclusive, of the range
861 * @param end last character, inclusive, of the range
862 * @return true if the test condition is met
863 * @stable ICU 2.4
864 */
865 UBool containsNone(UChar32 start, UChar32 end) const;
866
867 /**
868 * Returns true if this set contains none of the characters and strings
869 * of the given set.
870 * @param c set to be checked for containment
871 * @return true if the test condition is met
872 * @stable ICU 2.4
873 */
874 UBool containsNone(const UnicodeSet& c) const;
875
876 /**
877 * Returns true if this set contains none of the characters
878 * of the given string.
879 * @param s string containing characters to be checked for containment
880 * @return true if the test condition is met
881 * @stable ICU 2.4
882 */
883 UBool containsNone(const UnicodeString& s) const;
884
885 /**
886 * Returns true if this set contains one or more of the characters
887 * in the given range.
888 * @param start first character, inclusive, of the range
889 * @param end last character, inclusive, of the range
890 * @return true if the condition is met
891 * @stable ICU 2.4
892 */
893 inline UBool containsSome(UChar32 start, UChar32 end) const;
894
895 /**
896 * Returns true if this set contains one or more of the characters
897 * and strings of the given set.
898 * @param s The set to be checked for containment
899 * @return true if the condition is met
900 * @stable ICU 2.4
901 */
902 inline UBool containsSome(const UnicodeSet& s) const;
903
904 /**
905 * Returns true if this set contains one or more of the characters
906 * of the given string.
907 * @param s string containing characters to be checked for containment
908 * @return true if the condition is met
909 * @stable ICU 2.4
910 */
911 inline UBool containsSome(const UnicodeString& s) const;
912
913 /**
914 * Returns the length of the initial substring of the input string which
915 * consists only of characters and strings that are contained in this set
916 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
917 * or only of characters and strings that are not contained
918 * in this set (USET_SPAN_NOT_CONTAINED).
919 * See USetSpanCondition for details.
920 * Similar to the strspn() C library function.
921 * Unpaired surrogates are treated according to contains() of their surrogate code points.
922 * This function works faster with a frozen set and with a non-negative string length argument.
923 * @param s start of the string
     * @param length the length of the string; can be -1 for NUL-terminated
925 * @param spanCondition specifies the containment condition
926 * @return the length of the initial substring according to the spanCondition;
927 * 0 if the start of the string does not fit the spanCondition
928 * @stable ICU 3.8
929 * @see USetSpanCondition
930 */
931 int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
932
933 /**
934 * Returns the end of the substring of the input string according to the USetSpanCondition.
935 * Same as <code>start+span(s.getBuffer()+start, s.length()-start, spanCondition)</code>
936 * after pinning start to 0<=start<=s.length().
937 * @param s the string
938 * @param start the start index in the string for the span operation
939 * @param spanCondition specifies the containment condition
940 * @return the exclusive end of the substring according to the spanCondition;
941 * the substring s.tempSubStringBetween(start, end) fulfills the spanCondition
942 * @stable ICU 4.4
943 * @see USetSpanCondition
944 */
945 inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
946
947 /**
948 * Returns the start of the trailing substring of the input string which
949 * consists only of characters and strings that are contained in this set
950 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
951 * or only of characters and strings that are not contained
952 * in this set (USET_SPAN_NOT_CONTAINED).
953 * See USetSpanCondition for details.
954 * Unpaired surrogates are treated according to contains() of their surrogate code points.
955 * This function works faster with a frozen set and with a non-negative string length argument.
956 * @param s start of the string
     * @param length the length of the string; can be -1 for NUL-terminated
958 * @param spanCondition specifies the containment condition
959 * @return the start of the trailing substring according to the spanCondition;
960 * the string length if the end of the string does not fit the spanCondition
961 * @stable ICU 3.8
962 * @see USetSpanCondition
963 */
964 int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
965
966 /**
967 * Returns the start of the substring of the input string according to the USetSpanCondition.
968 * Same as <code>spanBack(s.getBuffer(), limit, spanCondition)</code>
     * after pinning limit to 0<=limit<=s.length().
970 * @param s the string
971 * @param limit the exclusive-end index in the string for the span operation
972 * (use s.length() or INT32_MAX for spanning back from the end of the string)
973 * @param spanCondition specifies the containment condition
974 * @return the start of the substring according to the spanCondition;
975 * the substring s.tempSubStringBetween(start, limit) fulfills the spanCondition
976 * @stable ICU 4.4
977 * @see USetSpanCondition
978 */
979 inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
980
981 /**
982 * Returns the length of the initial substring of the input string which
983 * consists only of characters and strings that are contained in this set
984 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
985 * or only of characters and strings that are not contained
986 * in this set (USET_SPAN_NOT_CONTAINED).
987 * See USetSpanCondition for details.
988 * Similar to the strspn() C library function.
989 * Malformed byte sequences are treated according to contains(0xfffd).
990 * This function works faster with a frozen set and with a non-negative string length argument.
991 * @param s start of the string (UTF-8)
     * @param length the length of the string; can be -1 for NUL-terminated
993 * @param spanCondition specifies the containment condition
994 * @return the length of the initial substring according to the spanCondition;
995 * 0 if the start of the string does not fit the spanCondition
996 * @stable ICU 3.8
997 * @see USetSpanCondition
998 */
999 int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
1000
1001 /**
1002 * Returns the start of the trailing substring of the input string which
1003 * consists only of characters and strings that are contained in this set
1004 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
1005 * or only of characters and strings that are not contained
1006 * in this set (USET_SPAN_NOT_CONTAINED).
1007 * See USetSpanCondition for details.
1008 * Malformed byte sequences are treated according to contains(0xfffd).
1009 * This function works faster with a frozen set and with a non-negative string length argument.
1010 * @param s start of the string (UTF-8)
     * @param length the length of the string; can be -1 for NUL-terminated
1012 * @param spanCondition specifies the containment condition
1013 * @return the start of the trailing substring according to the spanCondition;
1014 * the string length if the end of the string does not fit the spanCondition
1015 * @stable ICU 3.8
1016 * @see USetSpanCondition
1017 */
1018 int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
1019
1020 /**
1021 * Implement UnicodeMatcher::matches()
1022 * @stable ICU 2.4
1023 */
1024 virtual UMatchDegree matches(const Replaceable& text,
1025 int32_t& offset,
1026 int32_t limit,
1027 UBool incremental) override;
1028
1029private:
1030 /**
1031 * Returns the longest match for s in text at the given position.
1032 * If limit > start then match forward from start+1 to limit
1033 * matching all characters except s.charAt(0). If limit < start,
1034 * go backward starting from start-1 matching all characters
1035 * except s.charAt(s.length()-1). This method assumes that the
1036 * first character, text.charAt(start), matches s, so it does not
1037 * check it.
1038 * @param text the text to match
1039 * @param start the first character to match. In the forward
1040 * direction, text.charAt(start) is matched against s.charAt(0).
1041 * In the reverse direction, it is matched against
1042 * s.charAt(s.length()-1).
1043 * @param limit the limit offset for matching, either last+1 in
1044 * the forward direction, or last-1 in the reverse direction,
1045 * where last is the index of the last character to match.
     * @param s the string to match against text
1047 * @return If part of s matches up to the limit, return |limit -
1048 * start|. If all of s matches before reaching the limit, return
1049 * s.length(). If there is a mismatch between s and text, return
1050 * 0
1051 */
1052 static int32_t matchRest(const Replaceable& text,
1053 int32_t start, int32_t limit,
1054 const UnicodeString& s);
1055
1056 /**
1057 * Returns the smallest value i such that c < list[i]. Caller
1058 * must ensure that c is a legal value or this method will enter
1059 * an infinite loop. This method performs a binary search.
1060 * @param c a character in the range MIN_VALUE..MAX_VALUE
1061 * inclusive
1062 * @return the smallest integer i in the range 0..len-1,
1063 * inclusive, such that c < list[i]
1064 */
1065 int32_t findCodePoint(UChar32 c) const;
1066
1067public:
1068
1069 /**
1070 * Implementation of UnicodeMatcher API. Union the set of all
1071 * characters that may be matched by this object into the given
1072 * set.
1073 * @param toUnionTo the set into which to union the source characters
1074 * @stable ICU 2.4
1075 */
1076 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const override;
1077
1078 /**
1079 * Returns the index of the given character within this set, where
1080 * the set is ordered by ascending code point. If the character
1081 * is not in this set, return -1. The inverse of this method is
1082 * <code>charAt()</code>.
     * @param c the character (code point) whose index is requested
     * @return an index from 0..size()-1, or -1
1084 * @stable ICU 2.4
1085 */
1086 int32_t indexOf(UChar32 c) const;
1087
1088 /**
1089 * Returns the character at the given index within this set, where
1090 * the set is ordered by ascending code point. If the index is
1091 * out of range for characters, returns (UChar32)-1.
1092 * The inverse of this method is <code>indexOf()</code>.
1093 *
1094 * For iteration, this is slower than UnicodeSetIterator or
1095 * getRangeCount()/getRangeStart()/getRangeEnd(),
1096 * because for each call it skips linearly over <code>index</code>
1097 * characters in the ranges.
1098 *
1099 * @param index an index from 0..size()-1
1100 * @return the character at the given index, or (UChar32)-1.
1101 * @stable ICU 2.4
1102 */
1103 UChar32 charAt(int32_t index) const;
1104
1105 /**
1106 * Adds the specified range to this set if it is not already
1107 * present. If this set already contains the specified range,
1108 * the call leaves this set unchanged. If <code>start > end</code>
1109 * then an empty range is added, leaving the set unchanged.
1110 * This is equivalent to a boolean logic OR, or a set UNION.
1111 * A frozen set will not be modified.
1112 *
1113 * @param start first character, inclusive, of range to be added
1114 * to this set.
1115 * @param end last character, inclusive, of range to be added
1116 * to this set.
1117 * @stable ICU 2.0
1118 */
1119 virtual UnicodeSet& add(UChar32 start, UChar32 end);
1120
1121 /**
1122 * Adds the specified character to this set if it is not already
1123 * present. If this set already contains the specified character,
1124 * the call leaves this set unchanged.
1125 * A frozen set will not be modified.
1126 *
1127 * @param c the character (code point)
1128 * @return this object, for chaining
1129 * @stable ICU 2.0
1130 */
1131 UnicodeSet& add(UChar32 c);
1132
1133 /**
     * Adds the specified multicharacter string to this set if it is not already
     * present.  If this set already contains the multicharacter string,
1136 * the call leaves this set unchanged.
1137 * Thus "ch" => {"ch"}
1138 * A frozen set will not be modified.
1139 *
1140 * @param s the source string
1141 * @return this object, for chaining
1142 * @stable ICU 2.4
1143 */
1144 UnicodeSet& add(const UnicodeString& s);
1145
1146 private:
1147 /**
     * @return a code point IF the string consists of a single one;
     * otherwise returns -1.
1150 * @param s string to test
1151 */
1152 static int32_t getSingleCP(const UnicodeString& s);
1153
1154 void _add(const UnicodeString& s);
1155
1156 public:
1157 /**
1158 * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"}
1159 * If this set already contains any particular character, it has no effect on that character.
1160 * A frozen set will not be modified.
1161 * @param s the source string
1162 * @return this object, for chaining
1163 * @stable ICU 2.4
1164 */
1165 UnicodeSet& addAll(const UnicodeString& s);
1166
1167 /**
1168 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1169 * A frozen set will not be modified.
1170 * @param s the source string
1171 * @return this object, for chaining
1172 * @stable ICU 2.4
1173 */
1174 UnicodeSet& retainAll(const UnicodeString& s);
1175
1176 /**
1177 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1178 * A frozen set will not be modified.
1179 * @param s the source string
1180 * @return this object, for chaining
1181 * @stable ICU 2.4
1182 */
1183 UnicodeSet& complementAll(const UnicodeString& s);
1184
1185 /**
1186 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1187 * A frozen set will not be modified.
1188 * @param s the source string
1189 * @return this object, for chaining
1190 * @stable ICU 2.4
1191 */
1192 UnicodeSet& removeAll(const UnicodeString& s);
1193
1194 /**
1195 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1196 *
1197 * @param s the source string
1198 * @return a newly created set containing the given string.
1199 * The caller owns the return object and is responsible for deleting it.
1200 * @stable ICU 2.4
1201 */
1202 static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
1203
1204
1205 /**
1206 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1207 * @param s the source string
1208 * @return a newly created set containing the given characters
1209 * The caller owns the return object and is responsible for deleting it.
1210 * @stable ICU 2.4
1211 */
1212 static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
1213
1214 /**
1215 * Retain only the elements in this set that are contained in the
1216 * specified range. If <code>start > end</code> then an empty range is
1217 * retained, leaving the set empty. This is equivalent to
1218 * a boolean logic AND, or a set INTERSECTION.
1219 * A frozen set will not be modified.
1220 *
1221 * @param start first character, inclusive, of range
1222 * @param end last character, inclusive, of range
1223 * @stable ICU 2.0
1224 */
1225 virtual UnicodeSet& retain(UChar32 start, UChar32 end);
1226
1227
1228 /**
1229 * Retain the specified character from this set if it is present.
1230 * A frozen set will not be modified.
1231 *
1232 * @param c the character (code point)
1233 * @return this object, for chaining
1234 * @stable ICU 2.0
1235 */
1236 UnicodeSet& retain(UChar32 c);
1237
1238 /**
1239 * Retains only the specified string from this set if it is present.
1240 * Upon return this set will be empty if it did not contain s, or
1241 * will only contain s if it did contain s.
1242 * A frozen set will not be modified.
1243 *
1244 * @param s the source string
1245 * @return this object, for chaining
1246 * @stable ICU 69
1247 */
1248 UnicodeSet& retain(const UnicodeString &s);
1249
1250 /**
1251 * Removes the specified range from this set if it is present.
1252 * The set will not contain the specified range once the call
1253 * returns. If <code>start > end</code> then an empty range is
1254 * removed, leaving the set unchanged.
1255 * A frozen set will not be modified.
1256 *
1257 * @param start first character, inclusive, of range to be removed
1258 * from this set.
1259 * @param end last character, inclusive, of range to be removed
1260 * from this set.
1261 * @stable ICU 2.0
1262 */
1263 virtual UnicodeSet& remove(UChar32 start, UChar32 end);
1264
1265 /**
1266 * Removes the specified character from this set if it is present.
     * The set will not contain the specified character once the call
1268 * returns.
1269 * A frozen set will not be modified.
1270 *
1271 * @param c the character (code point)
1272 * @return this object, for chaining
1273 * @stable ICU 2.0
1274 */
1275 UnicodeSet& remove(UChar32 c);
1276
1277 /**
1278 * Removes the specified string from this set if it is present.
     * The set will not contain the specified string once the call
1280 * returns.
1281 * A frozen set will not be modified.
1282 * @param s the source string
1283 * @return this object, for chaining
1284 * @stable ICU 2.4
1285 */
1286 UnicodeSet& remove(const UnicodeString& s);
1287
1288 /**
1289 * This is equivalent to
1290 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1291 *
1292 * <strong>Note:</strong> This performs a symmetric difference with all code points
1293 * <em>and thus retains all multicharacter strings</em>.
1294 * In order to achieve a “code point complement” (all code points minus this set),
1295 * the easiest is to <code>.complement().removeAllStrings()</code>.
1296 *
1297 * A frozen set will not be modified.
1298 * @stable ICU 2.0
1299 */
1300 virtual UnicodeSet& complement();
1301
1302 /**
1303 * Complements the specified range in this set. Any character in
1304 * the range will be removed if it is in this set, or will be
1305 * added if it is not in this set. If <code>start > end</code>
1306 * then an empty range is complemented, leaving the set unchanged.
1307 * This is equivalent to a boolean logic XOR.
1308 * A frozen set will not be modified.
1309 *
1310 * @param start first character, inclusive, of range
1311 * @param end last character, inclusive, of range
1312 * @stable ICU 2.0
1313 */
1314 virtual UnicodeSet& complement(UChar32 start, UChar32 end);
1315
1316 /**
1317 * Complements the specified character in this set. The character
1318 * will be removed if it is in this set, or will be added if it is
1319 * not in this set.
1320 * A frozen set will not be modified.
1321 *
1322 * @param c the character (code point)
1323 * @return this object, for chaining
1324 * @stable ICU 2.0
1325 */
1326 UnicodeSet& complement(UChar32 c);
1327
1328 /**
1329 * Complement the specified string in this set.
1330 * The string will be removed if it is in this set, or will be added if it is not in this set.
1331 * A frozen set will not be modified.
1332 *
1333 * @param s the string to complement
1334 * @return this object, for chaining
1335 * @stable ICU 2.4
1336 */
1337 UnicodeSet& complement(const UnicodeString& s);
1338
1339 /**
1340 * Adds all of the elements in the specified set to this set if
1341 * they're not already present. This operation effectively
1342 * modifies this set so that its value is the <i>union</i> of the two
1343 * sets. The behavior of this operation is unspecified if the specified
1344 * collection is modified while the operation is in progress.
1345 * A frozen set will not be modified.
1346 *
1347 * @param c set whose elements are to be added to this set.
1348 * @see #add(UChar32, UChar32)
1349 * @stable ICU 2.0
1350 */
1351 virtual UnicodeSet& addAll(const UnicodeSet& c);
1352
1353 /**
1354 * Retains only the elements in this set that are contained in the
1355 * specified set. In other words, removes from this set all of
1356 * its elements that are not contained in the specified set. This
1357 * operation effectively modifies this set so that its value is
1358 * the <i>intersection</i> of the two sets.
1359 * A frozen set will not be modified.
1360 *
1361 * @param c set that defines which elements this set will retain.
1362 * @stable ICU 2.0
1363 */
1364 virtual UnicodeSet& retainAll(const UnicodeSet& c);
1365
1366 /**
1367 * Removes from this set all of its elements that are contained in the
1368 * specified set. This operation effectively modifies this
1369 * set so that its value is the <i>asymmetric set difference</i> of
1370 * the two sets.
1371 * A frozen set will not be modified.
1372 *
1373 * @param c set that defines which elements will be removed from
1374 * this set.
1375 * @stable ICU 2.0
1376 */
1377 virtual UnicodeSet& removeAll(const UnicodeSet& c);
1378
1379 /**
1380 * Complements in this set all elements contained in the specified
1381 * set. Any character in the other set will be removed if it is
1382 * in this set, or will be added if it is not in this set.
1383 * A frozen set will not be modified.
1384 *
     * @param c set that defines which elements will be xor'ed with
1386 * this set.
1387 * @stable ICU 2.4
1388 */
1389 virtual UnicodeSet& complementAll(const UnicodeSet& c);
1390
1391 /**
1392 * Removes all of the elements from this set. This set will be
1393 * empty after this call returns.
1394 * A frozen set will not be modified.
1395 * @stable ICU 2.0
1396 */
1397 virtual UnicodeSet& clear(void);
1398
1399 /**
1400 * Close this set over the given attribute. For the attribute
1401 * USET_CASE_INSENSITIVE, the result is to modify this set so that:
1402 *
1403 * 1. For each character or string 'a' in this set, all strings or
1404 * characters 'b' such that foldCase(a) == foldCase(b) are added
1405 * to this set.
1406 *
1407 * 2. For each string 'e' in the resulting set, if e !=
1408 * foldCase(e), 'e' will be removed.
1409 *
1410 * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
1411 *
1412 * (Here foldCase(x) refers to the operation u_strFoldCase, and a
1413 * == b denotes that the contents are the same, not pointer
1414 * comparison.)
1415 *
1416 * A frozen set will not be modified.
1417 *
1418 * @param attribute bitmask for attributes to close over.
1419 * Valid options:
1420 * At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
1421 * These case options are mutually exclusive.
1422 * Unrelated options bits are ignored.
1423 * @return a reference to this set.
1424 * @stable ICU 4.2
1425 */
1426 UnicodeSet& closeOver(int32_t attribute);
1427
1428 /**
1429 * Remove all strings from this set.
1430 *
1431 * @return a reference to this set.
1432 * @stable ICU 4.2
1433 */
1434 virtual UnicodeSet &removeAllStrings();
1435
1436 /**
1437 * Iteration method that returns the number of ranges contained in
1438 * this set.
1439 * @see #getRangeStart
1440 * @see #getRangeEnd
1441 * @stable ICU 2.4
1442 */
1443 virtual int32_t getRangeCount(void) const;
1444
1445 /**
1446 * Iteration method that returns the first character in the
1447 * specified range of this set.
1448 * @see #getRangeCount
1449 * @see #getRangeEnd
1450 * @stable ICU 2.4
1451 */
1452 virtual UChar32 getRangeStart(int32_t index) const;
1453
1454 /**
1455 * Iteration method that returns the last character in the
1456 * specified range of this set.
1457 * @see #getRangeStart
     * @see #getRangeCount
1459 * @stable ICU 2.4
1460 */
1461 virtual UChar32 getRangeEnd(int32_t index) const;
1462
1463 /**
1464 * Serializes this set into an array of 16-bit integers. Serialization
1465 * (currently) only records the characters in the set; multicharacter
1466 * strings are ignored.
1467 *
1468 * The array has following format (each line is one 16-bit
1469 * integer):
1470 *
1471 * length = (n+2*m) | (m!=0?0x8000:0)
1472 * bmpLength = n; present if m!=0
1473 * bmp[0]
1474 * bmp[1]
1475 * ...
1476 * bmp[n-1]
1477 * supp-high[0]
1478 * supp-low[0]
1479 * supp-high[1]
1480 * supp-low[1]
1481 * ...
1482 * supp-high[m-1]
1483 * supp-low[m-1]
1484 *
1485 * The array starts with a header. After the header are n bmp
1486 * code points, then m supplementary code points. Either n or m
1487 * or both may be zero. n+2*m is always <= 0x7FFF.
1488 *
1489 * If there are no supplementary characters (if m==0) then the
1490 * header is one 16-bit integer, 'length', with value n.
1491 *
1492 * If there are supplementary characters (if m!=0) then the header
1493 * is two 16-bit integers. The first, 'length', has value
1494 * (n+2*m)|0x8000. The second, 'bmpLength', has value n.
1495 *
1496 * After the header the code points are stored in ascending order.
1497 * Supplementary code points are stored as most significant 16
1498 * bits followed by least significant 16 bits.
1499 *
1500 * @param dest pointer to buffer of destCapacity 16-bit integers.
1501 * May be nullptr only if destCapacity is zero.
1502 * @param destCapacity size of dest, or zero. Must not be negative.
1503 * @param ec error code. Will be set to U_INDEX_OUTOFBOUNDS_ERROR
1504 * if n+2*m > 0x7FFF. Will be set to U_BUFFER_OVERFLOW_ERROR if
1505 * n+2*m+(m!=0?2:1) > destCapacity.
1506 * @return the total length of the serialized format, including
1507 * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
1508 * than U_BUFFER_OVERFLOW_ERROR.
1509 * @stable ICU 2.4
1510 */
1511 int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
1512
1513 /**
     * Reallocate this object's internal structures to take up the least
1515 * possible space, without changing this object's value.
1516 * A frozen set will not be modified.
1517 * @stable ICU 2.4
1518 */
1519 virtual UnicodeSet& compact();
1520
1521 /**
1522 * Return the class ID for this class. This is useful only for
1523 * comparing to a return value from getDynamicClassID(). For example:
1524 * <pre>
1525 * . Base* polymorphic_pointer = createPolymorphicObject();
1526 * . if (polymorphic_pointer->getDynamicClassID() ==
1527 * . Derived::getStaticClassID()) ...
1528 * </pre>
1529 * @return The class ID for all objects of this class.
1530 * @stable ICU 2.0
1531 */
1532 static UClassID U_EXPORT2 getStaticClassID(void);
1533
1534 /**
1535 * Implement UnicodeFunctor API.
1536 *
1537 * @return The class ID for this object. All objects of a given
1538 * class have the same class ID. Objects of other classes have
1539 * different class IDs.
1540 * @stable ICU 2.4
1541 */
1542 virtual UClassID getDynamicClassID(void) const override;
1543
1544private:
1545
1546 // Private API for the USet API
1547
1548 friend class USetAccess;
1549
1550 const UnicodeString* getString(int32_t index) const;
1551
1552 //----------------------------------------------------------------
1553 // RuleBasedTransliterator support
1554 //----------------------------------------------------------------
1555
1556private:
1557
1558 /**
1559 * Returns <tt>true</tt> if this set contains any character whose low byte
1560 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
1561 * indexing.
1562 */
1563 virtual UBool matchesIndexValue(uint8_t v) const override;
1564
1565private:
1566 friend class RBBIRuleScanner;
1567
1568 //----------------------------------------------------------------
1569 // Implementation: Clone as thawed (see ICU4J Freezable)
1570 //----------------------------------------------------------------
1571
1572 UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
1573 UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed);
1574
1575 //----------------------------------------------------------------
1576 // Implementation: Pattern parsing
1577 //----------------------------------------------------------------
1578
1579 void applyPatternIgnoreSpace(const UnicodeString& pattern,
1580 ParsePosition& pos,
1581 const SymbolTable* symbols,
1582 UErrorCode& status);
1583
1584 void applyPattern(RuleCharacterIterator& chars,
1585 const SymbolTable* symbols,
1586 UnicodeString& rebuiltPat,
1587 uint32_t options,
1588 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
1589 int32_t depth,
1590 UErrorCode& ec);
1591
1592 void closeOverCaseInsensitive(bool simple);
1593 void closeOverAddCaseMappings();
1594
1595 //----------------------------------------------------------------
1596 // Implementation: Utility methods
1597 //----------------------------------------------------------------
1598
1599 static int32_t nextCapacity(int32_t minCapacity);
1600
1601 bool ensureCapacity(int32_t newLen);
1602
1603 bool ensureBufferCapacity(int32_t newLen);
1604
1605 void swapBuffers(void);
1606
1607 UBool allocateStrings(UErrorCode &status);
1608 int32_t stringsSize() const;
1609 UBool stringsContains(const UnicodeString &s) const;
1610
1611 UnicodeString& _toPattern(UnicodeString& result,
1612 UBool escapeUnprintable) const;
1613
1614 UnicodeString& _generatePattern(UnicodeString& result,
1615 UBool escapeUnprintable) const;
1616
1617 static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1618
1619 static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1620
1621 static void _appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
1622 UBool escapeUnprintable);
1623
1624 //----------------------------------------------------------------
1625 // Implementation: Fundamental operators
1626 //----------------------------------------------------------------
1627
1628 void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1629
1630 void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1631
1632 void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1633
1634 /**
     * Return true if the given position, in the given pattern, appears
     * to be the start of a property set pattern [:foo:], \\p{foo},
     * \\P{foo}, or \\N{name}.
1638 */
1639 static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1640 int32_t pos);
1641
1642 static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1643 int32_t iterOpts);
1644
1645 /**
1646 * Parse the given property pattern at the given parse position
1647 * and set this UnicodeSet to the result.
1648 *
1649 * The original design document is out of date, but still useful.
1650 * Ignore the property and value names:
1651 * https://htmlpreview.github.io/?https://github.com/unicode-org/icu-docs/blob/main/design/unicodeset_properties.html
1652 *
1653 * Recognized syntax:
1654 *
1655 * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
1656 * \\p{foo} \\P{foo} - white space not allowed within "\\p" or "\\P"
1657 * \\N{name} - white space not allowed within "\\N"
1658 *
1659 * Other than the above restrictions, Unicode Pattern_White_Space characters are ignored.
1660 * Case is ignored except in "\\p" and "\\P" and "\\N". In 'name' leading
1661 * and trailing space is deleted, and internal runs of whitespace
1662 * are collapsed to a single space.
1663 *
1664 * We support binary properties, enumerated properties, and the
1665 * following non-enumerated properties:
1666 *
1667 * Numeric_Value
1668 * Name
1669 * Unicode_1_Name
1670 *
1671 * @param pattern the pattern string
1672 * @param ppos on entry, the position at which to begin parsing.
1673 * This should be one of the locations marked '^':
1674 *
     *               [:blah:]     \\p{blah}     \\P{blah}     \\N{name}
     *               ^       %    ^       %     ^       %     ^       %
1677 *
1678 * On return, the position after the last character parsed, that is,
1679 * the locations marked '%'. If the parse fails, ppos is returned
1680 * unchanged.
1681 * @param ec status
1682 * @return a reference to this.
1683 */
1684 UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1685 ParsePosition& ppos,
1686 UErrorCode &ec);
1687
1688 void applyPropertyPattern(RuleCharacterIterator& chars,
1689 UnicodeString& rebuiltPat,
1690 UErrorCode& ec);
1691
1692 /**
1693 * A filter that returns true if the given code point should be
1694 * included in the UnicodeSet being constructed.
1695 */
1696 typedef UBool (*Filter)(UChar32 codePoint, void* context);
1697
1698 /**
1699 * Given a filter, set this UnicodeSet to the code points
1700 * contained by that filter. The filter MUST be
1701 * property-conformant. That is, if it returns value v for one
1702 * code point, then it must return v for all affiliated code
1703 * points, as defined by the inclusions list. See
1704 * getInclusions().
1705 * src is a UPropertySource value.
1706 */
1707 void applyFilter(Filter filter,
1708 void* context,
1709 const UnicodeSet* inclusions,
1710 UErrorCode &status);
1711
1712 /**
1713 * Set the new pattern to cache.
1714 */
1715 void setPattern(const UnicodeString& newPat) {
1716 setPattern(newPat.getBuffer(), newPat.length());
1717 }
/**
 * Sets the new pattern to cache, given as a character buffer plus length.
 *
 * @param newPat    pointer to the pattern characters
 * @param newPatLen number of char16_t units in newPat
 */
void setPattern(const char16_t *newPat,
                int32_t newPatLen);
/**
 * Releases the pattern that is currently cached by this set.
 */
void releasePattern();
1723
1724 friend class UnicodeSetIterator;
1725};
1726
1727
1728
1729inline bool UnicodeSet::operator!=(const UnicodeSet& o) const {
1730 return !operator==(o);
1731}
1732
1733inline UBool UnicodeSet::isFrozen() const {
1734 return (UBool)(bmpSet!=nullptr || stringSpan!=nullptr);
1735}
1736
1737inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
1738 return !containsNone(start, end);
1739}
1740
1741inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
1742 return !containsNone(s);
1743}
1744
1745inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
1746 return !containsNone(s);
1747}
1748
1749inline UBool UnicodeSet::isBogus() const {
1750 return (UBool)(fFlags & kIsBogus);
1751}
1752
1753inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) {
1754 return reinterpret_cast<UnicodeSet *>(uset);
1755}
1756
1757inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
1758 return reinterpret_cast<const UnicodeSet *>(uset);
1759}
1760
1761inline USet *UnicodeSet::toUSet() {
1762 return reinterpret_cast<USet *>(this);
1763}
1764
1765inline const USet *UnicodeSet::toUSet() const {
1766 return reinterpret_cast<const USet *>(this);
1767}
1768
1769inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
1770 int32_t sLength=s.length();
1771 if(start<0) {
1772 start=0;
1773 } else if(start>sLength) {
1774 start=sLength;
1775 }
1776 return start+span(s.getBuffer()+start, sLength-start, spanCondition);
1777}
1778
1779inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
1780 int32_t sLength=s.length();
1781 if(limit<0) {
1782 limit=0;
1783 } else if(limit>sLength) {
1784 limit=sLength;
1785 }
1786 return spanBack(s.getBuffer(), limit, spanCondition);
1787}
1788
1789U_NAMESPACE_END
1790
1791#endif /* U_SHOW_CPLUSPLUS_API */
1792
1793#endif
1794