uniset.h source code [Godot/thirdparty/icu4c/common/unicode/uniset.h]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	***************************************************************************
5	* Copyright (C) 1999-2016, International Business Machines Corporation
6	* and others. All Rights Reserved.
7	***************************************************************************
8	* Date Name Description
9	* 10/20/99 alan Creation.
10	***************************************************************************
11	*/
12
13	#ifndef UNICODESET_H
14	#define UNICODESET_H
15
16	#include "unicode/utypes.h"
17
18	#if U_SHOW_CPLUSPLUS_API
19
20	#include "unicode/ucpmap.h"
21	#include "unicode/unifilt.h"
22	#include "unicode/unistr.h"
23	#include "unicode/uset.h"
24
25	/**
26	* \file
27	* \brief C++ API: Unicode Set
28	*/
29
30	U_NAMESPACE_BEGIN
31
32	// Forward declarations: classes this header refers to by name only, so their full definitions are not included here.
33	class BMPSet;
34	class ParsePosition;
35	class RBBIRuleScanner;
36	class SymbolTable;
37	class UnicodeSetStringSpan;
38	class UVector;
39	class RuleCharacterIterator;
40
41	/**
42	* A mutable set of Unicode characters and multicharacter strings. Objects of this class
43	* represent <em>character classes</em> used in regular expressions.
44	* A character class specifies a subset of Unicode code points. Legal
45	* code points are U+0000 to U+10FFFF, inclusive.
46	*
47	* <p>The UnicodeSet class is not designed to be subclassed.
48	*
49	* <p><code>UnicodeSet</code> supports two APIs. The first is the
50	* <em>operand</em> API that allows the caller to modify the value of
51	* a <code>UnicodeSet</code> object. It conforms to Java 2's
52	* <code>java.util.Set</code> interface, although
53	* <code>UnicodeSet</code> does not actually implement that
54	* interface. All methods of <code>Set</code> are supported, with the
55	* modification that they take a character range or single character
56	* instead of an <code>Object</code>, and they take a
57	* <code>UnicodeSet</code> instead of a <code>Collection</code>. The
58	* operand API may be thought of in terms of boolean logic: a boolean
59	* OR is implemented by <code>add</code>, a boolean AND is implemented
60	* by <code>retain</code>, a boolean XOR is implemented by
61	* <code>complement</code> taking an argument, and a boolean NOT is
62	* implemented by <code>complement</code> with no argument. In terms
63	* of traditional set theory function names, <code>add</code> is a
64	* union, <code>retain</code> is an intersection, <code>remove</code>
65	* is an asymmetric difference, and <code>complement</code> with no
66	* argument is a set complement with respect to the superset range
67	* <code>MIN_VALUE-MAX_VALUE</code>.
68	*
69	* <p>The second API is the
70	* <code>applyPattern()</code>/<code>toPattern()</code> API from the
71	* <code>java.text.Format</code>-derived classes. Unlike the
72	* methods that add characters, add categories, and control the logic
73	* of the set, the method <code>applyPattern()</code> sets all
74	* attributes of a <code>UnicodeSet</code> at once, based on a
75	* string pattern.
76	*
77	* <p><b>Pattern syntax</b></p>
78	*
79	* Patterns are accepted by the constructors and the
80	* <code>applyPattern()</code> methods and returned by the
81	* <code>toPattern()</code> method. These patterns follow a syntax
82	* similar to that employed by version 8 regular expression character
83	* classes. Here are some simple examples:
84	*
85	* \htmlonly<blockquote>\endhtmlonly
86	* <table>
87	* <tr align="top">
88	* <td nowrap valign="top" align="left"><code>[]</code></td>
89	* <td valign="top">No characters</td>
90	* </tr><tr align="top">
91	* <td nowrap valign="top" align="left"><code>[a]</code></td>
92	* <td valign="top">The character 'a'</td>
93	* </tr><tr align="top">
94	* <td nowrap valign="top" align="left"><code>[ae]</code></td>
95	* <td valign="top">The characters 'a' and 'e'</td>
96	* </tr>
97	* <tr>
98	* <td nowrap valign="top" align="left"><code>[a-e]</code></td>
99	* <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
100	* point order</td>
101	* </tr>
102	* <tr>
103	* <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
104	* <td valign="top">The character U+4E01</td>
105	* </tr>
106	* <tr>
107	* <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
108	* <td valign="top">The character 'a' and the multicharacter strings "ab" and
109	* "ac"</td>
110	* </tr>
111	* <tr>
112	* <td nowrap valign="top" align="left"><code>[\\p{Lu}]</code></td>
113	* <td valign="top">All characters in the general category Uppercase Letter</td>
114	* </tr>
115	* </table>
116	* \htmlonly</blockquote>\endhtmlonly
117	*
118	* Any character may be preceded by a backslash in order to remove any special
119	* meaning. White space characters, as defined by UCharacter.isWhitespace(), are
120	* ignored, unless they are escaped.
121	*
122	* <p>Property patterns specify a set of characters having a certain
123	* property as defined by the Unicode standard. Both the POSIX-like
124	* "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized. For a
125	* complete list of supported property patterns, see the User's Guide
126	* for UnicodeSet at
127	* <a href="https://unicode-org.github.io/icu/userguide/strings/unicodeset">
128	* https://unicode-org.github.io/icu/userguide/strings/unicodeset</a>.
129	* Actual determination of property data is defined by the underlying
130	* Unicode database as implemented by UCharacter.
131	*
132	* <p>Patterns specify individual characters, ranges of characters, and
133	* Unicode property sets. When elements are concatenated, they
134	* specify their union. To complement a set, place a '^' immediately
135	* after the opening '['. Property patterns are inverted by modifying
136	* their delimiters; "[:^foo:]" and "\\P{foo}". In any other location,
137	* '^' has no special meaning.
138	*
139	* <p>Since ICU 70, "[^...]", "[:^foo:]", "\\P{foo}", and "[:binaryProperty=No:]"
140	* perform a “code point complement” (all code points minus the original set),
141	* removing all multicharacter strings,
142	* equivalent to <code>.complement().removeAllStrings()</code>.
143	* The complement() API function continues to perform a
144	* symmetric difference with all code points and thus retains all multicharacter strings.
145	*
146	* <p>Ranges are indicated by placing a '-' between two
147	* characters, as in "a-z". This specifies the range of all
148	* characters from the left to the right, in Unicode order. If the
149	* left character is greater than or equal to the
150	* right character it is a syntax error. If a '-' occurs as the first
151	* character after the opening '[' or '[^', or if it occurs as the
152	* last character before the closing ']', then it is taken as a
153	* literal. Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same
154	* set of three characters, 'a', 'b', and '-'.
155	*
156	* <p>Sets may be intersected using the '&' operator or the asymmetric
157	* set difference may be taken using the '-' operator, for example,
158	* "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
159	* with values less than 4096. Operators ('&' and '-') have equal
160	* precedence and bind left-to-right. Thus
161	* "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
162	* "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for
163	* difference; intersection is commutative.
164	*
165	* <table>
166	* <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
167	* <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
168	* through 'z' and all letters in between, in Unicode order
169	* <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
170	* all characters but 'a' through 'z',
171	* that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
172	* <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
173	* <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
174	* <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
175	* <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
176	* <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
177	* <td>The asymmetric difference of sets specified by <em>pat1</em> and
178	* <em>pat2</em>
179	* <tr valign=top><td nowrap><code>[:Lu:] or \\p{Lu}</code>
180	* <td>The set of characters having the specified
181	* Unicode property; in
182	* this case, Unicode uppercase letters
183	* <tr valign=top><td nowrap><code>[:^Lu:] or \\P{Lu}</code>
184	* <td>The set of characters <em>not</em> having the given
185	* Unicode property
186	* </table>
187	*
188	* <p><b>Formal syntax</b></p>
189	*
190	* \htmlonly<blockquote>\endhtmlonly
191	* <table>
192	* <tr align="top">
193	* <td nowrap valign="top" align="right"><code>pattern :=  </code></td>
194	* <td valign="top"><code>('[' '^'? item* ']') \|
195	* property</code></td>
196	* </tr>
197	* <tr align="top">
198	* <td nowrap valign="top" align="right"><code>item :=  </code></td>
199	* <td valign="top"><code>char \| (char '-' char) \| pattern-expr<br>
200	* </code></td>
201	* </tr>
202	* <tr align="top">
203	* <td nowrap valign="top" align="right"><code>pattern-expr :=  </code></td>
204	* <td valign="top"><code>pattern \| pattern-expr pattern \|
205	* pattern-expr op pattern<br>
206	* </code></td>
207	* </tr>
208	* <tr align="top">
209	* <td nowrap valign="top" align="right"><code>op :=  </code></td>
210	* <td valign="top"><code>'&' \| '-'<br>
211	* </code></td>
212	* </tr>
213	* <tr align="top">
214	* <td nowrap valign="top" align="right"><code>special :=  </code></td>
215	* <td valign="top"><code>'[' \| ']' \| '-'<br>
216	* </code></td>
217	* </tr>
218	* <tr align="top">
219	* <td nowrap valign="top" align="right"><code>char :=  </code></td>
220	* <td valign="top"><em>any character that is not</em><code> special<br>
221	* \| ('\' </code><em>any character</em><code>)<br>
222	* \| ('\\u' hex hex hex hex)<br>
223	* </code></td>
224	* </tr>
225	* <tr align="top">
226	* <td nowrap valign="top" align="right"><code>hex :=  </code></td>
227	* <td valign="top"><code>'0' \| '1' \| '2' \| '3' \| '4' \| '5' \| '6' \| '7' \| '8' \| '9' \|<br>
228	*     'A' \| 'B' \| 'C' \| 'D' \| 'E' \| 'F' \| 'a' \| 'b' \| 'c' \| 'd' \| 'e' \| 'f'</code></td>
229	* </tr>
230	* <tr>
231	* <td nowrap valign="top" align="right"><code>property :=  </code></td>
232	* <td valign="top"><em>a Unicode property set pattern</em></td>
233	* </tr>
234	* </table>
235	* <br>
236	* <table border="1">
237	* <tr>
238	* <td>Legend: <table>
239	* <tr>
240	* <td nowrap valign="top"><code>a := b</code></td>
241	* <td width="20" valign="top">  </td>
242	* <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
243	* </tr>
244	* <tr>
245	* <td nowrap valign="top"><code>a?</code></td>
246	* <td valign="top"></td>
247	* <td valign="top">zero or one instance of <code>a</code><br>
248	* </td>
249	* </tr>
250	* <tr>
251	* <td nowrap valign="top"><code>a*</code></td>
252	* <td valign="top"></td>
253	* <td valign="top">zero or more instances of <code>a</code><br>
254	* </td>
255	* </tr>
256	* <tr>
257	* <td nowrap valign="top"><code>a \| b</code></td>
258	* <td valign="top"></td>
259	* <td valign="top">either <code>a</code> or <code>b</code><br>
260	* </td>
261	* </tr>
262	* <tr>
263	* <td nowrap valign="top"><code>'a'</code></td>
264	* <td valign="top"></td>
265	* <td valign="top">the literal string between the quotes </td>
266	* </tr>
267	* </table>
268	* </td>
269	* </tr>
270	* </table>
271	* \htmlonly</blockquote>\endhtmlonly
272	*
273	* <p>Note:
274	* - Most UnicodeSet methods do not take a UErrorCode parameter because
275	* there are usually very few opportunities for failure other than a shortage
276	* of memory, error codes in low-level C++ string methods would be inconvenient,
277	* and the error code as the last parameter (ICU convention) would prevent
278	* the use of default parameter values.
279	* Instead, such methods set the UnicodeSet into a "bogus" state
280	* (see isBogus()) if an error occurs.
281	*
282	* @author Alan Liu
283	* @stable ICU 2.0
284	*/
285	class U_COMMON_API UnicodeSet final : public UnicodeFilter {
286	private:
287	/**
288	* Enough for sets with few ranges.
289	* For example, White_Space has 10 ranges, list length 21.
290	*/
291	static constexpr int32_t INITIAL_CAPACITY = `25`;
292	// fFlags constant
293	static constexpr uint8_t kIsBogus = `1`; // This set is bogus (i.e. not valid)
294
295	UChar32* list = stackList; // MUST be terminated with HIGH
296	int32_t capacity = INITIAL_CAPACITY; // capacity of list
297	int32_t len = `1`; // length of list used; 1 <= len <= capacity
298	uint8_t fFlags = `0`; // Bit flag (see constants above)
299
300	BMPSet bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not nullptr.*
301	UChar32* buffer = nullptr; // internal buffer, may be nullptr
302	int32_t bufferCapacity = `0`; // capacity of buffer
303
304	/**
305	* The pattern representation of this set. This may not be the
306	* most economical pattern. It is the pattern supplied to
307	* applyPattern(), with variables substituted and whitespace
308	* removed. For sets constructed without applyPattern(), or
309	* modified using the non-pattern API, this string will be empty,
310	* indicating that toPattern() must generate a pattern
311	* representation from the inversion list.
312	*/
313	char16_t pat = nullptr*;
314	int32_t patLen = `0`;
315
316	UVector* strings = nullptr; // maintained in sorted order
317	UnicodeSetStringSpan stringSpan = nullptr*;
318
319	/**
320	* Initial list array.
321	* Avoids some heap allocations, and list is never nullptr.
322	* Increases the object size a bit.
323	*/
324	UChar32 stackList[INITIAL_CAPACITY];
325
326	public:
327	/**
328	* Determine if this object contains a valid set.
329	* A bogus set has no value. It is different from an empty set.
330	* It can be used to indicate that no set value is available.
331	*
332	* @return true if the set is bogus/invalid, false otherwise
333	* @see setToBogus()
334	* @stable ICU 4.0
335	*/
336	inline UBool isBogus(void) const;
337
338	/**
339	* Make this UnicodeSet object invalid.
340	* The set will test true with isBogus().
341	*
342	* A bogus set has no value. It is different from an empty set.
343	* It can be used to indicate that no set value is available.
344	*
345	* This utility function is used throughout the UnicodeSet
346	* implementation to indicate that a UnicodeSet operation failed,
347	* and may be used in other functions,
348	* especially but not exclusively when such functions do not
349	* take a UErrorCode for simplicity.
350	*
351	* @see isBogus()
352	* @stable ICU 4.0
353	*/
354	void setToBogus();
355
356	public:
357
358	enum {
359	/**
360	* Minimum value that can be stored in a UnicodeSet.
361	* @stable ICU 2.4
362	*/
363	MIN_VALUE = `0`,
364
365	/**
366	* Maximum value that can be stored in a UnicodeSet.
367	* @stable ICU 2.4
368	*/
369	MAX_VALUE = `0x10ffff`
370	};
371
372	//----------------------------------------------------------------
373	// Constructors &c
374	//----------------------------------------------------------------
375
376	public:
377
378	/**
379	* Constructs an empty set.
380	* @stable ICU 2.0
381	*/
382	UnicodeSet();
383
384	/**
385	* Constructs a set containing the given range. If <code>end <
386	* start</code> then an empty set is created.
387	*
388	* @param start first character, inclusive, of range
389	* @param end last character, inclusive, of range
390	* @stable ICU 2.4
391	*/
392	UnicodeSet(UChar32 start, UChar32 end);
393
394	#ifndef U_HIDE_INTERNAL_API
395	/**
396	* @internal
397	*/
398	enum ESerialization {
399	kSerialized / result of serialize() /
400	};
401
402	/**
403	* Constructs a set from the output of serialize().
404	*
405	* @param buffer the 16 bit array
406	* @param bufferLen the original length returned from serialize()
407	* @param serialization the value 'kSerialized'
408	* @param status error code
409	*
410	* @internal
411	*/
412	UnicodeSet(const uint16_t buffer[], int32_t bufferLen,
413	ESerialization serialization, UErrorCode &status);
414	#endif /* U_HIDE_INTERNAL_API */
415
416	/**
417	* Constructs a set from the given pattern. See the class
418	* description for the syntax of the pattern language.
419	* @param pattern a string specifying what characters are in the set
420	* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
421	* contains a syntax error.
422	* @stable ICU 2.0
423	*/
424	UnicodeSet(const UnicodeString& pattern,
425	UErrorCode& status);
426
427	#ifndef U_HIDE_INTERNAL_API
428	/**
429	* Constructs a set from the given pattern. See the class
430	* description for the syntax of the pattern language.
431	* @param pattern a string specifying what characters are in the set
432	* @param options bitmask for options to apply to the pattern.
433	* Valid options are USET_IGNORE_SPACE and
434	* at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
435	* These case options are mutually exclusive.
436	* @param symbols a symbol table mapping variable names to values
437	* and stand-in characters to UnicodeSets; may be nullptr
438	* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
439	* contains a syntax error.
440	* @internal
441	*/
442	UnicodeSet(const UnicodeString& pattern,
443	uint32_t options,
444	const SymbolTable* symbols,
445	UErrorCode& status);
446	#endif /* U_HIDE_INTERNAL_API */
447
448	/**
449	* Constructs a set from the given pattern. See the class description
450	* for the syntax of the pattern language.
451	* @param pattern a string specifying what characters are in the set
452	* @param pos on input, the position in pattern at which to start parsing.
453	* On output, the position after the last character parsed.
454	* @param options bitmask for options to apply to the pattern.
455	* Valid options are USET_IGNORE_SPACE and
456	* at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
457	* These case options are mutually exclusive.
458	* @param symbols a symbol table mapping variable names to values
459	* and stand-in characters to UnicodeSets; may be nullptr
460	* @param status input-output error code
461	* @stable ICU 2.8
462	*/
463	UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
464	uint32_t options,
465	const SymbolTable* symbols,
466	UErrorCode& status);
467
468	/**
469	* Constructs a set that is identical to the given UnicodeSet.
470	* @stable ICU 2.0
471	*/
472	UnicodeSet(const UnicodeSet& o);
473
474	/**
475	* Destructs the set.
476	* @stable ICU 2.0
477	*/
478	virtual ~UnicodeSet();
479
480	/**
481	* Assigns this object to be a copy of another.
482	* A frozen set will not be modified.
483	* @stable ICU 2.0
484	*/
485	UnicodeSet& operator=(const UnicodeSet& o);
486
487	/**
488	* Compares the specified object with this set for equality. Returns
489	* <tt>true</tt> if the two sets
490	* have the same size, and every member of the specified set is
491	* contained in this set (or equivalently, every member of this set is
492	* contained in the specified set).
493	*
494	* @param o set to be compared for equality with this set.
495	* @return <tt>true</tt> if the specified set is equal to this set.
496	* @stable ICU 2.0
497	*/
498	virtual bool operator==(const UnicodeSet& o) const;
499
500	/**
501	* Compares the specified object with this set for equality. Returns
502	* <tt>true</tt> if the specified set is not equal to this set.
503	* @stable ICU 2.0
504	*/
505	inline bool operator!=(const UnicodeSet& o) const;
506
507	/**
508	* Returns a copy of this object. All UnicodeFunctor objects have
509	* to support cloning in order to allow classes using
510	* UnicodeFunctors, such as Transliterator, to implement cloning.
511	* If this set is frozen, then the clone will be frozen as well.
512	* Use cloneAsThawed() for a mutable clone of a frozen set.
513	* @see cloneAsThawed
514	* @stable ICU 2.0
515	*/
516	virtual UnicodeSet* clone() const override;
517
518	/**
519	* Returns the hash code value for this set.
520	*
521	* @return the hash code value for this set.
522	* @see Object#hashCode()
523	* @stable ICU 2.0
524	*/
525	virtual int32_t hashCode(void) const;
526
527	/**
528	* Get a UnicodeSet pointer from a USet
529	*
530	* @param uset a USet (the ICU plain C type for UnicodeSet)
531	* @return the corresponding UnicodeSet pointer.
532	*
533	* @stable ICU 4.2
534	*/
535	inline static UnicodeSet fromUSet(USet uset);
536
537	/**
538	* Get a UnicodeSet pointer from a const USet
539	*
540	* @param uset a const USet (the ICU plain C type for UnicodeSet)
541	* @return the corresponding UnicodeSet pointer.
542	*
543	* @stable ICU 4.2
544	*/
545	inline static const UnicodeSet fromUSet(const* USet *uset);
546
547	/**
548	* Produce a USet * pointer for this UnicodeSet.
549	* USet is the plain C type for UnicodeSet
550	*
551	* @return a USet pointer for this UnicodeSet
552	* @stable ICU 4.2
553	*/
554	inline USet *toUSet();
555
556
557	/**
558	* Produce a const USet * pointer for this UnicodeSet.
559	* USet is the plain C type for UnicodeSet
560	*
561	* @return a const USet pointer for this UnicodeSet
562	* @stable ICU 4.2
563	*/
564	inline const USet * toUSet() const;
565
566
567	//----------------------------------------------------------------
568	// Freezable API
569	//----------------------------------------------------------------
570
571	/**
572	* Determines whether the set has been frozen (made immutable) or not.
573	* See the ICU4J Freezable interface for details.
574	* @return true/false for whether the set has been frozen
575	* @see freeze
576	* @see cloneAsThawed
577	* @stable ICU 3.8
578	*/
579	inline UBool isFrozen() const;
580
581	/**
582	* Freeze the set (make it immutable).
583	* Once frozen, it cannot be unfrozen and is therefore thread-safe
584	* until it is deleted.
585	* See the ICU4J Freezable interface for details.
586	* Freezing the set may also make some operations faster, for example
587	* contains() and span().
588	* A frozen set will not be modified. (It remains frozen.)
589	* @return this set.
590	* @see isFrozen
591	* @see cloneAsThawed
592	* @stable ICU 3.8
593	*/
594	UnicodeSet *freeze();
595
596	/**
597	* Clone the set and make the clone mutable.
598	* See the ICU4J Freezable interface for details.
599	* @return the mutable clone
600	* @see freeze
601	* @see isFrozen
602	* @stable ICU 3.8
603	*/
604	UnicodeSet cloneAsThawed() const*;
605
606	//----------------------------------------------------------------
607	// Public API
608	//----------------------------------------------------------------
609
610	/**
611	* Make this object represent the range `start - end`.
612	* If `start > end` then this object is set to an empty range.
613	* A frozen set will not be modified.
614	*
615	* @param start first character in the set, inclusive
616	* @param end last character in the set, inclusive
617	* @stable ICU 2.4
618	*/
619	UnicodeSet& set(UChar32 start, UChar32 end);
620
621	/**
622	* Return true if the given position, in the given pattern, appears
623	* to be the start of a UnicodeSet pattern.
624	* @stable ICU 2.4
625	*/
626	static UBool resemblesPattern(const UnicodeString& pattern,
627	int32_t pos);
628
629	/**
630	* Modifies this set to represent the set specified by the given
631	* pattern, ignoring Unicode Pattern_White_Space characters.
632	* See the class description for the syntax of the pattern language.
633	* A frozen set will not be modified.
634	* @param pattern a string specifying what characters are in the set
635	* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
636	* contains a syntax error.
637	* <em> Empties the set passed before applying the pattern.</em>
638	* @return a reference to this
639	* @stable ICU 2.0
640	*/
641	UnicodeSet& applyPattern(const UnicodeString& pattern,
642	UErrorCode& status);
643
644	#ifndef U_HIDE_INTERNAL_API
645	/**
646	* Modifies this set to represent the set specified by the given
647	* pattern, optionally ignoring Unicode Pattern_White_Space characters.
648	* See the class description for the syntax of the pattern language.
649	* A frozen set will not be modified.
650	* @param pattern a string specifying what characters are in the set
651	* @param options bitmask for options to apply to the pattern.
652	* Valid options are USET_IGNORE_SPACE and
653	* at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
654	* These case options are mutually exclusive.
655	* @param symbols a symbol table mapping variable names to
656	* values and stand-ins to UnicodeSets; may be nullptr
657	* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
658	* contains a syntax error.
659	*<em> Empties the set passed before applying the pattern.</em>
660	* @return a reference to this
661	* @internal
662	*/
663	UnicodeSet& applyPattern(const UnicodeString& pattern,
664	uint32_t options,
665	const SymbolTable* symbols,
666	UErrorCode& status);
667	#endif /* U_HIDE_INTERNAL_API */
668
669	/**
670	* Parses the given pattern, starting at the given position. The
671	* character at pattern.charAt(pos.getIndex()) must be '[', or the
672	* parse fails. Parsing continues until the corresponding closing
673	* ']'. If a syntax error is encountered between the opening and
674	* closing bracket, the parse fails. Upon return from a successful
675	* parse, the ParsePosition is updated to point to the character
676	* following the closing ']', and a StringBuffer containing a
677	* pairs list for the parsed pattern is returned. This method calls
678	* itself recursively to parse embedded subpatterns.
679	*<em> Empties the set passed before applying the pattern.</em>
680	* A frozen set will not be modified.
681	*
682	* @param pattern the string containing the pattern to be parsed.
683	* The portion of the string from pos.getIndex(), which must be a
684	* '[', to the corresponding closing ']', is parsed.
685	* @param pos upon entry, the position at which to begin parsing.
686	* The character at pattern.charAt(pos.getIndex()) must be a '['.
687	* Upon return from a successful parse, pos.getIndex() is either
688	* the character after the closing ']' of the parsed pattern, or
689	* pattern.length() if the closing ']' is the last character of
690	* the pattern string.
691	* @param options bitmask for options to apply to the pattern.
692	* Valid options are USET_IGNORE_SPACE and
693	* at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
694	* These case options are mutually exclusive.
695	* @param symbols a symbol table mapping variable names to
696	* values and stand-ins to UnicodeSets; may be nullptr
697	* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
698	* contains a syntax error.
699	* @return a reference to this
700	* @stable ICU 2.8
701	*/
702	UnicodeSet& applyPattern(const UnicodeString& pattern,
703	ParsePosition& pos,
704	uint32_t options,
705	const SymbolTable* symbols,
706	UErrorCode& status);
707
708	/**
709	* Returns a string representation of this set. If the result of
710	* calling this function is passed to a UnicodeSet constructor, it
711	* will produce another set that is equal to this one.
712	* A frozen set will not be modified.
713	* @param result the string to receive the rules. Previous
714	* contents will be deleted.
715	* @param escapeUnprintable if true then convert unprintable
716	* character to their hex escape representations, \\uxxxx or
717	* \\Uxxxxxxxx. Unprintable characters are those other than
718	* U+000A, U+0020..U+007E.
719	* @stable ICU 2.0
720	*/
721	virtual UnicodeString& toPattern(UnicodeString& result,
722	UBool escapeUnprintable = false) const override;
723
724	/**
725	* Modifies this set to contain those code points which have the given value
726	* for the given binary or enumerated property, as returned by
727	* u_getIntPropertyValue. Prior contents of this set are lost.
728	* A frozen set will not be modified.
729	*
730	* @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
731	* or UCHAR_INT_START..UCHAR_INT_LIMIT-1
732	* or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
733	*
734	* @param value a value in the range u_getIntPropertyMinValue(prop)..
735	* u_getIntPropertyMaxValue(prop), with one exception. If prop is
736	* UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
737	* rather a mask value produced by U_GET_GC_MASK(). This allows grouped
738	* categories such as [:L:] to be represented.
739	*
740	* @param ec error code input/output parameter
741	*
742	* @return a reference to this set
743	*
744	* @stable ICU 2.4
745	*/
746	UnicodeSet& applyIntPropertyValue(UProperty prop,
747	int32_t value,
748	UErrorCode& ec);
749
750	/**
751	* Modifies this set to contain those code points which have the
752	* given value for the given property. Prior contents of this
753	* set are lost.
754	* A frozen set will not be modified.
755	*
756	* @param prop a property alias, either short or long. The name is matched
757	* loosely. See PropertyAliases.txt for names and a description of loose
758	* matching. If the value string is empty, then this string is interpreted
759	* as either a General_Category value alias, a Script value alias, a binary
760	* property alias, or a special ID. Special IDs are matched loosely and
761	* correspond to the following sets:
762	*
763	* "ANY" = [\\u0000-\\U0010FFFF],
764	* "ASCII" = [\\u0000-\\u007F],
765	* "Assigned" = [:^Cn:].
766	*
767	* @param value a value alias, either short or long. The name is matched
768	* loosely. See PropertyValueAliases.txt for names and a description of
769	* loose matching. In addition to aliases listed, numeric values and
770	* canonical combining classes may be expressed numerically, e.g., ("nv",
771	* "0.5") or ("ccc", "220"). The value string may also be empty.
772	*
773	* @param ec error code input/output parameter
774	*
775	* @return a reference to this set
776	*
777	* @stable ICU 2.4
778	*/
779	UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
780	const UnicodeString& value,
781	UErrorCode& ec);
782
783	/**
784	* Returns the number of elements in this set (its cardinality).
785	* Note that the elements of a set may include both individual
786	* codepoints and strings.
787	*
788	* This is slower than getRangeCount() because
789	* it counts the code points of all ranges.
790	*
791	* @return the number of elements in this set (its cardinality).
792	* @stable ICU 2.0
793	* @see getRangeCount
794	*/
795	virtual int32_t size(void) const;
796
797	/**
798	* Returns <tt>true</tt> if this set contains no elements.
799	*
800	* @return <tt>true</tt> if this set contains no elements.
801	* @stable ICU 2.0
802	*/
803	virtual UBool isEmpty(void) const;
804
805	/**
806	* @return true if this set contains multi-character strings or the empty string.
807	* @stable ICU 70
808	*/
809	UBool hasStrings() const;
810
811	/**
812	* Returns true if this set contains the given character.
813	* This function works faster with a frozen set.
814	* @param c character to be checked for containment
815	* @return true if the test condition is met
816	* @stable ICU 2.0
817	*/
818	virtual UBool contains(UChar32 c) const override;
819
820	/**
821	* Returns true if this set contains every character
822	* of the given range.
823	* @param start first character, inclusive, of the range
824	* @param end last character, inclusive, of the range
825	* @return true if the test condition is met
826	* @stable ICU 2.0
827	*/
828	virtual UBool contains(UChar32 start, UChar32 end) const;
829
830	/**
831	* Returns <tt>true</tt> if this set contains the given
832	* multicharacter string.
833	* @param s string to be checked for containment
834	* @return <tt>true</tt> if this set contains the specified string
835	* @stable ICU 2.4
836	*/
837	UBool contains(const UnicodeString& s) const;
838
839	/**
840	* Returns true if this set contains all the characters and strings
841	* of the given set.
842	* @param c set to be checked for containment
843	* @return true if the test condition is met
844	* @stable ICU 2.4
845	*/
846	virtual UBool containsAll(const UnicodeSet& c) const;
847
848	/**
849	* Returns true if this set contains all the characters
850	* of the given string.
851	* @param s string containing characters to be checked for containment
852	* @return true if the test condition is met
853	* @stable ICU 2.4
854	*/
855	UBool containsAll(const UnicodeString& s) const;
856
857	/**
858	* Returns true if this set contains none of the characters
859	* of the given range.
860	* @param start first character, inclusive, of the range
861	* @param end last character, inclusive, of the range
862	* @return true if the test condition is met
863	* @stable ICU 2.4
864	*/
865	UBool containsNone(UChar32 start, UChar32 end) const;
866
867	/**
868	* Returns true if this set contains none of the characters and strings
869	* of the given set.
870	* @param c set to be checked for containment
871	* @return true if the test condition is met
872	* @stable ICU 2.4
873	*/
874	UBool containsNone(const UnicodeSet& c) const;
875
876	/**
877	* Returns true if this set contains none of the characters
878	* of the given string.
879	* @param s string containing characters to be checked for containment
880	* @return true if the test condition is met
881	* @stable ICU 2.4
882	*/
883	UBool containsNone(const UnicodeString& s) const;
884
885	/**
886	* Returns true if this set contains one or more of the characters
887	* in the given range.
888	* @param start first character, inclusive, of the range
889	* @param end last character, inclusive, of the range
890	* @return true if the condition is met
891	* @stable ICU 2.4
892	*/
893	inline UBool containsSome(UChar32 start, UChar32 end) const;
894
895	/**
896	* Returns true if this set contains one or more of the characters
897	* and strings of the given set.
898	* @param s The set to be checked for containment
899	* @return true if the condition is met
900	* @stable ICU 2.4
901	*/
902	inline UBool containsSome(const UnicodeSet& s) const;
903
904	/**
905	* Returns true if this set contains one or more of the characters
906	* of the given string.
907	* @param s string containing characters to be checked for containment
908	* @return true if the condition is met
909	* @stable ICU 2.4
910	*/
911	inline UBool containsSome(const UnicodeString& s) const;
912
913	/**
914	* Returns the length of the initial substring of the input string which
915	* consists only of characters and strings that are contained in this set
916	* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
917	* or only of characters and strings that are not contained
918	* in this set (USET_SPAN_NOT_CONTAINED).
919	* See USetSpanCondition for details.
920	* Similar to the strspn() C library function.
921	* Unpaired surrogates are treated according to contains() of their surrogate code points.
922	* This function works faster with a frozen set and with a non-negative string length argument.
923	* @param s start of the string
924	* @param length of the string; can be -1 for NUL-terminated
925	* @param spanCondition specifies the containment condition
926	* @return the length of the initial substring according to the spanCondition;
927	* 0 if the start of the string does not fit the spanCondition
928	* @stable ICU 3.8
929	* @see USetSpanCondition
930	*/
931	int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
932
933	/**
934	* Returns the end of the substring of the input string according to the USetSpanCondition.
935	* Same as <code>start+span(s.getBuffer()+start, s.length()-start, spanCondition)</code>
936	* after pinning start to 0<=start<=s.length().
937	* @param s the string
938	* @param start the start index in the string for the span operation
939	* @param spanCondition specifies the containment condition
940	* @return the exclusive end of the substring according to the spanCondition;
941	* the substring s.tempSubStringBetween(start, end) fulfills the spanCondition
942	* @stable ICU 4.4
943	* @see USetSpanCondition
944	*/
945	inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
946
947	/**
948	* Returns the start of the trailing substring of the input string which
949	* consists only of characters and strings that are contained in this set
950	* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
951	* or only of characters and strings that are not contained
952	* in this set (USET_SPAN_NOT_CONTAINED).
953	* See USetSpanCondition for details.
954	* Unpaired surrogates are treated according to contains() of their surrogate code points.
955	* This function works faster with a frozen set and with a non-negative string length argument.
956	* @param s start of the string
957	* @param length of the string; can be -1 for NUL-terminated
958	* @param spanCondition specifies the containment condition
959	* @return the start of the trailing substring according to the spanCondition;
960	* the string length if the end of the string does not fit the spanCondition
961	* @stable ICU 3.8
962	* @see USetSpanCondition
963	*/
964	int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
965
966	/**
967	* Returns the start of the substring of the input string according to the USetSpanCondition.
968	* Same as <code>spanBack(s.getBuffer(), limit, spanCondition)</code>
969	* after pinning limit to 0<=end<=s.length().
970	* @param s the string
971	* @param limit the exclusive-end index in the string for the span operation
972	* (use s.length() or INT32_MAX for spanning back from the end of the string)
973	* @param spanCondition specifies the containment condition
974	* @return the start of the substring according to the spanCondition;
975	* the substring s.tempSubStringBetween(start, limit) fulfills the spanCondition
976	* @stable ICU 4.4
977	* @see USetSpanCondition
978	*/
979	inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
980
981	/**
982	* Returns the length of the initial substring of the input string which
983	* consists only of characters and strings that are contained in this set
984	* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
985	* or only of characters and strings that are not contained
986	* in this set (USET_SPAN_NOT_CONTAINED).
987	* See USetSpanCondition for details.
988	* Similar to the strspn() C library function.
989	* Malformed byte sequences are treated according to contains(0xfffd).
990	* This function works faster with a frozen set and with a non-negative string length argument.
991	* @param s start of the string (UTF-8)
992	* @param length of the string; can be -1 for NUL-terminated
993	* @param spanCondition specifies the containment condition
994	* @return the length of the initial substring according to the spanCondition;
995	* 0 if the start of the string does not fit the spanCondition
996	* @stable ICU 3.8
997	* @see USetSpanCondition
998	*/
999	int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
1000
1001	/**
1002	* Returns the start of the trailing substring of the input string which
1003	* consists only of characters and strings that are contained in this set
1004	* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
1005	* or only of characters and strings that are not contained
1006	* in this set (USET_SPAN_NOT_CONTAINED).
1007	* See USetSpanCondition for details.
1008	* Malformed byte sequences are treated according to contains(0xfffd).
1009	* This function works faster with a frozen set and with a non-negative string length argument.
1010	* @param s start of the string (UTF-8)
1011	* @param length of the string; can be -1 for NUL-terminated
1012	* @param spanCondition specifies the containment condition
1013	* @return the start of the trailing substring according to the spanCondition;
1014	* the string length if the end of the string does not fit the spanCondition
1015	* @stable ICU 3.8
1016	* @see USetSpanCondition
1017	*/
1018	int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
1019
1020	/**
1021	* Implement UnicodeMatcher::matches()
1022	* @stable ICU 2.4
1023	*/
1024	virtual UMatchDegree matches(const Replaceable& text,
1025	int32_t& offset,
1026	int32_t limit,
1027	UBool incremental) override;
1028
1029	private:
1030	/**
1031	* Returns the longest match for s in text at the given position.
1032	* If limit > start then match forward from start+1 to limit
1033	* matching all characters except s.charAt(0). If limit < start,
1034	* go backward starting from start-1 matching all characters
1035	* except s.charAt(s.length()-1). This method assumes that the
1036	* first character, text.charAt(start), matches s, so it does not
1037	* check it.
1038	* @param text the text to match
1039	* @param start the first character to match. In the forward
1040	* direction, text.charAt(start) is matched against s.charAt(0).
1041	* In the reverse direction, it is matched against
1042	* s.charAt(s.length()-1).
1043	* @param limit the limit offset for matching, either last+1 in
1044	* the forward direction, or last-1 in the reverse direction,
1045	* where last is the index of the last character to match.
1046	* @param s
1047	* @return If part of s matches up to the limit, return |limit -
1048	* start|. If all of s matches before reaching the limit, return
1049	* s.length(). If there is a mismatch between s and text, return
1050	* 0
1051	*/
1052	static int32_t matchRest(const Replaceable& text,
1053	int32_t start, int32_t limit,
1054	const UnicodeString& s);
1055
1056	/**
1057	* Returns the smallest value i such that c < list[i]. Caller
1058	* must ensure that c is a legal value or this method will enter
1059	* an infinite loop. This method performs a binary search.
1060	* @param c a character in the range MIN_VALUE..MAX_VALUE
1061	* inclusive
1062	* @return the smallest integer i in the range 0..len-1,
1063	* inclusive, such that c < list[i]
1064	*/
1065	int32_t findCodePoint(UChar32 c) const;
1066
1067	public:
1068
1069	/**
1070	* Implementation of UnicodeMatcher API. Union the set of all
1071	* characters that may be matched by this object into the given
1072	* set.
1073	* @param toUnionTo the set into which to union the source characters
1074	* @stable ICU 2.4
1075	*/
1076	virtual void addMatchSetTo(UnicodeSet& toUnionTo) const override;
1077
1078	/**
1079	* Returns the index of the given character within this set, where
1080	* the set is ordered by ascending code point. If the character
1081	* is not in this set, return -1. The inverse of this method is
1082	* <code>charAt()</code>.
1083	* @return an index from 0..size()-1, or -1
1084	* @stable ICU 2.4
1085	*/
1086	int32_t indexOf(UChar32 c) const;
1087
1088	/**
1089	* Returns the character at the given index within this set, where
1090	* the set is ordered by ascending code point. If the index is
1091	* out of range for characters, returns (UChar32)-1.
1092	* The inverse of this method is <code>indexOf()</code>.
1093	*
1094	* For iteration, this is slower than UnicodeSetIterator or
1095	* getRangeCount()/getRangeStart()/getRangeEnd(),
1096	* because for each call it skips linearly over <code>index</code>
1097	* characters in the ranges.
1098	*
1099	* @param index an index from 0..size()-1
1100	* @return the character at the given index, or (UChar32)-1.
1101	* @stable ICU 2.4
1102	*/
1103	UChar32 charAt(int32_t index) const;
1104
1105	/**
1106	* Adds the specified range to this set if it is not already
1107	* present. If this set already contains the specified range,
1108	* the call leaves this set unchanged. If <code>start > end</code>
1109	* then an empty range is added, leaving the set unchanged.
1110	* This is equivalent to a boolean logic OR, or a set UNION.
1111	* A frozen set will not be modified.
1112	*
1113	* @param start first character, inclusive, of range to be added
1114	* to this set.
1115	* @param end last character, inclusive, of range to be added
1116	* to this set.
1117	* @stable ICU 2.0
1118	*/
1119	virtual UnicodeSet& add(UChar32 start, UChar32 end);
1120
1121	/**
1122	* Adds the specified character to this set if it is not already
1123	* present. If this set already contains the specified character,
1124	* the call leaves this set unchanged.
1125	* A frozen set will not be modified.
1126	*
1127	* @param c the character (code point)
1128	* @return this object, for chaining
1129	* @stable ICU 2.0
1130	*/
1131	UnicodeSet& add(UChar32 c);
1132
1133	/**
1134	* Adds the specified multicharacter to this set if it is not already
1135	* present. If this set already contains the multicharacter,
1136	* the call leaves this set unchanged.
1137	* Thus "ch" => {"ch"}
1138	* A frozen set will not be modified.
1139	*
1140	* @param s the source string
1141	* @return this object, for chaining
1142	* @stable ICU 2.4
1143	*/
1144	UnicodeSet& add(const UnicodeString& s);
1145
1146	private:
1147	/**
1148	* @return a code point IF the string consists of a single one.
1149	* otherwise returns -1.
1150	* @param s string to test
1151	*/
1152	static int32_t getSingleCP(const UnicodeString& s);
1153
1154	void _add(const UnicodeString& s);
1155
1156	public:
1157	/**
1158	* Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"}
1159	* If this set already contains any particular character, it has no effect on that character.
1160	* A frozen set will not be modified.
1161	* @param s the source string
1162	* @return this object, for chaining
1163	* @stable ICU 2.4
1164	*/
1165	UnicodeSet& addAll(const UnicodeString& s);
1166
1167	/**
1168	* Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1169	* A frozen set will not be modified.
1170	* @param s the source string
1171	* @return this object, for chaining
1172	* @stable ICU 2.4
1173	*/
1174	UnicodeSet& retainAll(const UnicodeString& s);
1175
1176	/**
1177	* Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1178	* A frozen set will not be modified.
1179	* @param s the source string
1180	* @return this object, for chaining
1181	* @stable ICU 2.4
1182	*/
1183	UnicodeSet& complementAll(const UnicodeString& s);
1184
1185	/**
1186	* Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1187	* A frozen set will not be modified.
1188	* @param s the source string
1189	* @return this object, for chaining
1190	* @stable ICU 2.4
1191	*/
1192	UnicodeSet& removeAll(const UnicodeString& s);
1193
1194	/**
1195	* Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1196	*
1197	* @param s the source string
1198	* @return a newly created set containing the given string.
1199	* The caller owns the return object and is responsible for deleting it.
1200	* @stable ICU 2.4
1201	*/
1202	static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
1203
1204
1205	/**
1206	* Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1207	* @param s the source string
1208	* @return a newly created set containing the given characters
1209	* The caller owns the return object and is responsible for deleting it.
1210	* @stable ICU 2.4
1211	*/
1212	static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
1213
1214	/**
1215	* Retain only the elements in this set that are contained in the
1216	* specified range. If <code>start > end</code> then an empty range is
1217	* retained, leaving the set empty. This is equivalent to
1218	* a boolean logic AND, or a set INTERSECTION.
1219	* A frozen set will not be modified.
1220	*
1221	* @param start first character, inclusive, of range
1222	* @param end last character, inclusive, of range
1223	* @stable ICU 2.0
1224	*/
1225	virtual UnicodeSet& retain(UChar32 start, UChar32 end);
1226
1227
1228	/**
1229	* Retain the specified character from this set if it is present.
1230	* A frozen set will not be modified.
1231	*
1232	* @param c the character (code point)
1233	* @return this object, for chaining
1234	* @stable ICU 2.0
1235	*/
1236	UnicodeSet& retain(UChar32 c);
1237
1238	/**
1239	* Retains only the specified string from this set if it is present.
1240	* Upon return this set will be empty if it did not contain s, or
1241	* will only contain s if it did contain s.
1242	* A frozen set will not be modified.
1243	*
1244	* @param s the source string
1245	* @return this object, for chaining
1246	* @stable ICU 69
1247	*/
1248	UnicodeSet& retain(const UnicodeString &s);
1249
1250	/**
1251	* Removes the specified range from this set if it is present.
1252	* The set will not contain the specified range once the call
1253	* returns. If <code>start > end</code> then an empty range is
1254	* removed, leaving the set unchanged.
1255	* A frozen set will not be modified.
1256	*
1257	* @param start first character, inclusive, of range to be removed
1258	* from this set.
1259	* @param end last character, inclusive, of range to be removed
1260	* from this set.
1261	* @stable ICU 2.0
1262	*/
1263	virtual UnicodeSet& remove(UChar32 start, UChar32 end);
1264
1265	/**
1266	* Removes the specified character from this set if it is present.
1267	* The set will not contain the specified character once the call
1268	* returns.
1269	* A frozen set will not be modified.
1270	*
1271	* @param c the character (code point)
1272	* @return this object, for chaining
1273	* @stable ICU 2.0
1274	*/
1275	UnicodeSet& remove(UChar32 c);
1276
1277	/**
1278	* Removes the specified string from this set if it is present.
1279	* The set will not contain the specified string once the call
1280	* returns.
1281	* A frozen set will not be modified.
1282	* @param s the source string
1283	* @return this object, for chaining
1284	* @stable ICU 2.4
1285	*/
1286	UnicodeSet& remove(const UnicodeString& s);
1287
1288	/**
1289	* This is equivalent to
1290	* <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1291	*
1292	* <strong>Note:</strong> This performs a symmetric difference with all code points
1293	* <em>and thus retains all multicharacter strings</em>.
1294	* In order to achieve a “code point complement” (all code points minus this set),
1295	* the easiest is to <code>.complement().removeAllStrings()</code>.
1296	*
1297	* A frozen set will not be modified.
1298	* @stable ICU 2.0
1299	*/
1300	virtual UnicodeSet& complement();
1301
1302	/**
1303	* Complements the specified range in this set. Any character in
1304	* the range will be removed if it is in this set, or will be
1305	* added if it is not in this set. If <code>start > end</code>
1306	* then an empty range is complemented, leaving the set unchanged.
1307	* This is equivalent to a boolean logic XOR.
1308	* A frozen set will not be modified.
1309	*
1310	* @param start first character, inclusive, of range
1311	* @param end last character, inclusive, of range
1312	* @stable ICU 2.0
1313	*/
1314	virtual UnicodeSet& complement(UChar32 start, UChar32 end);
1315
1316	/**
1317	* Complements the specified character in this set. The character
1318	* will be removed if it is in this set, or will be added if it is
1319	* not in this set.
1320	* A frozen set will not be modified.
1321	*
1322	* @param c the character (code point)
1323	* @return this object, for chaining
1324	* @stable ICU 2.0
1325	*/
1326	UnicodeSet& complement(UChar32 c);
1327
1328	/**
1329	* Complement the specified string in this set.
1330	* The string will be removed if it is in this set, or will be added if it is not in this set.
1331	* A frozen set will not be modified.
1332	*
1333	* @param s the string to complement
1334	* @return this object, for chaining
1335	* @stable ICU 2.4
1336	*/
1337	UnicodeSet& complement(const UnicodeString& s);
1338
1339	/**
1340	* Adds all of the elements in the specified set to this set if
1341	* they're not already present. This operation effectively
1342	* modifies this set so that its value is the <i>union</i> of the two
1343	* sets. The behavior of this operation is unspecified if the specified
1344	* collection is modified while the operation is in progress.
1345	* A frozen set will not be modified.
1346	*
1347	* @param c set whose elements are to be added to this set.
1348	* @see #add(UChar32, UChar32)
1349	* @stable ICU 2.0
1350	*/
1351	virtual UnicodeSet& addAll(const UnicodeSet& c);
1352
1353	/**
1354	* Retains only the elements in this set that are contained in the
1355	* specified set. In other words, removes from this set all of
1356	* its elements that are not contained in the specified set. This
1357	* operation effectively modifies this set so that its value is
1358	* the <i>intersection</i> of the two sets.
1359	* A frozen set will not be modified.
1360	*
1361	* @param c set that defines which elements this set will retain.
1362	* @stable ICU 2.0
1363	*/
1364	virtual UnicodeSet& retainAll(const UnicodeSet& c);
1365
1366	/**
1367	* Removes from this set all of its elements that are contained in the
1368	* specified set. This operation effectively modifies this
1369	* set so that its value is the <i>asymmetric set difference</i> of
1370	* the two sets.
1371	* A frozen set will not be modified.
1372	*
1373	* @param c set that defines which elements will be removed from
1374	* this set.
1375	* @stable ICU 2.0
1376	*/
1377	virtual UnicodeSet& removeAll(const UnicodeSet& c);
1378
1379	/**
1380	* Complements in this set all elements contained in the specified
1381	* set. Any character in the other set will be removed if it is
1382	* in this set, or will be added if it is not in this set.
1383	* A frozen set will not be modified.
1384	*
1385	* @param c set that defines which elements will be xor'ed from
1386	* this set.
1387	* @stable ICU 2.4
1388	*/
1389	virtual UnicodeSet& complementAll(const UnicodeSet& c);
1390
1391	/**
1392	* Removes all of the elements from this set. This set will be
1393	* empty after this call returns.
1394	* A frozen set will not be modified.
1395	* @stable ICU 2.0
1396	*/
1397	virtual UnicodeSet& clear(void);
1398
1399	/**
1400	* Close this set over the given attribute. For the attribute
1401	* USET_CASE_INSENSITIVE, the result is to modify this set so that:
1402	*
1403	* 1. For each character or string 'a' in this set, all strings or
1404	* characters 'b' such that foldCase(a) == foldCase(b) are added
1405	* to this set.
1406	*
1407	* 2. For each string 'e' in the resulting set, if e !=
1408	* foldCase(e), 'e' will be removed.
1409	*
1410	* Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
1411	*
1412	* (Here foldCase(x) refers to the operation u_strFoldCase, and a
1413	* == b denotes that the contents are the same, not pointer
1414	* comparison.)
1415	*
1416	* A frozen set will not be modified.
1417	*
1418	* @param attribute bitmask for attributes to close over.
1419	* Valid options:
1420	* At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
1421	* These case options are mutually exclusive.
1422	* Unrelated options bits are ignored.
1423	* @return a reference to this set.
1424	* @stable ICU 4.2
1425	*/
1426	UnicodeSet& closeOver(int32_t attribute);
1427
1428	/**
1429	* Remove all strings from this set.
1430	*
1431	* @return a reference to this set.
1432	* @stable ICU 4.2
1433	*/
1434	virtual UnicodeSet &removeAllStrings();
1435
1436	/**
1437	* Iteration method that returns the number of ranges contained in
1438	* this set.
1439	* @see #getRangeStart
1440	* @see #getRangeEnd
1441	* @stable ICU 2.4
1442	*/
1443	virtual int32_t getRangeCount(void) const;
1444
1445	/**
1446	* Iteration method that returns the first character in the
1447	* specified range of this set.
1448	* @see #getRangeCount
1449	* @see #getRangeEnd
1450	* @stable ICU 2.4
1451	*/
1452	virtual UChar32 getRangeStart(int32_t index) const;
1453
1454	/**
1455	* Iteration method that returns the last character in the
1456	* specified range of this set.
1457	* @see #getRangeStart
1458	* @see #getRangeCount
1459	* @stable ICU 2.4
1460	*/
1461	virtual UChar32 getRangeEnd(int32_t index) const;
1462
1463	/**
1464	* Serializes this set into an array of 16-bit integers. Serialization
1465	* (currently) only records the characters in the set; multicharacter
1466	* strings are ignored.
1467	*
1468	* The array has following format (each line is one 16-bit
1469	* integer):
1470	*
1471	* length = (n+2*m) | (m!=0?0x8000:0)
1472	* bmpLength = n; present if m!=0
1473	* bmp[0]
1474	* bmp[1]
1475	* ...
1476	* bmp[n-1]
1477	* supp-high[0]
1478	* supp-low[0]
1479	* supp-high[1]
1480	* supp-low[1]
1481	* ...
1482	* supp-high[m-1]
1483	* supp-low[m-1]
1484	*
1485	* The array starts with a header. After the header are n bmp
1486	* code points, then m supplementary code points. Either n or m
1487	* or both may be zero. n+2*m is always <= 0x7FFF.
1488	*
1489	* If there are no supplementary characters (if m==0) then the
1490	* header is one 16-bit integer, 'length', with value n.
1491	*
1492	* If there are supplementary characters (if m!=0) then the header
1493	* is two 16-bit integers. The first, 'length', has value
1494	* (n+2*m)|0x8000. The second, 'bmpLength', has value n.
1495	*
1496	* After the header the code points are stored in ascending order.
1497	* Supplementary code points are stored as most significant 16
1498	* bits followed by least significant 16 bits.
1499	*
1500	* @param dest pointer to buffer of destCapacity 16-bit integers.
1501	* May be nullptr only if destCapacity is zero.
1502	* @param destCapacity size of dest, or zero. Must not be negative.
1503	* @param ec error code. Will be set to U_INDEX_OUTOFBOUNDS_ERROR
1504	* if n+2*m > 0x7FFF. Will be set to U_BUFFER_OVERFLOW_ERROR if
1505	* n+2*m+(m!=0?2:1) > destCapacity.
1506	* @return the total length of the serialized format, including
1507	* the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
1508	* than U_BUFFER_OVERFLOW_ERROR.
1509	* @stable ICU 2.4
1510	*/
1511	int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
1512
1513	/**
1514	* Reallocate this object's internal structures to take up the least
1515	* possible space, without changing this object's value.
1516	* A frozen set will not be modified.
1517	* @stable ICU 2.4
1518	*/
1519	virtual UnicodeSet& compact();
1520
1521	/**
1522	* Return the class ID for this class. This is useful only for
1523	* comparing to a return value from getDynamicClassID(). For example:
1524	* <pre>
1525	* . Base* polymorphic_pointer = createPolymorphicObject();
1526	* . if (polymorphic_pointer->getDynamicClassID() ==
1527	* . Derived::getStaticClassID()) ...
1528	* </pre>
1529	* @return The class ID for all objects of this class.
1530	* @stable ICU 2.0
1531	*/
1532	static UClassID U_EXPORT2 getStaticClassID(void);
1533
1534	/**
1535	* Implement UnicodeFunctor API.
1536	*
1537	* @return The class ID for this object. All objects of a given
1538	* class have the same class ID. Objects of other classes have
1539	* different class IDs.
1540	* @stable ICU 2.4
1541	*/
1542	virtual UClassID getDynamicClassID(void) const override;
1543
1544	private:
1545
1546	// Private API for the USet API
1547
1548	friend class USetAccess;
1549
1550	const UnicodeString* getString(int32_t index) const;
1551
1552	//----------------------------------------------------------------
1553	// RuleBasedTransliterator support
1554	//----------------------------------------------------------------
1555
1556	private:
1557
1558	/**
1559	* Returns <tt>true</tt> if this set contains any character whose low byte
1560	* is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
1561	* indexing.
1562	*/
1563	virtual UBool matchesIndexValue(uint8_t v) const override;
1564
1565	private:
1566	friend class RBBIRuleScanner;
1567
1568	//----------------------------------------------------------------
1569	// Implementation: Clone as thawed (see ICU4J Freezable)
1570	//----------------------------------------------------------------
1571
1572	UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
1573	UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed);
1574
1575	//----------------------------------------------------------------
1576	// Implementation: Pattern parsing
1577	//----------------------------------------------------------------
1578
1579	void applyPatternIgnoreSpace(const UnicodeString& pattern,
1580	ParsePosition& pos,
1581	const SymbolTable* symbols,
1582	UErrorCode& status);
1583
1584	void applyPattern(RuleCharacterIterator& chars,
1585	const SymbolTable* symbols,
1586	UnicodeString& rebuiltPat,
1587	uint32_t options,
1588	UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
1589	int32_t depth,
1590	UErrorCode& ec);
1591
1592	void closeOverCaseInsensitive(bool simple);
1593	void closeOverAddCaseMappings();
1594
1595	//----------------------------------------------------------------
1596	// Implementation: Utility methods
1597	//----------------------------------------------------------------
1598
1599	static int32_t nextCapacity(int32_t minCapacity);
1600
1601	bool ensureCapacity(int32_t newLen);
1602
1603	bool ensureBufferCapacity(int32_t newLen);
1604
1605	void swapBuffers(void);
1606
1607	UBool allocateStrings(UErrorCode &status);
1608	int32_t stringsSize() const;
1609	UBool stringsContains(const UnicodeString &s) const;
1610
1611	UnicodeString& _toPattern(UnicodeString& result,
1612	UBool escapeUnprintable) const;
1613
1614	UnicodeString& _generatePattern(UnicodeString& result,
1615	UBool escapeUnprintable) const;
1616
1617	static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1618
1619	static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1620
1621	static void _appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
1622	UBool escapeUnprintable);
1623
1624	//----------------------------------------------------------------
1625	// Implementation: Fundamental operators
1626	//----------------------------------------------------------------
1627
1628	void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1629
1630	void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1631
1632	void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1633
1634	/**
1635	* Return true if the given position, in the given pattern, appears
1636	* to be the start of a property set pattern [:foo:], \\p{foo}, or
1637	* \\P{foo}, or \\N{name}.
1638	*/
1639	static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1640	int32_t pos);
1641
1642	static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1643	int32_t iterOpts);
1644
1645	/**
1646	* Parse the given property pattern at the given parse position
1647	* and set this UnicodeSet to the result.
1648	*
1649	* The original design document is out of date, but still useful.
1650	* Ignore the property and value names:
1651	* https://htmlpreview.github.io/?https://github.com/unicode-org/icu-docs/blob/main/design/unicodeset_properties.html
1652	*
1653	* Recognized syntax:
1654	*
1655	* [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
1656	* \\p{foo} \\P{foo} - white space not allowed within "\\p" or "\\P"
1657	* \\N{name} - white space not allowed within "\\N"
1658	*
1659	* Other than the above restrictions, Unicode Pattern_White_Space characters are ignored.
1660	* Case is ignored except in "\\p" and "\\P" and "\\N". In 'name' leading
1661	* and trailing space is deleted, and internal runs of whitespace
1662	* are collapsed to a single space.
1663	*
1664	* We support binary properties, enumerated properties, and the
1665	* following non-enumerated properties:
1666	*
1667	* Numeric_Value
1668	* Name
1669	* Unicode_1_Name
1670	*
1671	* @param pattern the pattern string
1672	* @param ppos on entry, the position at which to begin parsing.
1673	* This should be one of the locations marked '^':
1674	*
1675	* [:blah:] \\p{blah} \\P{blah} \\N{name}
1676	* ^ % ^ % ^ % ^ %
1677	*
1678	* On return, the position after the last character parsed, that is,
1679	* the locations marked '%'. If the parse fails, ppos is returned
1680	* unchanged.
1681	* @param ec status
1682	* @return a reference to this.
1683	*/
1684	UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1685	ParsePosition& ppos,
1686	UErrorCode &ec);
1687
1688	void applyPropertyPattern(RuleCharacterIterator& chars,
1689	UnicodeString& rebuiltPat,
1690	UErrorCode& ec);
1691
1692	/**
1693	* A filter that returns true if the given code point should be
1694	* included in the UnicodeSet being constructed.
1695	*/
1696	typedef UBool (Filter)(UChar32 codePoint, void** context);
1697
1698	/**
1699	* Given a filter, set this UnicodeSet to the code points
1700	* contained by that filter. The filter MUST be
1701	* property-conformant. That is, if it returns value v for one
1702	* code point, then it must return v for all affiliated code
1703	* points, as defined by the inclusions list. See
1704	* getInclusions().
1705	* src is a UPropertySource value.
1706	*/
1707	void applyFilter(Filter filter,
1708	void* context,
1709	const UnicodeSet* inclusions,
1710	UErrorCode &status);
1711
1712	/**
1713	* Set the new pattern to cache.
1714	*/
1715	void setPattern(const UnicodeString& newPat) {
1716	setPattern(newPat.getBuffer(), newPat.length());
1717	}
1718	void setPattern(const char16_t *newPat, int32_t newPatLen);
/**
 * Releases the existing cached pattern.
 */
void releasePattern();
1723
1724	friend class UnicodeSetIterator;
1725	};
1726
1727
1728
1729	inline bool UnicodeSet::operator!=(const UnicodeSet& o) const {
1730	return !operator==(o);
1731	}
1732
1733	inline UBool UnicodeSet::isFrozen() const {
1734	return (UBool)(bmpSet!=nullptr \|\| stringSpan!=nullptr);
1735	}
1736
1737	inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
1738	return !containsNone(start, end);
1739	}
1740
1741	inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
1742	return !containsNone(s);
1743	}
1744
1745	inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
1746	return !containsNone(s);
1747	}
1748
1749	inline UBool UnicodeSet::isBogus() const {
1750	return (UBool)(fFlags & kIsBogus);
1751	}
1752
1753	inline UnicodeSet UnicodeSet::fromUSet(USet uset) {
1754	return reinterpret_cast<UnicodeSet *>(uset);
1755	}
1756
1757	inline const UnicodeSet UnicodeSet::fromUSet(const* USet *uset) {
1758	return reinterpret_cast<const UnicodeSet *>(uset);
1759	}
1760
1761	inline USet *UnicodeSet::toUSet() {
1762	return reinterpret_cast<USet >(this*);
1763	}
1764
1765	inline const USet UnicodeSet::toUSet() const* {
1766	return reinterpret_cast<const USet >(this*);
1767	}
1768
1769	inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
1770	int32_t sLength=s.length();
1771	if(start<`0`) {
1772	start=`0`;
1773	} else if(start>sLength) {
1774	start=sLength;
1775	}
1776	return start+span(s.getBuffer()+start, sLength-start, spanCondition);
1777	}
1778
1779	inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
1780	int32_t sLength=s.length();
1781	if(limit<`0`) {
1782	limit=`0`;
1783	} else if(limit>sLength) {
1784	limit=sLength;
1785	}
1786	return spanBack(s.getBuffer(), limit, spanCondition);
1787	}
1788
1789	U_NAMESPACE_END
1790
1791	#endif /* U_SHOW_CPLUSPLUS_API */
1792
1793	#endif
1794

Browse the source code of Godot/thirdparty/icu4c/common/unicode/uniset.h