uniset.cpp source code [Godot/thirdparty/icu4c/common/uniset.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 1999-2015, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	* Date Name Description
9	* 10/20/99 alan Creation.
10	**********************************************************************
11	*/
12
13	#include "unicode/utypes.h"
14	#include "unicode/parsepos.h"
15	#include "unicode/symtable.h"
16	#include "unicode/uniset.h"
17	#include "unicode/ustring.h"
18	#include "unicode/utf8.h"
19	#include "unicode/utf16.h"
20	#include "ruleiter.h"
21	#include "cmemory.h"
22	#include "cstring.h"
23	#include "patternprops.h"
24	#include "uelement.h"
25	#include "util.h"
26	#include "uvector.h"
27	#include "charstr.h"
28	#include "ustrfmt.h"
29	#include "uassert.h"
30	#include "bmpset.h"
31	#include "unisetspan.h"
32
33	// HIGH_VALUE > all valid values. 110000 for codepoints
34	#define UNICODESET_HIGH 0x0110000
35
36	// LOW <= all valid values. ZERO for codepoints
37	#define UNICODESET_LOW 0x000000
38
39	/* Max list [0, 1, 2, ..., max code point, HIGH] /
40	constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + `1`;
41
42	U_NAMESPACE_BEGIN
43
44	SymbolTable::~SymbolTable() {}
45
46	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
47
48	/**
49	* Modify the given UChar32 variable so that it is in range, by
50	* pinning values < UNICODESET_LOW to UNICODESET_LOW, and
51	* pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
52	* It modifies its argument in-place and also returns it.
53	*/
54	static inline UChar32 pinCodePoint(UChar32& c) {
55	if (c < UNICODESET_LOW) {
56	c = UNICODESET_LOW;
57	} else if (c > (UNICODESET_HIGH-`1`)) {
58	c = (UNICODESET_HIGH-`1`);
59	}
60	return c;
61	}
62
63	//----------------------------------------------------------------
64	// Debugging
65	//----------------------------------------------------------------
66
67	// DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
68	// To enable the debugging, define the symbol DEBUG_MEM in the line
69	// below. This will result in text being sent to stdout that looks
70	// like this:
71	// DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
72	// DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
73	// Each line lists a construction (ct) or destruction (dt) event, the
74	// object address, the number of outstanding objects after the event,
75	// and the pattern of the object in question.
76
77	// #define DEBUG_MEM
78
79	#ifdef DEBUG_MEM
80	#include <stdio.h>
81	static int32_t _dbgCount = `0`;
82
83	static inline void _dbgct(UnicodeSet* set) {
84	UnicodeString str;
85	set->toPattern(str, true);
86	char buf[`40`];
87	str.extract(`0`, `39`, buf, "");
88	printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
89	}
90
91	static inline void _dbgdt(UnicodeSet* set) {
92	UnicodeString str;
93	set->toPattern(str, true);
94	char buf[`40`];
95	str.extract(`0`, `39`, buf, "");
96	printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
97	}
98
99	#else
100
101	#define _dbgct(set)
102	#define _dbgdt(set)
103
104	#endif
105
106	//----------------------------------------------------------------
107	// UnicodeString in UVector support
108	//----------------------------------------------------------------
109
110	static void U_CALLCONV cloneUnicodeString(UElement dst, UElement src) {
111	dst->pointer = new UnicodeString ((UnicodeString)src->pointer);
112	}
113
114	static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
115	const UnicodeString &a = (const* UnicodeString*)t1.pointer;
116	const UnicodeString &b = (const* UnicodeString*)t2.pointer;
117	return a.compare(b);
118	}
119
120	UBool UnicodeSet::hasStrings() const {
121	return strings != nullptr && !strings->isEmpty();
122	}
123
124	int32_t UnicodeSet::stringsSize() const {
125	return strings == nullptr ? `0` : strings->size();
126	}
127
128	UBool UnicodeSet::stringsContains(const UnicodeString &s) const {
129	return strings != nullptr && strings->contains((void*) &s);
130	}
131
132	//----------------------------------------------------------------
133	// Constructors &c
134	//----------------------------------------------------------------
135
136	/**
137	* Constructs an empty set.
138	*/
139	UnicodeSet::UnicodeSet() {
140	list[`0`] = UNICODESET_HIGH;
141	_dbgct(this);
142	}
143
144	/**
145	* Constructs a set containing the given range. If <code>end >
146	* start</code> then an empty set is created.
147	*
148	* @param start first character, inclusive, of range
149	* @param end last character, inclusive, of range
150	*/
151	UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) {
152	list[`0`] = UNICODESET_HIGH;
153	add(start, end);
154	_dbgct(this);
155	}
156
157	/**
158	* Constructs a set that is identical to the given UnicodeSet.
159	*/
160	UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter (o) {
161	*this = o;
162	_dbgct(this);
163	}
164
165	// Copy-construct as thawed.
166	UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool / asThawed /) : UnicodeFilter (o) {
167	if (ensureCapacity(o.len)) {
168	// this = o except for bmpSet and stringSpan*
169	len = o.len;
170	uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
171	if (o.hasStrings()) {
172	UErrorCode status = U_ZERO_ERROR;
173	if (!allocateStrings(status) \|\|
174	(strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
175	setToBogus();
176	return;
177	}
178	}
179	if (o.pat) {
180	setPattern(o.pat, o.patLen);
181	}
182	_dbgct(this);
183	}
184	}
185
186	/**
187	* Destructs the set.
188	*/
189	UnicodeSet::~UnicodeSet() {
190	_dbgdt(this); // first!
191	if (list != stackList) {
192	uprv_free(list);
193	}
194	delete bmpSet;
195	if (buffer != stackList) {
196	uprv_free(buffer);
197	}
198	delete strings;
199	delete stringSpan;
200	releasePattern();
201	}
202
203	/**
204	* Assigns this object to be a copy of another.
205	*/
206	UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
207	return copyFrom(o, false);
208	}
209
210	UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
211	if (this == &o) {
212	return *this;
213	}
214	if (isFrozen()) {
215	return *this;
216	}
217	if (o.isBogus()) {
218	setToBogus();
219	return *this;
220	}
221	if (!ensureCapacity(o.len)) {
222	// ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens.
223	return *this;
224	}
225	len = o.len;
226	uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
227	if (o.bmpSet != nullptr && !asThawed) {
228	bmpSet = new BMPSet (*o.bmpSet, list, len);
229	if (bmpSet == nullptr) { // Check for memory allocation error.
230	setToBogus();
231	return *this;
232	}
233	}
234	if (o.hasStrings()) {
235	UErrorCode status = U_ZERO_ERROR;
236	if ((strings == nullptr && !allocateStrings(status)) \|\|
237	(strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
238	setToBogus();
239	return *this;
240	}
241	} else if (hasStrings()) {
242	strings->removeAllElements();
243	}
244	if (o.stringSpan != nullptr && !asThawed) {
245	stringSpan = new UnicodeSetStringSpan (o.stringSpan, strings);
246	if (stringSpan == nullptr) { // Check for memory allocation error.
247	setToBogus();
248	return *this;
249	}
250	}
251	releasePattern();
252	if (o.pat) {
253	setPattern(o.pat, o.patLen);
254	}
255	return *this;
256	}
257
258	/**
259	* Returns a copy of this object. All UnicodeMatcher objects have
260	* to support cloning in order to allow classes using
261	* UnicodeMatchers, such as Transliterator, to implement cloning.
262	*/
263	UnicodeSet* UnicodeSet::clone() const {
264	return new UnicodeSet (*this);
265	}
266
267	UnicodeSet UnicodeSet::cloneAsThawed() const* {
268	return new UnicodeSet (*this, true);
269	}
270
271	/**
272	* Compares the specified object with this set for equality. Returns
273	* <tt>true</tt> if the two sets
274	* have the same size, and every member of the specified set is
275	* contained in this set (or equivalently, every member of this set is
276	* contained in the specified set).
277	*
278	* @param o set to be compared for equality with this set.
279	* @return <tt>true</tt> if the specified set is equal to this set.
280	*/
281	bool UnicodeSet::operator==(const UnicodeSet& o) const {
282	if (len != o.len) return false;
283	for (int32_t i = `0`; i < len; ++i) {
284	if (list[i] != o.list[i]) return false;
285	}
286	if (hasStrings() != o.hasStrings()) { return false; }
287	if (hasStrings() && strings != o.strings) return false;
288	return true;
289	}
290
291	/**
292	* Returns the hash code value for this set.
293	*
294	* @return the hash code value for this set.
295	* @see Object#hashCode()
296	*/
297	int32_t UnicodeSet::hashCode() const {
298	uint32_t result = static_cast<uint32_t>(len);
299	for (int32_t i = `0`; i < len; ++i) {
300	result *= `1000003u`;
301	result += list[i];
302	}
303	return static_cast<int32_t>(result);
304	}
305
306	//----------------------------------------------------------------
307	// Public API
308	//----------------------------------------------------------------
309
310	/**
311	* Returns the number of elements in this set (its cardinality),
312	* Note than the elements of a set may include both individual
313	* codepoints and strings.
314	*
315	* @return the number of elements in this set (its cardinality).
316	*/
317	int32_t UnicodeSet::size() const {
318	int32_t n = `0`;
319	int32_t count = getRangeCount();
320	for (int32_t i = `0`; i < count; ++i) {
321	n += getRangeEnd(i) - getRangeStart(i) + `1`;
322	}
323	return n + stringsSize();
324	}
325
326	/**
327	* Returns <tt>true</tt> if this set contains no elements.
328	*
329	* @return <tt>true</tt> if this set contains no elements.
330	*/
331	UBool UnicodeSet::isEmpty() const {
332	return len == `1` && !hasStrings();
333	}
334
335	/**
336	* Returns true if this set contains the given character.
337	* @param c character to be checked for containment
338	* @return true if the test condition is met
339	*/
340	UBool UnicodeSet::contains(UChar32 c) const {
341	// Set i to the index of the start item greater than ch
342	// We know we will terminate without length test!
343	// LATER: for large sets, add binary search
344	//int32_t i = -1;
345	//for (;;) {
346	// if (c < list[++i]) break;
347	//}
348	if (bmpSet != nullptr) {
349	return bmpSet->contains(c);
350	}
351	if (stringSpan != nullptr) {
352	return stringSpan->contains(c);
353	}
354	if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
355	return false;
356	}
357	int32_t i = findCodePoint(c);
358	return (UBool)(i & `1`); // return true if odd
359	}
360
361	/**
362	* Returns the smallest value i such that c < list[i]. Caller
363	* must ensure that c is a legal value or this method will enter
364	* an infinite loop. This method performs a binary search.
365	* @param c a character in the range MIN_VALUE..MAX_VALUE
366	* inclusive
367	* @return the smallest integer i in the range 0..len-1,
368	* inclusive, such that c < list[i]
369	*/
370	int32_t UnicodeSet::findCodePoint(UChar32 c) const {
371	/ Examples:*
372	findCodePoint(c)
373	set list[] c=0 1 3 4 7 8
374	=== ============== ===========
375	[] [110000] 0 0 0 0 0 0
376	[\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
377	[\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
378	[:Any:] [0, 110000] 1 1 1 1 1 1
379	*/
380
381	// Return the smallest i such that c < list[i]. Assume
382	// list[len - 1] == HIGH and that c is legal (0..HIGH-1).
383	if (c < list[`0`])
384	return `0`;
385	// High runner test. c is often after the last range, so an
386	// initial check for this condition pays off.
387	int32_t lo = `0`;
388	int32_t hi = len - `1`;
389	if (lo >= hi \|\| c >= list[hi-`1`])
390	return hi;
391	// invariant: c >= list[lo]
392	// invariant: c < list[hi]
393	for (;;) {
394	int32_t i = (lo + hi) >> `1`;
395	if (i == lo) {
396	break; // Found!
397	} else if (c < list[i]) {
398	hi = i;
399	} else {
400	lo = i;
401	}
402	}
403	return hi;
404	}
405
406	/**
407	* Returns true if this set contains every character
408	* of the given range.
409	* @param start first character, inclusive, of the range
410	* @param end last character, inclusive, of the range
411	* @return true if the test condition is met
412	*/
413	UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
414	//int32_t i = -1;
415	//for (;;) {
416	// if (start < list[++i]) break;
417	//}
418	int32_t i = findCodePoint(start);
419	return ((i & `1`) != `0` && end < list[i]);
420	}
421
422	/**
423	* Returns <tt>true</tt> if this set contains the given
424	* multicharacter string.
425	* @param s string to be checked for containment
426	* @return <tt>true</tt> if this set contains the specified string
427	*/
428	UBool UnicodeSet::contains(const UnicodeString& s) const {
429	int32_t cp = getSingleCP(s);
430	if (cp < `0`) {
431	return stringsContains(s);
432	} else {
433	return contains((UChar32) cp);
434	}
435	}
436
437	/**
438	* Returns true if this set contains all the characters and strings
439	* of the given set.
440	* @param c set to be checked for containment
441	* @return true if the test condition is met
442	*/
443	UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
444	// The specified set is a subset if all of its pairs are contained in
445	// this set. It's possible to code this more efficiently in terms of
446	// direct manipulation of the inversion lists if the need arises.
447	int32_t n = c.getRangeCount();
448	for (int i=`0`; i<n; ++i) {
449	if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
450	return false;
451	}
452	}
453	return !c.hasStrings() \|\| (strings != nullptr && strings->containsAll(*c.strings));
454	}
455
456	/**
457	* Returns true if this set contains all the characters
458	* of the given string.
459	* @param s string containing characters to be checked for containment
460	* @return true if the test condition is met
461	*/
462	UBool UnicodeSet::containsAll(const UnicodeString& s) const {
463	return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
464	s.length());
465	}
466
467	/**
468	* Returns true if this set contains none of the characters
469	* of the given range.
470	* @param start first character, inclusive, of the range
471	* @param end last character, inclusive, of the range
472	* @return true if the test condition is met
473	*/
474	UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
475	//int32_t i = -1;
476	//for (;;) {
477	// if (start < list[++i]) break;
478	//}
479	int32_t i = findCodePoint(start);
480	return ((i & `1`) == `0` && end < list[i]);
481	}
482
483	/**
484	* Returns true if this set contains none of the characters and strings
485	* of the given set.
486	* @param c set to be checked for containment
487	* @return true if the test condition is met
488	*/
489	UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
490	// The specified set is a subset if all of its pairs are contained in
491	// this set. It's possible to code this more efficiently in terms of
492	// direct manipulation of the inversion lists if the need arises.
493	int32_t n = c.getRangeCount();
494	for (int32_t i=`0`; i<n; ++i) {
495	if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
496	return false;
497	}
498	}
499	return strings == nullptr \|\| !c.hasStrings() \|\| strings->containsNone(*c.strings);
500	}
501
502	/**
503	* Returns true if this set contains none of the characters
504	* of the given string.
505	* @param s string containing characters to be checked for containment
506	* @return true if the test condition is met
507	*/
508	UBool UnicodeSet::containsNone(const UnicodeString& s) const {
509	return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
510	s.length());
511	}
512
513	/**
514	* Returns <tt>true</tt> if this set contains any character whose low byte
515	* is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
516	* indexing.
517	*/
518	UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
519	/ The index value v, in the range [0,255], is contained in this set if*
520	* it is contained in any pair of this set. Pairs either have the high
521	* bytes equal, or unequal. If the high bytes are equal, then we have
522	* aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
523	* v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
524	* Then v is contained if xx <= v \|\| v <= yy. (This is identical to the
525	* time zone month containment logic.)
526	*/
527	int32_t i;
528	int32_t rangeCount=getRangeCount();
529	for (i=`0`; i<rangeCount; ++i) {
530	UChar32 low = getRangeStart(i);
531	UChar32 high = getRangeEnd(i);
532	if ((low & ~`0xFF`) == (high & ~`0xFF`)) {
533	if ((low & `0xFF`) <= v && v <= (high & `0xFF`)) {
534	return true;
535	}
536	} else if ((low & `0xFF`) <= v \|\| v <= (high & `0xFF`)) {
537	return true;
538	}
539	}
540	if (hasStrings()) {
541	for (i=`0`; i<strings->size(); ++i) {
542	const UnicodeString& s = (const* UnicodeString*)strings->elementAt(i);
543	if (s.isEmpty()) {
544	continue; // skip the empty string
545	}
546	UChar32 c = s.char32At(`0`);
547	if ((c & `0xFF`) == v) {
548	return true;
549	}
550	}
551	}
552	return false;
553	}
554
555	/**
556	* Implementation of UnicodeMatcher::matches(). Always matches the
557	* longest possible multichar string.
558	*/
559	UMatchDegree UnicodeSet::matches(const Replaceable& text,
560	int32_t& offset,
561	int32_t limit,
562	UBool incremental) {
563	if (offset == limit) {
564	if (contains(U_ETHER)) {
565	return incremental ? U_PARTIAL_MATCH : U_MATCH;
566	} else {
567	return U_MISMATCH;
568	}
569	} else {
570	if (hasStrings()) { // try strings first
571
572	// might separate forward and backward loops later
573	// for now they are combined
574
575	// TODO Improve efficiency of this, at least in the forward
576	// direction, if not in both. In the forward direction we
577	// can assume the strings are sorted.
578
579	int32_t i;
580	UBool forward = offset < limit;
581
582	// firstChar is the leftmost char to match in the
583	// forward direction or the rightmost char to match in
584	// the reverse direction.
585	char16_t firstChar = text.charAt(offset);
586
587	// If there are multiple strings that can match we
588	// return the longest match.
589	int32_t highWaterLength = `0`;
590
591	for (i=`0`; i<strings->size(); ++i) {
592	const UnicodeString& trial = (const* UnicodeString*)strings->elementAt(i);
593	if (trial.isEmpty()) {
594	continue; // skip the empty string
595	}
596
597	char16_t c = trial.charAt(forward ? `0` : trial.length() - `1`);
598
599	// Strings are sorted, so we can optimize in the
600	// forward direction.
601	if (forward && c > firstChar) break;
602	if (c != firstChar) continue;
603
604	int32_t matchLen = matchRest(text, offset, limit, trial);
605
606	if (incremental) {
607	int32_t maxLen = forward ? limit-offset : offset-limit;
608	if (matchLen == maxLen) {
609	// We have successfully matched but only up to limit.
610	return U_PARTIAL_MATCH;
611	}
612	}
613
614	if (matchLen == trial.length()) {
615	// We have successfully matched the whole string.
616	if (matchLen > highWaterLength) {
617	highWaterLength = matchLen;
618	}
619	// In the forward direction we know strings
620	// are sorted so we can bail early.
621	if (forward && matchLen < highWaterLength) {
622	break;
623	}
624	continue;
625	}
626	}
627
628	// We've checked all strings without a partial match.
629	// If we have full matches, return the longest one.
630	if (highWaterLength != `0`) {
631	offset += forward ? highWaterLength : -highWaterLength;
632	return U_MATCH;
633	}
634	}
635	return UnicodeFilter::matches(text, offset, limit, incremental);
636	}
637	}
638
639	/**
640	* Returns the longest match for s in text at the given position.
641	* If limit > start then match forward from start+1 to limit
642	* matching all characters except s.charAt(0). If limit < start,
643	* go backward starting from start-1 matching all characters
644	* except s.charAt(s.length()-1). This method assumes that the
645	* first character, text.charAt(start), matches s, so it does not
646	* check it.
647	* @param text the text to match
648	* @param start the first character to match. In the forward
649	* direction, text.charAt(start) is matched against s.charAt(0).
650	* In the reverse direction, it is matched against
651	* s.charAt(s.length()-1).
652	* @param limit the limit offset for matching, either last+1 in
653	* the forward direction, or last-1 in the reverse direction,
654	* where last is the index of the last character to match.
655	* @return If part of s matches up to the limit, return \|limit -
656	* start\|. If all of s matches before reaching the limit, return
657	* s.length(). If there is a mismatch between s and text, return
658	* 0
659	*/
660	int32_t UnicodeSet::matchRest(const Replaceable& text,
661	int32_t start, int32_t limit,
662	const UnicodeString& s) {
663	int32_t i;
664	int32_t maxLen;
665	int32_t slen = s.length();
666	if (start < limit) {
667	maxLen = limit - start;
668	if (maxLen > slen) maxLen = slen;
669	for (i = `1`; i < maxLen; ++i) {
670	if (text.charAt(start + i) != s.charAt(i)) return `0`;
671	}
672	} else {
673	maxLen = start - limit;
674	if (maxLen > slen) maxLen = slen;
675	--slen; // <=> slen = s.length() - 1;
676	for (i = `1`; i < maxLen; ++i) {
677	if (text.charAt(start - i) != s.charAt(slen - i)) return `0`;
678	}
679	}
680	return maxLen;
681	}
682
683	/**
684	* Implement of UnicodeMatcher
685	*/
686	void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
687	toUnionTo.addAll(*this);
688	}
689
690	/**
691	* Returns the index of the given character within this set, where
692	* the set is ordered by ascending code point. If the character
693	* is not in this set, return -1. The inverse of this method is
694	* <code>charAt()</code>.
695	* @return an index from 0..size()-1, or -1
696	*/
697	int32_t UnicodeSet::indexOf(UChar32 c) const {
698	if (c < MIN_VALUE \|\| c > MAX_VALUE) {
699	return -`1`;
700	}
701	int32_t i = `0`;
702	int32_t n = `0`;
703	for (;;) {
704	UChar32 start = list[i++];
705	if (c < start) {
706	return -`1`;
707	}
708	UChar32 limit = list[i++];
709	if (c < limit) {
710	return n + c - start;
711	}
712	n += limit - start;
713	}
714	}
715
716	/**
717	* Returns the character at the given index within this set, where
718	* the set is ordered by ascending code point. If the index is
719	* out of range, return (UChar32)-1. The inverse of this method is
720	* <code>indexOf()</code>.
721	* @param index an index from 0..size()-1
722	* @return the character at the given index, or (UChar32)-1.
723	*/
724	UChar32 UnicodeSet::charAt(int32_t index) const {
725	if (index >= `0`) {
726	// len2 is the largest even integer <= len, that is, it is len
727	// for even values and len-1 for odd values. With odd values
728	// the last entry is UNICODESET_HIGH.
729	int32_t len2 = len & ~`1`;
730	for (int32_t i=`0`; i < len2;) {
731	UChar32 start = list[i++];
732	int32_t count = list[i++] - start;
733	if (index < count) {
734	return (UChar32)(start + index);
735	}
736	index -= count;
737	}
738	}
739	return (UChar32)-`1`;
740	}
741
742	/**
743	* Make this object represent the range <code>start - end</code>.
744	* If <code>end > start</code> then this object is set to an
745	* an empty range.
746	*
747	* @param start first character in the set, inclusive
748	* @rparam end last character in the set, inclusive
749	*/
750	UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
751	clear();
752	complement(start, end);
753	return *this;
754	}
755
756	/**
757	* Adds the specified range to this set if it is not already
758	* present. If this set already contains the specified range,
759	* the call leaves this set unchanged. If <code>end > start</code>
760	* then an empty range is added, leaving the set unchanged.
761	*
762	* @param start first character, inclusive, of range to be added
763	* to this set.
764	* @param end last character, inclusive, of range to be added
765	* to this set.
766	*/
767	UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
768	if (pinCodePoint(start) < pinCodePoint(end)) {
769	UChar32 limit = end + `1`;
770	// Fast path for adding a new range after the last one.
771	// Odd list length: [..., lastStart, lastLimit, HIGH]
772	if ((len & `1`) != `0`) {
773	// If the list is empty, set lastLimit low enough to not be adjacent to 0.
774	UChar32 lastLimit = len == `1` ? -`2` : list[len - `2`];
775	if (lastLimit <= start && !isFrozen() && !isBogus()) {
776	if (lastLimit == start) {
777	// Extend the last range.
778	list[len - `2`] = limit;
779	if (limit == UNICODESET_HIGH) {
780	--len;
781	}
782	} else {
783	list[len - `1`] = start;
784	if (limit < UNICODESET_HIGH) {
785	if (ensureCapacity(len + `2`)) {
786	list[len++] = limit;
787	list[len++] = UNICODESET_HIGH;
788	}
789	} else { // limit == UNICODESET_HIGH
790	if (ensureCapacity(len + `1`)) {
791	list[len++] = UNICODESET_HIGH;
792	}
793	}
794	}
795	releasePattern();
796	return *this;
797	}
798	}
799	// This is slow. Could be much faster using findCodePoint(start)
800	// and modifying the list, dealing with adjacent & overlapping ranges.
801	UChar32 range[`3`] = { start, limit, UNICODESET_HIGH };
802	add(range, `2`, `0`);
803	} else if (start == end) {
804	add(start);
805	}
806	return *this;
807	}
808
809	// #define DEBUG_US_ADD
810
811	#ifdef DEBUG_US_ADD
812	#include <stdio.h>
813	void dump(UChar32 c) {
814	if (c <= `0xFF`) {
815	printf("%c", (char)c);
816	} else {
817	printf("U+%04X", c);
818	}
819	}
820	void dump(const UChar32* list, int32_t len) {
821	printf("[");
822	for (int32_t i=`0`; i<len; ++i) {
823	if (i != `0`) printf(", ");
824	dump(list[i]);
825	}
826	printf("]");
827	}
828	#endif
829
830	/**
831	* Adds the specified character to this set if it is not already
832	* present. If this set already contains the specified character,
833	* the call leaves this set unchanged.
834	*/
835	UnicodeSet& UnicodeSet::add(UChar32 c) {
836	// find smallest i such that c < list[i]
837	// if odd, then it is IN the set
838	// if even, then it is OUT of the set
839	int32_t i = findCodePoint(pinCodePoint(c));
840
841	// already in set?
842	if ((i & `1`) != `0` \|\| isFrozen() \|\| isBogus()) return *this;
843
844	// HIGH is 0x110000
845	// assert(list[len-1] == HIGH);
846
847	// empty = [HIGH]
848	// [start_0, limit_0, start_1, limit_1, HIGH]
849
850	// [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
851	// ^
852	// list[i]
853
854	// i == 0 means c is before the first range
855
856	#ifdef DEBUG_US_ADD
857	printf("Add of ");
858	dump(c);
859	printf(" found at %d", i);
860	printf(": ");
861	dump(list, len);
862	printf(" => ");
863	#endif
864
865	if (c == list[i]-`1`) {
866	// c is before start of next range
867	list[i] = c;
868	// if we touched the HIGH mark, then add a new one
869	if (c == (UNICODESET_HIGH - `1`)) {
870	if (!ensureCapacity(len+`1`)) {
871	// ensureCapacity will mark the object as Bogus if OOM failure happens.
872	return *this;
873	}
874	list[len++] = UNICODESET_HIGH;
875	}
876	if (i > `0` && c == list[i-`1`]) {
877	// collapse adjacent ranges
878
879	// [..., start_k-1, c, c, limit_k, ..., HIGH]
880	// ^
881	// list[i]
882
883	//for (int32_t k=i-1; k<len-2; ++k) {
884	// list[k] = list[k+2];
885	//}
886	UChar32* dst = list + i - `1`;
887	UChar32* src = dst + `2`;
888	UChar32* srclimit = list + len;
889	while (src < srclimit) (dst++) = (src++);
890
891	len -= `2`;
892	}
893	}
894
895	else if (i > `0` && c == list[i-`1`]) {
896	// c is after end of prior range
897	list[i-`1`]++;
898	// no need to check for collapse here
899	}
900
901	else {
902	// At this point we know the new char is not adjacent to
903	// any existing ranges, and it is not 10FFFF.
904
905
906	// [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
907	// ^
908	// list[i]
909
910	// [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
911	// ^
912	// list[i]
913
914	if (!ensureCapacity(len+`2`)) {
915	// ensureCapacity will mark the object as Bogus if OOM failure happens.
916	return *this;
917	}
918
919	UChar32 *p = list + i;
920	uprv_memmove(p + `2`, p, (len - i) * sizeof(*p));
921	list[i] = c;
922	list[i+`1`] = c+`1`;
923	len += `2`;
924	}
925
926	#ifdef DEBUG_US_ADD
927	dump(list, len);
928	printf("\n");
929
930	for (i=`1`; i<len; ++i) {
931	if (list[i] <= list[i-`1`]) {
932	// Corrupt array!
933	printf("ERROR: list has been corrupted\n");
934	exit(`1`);
935	}
936	}
937	#endif
938
939	releasePattern();
940	return *this;
941	}
942
943	/**
944	* Adds the specified multicharacter to this set if it is not already
945	* present. If this set already contains the multicharacter,
946	* the call leaves this set unchanged.
947	* Thus "ch" => {"ch"}
948	*
949	* @param s the source string
950	* @return the modified set, for chaining
951	*/
952	UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
953	if (isFrozen() \|\| isBogus()) return *this;
954	int32_t cp = getSingleCP(s);
955	if (cp < `0`) {
956	if (!stringsContains(s)) {
957	_add(s);
958	releasePattern();
959	}
960	} else {
961	add((UChar32)cp);
962	}
963	return *this;
964	}
965
966	/**
967	* Adds the given string, in order, to 'strings'. The given string
968	* must have been checked by the caller to not already be in 'strings'.
969	*/
970	void UnicodeSet::_add(const UnicodeString& s) {
971	if (isFrozen() \|\| isBogus()) {
972	return;
973	}
974	UErrorCode ec = U_ZERO_ERROR;
975	if (strings == nullptr && !allocateStrings(ec)) {
976	setToBogus();
977	return;
978	}
979	UnicodeString* t = new UnicodeString (s);
980	if (t == nullptr) { // Check for memory allocation error.
981	setToBogus();
982	return;
983	}
984	strings->sortedInsert(t, compareUnicodeString, ec);
985	if (U_FAILURE(ec)) {
986	setToBogus();
987	}
988	}
989
990	/**
991	* @return a code point IF the string consists of a single one.
992	* otherwise returns -1.
993	* @param string to test
994	*/
995	int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
996	int32_t sLength = s.length();
997	if (sLength == `1`) return s.charAt(`0`);
998	if (sLength == `2`) {
999	UChar32 cp = s.char32At(`0`);
1000	if (cp > `0xFFFF`) { // is surrogate pair
1001	return cp;
1002	}
1003	}
1004	return -`1`;
1005	}
1006
1007	/**
1008	* Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1009	* If this set already any particular character, it has no effect on that character.
1010	* @param the source string
1011	* @return the modified set, for chaining
1012	*/
1013	UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1014	UChar32 cp;
1015	for (int32_t i = `0`; i < s.length(); i += U16_LENGTH(cp)) {
1016	cp = s.char32At(i);
1017	add(cp);
1018	}
1019	return *this;
1020	}
1021
1022	/**
1023	* Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1024	* If this set already any particular character, it has no effect on that character.
1025	* @param the source string
1026	* @return the modified set, for chaining
1027	*/
1028	UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1029	UnicodeSet set;
1030	set.addAll(s);
1031	retainAll(set);
1032	return *this;
1033	}
1034
1035	/**
1036	* Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1037	* If this set already any particular character, it has no effect on that character.
1038	* @param the source string
1039	* @return the modified set, for chaining
1040	*/
1041	UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1042	UnicodeSet set;
1043	set.addAll(s);
1044	complementAll(set);
1045	return *this;
1046	}
1047
1048	/**
1049	* Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1050	* If this set already any particular character, it has no effect on that character.
1051	* @param the source string
1052	* @return the modified set, for chaining
1053	*/
1054	UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1055	UnicodeSet set;
1056	set.addAll(s);
1057	removeAll(set);
1058	return *this;
1059	}
1060
1061	UnicodeSet& UnicodeSet::removeAllStrings() {
1062	if (!isFrozen() && hasStrings()) {
1063	strings->removeAllElements();
1064	releasePattern();
1065	}
1066	return *this;
1067	}
1068
1069
1070	/**
1071	* Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1072	* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1073	* @param the source string
1074	* @return a newly created set containing the given string
1075	*/
1076	UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1077	UnicodeSet set = new* UnicodeSet ();
1078	if (set != nullptr) { // Check for memory allocation error.
1079	set->add(s);
1080	}
1081	return set;
1082	}
1083
1084
1085	/**
1086	* Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1087	* @param the source string
1088	* @return a newly created set containing the given characters
1089	*/
1090	UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1091	UnicodeSet set = new* UnicodeSet ();
1092	if (set != nullptr) { // Check for memory allocation error.
1093	set->addAll(s);
1094	}
1095	return set;
1096	}
1097
1098	/**
1099	* Retain only the elements in this set that are contained in the
1100	* specified range. If <code>end > start</code> then an empty range is
1101	* retained, leaving the set empty.
1102	*
1103	* @param start first character, inclusive, of range to be retained
1104	* to this set.
1105	* @param end last character, inclusive, of range to be retained
1106	* to this set.
1107	*/
1108	UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1109	if (pinCodePoint(start) <= pinCodePoint(end)) {
1110	UChar32 range[`3`] = { start, end+`1`, UNICODESET_HIGH };
1111	retain(range, `2`, `0`);
1112	} else {
1113	clear();
1114	}
1115	return *this;
1116	}
1117
1118	UnicodeSet& UnicodeSet::retain(UChar32 c) {
1119	return retain(c, c);
1120	}
1121
1122	UnicodeSet& UnicodeSet::retain(const UnicodeString &s) {
1123	if (isFrozen() \|\| isBogus()) { return *this; }
1124	UChar32 cp = getSingleCP(s);
1125	if (cp < `0`) {
1126	bool isIn = stringsContains(s);
1127	// Check for getRangeCount() first to avoid somewhat-expensive size()
1128	// when there are single code points.
1129	if (isIn && getRangeCount() == `0` && size() == `1`) {
1130	return *this;
1131	}
1132	clear();
1133	if (isIn) {
1134	_add(s);
1135	}
1136	} else {
1137	retain(cp, cp);
1138	}
1139	return *this;
1140	}
1141
1142	/**
1143	* Removes the specified range from this set if it is present.
1144	* The set will not contain the specified range once the call
1145	* returns. If <code>end > start</code> then an empty range is
1146	* removed, leaving the set unchanged.
1147	*
1148	* @param start first character, inclusive, of range to be removed
1149	* from this set.
1150	* @param end last character, inclusive, of range to be removed
1151	* from this set.
1152	*/
1153	UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1154	if (pinCodePoint(start) <= pinCodePoint(end)) {
1155	UChar32 range[`3`] = { start, end+`1`, UNICODESET_HIGH };
1156	retain(range, `2`, `2`);
1157	}
1158	return *this;
1159	}
1160
1161	/**
1162	* Removes the specified character from this set if it is present.
1163	* The set will not contain the specified range once the call
1164	* returns.
1165	*/
1166	UnicodeSet& UnicodeSet::remove(UChar32 c) {
1167	return remove(c, c);
1168	}
1169
1170	/**
1171	* Removes the specified string from this set if it is present.
1172	* The set will not contain the specified character once the call
1173	* returns.
1174	* @param the source string
1175	* @return the modified set, for chaining
1176	*/
1177	UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1178	if (isFrozen() \|\| isBogus()) return *this;
1179	int32_t cp = getSingleCP(s);
1180	if (cp < `0`) {
1181	if (strings != nullptr && strings->removeElement((void*) &s)) {
1182	releasePattern();
1183	}
1184	} else {
1185	remove((UChar32)cp, (UChar32)cp);
1186	}
1187	return *this;
1188	}
1189
1190	/**
1191	* Complements the specified range in this set. Any character in
1192	* the range will be removed if it is in this set, or will be
1193	* added if it is not in this set. If <code>end > start</code>
1194	* then an empty range is xor'ed, leaving the set unchanged.
1195	*
1196	* @param start first character, inclusive, of range to be removed
1197	* from this set.
1198	* @param end last character, inclusive, of range to be removed
1199	* from this set.
1200	*/
1201	UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1202	if (isFrozen() \|\| isBogus()) {
1203	return *this;
1204	}
1205	if (pinCodePoint(start) <= pinCodePoint(end)) {
1206	UChar32 range[`3`] = { start, end+`1`, UNICODESET_HIGH };
1207	exclusiveOr(range, `2`, `0`);
1208	}
1209	releasePattern();
1210	return *this;
1211	}
1212
1213	UnicodeSet& UnicodeSet::complement(UChar32 c) {
1214	return complement(c, c);
1215	}
1216
1217	/**
1218	* This is equivalent to
1219	* <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1220	*/
1221	UnicodeSet& UnicodeSet::complement() {
1222	if (isFrozen() \|\| isBogus()) {
1223	return *this;
1224	}
1225	if (list[`0`] == UNICODESET_LOW) {
1226	uprv_memmove(list, list + `1`, (size_t)(len-`1`)*sizeof(UChar32));
1227	--len;
1228	} else {
1229	if (!ensureCapacity(len+`1`)) {
1230	return *this;
1231	}
1232	uprv_memmove(list + `1`, list, (size_t)len*sizeof(UChar32));
1233	list[`0`] = UNICODESET_LOW;
1234	++len;
1235	}
1236	releasePattern();
1237	return *this;
1238	}
1239
1240	/**
1241	* Complement the specified string in this set.
1242	* The set will not contain the specified string once the call
1243	* returns.
1244	*
1245	* @param s the string to complement
1246	* @return this object, for chaining
1247	*/
1248	UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1249	if (isFrozen() \|\| isBogus()) return *this;
1250	int32_t cp = getSingleCP(s);
1251	if (cp < `0`) {
1252	if (stringsContains(s)) {
1253	strings->removeElement((void*) &s);
1254	} else {
1255	_add(s);
1256	}
1257	releasePattern();
1258	} else {
1259	complement((UChar32)cp, (UChar32)cp);
1260	}
1261	return *this;
1262	}
1263
1264	/**
1265	* Adds all of the elements in the specified set to this set if
1266	* they're not already present. This operation effectively
1267	* modifies this set so that its value is the <i>union</i> of the two
1268	* sets. The behavior of this operation is unspecified if the specified
1269	* collection is modified while the operation is in progress.
1270	*
1271	* @param c set whose elements are to be added to this set.
1272	* @see #add(char, char)
1273	*/
1274	UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1275	if ( c.len>`0` && c.list!=nullptr ) {
1276	add(c.list, c.len, `0`);
1277	}
1278
1279	// Add strings in order
1280	if ( c.strings!=nullptr ) {
1281	for (int32_t i=`0`; i<c.strings->size(); ++i) {
1282	const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1283	if (!stringsContains(*s)) {
1284	_add(*s);
1285	}
1286	}
1287	}
1288	return *this;
1289	}
1290
1291	/**
1292	* Retains only the elements in this set that are contained in the
1293	* specified set. In other words, removes from this set all of
1294	* its elements that are not contained in the specified set. This
1295	* operation effectively modifies this set so that its value is
1296	* the <i>intersection</i> of the two sets.
1297	*
1298	* @param c set that defines which elements this set will retain.
1299	*/
1300	UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1301	if (isFrozen() \|\| isBogus()) {
1302	return *this;
1303	}
1304	retain(c.list, c.len, `0`);
1305	if (hasStrings()) {
1306	if (!c.hasStrings()) {
1307	strings->removeAllElements();
1308	} else {
1309	strings->retainAll(*c.strings);
1310	}
1311	}
1312	return *this;
1313	}
1314
1315	/**
1316	* Removes from this set all of its elements that are contained in the
1317	* specified set. This operation effectively modifies this
1318	* set so that its value is the <i>asymmetric set difference</i> of
1319	* the two sets.
1320	*
1321	* @param c set that defines which elements will be removed from
1322	* this set.
1323	*/
1324	UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1325	if (isFrozen() \|\| isBogus()) {
1326	return *this;
1327	}
1328	retain(c.list, c.len, `2`);
1329	if (hasStrings() && c.hasStrings()) {
1330	strings->removeAll(*c.strings);
1331	}
1332	return *this;
1333	}
1334
1335	/**
1336	* Complements in this set all elements contained in the specified
1337	* set. Any character in the other set will be removed if it is
1338	* in this set, or will be added if it is not in this set.
1339	*
1340	* @param c set that defines which elements will be xor'ed from
1341	* this set.
1342	*/
1343	UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1344	if (isFrozen() \|\| isBogus()) {
1345	return *this;
1346	}
1347	exclusiveOr(c.list, c.len, `0`);
1348
1349	if (c.strings != nullptr) {
1350	for (int32_t i=`0`; i<c.strings->size(); ++i) {
1351	void* e = c.strings->elementAt(i);
1352	if (strings == nullptr \|\| !strings->removeElement(e)) {
1353	_add((const* UnicodeString*)e);
1354	}
1355	}
1356	}
1357	return *this;
1358	}
1359
1360	/**
1361	* Removes all of the elements from this set. This set will be
1362	* empty after this call returns.
1363	*/
1364	UnicodeSet& UnicodeSet::clear() {
1365	if (isFrozen()) {
1366	return *this;
1367	}
1368	list[`0`] = UNICODESET_HIGH;
1369	len = `1`;
1370	releasePattern();
1371	if (strings != nullptr) {
1372	strings->removeAllElements();
1373	}
1374	// Remove bogus
1375	fFlags = `0`;
1376	return *this;
1377	}
1378
1379	/**
1380	* Iteration method that returns the number of ranges contained in
1381	* this set.
1382	* @see #getRangeStart
1383	* @see #getRangeEnd
1384	*/
1385	int32_t UnicodeSet::getRangeCount() const {
1386	return len/`2`;
1387	}
1388
1389	/**
1390	* Iteration method that returns the first character in the
1391	* specified range of this set.
1392	* @see #getRangeCount
1393	* @see #getRangeEnd
1394	*/
1395	UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1396	return list[index*`2`];
1397	}
1398
1399	/**
1400	* Iteration method that returns the last character in the
1401	* specified range of this set.
1402	* @see #getRangeStart
1403	* @see #getRangeEnd
1404	*/
1405	UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1406	return list[index*`2` + `1`] - `1`;
1407	}
1408
1409	const UnicodeString* UnicodeSet::getString(int32_t index) const {
1410	return (const UnicodeString*) strings->elementAt(index);
1411	}
1412
1413	/**
1414	* Reallocate this objects internal structures to take up the least
1415	* possible space, without changing this object's value.
1416	*/
1417	UnicodeSet& UnicodeSet::compact() {
1418	if (isFrozen() \|\| isBogus()) {
1419	return *this;
1420	}
1421	// Delete buffer first to defragment memory less.
1422	if (buffer != stackList) {
1423	uprv_free(buffer);
1424	buffer = nullptr;
1425	bufferCapacity = `0`;
1426	}
1427	if (list == stackList) {
1428	// pass
1429	} else if (len <= INITIAL_CAPACITY) {
1430	uprv_memcpy(stackList, list, len * sizeof(UChar32));
1431	uprv_free(list);
1432	list = stackList;
1433	capacity = INITIAL_CAPACITY;
1434	} else if ((len + `7`) < capacity) {
1435	// If we have more than a little unused capacity, shrink it to len.
1436	UChar32* temp = (UChar32) uprv_realloc(list, sizeof(UChar32) len);
1437	if (temp) {
1438	list = temp;
1439	capacity = len;
1440	}
1441	// else what the heck happened?! We allocated less memory!
1442	// Oh well. We'll keep our original array.
1443	}
1444	if (strings != nullptr && strings->isEmpty()) {
1445	delete strings;
1446	strings = nullptr;
1447	}
1448	return *this;
1449	}
1450
1451	#ifdef DEBUG_SERIALIZE
1452	#include <stdio.h>
1453	#endif
1454
1455	/**
1456	* Deserialize constructor.
1457	*/
1458	UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization,
1459	UErrorCode &ec) {
1460
1461	if(U_FAILURE(ec)) {
1462	setToBogus();
1463	return;
1464	}
1465
1466	if( (serialization != kSerialized)
1467	\|\| (data==nullptr)
1468	\|\| (dataLen < `1`)) {
1469	ec = U_ILLEGAL_ARGUMENT_ERROR;
1470	setToBogus();
1471	return;
1472	}
1473
1474	// bmp?
1475	int32_t headerSize = ((data[`0`]&`0x8000`)) ?`2`:`1`;
1476	int32_t bmpLength = (headerSize==`1`)?data[`0`]:data[`1`];
1477
1478	int32_t newLength = (((data[`0`]&`0x7FFF`)-bmpLength)/`2`)+bmpLength;
1479	#ifdef DEBUG_SERIALIZE
1480	printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[`0`],data[`1`],data[`2`],data[`3`]);
1481	#endif
1482	if(!ensureCapacity(newLength + `1`)) { // +1 for HIGH
1483	return;
1484	}
1485	// copy bmp
1486	int32_t i;
1487	for(i = `0`; i< bmpLength;i++) {
1488	list[i] = data[i+headerSize];
1489	#ifdef DEBUG_SERIALIZE
1490	printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
1491	#endif
1492	}
1493	// copy smp
1494	for(i=bmpLength;i<newLength;i++) {
1495	list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*`2`+`0`] << `16`) +
1496	((UChar32)data[headerSize+bmpLength+(i-bmpLength)*`2`+`1`]);
1497	#ifdef DEBUG_SERIALIZE
1498	printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
1499	#endif
1500	}
1501	U_ASSERT(i == newLength);
1502	if (i == `0` \|\| list[i - `1`] != UNICODESET_HIGH) {
1503	list[i++] = UNICODESET_HIGH;
1504	}
1505	len = i;
1506	}
1507
1508
1509	int32_t UnicodeSet::serialize(uint16_t dest, int32_t destCapacity, UErrorCode& ec) const* {
1510	int32_t bmpLength, length, destLength;
1511
1512	if (U_FAILURE(ec)) {
1513	return `0`;
1514	}
1515
1516	if (destCapacity<`0` \|\| (destCapacity>`0` && dest==nullptr)) {
1517	ec=U_ILLEGAL_ARGUMENT_ERROR;
1518	return `0`;
1519	}
1520
1521	/ count necessary 16-bit units /
1522	length=this->len-`1`; // Subtract 1 to ignore final UNICODESET_HIGH
1523	// assert(length>=0);
1524	if (length==`0`) {
1525	/ empty set /
1526	if (destCapacity>`0`) {
1527	*dest=`0`;
1528	} else {
1529	ec=U_BUFFER_OVERFLOW_ERROR;
1530	}
1531	return `1`;
1532	}
1533	/ now length>0 /
1534
1535	if (this->list[length-`1`]<=`0xffff`) {
1536	/ all BMP /
1537	bmpLength=length;
1538	} else if (this->list[`0`]>=`0x10000`) {
1539	/ all supplementary /
1540	bmpLength=`0`;
1541	length*=`2`;
1542	} else {
1543	/ some BMP, some supplementary /
1544	for (bmpLength=`0`; bmpLength<length && this->list[bmpLength]<=`0xffff`; ++bmpLength) {}
1545	length=bmpLength+`2`*(length-bmpLength);
1546	}
1547	#ifdef DEBUG_SERIALIZE
1548	printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
1549	#endif
1550	/ length: number of 16-bit array units /
1551	if (length>`0x7fff`) {
1552	/ there are only 15 bits for the length in the first serialized word /
1553	ec=U_INDEX_OUTOFBOUNDS_ERROR;
1554	return `0`;
1555	}
1556
1557	/*
1558	* total serialized length:
1559	* number of 16-bit array units (length) +
1560	* 1 length unit (always) +
1561	* 1 bmpLength unit (if there are supplementary values)
1562	*/
1563	destLength=length+((length>bmpLength)?`2`:`1`);
1564	if (destLength<=destCapacity) {
1565	const UChar32 *p;
1566	int32_t i;
1567
1568	#ifdef DEBUG_SERIALIZE
1569	printf("writeHdr\n");
1570	#endif
1571	*dest=(uint16_t)length;
1572	if (length>bmpLength) {
1573	*dest\|=`0x8000`;
1574	*++dest=(uint16_t)bmpLength;
1575	}
1576	++dest;
1577
1578	/ write the BMP part of the array /
1579	p=this->list;
1580	for (i=`0`; i<bmpLength; ++i) {
1581	#ifdef DEBUG_SERIALIZE
1582	printf("writebmp: %x\n", (int)*p);
1583	#endif
1584	dest++=(uint16_t)p++;
1585	}
1586
1587	/ write the supplementary part of the array /
1588	for (; i<length; i+=`2`) {
1589	#ifdef DEBUG_SERIALIZE
1590	printf("write32: %x\n", (int)*p);
1591	#endif
1592	dest++=(uint16_t)(p>>`16`);
1593	dest++=(uint16_t)p++;
1594	}
1595	} else {
1596	ec=U_BUFFER_OVERFLOW_ERROR;
1597	}
1598	return destLength;
1599	}
1600
1601	//----------------------------------------------------------------
1602	// Implementation: Utility methods
1603	//----------------------------------------------------------------
1604
1605	/**
1606	* Allocate our strings vector and return true if successful.
1607	*/
1608	UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1609	if (U_FAILURE(status)) {
1610	return false;
1611	}
1612	strings = new UVector (uprv_deleteUObject,
1613	uhash_compareUnicodeString, `1`, status);
1614	if (strings == nullptr) { // Check for memory allocation error.
1615	status = U_MEMORY_ALLOCATION_ERROR;
1616	return false;
1617	}
1618	if (U_FAILURE(status)) {
1619	delete strings;
1620	strings = nullptr;
1621	return false;
1622	}
1623	return true;
1624	}
1625
1626	int32_t UnicodeSet::nextCapacity(int32_t minCapacity) {
1627	// Grow exponentially to reduce the frequency of allocations.
1628	if (minCapacity < INITIAL_CAPACITY) {
1629	return minCapacity + INITIAL_CAPACITY;
1630	} else if (minCapacity <= `2500`) {
1631	return `5` * minCapacity;
1632	} else {
1633	int32_t newCapacity = `2` * minCapacity;
1634	if (newCapacity > MAX_LENGTH) {
1635	newCapacity = MAX_LENGTH;
1636	}
1637	return newCapacity;
1638	}
1639	}
1640
1641	bool UnicodeSet::ensureCapacity(int32_t newLen) {
1642	if (newLen > MAX_LENGTH) {
1643	newLen = MAX_LENGTH;
1644	}
1645	if (newLen <= capacity) {
1646	return true;
1647	}
1648	int32_t newCapacity = nextCapacity(newLen);
1649	UChar32* temp = (UChar32) uprv_malloc(newCapacity sizeof(UChar32));
1650	if (temp == nullptr) {
1651	setToBogus(); // set the object to bogus state if an OOM failure occurred.
1652	return false;
1653	}
1654	// Copy only the actual contents.
1655	uprv_memcpy(temp, list, len * sizeof(UChar32));
1656	if (list != stackList) {
1657	uprv_free(list);
1658	}
1659	list = temp;
1660	capacity = newCapacity;
1661	return true;
1662	}
1663
1664	bool UnicodeSet::ensureBufferCapacity(int32_t newLen) {
1665	if (newLen > MAX_LENGTH) {
1666	newLen = MAX_LENGTH;
1667	}
1668	if (newLen <= bufferCapacity) {
1669	return true;
1670	}
1671	int32_t newCapacity = nextCapacity(newLen);
1672	UChar32* temp = (UChar32) uprv_malloc(newCapacity sizeof(UChar32));
1673	if (temp == nullptr) {
1674	setToBogus();
1675	return false;
1676	}
1677	// The buffer has no contents to be copied.
1678	// It is always filled from scratch after this call.
1679	if (buffer != stackList) {
1680	uprv_free(buffer);
1681	}
1682	buffer = temp;
1683	bufferCapacity = newCapacity;
1684	return true;
1685	}
1686
1687	/**
1688	* Swap list and buffer.
1689	*/
1690	void UnicodeSet::swapBuffers() {
1691	// swap list and buffer
1692	UChar32* temp = list;
1693	list = buffer;
1694	buffer = temp;
1695
1696	int32_t c = capacity;
1697	capacity = bufferCapacity;
1698	bufferCapacity = c;
1699	}
1700
1701	void UnicodeSet::setToBogus() {
1702	clear(); // Remove everything in the set.
1703	fFlags = kIsBogus;
1704	}
1705
1706	//----------------------------------------------------------------
1707	// Implementation: Fundamental operators
1708	//----------------------------------------------------------------
1709
1710	static inline UChar32 max(UChar32 a, UChar32 b) {
1711	return (a > b) ? a : b;
1712	}
1713
1714	// polarity = 0, 3 is normal: x xor y
1715	// polarity = 1, 2: x xor ~y == x === y
1716
1717	void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1718	if (isFrozen() \|\| isBogus()) {
1719	return;
1720	}
1721	if (!ensureBufferCapacity(len + otherLen)) {
1722	return;
1723	}
1724
1725	int32_t i = `0`, j = `0`, k = `0`;
1726	UChar32 a = list[i++];
1727	UChar32 b;
1728	if (polarity == `1` \|\| polarity == `2`) {
1729	b = UNICODESET_LOW;
1730	if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1731	++j;
1732	b = other[j];
1733	}
1734	} else {
1735	b = other[j++];
1736	}
1737	// simplest of all the routines
1738	// sort the values, discarding identicals!
1739	for (;;) {
1740	if (a < b) {
1741	buffer[k++] = a;
1742	a = list[i++];
1743	} else if (b < a) {
1744	buffer[k++] = b;
1745	b = other[j++];
1746	} else if (a != UNICODESET_HIGH) { // at this point, a == b
1747	// discard both values!
1748	a = list[i++];
1749	b = other[j++];
1750	} else { // DONE!
1751	buffer[k++] = UNICODESET_HIGH;
1752	len = k;
1753	break;
1754	}
1755	}
1756	swapBuffers();
1757	releasePattern();
1758	}
1759
1760	// polarity = 0 is normal: x union y
1761	// polarity = 2: x union ~y
1762	// polarity = 1: ~x union y
1763	// polarity = 3: ~x union ~y
1764
1765	void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1766	if (isFrozen() \|\| isBogus() \|\| other==nullptr) {
1767	return;
1768	}
1769	if (!ensureBufferCapacity(len + otherLen)) {
1770	return;
1771	}
1772
1773	int32_t i = `0`, j = `0`, k = `0`;
1774	UChar32 a = list[i++];
1775	UChar32 b = other[j++];
1776	// change from xor is that we have to check overlapping pairs
1777	// polarity bit 1 means a is second, bit 2 means b is.
1778	for (;;) {
1779	switch (polarity) {
1780	case `0`: // both first; take lower if unequal
1781	if (a < b) { // take a
1782	// Back up over overlapping ranges in buffer[]
1783	if (k > `0` && a <= buffer[k-`1`]) {
1784	// Pick latter end value in buffer[] vs. list[]
1785	a = max(list[i], buffer[--k]);
1786	} else {
1787	// No overlap
1788	buffer[k++] = a;
1789	a = list[i];
1790	}
1791	i++; // Common if/else code factored out
1792	polarity ^= `1`;
1793	} else if (b < a) { // take b
1794	if (k > `0` && b <= buffer[k-`1`]) {
1795	b = max(other[j], buffer[--k]);
1796	} else {
1797	buffer[k++] = b;
1798	b = other[j];
1799	}
1800	j++;
1801	polarity ^= `2`;
1802	} else { // a == b, take a, drop b
1803	if (a == UNICODESET_HIGH) goto loop_end;
1804	// This is symmetrical; it doesn't matter if
1805	// we backtrack with a or b. - liu
1806	if (k > `0` && a <= buffer[k-`1`]) {
1807	a = max(list[i], buffer[--k]);
1808	} else {
1809	// No overlap
1810	buffer[k++] = a;
1811	a = list[i];
1812	}
1813	i++;
1814	polarity ^= `1`;
1815	b = other[j++];
1816	polarity ^= `2`;
1817	}
1818	break;
1819	case `3`: // both second; take higher if unequal, and drop other
1820	if (b <= a) { // take a
1821	if (a == UNICODESET_HIGH) goto loop_end;
1822	buffer[k++] = a;
1823	} else { // take b
1824	if (b == UNICODESET_HIGH) goto loop_end;
1825	buffer[k++] = b;
1826	}
1827	a = list[i++];
1828	polarity ^= `1`; // factored common code
1829	b = other[j++];
1830	polarity ^= `2`;
1831	break;
1832	case `1`: // a second, b first; if b < a, overlap
1833	if (a < b) { // no overlap, take a
1834	buffer[k++] = a; a = list[i++]; polarity ^= `1`;
1835	} else if (b < a) { // OVERLAP, drop b
1836	b = other[j++];
1837	polarity ^= `2`;
1838	} else { // a == b, drop both!
1839	if (a == UNICODESET_HIGH) goto loop_end;
1840	a = list[i++];
1841	polarity ^= `1`;
1842	b = other[j++];
1843	polarity ^= `2`;
1844	}
1845	break;
1846	case `2`: // a first, b second; if a < b, overlap
1847	if (b < a) { // no overlap, take b
1848	buffer[k++] = b;
1849	b = other[j++];
1850	polarity ^= `2`;
1851	} else if (a < b) { // OVERLAP, drop a
1852	a = list[i++];
1853	polarity ^= `1`;
1854	} else { // a == b, drop both!
1855	if (a == UNICODESET_HIGH) goto loop_end;
1856	a = list[i++];
1857	polarity ^= `1`;
1858	b = other[j++];
1859	polarity ^= `2`;
1860	}
1861	break;
1862	}
1863	}
1864	loop_end:
1865	buffer[k++] = UNICODESET_HIGH; // terminate
1866	len = k;
1867	swapBuffers();
1868	releasePattern();
1869	}
1870
1871	// polarity = 0 is normal: x intersect y
1872	// polarity = 2: x intersect ~y == set-minus
1873	// polarity = 1: ~x intersect y
1874	// polarity = 3: ~x intersect ~y
1875
1876	void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1877	if (isFrozen() \|\| isBogus()) {
1878	return;
1879	}
1880	if (!ensureBufferCapacity(len + otherLen)) {
1881	return;
1882	}
1883
1884	int32_t i = `0`, j = `0`, k = `0`;
1885	UChar32 a = list[i++];
1886	UChar32 b = other[j++];
1887	// change from xor is that we have to check overlapping pairs
1888	// polarity bit 1 means a is second, bit 2 means b is.
1889	for (;;) {
1890	switch (polarity) {
1891	case `0`: // both first; drop the smaller
1892	if (a < b) { // drop a
1893	a = list[i++];
1894	polarity ^= `1`;
1895	} else if (b < a) { // drop b
1896	b = other[j++];
1897	polarity ^= `2`;
1898	} else { // a == b, take one, drop other
1899	if (a == UNICODESET_HIGH) goto loop_end;
1900	buffer[k++] = a;
1901	a = list[i++];
1902	polarity ^= `1`;
1903	b = other[j++];
1904	polarity ^= `2`;
1905	}
1906	break;
1907	case `3`: // both second; take lower if unequal
1908	if (a < b) { // take a
1909	buffer[k++] = a;
1910	a = list[i++];
1911	polarity ^= `1`;
1912	} else if (b < a) { // take b
1913	buffer[k++] = b;
1914	b = other[j++];
1915	polarity ^= `2`;
1916	} else { // a == b, take one, drop other
1917	if (a == UNICODESET_HIGH) goto loop_end;
1918	buffer[k++] = a;
1919	a = list[i++];
1920	polarity ^= `1`;
1921	b = other[j++];
1922	polarity ^= `2`;
1923	}
1924	break;
1925	case `1`: // a second, b first;
1926	if (a < b) { // NO OVERLAP, drop a
1927	a = list[i++];
1928	polarity ^= `1`;
1929	} else if (b < a) { // OVERLAP, take b
1930	buffer[k++] = b;
1931	b = other[j++];
1932	polarity ^= `2`;
1933	} else { // a == b, drop both!
1934	if (a == UNICODESET_HIGH) goto loop_end;
1935	a = list[i++];
1936	polarity ^= `1`;
1937	b = other[j++];
1938	polarity ^= `2`;
1939	}
1940	break;
1941	case `2`: // a first, b second; if a < b, overlap
1942	if (b < a) { // no overlap, drop b
1943	b = other[j++];
1944	polarity ^= `2`;
1945	} else if (a < b) { // OVERLAP, take a
1946	buffer[k++] = a;
1947	a = list[i++];
1948	polarity ^= `1`;
1949	} else { // a == b, drop both!
1950	if (a == UNICODESET_HIGH) goto loop_end;
1951	a = list[i++];
1952	polarity ^= `1`;
1953	b = other[j++];
1954	polarity ^= `2`;
1955	}
1956	break;
1957	}
1958	}
1959	loop_end:
1960	buffer[k++] = UNICODESET_HIGH; // terminate
1961	len = k;
1962	swapBuffers();
1963	releasePattern();
1964	}
1965
1966	/**
1967	* Append the <code>toPattern()</code> representation of a
1968	* string to the given <code>StringBuffer</code>.
1969	*/
1970	void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) {
1971	UChar32 cp;
1972	for (int32_t i = `0`; i < s.length(); i += U16_LENGTH(cp)) {
1973	_appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1974	}
1975	}
1976
1977	/**
1978	* Append the <code>toPattern()</code> representation of a
1979	* character to the given <code>StringBuffer</code>.
1980	*/
1981	void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) {
1982	if (escapeUnprintable ? ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
1983	// Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1984	// unprintable
1985	ICU_Utility::escape(buf, c);
1986	return;
1987	}
1988	// Okay to let ':' pass through
1989	switch (c) {
1990	case u`'['`:
1991	case u`']'`:
1992	case u`'-'`:
1993	case u`'^'`:
1994	case u`'&'`:
1995	case u`'\\'`:
1996	case u`'{'`:
1997	case u`'}'`:
1998	case u`':'`:
1999	case SymbolTable::SYMBOL_REF:
2000	buf.append(u`'\\'`);
2001	break;
2002	default:
2003	// Escape whitespace
2004	if (PatternProps::isWhiteSpace(c)) {
2005	buf.append(u`'\\'`);
2006	}
2007	break;
2008	}
2009	buf.append(c);
2010	}
2011
2012	void UnicodeSet::_appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
2013	UBool escapeUnprintable) {
2014	_appendToPat(result, start, escapeUnprintable);
2015	if (start != end) {
2016	if ((start+`1`) != end \|\|
2017	// Avoid writing what looks like a lead+trail surrogate pair.
2018	start == `0xdbff`) {
2019	result.append(u`'-'`);
2020	}
2021	_appendToPat(result, end, escapeUnprintable);
2022	}
2023	}
2024
2025	/**
2026	* Append a string representation of this set to result. This will be
2027	* a cleaned version of the string passed to applyPattern(), if there
2028	* is one. Otherwise it will be generated.
2029	*/
2030	UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
2031	UBool escapeUnprintable) const
2032	{
2033	if (pat != nullptr) {
2034	int32_t i;
2035	int32_t backslashCount = `0`;
2036	for (i=`0`; i<patLen; ) {
2037	UChar32 c;
2038	U16_NEXT(pat, i, patLen, c);
2039	if (escapeUnprintable ?
2040	ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
2041	// If the unprintable character is preceded by an odd
2042	// number of backslashes, then it has been escaped.
2043	// Before unescaping it, we delete the final
2044	// backslash.
2045	if ((backslashCount % `2`) == `1`) {
2046	result.truncate(result.length() - `1`);
2047	}
2048	ICU_Utility::escape(result, c);
2049	backslashCount = `0`;
2050	} else {
2051	result.append(c);
2052	if (c == u`'\\'`) {
2053	++backslashCount;
2054	} else {
2055	backslashCount = `0`;
2056	}
2057	}
2058	}
2059	return result;
2060	}
2061
2062	return _generatePattern(result, escapeUnprintable);
2063	}
2064
2065	/**
2066	* Returns a string representation of this set. If the result of
2067	* calling this function is passed to a UnicodeSet constructor, it
2068	* will produce another set that is equal to this one.
2069	*/
2070	UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
2071	UBool escapeUnprintable) const
2072	{
2073	result.truncate(`0`);
2074	return _toPattern(result, escapeUnprintable);
2075	}
2076
2077	/**
2078	* Generate and append a string representation of this set to result.
2079	* This does not use this.pat, the cleaned up copy of the string
2080	* passed to applyPattern().
2081	*/
2082	UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
2083	UBool escapeUnprintable) const
2084	{
2085	result.append(u`'['`);
2086
2087	int32_t i = `0`;
2088	int32_t limit = len & ~`1`; // = 2 getRangeCount()*
2089
2090	// If the set contains at least 2 intervals and includes both
2091	// MIN_VALUE and MAX_VALUE, then the inverse representation will
2092	// be more economical.
2093	// if (getRangeCount() >= 2 &&
2094	// getRangeStart(0) == MIN_VALUE &&
2095	// getRangeEnd(last) == MAX_VALUE)
2096	// Invariant: list[len-1] == HIGH == MAX_VALUE + 1
2097	// If limit == len then len is even and the last range ends with MAX_VALUE.
2098	//
2099	// But* do not write the inverse (complement) if there are strings.*
2100	// Since ICU 70, the '^' performs a code point complement which removes all strings.
2101	if (len >= `4` && list[`0`] == `0` && limit == len && !hasStrings()) {
2102	// Emit the inverse
2103	result.append(u`'^'`);
2104	// Offsetting the inversion list index by one lets us
2105	// iterate over the ranges of the set complement.
2106	i = `1`;
2107	--limit;
2108	}
2109
2110	// Emit the ranges as pairs.
2111	while (i < limit) {
2112	UChar32 start = list[i]; // getRangeStart()
2113	UChar32 end = list[i + `1`] - `1`; // getRangeEnd() = range limit minus one
2114	if (!(`0xd800` <= end && end <= `0xdbff`)) {
2115	_appendToPat(result, start, end, escapeUnprintable);
2116	i += `2`;
2117	} else {
2118	// The range ends with a lead surrogate.
2119	// Avoid writing what looks like a lead+trail surrogate pair.
2120	// 1. Postpone ranges that start with a lead surrogate code point.
2121	int32_t firstLead = i;
2122	while ((i += `2`) < limit && list[i] <= `0xdbff`) {}
2123	int32_t firstAfterLead = i;
2124	// 2. Write following ranges that start with a trail surrogate code point.
2125	while (i < limit && (start = list[i]) <= `0xdfff`) {
2126	_appendToPat(result, start, list[i + `1`] - `1`, escapeUnprintable);
2127	i += `2`;
2128	}
2129	// 3. Now write the postponed ranges.
2130	for (int j = firstLead; j < firstAfterLead; j += `2`) {
2131	_appendToPat(result, list[j], list[j + `1`] - `1`, escapeUnprintable);
2132	}
2133	}
2134	}
2135
2136	if (strings != nullptr) {
2137	for (int32_t i = `0`; i<strings->size(); ++i) {
2138	result.append(u`'{'`);
2139	_appendToPat(result,
2140	(const* UnicodeString*) strings->elementAt(i),
2141	escapeUnprintable);
2142	result.append(u`'}'`);
2143	}
2144	}
2145	return result.append(u`']'`);
2146	}
2147
2148	/**
2149	* Release existing cached pattern
2150	*/
2151	void UnicodeSet::releasePattern() {
2152	if (pat) {
2153	uprv_free(pat);
2154	pat = nullptr;
2155	patLen = `0`;
2156	}
2157	}
2158
2159	/**
2160	* Set the new pattern to cache.
2161	*/
2162	void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) {
2163	releasePattern();
2164	pat = (char16_t )uprv_malloc((newPatLen + `1`) sizeof(char16_t));
2165	if (pat) {
2166	patLen = newPatLen;
2167	u_memcpy(pat, newPat, patLen);
2168	pat[patLen] = `0`;
2169	}
2170	// else we don't care if malloc failed. This was just a nice cache.
2171	// We can regenerate an equivalent pattern later when requested.
2172	}
2173
2174	UnicodeSet *UnicodeSet::freeze() {
2175	if(!isFrozen() && !isBogus()) {
2176	compact();
2177
2178	// Optimize contains() and span() and similar functions.
2179	if (hasStrings()) {
2180	stringSpan = new UnicodeSetStringSpan (*this, *strings, UnicodeSetStringSpan::ALL);
2181	if (stringSpan == nullptr) {
2182	setToBogus();
2183	return this;
2184	} else if (!stringSpan->needsStringSpanUTF16()) {
2185	// All strings are irrelevant for span() etc. because
2186	// all of each string's code points are contained in this set.
2187	// Do not check needsStringSpanUTF8() because UTF-8 has at most as
2188	// many relevant strings as UTF-16.
2189	// (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2190	delete stringSpan;
2191	stringSpan = nullptr;
2192	}
2193	}
2194	if (stringSpan == nullptr) {
2195	// No span-relevant strings: Optimize for code point spans.
2196	bmpSet=new BMPSet (list, len);
2197	if (bmpSet == nullptr) { // Check for memory allocation error.
2198	setToBogus();
2199	}
2200	}
2201	}
2202	return this;
2203	}
2204
2205	int32_t UnicodeSet::span(const char16_t s, int32_t length, USetSpanCondition spanCondition) const* {
2206	if(length>`0` && bmpSet!=nullptr) {
2207	return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2208	}
2209	if(length<`0`) {
2210	length=u_strlen(s);
2211	}
2212	if(length==`0`) {
2213	return `0`;
2214	}
2215	if(stringSpan!=nullptr) {
2216	return stringSpan->span(s, length, spanCondition);
2217	} else if(hasStrings()) {
2218	uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2219	UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2220	UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2221	UnicodeSetStringSpan strSpan(*this, *strings, which);
2222	if(strSpan.needsStringSpanUTF16()) {
2223	return strSpan.span(s, length, spanCondition);
2224	}
2225	}
2226
2227	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2228	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2229	}
2230
2231	UChar32 c;
2232	int32_t start=`0`, prev=`0`;
2233	do {
2234	U16_NEXT(s, start, length, c);
2235	if(spanCondition!=contains(c)) {
2236	break;
2237	}
2238	} while((prev=start)<length);
2239	return prev;
2240	}
2241
2242	int32_t UnicodeSet::spanBack(const char16_t s, int32_t length, USetSpanCondition spanCondition) const* {
2243	if(length>`0` && bmpSet!=nullptr) {
2244	return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2245	}
2246	if(length<`0`) {
2247	length=u_strlen(s);
2248	}
2249	if(length==`0`) {
2250	return `0`;
2251	}
2252	if(stringSpan!=nullptr) {
2253	return stringSpan->spanBack(s, length, spanCondition);
2254	} else if(hasStrings()) {
2255	uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2256	UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2257	UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2258	UnicodeSetStringSpan strSpan(*this, *strings, which);
2259	if(strSpan.needsStringSpanUTF16()) {
2260	return strSpan.spanBack(s, length, spanCondition);
2261	}
2262	}
2263
2264	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2265	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2266	}
2267
2268	UChar32 c;
2269	int32_t prev=length;
2270	do {
2271	U16_PREV(s, `0`, length, c);
2272	if(spanCondition!=contains(c)) {
2273	break;
2274	}
2275	} while((prev=length)>`0`);
2276	return prev;
2277	}
2278
2279	int32_t UnicodeSet::spanUTF8(const char s, int32_t length, USetSpanCondition spanCondition) const* {
2280	if(length>`0` && bmpSet!=nullptr) {
2281	const uint8_t s0=(const* uint8_t *)s;
2282	return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2283	}
2284	if(length<`0`) {
2285	length=(int32_t)uprv_strlen(s);
2286	}
2287	if(length==`0`) {
2288	return `0`;
2289	}
2290	if(stringSpan!=nullptr) {
2291	return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2292	} else if(hasStrings()) {
2293	uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2294	UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2295	UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2296	UnicodeSetStringSpan strSpan(*this, *strings, which);
2297	if(strSpan.needsStringSpanUTF8()) {
2298	return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2299	}
2300	}
2301
2302	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2303	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2304	}
2305
2306	UChar32 c;
2307	int32_t start=`0`, prev=`0`;
2308	do {
2309	U8_NEXT_OR_FFFD(s, start, length, c);
2310	if(spanCondition!=contains(c)) {
2311	break;
2312	}
2313	} while((prev=start)<length);
2314	return prev;
2315	}
2316
2317	int32_t UnicodeSet::spanBackUTF8(const char s, int32_t length, USetSpanCondition spanCondition) const* {
2318	if(length>`0` && bmpSet!=nullptr) {
2319	const uint8_t s0=(const* uint8_t *)s;
2320	return bmpSet->spanBackUTF8(s0, length, spanCondition);
2321	}
2322	if(length<`0`) {
2323	length=(int32_t)uprv_strlen(s);
2324	}
2325	if(length==`0`) {
2326	return `0`;
2327	}
2328	if(stringSpan!=nullptr) {
2329	return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2330	} else if(hasStrings()) {
2331	uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2332	UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2333	UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2334	UnicodeSetStringSpan strSpan(*this, *strings, which);
2335	if(strSpan.needsStringSpanUTF8()) {
2336	return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2337	}
2338	}
2339
2340	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2341	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2342	}
2343
2344	UChar32 c;
2345	int32_t prev=length;
2346	do {
2347	U8_PREV_OR_FFFD(s, `0`, length, c);
2348	if(spanCondition!=contains(c)) {
2349	break;
2350	}
2351	} while((prev=length)>`0`);
2352	return prev;
2353	}
2354
2355	U_NAMESPACE_END
2356

Browse the source code of Godot/thirdparty/icu4c/common/uniset.cpp