uniset.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/uniset.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 1999-2015, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	* Date Name Description
9	* 10/20/99 alan Creation.
10	**********************************************************************
11	*/
12
13	#include "unicode/utypes.h"
14	#include "unicode/parsepos.h"
15	#include "unicode/symtable.h"
16	#include "unicode/uniset.h"
17	#include "unicode/ustring.h"
18	#include "unicode/utf8.h"
19	#include "unicode/utf16.h"
20	#include "ruleiter.h"
21	#include "cmemory.h"
22	#include "cstring.h"
23	#include "patternprops.h"
24	#include "uelement.h"
25	#include "util.h"
26	#include "uvector.h"
27	#include "charstr.h"
28	#include "ustrfmt.h"
29	#include "uassert.h"
30	#include "bmpset.h"
31	#include "unisetspan.h"
32
// Define UChar constants using hex for EBCDIC compatibility
// Used #define to reduce private static exports and memory access time.
// The trailing comment on each line shows the character the value stands for.
#define SET_OPEN        ((UChar)0x005B) /*[*/
#define SET_CLOSE       ((UChar)0x005D) /*]*/
#define HYPHEN          ((UChar)0x002D) /*-*/
#define COMPLEMENT      ((UChar)0x005E) /*^*/
#define COLON           ((UChar)0x003A) /*:*/
#define BACKSLASH       ((UChar)0x005C) /*\*/
#define INTERSECTION    ((UChar)0x0026) /*&*/
#define UPPER_U         ((UChar)0x0055) /*U*/
#define LOWER_U         ((UChar)0x0075) /*u*/
#define OPEN_BRACE      ((UChar)0x007B) /*{*/
#define CLOSE_BRACE     ((UChar)0x007D) /*}*/
#define UPPER_P         ((UChar)0x0050) /*P*/
#define LOWER_P         ((UChar)0x0070) /*p*/
#define UPPER_N         ((UChar)0x004E) /*N*/
#define EQUALS          ((UChar)0x003D) /*=*/

// HIGH_VALUE > all valid values. 110000 for codepoints
#define UNICODESET_HIGH 0x0110000

// LOW <= all valid values. ZERO for codepoints
#define UNICODESET_LOW 0x000000

/** Max list [0, 1, 2, ..., max code point, HIGH] */
constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1;
59
60	U_NAMESPACE_BEGIN
61
62	SymbolTable::~SymbolTable() {}
63
64	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
65
66	/**
67	* Modify the given UChar32 variable so that it is in range, by
68	* pinning values < UNICODESET_LOW to UNICODESET_LOW, and
69	* pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
70	* It modifies its argument in-place and also returns it.
71	*/
72	static inline UChar32 pinCodePoint(UChar32& c) {
73	if (c < UNICODESET_LOW) {
74	c = UNICODESET_LOW;
75	} else if (c > (UNICODESET_HIGH-`1`)) {
76	c = (UNICODESET_HIGH-`1`);
77	}
78	return c;
79	}
80
81	//----------------------------------------------------------------
82	// Debugging
83	//----------------------------------------------------------------
84
// DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
// To enable the debugging, define the symbol DEBUG_MEM in the line
// below.  This will result in text being sent to stdout that looks
// like this:
//   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
//   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
// Each line lists a construction (ct) or destruction (dt) event, the
// object address, the number of outstanding objects after the event,
// and the pattern of the object in question.
// The address is printed with %p, so its exact rendering (prefix, case,
// width) is implementation-defined and may differ from the sample above.

// #define DEBUG_MEM

#ifdef DEBUG_MEM
#include <stdio.h>
// Number of UnicodeSet objects currently alive, as seen by the hooks below.
static int32_t _dbgCount = 0;

// Logs a construction event for the given set and bumps the live count.
static inline void _dbgct(UnicodeSet* set) {
    UnicodeString str;
    set->toPattern(str, TRUE);
    char buf[40];
    // At most 39 characters are requested so that buf can hold the terminator.
    str.extract(0, 39, buf, "");
    // A pointer must be printed with %p (as void*); passing it to %X is
    // undefined behaviour.
    printf("DEBUG UnicodeSet: ct %p; %d %s\n", static_cast<void*>(set), ++_dbgCount, buf);
}

// Logs a destruction event for the given set and drops the live count.
static inline void _dbgdt(UnicodeSet* set) {
    UnicodeString str;
    set->toPattern(str, TRUE);
    char buf[40];
    str.extract(0, 39, buf, "");
    printf("DEBUG UnicodeSet: dt %p; %d %s\n", static_cast<void*>(set), --_dbgCount, buf);
}

#else

// Debugging disabled: the hooks expand to nothing.
#define _dbgct(set)
#define _dbgdt(set)

#endif
123
124	//----------------------------------------------------------------
125	// UnicodeString in UVector support
126	//----------------------------------------------------------------
127
128	static void U_CALLCONV cloneUnicodeString(UElement dst, UElement src) {
129	dst->pointer = new UnicodeString ((UnicodeString)src->pointer);
130	}
131
132	static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
133	const UnicodeString &a = (const* UnicodeString*)t1.pointer;
134	const UnicodeString &b = (const* UnicodeString*)t2.pointer;
135	return a.compare(b);
136	}
137
138	UBool UnicodeSet::hasStrings() const {
139	return strings != nullptr && !strings->isEmpty();
140	}
141
142	int32_t UnicodeSet::stringsSize() const {
143	return strings == nullptr ? `0` : strings->size();
144	}
145
146	UBool UnicodeSet::stringsContains(const UnicodeString &s) const {
147	return strings != nullptr && strings->contains((void*) &s);
148	}
149
150	//----------------------------------------------------------------
151	// Constructors &c
152	//----------------------------------------------------------------
153
154	/**
155	* Constructs an empty set.
156	*/
157	UnicodeSet::UnicodeSet() {
158	list[`0`] = UNICODESET_HIGH;
159	_dbgct(this);
160	}
161
162	/**
163	* Constructs a set containing the given range. If <code>end >
164	* start</code> then an empty set is created.
165	*
166	* @param start first character, inclusive, of range
167	* @param end last character, inclusive, of range
168	*/
169	UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) {
170	list[`0`] = UNICODESET_HIGH;
171	add(start, end);
172	_dbgct(this);
173	}
174
175	/**
176	* Constructs a set that is identical to the given UnicodeSet.
177	*/
178	UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter (o) {
179	*this = o;
180	_dbgct(this);
181	}
182
183	// Copy-construct as thawed.
184	UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool / asThawed /) : UnicodeFilter (o) {
185	if (ensureCapacity(o.len)) {
186	// this = o except for bmpSet and stringSpan*
187	len = o.len;
188	uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
189	if (o.hasStrings()) {
190	UErrorCode status = U_ZERO_ERROR;
191	if (!allocateStrings(status) \|\|
192	(strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
193	setToBogus();
194	return;
195	}
196	}
197	if (o.pat) {
198	setPattern(o.pat, o.patLen);
199	}
200	_dbgct(this);
201	}
202	}
203
204	/**
205	* Destructs the set.
206	*/
207	UnicodeSet::~UnicodeSet() {
208	_dbgdt(this); // first!
209	if (list != stackList) {
210	uprv_free(list);
211	}
212	delete bmpSet;
213	if (buffer != stackList) {
214	uprv_free(buffer);
215	}
216	delete strings;
217	delete stringSpan;
218	releasePattern();
219	}
220
221	/**
222	* Assigns this object to be a copy of another.
223	*/
224	UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
225	return copyFrom(o, FALSE);
226	}
227
228	UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
229	if (this == &o) {
230	return *this;
231	}
232	if (isFrozen()) {
233	return *this;
234	}
235	if (o.isBogus()) {
236	setToBogus();
237	return *this;
238	}
239	if (!ensureCapacity(o.len)) {
240	// ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens.
241	return *this;
242	}
243	len = o.len;
244	uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
245	if (o.bmpSet != nullptr && !asThawed) {
246	bmpSet = new BMPSet (*o.bmpSet, list, len);
247	if (bmpSet == NULL) { // Check for memory allocation error.
248	setToBogus();
249	return *this;
250	}
251	}
252	if (o.hasStrings()) {
253	UErrorCode status = U_ZERO_ERROR;
254	if ((strings == nullptr && !allocateStrings(status)) \|\|
255	(strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
256	setToBogus();
257	return *this;
258	}
259	} else if (hasStrings()) {
260	strings->removeAllElements();
261	}
262	if (o.stringSpan != nullptr && !asThawed) {
263	stringSpan = new UnicodeSetStringSpan (o.stringSpan, strings);
264	if (stringSpan == NULL) { // Check for memory allocation error.
265	setToBogus();
266	return *this;
267	}
268	}
269	releasePattern();
270	if (o.pat) {
271	setPattern(o.pat, o.patLen);
272	}
273	return *this;
274	}
275
276	/**
277	* Returns a copy of this object. All UnicodeMatcher objects have
278	* to support cloning in order to allow classes using
279	* UnicodeMatchers, such as Transliterator, to implement cloning.
280	*/
281	UnicodeSet* UnicodeSet::clone() const {
282	return new UnicodeSet (*this);
283	}
284
285	UnicodeSet UnicodeSet::cloneAsThawed() const* {
286	return new UnicodeSet (*this, TRUE);
287	}
288
289	/**
290	* Compares the specified object with this set for equality. Returns
291	* <tt>true</tt> if the two sets
292	* have the same size, and every member of the specified set is
293	* contained in this set (or equivalently, every member of this set is
294	* contained in the specified set).
295	*
296	* @param o set to be compared for equality with this set.
297	* @return <tt>true</tt> if the specified set is equal to this set.
298	*/
299	UBool UnicodeSet::operator==(const UnicodeSet& o) const {
300	if (len != o.len) return FALSE;
301	for (int32_t i = `0`; i < len; ++i) {
302	if (list[i] != o.list[i]) return FALSE;
303	}
304	if (hasStrings() != o.hasStrings()) { return FALSE; }
305	if (hasStrings() && strings != o.strings) return FALSE;
306	return TRUE;
307	}
308
309	/**
310	* Returns the hash code value for this set.
311	*
312	* @return the hash code value for this set.
313	* @see Object#hashCode()
314	*/
315	int32_t UnicodeSet::hashCode(void) const {
316	uint32_t result = static_cast<uint32_t>(len);
317	for (int32_t i = `0`; i < len; ++i) {
318	result *= `1000003u`;
319	result += list[i];
320	}
321	return static_cast<int32_t>(result);
322	}
323
324	//----------------------------------------------------------------
325	// Public API
326	//----------------------------------------------------------------
327
328	/**
329	* Returns the number of elements in this set (its cardinality),
330	* Note than the elements of a set may include both individual
331	* codepoints and strings.
332	*
333	* @return the number of elements in this set (its cardinality).
334	*/
335	int32_t UnicodeSet::size(void) const {
336	int32_t n = `0`;
337	int32_t count = getRangeCount();
338	for (int32_t i = `0`; i < count; ++i) {
339	n += getRangeEnd(i) - getRangeStart(i) + `1`;
340	}
341	return n + stringsSize();
342	}
343
344	/**
345	* Returns <tt>true</tt> if this set contains no elements.
346	*
347	* @return <tt>true</tt> if this set contains no elements.
348	*/
349	UBool UnicodeSet::isEmpty(void) const {
350	return len == `1` && !hasStrings();
351	}
352
353	/**
354	* Returns true if this set contains the given character.
355	* @param c character to be checked for containment
356	* @return true if the test condition is met
357	*/
358	UBool UnicodeSet::contains(UChar32 c) const {
359	// Set i to the index of the start item greater than ch
360	// We know we will terminate without length test!
361	// LATER: for large sets, add binary search
362	//int32_t i = -1;
363	//for (;;) {
364	// if (c < list[++i]) break;
365	//}
366	if (bmpSet != NULL) {
367	return bmpSet->contains(c);
368	}
369	if (stringSpan != NULL) {
370	return stringSpan->contains(c);
371	}
372	if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
373	return FALSE;
374	}
375	int32_t i = findCodePoint(c);
376	return (UBool)(i & `1`); // return true if odd
377	}
378
379	/**
380	* Returns the smallest value i such that c < list[i]. Caller
381	* must ensure that c is a legal value or this method will enter
382	* an infinite loop. This method performs a binary search.
383	* @param c a character in the range MIN_VALUE..MAX_VALUE
384	* inclusive
385	* @return the smallest integer i in the range 0..len-1,
386	* inclusive, such that c < list[i]
387	*/
388	int32_t UnicodeSet::findCodePoint(UChar32 c) const {
389	/ Examples:*
390	findCodePoint(c)
391	set list[] c=0 1 3 4 7 8
392	=== ============== ===========
393	[] [110000] 0 0 0 0 0 0
394	[\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
395	[\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
396	[:Any:] [0, 110000] 1 1 1 1 1 1
397	*/
398
399	// Return the smallest i such that c < list[i]. Assume
400	// list[len - 1] == HIGH and that c is legal (0..HIGH-1).
401	if (c < list[`0`])
402	return `0`;
403	// High runner test. c is often after the last range, so an
404	// initial check for this condition pays off.
405	int32_t lo = `0`;
406	int32_t hi = len - `1`;
407	if (lo >= hi \|\| c >= list[hi-`1`])
408	return hi;
409	// invariant: c >= list[lo]
410	// invariant: c < list[hi]
411	for (;;) {
412	int32_t i = (lo + hi) >> `1`;
413	if (i == lo) {
414	break; // Found!
415	} else if (c < list[i]) {
416	hi = i;
417	} else {
418	lo = i;
419	}
420	}
421	return hi;
422	}
423
424	/**
425	* Returns true if this set contains every character
426	* of the given range.
427	* @param start first character, inclusive, of the range
428	* @param end last character, inclusive, of the range
429	* @return true if the test condition is met
430	*/
431	UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
432	//int32_t i = -1;
433	//for (;;) {
434	// if (start < list[++i]) break;
435	//}
436	int32_t i = findCodePoint(start);
437	return ((i & `1`) != `0` && end < list[i]);
438	}
439
440	/**
441	* Returns <tt>true</tt> if this set contains the given
442	* multicharacter string.
443	* @param s string to be checked for containment
444	* @return <tt>true</tt> if this set contains the specified string
445	*/
446	UBool UnicodeSet::contains(const UnicodeString& s) const {
447	if (s.length() == `0`) return FALSE;
448	int32_t cp = getSingleCP(s);
449	if (cp < `0`) {
450	return stringsContains(s);
451	} else {
452	return contains((UChar32) cp);
453	}
454	}
455
456	/**
457	* Returns true if this set contains all the characters and strings
458	* of the given set.
459	* @param c set to be checked for containment
460	* @return true if the test condition is met
461	*/
462	UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
463	// The specified set is a subset if all of its pairs are contained in
464	// this set. It's possible to code this more efficiently in terms of
465	// direct manipulation of the inversion lists if the need arises.
466	int32_t n = c.getRangeCount();
467	for (int i=`0`; i<n; ++i) {
468	if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
469	return FALSE;
470	}
471	}
472	return !c.hasStrings() \|\| (strings != nullptr && strings->containsAll(*c.strings));
473	}
474
475	/**
476	* Returns true if this set contains all the characters
477	* of the given string.
478	* @param s string containing characters to be checked for containment
479	* @return true if the test condition is met
480	*/
481	UBool UnicodeSet::containsAll(const UnicodeString& s) const {
482	return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
483	s.length());
484	}
485
486	/**
487	* Returns true if this set contains none of the characters
488	* of the given range.
489	* @param start first character, inclusive, of the range
490	* @param end last character, inclusive, of the range
491	* @return true if the test condition is met
492	*/
493	UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
494	//int32_t i = -1;
495	//for (;;) {
496	// if (start < list[++i]) break;
497	//}
498	int32_t i = findCodePoint(start);
499	return ((i & `1`) == `0` && end < list[i]);
500	}
501
502	/**
503	* Returns true if this set contains none of the characters and strings
504	* of the given set.
505	* @param c set to be checked for containment
506	* @return true if the test condition is met
507	*/
508	UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
509	// The specified set is a subset if all of its pairs are contained in
510	// this set. It's possible to code this more efficiently in terms of
511	// direct manipulation of the inversion lists if the need arises.
512	int32_t n = c.getRangeCount();
513	for (int32_t i=`0`; i<n; ++i) {
514	if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
515	return FALSE;
516	}
517	}
518	return strings == nullptr \|\| !c.hasStrings() \|\| strings->containsNone(*c.strings);
519	}
520
521	/**
522	* Returns true if this set contains none of the characters
523	* of the given string.
524	* @param s string containing characters to be checked for containment
525	* @return true if the test condition is met
526	*/
527	UBool UnicodeSet::containsNone(const UnicodeString& s) const {
528	return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
529	s.length());
530	}
531
532	/**
533	* Returns <tt>true</tt> if this set contains any character whose low byte
534	* is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
535	* indexing.
536	*/
537	UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
538	/ The index value v, in the range [0,255], is contained in this set if*
539	* it is contained in any pair of this set. Pairs either have the high
540	* bytes equal, or unequal. If the high bytes are equal, then we have
541	* aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
542	* v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
543	* Then v is contained if xx <= v \|\| v <= yy. (This is identical to the
544	* time zone month containment logic.)
545	*/
546	int32_t i;
547	int32_t rangeCount=getRangeCount();
548	for (i=`0`; i<rangeCount; ++i) {
549	UChar32 low = getRangeStart(i);
550	UChar32 high = getRangeEnd(i);
551	if ((low & ~`0xFF`) == (high & ~`0xFF`)) {
552	if ((low & `0xFF`) <= v && v <= (high & `0xFF`)) {
553	return TRUE;
554	}
555	} else if ((low & `0xFF`) <= v \|\| v <= (high & `0xFF`)) {
556	return TRUE;
557	}
558	}
559	if (hasStrings()) {
560	for (i=`0`; i<strings->size(); ++i) {
561	const UnicodeString& s = (const* UnicodeString*)strings->elementAt(i);
562	//if (s.length() == 0) {
563	// // Empty strings match everything
564	// return TRUE;
565	//}
566	// assert(s.length() != 0); // We enforce this elsewhere
567	UChar32 c = s.char32At(`0`);
568	if ((c & `0xFF`) == v) {
569	return TRUE;
570	}
571	}
572	}
573	return FALSE;
574	}
575
576	/**
577	* Implementation of UnicodeMatcher::matches(). Always matches the
578	* longest possible multichar string.
579	*/
580	UMatchDegree UnicodeSet::matches(const Replaceable& text,
581	int32_t& offset,
582	int32_t limit,
583	UBool incremental) {
584	if (offset == limit) {
585	// Strings, if any, have length != 0, so we don't worry
586	// about them here. If we ever allow zero-length strings
587	// we much check for them here.
588	if (contains(U_ETHER)) {
589	return incremental ? U_PARTIAL_MATCH : U_MATCH;
590	} else {
591	return U_MISMATCH;
592	}
593	} else {
594	if (hasStrings()) { // try strings first
595
596	// might separate forward and backward loops later
597	// for now they are combined
598
599	// TODO Improve efficiency of this, at least in the forward
600	// direction, if not in both. In the forward direction we
601	// can assume the strings are sorted.
602
603	int32_t i;
604	UBool forward = offset < limit;
605
606	// firstChar is the leftmost char to match in the
607	// forward direction or the rightmost char to match in
608	// the reverse direction.
609	UChar firstChar = text.charAt(offset);
610
611	// If there are multiple strings that can match we
612	// return the longest match.
613	int32_t highWaterLength = `0`;
614
615	for (i=`0`; i<strings->size(); ++i) {
616	const UnicodeString& trial = (const* UnicodeString*)strings->elementAt(i);
617
618	//if (trial.length() == 0) {
619	// return U_MATCH; // null-string always matches
620	//}
621	// assert(trial.length() != 0); // We ensure this elsewhere
622
623	UChar c = trial.charAt(forward ? `0` : trial.length() - `1`);
624
625	// Strings are sorted, so we can optimize in the
626	// forward direction.
627	if (forward && c > firstChar) break;
628	if (c != firstChar) continue;
629
630	int32_t matchLen = matchRest(text, offset, limit, trial);
631
632	if (incremental) {
633	int32_t maxLen = forward ? limit-offset : offset-limit;
634	if (matchLen == maxLen) {
635	// We have successfully matched but only up to limit.
636	return U_PARTIAL_MATCH;
637	}
638	}
639
640	if (matchLen == trial.length()) {
641	// We have successfully matched the whole string.
642	if (matchLen > highWaterLength) {
643	highWaterLength = matchLen;
644	}
645	// In the forward direction we know strings
646	// are sorted so we can bail early.
647	if (forward && matchLen < highWaterLength) {
648	break;
649	}
650	continue;
651	}
652	}
653
654	// We've checked all strings without a partial match.
655	// If we have full matches, return the longest one.
656	if (highWaterLength != `0`) {
657	offset += forward ? highWaterLength : -highWaterLength;
658	return U_MATCH;
659	}
660	}
661	return UnicodeFilter::matches(text, offset, limit, incremental);
662	}
663	}
664
665	/**
666	* Returns the longest match for s in text at the given position.
667	* If limit > start then match forward from start+1 to limit
668	* matching all characters except s.charAt(0). If limit < start,
669	* go backward starting from start-1 matching all characters
670	* except s.charAt(s.length()-1). This method assumes that the
671	* first character, text.charAt(start), matches s, so it does not
672	* check it.
673	* @param text the text to match
674	* @param start the first character to match. In the forward
675	* direction, text.charAt(start) is matched against s.charAt(0).
676	* In the reverse direction, it is matched against
677	* s.charAt(s.length()-1).
678	* @param limit the limit offset for matching, either last+1 in
679	* the forward direction, or last-1 in the reverse direction,
680	* where last is the index of the last character to match.
681	* @return If part of s matches up to the limit, return \|limit -
682	* start\|. If all of s matches before reaching the limit, return
683	* s.length(). If there is a mismatch between s and text, return
684	* 0
685	*/
686	int32_t UnicodeSet::matchRest(const Replaceable& text,
687	int32_t start, int32_t limit,
688	const UnicodeString& s) {
689	int32_t i;
690	int32_t maxLen;
691	int32_t slen = s.length();
692	if (start < limit) {
693	maxLen = limit - start;
694	if (maxLen > slen) maxLen = slen;
695	for (i = `1`; i < maxLen; ++i) {
696	if (text.charAt(start + i) != s.charAt(i)) return `0`;
697	}
698	} else {
699	maxLen = start - limit;
700	if (maxLen > slen) maxLen = slen;
701	--slen; // <=> slen = s.length() - 1;
702	for (i = `1`; i < maxLen; ++i) {
703	if (text.charAt(start - i) != s.charAt(slen - i)) return `0`;
704	}
705	}
706	return maxLen;
707	}
708
709	/**
710	* Implement of UnicodeMatcher
711	*/
712	void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
713	toUnionTo.addAll(*this);
714	}
715
716	/**
717	* Returns the index of the given character within this set, where
718	* the set is ordered by ascending code point. If the character
719	* is not in this set, return -1. The inverse of this method is
720	* <code>charAt()</code>.
721	* @return an index from 0..size()-1, or -1
722	*/
723	int32_t UnicodeSet::indexOf(UChar32 c) const {
724	if (c < MIN_VALUE \|\| c > MAX_VALUE) {
725	return -`1`;
726	}
727	int32_t i = `0`;
728	int32_t n = `0`;
729	for (;;) {
730	UChar32 start = list[i++];
731	if (c < start) {
732	return -`1`;
733	}
734	UChar32 limit = list[i++];
735	if (c < limit) {
736	return n + c - start;
737	}
738	n += limit - start;
739	}
740	}
741
742	/**
743	* Returns the character at the given index within this set, where
744	* the set is ordered by ascending code point. If the index is
745	* out of range, return (UChar32)-1. The inverse of this method is
746	* <code>indexOf()</code>.
747	* @param index an index from 0..size()-1
748	* @return the character at the given index, or (UChar32)-1.
749	*/
750	UChar32 UnicodeSet::charAt(int32_t index) const {
751	if (index >= `0`) {
752	// len2 is the largest even integer <= len, that is, it is len
753	// for even values and len-1 for odd values. With odd values
754	// the last entry is UNICODESET_HIGH.
755	int32_t len2 = len & ~`1`;
756	for (int32_t i=`0`; i < len2;) {
757	UChar32 start = list[i++];
758	int32_t count = list[i++] - start;
759	if (index < count) {
760	return (UChar32)(start + index);
761	}
762	index -= count;
763	}
764	}
765	return (UChar32)-`1`;
766	}
767
768	/**
769	* Make this object represent the range <code>start - end</code>.
770	* If <code>end > start</code> then this object is set to an
771	* an empty range.
772	*
773	* @param start first character in the set, inclusive
774	* @rparam end last character in the set, inclusive
775	*/
776	UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
777	clear();
778	complement(start, end);
779	return *this;
780	}
781
782	/**
783	* Adds the specified range to this set if it is not already
784	* present. If this set already contains the specified range,
785	* the call leaves this set unchanged. If <code>end > start</code>
786	* then an empty range is added, leaving the set unchanged.
787	*
788	* @param start first character, inclusive, of range to be added
789	* to this set.
790	* @param end last character, inclusive, of range to be added
791	* to this set.
792	*/
793	UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
794	if (pinCodePoint(start) < pinCodePoint(end)) {
795	UChar32 limit = end + `1`;
796	// Fast path for adding a new range after the last one.
797	// Odd list length: [..., lastStart, lastLimit, HIGH]
798	if ((len & `1`) != `0`) {
799	// If the list is empty, set lastLimit low enough to not be adjacent to 0.
800	UChar32 lastLimit = len == `1` ? -`2` : list[len - `2`];
801	if (lastLimit <= start && !isFrozen() && !isBogus()) {
802	if (lastLimit == start) {
803	// Extend the last range.
804	list[len - `2`] = limit;
805	if (limit == UNICODESET_HIGH) {
806	--len;
807	}
808	} else {
809	list[len - `1`] = start;
810	if (limit < UNICODESET_HIGH) {
811	if (ensureCapacity(len + `2`)) {
812	list[len++] = limit;
813	list[len++] = UNICODESET_HIGH;
814	}
815	} else { // limit == UNICODESET_HIGH
816	if (ensureCapacity(len + `1`)) {
817	list[len++] = UNICODESET_HIGH;
818	}
819	}
820	}
821	releasePattern();
822	return *this;
823	}
824	}
825	// This is slow. Could be much faster using findCodePoint(start)
826	// and modifying the list, dealing with adjacent & overlapping ranges.
827	UChar32 range[`3`] = { start, limit, UNICODESET_HIGH };
828	add(range, `2`, `0`);
829	} else if (start == end) {
830	add(start);
831	}
832	return *this;
833	}
834
// Define DEBUG_US_ADD to trace every add(UChar32) call on stdout.
// #define DEBUG_US_ADD

#ifdef DEBUG_US_ADD
#include <stdio.h>
#include <stdlib.h>  // exit(), used by the consistency check in add(UChar32)

// Prints one code point: as a raw character up to 0xFF, else as U+XXXX.
void dump(UChar32 c) {
    if (c <= 0xFF) {
        printf("%c", (char)c);
    } else {
        // %X expects an unsigned int.
        printf("U+%04X", (unsigned int)c);
    }
}

// Prints the first len entries of an inversion list as "[a, b, ...]".
void dump(const UChar32* list, int32_t len) {
    printf("[");
    for (int32_t i=0; i<len; ++i) {
        if (i != 0) printf(", ");
        dump(list[i]);
    }
    printf("]");
}
#endif
855
856	/**
857	* Adds the specified character to this set if it is not already
858	* present. If this set already contains the specified character,
859	* the call leaves this set unchanged.
860	*/
861	UnicodeSet& UnicodeSet::add(UChar32 c) {
862	// find smallest i such that c < list[i]
863	// if odd, then it is IN the set
864	// if even, then it is OUT of the set
865	int32_t i = findCodePoint(pinCodePoint(c));
866
867	// already in set?
868	if ((i & `1`) != `0` \|\| isFrozen() \|\| isBogus()) return *this;
869
870	// HIGH is 0x110000
871	// assert(list[len-1] == HIGH);
872
873	// empty = [HIGH]
874	// [start_0, limit_0, start_1, limit_1, HIGH]
875
876	// [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
877	// ^
878	// list[i]
879
880	// i == 0 means c is before the first range
881
882	#ifdef DEBUG_US_ADD
883	printf("Add of ");
884	dump(c);
885	printf(" found at %d", i);
886	printf(": ");
887	dump(list, len);
888	printf(" => ");
889	#endif
890
891	if (c == list[i]-`1`) {
892	// c is before start of next range
893	list[i] = c;
894	// if we touched the HIGH mark, then add a new one
895	if (c == (UNICODESET_HIGH - `1`)) {
896	if (!ensureCapacity(len+`1`)) {
897	// ensureCapacity will mark the object as Bogus if OOM failure happens.
898	return *this;
899	}
900	list[len++] = UNICODESET_HIGH;
901	}
902	if (i > `0` && c == list[i-`1`]) {
903	// collapse adjacent ranges
904
905	// [..., start_k-1, c, c, limit_k, ..., HIGH]
906	// ^
907	// list[i]
908
909	//for (int32_t k=i-1; k<len-2; ++k) {
910	// list[k] = list[k+2];
911	//}
912	UChar32* dst = list + i - `1`;
913	UChar32* src = dst + `2`;
914	UChar32* srclimit = list + len;
915	while (src < srclimit) (dst++) = (src++);
916
917	len -= `2`;
918	}
919	}
920
921	else if (i > `0` && c == list[i-`1`]) {
922	// c is after end of prior range
923	list[i-`1`]++;
924	// no need to check for collapse here
925	}
926
927	else {
928	// At this point we know the new char is not adjacent to
929	// any existing ranges, and it is not 10FFFF.
930
931
932	// [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
933	// ^
934	// list[i]
935
936	// [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
937	// ^
938	// list[i]
939
940	if (!ensureCapacity(len+`2`)) {
941	// ensureCapacity will mark the object as Bogus if OOM failure happens.
942	return *this;
943	}
944
945	UChar32 *p = list + i;
946	uprv_memmove(p + `2`, p, (len - i) * sizeof(*p));
947	list[i] = c;
948	list[i+`1`] = c+`1`;
949	len += `2`;
950	}
951
952	#ifdef DEBUG_US_ADD
953	dump(list, len);
954	printf("\n");
955
956	for (i=`1`; i<len; ++i) {
957	if (list[i] <= list[i-`1`]) {
958	// Corrupt array!
959	printf("ERROR: list has been corrupted\n");
960	exit(`1`);
961	}
962	}
963	#endif
964
965	releasePattern();
966	return *this;
967	}
968
969	/**
970	* Adds the specified multicharacter to this set if it is not already
971	* present. If this set already contains the multicharacter,
972	* the call leaves this set unchanged.
973	* Thus "ch" => {"ch"}
974	* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
975	* @param s the source string
976	* @return the modified set, for chaining
977	*/
978	UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
979	if (s.length() == `0` \|\| isFrozen() \|\| isBogus()) return *this;
980	int32_t cp = getSingleCP(s);
981	if (cp < `0`) {
982	if (!stringsContains(s)) {
983	_add(s);
984	releasePattern();
985	}
986	} else {
987	add((UChar32)cp);
988	}
989	return *this;
990	}
991
992	/**
993	* Adds the given string, in order, to 'strings'. The given string
994	* must have been checked by the caller to not be empty and to not
995	* already be in 'strings'.
996	*/
997	void UnicodeSet::_add(const UnicodeString& s) {
998	if (isFrozen() \|\| isBogus()) {
999	return;
1000	}
1001	UErrorCode ec = U_ZERO_ERROR;
1002	if (strings == nullptr && !allocateStrings(ec)) {
1003	setToBogus();
1004	return;
1005	}
1006	UnicodeString* t = new UnicodeString (s);
1007	if (t == NULL) { // Check for memory allocation error.
1008	setToBogus();
1009	return;
1010	}
1011	strings->sortedInsert(t, compareUnicodeString, ec);
1012	if (U_FAILURE(ec)) {
1013	setToBogus();
1014	delete t;
1015	}
1016	}
1017
1018	/**
1019	* @return a code point IF the string consists of a single one.
1020	* otherwise returns -1.
1021	* @param string to test
1022	*/
1023	int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
1024	//if (s.length() < 1) {
1025	// throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1026	//}
1027	if (s.length() > `2`) return -`1`;
1028	if (s.length() == `1`) return s.charAt(`0`);
1029
1030	// at this point, len = 2
1031	UChar32 cp = s.char32At(`0`);
1032	if (cp > `0xFFFF`) { // is surrogate pair
1033	return cp;
1034	}
1035	return -`1`;
1036	}
1037
1038	/**
1039	* Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1040	* If this set already any particular character, it has no effect on that character.
1041	* @param the source string
1042	* @return the modified set, for chaining
1043	*/
1044	UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1045	UChar32 cp;
1046	for (int32_t i = `0`; i < s.length(); i += U16_LENGTH(cp)) {
1047	cp = s.char32At(i);
1048	add(cp);
1049	}
1050	return *this;
1051	}
1052
1053	/**
1054	* Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1055	* If this set already any particular character, it has no effect on that character.
1056	* @param the source string
1057	* @return the modified set, for chaining
1058	*/
1059	UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1060	UnicodeSet set;
1061	set.addAll(s);
1062	retainAll(set);
1063	return *this;
1064	}
1065
1066	/**
1067	* Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1068	* If this set already any particular character, it has no effect on that character.
1069	* @param the source string
1070	* @return the modified set, for chaining
1071	*/
1072	UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1073	UnicodeSet set;
1074	set.addAll(s);
1075	complementAll(set);
1076	return *this;
1077	}
1078
1079	/**
1080	* Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1081	* If this set already any particular character, it has no effect on that character.
1082	* @param the source string
1083	* @return the modified set, for chaining
1084	*/
1085	UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1086	UnicodeSet set;
1087	set.addAll(s);
1088	removeAll(set);
1089	return *this;
1090	}
1091
1092	UnicodeSet& UnicodeSet::removeAllStrings() {
1093	if (!isFrozen() && hasStrings()) {
1094	strings->removeAllElements();
1095	releasePattern();
1096	}
1097	return *this;
1098	}
1099
1100
1101	/**
1102	* Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1103	* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1104	* @param the source string
1105	* @return a newly created set containing the given string
1106	*/
1107	UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1108	UnicodeSet set = new* UnicodeSet ();
1109	if (set != NULL) { // Check for memory allocation error.
1110	set->add(s);
1111	}
1112	return set;
1113	}
1114
1115
1116	/**
1117	* Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1118	* @param the source string
1119	* @return a newly created set containing the given characters
1120	*/
1121	UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1122	UnicodeSet set = new* UnicodeSet ();
1123	if (set != NULL) { // Check for memory allocation error.
1124	set->addAll(s);
1125	}
1126	return set;
1127	}
1128
1129	/**
1130	* Retain only the elements in this set that are contained in the
1131	* specified range. If <code>end > start</code> then an empty range is
1132	* retained, leaving the set empty.
1133	*
1134	* @param start first character, inclusive, of range to be retained
1135	* to this set.
1136	* @param end last character, inclusive, of range to be retained
1137	* to this set.
1138	*/
1139	UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1140	if (pinCodePoint(start) <= pinCodePoint(end)) {
1141	UChar32 range[`3`] = { start, end+`1`, UNICODESET_HIGH };
1142	retain(range, `2`, `0`);
1143	} else {
1144	clear();
1145	}
1146	return *this;
1147	}
1148
1149	UnicodeSet& UnicodeSet::retain(UChar32 c) {
1150	return retain(c, c);
1151	}
1152
1153	/**
1154	* Removes the specified range from this set if it is present.
1155	* The set will not contain the specified range once the call
1156	* returns. If <code>end > start</code> then an empty range is
1157	* removed, leaving the set unchanged.
1158	*
1159	* @param start first character, inclusive, of range to be removed
1160	* from this set.
1161	* @param end last character, inclusive, of range to be removed
1162	* from this set.
1163	*/
1164	UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1165	if (pinCodePoint(start) <= pinCodePoint(end)) {
1166	UChar32 range[`3`] = { start, end+`1`, UNICODESET_HIGH };
1167	retain(range, `2`, `2`);
1168	}
1169	return *this;
1170	}
1171
1172	/**
1173	* Removes the specified character from this set if it is present.
1174	* The set will not contain the specified range once the call
1175	* returns.
1176	*/
1177	UnicodeSet& UnicodeSet::remove(UChar32 c) {
1178	return remove(c, c);
1179	}
1180
1181	/**
1182	* Removes the specified string from this set if it is present.
1183	* The set will not contain the specified character once the call
1184	* returns.
1185	* @param the source string
1186	* @return the modified set, for chaining
1187	*/
1188	UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1189	if (s.length() == `0` \|\| isFrozen() \|\| isBogus()) return *this;
1190	int32_t cp = getSingleCP(s);
1191	if (cp < `0`) {
1192	if (strings != nullptr && strings->removeElement((void*) &s)) {
1193	releasePattern();
1194	}
1195	} else {
1196	remove((UChar32)cp, (UChar32)cp);
1197	}
1198	return *this;
1199	}
1200
1201	/**
1202	* Complements the specified range in this set. Any character in
1203	* the range will be removed if it is in this set, or will be
1204	* added if it is not in this set. If <code>end > start</code>
1205	* then an empty range is xor'ed, leaving the set unchanged.
1206	*
1207	* @param start first character, inclusive, of range to be removed
1208	* from this set.
1209	* @param end last character, inclusive, of range to be removed
1210	* from this set.
1211	*/
1212	UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1213	if (isFrozen() \|\| isBogus()) {
1214	return *this;
1215	}
1216	if (pinCodePoint(start) <= pinCodePoint(end)) {
1217	UChar32 range[`3`] = { start, end+`1`, UNICODESET_HIGH };
1218	exclusiveOr(range, `2`, `0`);
1219	}
1220	releasePattern();
1221	return *this;
1222	}
1223
1224	UnicodeSet& UnicodeSet::complement(UChar32 c) {
1225	return complement(c, c);
1226	}
1227
1228	/**
1229	* This is equivalent to
1230	* <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1231	*/
1232	UnicodeSet& UnicodeSet::complement(void) {
1233	if (isFrozen() \|\| isBogus()) {
1234	return *this;
1235	}
1236	if (list[`0`] == UNICODESET_LOW) {
1237	uprv_memmove(list, list + `1`, (size_t)(len-`1`)*sizeof(UChar32));
1238	--len;
1239	} else {
1240	if (!ensureCapacity(len+`1`)) {
1241	return *this;
1242	}
1243	uprv_memmove(list + `1`, list, (size_t)len*sizeof(UChar32));
1244	list[`0`] = UNICODESET_LOW;
1245	++len;
1246	}
1247	releasePattern();
1248	return *this;
1249	}
1250
1251	/**
1252	* Complement the specified string in this set.
1253	* The set will not contain the specified string once the call
1254	* returns.
1255	* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1256	* @param s the string to complement
1257	* @return this object, for chaining
1258	*/
1259	UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1260	if (s.length() == `0` \|\| isFrozen() \|\| isBogus()) return *this;
1261	int32_t cp = getSingleCP(s);
1262	if (cp < `0`) {
1263	if (stringsContains(s)) {
1264	strings->removeElement((void*) &s);
1265	} else {
1266	_add(s);
1267	}
1268	releasePattern();
1269	} else {
1270	complement((UChar32)cp, (UChar32)cp);
1271	}
1272	return *this;
1273	}
1274
1275	/**
1276	* Adds all of the elements in the specified set to this set if
1277	* they're not already present. This operation effectively
1278	* modifies this set so that its value is the <i>union</i> of the two
1279	* sets. The behavior of this operation is unspecified if the specified
1280	* collection is modified while the operation is in progress.
1281	*
1282	* @param c set whose elements are to be added to this set.
1283	* @see #add(char, char)
1284	*/
1285	UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1286	if ( c.len>`0` && c.list!=NULL ) {
1287	add(c.list, c.len, `0`);
1288	}
1289
1290	// Add strings in order
1291	if ( c.strings!=NULL ) {
1292	for (int32_t i=`0`; i<c.strings->size(); ++i) {
1293	const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1294	if (!stringsContains(*s)) {
1295	_add(*s);
1296	}
1297	}
1298	}
1299	return *this;
1300	}
1301
1302	/**
1303	* Retains only the elements in this set that are contained in the
1304	* specified set. In other words, removes from this set all of
1305	* its elements that are not contained in the specified set. This
1306	* operation effectively modifies this set so that its value is
1307	* the <i>intersection</i> of the two sets.
1308	*
1309	* @param c set that defines which elements this set will retain.
1310	*/
1311	UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1312	if (isFrozen() \|\| isBogus()) {
1313	return *this;
1314	}
1315	retain(c.list, c.len, `0`);
1316	if (hasStrings()) {
1317	if (!c.hasStrings()) {
1318	strings->removeAllElements();
1319	} else {
1320	strings->retainAll(*c.strings);
1321	}
1322	}
1323	return *this;
1324	}
1325
1326	/**
1327	* Removes from this set all of its elements that are contained in the
1328	* specified set. This operation effectively modifies this
1329	* set so that its value is the <i>asymmetric set difference</i> of
1330	* the two sets.
1331	*
1332	* @param c set that defines which elements will be removed from
1333	* this set.
1334	*/
1335	UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1336	if (isFrozen() \|\| isBogus()) {
1337	return *this;
1338	}
1339	retain(c.list, c.len, `2`);
1340	if (hasStrings() && c.hasStrings()) {
1341	strings->removeAll(*c.strings);
1342	}
1343	return *this;
1344	}
1345
1346	/**
1347	* Complements in this set all elements contained in the specified
1348	* set. Any character in the other set will be removed if it is
1349	* in this set, or will be added if it is not in this set.
1350	*
1351	* @param c set that defines which elements will be xor'ed from
1352	* this set.
1353	*/
1354	UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1355	if (isFrozen() \|\| isBogus()) {
1356	return *this;
1357	}
1358	exclusiveOr(c.list, c.len, `0`);
1359
1360	if (c.strings != nullptr) {
1361	for (int32_t i=`0`; i<c.strings->size(); ++i) {
1362	void* e = c.strings->elementAt(i);
1363	if (strings == nullptr \|\| !strings->removeElement(e)) {
1364	_add((const* UnicodeString*)e);
1365	}
1366	}
1367	}
1368	return *this;
1369	}
1370
1371	/**
1372	* Removes all of the elements from this set. This set will be
1373	* empty after this call returns.
1374	*/
1375	UnicodeSet& UnicodeSet::clear(void) {
1376	if (isFrozen()) {
1377	return *this;
1378	}
1379	list[`0`] = UNICODESET_HIGH;
1380	len = `1`;
1381	releasePattern();
1382	if (strings != NULL) {
1383	strings->removeAllElements();
1384	}
1385	// Remove bogus
1386	fFlags = `0`;
1387	return *this;
1388	}
1389
1390	/**
1391	* Iteration method that returns the number of ranges contained in
1392	* this set.
1393	* @see #getRangeStart
1394	* @see #getRangeEnd
1395	*/
1396	int32_t UnicodeSet::getRangeCount() const {
1397	return len/`2`;
1398	}
1399
1400	/**
1401	* Iteration method that returns the first character in the
1402	* specified range of this set.
1403	* @see #getRangeCount
1404	* @see #getRangeEnd
1405	*/
1406	UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1407	return list[index*`2`];
1408	}
1409
1410	/**
1411	* Iteration method that returns the last character in the
1412	* specified range of this set.
1413	* @see #getRangeStart
1414	* @see #getRangeEnd
1415	*/
1416	UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1417	return list[index*`2` + `1`] - `1`;
1418	}
1419
1420	const UnicodeString* UnicodeSet::getString(int32_t index) const {
1421	return (const UnicodeString*) strings->elementAt(index);
1422	}
1423
1424	/**
1425	* Reallocate this objects internal structures to take up the least
1426	* possible space, without changing this object's value.
1427	*/
1428	UnicodeSet& UnicodeSet::compact() {
1429	if (isFrozen() \|\| isBogus()) {
1430	return *this;
1431	}
1432	// Delete buffer first to defragment memory less.
1433	if (buffer != stackList) {
1434	uprv_free(buffer);
1435	buffer = NULL;
1436	bufferCapacity = `0`;
1437	}
1438	if (list == stackList) {
1439	// pass
1440	} else if (len <= INITIAL_CAPACITY) {
1441	uprv_memcpy(stackList, list, len * sizeof(UChar32));
1442	uprv_free(list);
1443	list = stackList;
1444	capacity = INITIAL_CAPACITY;
1445	} else if ((len + `7`) < capacity) {
1446	// If we have more than a little unused capacity, shrink it to len.
1447	UChar32* temp = (UChar32) uprv_realloc(list, sizeof(UChar32) len);
1448	if (temp) {
1449	list = temp;
1450	capacity = len;
1451	}
1452	// else what the heck happened?! We allocated less memory!
1453	// Oh well. We'll keep our original array.
1454	}
1455	if (strings != nullptr && strings->isEmpty()) {
1456	delete strings;
1457	strings = nullptr;
1458	}
1459	return *this;
1460	}
1461
1462	#ifdef DEBUG_SERIALIZE
1463	#include <stdio.h>
1464	#endif
1465
1466	/**
1467	* Deserialize constructor.
1468	*/
1469	UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization,
1470	UErrorCode &ec) {
1471
1472	if(U_FAILURE(ec)) {
1473	setToBogus();
1474	return;
1475	}
1476
1477	if( (serialization != kSerialized)
1478	\|\| (data==NULL)
1479	\|\| (dataLen < `1`)) {
1480	ec = U_ILLEGAL_ARGUMENT_ERROR;
1481	setToBogus();
1482	return;
1483	}
1484
1485	// bmp?
1486	int32_t headerSize = ((data[`0`]&`0x8000`)) ?`2`:`1`;
1487	int32_t bmpLength = (headerSize==`1`)?data[`0`]:data[`1`];
1488
1489	int32_t newLength = (((data[`0`]&`0x7FFF`)-bmpLength)/`2`)+bmpLength;
1490	#ifdef DEBUG_SERIALIZE
1491	printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[`0`],data[`1`],data[`2`],data[`3`]);
1492	#endif
1493	if(!ensureCapacity(newLength + `1`)) { // +1 for HIGH
1494	return;
1495	}
1496	// copy bmp
1497	int32_t i;
1498	for(i = `0`; i< bmpLength;i++) {
1499	list[i] = data[i+headerSize];
1500	#ifdef DEBUG_SERIALIZE
1501	printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
1502	#endif
1503	}
1504	// copy smp
1505	for(i=bmpLength;i<newLength;i++) {
1506	list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*`2`+`0`] << `16`) +
1507	((UChar32)data[headerSize+bmpLength+(i-bmpLength)*`2`+`1`]);
1508	#ifdef DEBUG_SERIALIZE
1509	printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
1510	#endif
1511	}
1512	U_ASSERT(i == newLength);
1513	if (i == `0` \|\| list[i - `1`] != UNICODESET_HIGH) {
1514	list[i++] = UNICODESET_HIGH;
1515	}
1516	len = i;
1517	}
1518
1519
1520	int32_t UnicodeSet::serialize(uint16_t dest, int32_t destCapacity, UErrorCode& ec) const* {
1521	int32_t bmpLength, length, destLength;
1522
1523	if (U_FAILURE(ec)) {
1524	return `0`;
1525	}
1526
1527	if (destCapacity<`0` \|\| (destCapacity>`0` && dest==NULL)) {
1528	ec=U_ILLEGAL_ARGUMENT_ERROR;
1529	return `0`;
1530	}
1531
1532	/ count necessary 16-bit units /
1533	length=this->len-`1`; // Subtract 1 to ignore final UNICODESET_HIGH
1534	// assert(length>=0);
1535	if (length==`0`) {
1536	/ empty set /
1537	if (destCapacity>`0`) {
1538	*dest=`0`;
1539	} else {
1540	ec=U_BUFFER_OVERFLOW_ERROR;
1541	}
1542	return `1`;
1543	}
1544	/ now length>0 /
1545
1546	if (this->list[length-`1`]<=`0xffff`) {
1547	/ all BMP /
1548	bmpLength=length;
1549	} else if (this->list[`0`]>=`0x10000`) {
1550	/ all supplementary /
1551	bmpLength=`0`;
1552	length*=`2`;
1553	} else {
1554	/ some BMP, some supplementary /
1555	for (bmpLength=`0`; bmpLength<length && this->list[bmpLength]<=`0xffff`; ++bmpLength) {}
1556	length=bmpLength+`2`*(length-bmpLength);
1557	}
1558	#ifdef DEBUG_SERIALIZE
1559	printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
1560	#endif
1561	/ length: number of 16-bit array units /
1562	if (length>`0x7fff`) {
1563	/ there are only 15 bits for the length in the first serialized word /
1564	ec=U_INDEX_OUTOFBOUNDS_ERROR;
1565	return `0`;
1566	}
1567
1568	/*
1569	* total serialized length:
1570	* number of 16-bit array units (length) +
1571	* 1 length unit (always) +
1572	* 1 bmpLength unit (if there are supplementary values)
1573	*/
1574	destLength=length+((length>bmpLength)?`2`:`1`);
1575	if (destLength<=destCapacity) {
1576	const UChar32 *p;
1577	int32_t i;
1578
1579	#ifdef DEBUG_SERIALIZE
1580	printf("writeHdr\n");
1581	#endif
1582	*dest=(uint16_t)length;
1583	if (length>bmpLength) {
1584	*dest\|=`0x8000`;
1585	*++dest=(uint16_t)bmpLength;
1586	}
1587	++dest;
1588
1589	/ write the BMP part of the array /
1590	p=this->list;
1591	for (i=`0`; i<bmpLength; ++i) {
1592	#ifdef DEBUG_SERIALIZE
1593	printf("writebmp: %x\n", (int)*p);
1594	#endif
1595	dest++=(uint16_t)p++;
1596	}
1597
1598	/ write the supplementary part of the array /
1599	for (; i<length; i+=`2`) {
1600	#ifdef DEBUG_SERIALIZE
1601	printf("write32: %x\n", (int)*p);
1602	#endif
1603	dest++=(uint16_t)(p>>`16`);
1604	dest++=(uint16_t)p++;
1605	}
1606	} else {
1607	ec=U_BUFFER_OVERFLOW_ERROR;
1608	}
1609	return destLength;
1610	}
1611
1612	//----------------------------------------------------------------
1613	// Implementation: Utility methods
1614	//----------------------------------------------------------------
1615
1616	/**
1617	* Allocate our strings vector and return TRUE if successful.
1618	*/
1619	UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1620	if (U_FAILURE(status)) {
1621	return FALSE;
1622	}
1623	strings = new UVector (uprv_deleteUObject,
1624	uhash_compareUnicodeString, `1`, status);
1625	if (strings == NULL) { // Check for memory allocation error.
1626	status = U_MEMORY_ALLOCATION_ERROR;
1627	return FALSE;
1628	}
1629	if (U_FAILURE(status)) {
1630	delete strings;
1631	strings = NULL;
1632	return FALSE;
1633	}
1634	return TRUE;
1635	}
1636
1637	int32_t UnicodeSet::nextCapacity(int32_t minCapacity) {
1638	// Grow exponentially to reduce the frequency of allocations.
1639	if (minCapacity < INITIAL_CAPACITY) {
1640	return minCapacity + INITIAL_CAPACITY;
1641	} else if (minCapacity <= `2500`) {
1642	return `5` * minCapacity;
1643	} else {
1644	int32_t newCapacity = `2` * minCapacity;
1645	if (newCapacity > MAX_LENGTH) {
1646	newCapacity = MAX_LENGTH;
1647	}
1648	return newCapacity;
1649	}
1650	}
1651
1652	bool UnicodeSet::ensureCapacity(int32_t newLen) {
1653	if (newLen > MAX_LENGTH) {
1654	newLen = MAX_LENGTH;
1655	}
1656	if (newLen <= capacity) {
1657	return true;
1658	}
1659	int32_t newCapacity = nextCapacity(newLen);
1660	UChar32* temp = (UChar32) uprv_malloc(newCapacity sizeof(UChar32));
1661	if (temp == NULL) {
1662	setToBogus(); // set the object to bogus state if an OOM failure occurred.
1663	return false;
1664	}
1665	// Copy only the actual contents.
1666	uprv_memcpy(temp, list, len * sizeof(UChar32));
1667	if (list != stackList) {
1668	uprv_free(list);
1669	}
1670	list = temp;
1671	capacity = newCapacity;
1672	return true;
1673	}
1674
1675	bool UnicodeSet::ensureBufferCapacity(int32_t newLen) {
1676	if (newLen > MAX_LENGTH) {
1677	newLen = MAX_LENGTH;
1678	}
1679	if (newLen <= bufferCapacity) {
1680	return true;
1681	}
1682	int32_t newCapacity = nextCapacity(newLen);
1683	UChar32* temp = (UChar32) uprv_malloc(newCapacity sizeof(UChar32));
1684	if (temp == NULL) {
1685	setToBogus();
1686	return false;
1687	}
1688	// The buffer has no contents to be copied.
1689	// It is always filled from scratch after this call.
1690	if (buffer != stackList) {
1691	uprv_free(buffer);
1692	}
1693	buffer = temp;
1694	bufferCapacity = newCapacity;
1695	return true;
1696	}
1697
1698	/**
1699	* Swap list and buffer.
1700	*/
1701	void UnicodeSet::swapBuffers(void) {
1702	// swap list and buffer
1703	UChar32* temp = list;
1704	list = buffer;
1705	buffer = temp;
1706
1707	int32_t c = capacity;
1708	capacity = bufferCapacity;
1709	bufferCapacity = c;
1710	}
1711
1712	void UnicodeSet::setToBogus() {
1713	clear(); // Remove everything in the set.
1714	fFlags = kIsBogus;
1715	}
1716
1717	//----------------------------------------------------------------
1718	// Implementation: Fundamental operators
1719	//----------------------------------------------------------------
1720
1721	static inline UChar32 max(UChar32 a, UChar32 b) {
1722	return (a > b) ? a : b;
1723	}
1724
1725	// polarity = 0, 3 is normal: x xor y
1726	// polarity = 1, 2: x xor ~y == x === y
1727
1728	void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1729	if (isFrozen() \|\| isBogus()) {
1730	return;
1731	}
1732	if (!ensureBufferCapacity(len + otherLen)) {
1733	return;
1734	}
1735
1736	int32_t i = `0`, j = `0`, k = `0`;
1737	UChar32 a = list[i++];
1738	UChar32 b;
1739	if (polarity == `1` \|\| polarity == `2`) {
1740	b = UNICODESET_LOW;
1741	if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1742	++j;
1743	b = other[j];
1744	}
1745	} else {
1746	b = other[j++];
1747	}
1748	// simplest of all the routines
1749	// sort the values, discarding identicals!
1750	for (;;) {
1751	if (a < b) {
1752	buffer[k++] = a;
1753	a = list[i++];
1754	} else if (b < a) {
1755	buffer[k++] = b;
1756	b = other[j++];
1757	} else if (a != UNICODESET_HIGH) { // at this point, a == b
1758	// discard both values!
1759	a = list[i++];
1760	b = other[j++];
1761	} else { // DONE!
1762	buffer[k++] = UNICODESET_HIGH;
1763	len = k;
1764	break;
1765	}
1766	}
1767	swapBuffers();
1768	releasePattern();
1769	}
1770
1771	// polarity = 0 is normal: x union y
1772	// polarity = 2: x union ~y
1773	// polarity = 1: ~x union y
1774	// polarity = 3: ~x union ~y
1775
1776	void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1777	if (isFrozen() \|\| isBogus() \|\| other==NULL) {
1778	return;
1779	}
1780	if (!ensureBufferCapacity(len + otherLen)) {
1781	return;
1782	}
1783
1784	int32_t i = `0`, j = `0`, k = `0`;
1785	UChar32 a = list[i++];
1786	UChar32 b = other[j++];
1787	// change from xor is that we have to check overlapping pairs
1788	// polarity bit 1 means a is second, bit 2 means b is.
1789	for (;;) {
1790	switch (polarity) {
1791	case `0`: // both first; take lower if unequal
1792	if (a < b) { // take a
1793	// Back up over overlapping ranges in buffer[]
1794	if (k > `0` && a <= buffer[k-`1`]) {
1795	// Pick latter end value in buffer[] vs. list[]
1796	a = max(list[i], buffer[--k]);
1797	} else {
1798	// No overlap
1799	buffer[k++] = a;
1800	a = list[i];
1801	}
1802	i++; // Common if/else code factored out
1803	polarity ^= `1`;
1804	} else if (b < a) { // take b
1805	if (k > `0` && b <= buffer[k-`1`]) {
1806	b = max(other[j], buffer[--k]);
1807	} else {
1808	buffer[k++] = b;
1809	b = other[j];
1810	}
1811	j++;
1812	polarity ^= `2`;
1813	} else { // a == b, take a, drop b
1814	if (a == UNICODESET_HIGH) goto loop_end;
1815	// This is symmetrical; it doesn't matter if
1816	// we backtrack with a or b. - liu
1817	if (k > `0` && a <= buffer[k-`1`]) {
1818	a = max(list[i], buffer[--k]);
1819	} else {
1820	// No overlap
1821	buffer[k++] = a;
1822	a = list[i];
1823	}
1824	i++;
1825	polarity ^= `1`;
1826	b = other[j++];
1827	polarity ^= `2`;
1828	}
1829	break;
1830	case `3`: // both second; take higher if unequal, and drop other
1831	if (b <= a) { // take a
1832	if (a == UNICODESET_HIGH) goto loop_end;
1833	buffer[k++] = a;
1834	} else { // take b
1835	if (b == UNICODESET_HIGH) goto loop_end;
1836	buffer[k++] = b;
1837	}
1838	a = list[i++];
1839	polarity ^= `1`; // factored common code
1840	b = other[j++];
1841	polarity ^= `2`;
1842	break;
1843	case `1`: // a second, b first; if b < a, overlap
1844	if (a < b) { // no overlap, take a
1845	buffer[k++] = a; a = list[i++]; polarity ^= `1`;
1846	} else if (b < a) { // OVERLAP, drop b
1847	b = other[j++];
1848	polarity ^= `2`;
1849	} else { // a == b, drop both!
1850	if (a == UNICODESET_HIGH) goto loop_end;
1851	a = list[i++];
1852	polarity ^= `1`;
1853	b = other[j++];
1854	polarity ^= `2`;
1855	}
1856	break;
1857	case `2`: // a first, b second; if a < b, overlap
1858	if (b < a) { // no overlap, take b
1859	buffer[k++] = b;
1860	b = other[j++];
1861	polarity ^= `2`;
1862	} else if (a < b) { // OVERLAP, drop a
1863	a = list[i++];
1864	polarity ^= `1`;
1865	} else { // a == b, drop both!
1866	if (a == UNICODESET_HIGH) goto loop_end;
1867	a = list[i++];
1868	polarity ^= `1`;
1869	b = other[j++];
1870	polarity ^= `2`;
1871	}
1872	break;
1873	}
1874	}
1875	loop_end:
1876	buffer[k++] = UNICODESET_HIGH; // terminate
1877	len = k;
1878	swapBuffers();
1879	releasePattern();
1880	}
1881
1882	// polarity = 0 is normal: x intersect y
1883	// polarity = 2: x intersect ~y == set-minus
1884	// polarity = 1: ~x intersect y
1885	// polarity = 3: ~x intersect ~y
1886
1887	void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1888	if (isFrozen() \|\| isBogus()) {
1889	return;
1890	}
1891	if (!ensureBufferCapacity(len + otherLen)) {
1892	return;
1893	}
1894
1895	int32_t i = `0`, j = `0`, k = `0`;
1896	UChar32 a = list[i++];
1897	UChar32 b = other[j++];
1898	// change from xor is that we have to check overlapping pairs
1899	// polarity bit 1 means a is second, bit 2 means b is.
1900	for (;;) {
1901	switch (polarity) {
1902	case `0`: // both first; drop the smaller
1903	if (a < b) { // drop a
1904	a = list[i++];
1905	polarity ^= `1`;
1906	} else if (b < a) { // drop b
1907	b = other[j++];
1908	polarity ^= `2`;
1909	} else { // a == b, take one, drop other
1910	if (a == UNICODESET_HIGH) goto loop_end;
1911	buffer[k++] = a;
1912	a = list[i++];
1913	polarity ^= `1`;
1914	b = other[j++];
1915	polarity ^= `2`;
1916	}
1917	break;
1918	case `3`: // both second; take lower if unequal
1919	if (a < b) { // take a
1920	buffer[k++] = a;
1921	a = list[i++];
1922	polarity ^= `1`;
1923	} else if (b < a) { // take b
1924	buffer[k++] = b;
1925	b = other[j++];
1926	polarity ^= `2`;
1927	} else { // a == b, take one, drop other
1928	if (a == UNICODESET_HIGH) goto loop_end;
1929	buffer[k++] = a;
1930	a = list[i++];
1931	polarity ^= `1`;
1932	b = other[j++];
1933	polarity ^= `2`;
1934	}
1935	break;
1936	case `1`: // a second, b first;
1937	if (a < b) { // NO OVERLAP, drop a
1938	a = list[i++];
1939	polarity ^= `1`;
1940	} else if (b < a) { // OVERLAP, take b
1941	buffer[k++] = b;
1942	b = other[j++];
1943	polarity ^= `2`;
1944	} else { // a == b, drop both!
1945	if (a == UNICODESET_HIGH) goto loop_end;
1946	a = list[i++];
1947	polarity ^= `1`;
1948	b = other[j++];
1949	polarity ^= `2`;
1950	}
1951	break;
1952	case `2`: // a first, b second; if a < b, overlap
1953	if (b < a) { // no overlap, drop b
1954	b = other[j++];
1955	polarity ^= `2`;
1956	} else if (a < b) { // OVERLAP, take a
1957	buffer[k++] = a;
1958	a = list[i++];
1959	polarity ^= `1`;
1960	} else { // a == b, drop both!
1961	if (a == UNICODESET_HIGH) goto loop_end;
1962	a = list[i++];
1963	polarity ^= `1`;
1964	b = other[j++];
1965	polarity ^= `2`;
1966	}
1967	break;
1968	}
1969	}
1970	loop_end:
1971	buffer[k++] = UNICODESET_HIGH; // terminate
1972	len = k;
1973	swapBuffers();
1974	releasePattern();
1975	}
1976
1977	/**
1978	* Append the <code>toPattern()</code> representation of a
1979	* string to the given <code>StringBuffer</code>.
1980	*/
1981	void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1982	escapeUnprintable) {
1983	UChar32 cp;
1984	for (int32_t i = `0`; i < s.length(); i += U16_LENGTH(cp)) {
1985	_appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1986	}
1987	}
1988
1989	/**
1990	* Append the <code>toPattern()</code> representation of a
1991	* character to the given <code>StringBuffer</code>.
1992	*/
1993	void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1994	escapeUnprintable) {
1995	if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1996	// Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1997	// unprintable
1998	if (ICU_Utility::escapeUnprintable(buf, c)) {
1999	return;
2000	}
2001	}
2002	// Okay to let ':' pass through
2003	switch (c) {
2004	case SET_OPEN:
2005	case SET_CLOSE:
2006	case HYPHEN:
2007	case COMPLEMENT:
2008	case INTERSECTION:
2009	case BACKSLASH:
2010	case OPEN_BRACE:
2011	case CLOSE_BRACE:
2012	case COLON:
2013	case SymbolTable::SYMBOL_REF:
2014	buf.append(BACKSLASH);
2015	break;
2016	default:
2017	// Escape whitespace
2018	if (PatternProps::isWhiteSpace(c)) {
2019	buf.append(BACKSLASH);
2020	}
2021	break;
2022	}
2023	buf.append(c);
2024	}
2025
2026	/**
2027	* Append a string representation of this set to result. This will be
2028	* a cleaned version of the string passed to applyPattern(), if there
2029	* is one. Otherwise it will be generated.
2030	*/
2031	UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
2032	UBool escapeUnprintable) const
2033	{
2034	if (pat != NULL) {
2035	int32_t i;
2036	int32_t backslashCount = `0`;
2037	for (i=`0`; i<patLen; ) {
2038	UChar32 c;
2039	U16_NEXT(pat, i, patLen, c);
2040	if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
2041	// If the unprintable character is preceded by an odd
2042	// number of backslashes, then it has been escaped.
2043	// Before unescaping it, we delete the final
2044	// backslash.
2045	if ((backslashCount % `2`) == `1`) {
2046	result.truncate(result.length() - `1`);
2047	}
2048	ICU_Utility::escapeUnprintable(result, c);
2049	backslashCount = `0`;
2050	} else {
2051	result.append(c);
2052	if (c == BACKSLASH) {
2053	++backslashCount;
2054	} else {
2055	backslashCount = `0`;
2056	}
2057	}
2058	}
2059	return result;
2060	}
2061
2062	return _generatePattern(result, escapeUnprintable);
2063	}
2064
2065	/**
2066	* Returns a string representation of this set. If the result of
2067	* calling this function is passed to a UnicodeSet constructor, it
2068	* will produce another set that is equal to this one.
2069	*/
2070	UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
2071	UBool escapeUnprintable) const
2072	{
2073	result.truncate(`0`);
2074	return _toPattern(result, escapeUnprintable);
2075	}
2076
2077	/**
2078	* Generate and append a string representation of this set to result.
2079	* This does not use this.pat, the cleaned up copy of the string
2080	* passed to applyPattern().
2081	*/
2082	UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
2083	UBool escapeUnprintable) const
2084	{
2085	result.append(SET_OPEN);
2086
2087	// // Check against the predefined categories. We implicitly build
2088	// // up ALL category sets the first time toPattern() is called.
2089	// for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2090	// if (this == getCategorySet(cat)) {*
2091	// result.append(COLON);
2092	// result.append(CATEGORY_NAMES, cat2, 2);*
2093	// return result.append(CATEGORY_CLOSE);
2094	// }
2095	// }
2096
2097	int32_t count = getRangeCount();
2098
2099	// If the set contains at least 2 intervals and includes both
2100	// MIN_VALUE and MAX_VALUE, then the inverse representation will
2101	// be more economical.
2102	if (count > `1` &&
2103	getRangeStart(`0`) == MIN_VALUE &&
2104	getRangeEnd(count-`1`) == MAX_VALUE) {
2105
2106	// Emit the inverse
2107	result.append(COMPLEMENT);
2108
2109	for (int32_t i = `1`; i < count; ++i) {
2110	UChar32 start = getRangeEnd(i-`1`)+`1`;
2111	UChar32 end = getRangeStart(i)-`1`;
2112	_appendToPat(result, start, escapeUnprintable);
2113	if (start != end) {
2114	if ((start+`1`) != end) {
2115	result.append(HYPHEN);
2116	}
2117	_appendToPat(result, end, escapeUnprintable);
2118	}
2119	}
2120	}
2121
2122	// Default; emit the ranges as pairs
2123	else {
2124	for (int32_t i = `0`; i < count; ++i) {
2125	UChar32 start = getRangeStart(i);
2126	UChar32 end = getRangeEnd(i);
2127	_appendToPat(result, start, escapeUnprintable);
2128	if (start != end) {
2129	if ((start+`1`) != end) {
2130	result.append(HYPHEN);
2131	}
2132	_appendToPat(result, end, escapeUnprintable);
2133	}
2134	}
2135	}
2136
2137	if (strings != nullptr) {
2138	for (int32_t i = `0`; i<strings->size(); ++i) {
2139	result.append(OPEN_BRACE);
2140	_appendToPat(result,
2141	(const* UnicodeString*) strings->elementAt(i),
2142	escapeUnprintable);
2143	result.append(CLOSE_BRACE);
2144	}
2145	}
2146	return result.append(SET_CLOSE);
2147	}
2148
2149	/**
2150	* Release existing cached pattern
2151	*/
2152	void UnicodeSet::releasePattern() {
2153	if (pat) {
2154	uprv_free(pat);
2155	pat = NULL;
2156	patLen = `0`;
2157	}
2158	}
2159
2160	/**
2161	* Set the new pattern to cache.
2162	*/
2163	void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) {
2164	releasePattern();
2165	pat = (UChar )uprv_malloc((newPatLen + `1`) sizeof(UChar));
2166	if (pat) {
2167	patLen = newPatLen;
2168	u_memcpy(pat, newPat, patLen);
2169	pat[patLen] = `0`;
2170	}
2171	// else we don't care if malloc failed. This was just a nice cache.
2172	// We can regenerate an equivalent pattern later when requested.
2173	}
2174
2175	UnicodeSet *UnicodeSet::freeze() {
2176	if(!isFrozen() && !isBogus()) {
2177	compact();
2178
2179	// Optimize contains() and span() and similar functions.
2180	if (hasStrings()) {
2181	stringSpan = new UnicodeSetStringSpan (*this, *strings, UnicodeSetStringSpan::ALL);
2182	if (stringSpan == nullptr) {
2183	setToBogus();
2184	return this;
2185	} else if (!stringSpan->needsStringSpanUTF16()) {
2186	// All strings are irrelevant for span() etc. because
2187	// all of each string's code points are contained in this set.
2188	// Do not check needsStringSpanUTF8() because UTF-8 has at most as
2189	// many relevant strings as UTF-16.
2190	// (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2191	delete stringSpan;
2192	stringSpan = NULL;
2193	}
2194	}
2195	if (stringSpan == NULL) {
2196	// No span-relevant strings: Optimize for code point spans.
2197	bmpSet=new BMPSet (list, len);
2198	if (bmpSet == NULL) { // Check for memory allocation error.
2199	setToBogus();
2200	}
2201	}
2202	}
2203	return this;
2204	}
2205
2206	int32_t UnicodeSet::span(const UChar s, int32_t length, USetSpanCondition spanCondition) const* {
2207	if(length>`0` && bmpSet!=NULL) {
2208	return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2209	}
2210	if(length<`0`) {
2211	length=u_strlen(s);
2212	}
2213	if(length==`0`) {
2214	return `0`;
2215	}
2216	if(stringSpan!=NULL) {
2217	return stringSpan->span(s, length, spanCondition);
2218	} else if(hasStrings()) {
2219	uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2220	UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2221	UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2222	UnicodeSetStringSpan strSpan(*this, *strings, which);
2223	if(strSpan.needsStringSpanUTF16()) {
2224	return strSpan.span(s, length, spanCondition);
2225	}
2226	}
2227
2228	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2229	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2230	}
2231
2232	UChar32 c;
2233	int32_t start=`0`, prev=`0`;
2234	do {
2235	U16_NEXT(s, start, length, c);
2236	if(spanCondition!=contains(c)) {
2237	break;
2238	}
2239	} while((prev=start)<length);
2240	return prev;
2241	}
2242
2243	int32_t UnicodeSet::spanBack(const UChar s, int32_t length, USetSpanCondition spanCondition) const* {
2244	if(length>`0` && bmpSet!=NULL) {
2245	return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2246	}
2247	if(length<`0`) {
2248	length=u_strlen(s);
2249	}
2250	if(length==`0`) {
2251	return `0`;
2252	}
2253	if(stringSpan!=NULL) {
2254	return stringSpan->spanBack(s, length, spanCondition);
2255	} else if(hasStrings()) {
2256	uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2257	UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2258	UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2259	UnicodeSetStringSpan strSpan(*this, *strings, which);
2260	if(strSpan.needsStringSpanUTF16()) {
2261	return strSpan.spanBack(s, length, spanCondition);
2262	}
2263	}
2264
2265	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2266	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2267	}
2268
2269	UChar32 c;
2270	int32_t prev=length;
2271	do {
2272	U16_PREV(s, `0`, length, c);
2273	if(spanCondition!=contains(c)) {
2274	break;
2275	}
2276	} while((prev=length)>`0`);
2277	return prev;
2278	}
2279
2280	int32_t UnicodeSet::spanUTF8(const char s, int32_t length, USetSpanCondition spanCondition) const* {
2281	if(length>`0` && bmpSet!=NULL) {
2282	const uint8_t s0=(const* uint8_t *)s;
2283	return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2284	}
2285	if(length<`0`) {
2286	length=(int32_t)uprv_strlen(s);
2287	}
2288	if(length==`0`) {
2289	return `0`;
2290	}
2291	if(stringSpan!=NULL) {
2292	return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2293	} else if(hasStrings()) {
2294	uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2295	UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2296	UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2297	UnicodeSetStringSpan strSpan(*this, *strings, which);
2298	if(strSpan.needsStringSpanUTF8()) {
2299	return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2300	}
2301	}
2302
2303	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2304	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2305	}
2306
2307	UChar32 c;
2308	int32_t start=`0`, prev=`0`;
2309	do {
2310	U8_NEXT_OR_FFFD(s, start, length, c);
2311	if(spanCondition!=contains(c)) {
2312	break;
2313	}
2314	} while((prev=start)<length);
2315	return prev;
2316	}
2317
2318	int32_t UnicodeSet::spanBackUTF8(const char s, int32_t length, USetSpanCondition spanCondition) const* {
2319	if(length>`0` && bmpSet!=NULL) {
2320	const uint8_t s0=(const* uint8_t *)s;
2321	return bmpSet->spanBackUTF8(s0, length, spanCondition);
2322	}
2323	if(length<`0`) {
2324	length=(int32_t)uprv_strlen(s);
2325	}
2326	if(length==`0`) {
2327	return `0`;
2328	}
2329	if(stringSpan!=NULL) {
2330	return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2331	} else if(hasStrings()) {
2332	uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2333	UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2334	UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2335	UnicodeSetStringSpan strSpan(*this, *strings, which);
2336	if(strSpan.needsStringSpanUTF8()) {
2337	return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2338	}
2339	}
2340
2341	if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2342	spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2343	}
2344
2345	UChar32 c;
2346	int32_t prev=length;
2347	do {
2348	U8_PREV_OR_FFFD(s, `0`, length, c);
2349	if(spanCondition!=contains(c)) {
2350	break;
2351	}
2352	} while((prev=length)>`0`);
2353	return prev;
2354	}
2355
2356	U_NAMESPACE_END
2357

// Source: ClickHouse/contrib/icu/icu4c/source/common/uniset.cpp