uniset_props.cpp source code [Godot/thirdparty/icu4c/common/uniset_props.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	*
6	* Copyright (C) 1999-2014, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	*******************************************************************************
10	* file name: uniset_props.cpp
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:4
14	*
15	* created on: 2004aug25
16	* created by: Markus W. Scherer
17	*
18	* Character property dependent functions moved here from uniset.cpp
19	*/
20
21	#include "unicode/utypes.h"
22	#include "unicode/uniset.h"
23	#include "unicode/parsepos.h"
24	#include "unicode/uchar.h"
25	#include "unicode/uscript.h"
26	#include "unicode/symtable.h"
27	#include "unicode/uset.h"
28	#include "unicode/locid.h"
29	#include "unicode/brkiter.h"
30	#include "uset_imp.h"
31	#include "ruleiter.h"
32	#include "cmemory.h"
33	#include "ucln_cmn.h"
34	#include "util.h"
35	#include "uvector.h"
36	#include "uprops.h"
37	#include "propname.h"
38	#include "normalizer2impl.h"
39	#include "uinvchar.h"
40	#include "uprops.h"
41	#include "charstr.h"
42	#include "cstring.h"
43	#include "mutex.h"
44	#include "umutex.h"
45	#include "uassert.h"
46	#include "hash.h"
47
48	U_NAMESPACE_USE
49
// Names of the special pseudo-property sets that applyPropertyAlias()
// recognizes when no property value is given.
static constexpr char ANY[]      = "ANY";      // [\u0000-\U0010FFFF]
static constexpr char ASCII[]    = "ASCII";    // [\u0000-\u007F]
static constexpr char ASSIGNED[] = "Assigned"; // [:^Cn:]

// Alias of the Unicode name property, used to rewrite \N{name} as na=name.
#define NAME_PROP "na"
#define NAME_PROP_LENGTH 2
58
59	// Cached sets ------------------------------------------------------------- ***
60
61	U_CDECL_BEGIN
62	static UBool U_CALLCONV uset_cleanup();
63
64	static UnicodeSet *uni32Singleton;
65	static icu::UInitOnce uni32InitOnce {};
66
67	/**
68	* Cleanup function for UnicodeSet
69	*/
70	static UBool U_CALLCONV uset_cleanup() {
71	delete uni32Singleton;
72	uni32Singleton = nullptr;
73	uni32InitOnce.reset();
74	return true;
75	}
76
77	U_CDECL_END
78
79	U_NAMESPACE_BEGIN
80
81	namespace {
82
83	// Cache some sets for other services -------------------------------------- ***
84	void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
85	U_ASSERT(uni32Singleton == nullptr);
86	uni32Singleton = new UnicodeSet (UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode);
87	if(uni32Singleton==nullptr) {
88	errorCode=U_MEMORY_ALLOCATION_ERROR;
89	} else {
90	uni32Singleton->freeze();
91	}
92	ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
93	}
94
95
96	U_CFUNC UnicodeSet *
97	uniset_getUnicode32Instance(UErrorCode &errorCode) {
98	umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
99	return uni32Singleton;
100	}
101
102	// helper functions for matching of pattern syntax pieces ------------------ ***
103	// these functions are parallel to the PERL_OPEN etc. strings above
104
105	// using these functions is not only faster than UnicodeString::compare() and
106	// caseCompare(), but they also make UnicodeSet work for simple patterns when
107	// no Unicode properties data is available - when caseCompare() fails
108
109	static inline UBool
110	isPerlOpen(const UnicodeString &pattern, int32_t pos) {
111	char16_t c;
112	return pattern.charAt(pos)==u`'\\'` && ((c=pattern.charAt(pos+`1`))==u`'p'` \|\| c==u`'P'`);
113	}
114
/*static inline UBool
isPerlClose(const UnicodeString &pattern, int32_t pos) {
    return pattern.charAt(pos)==u'}';
}*/
119
120	static inline UBool
121	isNameOpen(const UnicodeString &pattern, int32_t pos) {
122	return pattern.charAt(pos)==u`'\\'` && pattern.charAt(pos+`1`)==u`'N'`;
123	}
124
125	static inline UBool
126	isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
127	return pattern.charAt(pos)==u`'['` && pattern.charAt(pos+`1`)==u`':'`;
128	}
129
/*static inline UBool
isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
    return pattern.charAt(pos)==u':' && pattern.charAt(pos+1)==u']';
}*/
134
// TODO: uniset.cpp has memory-debugging support that could be exposed here,
// but it is probably obsolete now that modern leak checkers exist.
// Until then this hook expands to nothing.
#define _dbgct(me)
139
140	} // namespace
141
142	//----------------------------------------------------------------
143	// Constructors &c
144	//----------------------------------------------------------------
145
146	/**
147	* Constructs a set from the given pattern, optionally ignoring
148	* white space. See the class description for the syntax of the
149	* pattern language.
150	* @param pattern a string specifying what characters are in the set
151	*/
152	UnicodeSet::UnicodeSet(const UnicodeString& pattern,
153	UErrorCode& status) {
154	applyPattern(pattern, status);
155	_dbgct(this);
156	}
157
158	//----------------------------------------------------------------
159	// Public API
160	//----------------------------------------------------------------
161
162	UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
163	UErrorCode& status) {
164	// Equivalent to
165	// return applyPattern(pattern, USET_IGNORE_SPACE, nullptr, status);
166	// but without dependency on closeOver().
167	ParsePosition pos(`0`);
168	applyPatternIgnoreSpace(pattern, pos, nullptr, status);
169	if (U_FAILURE(status)) return *this;
170
171	int32_t i = pos.getIndex();
172	// Skip over trailing whitespace
173	ICU_Utility::skipWhitespace(pattern, i, true);
174	if (i != pattern.length()) {
175	status = U_ILLEGAL_ARGUMENT_ERROR;
176	}
177	return *this;
178	}
179
180	void
181	UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
182	ParsePosition& pos,
183	const SymbolTable* symbols,
184	UErrorCode& status) {
185	if (U_FAILURE(status)) {
186	return;
187	}
188	if (isFrozen()) {
189	status = U_NO_WRITE_PERMISSION;
190	return;
191	}
192	// Need to build the pattern in a temporary string because
193	// _applyPattern calls add() etc., which set pat to empty.
194	UnicodeString rebuiltPat;
195	RuleCharacterIterator chars(pattern, symbols, pos);
196	applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, `0`, status);
197	if (U_FAILURE(status)) return;
198	if (chars.inVariable()) {
199	// syntaxError(chars, "Extra chars in variable value");
200	status = U_MALFORMED_SET;
201	return;
202	}
203	setPattern(rebuiltPat);
204	}
205
206	/**
207	* Return true if the given position, in the given pattern, appears
208	* to be the start of a UnicodeSet pattern.
209	*/
210	UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
211	return ((pos+`1`) < pattern.length() &&
212	pattern.charAt(pos) == (char16_t)`91`/[/) \|\|
213	resemblesPropertyPattern(pattern, pos);
214	}
215
216	//----------------------------------------------------------------
217	// Implementation: Pattern parsing
218	//----------------------------------------------------------------
219
220	namespace {
221
222	/**
223	* A small all-inline class to manage a UnicodeSet pointer. Add
224	* operator->() etc. as needed.
225	*/
226	class UnicodeSetPointer {
227	UnicodeSet* p;
228	public:
229	inline UnicodeSetPointer() : p(`0`) {}
230	inline ~UnicodeSetPointer() { delete p; }
231	inline UnicodeSet* pointer() { return p; }
232	inline UBool allocate() {
233	if (p == `0`) {
234	p = new UnicodeSet ();
235	}
236	return p != `0`;
237	}
238	};
239
240	constexpr int32_t MAX_DEPTH = `100`;
241
242	} // namespace
243
244	/**
245	* Parse the pattern from the given RuleCharacterIterator. The
246	* iterator is advanced over the parsed pattern.
247	* @param chars iterator over the pattern characters. Upon return
248	* it will be advanced to the first character after the parsed
249	* pattern, or the end of the iteration if all characters are
250	* parsed.
251	* @param symbols symbol table to use to parse and dereference
252	* variables, or null if none.
253	* @param rebuiltPat the pattern that was parsed, rebuilt or
254	* copied from the input pattern, as appropriate.
255	* @param options a bit mask of zero or more of the following:
256	* IGNORE_SPACE, CASE.
257	*/
258	void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
259	const SymbolTable* symbols,
260	UnicodeString& rebuiltPat,
261	uint32_t options,
262	UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
263	int32_t depth,
264	UErrorCode& ec) {
265	if (U_FAILURE(ec)) return;
266	if (depth > MAX_DEPTH) {
267	ec = U_ILLEGAL_ARGUMENT_ERROR;
268	return;
269	}
270
271	// Syntax characters: [ ] ^ - & { }
272
273	// Recognized special forms for chars, sets: c-c s-s s&s
274
275	int32_t opts = RuleCharacterIterator::PARSE_VARIABLES \|
276	RuleCharacterIterator::PARSE_ESCAPES;
277	if ((options & USET_IGNORE_SPACE) != `0`) {
278	opts \|= RuleCharacterIterator::SKIP_WHITESPACE;
279	}
280
281	UnicodeString patLocal, buf;
282	UBool usePat = false;
283	UnicodeSetPointer scratch;
284	RuleCharacterIterator::Pos backup;
285
286	// mode: 0=before [, 1=between [...], 2=after ]
287	// lastItem: 0=none, 1=char, 2=set
288	int8_t lastItem = `0`, mode = `0`;
289	UChar32 lastChar = `0`;
290	char16_t op = `0`;
291
292	UBool invert = false;
293
294	clear();
295
296	while (mode != `2` && !chars.atEnd()) {
297	U_ASSERT((lastItem == `0` && op == `0`) \|\|
298	(lastItem == `1` && (op == `0` \|\| op == u`'-'`)) \|\|
299	(lastItem == `2` && (op == `0` \|\| op == u`'-'` \|\| op == u`'&'`)));
300
301	UChar32 c = `0`;
302	UBool literal = false;
303	UnicodeSet* nested = `0`; // alias - do not delete
304
305	// -------- Check for property pattern
306
307	// setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
308	int8_t setMode = `0`;
309	if (resemblesPropertyPattern(chars, opts)) {
310	setMode = `2`;
311	}
312
313	// -------- Parse '[' of opening delimiter OR nested set.
314	// If there is a nested set, use `setMode' to define how
315	// the set should be parsed. If the '[' is part of the
316	// opening delimiter for this pattern, parse special
317	// strings "[", "[^", "[-", and "[^-". Check for stand-in
318	// characters representing a nested set in the symbol
319	// table.
320
321	else {
322	// Prepare to backup if necessary
323	chars.getPos(backup);
324	c = chars.next(opts, literal, ec);
325	if (U_FAILURE(ec)) return;
326
327	if (c == u`'['` && !literal) {
328	if (mode == `1`) {
329	chars.setPos(backup); // backup
330	setMode = `1`;
331	} else {
332	// Handle opening '[' delimiter
333	mode = `1`;
334	patLocal.append(u`'['`);
335	chars.getPos(backup); // prepare to backup
336	c = chars.next(opts, literal, ec);
337	if (U_FAILURE(ec)) return;
338	if (c == u`'^'` && !literal) {
339	invert = true;
340	patLocal.append(u`'^'`);
341	chars.getPos(backup); // prepare to backup
342	c = chars.next(opts, literal, ec);
343	if (U_FAILURE(ec)) return;
344	}
345	// Fall through to handle special leading '-';
346	// otherwise restart loop for nested [], \p{}, etc.
347	if (c == u`'-'`) {
348	literal = true;
349	// Fall through to handle literal '-' below
350	} else {
351	chars.setPos(backup); // backup
352	continue;
353	}
354	}
355	} else if (symbols != `0`) {
356	const UnicodeFunctor *m = symbols->lookupMatcher(c);
357	if (m != `0`) {
358	const UnicodeSet ms = dynamic_cast<const* UnicodeSet *>(m);
359	if (ms == nullptr) {
360	ec = U_MALFORMED_SET;
361	return;
362	}
363	// casting away const, but `nested' won't be modified
364	// (important not to modify stored set)
365	nested = const_cast<UnicodeSet*>(ms);
366	setMode = `3`;
367	}
368	}
369	}
370
371	// -------- Handle a nested set. This either is inline in
372	// the pattern or represented by a stand-in that has
373	// previously been parsed and was looked up in the symbol
374	// table.
375
376	if (setMode != `0`) {
377	if (lastItem == `1`) {
378	if (op != `0`) {
379	// syntaxError(chars, "Char expected after operator");
380	ec = U_MALFORMED_SET;
381	return;
382	}
383	add(lastChar, lastChar);
384	_appendToPat(patLocal, lastChar, false);
385	lastItem = `0`;
386	op = `0`;
387	}
388
389	if (op == u`'-'` \|\| op == u`'&'`) {
390	patLocal.append(op);
391	}
392
393	if (nested == `0`) {
394	// lazy allocation
395	if (!scratch.allocate()) {
396	ec = U_MEMORY_ALLOCATION_ERROR;
397	return;
398	}
399	nested = scratch.pointer();
400	}
401	switch (setMode) {
402	case `1`:
403	nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + `1`, ec);
404	break;
405	case `2`:
406	chars.skipIgnored(opts);
407	nested->applyPropertyPattern(chars, patLocal, ec);
408	if (U_FAILURE(ec)) return;
409	break;
410	case `3`: // `nested' already parsed
411	nested->_toPattern(patLocal, false);
412	break;
413	}
414
415	usePat = true;
416
417	if (mode == `0`) {
418	// Entire pattern is a category; leave parse loop
419	*this = *nested;
420	mode = `2`;
421	break;
422	}
423
424	switch (op) {
425	case u`'-'`:
426	removeAll(*nested);
427	break;
428	case u`'&'`:
429	retainAll(*nested);
430	break;
431	case `0`:
432	addAll(*nested);
433	break;
434	}
435
436	op = `0`;
437	lastItem = `2`;
438
439	continue;
440	}
441
442	if (mode == `0`) {
443	// syntaxError(chars, "Missing '['");
444	ec = U_MALFORMED_SET;
445	return;
446	}
447
448	// -------- Parse special (syntax) characters. If the
449	// current character is not special, or if it is escaped,
450	// then fall through and handle it below.
451
452	if (!literal) {
453	switch (c) {
454	case u`']'`:
455	if (lastItem == `1`) {
456	add(lastChar, lastChar);
457	_appendToPat(patLocal, lastChar, false);
458	}
459	// Treat final trailing '-' as a literal
460	if (op == u`'-'`) {
461	add(op, op);
462	patLocal.append(op);
463	} else if (op == u`'&'`) {
464	// syntaxError(chars, "Trailing '&'");
465	ec = U_MALFORMED_SET;
466	return;
467	}
468	patLocal.append(u`']'`);
469	mode = `2`;
470	continue;
471	case u`'-'`:
472	if (op == `0`) {
473	if (lastItem != `0`) {
474	op = (char16_t) c;
475	continue;
476	} else {
477	// Treat final trailing '-' as a literal
478	add(c, c);
479	c = chars.next(opts, literal, ec);
480	if (U_FAILURE(ec)) return;
481	if (c == u`']'` && !literal) {
482	patLocal.append(u"-]", `2`);
483	mode = `2`;
484	continue;
485	}
486	}
487	}
488	// syntaxError(chars, "'-' not after char or set");
489	ec = U_MALFORMED_SET;
490	return;
491	case u`'&'`:
492	if (lastItem == `2` && op == `0`) {
493	op = (char16_t) c;
494	continue;
495	}
496	// syntaxError(chars, "'&' not after set");
497	ec = U_MALFORMED_SET;
498	return;
499	case u`'^'`:
500	// syntaxError(chars, "'^' not after '['");
501	ec = U_MALFORMED_SET;
502	return;
503	case u`'{'`:
504	if (op != `0`) {
505	// syntaxError(chars, "Missing operand after operator");
506	ec = U_MALFORMED_SET;
507	return;
508	}
509	if (lastItem == `1`) {
510	add(lastChar, lastChar);
511	_appendToPat(patLocal, lastChar, false);
512	}
513	lastItem = `0`;
514	buf.truncate(`0`);
515	{
516	UBool ok = false;
517	while (!chars.atEnd()) {
518	c = chars.next(opts, literal, ec);
519	if (U_FAILURE(ec)) return;
520	if (c == u`'}'` && !literal) {
521	ok = true;
522	break;
523	}
524	buf.append(c);
525	}
526	if (!ok) {
527	// syntaxError(chars, "Invalid multicharacter string");
528	ec = U_MALFORMED_SET;
529	return;
530	}
531	}
532	// We have new string. Add it to set and continue;
533	// we don't need to drop through to the further
534	// processing
535	add(buf);
536	patLocal.append(u`'{'`);
537	_appendToPat(patLocal, buf, false);
538	patLocal.append(u`'}'`);
539	continue;
540	case SymbolTable::SYMBOL_REF:
541	// symbols nosymbols
542	// [a-$] error error (ambiguous)
543	// [a$] anchor anchor
544	// [a-$x] var "x" literal '$'*
545	// [a-$.] error literal '$'
546	// We won't get here in the case of var "x"*
547	{
548	chars.getPos(backup);
549	c = chars.next(opts, literal, ec);
550	if (U_FAILURE(ec)) return;
551	UBool anchor = (c == u`']'` && !literal);
552	if (symbols == `0` && !anchor) {
553	c = SymbolTable::SYMBOL_REF;
554	chars.setPos(backup);
555	break; // literal '$'
556	}
557	if (anchor && op == `0`) {
558	if (lastItem == `1`) {
559	add(lastChar, lastChar);
560	_appendToPat(patLocal, lastChar, false);
561	}
562	add(U_ETHER);
563	usePat = true;
564	patLocal.append((char16_t) SymbolTable::SYMBOL_REF);
565	patLocal.append(u`']'`);
566	mode = `2`;
567	continue;
568	}
569	// syntaxError(chars, "Unquoted '$'");
570	ec = U_MALFORMED_SET;
571	return;
572	}
573	default:
574	break;
575	}
576	}
577
578	// -------- Parse literal characters. This includes both
579	// escaped chars ("\u4E01") and non-syntax characters
580	// ("a").
581
582	switch (lastItem) {
583	case `0`:
584	lastItem = `1`;
585	lastChar = c;
586	break;
587	case `1`:
588	if (op == u`'-'`) {
589	if (lastChar >= c) {
590	// Don't allow redundant (a-a) or empty (b-a) ranges;
591	// these are most likely typos.
592	// syntaxError(chars, "Invalid range");
593	ec = U_MALFORMED_SET;
594	return;
595	}
596	add(lastChar, c);
597	_appendToPat(patLocal, lastChar, false);
598	patLocal.append(op);
599	_appendToPat(patLocal, c, false);
600	lastItem = `0`;
601	op = `0`;
602	} else {
603	add(lastChar, lastChar);
604	_appendToPat(patLocal, lastChar, false);
605	lastChar = c;
606	}
607	break;
608	case `2`:
609	if (op != `0`) {
610	// syntaxError(chars, "Set expected after operator");
611	ec = U_MALFORMED_SET;
612	return;
613	}
614	lastChar = c;
615	lastItem = `1`;
616	break;
617	}
618	}
619
620	if (mode != `2`) {
621	// syntaxError(chars, "Missing ']'");
622	ec = U_MALFORMED_SET;
623	return;
624	}
625
626	chars.skipIgnored(opts);
627
628	/**
629	* Handle global flags (invert, case insensitivity). If this
630	* pattern should be compiled case-insensitive, then we need
631	* to close over case BEFORE COMPLEMENTING. This makes
632	* patterns like /[^abc]/i work.
633	*/
634	if ((options & USET_CASE_MASK) != `0`) {
635	(this->*caseClosure)(options);
636	}
637	if (invert) {
638	complement().removeAllStrings(); // code point complement
639	}
640
641	// Use the rebuilt pattern (patLocal) only if necessary. Prefer the
642	// generated pattern.
643	if (usePat) {
644	rebuiltPat.append(patLocal);
645	} else {
646	_generatePattern(rebuiltPat, false);
647	}
648	if (isBogus() && U_SUCCESS(ec)) {
649	// We likely ran out of memory. AHHH!
650	ec = U_MEMORY_ALLOCATION_ERROR;
651	}
652	}
653
654	//----------------------------------------------------------------
655	// Property set implementation
656	//----------------------------------------------------------------
657
658	namespace {
659
660	static UBool numericValueFilter(UChar32 ch, void* context) {
661	return u_getNumericValue(ch) == (double**)context;
662	}
663
664	static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
665	int32_t value = (int32_t)context;
666	return (U_GET_GC_MASK((UChar32) ch) & value) != `0`;
667	}
668
669	static UBool versionFilter(UChar32 ch, void* context) {
670	static const UVersionInfo none = { `0`, `0`, `0`, `0` };
671	UVersionInfo v;
672	u_charAge(ch, v);
673	UVersionInfo* version = (UVersionInfo*)context;
674	return uprv_memcmp(&v, &none, sizeof(v)) > `0` && uprv_memcmp(&v, version, sizeof(v)) <= `0`;
675	}
676
677	typedef struct {
678	UProperty prop;
679	int32_t value;
680	} IntPropertyContext;
681
682	static UBool intPropertyFilter(UChar32 ch, void* context) {
683	IntPropertyContext* c = (IntPropertyContext*)context;
684	return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
685	}
686
687	static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
688	return uscript_hasScript(ch, (UScriptCode)context);
689	}
690
691	} // namespace
692
693	/**
694	* Generic filter-based scanning code for UCD property UnicodeSets.
695	*/
696	void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
697	void* context,
698	const UnicodeSet* inclusions,
699	UErrorCode &status) {
700	if (U_FAILURE(status)) return;
701
702	// Logically, walk through all Unicode characters, noting the start
703	// and end of each range for which filter.contain(c) is
704	// true. Add each range to a set.
705	//
706	// To improve performance, use an inclusions set which
707	// encodes information about character ranges that are known
708	// to have identical properties.
709	// inclusions contains the first characters of
710	// same-value ranges for the given property.
711
712	clear();
713
714	UChar32 startHasProperty = -`1`;
715	int32_t limitRange = inclusions->getRangeCount();
716
717	for (int j=`0`; j<limitRange; ++j) {
718	// get current range
719	UChar32 start = inclusions->getRangeStart(j);
720	UChar32 end = inclusions->getRangeEnd(j);
721
722	// for all the code points in the range, process
723	for (UChar32 ch = start; ch <= end; ++ch) {
724	// only add to this UnicodeSet on inflection points --
725	// where the hasProperty value changes to false
726	if ((*filter)(ch, context)) {
727	if (startHasProperty < `0`) {
728	startHasProperty = ch;
729	}
730	} else if (startHasProperty >= `0`) {
731	add(startHasProperty, ch-`1`);
732	startHasProperty = -`1`;
733	}
734	}
735	}
736	if (startHasProperty >= `0`) {
737	add((UChar32)startHasProperty, (UChar32)`0x10FFFF`);
738	}
739	if (isBogus() && U_SUCCESS(status)) {
740	// We likely ran out of memory. AHHH!
741	status = U_MEMORY_ALLOCATION_ERROR;
742	}
743	}
744
745	namespace {
746
747	static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
748	/ Note: we use ' ' in compiler code page /
749	int32_t j = `0`;
750	char ch;
751	--dstCapacity; / make room for term. zero /
752	while ((ch = *src++) != `0`) {
753	if (ch == `' '` && (j==`0` \|\| (j>`0` && dst[j-`1`]==`' '`))) {
754	continue;
755	}
756	if (j >= dstCapacity) return false;
757	dst[j++] = ch;
758	}
759	if (j > `0` && dst[j-`1`] == `' '`) --j;
760	dst[j] = `0`;
761	return true;
762	}
763
764	} // namespace
765
766	//----------------------------------------------------------------
767	// Property set API
768	//----------------------------------------------------------------
769
// Sets ec to U_ILLEGAL_ARGUMENT_ERROR and returns *this from the
// enclosing UnicodeSet member function.
#define FAIL(ec) UPRV_BLOCK_MACRO_BEGIN { \
    ec=U_ILLEGAL_ARGUMENT_ERROR;          \
    return *this;                         \
} UPRV_BLOCK_MACRO_END
774
775	UnicodeSet&
776	UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
777	if (U_FAILURE(ec) \|\| isFrozen()) { return *this; }
778	if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
779	const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
780	applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);
781	} else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
782	const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
783	UScriptCode script = (UScriptCode)value;
784	applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
785	} else if (`0` <= prop && prop < UCHAR_BINARY_LIMIT) {
786	if (value == `0` \|\| value == `1`) {
787	const USet *set = u_getBinaryPropertySet(prop, &ec);
788	if (U_FAILURE(ec)) { return *this; }
789	copyFrom(UnicodeSet::fromUSet(set), true*);
790	if (value == `0`) {
791	complement().removeAllStrings(); // code point complement
792	}
793	} else {
794	clear();
795	}
796	} else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
797	const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
798	IntPropertyContext c = {prop, value};
799	applyFilter(intPropertyFilter, &c, inclusions, ec);
800	} else {
801	ec = U_ILLEGAL_ARGUMENT_ERROR;
802	}
803	return *this;
804	}
805
806	UnicodeSet&
807	UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
808	const UnicodeString& value,
809	UErrorCode& ec) {
810	if (U_FAILURE(ec) \|\| isFrozen()) return *this;
811
812	// prop and value used to be converted to char using the default*
813	// converter instead of the invariant conversion.
814	// This should not be necessary because all Unicode property and value
815	// names use only invariant characters.
816	// If there are any variant characters, then we won't find them anyway.
817	// Checking first avoids assertion failures in the conversion.
818	if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) \|\|
819	!uprv_isInvariantUString(value.getBuffer(), value.length())
820	) {
821	FAIL(ec);
822	}
823	CharString pname, vname;
824	pname.appendInvariantChars(prop, ec);
825	vname.appendInvariantChars(value, ec);
826	if (U_FAILURE(ec)) return *this;
827
828	UProperty p;
829	int32_t v;
830	UBool invert = false;
831
832	if (value.length() > `0`) {
833	p = u_getPropertyEnum(pname.data());
834	if (p == UCHAR_INVALID_CODE) FAIL(ec);
835
836	// Treat gc as gcm
837	if (p == UCHAR_GENERAL_CATEGORY) {
838	p = UCHAR_GENERAL_CATEGORY_MASK;
839	}
840
841	if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) \|\|
842	(p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) \|\|
843	(p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
844	v = u_getPropertyValueEnum(p, vname.data());
845	if (v == UCHAR_INVALID_CODE) {
846	// Handle numeric CCC
847	if (p == UCHAR_CANONICAL_COMBINING_CLASS \|\|
848	p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS \|\|
849	p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
850	char* end;
851	double val = uprv_strtod(vname.data(), &end);
852	// Anything between 0 and 255 is valid even if unused.
853	// Cast double->int only after range check.
854	// We catch NaN here because comparing it with both 0 and 255 will be false
855	// (as are all comparisons with NaN).
856	if (*end != `0` \|\| !(`0` <= val && val <= `255`) \|\|
857	(v = (int32_t)val) != val) {
858	// non-integral value or outside 0..255, or trailing junk
859	FAIL(ec);
860	}
861	} else {
862	FAIL(ec);
863	}
864	}
865	}
866
867	else {
868
869	switch (p) {
870	case UCHAR_NUMERIC_VALUE:
871	{
872	char* end;
873	double val = uprv_strtod(vname.data(), &end);
874	if (*end != `0`) {
875	FAIL(ec);
876	}
877	applyFilter(numericValueFilter, &val,
878	CharacterProperties::getInclusionsForProperty(p, ec), ec);
879	return *this;
880	}
881	case UCHAR_NAME:
882	{
883	// Must munge name, since u_charFromName() does not do
884	// 'loose' matching.
885	char buf[`128`]; // it suffices that this be > uprv_getMaxCharNameLength
886	if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
887	UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
888	if (U_SUCCESS(ec)) {
889	clear();
890	add(ch);
891	return *this;
892	} else {
893	FAIL(ec);
894	}
895	}
896	case UCHAR_UNICODE_1_NAME:
897	// ICU 49 deprecates the Unicode_1_Name property APIs.
898	FAIL(ec);
899	case UCHAR_AGE:
900	{
901	// Must munge name, since u_versionFromString() does not do
902	// 'loose' matching.
903	char buf[`128`];
904	if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
905	UVersionInfo version;
906	u_versionFromString(version, buf);
907	applyFilter(versionFilter, &version,
908	CharacterProperties::getInclusionsForProperty(p, ec), ec);
909	return *this;
910	}
911	case UCHAR_SCRIPT_EXTENSIONS:
912	v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
913	if (v == UCHAR_INVALID_CODE) {
914	FAIL(ec);
915	}
916	// fall through to calling applyIntPropertyValue()
917	break;
918	default:
919	// p is a non-binary, non-enumerated property that we
920	// don't support (yet).
921	FAIL(ec);
922	}
923	}
924	}
925
926	else {
927	// value is empty. Interpret as General Category, Script, or
928	// Binary property.
929	p = UCHAR_GENERAL_CATEGORY_MASK;
930	v = u_getPropertyValueEnum(p, pname.data());
931	if (v == UCHAR_INVALID_CODE) {
932	p = UCHAR_SCRIPT;
933	v = u_getPropertyValueEnum(p, pname.data());
934	if (v == UCHAR_INVALID_CODE) {
935	p = u_getPropertyEnum(pname.data());
936	if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
937	v = `1`;
938	} else if (`0` == uprv_comparePropertyNames(ANY, pname.data())) {
939	set(MIN_VALUE, MAX_VALUE);
940	return *this;
941	} else if (`0` == uprv_comparePropertyNames(ASCII, pname.data())) {
942	set(`0`, `0x7F`);
943	return *this;
944	} else if (`0` == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
945	// [:Assigned:]=[:^Cn:]
946	p = UCHAR_GENERAL_CATEGORY_MASK;
947	v = U_GC_CN_MASK;
948	invert = true;
949	} else {
950	FAIL(ec);
951	}
952	}
953	}
954	}
955
956	applyIntPropertyValue(p, v, ec);
957	if(invert) {
958	complement().removeAllStrings(); // code point complement
959	}
960
961	if (isBogus() && U_SUCCESS(ec)) {
962	// We likely ran out of memory. AHHH!
963	ec = U_MEMORY_ALLOCATION_ERROR;
964	}
965	return *this;
966	}
967
968	//----------------------------------------------------------------
969	// Property set patterns
970	//----------------------------------------------------------------
971
972	/**
973	* Return true if the given position, in the given pattern, appears
974	* to be the start of a property set pattern.
975	*/
976	UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
977	int32_t pos) {
978	// Patterns are at least 5 characters long
979	if ((pos+`5`) > pattern.length()) {
980	return false;
981	}
982
983	// Look for an opening [:, [:^, \p, or \P
984	return isPOSIXOpen(pattern, pos) \|\| isPerlOpen(pattern, pos) \|\| isNameOpen(pattern, pos);
985	}
986
987	/**
988	* Return true if the given iterator appears to point at a
989	* property pattern. Regardless of the result, return with the
990	* iterator unchanged.
991	* @param chars iterator over the pattern characters. Upon return
992	* it will be unchanged.
993	* @param iterOpts RuleCharacterIterator options
994	*/
995	UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
996	int32_t iterOpts) {
997	// NOTE: literal will always be false, because we don't parse escapes.
998	UBool result = false, literal;
999	UErrorCode ec = U_ZERO_ERROR;
1000	iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1001	RuleCharacterIterator::Pos pos;
1002	chars.getPos(pos);
1003	UChar32 c = chars.next(iterOpts, literal, ec);
1004	if (c == u`'['` \|\| c == u`'\\'`) {
1005	UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1006	literal, ec);
1007	result = (c == u`'['`) ? (d == u`':'`) :
1008	(d == u`'N'` \|\| d == u`'p'` \|\| d == u`'P'`);
1009	}
1010	chars.setPos(pos);
1011	return result && U_SUCCESS(ec);
1012	}
1013
1014	/**
1015	* Parse the given property pattern at the given parse position.
1016	*/
1017	UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1018	ParsePosition& ppos,
1019	UErrorCode &ec) {
1020	int32_t pos = ppos.getIndex();
1021
1022	UBool posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1023	UBool isName = false; // true for \N{pat}, o/w false
1024	UBool invert = false;
1025
1026	if (U_FAILURE(ec)) return *this;
1027
1028	// Minimum length is 5 characters, e.g. \p{L}
1029	if ((pos+`5`) > pattern.length()) {
1030	FAIL(ec);
1031	}
1032
1033	// On entry, ppos should point to one of the following locations:
1034	// Look for an opening [:, [:^, \p, or \P
1035	if (isPOSIXOpen(pattern, pos)) {
1036	posix = true;
1037	pos += `2`;
1038	pos = ICU_Utility::skipWhitespace(pattern, pos);
1039	if (pos < pattern.length() && pattern.charAt(pos) == u`'^'`) {
1040	++pos;
1041	invert = true;
1042	}
1043	} else if (isPerlOpen(pattern, pos) \|\| isNameOpen(pattern, pos)) {
1044	char16_t c = pattern.charAt(pos+`1`);
1045	invert = (c == u`'P'`);
1046	isName = (c == u`'N'`);
1047	pos += `2`;
1048	pos = ICU_Utility::skipWhitespace(pattern, pos);
1049	if (pos == pattern.length() \|\| pattern.charAt(pos++) != u`'{'`) {
1050	// Syntax error; "\p" or "\P" not followed by "{"
1051	FAIL(ec);
1052	}
1053	} else {
1054	// Open delimiter not seen
1055	FAIL(ec);
1056	}
1057
1058	// Look for the matching close delimiter, either :] or }
1059	int32_t close;
1060	if (posix) {
1061	close = pattern.indexOf(u":]", `2`, pos);
1062	} else {
1063	close = pattern.indexOf(u`'}'`, pos);
1064	}
1065	if (close < `0`) {
1066	// Syntax error; close delimiter missing
1067	FAIL(ec);
1068	}
1069
1070	// Look for an '=' sign. If this is present, we will parse a
1071	// medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1072	// pattern.
1073	int32_t equals = pattern.indexOf(u`'='`, pos);
1074	UnicodeString propName, valueName;
1075	if (equals >= `0` && equals < close && !isName) {
1076	// Equals seen; parse medium/long pattern
1077	pattern.extractBetween(pos, equals, propName);
1078	pattern.extractBetween(equals+`1`, close, valueName);
1079	}
1080
1081	else {
1082	// Handle case where no '=' is seen, and \N{}
1083	pattern.extractBetween(pos, close, propName);
1084
1085	// Handle \N{name}
1086	if (isName) {
1087	// This is a little inefficient since it means we have to
1088	// parse NAME_PROP back to UCHAR_NAME even though we already
1089	// know it's UCHAR_NAME. If we refactor the API to
1090	// support args of (UProperty, char) then we can remove*
1091	// NAME_PROP and make this a little more efficient.
1092	valueName = propName;
1093	propName = UnicodeString (NAME_PROP, NAME_PROP_LENGTH, US_INV);
1094	}
1095	}
1096
1097	applyPropertyAlias(propName, valueName, ec);
1098
1099	if (U_SUCCESS(ec)) {
1100	if (invert) {
1101	complement().removeAllStrings(); // code point complement
1102	}
1103
1104	// Move to the limit position after the close delimiter if the
1105	// parse succeeded.
1106	ppos.setIndex(close + (posix ? `2` : `1`));
1107	}
1108
1109	return *this;
1110	}
1111
1112	/**
1113	* Parse a property pattern.
1114	* @param chars iterator over the pattern characters. Upon return
1115	* it will be advanced to the first character after the parsed
1116	* pattern, or the end of the iteration if all characters are
1117	* parsed.
1118	* @param rebuiltPat the pattern that was parsed, rebuilt or
1119	* copied from the input pattern, as appropriate.
1120	*/
1121	void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1122	UnicodeString& rebuiltPat,
1123	UErrorCode& ec) {
1124	if (U_FAILURE(ec)) return;
1125	UnicodeString pattern;
1126	chars.lookahead(pattern);
1127	ParsePosition pos(`0`);
1128	applyPropertyPattern(pattern, pos, ec);
1129	if (U_FAILURE(ec)) return;
1130	if (pos.getIndex() == `0`) {
1131	// syntaxError(chars, "Invalid property pattern");
1132	ec = U_MALFORMED_SET;
1133	return;
1134	}
1135	chars.jumpahead(pos.getIndex());
1136	rebuiltPat.append(pattern, `0`, pos.getIndex());
1137	}
1138
1139	U_NAMESPACE_END
1140

Browse the source code of Godot/thirdparty/icu4c/common/uniset_props.cpp