uniset_props.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/uniset_props.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	*
6	* Copyright (C) 1999-2014, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	*******************************************************************************
10	* file name: uniset_props.cpp
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:4
14	*
15	* created on: 2004aug25
16	* created by: Markus W. Scherer
17	*
18	* Character property dependent functions moved here from uniset.cpp
19	*/
20
21	#include "unicode/utypes.h"
22	#include "unicode/uniset.h"
23	#include "unicode/parsepos.h"
24	#include "unicode/uchar.h"
25	#include "unicode/uscript.h"
26	#include "unicode/symtable.h"
27	#include "unicode/uset.h"
28	#include "unicode/locid.h"
29	#include "unicode/brkiter.h"
30	#include "uset_imp.h"
31	#include "ruleiter.h"
32	#include "cmemory.h"
33	#include "ucln_cmn.h"
34	#include "util.h"
35	#include "uvector.h"
36	#include "uprops.h"
37	#include "propname.h"
38	#include "normalizer2impl.h"
39	#include "uinvchar.h"
40	#include "uprops.h"
41	#include "charstr.h"
42	#include "cstring.h"
43	#include "mutex.h"
44	#include "umutex.h"
45	#include "uassert.h"
46	#include "hash.h"
47
48	U_NAMESPACE_USE
49
50	// Define UChar constants using hex for EBCDIC compatibility
51	// Used #define to reduce private static exports and memory access time.
52	#define SET_OPEN ((UChar)0x005B) /[/
53	#define SET_CLOSE ((UChar)0x005D) /]/
54	#define HYPHEN ((UChar)0x002D) /-/
55	#define COMPLEMENT ((UChar)0x005E) /^/
56	#define COLON ((UChar)0x003A) /:/
57	#define BACKSLASH ((UChar)0x005C) /\/
58	#define INTERSECTION ((UChar)0x0026) /&/
59	#define UPPER_U ((UChar)0x0055) /U/
60	#define LOWER_U ((UChar)0x0075) /u/
61	#define OPEN_BRACE ((UChar)123) /{/
62	#define CLOSE_BRACE ((UChar)125) /}/
63	#define UPPER_P ((UChar)0x0050) /P/
64	#define LOWER_P ((UChar)0x0070) /p/
65	#define UPPER_N ((UChar)78) /N/
66	#define EQUALS ((UChar)0x003D) /=/
67
68	//static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:"
69	static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,`0` }; // ":]"
70	//static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p"
71	//static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}"
72	//static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N"
73	static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,`0`}; /-]/
74
75	// Special property set IDs
// Names accepted in place of a property name; each comment gives the
// equivalent set.
static constexpr char ANY[]      = "ANY";      // [\u0000-\U0010FFFF]
static constexpr char ASCII[]    = "ASCII";    // [\u0000-\u007F]
static constexpr char ASSIGNED[] = "Assigned"; // [:^Cn:]

// Short alias of the Unicode character-name property, and its length.
#define NAME_PROP "na"
#define NAME_PROP_LENGTH 2
83
/**
 * Delimiter string used in patterns to close a category reference:
 * ":]".  Example: "[:Lu:]".
 */
//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
89
90	// Cached sets ------------------------------------------------------------- ***
91
92	U_CDECL_BEGIN
93	static UBool U_CALLCONV uset_cleanup();
94
95	static UnicodeSet *uni32Singleton;
96	static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER;
97
98	/**
99	* Cleanup function for UnicodeSet
100	*/
101	static UBool U_CALLCONV uset_cleanup(void) {
102	delete uni32Singleton;
103	uni32Singleton = NULL;
104	uni32InitOnce.reset();
105	return TRUE;
106	}
107
108	U_CDECL_END
109
110	U_NAMESPACE_BEGIN
111
112	namespace {
113
114	// Cache some sets for other services -------------------------------------- ***
115	void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
116	U_ASSERT(uni32Singleton == NULL);
117	uni32Singleton = new UnicodeSet (UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode);
118	if(uni32Singleton==NULL) {
119	errorCode=U_MEMORY_ALLOCATION_ERROR;
120	} else {
121	uni32Singleton->freeze();
122	}
123	ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
124	}
125
126
127	U_CFUNC UnicodeSet *
128	uniset_getUnicode32Instance(UErrorCode &errorCode) {
129	umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
130	return uni32Singleton;
131	}
132
133	// helper functions for matching of pattern syntax pieces ------------------ ***
134	// these functions are parallel to the PERL_OPEN etc. strings above
135
136	// using these functions is not only faster than UnicodeString::compare() and
137	// caseCompare(), but they also make UnicodeSet work for simple patterns when
138	// no Unicode properties data is available - when caseCompare() fails
139
140	static inline UBool
141	isPerlOpen(const UnicodeString &pattern, int32_t pos) {
142	UChar c;
143	return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+`1`))==LOWER_P \|\| c==UPPER_P);
144	}
145
/*static inline UBool
isPerlClose(const UnicodeString &pattern, int32_t pos) {
    return pattern.charAt(pos)==CLOSE_BRACE;
}*/
150
151	static inline UBool
152	isNameOpen(const UnicodeString &pattern, int32_t pos) {
153	return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+`1`)==UPPER_N;
154	}
155
156	static inline UBool
157	isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
158	return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+`1`)==COLON;
159	}
160
/*static inline UBool
isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
    return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
}*/
165
// TODO: uniset.cpp provides memory debugging; it could be made available
// here, but is probably obsolete now that modern memory leak checkers exist.
// Until then this hook expands to nothing.
#define _dbgct(me)
170
171	} // namespace
172
173	//----------------------------------------------------------------
174	// Constructors &c
175	//----------------------------------------------------------------
176
177	/**
178	* Constructs a set from the given pattern, optionally ignoring
179	* white space. See the class description for the syntax of the
180	* pattern language.
181	* @param pattern a string specifying what characters are in the set
182	*/
183	UnicodeSet::UnicodeSet(const UnicodeString& pattern,
184	UErrorCode& status) {
185	applyPattern(pattern, status);
186	_dbgct(this);
187	}
188
189	//----------------------------------------------------------------
190	// Public API
191	//----------------------------------------------------------------
192
193	UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
194	UErrorCode& status) {
195	// Equivalent to
196	// return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
197	// but without dependency on closeOver().
198	ParsePosition pos(`0`);
199	applyPatternIgnoreSpace(pattern, pos, NULL, status);
200	if (U_FAILURE(status)) return *this;
201
202	int32_t i = pos.getIndex();
203	// Skip over trailing whitespace
204	ICU_Utility::skipWhitespace(pattern, i, TRUE);
205	if (i != pattern.length()) {
206	status = U_ILLEGAL_ARGUMENT_ERROR;
207	}
208	return *this;
209	}
210
211	void
212	UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
213	ParsePosition& pos,
214	const SymbolTable* symbols,
215	UErrorCode& status) {
216	if (U_FAILURE(status)) {
217	return;
218	}
219	if (isFrozen()) {
220	status = U_NO_WRITE_PERMISSION;
221	return;
222	}
223	// Need to build the pattern in a temporary string because
224	// _applyPattern calls add() etc., which set pat to empty.
225	UnicodeString rebuiltPat;
226	RuleCharacterIterator chars(pattern, symbols, pos);
227	applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, `0`, status);
228	if (U_FAILURE(status)) return;
229	if (chars.inVariable()) {
230	// syntaxError(chars, "Extra chars in variable value");
231	status = U_MALFORMED_SET;
232	return;
233	}
234	setPattern(rebuiltPat);
235	}
236
237	/**
238	* Return true if the given position, in the given pattern, appears
239	* to be the start of a UnicodeSet pattern.
240	*/
241	UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
242	return ((pos+`1`) < pattern.length() &&
243	pattern.charAt(pos) == (UChar)`91`/[/) \|\|
244	resemblesPropertyPattern(pattern, pos);
245	}
246
247	//----------------------------------------------------------------
248	// Implementation: Pattern parsing
249	//----------------------------------------------------------------
250
251	namespace {
252
253	/**
254	* A small all-inline class to manage a UnicodeSet pointer. Add
255	* operator->() etc. as needed.
256	*/
257	class UnicodeSetPointer {
258	UnicodeSet* p;
259	public:
260	inline UnicodeSetPointer() : p(`0`) {}
261	inline ~UnicodeSetPointer() { delete p; }
262	inline UnicodeSet* pointer() { return p; }
263	inline UBool allocate() {
264	if (p == `0`) {
265	p = new UnicodeSet ();
266	}
267	return p != `0`;
268	}
269	};
270
271	constexpr int32_t MAX_DEPTH = `100`;
272
273	} // namespace
274
275	/**
276	* Parse the pattern from the given RuleCharacterIterator. The
277	* iterator is advanced over the parsed pattern.
278	* @param chars iterator over the pattern characters. Upon return
279	* it will be advanced to the first character after the parsed
280	* pattern, or the end of the iteration if all characters are
281	* parsed.
282	* @param symbols symbol table to use to parse and dereference
283	* variables, or null if none.
284	* @param rebuiltPat the pattern that was parsed, rebuilt or
285	* copied from the input pattern, as appropriate.
286	* @param options a bit mask of zero or more of the following:
287	* IGNORE_SPACE, CASE.
288	*/
289	void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
290	const SymbolTable* symbols,
291	UnicodeString& rebuiltPat,
292	uint32_t options,
293	UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
294	int32_t depth,
295	UErrorCode& ec) {
296	if (U_FAILURE(ec)) return;
297	if (depth > MAX_DEPTH) {
298	ec = U_ILLEGAL_ARGUMENT_ERROR;
299	return;
300	}
301
302	// Syntax characters: [ ] ^ - & { }
303
304	// Recognized special forms for chars, sets: c-c s-s s&s
305
306	int32_t opts = RuleCharacterIterator::PARSE_VARIABLES \|
307	RuleCharacterIterator::PARSE_ESCAPES;
308	if ((options & USET_IGNORE_SPACE) != `0`) {
309	opts \|= RuleCharacterIterator::SKIP_WHITESPACE;
310	}
311
312	UnicodeString patLocal, buf;
313	UBool usePat = FALSE;
314	UnicodeSetPointer scratch;
315	RuleCharacterIterator::Pos backup;
316
317	// mode: 0=before [, 1=between [...], 2=after ]
318	// lastItem: 0=none, 1=char, 2=set
319	int8_t lastItem = `0`, mode = `0`;
320	UChar32 lastChar = `0`;
321	UChar op = `0`;
322
323	UBool invert = FALSE;
324
325	clear();
326
327	while (mode != `2` && !chars.atEnd()) {
328	U_ASSERT((lastItem == `0` && op == `0`) \|\|
329	(lastItem == `1` && (op == `0` \|\| op == HYPHEN /'-'/)) \|\|
330	(lastItem == `2` && (op == `0` \|\| op == HYPHEN /'-'/ \|\|
331	op == INTERSECTION /'&'/)));
332
333	UChar32 c = `0`;
334	UBool literal = FALSE;
335	UnicodeSet* nested = `0`; // alias - do not delete
336
337	// -------- Check for property pattern
338
339	// setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
340	int8_t setMode = `0`;
341	if (resemblesPropertyPattern(chars, opts)) {
342	setMode = `2`;
343	}
344
345	// -------- Parse '[' of opening delimiter OR nested set.
346	// If there is a nested set, use `setMode' to define how
347	// the set should be parsed. If the '[' is part of the
348	// opening delimiter for this pattern, parse special
349	// strings "[", "[^", "[-", and "[^-". Check for stand-in
350	// characters representing a nested set in the symbol
351	// table.
352
353	else {
354	// Prepare to backup if necessary
355	chars.getPos(backup);
356	c = chars.next(opts, literal, ec);
357	if (U_FAILURE(ec)) return;
358
359	if (c == `0x5B` /'['/ && !literal) {
360	if (mode == `1`) {
361	chars.setPos(backup); // backup
362	setMode = `1`;
363	} else {
364	// Handle opening '[' delimiter
365	mode = `1`;
366	patLocal.append((UChar) `0x5B` /'['/);
367	chars.getPos(backup); // prepare to backup
368	c = chars.next(opts, literal, ec);
369	if (U_FAILURE(ec)) return;
370	if (c == `0x5E` /'^'/ && !literal) {
371	invert = TRUE;
372	patLocal.append((UChar) `0x5E` /'^'/);
373	chars.getPos(backup); // prepare to backup
374	c = chars.next(opts, literal, ec);
375	if (U_FAILURE(ec)) return;
376	}
377	// Fall through to handle special leading '-';
378	// otherwise restart loop for nested [], \p{}, etc.
379	if (c == HYPHEN /'-'/) {
380	literal = TRUE;
381	// Fall through to handle literal '-' below
382	} else {
383	chars.setPos(backup); // backup
384	continue;
385	}
386	}
387	} else if (symbols != `0`) {
388	const UnicodeFunctor *m = symbols->lookupMatcher(c);
389	if (m != `0`) {
390	const UnicodeSet ms = dynamic_cast<const* UnicodeSet *>(m);
391	if (ms == NULL) {
392	ec = U_MALFORMED_SET;
393	return;
394	}
395	// casting away const, but `nested' won't be modified
396	// (important not to modify stored set)
397	nested = const_cast<UnicodeSet*>(ms);
398	setMode = `3`;
399	}
400	}
401	}
402
403	// -------- Handle a nested set. This either is inline in
404	// the pattern or represented by a stand-in that has
405	// previously been parsed and was looked up in the symbol
406	// table.
407
408	if (setMode != `0`) {
409	if (lastItem == `1`) {
410	if (op != `0`) {
411	// syntaxError(chars, "Char expected after operator");
412	ec = U_MALFORMED_SET;
413	return;
414	}
415	add(lastChar, lastChar);
416	_appendToPat(patLocal, lastChar, FALSE);
417	lastItem = `0`;
418	op = `0`;
419	}
420
421	if (op == HYPHEN /'-'/ \|\| op == INTERSECTION /'&'/) {
422	patLocal.append(op);
423	}
424
425	if (nested == `0`) {
426	// lazy allocation
427	if (!scratch.allocate()) {
428	ec = U_MEMORY_ALLOCATION_ERROR;
429	return;
430	}
431	nested = scratch.pointer();
432	}
433	switch (setMode) {
434	case `1`:
435	nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + `1`, ec);
436	break;
437	case `2`:
438	chars.skipIgnored(opts);
439	nested->applyPropertyPattern(chars, patLocal, ec);
440	if (U_FAILURE(ec)) return;
441	break;
442	case `3`: // `nested' already parsed
443	nested->_toPattern(patLocal, FALSE);
444	break;
445	}
446
447	usePat = TRUE;
448
449	if (mode == `0`) {
450	// Entire pattern is a category; leave parse loop
451	*this = *nested;
452	mode = `2`;
453	break;
454	}
455
456	switch (op) {
457	case HYPHEN: /'-'/
458	removeAll(*nested);
459	break;
460	case INTERSECTION: /'&'/
461	retainAll(*nested);
462	break;
463	case `0`:
464	addAll(*nested);
465	break;
466	}
467
468	op = `0`;
469	lastItem = `2`;
470
471	continue;
472	}
473
474	if (mode == `0`) {
475	// syntaxError(chars, "Missing '['");
476	ec = U_MALFORMED_SET;
477	return;
478	}
479
480	// -------- Parse special (syntax) characters. If the
481	// current character is not special, or if it is escaped,
482	// then fall through and handle it below.
483
484	if (!literal) {
485	switch (c) {
486	case `0x5D` /']'/:
487	if (lastItem == `1`) {
488	add(lastChar, lastChar);
489	_appendToPat(patLocal, lastChar, FALSE);
490	}
491	// Treat final trailing '-' as a literal
492	if (op == HYPHEN /'-'/) {
493	add(op, op);
494	patLocal.append(op);
495	} else if (op == INTERSECTION /'&'/) {
496	// syntaxError(chars, "Trailing '&'");
497	ec = U_MALFORMED_SET;
498	return;
499	}
500	patLocal.append((UChar) `0x5D` /']'/);
501	mode = `2`;
502	continue;
503	case HYPHEN /'-'/:
504	if (op == `0`) {
505	if (lastItem != `0`) {
506	op = (UChar) c;
507	continue;
508	} else {
509	// Treat final trailing '-' as a literal
510	add(c, c);
511	c = chars.next(opts, literal, ec);
512	if (U_FAILURE(ec)) return;
513	if (c == `0x5D` /']'/ && !literal) {
514	patLocal.append(HYPHEN_RIGHT_BRACE, `2`);
515	mode = `2`;
516	continue;
517	}
518	}
519	}
520	// syntaxError(chars, "'-' not after char or set");
521	ec = U_MALFORMED_SET;
522	return;
523	case INTERSECTION /'&'/:
524	if (lastItem == `2` && op == `0`) {
525	op = (UChar) c;
526	continue;
527	}
528	// syntaxError(chars, "'&' not after set");
529	ec = U_MALFORMED_SET;
530	return;
531	case `0x5E` /'^'/:
532	// syntaxError(chars, "'^' not after '['");
533	ec = U_MALFORMED_SET;
534	return;
535	case `0x7B` /'{'/:
536	if (op != `0`) {
537	// syntaxError(chars, "Missing operand after operator");
538	ec = U_MALFORMED_SET;
539	return;
540	}
541	if (lastItem == `1`) {
542	add(lastChar, lastChar);
543	_appendToPat(patLocal, lastChar, FALSE);
544	}
545	lastItem = `0`;
546	buf.truncate(`0`);
547	{
548	UBool ok = FALSE;
549	while (!chars.atEnd()) {
550	c = chars.next(opts, literal, ec);
551	if (U_FAILURE(ec)) return;
552	if (c == `0x7D` /'}'/ && !literal) {
553	ok = TRUE;
554	break;
555	}
556	buf.append(c);
557	}
558	if (buf.length() < `1` \|\| !ok) {
559	// syntaxError(chars, "Invalid multicharacter string");
560	ec = U_MALFORMED_SET;
561	return;
562	}
563	}
564	// We have new string. Add it to set and continue;
565	// we don't need to drop through to the further
566	// processing
567	add(buf);
568	patLocal.append((UChar) `0x7B` /'{'/);
569	_appendToPat(patLocal, buf, FALSE);
570	patLocal.append((UChar) `0x7D` /'}'/);
571	continue;
572	case SymbolTable::SYMBOL_REF:
573	// symbols nosymbols
574	// [a-$] error error (ambiguous)
575	// [a$] anchor anchor
576	// [a-$x] var "x" literal '$'*
577	// [a-$.] error literal '$'
578	// We won't get here in the case of var "x"*
579	{
580	chars.getPos(backup);
581	c = chars.next(opts, literal, ec);
582	if (U_FAILURE(ec)) return;
583	UBool anchor = (c == `0x5D` /']'/ && !literal);
584	if (symbols == `0` && !anchor) {
585	c = SymbolTable::SYMBOL_REF;
586	chars.setPos(backup);
587	break; // literal '$'
588	}
589	if (anchor && op == `0`) {
590	if (lastItem == `1`) {
591	add(lastChar, lastChar);
592	_appendToPat(patLocal, lastChar, FALSE);
593	}
594	add(U_ETHER);
595	usePat = TRUE;
596	patLocal.append((UChar) SymbolTable::SYMBOL_REF);
597	patLocal.append((UChar) `0x5D` /']'/);
598	mode = `2`;
599	continue;
600	}
601	// syntaxError(chars, "Unquoted '$'");
602	ec = U_MALFORMED_SET;
603	return;
604	}
605	default:
606	break;
607	}
608	}
609
610	// -------- Parse literal characters. This includes both
611	// escaped chars ("\u4E01") and non-syntax characters
612	// ("a").
613
614	switch (lastItem) {
615	case `0`:
616	lastItem = `1`;
617	lastChar = c;
618	break;
619	case `1`:
620	if (op == HYPHEN /'-'/) {
621	if (lastChar >= c) {
622	// Don't allow redundant (a-a) or empty (b-a) ranges;
623	// these are most likely typos.
624	// syntaxError(chars, "Invalid range");
625	ec = U_MALFORMED_SET;
626	return;
627	}
628	add(lastChar, c);
629	_appendToPat(patLocal, lastChar, FALSE);
630	patLocal.append(op);
631	_appendToPat(patLocal, c, FALSE);
632	lastItem = `0`;
633	op = `0`;
634	} else {
635	add(lastChar, lastChar);
636	_appendToPat(patLocal, lastChar, FALSE);
637	lastChar = c;
638	}
639	break;
640	case `2`:
641	if (op != `0`) {
642	// syntaxError(chars, "Set expected after operator");
643	ec = U_MALFORMED_SET;
644	return;
645	}
646	lastChar = c;
647	lastItem = `1`;
648	break;
649	}
650	}
651
652	if (mode != `2`) {
653	// syntaxError(chars, "Missing ']'");
654	ec = U_MALFORMED_SET;
655	return;
656	}
657
658	chars.skipIgnored(opts);
659
660	/**
661	* Handle global flags (invert, case insensitivity). If this
662	* pattern should be compiled case-insensitive, then we need
663	* to close over case BEFORE COMPLEMENTING. This makes
664	* patterns like /[^abc]/i work.
665	*/
666	if ((options & USET_CASE_INSENSITIVE) != `0`) {
667	(this->*caseClosure)(USET_CASE_INSENSITIVE);
668	}
669	else if ((options & USET_ADD_CASE_MAPPINGS) != `0`) {
670	(this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
671	}
672	if (invert) {
673	complement();
674	}
675
676	// Use the rebuilt pattern (patLocal) only if necessary. Prefer the
677	// generated pattern.
678	if (usePat) {
679	rebuiltPat.append(patLocal);
680	} else {
681	_generatePattern(rebuiltPat, FALSE);
682	}
683	if (isBogus() && U_SUCCESS(ec)) {
684	// We likely ran out of memory. AHHH!
685	ec = U_MEMORY_ALLOCATION_ERROR;
686	}
687	}
688
689	//----------------------------------------------------------------
690	// Property set implementation
691	//----------------------------------------------------------------
692
693	namespace {
694
695	static UBool numericValueFilter(UChar32 ch, void* context) {
696	return u_getNumericValue(ch) == (double**)context;
697	}
698
699	static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
700	int32_t value = (int32_t)context;
701	return (U_GET_GC_MASK((UChar32) ch) & value) != `0`;
702	}
703
704	static UBool versionFilter(UChar32 ch, void* context) {
705	static const UVersionInfo none = { `0`, `0`, `0`, `0` };
706	UVersionInfo v;
707	u_charAge(ch, v);
708	UVersionInfo* version = (UVersionInfo*)context;
709	return uprv_memcmp(&v, &none, sizeof(v)) > `0` && uprv_memcmp(&v, version, sizeof(v)) <= `0`;
710	}
711
712	typedef struct {
713	UProperty prop;
714	int32_t value;
715	} IntPropertyContext;
716
717	static UBool intPropertyFilter(UChar32 ch, void* context) {
718	IntPropertyContext* c = (IntPropertyContext*)context;
719	return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
720	}
721
722	static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
723	return uscript_hasScript(ch, (UScriptCode)context);
724	}
725
726	} // namespace
727
728	/**
729	* Generic filter-based scanning code for UCD property UnicodeSets.
730	*/
731	void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
732	void* context,
733	const UnicodeSet* inclusions,
734	UErrorCode &status) {
735	if (U_FAILURE(status)) return;
736
737	// Logically, walk through all Unicode characters, noting the start
738	// and end of each range for which filter.contain(c) is
739	// true. Add each range to a set.
740	//
741	// To improve performance, use an inclusions set which
742	// encodes information about character ranges that are known
743	// to have identical properties.
744	// inclusions contains the first characters of
745	// same-value ranges for the given property.
746
747	clear();
748
749	UChar32 startHasProperty = -`1`;
750	int32_t limitRange = inclusions->getRangeCount();
751
752	for (int j=`0`; j<limitRange; ++j) {
753	// get current range
754	UChar32 start = inclusions->getRangeStart(j);
755	UChar32 end = inclusions->getRangeEnd(j);
756
757	// for all the code points in the range, process
758	for (UChar32 ch = start; ch <= end; ++ch) {
759	// only add to this UnicodeSet on inflection points --
760	// where the hasProperty value changes to false
761	if ((*filter)(ch, context)) {
762	if (startHasProperty < `0`) {
763	startHasProperty = ch;
764	}
765	} else if (startHasProperty >= `0`) {
766	add(startHasProperty, ch-`1`);
767	startHasProperty = -`1`;
768	}
769	}
770	}
771	if (startHasProperty >= `0`) {
772	add((UChar32)startHasProperty, (UChar32)`0x10FFFF`);
773	}
774	if (isBogus() && U_SUCCESS(status)) {
775	// We likely ran out of memory. AHHH!
776	status = U_MEMORY_ALLOCATION_ERROR;
777	}
778	}
779
780	namespace {
781
782	static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
783	/ Note: we use ' ' in compiler code page /
784	int32_t j = `0`;
785	char ch;
786	--dstCapacity; / make room for term. zero /
787	while ((ch = *src++) != `0`) {
788	if (ch == `' '` && (j==`0` \|\| (j>`0` && dst[j-`1`]==`' '`))) {
789	continue;
790	}
791	if (j >= dstCapacity) return FALSE;
792	dst[j++] = ch;
793	}
794	if (j > `0` && dst[j-`1`] == `' '`) --j;
795	dst[j] = `0`;
796	return TRUE;
797	}
798
799	} // namespace
800
801	//----------------------------------------------------------------
802	// Property set API
803	//----------------------------------------------------------------
804
// Sets ec to U_ILLEGAL_ARGUMENT_ERROR and returns *this from the enclosing
// member function.
#define FAIL(ec) UPRV_BLOCK_MACRO_BEGIN { \
    ec=U_ILLEGAL_ARGUMENT_ERROR; \
    return *this; \
} UPRV_BLOCK_MACRO_END
809
810	UnicodeSet&
811	UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
812	if (U_FAILURE(ec) \|\| isFrozen()) { return *this; }
813	if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
814	const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
815	applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);
816	} else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
817	const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
818	UScriptCode script = (UScriptCode)value;
819	applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
820	} else if (`0` <= prop && prop < UCHAR_BINARY_LIMIT) {
821	if (value == `0` \|\| value == `1`) {
822	const USet *set = u_getBinaryPropertySet(prop, &ec);
823	if (U_FAILURE(ec)) { return *this; }
824	copyFrom(*UnicodeSet::fromUSet(set), TRUE);
825	if (value == `0`) {
826	complement();
827	}
828	} else {
829	clear();
830	}
831	} else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
832	const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
833	IntPropertyContext c = {prop, value};
834	applyFilter(intPropertyFilter, &c, inclusions, ec);
835	} else {
836	ec = U_ILLEGAL_ARGUMENT_ERROR;
837	}
838	return *this;
839	}
840
841	UnicodeSet&
842	UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
843	const UnicodeString& value,
844	UErrorCode& ec) {
845	if (U_FAILURE(ec) \|\| isFrozen()) return *this;
846
847	// prop and value used to be converted to char using the default*
848	// converter instead of the invariant conversion.
849	// This should not be necessary because all Unicode property and value
850	// names use only invariant characters.
851	// If there are any variant characters, then we won't find them anyway.
852	// Checking first avoids assertion failures in the conversion.
853	if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) \|\|
854	!uprv_isInvariantUString(value.getBuffer(), value.length())
855	) {
856	FAIL(ec);
857	}
858	CharString pname, vname;
859	pname.appendInvariantChars(prop, ec);
860	vname.appendInvariantChars(value, ec);
861	if (U_FAILURE(ec)) return *this;
862
863	UProperty p;
864	int32_t v;
865	UBool invert = FALSE;
866
867	if (value.length() > `0`) {
868	p = u_getPropertyEnum(pname.data());
869	if (p == UCHAR_INVALID_CODE) FAIL(ec);
870
871	// Treat gc as gcm
872	if (p == UCHAR_GENERAL_CATEGORY) {
873	p = UCHAR_GENERAL_CATEGORY_MASK;
874	}
875
876	if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) \|\|
877	(p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) \|\|
878	(p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
879	v = u_getPropertyValueEnum(p, vname.data());
880	if (v == UCHAR_INVALID_CODE) {
881	// Handle numeric CCC
882	if (p == UCHAR_CANONICAL_COMBINING_CLASS \|\|
883	p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS \|\|
884	p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
885	char* end;
886	double val = uprv_strtod(vname.data(), &end);
887	// Anything between 0 and 255 is valid even if unused.
888	// Cast double->int only after range check.
889	// We catch NaN here because comparing it with both 0 and 255 will be false
890	// (as are all comparisons with NaN).
891	if (*end != `0` \|\| !(`0` <= val && val <= `255`) \|\|
892	(v = (int32_t)val) != val) {
893	// non-integral value or outside 0..255, or trailing junk
894	FAIL(ec);
895	}
896	} else {
897	FAIL(ec);
898	}
899	}
900	}
901
902	else {
903
904	switch (p) {
905	case UCHAR_NUMERIC_VALUE:
906	{
907	char* end;
908	double val = uprv_strtod(vname.data(), &end);
909	if (*end != `0`) {
910	FAIL(ec);
911	}
912	applyFilter(numericValueFilter, &val,
913	CharacterProperties::getInclusionsForProperty(p, ec), ec);
914	return *this;
915	}
916	case UCHAR_NAME:
917	{
918	// Must munge name, since u_charFromName() does not do
919	// 'loose' matching.
920	char buf[`128`]; // it suffices that this be > uprv_getMaxCharNameLength
921	if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
922	UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
923	if (U_SUCCESS(ec)) {
924	clear();
925	add(ch);
926	return *this;
927	} else {
928	FAIL(ec);
929	}
930	}
931	case UCHAR_UNICODE_1_NAME:
932	// ICU 49 deprecates the Unicode_1_Name property APIs.
933	FAIL(ec);
934	case UCHAR_AGE:
935	{
936	// Must munge name, since u_versionFromString() does not do
937	// 'loose' matching.
938	char buf[`128`];
939	if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
940	UVersionInfo version;
941	u_versionFromString(version, buf);
942	applyFilter(versionFilter, &version,
943	CharacterProperties::getInclusionsForProperty(p, ec), ec);
944	return *this;
945	}
946	case UCHAR_SCRIPT_EXTENSIONS:
947	v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
948	if (v == UCHAR_INVALID_CODE) {
949	FAIL(ec);
950	}
951	// fall through to calling applyIntPropertyValue()
952	break;
953	default:
954	// p is a non-binary, non-enumerated property that we
955	// don't support (yet).
956	FAIL(ec);
957	}
958	}
959	}
960
961	else {
962	// value is empty. Interpret as General Category, Script, or
963	// Binary property.
964	p = UCHAR_GENERAL_CATEGORY_MASK;
965	v = u_getPropertyValueEnum(p, pname.data());
966	if (v == UCHAR_INVALID_CODE) {
967	p = UCHAR_SCRIPT;
968	v = u_getPropertyValueEnum(p, pname.data());
969	if (v == UCHAR_INVALID_CODE) {
970	p = u_getPropertyEnum(pname.data());
971	if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
972	v = `1`;
973	} else if (`0` == uprv_comparePropertyNames(ANY, pname.data())) {
974	set(MIN_VALUE, MAX_VALUE);
975	return *this;
976	} else if (`0` == uprv_comparePropertyNames(ASCII, pname.data())) {
977	set(`0`, `0x7F`);
978	return *this;
979	} else if (`0` == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
980	// [:Assigned:]=[:^Cn:]
981	p = UCHAR_GENERAL_CATEGORY_MASK;
982	v = U_GC_CN_MASK;
983	invert = TRUE;
984	} else {
985	FAIL(ec);
986	}
987	}
988	}
989	}
990
991	applyIntPropertyValue(p, v, ec);
992	if(invert) {
993	complement();
994	}
995
996	if (isBogus() && U_SUCCESS(ec)) {
997	// We likely ran out of memory. AHHH!
998	ec = U_MEMORY_ALLOCATION_ERROR;
999	}
1000	return *this;
1001	}
1002
1003	//----------------------------------------------------------------
1004	// Property set patterns
1005	//----------------------------------------------------------------
1006
1007	/**
1008	* Return true if the given position, in the given pattern, appears
1009	* to be the start of a property set pattern.
1010	*/
1011	UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
1012	int32_t pos) {
1013	// Patterns are at least 5 characters long
1014	if ((pos+`5`) > pattern.length()) {
1015	return FALSE;
1016	}
1017
1018	// Look for an opening [:, [:^, \p, or \P
1019	return isPOSIXOpen(pattern, pos) \|\| isPerlOpen(pattern, pos) \|\| isNameOpen(pattern, pos);
1020	}
1021
1022	/**
1023	* Return true if the given iterator appears to point at a
1024	* property pattern. Regardless of the result, return with the
1025	* iterator unchanged.
1026	* @param chars iterator over the pattern characters. Upon return
1027	* it will be unchanged.
1028	* @param iterOpts RuleCharacterIterator options
1029	*/
1030	UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
1031	int32_t iterOpts) {
1032	// NOTE: literal will always be FALSE, because we don't parse escapes.
1033	UBool result = FALSE, literal;
1034	UErrorCode ec = U_ZERO_ERROR;
1035	iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1036	RuleCharacterIterator::Pos pos;
1037	chars.getPos(pos);
1038	UChar32 c = chars.next(iterOpts, literal, ec);
1039	if (c == `0x5B` /'['/ \|\| c == `0x5C` /'\\'/) {
1040	UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1041	literal, ec);
1042	result = (c == `0x5B` /'['/) ? (d == `0x3A` /':'/) :
1043	(d == `0x4E` /'N'/ \|\| d == `0x70` /'p'/ \|\| d == `0x50` /'P'/);
1044	}
1045	chars.setPos(pos);
1046	return result && U_SUCCESS(ec);
1047	}
1048
1049	/**
1050	* Parse the given property pattern at the given parse position.
1051	*/
1052	UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1053	ParsePosition& ppos,
1054	UErrorCode &ec) {
1055	int32_t pos = ppos.getIndex();
1056
1057	UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1058	UBool isName = FALSE; // true for \N{pat}, o/w false
1059	UBool invert = FALSE;
1060
1061	if (U_FAILURE(ec)) return *this;
1062
1063	// Minimum length is 5 characters, e.g. \p{L}
1064	if ((pos+`5`) > pattern.length()) {
1065	FAIL(ec);
1066	}
1067
1068	// On entry, ppos should point to one of the following locations:
1069	// Look for an opening [:, [:^, \p, or \P
1070	if (isPOSIXOpen(pattern, pos)) {
1071	posix = TRUE;
1072	pos += `2`;
1073	pos = ICU_Utility::skipWhitespace(pattern, pos);
1074	if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
1075	++pos;
1076	invert = TRUE;
1077	}
1078	} else if (isPerlOpen(pattern, pos) \|\| isNameOpen(pattern, pos)) {
1079	UChar c = pattern.charAt(pos+`1`);
1080	invert = (c == UPPER_P);
1081	isName = (c == UPPER_N);
1082	pos += `2`;
1083	pos = ICU_Utility::skipWhitespace(pattern, pos);
1084	if (pos == pattern.length() \|\| pattern.charAt(pos++) != OPEN_BRACE) {
1085	// Syntax error; "\p" or "\P" not followed by "{"
1086	FAIL(ec);
1087	}
1088	} else {
1089	// Open delimiter not seen
1090	FAIL(ec);
1091	}
1092
1093	// Look for the matching close delimiter, either :] or }
1094	int32_t close;
1095	if (posix) {
1096	close = pattern.indexOf(POSIX_CLOSE, `2`, pos);
1097	} else {
1098	close = pattern.indexOf(CLOSE_BRACE, pos);
1099	}
1100	if (close < `0`) {
1101	// Syntax error; close delimiter missing
1102	FAIL(ec);
1103	}
1104
1105	// Look for an '=' sign. If this is present, we will parse a
1106	// medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1107	// pattern.
1108	int32_t equals = pattern.indexOf(EQUALS, pos);
1109	UnicodeString propName, valueName;
1110	if (equals >= `0` && equals < close && !isName) {
1111	// Equals seen; parse medium/long pattern
1112	pattern.extractBetween(pos, equals, propName);
1113	pattern.extractBetween(equals+`1`, close, valueName);
1114	}
1115
1116	else {
1117	// Handle case where no '=' is seen, and \N{}
1118	pattern.extractBetween(pos, close, propName);
1119
1120	// Handle \N{name}
1121	if (isName) {
1122	// This is a little inefficient since it means we have to
1123	// parse NAME_PROP back to UCHAR_NAME even though we already
1124	// know it's UCHAR_NAME. If we refactor the API to
1125	// support args of (UProperty, char) then we can remove*
1126	// NAME_PROP and make this a little more efficient.
1127	valueName = propName;
1128	propName = UnicodeString (NAME_PROP, NAME_PROP_LENGTH, US_INV);
1129	}
1130	}
1131
1132	applyPropertyAlias(propName, valueName, ec);
1133
1134	if (U_SUCCESS(ec)) {
1135	if (invert) {
1136	complement();
1137	}
1138
1139	// Move to the limit position after the close delimiter if the
1140	// parse succeeded.
1141	ppos.setIndex(close + (posix ? `2` : `1`));
1142	}
1143
1144	return *this;
1145	}
1146
1147	/**
1148	* Parse a property pattern.
1149	* @param chars iterator over the pattern characters. Upon return
1150	* it will be advanced to the first character after the parsed
1151	* pattern, or the end of the iteration if all characters are
1152	* parsed.
1153	* @param rebuiltPat the pattern that was parsed, rebuilt or
1154	* copied from the input pattern, as appropriate.
1155	*/
1156	void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1157	UnicodeString& rebuiltPat,
1158	UErrorCode& ec) {
1159	if (U_FAILURE(ec)) return;
1160	UnicodeString pattern;
1161	chars.lookahead(pattern);
1162	ParsePosition pos(`0`);
1163	applyPropertyPattern(pattern, pos, ec);
1164	if (U_FAILURE(ec)) return;
1165	if (pos.getIndex() == `0`) {
1166	// syntaxError(chars, "Invalid property pattern");
1167	ec = U_MALFORMED_SET;
1168	return;
1169	}
1170	chars.jumpahead(pos.getIndex());
1171	rebuiltPat.append(pattern, `0`, pos.getIndex());
1172	}
1173
1174	U_NAMESPACE_END
1175

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/uniset_props.cpp