1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 1999-2014, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: uniset_props.cpp
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2004aug25
16* created by: Markus W. Scherer
17*
18* Character property dependent functions moved here from uniset.cpp
19*/
20
21#include "unicode/utypes.h"
22#include "unicode/uniset.h"
23#include "unicode/parsepos.h"
24#include "unicode/uchar.h"
25#include "unicode/uscript.h"
26#include "unicode/symtable.h"
27#include "unicode/uset.h"
28#include "unicode/locid.h"
29#include "unicode/brkiter.h"
30#include "uset_imp.h"
31#include "ruleiter.h"
32#include "cmemory.h"
33#include "ucln_cmn.h"
34#include "util.h"
35#include "uvector.h"
36#include "uprops.h"
37#include "propname.h"
38#include "normalizer2impl.h"
39#include "uinvchar.h"
40#include "uprops.h"
41#include "charstr.h"
42#include "cstring.h"
43#include "mutex.h"
44#include "umutex.h"
45#include "uassert.h"
46#include "hash.h"
47
48U_NAMESPACE_USE
49
// Names of the special pseudo-property sets recognized by
// UnicodeSet::applyPropertyAlias() when no property value is given.
static const char ANY[]      = "ANY";      // [\u0000-\U0010FFFF]
static const char ASCII[]    = "ASCII";    // [\u0000-\u007F]
static const char ASSIGNED[] = "Assigned"; // [:^Cn:]

// Short alias of the Unicode Name property and its length in characters;
// used to turn \N{name} into a property/value pair.
#define NAME_PROP "na"
#define NAME_PROP_LENGTH 2
58
59// Cached sets ------------------------------------------------------------- ***
60
61U_CDECL_BEGIN
62static UBool U_CALLCONV uset_cleanup();
63
64static UnicodeSet *uni32Singleton;
65static icu::UInitOnce uni32InitOnce {};
66
67/**
68 * Cleanup function for UnicodeSet
69 */
70static UBool U_CALLCONV uset_cleanup() {
71 delete uni32Singleton;
72 uni32Singleton = nullptr;
73 uni32InitOnce.reset();
74 return true;
75}
76
77U_CDECL_END
78
79U_NAMESPACE_BEGIN
80
81namespace {
82
83// Cache some sets for other services -------------------------------------- ***
84void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
85 U_ASSERT(uni32Singleton == nullptr);
86 uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode);
87 if(uni32Singleton==nullptr) {
88 errorCode=U_MEMORY_ALLOCATION_ERROR;
89 } else {
90 uni32Singleton->freeze();
91 }
92 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
93}
94
95
96U_CFUNC UnicodeSet *
97uniset_getUnicode32Instance(UErrorCode &errorCode) {
98 umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
99 return uni32Singleton;
100}
101
// helper functions for matching of pattern syntax pieces ------------------ ***
// these functions correspond to the PERL_OPEN etc. pattern strings
// (which are not defined in this file)

// using these functions is not only faster than UnicodeString::compare() and
// caseCompare(), but it also makes UnicodeSet work for simple patterns when
// no Unicode properties data is available, i.e. when caseCompare() would fail
108
109static inline UBool
110isPerlOpen(const UnicodeString &pattern, int32_t pos) {
111 char16_t c;
112 return pattern.charAt(pos)==u'\\' && ((c=pattern.charAt(pos+1))==u'p' || c==u'P');
113}
114
115/*static inline UBool
116isPerlClose(const UnicodeString &pattern, int32_t pos) {
117 return pattern.charAt(pos)==u'}';
118}*/
119
120static inline UBool
121isNameOpen(const UnicodeString &pattern, int32_t pos) {
122 return pattern.charAt(pos)==u'\\' && pattern.charAt(pos+1)==u'N';
123}
124
125static inline UBool
126isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
127 return pattern.charAt(pos)==u'[' && pattern.charAt(pos+1)==u':';
128}
129
130/*static inline UBool
131isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
132 return pattern.charAt(pos)==u':' && pattern.charAt(pos+1)==u']';
133}*/
134
// Construction-debugging hook; expands to nothing in this file.
// TODO: the memory debugging provided inside uniset.cpp could be made
// available here, but it is probably obsolete given modern memory leak
// checker tools.
#define _dbgct(me)
139
140} // namespace
141
142//----------------------------------------------------------------
143// Constructors &c
144//----------------------------------------------------------------
145
146/**
147 * Constructs a set from the given pattern, optionally ignoring
148 * white space. See the class description for the syntax of the
149 * pattern language.
150 * @param pattern a string specifying what characters are in the set
151 */
152UnicodeSet::UnicodeSet(const UnicodeString& pattern,
153 UErrorCode& status) {
154 applyPattern(pattern, status);
155 _dbgct(this);
156}
157
158//----------------------------------------------------------------
159// Public API
160//----------------------------------------------------------------
161
162UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
163 UErrorCode& status) {
164 // Equivalent to
165 // return applyPattern(pattern, USET_IGNORE_SPACE, nullptr, status);
166 // but without dependency on closeOver().
167 ParsePosition pos(0);
168 applyPatternIgnoreSpace(pattern, pos, nullptr, status);
169 if (U_FAILURE(status)) return *this;
170
171 int32_t i = pos.getIndex();
172 // Skip over trailing whitespace
173 ICU_Utility::skipWhitespace(pattern, i, true);
174 if (i != pattern.length()) {
175 status = U_ILLEGAL_ARGUMENT_ERROR;
176 }
177 return *this;
178}
179
180void
181UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
182 ParsePosition& pos,
183 const SymbolTable* symbols,
184 UErrorCode& status) {
185 if (U_FAILURE(status)) {
186 return;
187 }
188 if (isFrozen()) {
189 status = U_NO_WRITE_PERMISSION;
190 return;
191 }
192 // Need to build the pattern in a temporary string because
193 // _applyPattern calls add() etc., which set pat to empty.
194 UnicodeString rebuiltPat;
195 RuleCharacterIterator chars(pattern, symbols, pos);
196 applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, 0, status);
197 if (U_FAILURE(status)) return;
198 if (chars.inVariable()) {
199 // syntaxError(chars, "Extra chars in variable value");
200 status = U_MALFORMED_SET;
201 return;
202 }
203 setPattern(rebuiltPat);
204}
205
206/**
207 * Return true if the given position, in the given pattern, appears
208 * to be the start of a UnicodeSet pattern.
209 */
210UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
211 return ((pos+1) < pattern.length() &&
212 pattern.charAt(pos) == (char16_t)91/*[*/) ||
213 resemblesPropertyPattern(pattern, pos);
214}
215
216//----------------------------------------------------------------
217// Implementation: Pattern parsing
218//----------------------------------------------------------------
219
220namespace {
221
222/**
223 * A small all-inline class to manage a UnicodeSet pointer. Add
224 * operator->() etc. as needed.
225 */
226class UnicodeSetPointer {
227 UnicodeSet* p;
228public:
229 inline UnicodeSetPointer() : p(0) {}
230 inline ~UnicodeSetPointer() { delete p; }
231 inline UnicodeSet* pointer() { return p; }
232 inline UBool allocate() {
233 if (p == 0) {
234 p = new UnicodeSet();
235 }
236 return p != 0;
237 }
238};
239
// Deepest nesting level of sets that the pattern parser accepts; a greater
// depth is reported as U_ILLEGAL_ARGUMENT_ERROR.
constexpr int32_t MAX_DEPTH = 100;
241
242} // namespace
243
244/**
245 * Parse the pattern from the given RuleCharacterIterator. The
246 * iterator is advanced over the parsed pattern.
247 * @param chars iterator over the pattern characters. Upon return
248 * it will be advanced to the first character after the parsed
249 * pattern, or the end of the iteration if all characters are
250 * parsed.
251 * @param symbols symbol table to use to parse and dereference
252 * variables, or null if none.
253 * @param rebuiltPat the pattern that was parsed, rebuilt or
254 * copied from the input pattern, as appropriate.
255 * @param options a bit mask of zero or more of the following:
256 * IGNORE_SPACE, CASE.
257 */
258void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
259 const SymbolTable* symbols,
260 UnicodeString& rebuiltPat,
261 uint32_t options,
262 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
263 int32_t depth,
264 UErrorCode& ec) {
265 if (U_FAILURE(ec)) return;
266 if (depth > MAX_DEPTH) {
267 ec = U_ILLEGAL_ARGUMENT_ERROR;
268 return;
269 }
270
271 // Syntax characters: [ ] ^ - & { }
272
273 // Recognized special forms for chars, sets: c-c s-s s&s
274
275 int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
276 RuleCharacterIterator::PARSE_ESCAPES;
277 if ((options & USET_IGNORE_SPACE) != 0) {
278 opts |= RuleCharacterIterator::SKIP_WHITESPACE;
279 }
280
281 UnicodeString patLocal, buf;
282 UBool usePat = false;
283 UnicodeSetPointer scratch;
284 RuleCharacterIterator::Pos backup;
285
286 // mode: 0=before [, 1=between [...], 2=after ]
287 // lastItem: 0=none, 1=char, 2=set
288 int8_t lastItem = 0, mode = 0;
289 UChar32 lastChar = 0;
290 char16_t op = 0;
291
292 UBool invert = false;
293
294 clear();
295
296 while (mode != 2 && !chars.atEnd()) {
297 U_ASSERT((lastItem == 0 && op == 0) ||
298 (lastItem == 1 && (op == 0 || op == u'-')) ||
299 (lastItem == 2 && (op == 0 || op == u'-' || op == u'&')));
300
301 UChar32 c = 0;
302 UBool literal = false;
303 UnicodeSet* nested = 0; // alias - do not delete
304
305 // -------- Check for property pattern
306
307 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
308 int8_t setMode = 0;
309 if (resemblesPropertyPattern(chars, opts)) {
310 setMode = 2;
311 }
312
313 // -------- Parse '[' of opening delimiter OR nested set.
314 // If there is a nested set, use `setMode' to define how
315 // the set should be parsed. If the '[' is part of the
316 // opening delimiter for this pattern, parse special
317 // strings "[", "[^", "[-", and "[^-". Check for stand-in
318 // characters representing a nested set in the symbol
319 // table.
320
321 else {
322 // Prepare to backup if necessary
323 chars.getPos(backup);
324 c = chars.next(opts, literal, ec);
325 if (U_FAILURE(ec)) return;
326
327 if (c == u'[' && !literal) {
328 if (mode == 1) {
329 chars.setPos(backup); // backup
330 setMode = 1;
331 } else {
332 // Handle opening '[' delimiter
333 mode = 1;
334 patLocal.append(u'[');
335 chars.getPos(backup); // prepare to backup
336 c = chars.next(opts, literal, ec);
337 if (U_FAILURE(ec)) return;
338 if (c == u'^' && !literal) {
339 invert = true;
340 patLocal.append(u'^');
341 chars.getPos(backup); // prepare to backup
342 c = chars.next(opts, literal, ec);
343 if (U_FAILURE(ec)) return;
344 }
345 // Fall through to handle special leading '-';
346 // otherwise restart loop for nested [], \p{}, etc.
347 if (c == u'-') {
348 literal = true;
349 // Fall through to handle literal '-' below
350 } else {
351 chars.setPos(backup); // backup
352 continue;
353 }
354 }
355 } else if (symbols != 0) {
356 const UnicodeFunctor *m = symbols->lookupMatcher(c);
357 if (m != 0) {
358 const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
359 if (ms == nullptr) {
360 ec = U_MALFORMED_SET;
361 return;
362 }
363 // casting away const, but `nested' won't be modified
364 // (important not to modify stored set)
365 nested = const_cast<UnicodeSet*>(ms);
366 setMode = 3;
367 }
368 }
369 }
370
371 // -------- Handle a nested set. This either is inline in
372 // the pattern or represented by a stand-in that has
373 // previously been parsed and was looked up in the symbol
374 // table.
375
376 if (setMode != 0) {
377 if (lastItem == 1) {
378 if (op != 0) {
379 // syntaxError(chars, "Char expected after operator");
380 ec = U_MALFORMED_SET;
381 return;
382 }
383 add(lastChar, lastChar);
384 _appendToPat(patLocal, lastChar, false);
385 lastItem = 0;
386 op = 0;
387 }
388
389 if (op == u'-' || op == u'&') {
390 patLocal.append(op);
391 }
392
393 if (nested == 0) {
394 // lazy allocation
395 if (!scratch.allocate()) {
396 ec = U_MEMORY_ALLOCATION_ERROR;
397 return;
398 }
399 nested = scratch.pointer();
400 }
401 switch (setMode) {
402 case 1:
403 nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);
404 break;
405 case 2:
406 chars.skipIgnored(opts);
407 nested->applyPropertyPattern(chars, patLocal, ec);
408 if (U_FAILURE(ec)) return;
409 break;
410 case 3: // `nested' already parsed
411 nested->_toPattern(patLocal, false);
412 break;
413 }
414
415 usePat = true;
416
417 if (mode == 0) {
418 // Entire pattern is a category; leave parse loop
419 *this = *nested;
420 mode = 2;
421 break;
422 }
423
424 switch (op) {
425 case u'-':
426 removeAll(*nested);
427 break;
428 case u'&':
429 retainAll(*nested);
430 break;
431 case 0:
432 addAll(*nested);
433 break;
434 }
435
436 op = 0;
437 lastItem = 2;
438
439 continue;
440 }
441
442 if (mode == 0) {
443 // syntaxError(chars, "Missing '['");
444 ec = U_MALFORMED_SET;
445 return;
446 }
447
448 // -------- Parse special (syntax) characters. If the
449 // current character is not special, or if it is escaped,
450 // then fall through and handle it below.
451
452 if (!literal) {
453 switch (c) {
454 case u']':
455 if (lastItem == 1) {
456 add(lastChar, lastChar);
457 _appendToPat(patLocal, lastChar, false);
458 }
459 // Treat final trailing '-' as a literal
460 if (op == u'-') {
461 add(op, op);
462 patLocal.append(op);
463 } else if (op == u'&') {
464 // syntaxError(chars, "Trailing '&'");
465 ec = U_MALFORMED_SET;
466 return;
467 }
468 patLocal.append(u']');
469 mode = 2;
470 continue;
471 case u'-':
472 if (op == 0) {
473 if (lastItem != 0) {
474 op = (char16_t) c;
475 continue;
476 } else {
477 // Treat final trailing '-' as a literal
478 add(c, c);
479 c = chars.next(opts, literal, ec);
480 if (U_FAILURE(ec)) return;
481 if (c == u']' && !literal) {
482 patLocal.append(u"-]", 2);
483 mode = 2;
484 continue;
485 }
486 }
487 }
488 // syntaxError(chars, "'-' not after char or set");
489 ec = U_MALFORMED_SET;
490 return;
491 case u'&':
492 if (lastItem == 2 && op == 0) {
493 op = (char16_t) c;
494 continue;
495 }
496 // syntaxError(chars, "'&' not after set");
497 ec = U_MALFORMED_SET;
498 return;
499 case u'^':
500 // syntaxError(chars, "'^' not after '['");
501 ec = U_MALFORMED_SET;
502 return;
503 case u'{':
504 if (op != 0) {
505 // syntaxError(chars, "Missing operand after operator");
506 ec = U_MALFORMED_SET;
507 return;
508 }
509 if (lastItem == 1) {
510 add(lastChar, lastChar);
511 _appendToPat(patLocal, lastChar, false);
512 }
513 lastItem = 0;
514 buf.truncate(0);
515 {
516 UBool ok = false;
517 while (!chars.atEnd()) {
518 c = chars.next(opts, literal, ec);
519 if (U_FAILURE(ec)) return;
520 if (c == u'}' && !literal) {
521 ok = true;
522 break;
523 }
524 buf.append(c);
525 }
526 if (!ok) {
527 // syntaxError(chars, "Invalid multicharacter string");
528 ec = U_MALFORMED_SET;
529 return;
530 }
531 }
532 // We have new string. Add it to set and continue;
533 // we don't need to drop through to the further
534 // processing
535 add(buf);
536 patLocal.append(u'{');
537 _appendToPat(patLocal, buf, false);
538 patLocal.append(u'}');
539 continue;
540 case SymbolTable::SYMBOL_REF:
541 // symbols nosymbols
542 // [a-$] error error (ambiguous)
543 // [a$] anchor anchor
544 // [a-$x] var "x"* literal '$'
545 // [a-$.] error literal '$'
546 // *We won't get here in the case of var "x"
547 {
548 chars.getPos(backup);
549 c = chars.next(opts, literal, ec);
550 if (U_FAILURE(ec)) return;
551 UBool anchor = (c == u']' && !literal);
552 if (symbols == 0 && !anchor) {
553 c = SymbolTable::SYMBOL_REF;
554 chars.setPos(backup);
555 break; // literal '$'
556 }
557 if (anchor && op == 0) {
558 if (lastItem == 1) {
559 add(lastChar, lastChar);
560 _appendToPat(patLocal, lastChar, false);
561 }
562 add(U_ETHER);
563 usePat = true;
564 patLocal.append((char16_t) SymbolTable::SYMBOL_REF);
565 patLocal.append(u']');
566 mode = 2;
567 continue;
568 }
569 // syntaxError(chars, "Unquoted '$'");
570 ec = U_MALFORMED_SET;
571 return;
572 }
573 default:
574 break;
575 }
576 }
577
578 // -------- Parse literal characters. This includes both
579 // escaped chars ("\u4E01") and non-syntax characters
580 // ("a").
581
582 switch (lastItem) {
583 case 0:
584 lastItem = 1;
585 lastChar = c;
586 break;
587 case 1:
588 if (op == u'-') {
589 if (lastChar >= c) {
590 // Don't allow redundant (a-a) or empty (b-a) ranges;
591 // these are most likely typos.
592 // syntaxError(chars, "Invalid range");
593 ec = U_MALFORMED_SET;
594 return;
595 }
596 add(lastChar, c);
597 _appendToPat(patLocal, lastChar, false);
598 patLocal.append(op);
599 _appendToPat(patLocal, c, false);
600 lastItem = 0;
601 op = 0;
602 } else {
603 add(lastChar, lastChar);
604 _appendToPat(patLocal, lastChar, false);
605 lastChar = c;
606 }
607 break;
608 case 2:
609 if (op != 0) {
610 // syntaxError(chars, "Set expected after operator");
611 ec = U_MALFORMED_SET;
612 return;
613 }
614 lastChar = c;
615 lastItem = 1;
616 break;
617 }
618 }
619
620 if (mode != 2) {
621 // syntaxError(chars, "Missing ']'");
622 ec = U_MALFORMED_SET;
623 return;
624 }
625
626 chars.skipIgnored(opts);
627
628 /**
629 * Handle global flags (invert, case insensitivity). If this
630 * pattern should be compiled case-insensitive, then we need
631 * to close over case BEFORE COMPLEMENTING. This makes
632 * patterns like /[^abc]/i work.
633 */
634 if ((options & USET_CASE_MASK) != 0) {
635 (this->*caseClosure)(options);
636 }
637 if (invert) {
638 complement().removeAllStrings(); // code point complement
639 }
640
641 // Use the rebuilt pattern (patLocal) only if necessary. Prefer the
642 // generated pattern.
643 if (usePat) {
644 rebuiltPat.append(patLocal);
645 } else {
646 _generatePattern(rebuiltPat, false);
647 }
648 if (isBogus() && U_SUCCESS(ec)) {
649 // We likely ran out of memory. AHHH!
650 ec = U_MEMORY_ALLOCATION_ERROR;
651 }
652}
653
654//----------------------------------------------------------------
655// Property set implementation
656//----------------------------------------------------------------
657
658namespace {
659
660static UBool numericValueFilter(UChar32 ch, void* context) {
661 return u_getNumericValue(ch) == *(double*)context;
662}
663
664static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
665 int32_t value = *(int32_t*)context;
666 return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
667}
668
669static UBool versionFilter(UChar32 ch, void* context) {
670 static const UVersionInfo none = { 0, 0, 0, 0 };
671 UVersionInfo v;
672 u_charAge(ch, v);
673 UVersionInfo* version = (UVersionInfo*)context;
674 return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
675}
676
677typedef struct {
678 UProperty prop;
679 int32_t value;
680} IntPropertyContext;
681
682static UBool intPropertyFilter(UChar32 ch, void* context) {
683 IntPropertyContext* c = (IntPropertyContext*)context;
684 return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
685}
686
687static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
688 return uscript_hasScript(ch, *(UScriptCode*)context);
689}
690
691} // namespace
692
693/**
694 * Generic filter-based scanning code for UCD property UnicodeSets.
695 */
696void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
697 void* context,
698 const UnicodeSet* inclusions,
699 UErrorCode &status) {
700 if (U_FAILURE(status)) return;
701
702 // Logically, walk through all Unicode characters, noting the start
703 // and end of each range for which filter.contain(c) is
704 // true. Add each range to a set.
705 //
706 // To improve performance, use an inclusions set which
707 // encodes information about character ranges that are known
708 // to have identical properties.
709 // inclusions contains the first characters of
710 // same-value ranges for the given property.
711
712 clear();
713
714 UChar32 startHasProperty = -1;
715 int32_t limitRange = inclusions->getRangeCount();
716
717 for (int j=0; j<limitRange; ++j) {
718 // get current range
719 UChar32 start = inclusions->getRangeStart(j);
720 UChar32 end = inclusions->getRangeEnd(j);
721
722 // for all the code points in the range, process
723 for (UChar32 ch = start; ch <= end; ++ch) {
724 // only add to this UnicodeSet on inflection points --
725 // where the hasProperty value changes to false
726 if ((*filter)(ch, context)) {
727 if (startHasProperty < 0) {
728 startHasProperty = ch;
729 }
730 } else if (startHasProperty >= 0) {
731 add(startHasProperty, ch-1);
732 startHasProperty = -1;
733 }
734 }
735 }
736 if (startHasProperty >= 0) {
737 add((UChar32)startHasProperty, (UChar32)0x10FFFF);
738 }
739 if (isBogus() && U_SUCCESS(status)) {
740 // We likely ran out of memory. AHHH!
741 status = U_MEMORY_ALLOCATION_ERROR;
742 }
743}
744
745namespace {
746
/**
 * Copies src to dst while normalizing spaces: leading spaces are dropped,
 * runs of spaces collapse to one, and a single trailing space is removed.
 * The result is always NUL-terminated on success.
 *
 * @param dst         output buffer
 * @param src         NUL-terminated input
 * @param dstCapacity size of dst in chars, including room for the NUL
 * @return true on success; false if dstCapacity is not positive or the
 *         buffer fills up before src is consumed (dst is then not
 *         terminated)
 */
static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
    /* Note: we use ' ' in compiler code page */
    if (dstCapacity <= 0) {
        // No room even for the terminating zero; never write to dst.
        return false;
    }
    const int32_t maxChars = dstCapacity - 1; /* make room for term. zero */
    int32_t j = 0;
    char ch;
    while ((ch = *src++) != 0) {
        if (ch == ' ' && (j == 0 || dst[j-1] == ' ')) {
            continue;  // skip leading and repeated spaces
        }
        if (j >= maxChars) return false;
        dst[j++] = ch;
    }
    if (j > 0 && dst[j-1] == ' ') --j;  // drop one trailing space
    dst[j] = 0;
    return true;
}
763
764} // namespace
765
766//----------------------------------------------------------------
767// Property set API
768//----------------------------------------------------------------
769
// Sets the given error code to U_ILLEGAL_ARGUMENT_ERROR and returns *this
// from the enclosing member function.
#define FAIL(ec) UPRV_BLOCK_MACRO_BEGIN { \
    (ec) = U_ILLEGAL_ARGUMENT_ERROR; \
    return *this; \
} UPRV_BLOCK_MACRO_END
774
775UnicodeSet&
776UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
777 if (U_FAILURE(ec) || isFrozen()) { return *this; }
778 if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
779 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
780 applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);
781 } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
782 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
783 UScriptCode script = (UScriptCode)value;
784 applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
785 } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) {
786 if (value == 0 || value == 1) {
787 const USet *set = u_getBinaryPropertySet(prop, &ec);
788 if (U_FAILURE(ec)) { return *this; }
789 copyFrom(*UnicodeSet::fromUSet(set), true);
790 if (value == 0) {
791 complement().removeAllStrings(); // code point complement
792 }
793 } else {
794 clear();
795 }
796 } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
797 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
798 IntPropertyContext c = {prop, value};
799 applyFilter(intPropertyFilter, &c, inclusions, ec);
800 } else {
801 ec = U_ILLEGAL_ARGUMENT_ERROR;
802 }
803 return *this;
804}
805
806UnicodeSet&
807UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
808 const UnicodeString& value,
809 UErrorCode& ec) {
810 if (U_FAILURE(ec) || isFrozen()) return *this;
811
812 // prop and value used to be converted to char * using the default
813 // converter instead of the invariant conversion.
814 // This should not be necessary because all Unicode property and value
815 // names use only invariant characters.
816 // If there are any variant characters, then we won't find them anyway.
817 // Checking first avoids assertion failures in the conversion.
818 if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
819 !uprv_isInvariantUString(value.getBuffer(), value.length())
820 ) {
821 FAIL(ec);
822 }
823 CharString pname, vname;
824 pname.appendInvariantChars(prop, ec);
825 vname.appendInvariantChars(value, ec);
826 if (U_FAILURE(ec)) return *this;
827
828 UProperty p;
829 int32_t v;
830 UBool invert = false;
831
832 if (value.length() > 0) {
833 p = u_getPropertyEnum(pname.data());
834 if (p == UCHAR_INVALID_CODE) FAIL(ec);
835
836 // Treat gc as gcm
837 if (p == UCHAR_GENERAL_CATEGORY) {
838 p = UCHAR_GENERAL_CATEGORY_MASK;
839 }
840
841 if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
842 (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
843 (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
844 v = u_getPropertyValueEnum(p, vname.data());
845 if (v == UCHAR_INVALID_CODE) {
846 // Handle numeric CCC
847 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
848 p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
849 p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
850 char* end;
851 double val = uprv_strtod(vname.data(), &end);
852 // Anything between 0 and 255 is valid even if unused.
853 // Cast double->int only after range check.
854 // We catch NaN here because comparing it with both 0 and 255 will be false
855 // (as are all comparisons with NaN).
856 if (*end != 0 || !(0 <= val && val <= 255) ||
857 (v = (int32_t)val) != val) {
858 // non-integral value or outside 0..255, or trailing junk
859 FAIL(ec);
860 }
861 } else {
862 FAIL(ec);
863 }
864 }
865 }
866
867 else {
868
869 switch (p) {
870 case UCHAR_NUMERIC_VALUE:
871 {
872 char* end;
873 double val = uprv_strtod(vname.data(), &end);
874 if (*end != 0) {
875 FAIL(ec);
876 }
877 applyFilter(numericValueFilter, &val,
878 CharacterProperties::getInclusionsForProperty(p, ec), ec);
879 return *this;
880 }
881 case UCHAR_NAME:
882 {
883 // Must munge name, since u_charFromName() does not do
884 // 'loose' matching.
885 char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
886 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
887 UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
888 if (U_SUCCESS(ec)) {
889 clear();
890 add(ch);
891 return *this;
892 } else {
893 FAIL(ec);
894 }
895 }
896 case UCHAR_UNICODE_1_NAME:
897 // ICU 49 deprecates the Unicode_1_Name property APIs.
898 FAIL(ec);
899 case UCHAR_AGE:
900 {
901 // Must munge name, since u_versionFromString() does not do
902 // 'loose' matching.
903 char buf[128];
904 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
905 UVersionInfo version;
906 u_versionFromString(version, buf);
907 applyFilter(versionFilter, &version,
908 CharacterProperties::getInclusionsForProperty(p, ec), ec);
909 return *this;
910 }
911 case UCHAR_SCRIPT_EXTENSIONS:
912 v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
913 if (v == UCHAR_INVALID_CODE) {
914 FAIL(ec);
915 }
916 // fall through to calling applyIntPropertyValue()
917 break;
918 default:
919 // p is a non-binary, non-enumerated property that we
920 // don't support (yet).
921 FAIL(ec);
922 }
923 }
924 }
925
926 else {
927 // value is empty. Interpret as General Category, Script, or
928 // Binary property.
929 p = UCHAR_GENERAL_CATEGORY_MASK;
930 v = u_getPropertyValueEnum(p, pname.data());
931 if (v == UCHAR_INVALID_CODE) {
932 p = UCHAR_SCRIPT;
933 v = u_getPropertyValueEnum(p, pname.data());
934 if (v == UCHAR_INVALID_CODE) {
935 p = u_getPropertyEnum(pname.data());
936 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
937 v = 1;
938 } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
939 set(MIN_VALUE, MAX_VALUE);
940 return *this;
941 } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
942 set(0, 0x7F);
943 return *this;
944 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
945 // [:Assigned:]=[:^Cn:]
946 p = UCHAR_GENERAL_CATEGORY_MASK;
947 v = U_GC_CN_MASK;
948 invert = true;
949 } else {
950 FAIL(ec);
951 }
952 }
953 }
954 }
955
956 applyIntPropertyValue(p, v, ec);
957 if(invert) {
958 complement().removeAllStrings(); // code point complement
959 }
960
961 if (isBogus() && U_SUCCESS(ec)) {
962 // We likely ran out of memory. AHHH!
963 ec = U_MEMORY_ALLOCATION_ERROR;
964 }
965 return *this;
966}
967
968//----------------------------------------------------------------
969// Property set patterns
970//----------------------------------------------------------------
971
972/**
973 * Return true if the given position, in the given pattern, appears
974 * to be the start of a property set pattern.
975 */
976UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
977 int32_t pos) {
978 // Patterns are at least 5 characters long
979 if ((pos+5) > pattern.length()) {
980 return false;
981 }
982
983 // Look for an opening [:, [:^, \p, or \P
984 return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
985}
986
987/**
988 * Return true if the given iterator appears to point at a
989 * property pattern. Regardless of the result, return with the
990 * iterator unchanged.
991 * @param chars iterator over the pattern characters. Upon return
992 * it will be unchanged.
993 * @param iterOpts RuleCharacterIterator options
994 */
995UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
996 int32_t iterOpts) {
997 // NOTE: literal will always be false, because we don't parse escapes.
998 UBool result = false, literal;
999 UErrorCode ec = U_ZERO_ERROR;
1000 iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1001 RuleCharacterIterator::Pos pos;
1002 chars.getPos(pos);
1003 UChar32 c = chars.next(iterOpts, literal, ec);
1004 if (c == u'[' || c == u'\\') {
1005 UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1006 literal, ec);
1007 result = (c == u'[') ? (d == u':') :
1008 (d == u'N' || d == u'p' || d == u'P');
1009 }
1010 chars.setPos(pos);
1011 return result && U_SUCCESS(ec);
1012}
1013
1014/**
1015 * Parse the given property pattern at the given parse position.
1016 */
1017UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1018 ParsePosition& ppos,
1019 UErrorCode &ec) {
1020 int32_t pos = ppos.getIndex();
1021
1022 UBool posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1023 UBool isName = false; // true for \N{pat}, o/w false
1024 UBool invert = false;
1025
1026 if (U_FAILURE(ec)) return *this;
1027
1028 // Minimum length is 5 characters, e.g. \p{L}
1029 if ((pos+5) > pattern.length()) {
1030 FAIL(ec);
1031 }
1032
1033 // On entry, ppos should point to one of the following locations:
1034 // Look for an opening [:, [:^, \p, or \P
1035 if (isPOSIXOpen(pattern, pos)) {
1036 posix = true;
1037 pos += 2;
1038 pos = ICU_Utility::skipWhitespace(pattern, pos);
1039 if (pos < pattern.length() && pattern.charAt(pos) == u'^') {
1040 ++pos;
1041 invert = true;
1042 }
1043 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
1044 char16_t c = pattern.charAt(pos+1);
1045 invert = (c == u'P');
1046 isName = (c == u'N');
1047 pos += 2;
1048 pos = ICU_Utility::skipWhitespace(pattern, pos);
1049 if (pos == pattern.length() || pattern.charAt(pos++) != u'{') {
1050 // Syntax error; "\p" or "\P" not followed by "{"
1051 FAIL(ec);
1052 }
1053 } else {
1054 // Open delimiter not seen
1055 FAIL(ec);
1056 }
1057
1058 // Look for the matching close delimiter, either :] or }
1059 int32_t close;
1060 if (posix) {
1061 close = pattern.indexOf(u":]", 2, pos);
1062 } else {
1063 close = pattern.indexOf(u'}', pos);
1064 }
1065 if (close < 0) {
1066 // Syntax error; close delimiter missing
1067 FAIL(ec);
1068 }
1069
1070 // Look for an '=' sign. If this is present, we will parse a
1071 // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1072 // pattern.
1073 int32_t equals = pattern.indexOf(u'=', pos);
1074 UnicodeString propName, valueName;
1075 if (equals >= 0 && equals < close && !isName) {
1076 // Equals seen; parse medium/long pattern
1077 pattern.extractBetween(pos, equals, propName);
1078 pattern.extractBetween(equals+1, close, valueName);
1079 }
1080
1081 else {
1082 // Handle case where no '=' is seen, and \N{}
1083 pattern.extractBetween(pos, close, propName);
1084
1085 // Handle \N{name}
1086 if (isName) {
1087 // This is a little inefficient since it means we have to
1088 // parse NAME_PROP back to UCHAR_NAME even though we already
1089 // know it's UCHAR_NAME. If we refactor the API to
1090 // support args of (UProperty, char*) then we can remove
1091 // NAME_PROP and make this a little more efficient.
1092 valueName = propName;
1093 propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
1094 }
1095 }
1096
1097 applyPropertyAlias(propName, valueName, ec);
1098
1099 if (U_SUCCESS(ec)) {
1100 if (invert) {
1101 complement().removeAllStrings(); // code point complement
1102 }
1103
1104 // Move to the limit position after the close delimiter if the
1105 // parse succeeded.
1106 ppos.setIndex(close + (posix ? 2 : 1));
1107 }
1108
1109 return *this;
1110}
1111
1112/**
1113 * Parse a property pattern.
1114 * @param chars iterator over the pattern characters. Upon return
1115 * it will be advanced to the first character after the parsed
1116 * pattern, or the end of the iteration if all characters are
1117 * parsed.
1118 * @param rebuiltPat the pattern that was parsed, rebuilt or
1119 * copied from the input pattern, as appropriate.
1120 */
1121void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1122 UnicodeString& rebuiltPat,
1123 UErrorCode& ec) {
1124 if (U_FAILURE(ec)) return;
1125 UnicodeString pattern;
1126 chars.lookahead(pattern);
1127 ParsePosition pos(0);
1128 applyPropertyPattern(pattern, pos, ec);
1129 if (U_FAILURE(ec)) return;
1130 if (pos.getIndex() == 0) {
1131 // syntaxError(chars, "Invalid property pattern");
1132 ec = U_MALFORMED_SET;
1133 return;
1134 }
1135 chars.jumpahead(pos.getIndex());
1136 rebuiltPat.append(pattern, 0, pos.getIndex());
1137}
1138
1139U_NAMESPACE_END
1140