1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 1999-2014, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: uniset_props.cpp
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2004aug25
16* created by: Markus W. Scherer
17*
18* Character property dependent functions moved here from uniset.cpp
19*/
20
21#include "unicode/utypes.h"
22#include "unicode/uniset.h"
23#include "unicode/parsepos.h"
24#include "unicode/uchar.h"
25#include "unicode/uscript.h"
26#include "unicode/symtable.h"
27#include "unicode/uset.h"
28#include "unicode/locid.h"
29#include "unicode/brkiter.h"
30#include "uset_imp.h"
31#include "ruleiter.h"
32#include "cmemory.h"
33#include "ucln_cmn.h"
34#include "util.h"
35#include "uvector.h"
36#include "uprops.h"
37#include "propname.h"
38#include "normalizer2impl.h"
39#include "uinvchar.h"
40#include "uprops.h"
41#include "charstr.h"
42#include "cstring.h"
43#include "mutex.h"
44#include "umutex.h"
45#include "uassert.h"
46#include "hash.h"
47
48U_NAMESPACE_USE
49
// UChar constants for the characters that have meaning in set pattern syntax.
// The values are spelled as hexadecimal code units rather than character
// literals for EBCDIC compatibility.
// They are #defines to reduce private static exports and memory access time.
#define SET_OPEN        ((UChar)0x005B) /*[*/
#define SET_CLOSE       ((UChar)0x005D) /*]*/
#define HYPHEN          ((UChar)0x002D) /*-*/
#define COMPLEMENT      ((UChar)0x005E) /*^*/
#define COLON           ((UChar)0x003A) /*:*/
#define BACKSLASH       ((UChar)0x005C) /*\*/
#define INTERSECTION    ((UChar)0x0026) /*&*/
#define UPPER_U         ((UChar)0x0055) /*U*/
#define LOWER_U         ((UChar)0x0075) /*u*/
#define OPEN_BRACE      ((UChar)0x007B) /*{*/
#define CLOSE_BRACE     ((UChar)0x007D) /*}*/
#define UPPER_P         ((UChar)0x0050) /*P*/
#define LOWER_P         ((UChar)0x0070) /*p*/
#define UPPER_N         ((UChar)0x004E) /*N*/
#define EQUALS          ((UChar)0x003D) /*=*/
67
68//static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:"
69static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]"
70//static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p"
71//static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}"
72//static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N"
73static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
74
// Names of the special property sets, each with the set it stands for.
static constexpr char ANY[]      = "ANY";      // [\u0000-\U0010FFFF]
static constexpr char ASCII[]    = "ASCII";    // [\u0000-\u007F]
static constexpr char ASSIGNED[] = "Assigned"; // [:^Cn:]

// Alias of the Unicode name property, and its length in chars
// (excluding the terminating NUL).
#define NAME_PROP "na"
#define NAME_PROP_LENGTH 2
83
84/**
85 * Delimiter string used in patterns to close a category reference:
86 * ":]". Example: "[:Lu:]".
87 */
88//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
89
90// Cached sets ------------------------------------------------------------- ***
91
92U_CDECL_BEGIN
93static UBool U_CALLCONV uset_cleanup();
94
95static UnicodeSet *uni32Singleton;
96static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER;
97
98/**
99 * Cleanup function for UnicodeSet
100 */
101static UBool U_CALLCONV uset_cleanup(void) {
102 delete uni32Singleton;
103 uni32Singleton = NULL;
104 uni32InitOnce.reset();
105 return TRUE;
106}
107
108U_CDECL_END
109
110U_NAMESPACE_BEGIN
111
112namespace {
113
114// Cache some sets for other services -------------------------------------- ***
115void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
116 U_ASSERT(uni32Singleton == NULL);
117 uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode);
118 if(uni32Singleton==NULL) {
119 errorCode=U_MEMORY_ALLOCATION_ERROR;
120 } else {
121 uni32Singleton->freeze();
122 }
123 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
124}
125
126
127U_CFUNC UnicodeSet *
128uniset_getUnicode32Instance(UErrorCode &errorCode) {
129 umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
130 return uni32Singleton;
131}
132
133// helper functions for matching of pattern syntax pieces ------------------ ***
134// these functions are parallel to the PERL_OPEN etc. strings above
135
136// using these functions is not only faster than UnicodeString::compare() and
137// caseCompare(), but they also make UnicodeSet work for simple patterns when
138// no Unicode properties data is available - when caseCompare() fails
139
140static inline UBool
141isPerlOpen(const UnicodeString &pattern, int32_t pos) {
142 UChar c;
143 return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P);
144}
145
146/*static inline UBool
147isPerlClose(const UnicodeString &pattern, int32_t pos) {
148 return pattern.charAt(pos)==CLOSE_BRACE;
149}*/
150
151static inline UBool
152isNameOpen(const UnicodeString &pattern, int32_t pos) {
153 return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;
154}
155
156static inline UBool
157isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
158 return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;
159}
160
161/*static inline UBool
162isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
163 return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
164}*/
165
// TODO: memory debugging is provided inside uniset.cpp.
// It could be made available here, but it is probably obsolete given the use
// of modern memory leak checker tools.
// In this file the macro expands to nothing.
#define _dbgct(me)
170
171} // namespace
172
173//----------------------------------------------------------------
174// Constructors &c
175//----------------------------------------------------------------
176
177/**
178 * Constructs a set from the given pattern, optionally ignoring
179 * white space. See the class description for the syntax of the
180 * pattern language.
181 * @param pattern a string specifying what characters are in the set
182 */
183UnicodeSet::UnicodeSet(const UnicodeString& pattern,
184 UErrorCode& status) {
185 applyPattern(pattern, status);
186 _dbgct(this);
187}
188
189//----------------------------------------------------------------
190// Public API
191//----------------------------------------------------------------
192
193UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
194 UErrorCode& status) {
195 // Equivalent to
196 // return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
197 // but without dependency on closeOver().
198 ParsePosition pos(0);
199 applyPatternIgnoreSpace(pattern, pos, NULL, status);
200 if (U_FAILURE(status)) return *this;
201
202 int32_t i = pos.getIndex();
203 // Skip over trailing whitespace
204 ICU_Utility::skipWhitespace(pattern, i, TRUE);
205 if (i != pattern.length()) {
206 status = U_ILLEGAL_ARGUMENT_ERROR;
207 }
208 return *this;
209}
210
211void
212UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
213 ParsePosition& pos,
214 const SymbolTable* symbols,
215 UErrorCode& status) {
216 if (U_FAILURE(status)) {
217 return;
218 }
219 if (isFrozen()) {
220 status = U_NO_WRITE_PERMISSION;
221 return;
222 }
223 // Need to build the pattern in a temporary string because
224 // _applyPattern calls add() etc., which set pat to empty.
225 UnicodeString rebuiltPat;
226 RuleCharacterIterator chars(pattern, symbols, pos);
227 applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, 0, status);
228 if (U_FAILURE(status)) return;
229 if (chars.inVariable()) {
230 // syntaxError(chars, "Extra chars in variable value");
231 status = U_MALFORMED_SET;
232 return;
233 }
234 setPattern(rebuiltPat);
235}
236
237/**
238 * Return true if the given position, in the given pattern, appears
239 * to be the start of a UnicodeSet pattern.
240 */
241UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
242 return ((pos+1) < pattern.length() &&
243 pattern.charAt(pos) == (UChar)91/*[*/) ||
244 resemblesPropertyPattern(pattern, pos);
245}
246
247//----------------------------------------------------------------
248// Implementation: Pattern parsing
249//----------------------------------------------------------------
250
251namespace {
252
253/**
254 * A small all-inline class to manage a UnicodeSet pointer. Add
255 * operator->() etc. as needed.
256 */
257class UnicodeSetPointer {
258 UnicodeSet* p;
259public:
260 inline UnicodeSetPointer() : p(0) {}
261 inline ~UnicodeSetPointer() { delete p; }
262 inline UnicodeSet* pointer() { return p; }
263 inline UBool allocate() {
264 if (p == 0) {
265 p = new UnicodeSet();
266 }
267 return p != 0;
268 }
269};
270
271constexpr int32_t MAX_DEPTH = 100;
272
273} // namespace
274
275/**
276 * Parse the pattern from the given RuleCharacterIterator. The
277 * iterator is advanced over the parsed pattern.
278 * @param chars iterator over the pattern characters. Upon return
279 * it will be advanced to the first character after the parsed
280 * pattern, or the end of the iteration if all characters are
281 * parsed.
282 * @param symbols symbol table to use to parse and dereference
283 * variables, or null if none.
284 * @param rebuiltPat the pattern that was parsed, rebuilt or
285 * copied from the input pattern, as appropriate.
286 * @param options a bit mask of zero or more of the following:
287 * IGNORE_SPACE, CASE.
288 */
289void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
290 const SymbolTable* symbols,
291 UnicodeString& rebuiltPat,
292 uint32_t options,
293 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
294 int32_t depth,
295 UErrorCode& ec) {
296 if (U_FAILURE(ec)) return;
297 if (depth > MAX_DEPTH) {
298 ec = U_ILLEGAL_ARGUMENT_ERROR;
299 return;
300 }
301
302 // Syntax characters: [ ] ^ - & { }
303
304 // Recognized special forms for chars, sets: c-c s-s s&s
305
306 int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
307 RuleCharacterIterator::PARSE_ESCAPES;
308 if ((options & USET_IGNORE_SPACE) != 0) {
309 opts |= RuleCharacterIterator::SKIP_WHITESPACE;
310 }
311
312 UnicodeString patLocal, buf;
313 UBool usePat = FALSE;
314 UnicodeSetPointer scratch;
315 RuleCharacterIterator::Pos backup;
316
317 // mode: 0=before [, 1=between [...], 2=after ]
318 // lastItem: 0=none, 1=char, 2=set
319 int8_t lastItem = 0, mode = 0;
320 UChar32 lastChar = 0;
321 UChar op = 0;
322
323 UBool invert = FALSE;
324
325 clear();
326
327 while (mode != 2 && !chars.atEnd()) {
328 U_ASSERT((lastItem == 0 && op == 0) ||
329 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
330 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
331 op == INTERSECTION /*'&'*/)));
332
333 UChar32 c = 0;
334 UBool literal = FALSE;
335 UnicodeSet* nested = 0; // alias - do not delete
336
337 // -------- Check for property pattern
338
339 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
340 int8_t setMode = 0;
341 if (resemblesPropertyPattern(chars, opts)) {
342 setMode = 2;
343 }
344
345 // -------- Parse '[' of opening delimiter OR nested set.
346 // If there is a nested set, use `setMode' to define how
347 // the set should be parsed. If the '[' is part of the
348 // opening delimiter for this pattern, parse special
349 // strings "[", "[^", "[-", and "[^-". Check for stand-in
350 // characters representing a nested set in the symbol
351 // table.
352
353 else {
354 // Prepare to backup if necessary
355 chars.getPos(backup);
356 c = chars.next(opts, literal, ec);
357 if (U_FAILURE(ec)) return;
358
359 if (c == 0x5B /*'['*/ && !literal) {
360 if (mode == 1) {
361 chars.setPos(backup); // backup
362 setMode = 1;
363 } else {
364 // Handle opening '[' delimiter
365 mode = 1;
366 patLocal.append((UChar) 0x5B /*'['*/);
367 chars.getPos(backup); // prepare to backup
368 c = chars.next(opts, literal, ec);
369 if (U_FAILURE(ec)) return;
370 if (c == 0x5E /*'^'*/ && !literal) {
371 invert = TRUE;
372 patLocal.append((UChar) 0x5E /*'^'*/);
373 chars.getPos(backup); // prepare to backup
374 c = chars.next(opts, literal, ec);
375 if (U_FAILURE(ec)) return;
376 }
377 // Fall through to handle special leading '-';
378 // otherwise restart loop for nested [], \p{}, etc.
379 if (c == HYPHEN /*'-'*/) {
380 literal = TRUE;
381 // Fall through to handle literal '-' below
382 } else {
383 chars.setPos(backup); // backup
384 continue;
385 }
386 }
387 } else if (symbols != 0) {
388 const UnicodeFunctor *m = symbols->lookupMatcher(c);
389 if (m != 0) {
390 const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
391 if (ms == NULL) {
392 ec = U_MALFORMED_SET;
393 return;
394 }
395 // casting away const, but `nested' won't be modified
396 // (important not to modify stored set)
397 nested = const_cast<UnicodeSet*>(ms);
398 setMode = 3;
399 }
400 }
401 }
402
403 // -------- Handle a nested set. This either is inline in
404 // the pattern or represented by a stand-in that has
405 // previously been parsed and was looked up in the symbol
406 // table.
407
408 if (setMode != 0) {
409 if (lastItem == 1) {
410 if (op != 0) {
411 // syntaxError(chars, "Char expected after operator");
412 ec = U_MALFORMED_SET;
413 return;
414 }
415 add(lastChar, lastChar);
416 _appendToPat(patLocal, lastChar, FALSE);
417 lastItem = 0;
418 op = 0;
419 }
420
421 if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
422 patLocal.append(op);
423 }
424
425 if (nested == 0) {
426 // lazy allocation
427 if (!scratch.allocate()) {
428 ec = U_MEMORY_ALLOCATION_ERROR;
429 return;
430 }
431 nested = scratch.pointer();
432 }
433 switch (setMode) {
434 case 1:
435 nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);
436 break;
437 case 2:
438 chars.skipIgnored(opts);
439 nested->applyPropertyPattern(chars, patLocal, ec);
440 if (U_FAILURE(ec)) return;
441 break;
442 case 3: // `nested' already parsed
443 nested->_toPattern(patLocal, FALSE);
444 break;
445 }
446
447 usePat = TRUE;
448
449 if (mode == 0) {
450 // Entire pattern is a category; leave parse loop
451 *this = *nested;
452 mode = 2;
453 break;
454 }
455
456 switch (op) {
457 case HYPHEN: /*'-'*/
458 removeAll(*nested);
459 break;
460 case INTERSECTION: /*'&'*/
461 retainAll(*nested);
462 break;
463 case 0:
464 addAll(*nested);
465 break;
466 }
467
468 op = 0;
469 lastItem = 2;
470
471 continue;
472 }
473
474 if (mode == 0) {
475 // syntaxError(chars, "Missing '['");
476 ec = U_MALFORMED_SET;
477 return;
478 }
479
480 // -------- Parse special (syntax) characters. If the
481 // current character is not special, or if it is escaped,
482 // then fall through and handle it below.
483
484 if (!literal) {
485 switch (c) {
486 case 0x5D /*']'*/:
487 if (lastItem == 1) {
488 add(lastChar, lastChar);
489 _appendToPat(patLocal, lastChar, FALSE);
490 }
491 // Treat final trailing '-' as a literal
492 if (op == HYPHEN /*'-'*/) {
493 add(op, op);
494 patLocal.append(op);
495 } else if (op == INTERSECTION /*'&'*/) {
496 // syntaxError(chars, "Trailing '&'");
497 ec = U_MALFORMED_SET;
498 return;
499 }
500 patLocal.append((UChar) 0x5D /*']'*/);
501 mode = 2;
502 continue;
503 case HYPHEN /*'-'*/:
504 if (op == 0) {
505 if (lastItem != 0) {
506 op = (UChar) c;
507 continue;
508 } else {
509 // Treat final trailing '-' as a literal
510 add(c, c);
511 c = chars.next(opts, literal, ec);
512 if (U_FAILURE(ec)) return;
513 if (c == 0x5D /*']'*/ && !literal) {
514 patLocal.append(HYPHEN_RIGHT_BRACE, 2);
515 mode = 2;
516 continue;
517 }
518 }
519 }
520 // syntaxError(chars, "'-' not after char or set");
521 ec = U_MALFORMED_SET;
522 return;
523 case INTERSECTION /*'&'*/:
524 if (lastItem == 2 && op == 0) {
525 op = (UChar) c;
526 continue;
527 }
528 // syntaxError(chars, "'&' not after set");
529 ec = U_MALFORMED_SET;
530 return;
531 case 0x5E /*'^'*/:
532 // syntaxError(chars, "'^' not after '['");
533 ec = U_MALFORMED_SET;
534 return;
535 case 0x7B /*'{'*/:
536 if (op != 0) {
537 // syntaxError(chars, "Missing operand after operator");
538 ec = U_MALFORMED_SET;
539 return;
540 }
541 if (lastItem == 1) {
542 add(lastChar, lastChar);
543 _appendToPat(patLocal, lastChar, FALSE);
544 }
545 lastItem = 0;
546 buf.truncate(0);
547 {
548 UBool ok = FALSE;
549 while (!chars.atEnd()) {
550 c = chars.next(opts, literal, ec);
551 if (U_FAILURE(ec)) return;
552 if (c == 0x7D /*'}'*/ && !literal) {
553 ok = TRUE;
554 break;
555 }
556 buf.append(c);
557 }
558 if (buf.length() < 1 || !ok) {
559 // syntaxError(chars, "Invalid multicharacter string");
560 ec = U_MALFORMED_SET;
561 return;
562 }
563 }
564 // We have new string. Add it to set and continue;
565 // we don't need to drop through to the further
566 // processing
567 add(buf);
568 patLocal.append((UChar) 0x7B /*'{'*/);
569 _appendToPat(patLocal, buf, FALSE);
570 patLocal.append((UChar) 0x7D /*'}'*/);
571 continue;
572 case SymbolTable::SYMBOL_REF:
573 // symbols nosymbols
574 // [a-$] error error (ambiguous)
575 // [a$] anchor anchor
576 // [a-$x] var "x"* literal '$'
577 // [a-$.] error literal '$'
578 // *We won't get here in the case of var "x"
579 {
580 chars.getPos(backup);
581 c = chars.next(opts, literal, ec);
582 if (U_FAILURE(ec)) return;
583 UBool anchor = (c == 0x5D /*']'*/ && !literal);
584 if (symbols == 0 && !anchor) {
585 c = SymbolTable::SYMBOL_REF;
586 chars.setPos(backup);
587 break; // literal '$'
588 }
589 if (anchor && op == 0) {
590 if (lastItem == 1) {
591 add(lastChar, lastChar);
592 _appendToPat(patLocal, lastChar, FALSE);
593 }
594 add(U_ETHER);
595 usePat = TRUE;
596 patLocal.append((UChar) SymbolTable::SYMBOL_REF);
597 patLocal.append((UChar) 0x5D /*']'*/);
598 mode = 2;
599 continue;
600 }
601 // syntaxError(chars, "Unquoted '$'");
602 ec = U_MALFORMED_SET;
603 return;
604 }
605 default:
606 break;
607 }
608 }
609
610 // -------- Parse literal characters. This includes both
611 // escaped chars ("\u4E01") and non-syntax characters
612 // ("a").
613
614 switch (lastItem) {
615 case 0:
616 lastItem = 1;
617 lastChar = c;
618 break;
619 case 1:
620 if (op == HYPHEN /*'-'*/) {
621 if (lastChar >= c) {
622 // Don't allow redundant (a-a) or empty (b-a) ranges;
623 // these are most likely typos.
624 // syntaxError(chars, "Invalid range");
625 ec = U_MALFORMED_SET;
626 return;
627 }
628 add(lastChar, c);
629 _appendToPat(patLocal, lastChar, FALSE);
630 patLocal.append(op);
631 _appendToPat(patLocal, c, FALSE);
632 lastItem = 0;
633 op = 0;
634 } else {
635 add(lastChar, lastChar);
636 _appendToPat(patLocal, lastChar, FALSE);
637 lastChar = c;
638 }
639 break;
640 case 2:
641 if (op != 0) {
642 // syntaxError(chars, "Set expected after operator");
643 ec = U_MALFORMED_SET;
644 return;
645 }
646 lastChar = c;
647 lastItem = 1;
648 break;
649 }
650 }
651
652 if (mode != 2) {
653 // syntaxError(chars, "Missing ']'");
654 ec = U_MALFORMED_SET;
655 return;
656 }
657
658 chars.skipIgnored(opts);
659
660 /**
661 * Handle global flags (invert, case insensitivity). If this
662 * pattern should be compiled case-insensitive, then we need
663 * to close over case BEFORE COMPLEMENTING. This makes
664 * patterns like /[^abc]/i work.
665 */
666 if ((options & USET_CASE_INSENSITIVE) != 0) {
667 (this->*caseClosure)(USET_CASE_INSENSITIVE);
668 }
669 else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
670 (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
671 }
672 if (invert) {
673 complement();
674 }
675
676 // Use the rebuilt pattern (patLocal) only if necessary. Prefer the
677 // generated pattern.
678 if (usePat) {
679 rebuiltPat.append(patLocal);
680 } else {
681 _generatePattern(rebuiltPat, FALSE);
682 }
683 if (isBogus() && U_SUCCESS(ec)) {
684 // We likely ran out of memory. AHHH!
685 ec = U_MEMORY_ALLOCATION_ERROR;
686 }
687}
688
689//----------------------------------------------------------------
690// Property set implementation
691//----------------------------------------------------------------
692
693namespace {
694
695static UBool numericValueFilter(UChar32 ch, void* context) {
696 return u_getNumericValue(ch) == *(double*)context;
697}
698
699static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
700 int32_t value = *(int32_t*)context;
701 return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
702}
703
704static UBool versionFilter(UChar32 ch, void* context) {
705 static const UVersionInfo none = { 0, 0, 0, 0 };
706 UVersionInfo v;
707 u_charAge(ch, v);
708 UVersionInfo* version = (UVersionInfo*)context;
709 return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
710}
711
712typedef struct {
713 UProperty prop;
714 int32_t value;
715} IntPropertyContext;
716
717static UBool intPropertyFilter(UChar32 ch, void* context) {
718 IntPropertyContext* c = (IntPropertyContext*)context;
719 return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
720}
721
722static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
723 return uscript_hasScript(ch, *(UScriptCode*)context);
724}
725
726} // namespace
727
728/**
729 * Generic filter-based scanning code for UCD property UnicodeSets.
730 */
731void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
732 void* context,
733 const UnicodeSet* inclusions,
734 UErrorCode &status) {
735 if (U_FAILURE(status)) return;
736
737 // Logically, walk through all Unicode characters, noting the start
738 // and end of each range for which filter.contain(c) is
739 // true. Add each range to a set.
740 //
741 // To improve performance, use an inclusions set which
742 // encodes information about character ranges that are known
743 // to have identical properties.
744 // inclusions contains the first characters of
745 // same-value ranges for the given property.
746
747 clear();
748
749 UChar32 startHasProperty = -1;
750 int32_t limitRange = inclusions->getRangeCount();
751
752 for (int j=0; j<limitRange; ++j) {
753 // get current range
754 UChar32 start = inclusions->getRangeStart(j);
755 UChar32 end = inclusions->getRangeEnd(j);
756
757 // for all the code points in the range, process
758 for (UChar32 ch = start; ch <= end; ++ch) {
759 // only add to this UnicodeSet on inflection points --
760 // where the hasProperty value changes to false
761 if ((*filter)(ch, context)) {
762 if (startHasProperty < 0) {
763 startHasProperty = ch;
764 }
765 } else if (startHasProperty >= 0) {
766 add(startHasProperty, ch-1);
767 startHasProperty = -1;
768 }
769 }
770 }
771 if (startHasProperty >= 0) {
772 add((UChar32)startHasProperty, (UChar32)0x10FFFF);
773 }
774 if (isBogus() && U_SUCCESS(status)) {
775 // We likely ran out of memory. AHHH!
776 status = U_MEMORY_ALLOCATION_ERROR;
777 }
778}
779
780namespace {
781
782static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
783 /* Note: we use ' ' in compiler code page */
784 int32_t j = 0;
785 char ch;
786 --dstCapacity; /* make room for term. zero */
787 while ((ch = *src++) != 0) {
788 if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
789 continue;
790 }
791 if (j >= dstCapacity) return FALSE;
792 dst[j++] = ch;
793 }
794 if (j > 0 && dst[j-1] == ' ') --j;
795 dst[j] = 0;
796 return TRUE;
797}
798
799} // namespace
800
801//----------------------------------------------------------------
802// Property set API
803//----------------------------------------------------------------
804
// Sets ec to U_ILLEGAL_ARGUMENT_ERROR and returns *this from the
// enclosing function.
#define FAIL(ec) UPRV_BLOCK_MACRO_BEGIN { \
    ec=U_ILLEGAL_ARGUMENT_ERROR; \
    return *this; \
} UPRV_BLOCK_MACRO_END
809
810UnicodeSet&
811UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
812 if (U_FAILURE(ec) || isFrozen()) { return *this; }
813 if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
814 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
815 applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);
816 } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
817 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
818 UScriptCode script = (UScriptCode)value;
819 applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
820 } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) {
821 if (value == 0 || value == 1) {
822 const USet *set = u_getBinaryPropertySet(prop, &ec);
823 if (U_FAILURE(ec)) { return *this; }
824 copyFrom(*UnicodeSet::fromUSet(set), TRUE);
825 if (value == 0) {
826 complement();
827 }
828 } else {
829 clear();
830 }
831 } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
832 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
833 IntPropertyContext c = {prop, value};
834 applyFilter(intPropertyFilter, &c, inclusions, ec);
835 } else {
836 ec = U_ILLEGAL_ARGUMENT_ERROR;
837 }
838 return *this;
839}
840
841UnicodeSet&
842UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
843 const UnicodeString& value,
844 UErrorCode& ec) {
845 if (U_FAILURE(ec) || isFrozen()) return *this;
846
847 // prop and value used to be converted to char * using the default
848 // converter instead of the invariant conversion.
849 // This should not be necessary because all Unicode property and value
850 // names use only invariant characters.
851 // If there are any variant characters, then we won't find them anyway.
852 // Checking first avoids assertion failures in the conversion.
853 if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
854 !uprv_isInvariantUString(value.getBuffer(), value.length())
855 ) {
856 FAIL(ec);
857 }
858 CharString pname, vname;
859 pname.appendInvariantChars(prop, ec);
860 vname.appendInvariantChars(value, ec);
861 if (U_FAILURE(ec)) return *this;
862
863 UProperty p;
864 int32_t v;
865 UBool invert = FALSE;
866
867 if (value.length() > 0) {
868 p = u_getPropertyEnum(pname.data());
869 if (p == UCHAR_INVALID_CODE) FAIL(ec);
870
871 // Treat gc as gcm
872 if (p == UCHAR_GENERAL_CATEGORY) {
873 p = UCHAR_GENERAL_CATEGORY_MASK;
874 }
875
876 if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
877 (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
878 (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
879 v = u_getPropertyValueEnum(p, vname.data());
880 if (v == UCHAR_INVALID_CODE) {
881 // Handle numeric CCC
882 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
883 p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
884 p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
885 char* end;
886 double val = uprv_strtod(vname.data(), &end);
887 // Anything between 0 and 255 is valid even if unused.
888 // Cast double->int only after range check.
889 // We catch NaN here because comparing it with both 0 and 255 will be false
890 // (as are all comparisons with NaN).
891 if (*end != 0 || !(0 <= val && val <= 255) ||
892 (v = (int32_t)val) != val) {
893 // non-integral value or outside 0..255, or trailing junk
894 FAIL(ec);
895 }
896 } else {
897 FAIL(ec);
898 }
899 }
900 }
901
902 else {
903
904 switch (p) {
905 case UCHAR_NUMERIC_VALUE:
906 {
907 char* end;
908 double val = uprv_strtod(vname.data(), &end);
909 if (*end != 0) {
910 FAIL(ec);
911 }
912 applyFilter(numericValueFilter, &val,
913 CharacterProperties::getInclusionsForProperty(p, ec), ec);
914 return *this;
915 }
916 case UCHAR_NAME:
917 {
918 // Must munge name, since u_charFromName() does not do
919 // 'loose' matching.
920 char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
921 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
922 UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
923 if (U_SUCCESS(ec)) {
924 clear();
925 add(ch);
926 return *this;
927 } else {
928 FAIL(ec);
929 }
930 }
931 case UCHAR_UNICODE_1_NAME:
932 // ICU 49 deprecates the Unicode_1_Name property APIs.
933 FAIL(ec);
934 case UCHAR_AGE:
935 {
936 // Must munge name, since u_versionFromString() does not do
937 // 'loose' matching.
938 char buf[128];
939 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
940 UVersionInfo version;
941 u_versionFromString(version, buf);
942 applyFilter(versionFilter, &version,
943 CharacterProperties::getInclusionsForProperty(p, ec), ec);
944 return *this;
945 }
946 case UCHAR_SCRIPT_EXTENSIONS:
947 v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
948 if (v == UCHAR_INVALID_CODE) {
949 FAIL(ec);
950 }
951 // fall through to calling applyIntPropertyValue()
952 break;
953 default:
954 // p is a non-binary, non-enumerated property that we
955 // don't support (yet).
956 FAIL(ec);
957 }
958 }
959 }
960
961 else {
962 // value is empty. Interpret as General Category, Script, or
963 // Binary property.
964 p = UCHAR_GENERAL_CATEGORY_MASK;
965 v = u_getPropertyValueEnum(p, pname.data());
966 if (v == UCHAR_INVALID_CODE) {
967 p = UCHAR_SCRIPT;
968 v = u_getPropertyValueEnum(p, pname.data());
969 if (v == UCHAR_INVALID_CODE) {
970 p = u_getPropertyEnum(pname.data());
971 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
972 v = 1;
973 } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
974 set(MIN_VALUE, MAX_VALUE);
975 return *this;
976 } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
977 set(0, 0x7F);
978 return *this;
979 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
980 // [:Assigned:]=[:^Cn:]
981 p = UCHAR_GENERAL_CATEGORY_MASK;
982 v = U_GC_CN_MASK;
983 invert = TRUE;
984 } else {
985 FAIL(ec);
986 }
987 }
988 }
989 }
990
991 applyIntPropertyValue(p, v, ec);
992 if(invert) {
993 complement();
994 }
995
996 if (isBogus() && U_SUCCESS(ec)) {
997 // We likely ran out of memory. AHHH!
998 ec = U_MEMORY_ALLOCATION_ERROR;
999 }
1000 return *this;
1001}
1002
1003//----------------------------------------------------------------
1004// Property set patterns
1005//----------------------------------------------------------------
1006
1007/**
1008 * Return true if the given position, in the given pattern, appears
1009 * to be the start of a property set pattern.
1010 */
1011UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
1012 int32_t pos) {
1013 // Patterns are at least 5 characters long
1014 if ((pos+5) > pattern.length()) {
1015 return FALSE;
1016 }
1017
1018 // Look for an opening [:, [:^, \p, or \P
1019 return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
1020}
1021
1022/**
1023 * Return true if the given iterator appears to point at a
1024 * property pattern. Regardless of the result, return with the
1025 * iterator unchanged.
1026 * @param chars iterator over the pattern characters. Upon return
1027 * it will be unchanged.
1028 * @param iterOpts RuleCharacterIterator options
1029 */
1030UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
1031 int32_t iterOpts) {
1032 // NOTE: literal will always be FALSE, because we don't parse escapes.
1033 UBool result = FALSE, literal;
1034 UErrorCode ec = U_ZERO_ERROR;
1035 iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1036 RuleCharacterIterator::Pos pos;
1037 chars.getPos(pos);
1038 UChar32 c = chars.next(iterOpts, literal, ec);
1039 if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {
1040 UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1041 literal, ec);
1042 result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :
1043 (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/);
1044 }
1045 chars.setPos(pos);
1046 return result && U_SUCCESS(ec);
1047}
1048
1049/**
1050 * Parse the given property pattern at the given parse position.
1051 */
1052UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1053 ParsePosition& ppos,
1054 UErrorCode &ec) {
1055 int32_t pos = ppos.getIndex();
1056
1057 UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1058 UBool isName = FALSE; // true for \N{pat}, o/w false
1059 UBool invert = FALSE;
1060
1061 if (U_FAILURE(ec)) return *this;
1062
1063 // Minimum length is 5 characters, e.g. \p{L}
1064 if ((pos+5) > pattern.length()) {
1065 FAIL(ec);
1066 }
1067
1068 // On entry, ppos should point to one of the following locations:
1069 // Look for an opening [:, [:^, \p, or \P
1070 if (isPOSIXOpen(pattern, pos)) {
1071 posix = TRUE;
1072 pos += 2;
1073 pos = ICU_Utility::skipWhitespace(pattern, pos);
1074 if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
1075 ++pos;
1076 invert = TRUE;
1077 }
1078 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
1079 UChar c = pattern.charAt(pos+1);
1080 invert = (c == UPPER_P);
1081 isName = (c == UPPER_N);
1082 pos += 2;
1083 pos = ICU_Utility::skipWhitespace(pattern, pos);
1084 if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) {
1085 // Syntax error; "\p" or "\P" not followed by "{"
1086 FAIL(ec);
1087 }
1088 } else {
1089 // Open delimiter not seen
1090 FAIL(ec);
1091 }
1092
1093 // Look for the matching close delimiter, either :] or }
1094 int32_t close;
1095 if (posix) {
1096 close = pattern.indexOf(POSIX_CLOSE, 2, pos);
1097 } else {
1098 close = pattern.indexOf(CLOSE_BRACE, pos);
1099 }
1100 if (close < 0) {
1101 // Syntax error; close delimiter missing
1102 FAIL(ec);
1103 }
1104
1105 // Look for an '=' sign. If this is present, we will parse a
1106 // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1107 // pattern.
1108 int32_t equals = pattern.indexOf(EQUALS, pos);
1109 UnicodeString propName, valueName;
1110 if (equals >= 0 && equals < close && !isName) {
1111 // Equals seen; parse medium/long pattern
1112 pattern.extractBetween(pos, equals, propName);
1113 pattern.extractBetween(equals+1, close, valueName);
1114 }
1115
1116 else {
1117 // Handle case where no '=' is seen, and \N{}
1118 pattern.extractBetween(pos, close, propName);
1119
1120 // Handle \N{name}
1121 if (isName) {
1122 // This is a little inefficient since it means we have to
1123 // parse NAME_PROP back to UCHAR_NAME even though we already
1124 // know it's UCHAR_NAME. If we refactor the API to
1125 // support args of (UProperty, char*) then we can remove
1126 // NAME_PROP and make this a little more efficient.
1127 valueName = propName;
1128 propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
1129 }
1130 }
1131
1132 applyPropertyAlias(propName, valueName, ec);
1133
1134 if (U_SUCCESS(ec)) {
1135 if (invert) {
1136 complement();
1137 }
1138
1139 // Move to the limit position after the close delimiter if the
1140 // parse succeeded.
1141 ppos.setIndex(close + (posix ? 2 : 1));
1142 }
1143
1144 return *this;
1145}
1146
1147/**
1148 * Parse a property pattern.
1149 * @param chars iterator over the pattern characters. Upon return
1150 * it will be advanced to the first character after the parsed
1151 * pattern, or the end of the iteration if all characters are
1152 * parsed.
1153 * @param rebuiltPat the pattern that was parsed, rebuilt or
1154 * copied from the input pattern, as appropriate.
1155 */
1156void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1157 UnicodeString& rebuiltPat,
1158 UErrorCode& ec) {
1159 if (U_FAILURE(ec)) return;
1160 UnicodeString pattern;
1161 chars.lookahead(pattern);
1162 ParsePosition pos(0);
1163 applyPropertyPattern(pattern, pos, ec);
1164 if (U_FAILURE(ec)) return;
1165 if (pos.getIndex() == 0) {
1166 // syntaxError(chars, "Invalid property pattern");
1167 ec = U_MALFORMED_SET;
1168 return;
1169 }
1170 chars.jumpahead(pos.getIndex());
1171 rebuiltPat.append(pattern, 0, pos.getIndex());
1172}
1173
1174U_NAMESPACE_END
1175