1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | // |
4 | // file: repattrn.cpp |
5 | // |
6 | /* |
7 | *************************************************************************** |
8 | * Copyright (C) 2002-2016 International Business Machines Corporation |
9 | * and others. All rights reserved. |
10 | *************************************************************************** |
11 | */ |
12 | |
13 | #include "unicode/utypes.h" |
14 | |
15 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
16 | |
17 | #include "unicode/regex.h" |
18 | #include "unicode/uclean.h" |
19 | #include "cmemory.h" |
20 | #include "cstr.h" |
21 | #include "uassert.h" |
22 | #include "uhash.h" |
23 | #include "uvector.h" |
24 | #include "uvectr32.h" |
25 | #include "uvectr64.h" |
26 | #include "regexcmp.h" |
27 | #include "regeximp.h" |
28 | #include "regexst.h" |
29 | |
30 | U_NAMESPACE_BEGIN |
31 | |
32 | //-------------------------------------------------------------------------- |
33 | // |
34 | // RegexPattern Default Constructor |
35 | // |
36 | //-------------------------------------------------------------------------- |
37 | RegexPattern::RegexPattern() { |
38 | // Init all of this instances data. |
39 | init(); |
40 | } |
41 | |
42 | |
43 | //-------------------------------------------------------------------------- |
44 | // |
45 | // Copy Constructor Note: This is a rather inefficient implementation, |
46 | // but it probably doesn't matter. |
47 | // |
48 | //-------------------------------------------------------------------------- |
49 | RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) { |
50 | init(); |
51 | *this = other; |
52 | } |
53 | |
54 | |
55 | |
56 | //-------------------------------------------------------------------------- |
57 | // |
58 | // Assignment Operator |
59 | // |
60 | //-------------------------------------------------------------------------- |
61 | RegexPattern &RegexPattern::operator = (const RegexPattern &other) { |
62 | if (this == &other) { |
63 | // Source and destination are the same. Don't do anything. |
64 | return *this; |
65 | } |
66 | |
67 | // Clean out any previous contents of object being assigned to. |
68 | zap(); |
69 | |
70 | // Give target object a default initialization |
71 | init(); |
72 | |
73 | // Copy simple fields |
74 | fDeferredStatus = other.fDeferredStatus; |
75 | |
76 | if (U_FAILURE(fDeferredStatus)) { |
77 | return *this; |
78 | } |
79 | |
80 | if (other.fPatternString == NULL) { |
81 | fPatternString = NULL; |
82 | fPattern = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus); |
83 | } else { |
84 | fPatternString = new UnicodeString(*(other.fPatternString)); |
85 | if (fPatternString == NULL) { |
86 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
87 | } else { |
88 | fPattern = utext_openConstUnicodeString(NULL, fPatternString, &fDeferredStatus); |
89 | } |
90 | } |
91 | if (U_FAILURE(fDeferredStatus)) { |
92 | return *this; |
93 | } |
94 | |
95 | fFlags = other.fFlags; |
96 | fLiteralText = other.fLiteralText; |
97 | fMinMatchLen = other.fMinMatchLen; |
98 | fFrameSize = other.fFrameSize; |
99 | fDataSize = other.fDataSize; |
100 | |
101 | fStartType = other.fStartType; |
102 | fInitialStringIdx = other.fInitialStringIdx; |
103 | fInitialStringLen = other.fInitialStringLen; |
104 | *fInitialChars = *other.fInitialChars; |
105 | fInitialChar = other.fInitialChar; |
106 | *fInitialChars8 = *other.fInitialChars8; |
107 | fNeedsAltInput = other.fNeedsAltInput; |
108 | |
109 | // Copy the pattern. It's just values, nothing deep to copy. |
110 | fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus); |
111 | fGroupMap->assign(*other.fGroupMap, fDeferredStatus); |
112 | |
113 | // Copy the Unicode Sets. |
114 | // Could be made more efficient if the sets were reference counted and shared, |
115 | // but I doubt that pattern copying will be particularly common. |
116 | // Note: init() already added an empty element zero to fSets |
117 | int32_t i; |
118 | int32_t numSets = other.fSets->size(); |
119 | fSets8 = new Regex8BitSet[numSets]; |
120 | if (fSets8 == NULL) { |
121 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
122 | return *this; |
123 | } |
124 | for (i=1; i<numSets; i++) { |
125 | if (U_FAILURE(fDeferredStatus)) { |
126 | return *this; |
127 | } |
128 | UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i); |
129 | UnicodeSet *newSet = new UnicodeSet(*sourceSet); |
130 | if (newSet == NULL) { |
131 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
132 | break; |
133 | } |
134 | fSets->addElement(newSet, fDeferredStatus); |
135 | fSets8[i] = other.fSets8[i]; |
136 | } |
137 | |
138 | // Copy the named capture group hash map. |
139 | if (other.fNamedCaptureMap != nullptr && initNamedCaptureMap()) { |
140 | int32_t hashPos = UHASH_FIRST; |
141 | while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) { |
142 | if (U_FAILURE(fDeferredStatus)) { |
143 | break; |
144 | } |
145 | const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer; |
146 | UnicodeString *key = new UnicodeString(*name); |
147 | int32_t val = hashEl->value.integer; |
148 | if (key == NULL) { |
149 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
150 | } else { |
151 | uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus); |
152 | } |
153 | } |
154 | } |
155 | return *this; |
156 | } |
157 | |
158 | |
159 | //-------------------------------------------------------------------------- |
160 | // |
161 | // init Shared initialization for use by constructors. |
162 | // Bring an uninitialized RegexPattern up to a default state. |
163 | // |
164 | //-------------------------------------------------------------------------- |
165 | void RegexPattern::init() { |
166 | fFlags = 0; |
167 | fCompiledPat = 0; |
168 | fLiteralText.remove(); |
169 | fSets = NULL; |
170 | fSets8 = NULL; |
171 | fDeferredStatus = U_ZERO_ERROR; |
172 | fMinMatchLen = 0; |
173 | fFrameSize = 0; |
174 | fDataSize = 0; |
175 | fGroupMap = NULL; |
176 | fStartType = START_NO_INFO; |
177 | fInitialStringIdx = 0; |
178 | fInitialStringLen = 0; |
179 | fInitialChars = NULL; |
180 | fInitialChar = 0; |
181 | fInitialChars8 = NULL; |
182 | fNeedsAltInput = FALSE; |
183 | fNamedCaptureMap = NULL; |
184 | |
185 | fPattern = NULL; // will be set later |
186 | fPatternString = NULL; // may be set later |
187 | fCompiledPat = new UVector64(fDeferredStatus); |
188 | fGroupMap = new UVector32(fDeferredStatus); |
189 | fSets = new UVector(fDeferredStatus); |
190 | fInitialChars = new UnicodeSet; |
191 | fInitialChars8 = new Regex8BitSet; |
192 | if (U_FAILURE(fDeferredStatus)) { |
193 | return; |
194 | } |
195 | if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL || |
196 | fInitialChars == NULL || fInitialChars8 == NULL) { |
197 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
198 | return; |
199 | } |
200 | |
201 | // Slot zero of the vector of sets is reserved. Fill it here. |
202 | fSets->addElement((int32_t)0, fDeferredStatus); |
203 | } |
204 | |
205 | |
206 | bool RegexPattern::initNamedCaptureMap() { |
207 | if (fNamedCaptureMap) { |
208 | return true; |
209 | } |
210 | fNamedCaptureMap = uhash_openSize(uhash_hashUnicodeString, // Key hash function |
211 | uhash_compareUnicodeString, // Key comparator function |
212 | uhash_compareLong, // Value comparator function |
213 | 7, // Initial table capacity |
214 | &fDeferredStatus); |
215 | if (U_FAILURE(fDeferredStatus)) { |
216 | return false; |
217 | } |
218 | |
219 | // fNamedCaptureMap owns its key strings, type (UnicodeString *) |
220 | uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject); |
221 | return true; |
222 | } |
223 | |
224 | //-------------------------------------------------------------------------- |
225 | // |
226 | // zap Delete everything owned by this RegexPattern. |
227 | // |
228 | //-------------------------------------------------------------------------- |
229 | void RegexPattern::zap() { |
230 | delete fCompiledPat; |
231 | fCompiledPat = NULL; |
232 | int i; |
233 | for (i=1; i<fSets->size(); i++) { |
234 | UnicodeSet *s; |
235 | s = (UnicodeSet *)fSets->elementAt(i); |
236 | if (s != NULL) { |
237 | delete s; |
238 | } |
239 | } |
240 | delete fSets; |
241 | fSets = NULL; |
242 | delete[] fSets8; |
243 | fSets8 = NULL; |
244 | delete fGroupMap; |
245 | fGroupMap = NULL; |
246 | delete fInitialChars; |
247 | fInitialChars = NULL; |
248 | delete fInitialChars8; |
249 | fInitialChars8 = NULL; |
250 | if (fPattern != NULL) { |
251 | utext_close(fPattern); |
252 | fPattern = NULL; |
253 | } |
254 | if (fPatternString != NULL) { |
255 | delete fPatternString; |
256 | fPatternString = NULL; |
257 | } |
258 | if (fNamedCaptureMap != NULL) { |
259 | uhash_close(fNamedCaptureMap); |
260 | fNamedCaptureMap = NULL; |
261 | } |
262 | } |
263 | |
264 | |
265 | //-------------------------------------------------------------------------- |
266 | // |
267 | // Destructor |
268 | // |
269 | //-------------------------------------------------------------------------- |
270 | RegexPattern::~RegexPattern() { |
271 | zap(); |
272 | } |
273 | |
274 | |
275 | //-------------------------------------------------------------------------- |
276 | // |
277 | // Clone |
278 | // |
279 | //-------------------------------------------------------------------------- |
280 | RegexPattern *RegexPattern::clone() const { |
281 | RegexPattern *copy = new RegexPattern(*this); |
282 | return copy; |
283 | } |
284 | |
285 | |
286 | //-------------------------------------------------------------------------- |
287 | // |
//    operator ==   (comparison)    Consider two patterns to be == if the
289 | // pattern strings and the flags are the same. |
290 | // Note that pattern strings with the same |
291 | // characters can still be considered different. |
292 | // |
293 | //-------------------------------------------------------------------------- |
294 | UBool RegexPattern::operator ==(const RegexPattern &other) const { |
295 | if (this->fFlags == other.fFlags && this->fDeferredStatus == other.fDeferredStatus) { |
296 | if (this->fPatternString != NULL && other.fPatternString != NULL) { |
297 | return *(this->fPatternString) == *(other.fPatternString); |
298 | } else if (this->fPattern == NULL) { |
299 | if (other.fPattern == NULL) { |
300 | return TRUE; |
301 | } |
302 | } else if (other.fPattern != NULL) { |
303 | UTEXT_SETNATIVEINDEX(this->fPattern, 0); |
304 | UTEXT_SETNATIVEINDEX(other.fPattern, 0); |
305 | return utext_equals(this->fPattern, other.fPattern); |
306 | } |
307 | } |
308 | return FALSE; |
309 | } |
310 | |
311 | //--------------------------------------------------------------------- |
312 | // |
313 | // compile |
314 | // |
315 | //--------------------------------------------------------------------- |
316 | RegexPattern * U_EXPORT2 |
317 | RegexPattern::compile(const UnicodeString ®ex, |
318 | uint32_t flags, |
319 | UParseError &pe, |
320 | UErrorCode &status) |
321 | { |
322 | if (U_FAILURE(status)) { |
323 | return NULL; |
324 | } |
325 | |
326 | const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | |
327 | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | |
328 | UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL; |
329 | |
330 | if ((flags & ~allFlags) != 0) { |
331 | status = U_REGEX_INVALID_FLAG; |
332 | return NULL; |
333 | } |
334 | |
335 | if ((flags & UREGEX_CANON_EQ) != 0) { |
336 | status = U_REGEX_UNIMPLEMENTED; |
337 | return NULL; |
338 | } |
339 | |
340 | RegexPattern *This = new RegexPattern; |
341 | if (This == NULL) { |
342 | status = U_MEMORY_ALLOCATION_ERROR; |
343 | return NULL; |
344 | } |
345 | if (U_FAILURE(This->fDeferredStatus)) { |
346 | status = This->fDeferredStatus; |
347 | delete This; |
348 | return NULL; |
349 | } |
350 | This->fFlags = flags; |
351 | |
352 | RegexCompile compiler(This, status); |
353 | compiler.compile(regex, pe, status); |
354 | |
355 | if (U_FAILURE(status)) { |
356 | delete This; |
357 | This = NULL; |
358 | } |
359 | |
360 | return This; |
361 | } |
362 | |
363 | |
364 | // |
365 | // compile, UText mode |
366 | // |
367 | RegexPattern * U_EXPORT2 |
368 | RegexPattern::compile(UText *regex, |
369 | uint32_t flags, |
370 | UParseError &pe, |
371 | UErrorCode &status) |
372 | { |
373 | if (U_FAILURE(status)) { |
374 | return NULL; |
375 | } |
376 | |
377 | const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | |
378 | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | |
379 | UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL; |
380 | |
381 | if ((flags & ~allFlags) != 0) { |
382 | status = U_REGEX_INVALID_FLAG; |
383 | return NULL; |
384 | } |
385 | |
386 | if ((flags & UREGEX_CANON_EQ) != 0) { |
387 | status = U_REGEX_UNIMPLEMENTED; |
388 | return NULL; |
389 | } |
390 | |
391 | RegexPattern *This = new RegexPattern; |
392 | if (This == NULL) { |
393 | status = U_MEMORY_ALLOCATION_ERROR; |
394 | return NULL; |
395 | } |
396 | if (U_FAILURE(This->fDeferredStatus)) { |
397 | status = This->fDeferredStatus; |
398 | delete This; |
399 | return NULL; |
400 | } |
401 | This->fFlags = flags; |
402 | |
403 | RegexCompile compiler(This, status); |
404 | compiler.compile(regex, pe, status); |
405 | |
406 | if (U_FAILURE(status)) { |
407 | delete This; |
408 | This = NULL; |
409 | } |
410 | |
411 | return This; |
412 | } |
413 | |
414 | // |
415 | // compile with default flags. |
416 | // |
417 | RegexPattern * U_EXPORT2 |
418 | RegexPattern::compile(const UnicodeString ®ex, |
419 | UParseError &pe, |
420 | UErrorCode &err) |
421 | { |
422 | return compile(regex, 0, pe, err); |
423 | } |
424 | |
425 | |
426 | // |
427 | // compile with default flags, UText mode |
428 | // |
429 | RegexPattern * U_EXPORT2 |
430 | RegexPattern::compile(UText *regex, |
431 | UParseError &pe, |
432 | UErrorCode &err) |
433 | { |
434 | return compile(regex, 0, pe, err); |
435 | } |
436 | |
437 | |
438 | // |
439 | // compile with no UParseErr parameter. |
440 | // |
441 | RegexPattern * U_EXPORT2 |
442 | RegexPattern::compile(const UnicodeString ®ex, |
443 | uint32_t flags, |
444 | UErrorCode &err) |
445 | { |
446 | UParseError pe; |
447 | return compile(regex, flags, pe, err); |
448 | } |
449 | |
450 | |
451 | // |
452 | // compile with no UParseErr parameter, UText mode |
453 | // |
454 | RegexPattern * U_EXPORT2 |
455 | RegexPattern::compile(UText *regex, |
456 | uint32_t flags, |
457 | UErrorCode &err) |
458 | { |
459 | UParseError pe; |
460 | return compile(regex, flags, pe, err); |
461 | } |
462 | |
463 | |
464 | //--------------------------------------------------------------------- |
465 | // |
466 | // flags |
467 | // |
468 | //--------------------------------------------------------------------- |
469 | uint32_t RegexPattern::flags() const { |
470 | return fFlags; |
471 | } |
472 | |
473 | |
474 | //--------------------------------------------------------------------- |
475 | // |
476 | // matcher(UnicodeString, err) |
477 | // |
478 | //--------------------------------------------------------------------- |
479 | RegexMatcher *RegexPattern::matcher(const UnicodeString &input, |
480 | UErrorCode &status) const { |
481 | RegexMatcher *retMatcher = matcher(status); |
482 | if (retMatcher != NULL) { |
483 | retMatcher->fDeferredStatus = status; |
484 | retMatcher->reset(input); |
485 | } |
486 | return retMatcher; |
487 | } |
488 | |
489 | |
490 | //--------------------------------------------------------------------- |
491 | // |
492 | // matcher(status) |
493 | // |
494 | //--------------------------------------------------------------------- |
495 | RegexMatcher *RegexPattern::matcher(UErrorCode &status) const { |
496 | RegexMatcher *retMatcher = NULL; |
497 | |
498 | if (U_FAILURE(status)) { |
499 | return NULL; |
500 | } |
501 | if (U_FAILURE(fDeferredStatus)) { |
502 | status = fDeferredStatus; |
503 | return NULL; |
504 | } |
505 | |
506 | retMatcher = new RegexMatcher(this); |
507 | if (retMatcher == NULL) { |
508 | status = U_MEMORY_ALLOCATION_ERROR; |
509 | return NULL; |
510 | } |
511 | return retMatcher; |
512 | } |
513 | |
514 | |
515 | |
516 | //--------------------------------------------------------------------- |
517 | // |
518 | // matches Convenience function to test for a match, starting |
519 | // with a pattern string and a data string. |
520 | // |
521 | //--------------------------------------------------------------------- |
522 | UBool U_EXPORT2 RegexPattern::matches(const UnicodeString ®ex, |
523 | const UnicodeString &input, |
524 | UParseError &pe, |
525 | UErrorCode &status) { |
526 | |
527 | if (U_FAILURE(status)) {return FALSE;} |
528 | |
529 | UBool retVal; |
530 | RegexPattern *pat = NULL; |
531 | RegexMatcher *matcher = NULL; |
532 | |
533 | pat = RegexPattern::compile(regex, 0, pe, status); |
534 | matcher = pat->matcher(input, status); |
535 | retVal = matcher->matches(status); |
536 | |
537 | delete matcher; |
538 | delete pat; |
539 | return retVal; |
540 | } |
541 | |
542 | |
543 | // |
544 | // matches, UText mode |
545 | // |
546 | UBool U_EXPORT2 RegexPattern::matches(UText *regex, |
547 | UText *input, |
548 | UParseError &pe, |
549 | UErrorCode &status) { |
550 | |
551 | if (U_FAILURE(status)) {return FALSE;} |
552 | |
553 | UBool retVal = FALSE; |
554 | RegexPattern *pat = NULL; |
555 | RegexMatcher *matcher = NULL; |
556 | |
557 | pat = RegexPattern::compile(regex, 0, pe, status); |
558 | matcher = pat->matcher(status); |
559 | if (U_SUCCESS(status)) { |
560 | matcher->reset(input); |
561 | retVal = matcher->matches(status); |
562 | } |
563 | |
564 | delete matcher; |
565 | delete pat; |
566 | return retVal; |
567 | } |
568 | |
569 | |
570 | |
571 | |
572 | |
573 | //--------------------------------------------------------------------- |
574 | // |
575 | // pattern |
576 | // |
577 | //--------------------------------------------------------------------- |
578 | UnicodeString RegexPattern::pattern() const { |
579 | if (fPatternString != NULL) { |
580 | return *fPatternString; |
581 | } else if (fPattern == NULL) { |
582 | return UnicodeString(); |
583 | } else { |
584 | UErrorCode status = U_ZERO_ERROR; |
585 | int64_t nativeLen = utext_nativeLength(fPattern); |
586 | int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error |
587 | UnicodeString result; |
588 | |
589 | status = U_ZERO_ERROR; |
590 | UChar *resultChars = result.getBuffer(len16); |
591 | utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning |
592 | result.releaseBuffer(len16); |
593 | |
594 | return result; |
595 | } |
596 | } |
597 | |
598 | |
599 | |
600 | |
601 | //--------------------------------------------------------------------- |
602 | // |
603 | // patternText |
604 | // |
605 | //--------------------------------------------------------------------- |
606 | UText *RegexPattern::patternText(UErrorCode &status) const { |
607 | if (U_FAILURE(status)) {return NULL;} |
608 | status = U_ZERO_ERROR; |
609 | |
610 | if (fPattern != NULL) { |
611 | return fPattern; |
612 | } else { |
613 | RegexStaticSets::initGlobals(&status); |
614 | return RegexStaticSets::gStaticSets->fEmptyText; |
615 | } |
616 | } |
617 | |
618 | |
619 | //-------------------------------------------------------------------------------- |
620 | // |
621 | // groupNumberFromName() |
622 | // |
623 | //-------------------------------------------------------------------------------- |
624 | int32_t RegexPattern::groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const { |
625 | if (U_FAILURE(status)) { |
626 | return 0; |
627 | } |
628 | |
629 | // No need to explicitly check for syntactically valid names. |
630 | // Invalid ones will never be in the map, and the lookup will fail. |
631 | |
632 | int32_t number = fNamedCaptureMap ? uhash_geti(fNamedCaptureMap, &groupName) : 0; |
633 | if (number == 0) { |
634 | status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; |
635 | } |
636 | return number; |
637 | } |
638 | |
639 | int32_t RegexPattern::groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const { |
640 | if (U_FAILURE(status)) { |
641 | return 0; |
642 | } |
643 | UnicodeString name(groupName, nameLength, US_INV); |
644 | return groupNumberFromName(name, status); |
645 | } |
646 | |
647 | |
648 | //--------------------------------------------------------------------- |
649 | // |
650 | // split |
651 | // |
652 | //--------------------------------------------------------------------- |
653 | int32_t RegexPattern::split(const UnicodeString &input, |
654 | UnicodeString dest[], |
655 | int32_t destCapacity, |
656 | UErrorCode &status) const |
657 | { |
658 | if (U_FAILURE(status)) { |
659 | return 0; |
660 | } |
661 | |
662 | RegexMatcher m(this); |
663 | int32_t r = 0; |
664 | // Check m's status to make sure all is ok. |
665 | if (U_SUCCESS(m.fDeferredStatus)) { |
666 | r = m.split(input, dest, destCapacity, status); |
667 | } |
668 | return r; |
669 | } |
670 | |
671 | // |
672 | // split, UText mode |
673 | // |
674 | int32_t RegexPattern::split(UText *input, |
675 | UText *dest[], |
676 | int32_t destCapacity, |
677 | UErrorCode &status) const |
678 | { |
679 | if (U_FAILURE(status)) { |
680 | return 0; |
681 | } |
682 | |
683 | RegexMatcher m(this); |
684 | int32_t r = 0; |
685 | // Check m's status to make sure all is ok. |
686 | if (U_SUCCESS(m.fDeferredStatus)) { |
687 | r = m.split(input, dest, destCapacity, status); |
688 | } |
689 | return r; |
690 | } |
691 | |
692 | |
693 | //--------------------------------------------------------------------- |
694 | // |
695 | // dump Output the compiled form of the pattern. |
696 | // Debugging function only. |
697 | // |
698 | //--------------------------------------------------------------------- |
// Prints one compiled-pattern operation to stdout: its index, the raw op
// word in hex, the opcode name, and an operand rendered according to the
// opcode type.  Compiled to a no-op unless REGEX_DEBUG is defined.
//   index  position of the operation within fCompiledPat
void RegexPattern::dumpOp(int32_t index) const {
    (void)index; // Suppress warnings in non-debug build.
#if defined(REGEX_DEBUG)
    static const char * const opNames[] = {URX_OPCODE_NAMES};
    int32_t op = fCompiledPat->elementAti(index);
    int32_t val = URX_VAL(op);
    int32_t type = URX_TYPE(op);
    // Clamp the type used for the name lookup so that an out-of-range value
    // cannot index past the end of opNames; the switch below still sees the
    // real type and reports it via the default case.
    int32_t pinnedType = type;
    if ((uint32_t)pinnedType >= UPRV_LENGTHOF(opNames)) {
        pinnedType = 0;
    }

    printf("%4d %08x %-15s " , index, op, opNames[pinnedType]);
    switch (type) {
    case URX_NOP:
    case URX_DOTANY:
    case URX_DOTANY_ALL:
    case URX_FAIL:
    case URX_CARET:
    case URX_DOLLAR:
    case URX_BACKSLASH_G:
    case URX_BACKSLASH_X:
    case URX_END:
    case URX_DOLLAR_M:
    case URX_CARET_M:
        // Types with no operand field of interest.
        break;

    case URX_RESERVED_OP:
    case URX_START_CAPTURE:
    case URX_END_CAPTURE:
    case URX_STATE_SAVE:
    case URX_JMP:
    case URX_JMP_SAV:
    case URX_JMP_SAV_X:
    case URX_BACKSLASH_B:
    case URX_BACKSLASH_BU:
    case URX_BACKSLASH_D:
    case URX_BACKSLASH_Z:
    case URX_STRING_LEN:
    case URX_CTR_INIT:
    case URX_CTR_INIT_NG:
    case URX_CTR_LOOP:
    case URX_CTR_LOOP_NG:
    case URX_RELOC_OPRND:
    case URX_STO_SP:
    case URX_LD_SP:
    case URX_BACKREF:
    case URX_STO_INP_LOC:
    case URX_JMPX:
    case URX_LA_START:
    case URX_LA_END:
    case URX_BACKREF_I:
    case URX_LB_START:
    case URX_LB_CONT:
    case URX_LB_END:
    case URX_LBN_CONT:
    case URX_LBN_END:
    case URX_LOOP_C:
    case URX_LOOP_DOT_I:
    case URX_BACKSLASH_H:
    case URX_BACKSLASH_R:
    case URX_BACKSLASH_V:
        // Types with an integer operand field: print it in decimal.
        printf("%d" , val);
        break;

    case URX_ONECHAR:
    case URX_ONECHAR_I:
        // The operand is a single code point.  Values below 0x20 are shown
        // in hex; anything else is printed as a quoted character.
        if (val < 0x20) {
            printf("%#x" , val);
        } else {
            printf("'%s'" , CStr(UnicodeString(val))());
        }
        break;

    case URX_STRING:
    case URX_STRING_I:
        {
            // The operand is a start offset into fLiteralText; the length
            // comes from the operation that follows, which is asserted to
            // be a URX_STRING_LEN.
            int32_t lengthOp = fCompiledPat->elementAti(index+1);
            U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
            int32_t length = URX_VAL(lengthOp);
            UnicodeString str(fLiteralText, val, length);
            printf("%s" , CStr(str)());
        }
        break;

    case URX_SETREF:
    case URX_LOOP_SR_I:
        {
            // The operand is an index into fSets; print that set as a
            // pattern string.
            UnicodeString s;
            UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
            set->toPattern(s, TRUE);
            printf("%s" , CStr(s)());
        }
        break;

    case URX_STATIC_SETREF:
    case URX_STAT_SETREF_N:
        {
            // The operand is an index into the static property sets.  If the
            // URX_NEG_SET bit is present, print "NOT " and clear the bit
            // before using the value as an index.
            UnicodeString s;
            if (val & URX_NEG_SET) {
                printf("NOT " );
                val &= ~URX_NEG_SET;
            }
            UnicodeSet &set = RegexStaticSets::gStaticSets->fPropSets[val];
            set.toPattern(s, TRUE);
            printf("%s" , CStr(s)());
        }
        break;


    default:
        // Opcode type not handled by any case above.
        printf("??????" );
        break;
    }
    printf("\n" );
#endif
}
818 | |
819 | |
820 | void RegexPattern::dumpPattern() const { |
821 | #if defined(REGEX_DEBUG) |
822 | int index; |
823 | |
824 | UnicodeString patStr; |
825 | for (UChar32 c = utext_next32From(fPattern, 0); c != U_SENTINEL; c = utext_next32(fPattern)) { |
826 | patStr.append(c); |
827 | } |
828 | printf("Original Pattern: \"%s\"\n" , CStr(patStr)()); |
829 | printf(" Min Match Length: %d\n" , fMinMatchLen); |
830 | printf(" Match Start Type: %s\n" , START_OF_MATCH_STR(fStartType)); |
831 | if (fStartType == START_STRING) { |
832 | UnicodeString initialString(fLiteralText,fInitialStringIdx, fInitialStringLen); |
833 | printf(" Initial match string: \"%s\"\n" , CStr(initialString)()); |
834 | } else if (fStartType == START_SET) { |
835 | UnicodeString s; |
836 | fInitialChars->toPattern(s, TRUE); |
837 | printf(" Match First Chars: %s\n" , CStr(s)()); |
838 | |
839 | } else if (fStartType == START_CHAR) { |
840 | printf(" First char of Match: " ); |
841 | if (fInitialChar > 0x20) { |
842 | printf("'%s'\n" , CStr(UnicodeString(fInitialChar))()); |
843 | } else { |
844 | printf("%#x\n" , fInitialChar); |
845 | } |
846 | } |
847 | |
848 | printf("Named Capture Groups:\n" ); |
849 | if (!fNamedCaptureMap || uhash_count(fNamedCaptureMap) == 0) { |
850 | printf(" None\n" ); |
851 | } else { |
852 | int32_t pos = UHASH_FIRST; |
853 | const UHashElement *el = NULL; |
854 | while ((el = uhash_nextElement(fNamedCaptureMap, &pos))) { |
855 | const UnicodeString *name = (const UnicodeString *)el->key.pointer; |
856 | int32_t number = el->value.integer; |
857 | printf(" %d\t%s\n" , number, CStr(*name)()); |
858 | } |
859 | } |
860 | |
861 | printf("\nIndex Binary Type Operand\n" \ |
862 | "-------------------------------------------\n" ); |
863 | for (index = 0; index<fCompiledPat->size(); index++) { |
864 | dumpOp(index); |
865 | } |
866 | printf("\n\n" ); |
867 | #endif |
868 | } |
869 | |
870 | |
871 | |
// NOTE(review): presumably this macro emits the class-ID (RTTI) member
// definitions for RegexPattern - confirm against its definition, which is
// not visible in this file.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern)
873 | |
874 | U_NAMESPACE_END |
875 | #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
876 | |