1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**************************************************************************
5* Copyright (C) 2002-2016 International Business Machines Corporation
6* and others. All rights reserved.
7**************************************************************************
8*/
9//
10// file: rematch.cpp
11//
12// Contains the implementation of class RegexMatcher,
13// which is one of the main API classes for the ICU regular expression package.
14//
15
16#include "unicode/utypes.h"
17#if !UCONFIG_NO_REGULAR_EXPRESSIONS
18
19#include "unicode/regex.h"
20#include "unicode/uniset.h"
21#include "unicode/uchar.h"
22#include "unicode/ustring.h"
23#include "unicode/rbbi.h"
24#include "unicode/utf.h"
25#include "unicode/utf16.h"
26#include "uassert.h"
27#include "cmemory.h"
28#include "cstr.h"
29#include "uvector.h"
30#include "uvectr32.h"
31#include "uvectr64.h"
32#include "regeximp.h"
33#include "regexst.h"
34#include "regextxt.h"
35#include "ucase.h"
36
37// #include <malloc.h> // Needed for heapcheck testing
38
39
40U_NAMESPACE_BEGIN
41
42// Default limit for the size of the back track stack, to avoid system
43// failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
44// This value puts ICU's limits higher than most other regexp implementations,
45// which use recursion rather than the heap, and take more storage per
46// backtrack point.
47//
48static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
49
50// Time limit counter constant.
51// Time limits for expression evaluation are in terms of quanta of work by
52// the engine, each of which is 10,000 state saves.
53// This constant determines that state saves per tick number.
54static const int32_t TIMER_INITIAL_VALUE = 10000;
55
56
57// Test for any of the Unicode line terminating characters.
58static inline UBool isLineTerminator(UChar32 c) {
59 if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
60 return false;
61 }
62 return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
63}
64
65//-----------------------------------------------------------------------------
66//
67// Constructor and Destructor
68//
69//-----------------------------------------------------------------------------
70RegexMatcher::RegexMatcher(const RegexPattern *pat) {
71 fDeferredStatus = U_ZERO_ERROR;
72 init(fDeferredStatus);
73 if (U_FAILURE(fDeferredStatus)) {
74 return;
75 }
76 if (pat==NULL) {
77 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
78 return;
79 }
80 fPattern = pat;
81 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
82}
83
84
85
86RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
87 uint32_t flags, UErrorCode &status) {
88 init(status);
89 if (U_FAILURE(status)) {
90 return;
91 }
92 UParseError pe;
93 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
94 fPattern = fPatternOwned;
95
96 UText inputText = UTEXT_INITIALIZER;
97 utext_openConstUnicodeString(&inputText, &input, &status);
98 init2(&inputText, status);
99 utext_close(&inputText);
100
101 fInputUniStrMaybeMutable = TRUE;
102}
103
104
105RegexMatcher::RegexMatcher(UText *regexp, UText *input,
106 uint32_t flags, UErrorCode &status) {
107 init(status);
108 if (U_FAILURE(status)) {
109 return;
110 }
111 UParseError pe;
112 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
113 if (U_FAILURE(status)) {
114 return;
115 }
116
117 fPattern = fPatternOwned;
118 init2(input, status);
119}
120
121
122RegexMatcher::RegexMatcher(const UnicodeString &regexp,
123 uint32_t flags, UErrorCode &status) {
124 init(status);
125 if (U_FAILURE(status)) {
126 return;
127 }
128 UParseError pe;
129 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
130 if (U_FAILURE(status)) {
131 return;
132 }
133 fPattern = fPatternOwned;
134 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
135}
136
137RegexMatcher::RegexMatcher(UText *regexp,
138 uint32_t flags, UErrorCode &status) {
139 init(status);
140 if (U_FAILURE(status)) {
141 return;
142 }
143 UParseError pe;
144 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
145 if (U_FAILURE(status)) {
146 return;
147 }
148
149 fPattern = fPatternOwned;
150 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
151}
152
153
154
155
156RegexMatcher::~RegexMatcher() {
157 delete fStack;
158 if (fData != fSmallData) {
159 uprv_free(fData);
160 fData = NULL;
161 }
162 if (fPatternOwned) {
163 delete fPatternOwned;
164 fPatternOwned = NULL;
165 fPattern = NULL;
166 }
167
168 if (fInput) {
169 delete fInput;
170 }
171 if (fInputText) {
172 utext_close(fInputText);
173 }
174 if (fAltInputText) {
175 utext_close(fAltInputText);
176 }
177
178 #if UCONFIG_NO_BREAK_ITERATION==0
179 delete fWordBreakItr;
180 delete fGCBreakItr;
181 #endif
182}
183
184//
185// init() common initialization for use by all constructors.
186// Initialize all fields, get the object into a consistent state.
187// This must be done even when the initial status shows an error,
188// so that the object is initialized sufficiently well for the destructor
189// to run safely.
190//
191void RegexMatcher::init(UErrorCode &status) {
192 fPattern = NULL;
193 fPatternOwned = NULL;
194 fFrameSize = 0;
195 fRegionStart = 0;
196 fRegionLimit = 0;
197 fAnchorStart = 0;
198 fAnchorLimit = 0;
199 fLookStart = 0;
200 fLookLimit = 0;
201 fActiveStart = 0;
202 fActiveLimit = 0;
203 fTransparentBounds = FALSE;
204 fAnchoringBounds = TRUE;
205 fMatch = FALSE;
206 fMatchStart = 0;
207 fMatchEnd = 0;
208 fLastMatchEnd = -1;
209 fAppendPosition = 0;
210 fHitEnd = FALSE;
211 fRequireEnd = FALSE;
212 fStack = NULL;
213 fFrame = NULL;
214 fTimeLimit = 0;
215 fTime = 0;
216 fTickCounter = 0;
217 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
218 fCallbackFn = NULL;
219 fCallbackContext = NULL;
220 fFindProgressCallbackFn = NULL;
221 fFindProgressCallbackContext = NULL;
222 fTraceDebug = FALSE;
223 fDeferredStatus = status;
224 fData = fSmallData;
225 fWordBreakItr = NULL;
226 fGCBreakItr = NULL;
227
228 fStack = NULL;
229 fInputText = NULL;
230 fAltInputText = NULL;
231 fInput = NULL;
232 fInputLength = 0;
233 fInputUniStrMaybeMutable = FALSE;
234}
235
236//
237// init2() Common initialization for use by RegexMatcher constructors, part 2.
238// This handles the common setup to be done after the Pattern is available.
239//
240void RegexMatcher::init2(UText *input, UErrorCode &status) {
241 if (U_FAILURE(status)) {
242 fDeferredStatus = status;
243 return;
244 }
245
246 if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
247 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
248 if (fData == NULL) {
249 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
250 return;
251 }
252 }
253
254 fStack = new UVector64(status);
255 if (fStack == NULL) {
256 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
257 return;
258 }
259
260 reset(input);
261 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
262 if (U_FAILURE(status)) {
263 fDeferredStatus = status;
264 return;
265 }
266}
267
268
269static const UChar BACKSLASH = 0x5c;
270static const UChar DOLLARSIGN = 0x24;
271static const UChar LEFTBRACKET = 0x7b;
272static const UChar RIGHTBRACKET = 0x7d;
273
274//--------------------------------------------------------------------------------
275//
276// appendReplacement
277//
278//--------------------------------------------------------------------------------
279RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
280 const UnicodeString &replacement,
281 UErrorCode &status) {
282 UText replacementText = UTEXT_INITIALIZER;
283
284 utext_openConstUnicodeString(&replacementText, &replacement, &status);
285 if (U_SUCCESS(status)) {
286 UText resultText = UTEXT_INITIALIZER;
287 utext_openUnicodeString(&resultText, &dest, &status);
288
289 if (U_SUCCESS(status)) {
290 appendReplacement(&resultText, &replacementText, status);
291 utext_close(&resultText);
292 }
293 utext_close(&replacementText);
294 }
295
296 return *this;
297}
298
299//
300// appendReplacement, UText mode
301//
302RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
303 UText *replacement,
304 UErrorCode &status) {
305 if (U_FAILURE(status)) {
306 return *this;
307 }
308 if (U_FAILURE(fDeferredStatus)) {
309 status = fDeferredStatus;
310 return *this;
311 }
312 if (fMatch == FALSE) {
313 status = U_REGEX_INVALID_STATE;
314 return *this;
315 }
316
317 // Copy input string from the end of previous match to start of current match
318 int64_t destLen = utext_nativeLength(dest);
319 if (fMatchStart > fAppendPosition) {
320 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
321 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
322 (int32_t)(fMatchStart-fAppendPosition), &status);
323 } else {
324 int32_t len16;
325 if (UTEXT_USES_U16(fInputText)) {
326 len16 = (int32_t)(fMatchStart-fAppendPosition);
327 } else {
328 UErrorCode lengthStatus = U_ZERO_ERROR;
329 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
330 }
331 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
332 if (inputChars == NULL) {
333 status = U_MEMORY_ALLOCATION_ERROR;
334 return *this;
335 }
336 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
337 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
338 uprv_free(inputChars);
339 }
340 }
341 fAppendPosition = fMatchEnd;
342
343
344 // scan the replacement text, looking for substitutions ($n) and \escapes.
345 // TODO: optimize this loop by efficiently scanning for '$' or '\',
346 // move entire ranges not containing substitutions.
347 UTEXT_SETNATIVEINDEX(replacement, 0);
348 for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) {
349 if (c == BACKSLASH) {
350 // Backslash Escape. Copy the following char out without further checks.
351 // Note: Surrogate pairs don't need any special handling
352 // The second half wont be a '$' or a '\', and
353 // will move to the dest normally on the next
354 // loop iteration.
355 c = UTEXT_CURRENT32(replacement);
356 if (c == U_SENTINEL) {
357 break;
358 }
359
360 if (c==0x55/*U*/ || c==0x75/*u*/) {
361 // We have a \udddd or \Udddddddd escape sequence.
362 int32_t offset = 0;
363 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
364 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
365 if (escapedChar != (UChar32)0xFFFFFFFF) {
366 if (U_IS_BMP(escapedChar)) {
367 UChar c16 = (UChar)escapedChar;
368 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
369 } else {
370 UChar surrogate[2];
371 surrogate[0] = U16_LEAD(escapedChar);
372 surrogate[1] = U16_TRAIL(escapedChar);
373 if (U_SUCCESS(status)) {
374 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
375 }
376 }
377 // TODO: Report errors for mal-formed \u escapes?
378 // As this is, the original sequence is output, which may be OK.
379 if (context.lastOffset == offset) {
380 (void)UTEXT_PREVIOUS32(replacement);
381 } else if (context.lastOffset != offset-1) {
382 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
383 }
384 }
385 } else {
386 (void)UTEXT_NEXT32(replacement);
387 // Plain backslash escape. Just put out the escaped character.
388 if (U_IS_BMP(c)) {
389 UChar c16 = (UChar)c;
390 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
391 } else {
392 UChar surrogate[2];
393 surrogate[0] = U16_LEAD(c);
394 surrogate[1] = U16_TRAIL(c);
395 if (U_SUCCESS(status)) {
396 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
397 }
398 }
399 }
400 } else if (c != DOLLARSIGN) {
401 // Normal char, not a $. Copy it out without further checks.
402 if (U_IS_BMP(c)) {
403 UChar c16 = (UChar)c;
404 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
405 } else {
406 UChar surrogate[2];
407 surrogate[0] = U16_LEAD(c);
408 surrogate[1] = U16_TRAIL(c);
409 if (U_SUCCESS(status)) {
410 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
411 }
412 }
413 } else {
414 // We've got a $. Pick up a capture group name or number if one follows.
415 // Consume digits so long as the resulting group number <= the number of
416 // number of capture groups in the pattern.
417
418 int32_t groupNum = 0;
419 int32_t numDigits = 0;
420 UChar32 nextChar = utext_current32(replacement);
421 if (nextChar == LEFTBRACKET) {
422 // Scan for a Named Capture Group, ${name}.
423 UnicodeString groupName;
424 utext_next32(replacement);
425 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
426 nextChar = utext_next32(replacement);
427 if (nextChar == U_SENTINEL) {
428 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
429 } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z
430 (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z
431 (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
432 groupName.append(nextChar);
433 } else if (nextChar == RIGHTBRACKET) {
434 groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0;
435 if (groupNum == 0) {
436 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
437 }
438 } else {
439 // Character was something other than a name char or a closing '}'
440 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
441 }
442 }
443
444 } else if (u_isdigit(nextChar)) {
445 // $n Scan for a capture group number
446 int32_t numCaptureGroups = fPattern->fGroupMap->size();
447 for (;;) {
448 nextChar = UTEXT_CURRENT32(replacement);
449 if (nextChar == U_SENTINEL) {
450 break;
451 }
452 if (u_isdigit(nextChar) == FALSE) {
453 break;
454 }
455 int32_t nextDigitVal = u_charDigitValue(nextChar);
456 if (groupNum*10 + nextDigitVal > numCaptureGroups) {
457 // Don't consume the next digit if it makes the capture group number too big.
458 if (numDigits == 0) {
459 status = U_INDEX_OUTOFBOUNDS_ERROR;
460 }
461 break;
462 }
463 (void)UTEXT_NEXT32(replacement);
464 groupNum=groupNum*10 + nextDigitVal;
465 ++numDigits;
466 }
467 } else {
468 // $ not followed by capture group name or number.
469 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
470 }
471
472 if (U_SUCCESS(status)) {
473 destLen += appendGroup(groupNum, dest, status);
474 }
475 } // End of $ capture group handling
476 } // End of per-character loop through the replacement string.
477
478 return *this;
479}
480
481
482
483//--------------------------------------------------------------------------------
484//
485// appendTail Intended to be used in conjunction with appendReplacement()
486// To the destination string, append everything following
487// the last match position from the input string.
488//
489// Note: Match ranges do not affect appendTail or appendReplacement
490//
491//--------------------------------------------------------------------------------
492UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
493 UErrorCode status = U_ZERO_ERROR;
494 UText resultText = UTEXT_INITIALIZER;
495 utext_openUnicodeString(&resultText, &dest, &status);
496
497 if (U_SUCCESS(status)) {
498 appendTail(&resultText, status);
499 utext_close(&resultText);
500 }
501
502 return dest;
503}
504
505//
506// appendTail, UText mode
507//
508UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
509 if (U_FAILURE(status)) {
510 return dest;
511 }
512 if (U_FAILURE(fDeferredStatus)) {
513 status = fDeferredStatus;
514 return dest;
515 }
516
517 if (fInputLength > fAppendPosition) {
518 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
519 int64_t destLen = utext_nativeLength(dest);
520 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
521 (int32_t)(fInputLength-fAppendPosition), &status);
522 } else {
523 int32_t len16;
524 if (UTEXT_USES_U16(fInputText)) {
525 len16 = (int32_t)(fInputLength-fAppendPosition);
526 } else {
527 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
528 status = U_ZERO_ERROR; // buffer overflow
529 }
530
531 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
532 if (inputChars == NULL) {
533 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
534 } else {
535 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
536 int64_t destLen = utext_nativeLength(dest);
537 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
538 uprv_free(inputChars);
539 }
540 }
541 }
542 return dest;
543}
544
545
546
547//--------------------------------------------------------------------------------
548//
549// end
550//
551//--------------------------------------------------------------------------------
552int32_t RegexMatcher::end(UErrorCode &err) const {
553 return end(0, err);
554}
555
556int64_t RegexMatcher::end64(UErrorCode &err) const {
557 return end64(0, err);
558}
559
560int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
561 if (U_FAILURE(err)) {
562 return -1;
563 }
564 if (fMatch == FALSE) {
565 err = U_REGEX_INVALID_STATE;
566 return -1;
567 }
568 if (group < 0 || group > fPattern->fGroupMap->size()) {
569 err = U_INDEX_OUTOFBOUNDS_ERROR;
570 return -1;
571 }
572 int64_t e = -1;
573 if (group == 0) {
574 e = fMatchEnd;
575 } else {
576 // Get the position within the stack frame of the variables for
577 // this capture group.
578 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
579 U_ASSERT(groupOffset < fPattern->fFrameSize);
580 U_ASSERT(groupOffset >= 0);
581 e = fFrame->fExtra[groupOffset + 1];
582 }
583
584 return e;
585}
586
587int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
588 return (int32_t)end64(group, err);
589}
590
591//--------------------------------------------------------------------------------
592//
593// findProgressInterrupt This function is called once for each advance in the target
594// string from the find() function, and calls the user progress callback
595// function if there is one installed.
596//
597// Return: TRUE if the find operation is to be terminated.
598// FALSE if the find operation is to continue running.
599//
600//--------------------------------------------------------------------------------
601UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
602 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
603 status = U_REGEX_STOPPED_BY_CALLER;
604 return TRUE;
605 }
606 return FALSE;
607}
608
609//--------------------------------------------------------------------------------
610//
611// find()
612//
613//--------------------------------------------------------------------------------
614UBool RegexMatcher::find() {
615 if (U_FAILURE(fDeferredStatus)) {
616 return FALSE;
617 }
618 UErrorCode status = U_ZERO_ERROR;
619 UBool result = find(status);
620 return result;
621}
622
623//--------------------------------------------------------------------------------
624//
625// find()
626//
627//--------------------------------------------------------------------------------
628UBool RegexMatcher::find(UErrorCode &status) {
629 // Start at the position of the last match end. (Will be zero if the
630 // matcher has been reset.)
631 //
632 if (U_FAILURE(status)) {
633 return FALSE;
634 }
635 if (U_FAILURE(fDeferredStatus)) {
636 status = fDeferredStatus;
637 return FALSE;
638 }
639
640 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
641 return findUsingChunk(status);
642 }
643
644 int64_t startPos = fMatchEnd;
645 if (startPos==0) {
646 startPos = fActiveStart;
647 }
648
649 if (fMatch) {
650 // Save the position of any previous successful match.
651 fLastMatchEnd = fMatchEnd;
652
653 if (fMatchStart == fMatchEnd) {
654 // Previous match had zero length. Move start position up one position
655 // to avoid sending find() into a loop on zero-length matches.
656 if (startPos >= fActiveLimit) {
657 fMatch = FALSE;
658 fHitEnd = TRUE;
659 return FALSE;
660 }
661 UTEXT_SETNATIVEINDEX(fInputText, startPos);
662 (void)UTEXT_NEXT32(fInputText);
663 startPos = UTEXT_GETNATIVEINDEX(fInputText);
664 }
665 } else {
666 if (fLastMatchEnd >= 0) {
667 // A previous find() failed to match. Don't try again.
668 // (without this test, a pattern with a zero-length match
669 // could match again at the end of an input string.)
670 fHitEnd = TRUE;
671 return FALSE;
672 }
673 }
674
675
676 // Compute the position in the input string beyond which a match can not begin, because
677 // the minimum length match would extend past the end of the input.
678 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
679 // Be aware of possible overflows if making changes here.
680 int64_t testStartLimit;
681 if (UTEXT_USES_U16(fInputText)) {
682 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
683 if (startPos > testStartLimit) {
684 fMatch = FALSE;
685 fHitEnd = TRUE;
686 return FALSE;
687 }
688 } else {
689 // We don't know exactly how long the minimum match length is in native characters.
690 // Treat anything > 0 as 1.
691 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
692 }
693
694 UChar32 c;
695 U_ASSERT(startPos >= 0);
696
697 switch (fPattern->fStartType) {
698 case START_NO_INFO:
699 // No optimization was found.
700 // Try a match at each input position.
701 for (;;) {
702 MatchAt(startPos, FALSE, status);
703 if (U_FAILURE(status)) {
704 return FALSE;
705 }
706 if (fMatch) {
707 return TRUE;
708 }
709 if (startPos >= testStartLimit) {
710 fHitEnd = TRUE;
711 return FALSE;
712 }
713 UTEXT_SETNATIVEINDEX(fInputText, startPos);
714 (void)UTEXT_NEXT32(fInputText);
715 startPos = UTEXT_GETNATIVEINDEX(fInputText);
716 // Note that it's perfectly OK for a pattern to have a zero-length
717 // match at the end of a string, so we must make sure that the loop
718 // runs with startPos == testStartLimit the last time through.
719 if (findProgressInterrupt(startPos, status))
720 return FALSE;
721 }
722 UPRV_UNREACHABLE;
723
724 case START_START:
725 // Matches are only possible at the start of the input string
726 // (pattern begins with ^ or \A)
727 if (startPos > fActiveStart) {
728 fMatch = FALSE;
729 return FALSE;
730 }
731 MatchAt(startPos, FALSE, status);
732 if (U_FAILURE(status)) {
733 return FALSE;
734 }
735 return fMatch;
736
737
738 case START_SET:
739 {
740 // Match may start on any char from a pre-computed set.
741 U_ASSERT(fPattern->fMinMatchLen > 0);
742 UTEXT_SETNATIVEINDEX(fInputText, startPos);
743 for (;;) {
744 int64_t pos = startPos;
745 c = UTEXT_NEXT32(fInputText);
746 startPos = UTEXT_GETNATIVEINDEX(fInputText);
747 // c will be -1 (U_SENTINEL) at end of text, in which case we
748 // skip this next block (so we don't have a negative array index)
749 // and handle end of text in the following block.
750 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
751 (c>=256 && fPattern->fInitialChars->contains(c)))) {
752 MatchAt(pos, FALSE, status);
753 if (U_FAILURE(status)) {
754 return FALSE;
755 }
756 if (fMatch) {
757 return TRUE;
758 }
759 UTEXT_SETNATIVEINDEX(fInputText, pos);
760 }
761 if (startPos > testStartLimit) {
762 fMatch = FALSE;
763 fHitEnd = TRUE;
764 return FALSE;
765 }
766 if (findProgressInterrupt(startPos, status))
767 return FALSE;
768 }
769 }
770 UPRV_UNREACHABLE;
771
772 case START_STRING:
773 case START_CHAR:
774 {
775 // Match starts on exactly one char.
776 U_ASSERT(fPattern->fMinMatchLen > 0);
777 UChar32 theChar = fPattern->fInitialChar;
778 UTEXT_SETNATIVEINDEX(fInputText, startPos);
779 for (;;) {
780 int64_t pos = startPos;
781 c = UTEXT_NEXT32(fInputText);
782 startPos = UTEXT_GETNATIVEINDEX(fInputText);
783 if (c == theChar) {
784 MatchAt(pos, FALSE, status);
785 if (U_FAILURE(status)) {
786 return FALSE;
787 }
788 if (fMatch) {
789 return TRUE;
790 }
791 UTEXT_SETNATIVEINDEX(fInputText, startPos);
792 }
793 if (startPos > testStartLimit) {
794 fMatch = FALSE;
795 fHitEnd = TRUE;
796 return FALSE;
797 }
798 if (findProgressInterrupt(startPos, status))
799 return FALSE;
800 }
801 }
802 UPRV_UNREACHABLE;
803
804 case START_LINE:
805 {
806 UChar32 ch;
807 if (startPos == fAnchorStart) {
808 MatchAt(startPos, FALSE, status);
809 if (U_FAILURE(status)) {
810 return FALSE;
811 }
812 if (fMatch) {
813 return TRUE;
814 }
815 UTEXT_SETNATIVEINDEX(fInputText, startPos);
816 ch = UTEXT_NEXT32(fInputText);
817 startPos = UTEXT_GETNATIVEINDEX(fInputText);
818 } else {
819 UTEXT_SETNATIVEINDEX(fInputText, startPos);
820 ch = UTEXT_PREVIOUS32(fInputText);
821 UTEXT_SETNATIVEINDEX(fInputText, startPos);
822 }
823
824 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
825 for (;;) {
826 if (ch == 0x0a) {
827 MatchAt(startPos, FALSE, status);
828 if (U_FAILURE(status)) {
829 return FALSE;
830 }
831 if (fMatch) {
832 return TRUE;
833 }
834 UTEXT_SETNATIVEINDEX(fInputText, startPos);
835 }
836 if (startPos >= testStartLimit) {
837 fMatch = FALSE;
838 fHitEnd = TRUE;
839 return FALSE;
840 }
841 ch = UTEXT_NEXT32(fInputText);
842 startPos = UTEXT_GETNATIVEINDEX(fInputText);
843 // Note that it's perfectly OK for a pattern to have a zero-length
844 // match at the end of a string, so we must make sure that the loop
845 // runs with startPos == testStartLimit the last time through.
846 if (findProgressInterrupt(startPos, status))
847 return FALSE;
848 }
849 } else {
850 for (;;) {
851 if (isLineTerminator(ch)) {
852 if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
853 (void)UTEXT_NEXT32(fInputText);
854 startPos = UTEXT_GETNATIVEINDEX(fInputText);
855 }
856 MatchAt(startPos, FALSE, status);
857 if (U_FAILURE(status)) {
858 return FALSE;
859 }
860 if (fMatch) {
861 return TRUE;
862 }
863 UTEXT_SETNATIVEINDEX(fInputText, startPos);
864 }
865 if (startPos >= testStartLimit) {
866 fMatch = FALSE;
867 fHitEnd = TRUE;
868 return FALSE;
869 }
870 ch = UTEXT_NEXT32(fInputText);
871 startPos = UTEXT_GETNATIVEINDEX(fInputText);
872 // Note that it's perfectly OK for a pattern to have a zero-length
873 // match at the end of a string, so we must make sure that the loop
874 // runs with startPos == testStartLimit the last time through.
875 if (findProgressInterrupt(startPos, status))
876 return FALSE;
877 }
878 }
879 }
880
881 default:
882 UPRV_UNREACHABLE;
883 }
884
885 UPRV_UNREACHABLE;
886}
887
888
889
890UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
891 if (U_FAILURE(status)) {
892 return FALSE;
893 }
894 if (U_FAILURE(fDeferredStatus)) {
895 status = fDeferredStatus;
896 return FALSE;
897 }
898 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
899 // This will reset the region to be the full input length.
900 if (start < 0) {
901 status = U_INDEX_OUTOFBOUNDS_ERROR;
902 return FALSE;
903 }
904
905 int64_t nativeStart = start;
906 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
907 status = U_INDEX_OUTOFBOUNDS_ERROR;
908 return FALSE;
909 }
910 fMatchEnd = nativeStart;
911 return find(status);
912}
913
914
915//--------------------------------------------------------------------------------
916//
917// findUsingChunk() -- like find(), but with the advance knowledge that the
918// entire string is available in the UText's chunk buffer.
919//
920//--------------------------------------------------------------------------------
921UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
922 // Start at the position of the last match end. (Will be zero if the
923 // matcher has been reset.
924 //
925
926 int32_t startPos = (int32_t)fMatchEnd;
927 if (startPos==0) {
928 startPos = (int32_t)fActiveStart;
929 }
930
931 const UChar *inputBuf = fInputText->chunkContents;
932
933 if (fMatch) {
934 // Save the position of any previous successful match.
935 fLastMatchEnd = fMatchEnd;
936
937 if (fMatchStart == fMatchEnd) {
938 // Previous match had zero length. Move start position up one position
939 // to avoid sending find() into a loop on zero-length matches.
940 if (startPos >= fActiveLimit) {
941 fMatch = FALSE;
942 fHitEnd = TRUE;
943 return FALSE;
944 }
945 U16_FWD_1(inputBuf, startPos, fInputLength);
946 }
947 } else {
948 if (fLastMatchEnd >= 0) {
949 // A previous find() failed to match. Don't try again.
950 // (without this test, a pattern with a zero-length match
951 // could match again at the end of an input string.)
952 fHitEnd = TRUE;
953 return FALSE;
954 }
955 }
956
957
958 // Compute the position in the input string beyond which a match can not begin, because
959 // the minimum length match would extend past the end of the input.
960 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
961 // Be aware of possible overflows if making changes here.
962 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
963 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
964 if (startPos > testLen) {
965 fMatch = FALSE;
966 fHitEnd = TRUE;
967 return FALSE;
968 }
969
970 UChar32 c;
971 U_ASSERT(startPos >= 0);
972
973 switch (fPattern->fStartType) {
974 case START_NO_INFO:
975 // No optimization was found.
976 // Try a match at each input position.
977 for (;;) {
978 MatchChunkAt(startPos, FALSE, status);
979 if (U_FAILURE(status)) {
980 return FALSE;
981 }
982 if (fMatch) {
983 return TRUE;
984 }
985 if (startPos >= testLen) {
986 fHitEnd = TRUE;
987 return FALSE;
988 }
989 U16_FWD_1(inputBuf, startPos, fActiveLimit);
990 // Note that it's perfectly OK for a pattern to have a zero-length
991 // match at the end of a string, so we must make sure that the loop
992 // runs with startPos == testLen the last time through.
993 if (findProgressInterrupt(startPos, status))
994 return FALSE;
995 }
996 UPRV_UNREACHABLE;
997
998 case START_START:
999 // Matches are only possible at the start of the input string
1000 // (pattern begins with ^ or \A)
1001 if (startPos > fActiveStart) {
1002 fMatch = FALSE;
1003 return FALSE;
1004 }
1005 MatchChunkAt(startPos, FALSE, status);
1006 if (U_FAILURE(status)) {
1007 return FALSE;
1008 }
1009 return fMatch;
1010
1011
1012 case START_SET:
1013 {
1014 // Match may start on any char from a pre-computed set.
1015 U_ASSERT(fPattern->fMinMatchLen > 0);
1016 for (;;) {
1017 int32_t pos = startPos;
1018 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1019 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1020 (c>=256 && fPattern->fInitialChars->contains(c))) {
1021 MatchChunkAt(pos, FALSE, status);
1022 if (U_FAILURE(status)) {
1023 return FALSE;
1024 }
1025 if (fMatch) {
1026 return TRUE;
1027 }
1028 }
1029 if (startPos > testLen) {
1030 fMatch = FALSE;
1031 fHitEnd = TRUE;
1032 return FALSE;
1033 }
1034 if (findProgressInterrupt(startPos, status))
1035 return FALSE;
1036 }
1037 }
1038 UPRV_UNREACHABLE;
1039
1040 case START_STRING:
1041 case START_CHAR:
1042 {
1043 // Match starts on exactly one char.
1044 U_ASSERT(fPattern->fMinMatchLen > 0);
1045 UChar32 theChar = fPattern->fInitialChar;
1046 for (;;) {
1047 int32_t pos = startPos;
1048 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1049 if (c == theChar) {
1050 MatchChunkAt(pos, FALSE, status);
1051 if (U_FAILURE(status)) {
1052 return FALSE;
1053 }
1054 if (fMatch) {
1055 return TRUE;
1056 }
1057 }
1058 if (startPos > testLen) {
1059 fMatch = FALSE;
1060 fHitEnd = TRUE;
1061 return FALSE;
1062 }
1063 if (findProgressInterrupt(startPos, status))
1064 return FALSE;
1065 }
1066 }
1067 UPRV_UNREACHABLE;
1068
1069 case START_LINE:
1070 {
1071 UChar32 ch;
1072 if (startPos == fAnchorStart) {
1073 MatchChunkAt(startPos, FALSE, status);
1074 if (U_FAILURE(status)) {
1075 return FALSE;
1076 }
1077 if (fMatch) {
1078 return TRUE;
1079 }
1080 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1081 }
1082
1083 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1084 for (;;) {
1085 ch = inputBuf[startPos-1];
1086 if (ch == 0x0a) {
1087 MatchChunkAt(startPos, FALSE, status);
1088 if (U_FAILURE(status)) {
1089 return FALSE;
1090 }
1091 if (fMatch) {
1092 return TRUE;
1093 }
1094 }
1095 if (startPos >= testLen) {
1096 fMatch = FALSE;
1097 fHitEnd = TRUE;
1098 return FALSE;
1099 }
1100 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1101 // Note that it's perfectly OK for a pattern to have a zero-length
1102 // match at the end of a string, so we must make sure that the loop
1103 // runs with startPos == testLen the last time through.
1104 if (findProgressInterrupt(startPos, status))
1105 return FALSE;
1106 }
1107 } else {
1108 for (;;) {
1109 ch = inputBuf[startPos-1];
1110 if (isLineTerminator(ch)) {
1111 if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1112 startPos++;
1113 }
1114 MatchChunkAt(startPos, FALSE, status);
1115 if (U_FAILURE(status)) {
1116 return FALSE;
1117 }
1118 if (fMatch) {
1119 return TRUE;
1120 }
1121 }
1122 if (startPos >= testLen) {
1123 fMatch = FALSE;
1124 fHitEnd = TRUE;
1125 return FALSE;
1126 }
1127 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1128 // Note that it's perfectly OK for a pattern to have a zero-length
1129 // match at the end of a string, so we must make sure that the loop
1130 // runs with startPos == testLen the last time through.
1131 if (findProgressInterrupt(startPos, status))
1132 return FALSE;
1133 }
1134 }
1135 }
1136
1137 default:
1138 UPRV_UNREACHABLE;
1139 }
1140
1141 UPRV_UNREACHABLE;
1142}
1143
1144
1145
1146//--------------------------------------------------------------------------------
1147//
1148// group()
1149//
1150//--------------------------------------------------------------------------------
1151UnicodeString RegexMatcher::group(UErrorCode &status) const {
1152 return group(0, status);
1153}
1154
1155// Return immutable shallow clone
1156UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1157 return group(0, dest, group_len, status);
1158}
1159
1160// Return immutable shallow clone
1161UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1162 group_len = 0;
1163 if (U_FAILURE(status)) {
1164 return dest;
1165 }
1166 if (U_FAILURE(fDeferredStatus)) {
1167 status = fDeferredStatus;
1168 } else if (fMatch == FALSE) {
1169 status = U_REGEX_INVALID_STATE;
1170 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1171 status = U_INDEX_OUTOFBOUNDS_ERROR;
1172 }
1173
1174 if (U_FAILURE(status)) {
1175 return dest;
1176 }
1177
1178 int64_t s, e;
1179 if (groupNum == 0) {
1180 s = fMatchStart;
1181 e = fMatchEnd;
1182 } else {
1183 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1184 U_ASSERT(groupOffset < fPattern->fFrameSize);
1185 U_ASSERT(groupOffset >= 0);
1186 s = fFrame->fExtra[groupOffset];
1187 e = fFrame->fExtra[groupOffset+1];
1188 }
1189
1190 if (s < 0) {
1191 // A capture group wasn't part of the match
1192 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1193 }
1194 U_ASSERT(s <= e);
1195 group_len = e - s;
1196
1197 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1198 if (dest)
1199 UTEXT_SETNATIVEINDEX(dest, s);
1200 return dest;
1201}
1202
1203UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1204 UnicodeString result;
1205 int64_t groupStart = start64(groupNum, status);
1206 int64_t groupEnd = end64(groupNum, status);
1207 if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1208 return result;
1209 }
1210
1211 // Get the group length using a utext_extract preflight.
1212 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1213 int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
1214 if (status != U_BUFFER_OVERFLOW_ERROR) {
1215 return result;
1216 }
1217
1218 status = U_ZERO_ERROR;
1219 UChar *buf = result.getBuffer(length);
1220 if (buf == NULL) {
1221 status = U_MEMORY_ALLOCATION_ERROR;
1222 } else {
1223 int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1224 result.releaseBuffer(extractLength);
1225 U_ASSERT(length == extractLength);
1226 }
1227 return result;
1228}
1229
1230
1231//--------------------------------------------------------------------------------
1232//
1233// appendGroup() -- currently internal only, appends a group to a UText rather
1234// than replacing its contents
1235//
1236//--------------------------------------------------------------------------------
1237
1238int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1239 if (U_FAILURE(status)) {
1240 return 0;
1241 }
1242 if (U_FAILURE(fDeferredStatus)) {
1243 status = fDeferredStatus;
1244 return 0;
1245 }
1246 int64_t destLen = utext_nativeLength(dest);
1247
1248 if (fMatch == FALSE) {
1249 status = U_REGEX_INVALID_STATE;
1250 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1251 }
1252 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1253 status = U_INDEX_OUTOFBOUNDS_ERROR;
1254 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1255 }
1256
1257 int64_t s, e;
1258 if (groupNum == 0) {
1259 s = fMatchStart;
1260 e = fMatchEnd;
1261 } else {
1262 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1263 U_ASSERT(groupOffset < fPattern->fFrameSize);
1264 U_ASSERT(groupOffset >= 0);
1265 s = fFrame->fExtra[groupOffset];
1266 e = fFrame->fExtra[groupOffset+1];
1267 }
1268
1269 if (s < 0) {
1270 // A capture group wasn't part of the match
1271 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1272 }
1273 U_ASSERT(s <= e);
1274
1275 int64_t deltaLen;
1276 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1277 U_ASSERT(e <= fInputLength);
1278 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1279 } else {
1280 int32_t len16;
1281 if (UTEXT_USES_U16(fInputText)) {
1282 len16 = (int32_t)(e-s);
1283 } else {
1284 UErrorCode lengthStatus = U_ZERO_ERROR;
1285 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1286 }
1287 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1288 if (groupChars == NULL) {
1289 status = U_MEMORY_ALLOCATION_ERROR;
1290 return 0;
1291 }
1292 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1293
1294 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1295 uprv_free(groupChars);
1296 }
1297 return deltaLen;
1298}
1299
1300
1301
1302//--------------------------------------------------------------------------------
1303//
1304// groupCount()
1305//
1306//--------------------------------------------------------------------------------
1307int32_t RegexMatcher::groupCount() const {
1308 return fPattern->fGroupMap->size();
1309}
1310
1311//--------------------------------------------------------------------------------
1312//
1313// hasAnchoringBounds()
1314//
1315//--------------------------------------------------------------------------------
1316UBool RegexMatcher::hasAnchoringBounds() const {
1317 return fAnchoringBounds;
1318}
1319
1320
1321//--------------------------------------------------------------------------------
1322//
1323// hasTransparentBounds()
1324//
1325//--------------------------------------------------------------------------------
1326UBool RegexMatcher::hasTransparentBounds() const {
1327 return fTransparentBounds;
1328}
1329
1330
1331
1332//--------------------------------------------------------------------------------
1333//
1334// hitEnd()
1335//
1336//--------------------------------------------------------------------------------
1337UBool RegexMatcher::hitEnd() const {
1338 return fHitEnd;
1339}
1340
1341
1342//--------------------------------------------------------------------------------
1343//
1344// input()
1345//
1346//--------------------------------------------------------------------------------
1347const UnicodeString &RegexMatcher::input() const {
1348 if (!fInput) {
1349 UErrorCode status = U_ZERO_ERROR;
1350 int32_t len16;
1351 if (UTEXT_USES_U16(fInputText)) {
1352 len16 = (int32_t)fInputLength;
1353 } else {
1354 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1355 status = U_ZERO_ERROR; // overflow, length status
1356 }
1357 UnicodeString *result = new UnicodeString(len16, 0, 0);
1358
1359 UChar *inputChars = result->getBuffer(len16);
1360 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1361 result->releaseBuffer(len16);
1362
1363 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1364 }
1365
1366 return *fInput;
1367}
1368
1369//--------------------------------------------------------------------------------
1370//
1371// inputText()
1372//
1373//--------------------------------------------------------------------------------
1374UText *RegexMatcher::inputText() const {
1375 return fInputText;
1376}
1377
1378
1379//--------------------------------------------------------------------------------
1380//
1381// getInput() -- like inputText(), but makes a clone or copies into another UText
1382//
1383//--------------------------------------------------------------------------------
1384UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1385 if (U_FAILURE(status)) {
1386 return dest;
1387 }
1388 if (U_FAILURE(fDeferredStatus)) {
1389 status = fDeferredStatus;
1390 return dest;
1391 }
1392
1393 if (dest) {
1394 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1395 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1396 } else {
1397 int32_t input16Len;
1398 if (UTEXT_USES_U16(fInputText)) {
1399 input16Len = (int32_t)fInputLength;
1400 } else {
1401 UErrorCode lengthStatus = U_ZERO_ERROR;
1402 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1403 }
1404 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1405 if (inputChars == NULL) {
1406 return dest;
1407 }
1408
1409 status = U_ZERO_ERROR;
1410 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1411 status = U_ZERO_ERROR;
1412 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1413
1414 uprv_free(inputChars);
1415 }
1416 return dest;
1417 } else {
1418 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1419 }
1420}
1421
1422
1423static UBool compat_SyncMutableUTextContents(UText *ut);
1424static UBool compat_SyncMutableUTextContents(UText *ut) {
1425 UBool retVal = FALSE;
1426
1427 // In the following test, we're really only interested in whether the UText should switch
1428 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1429 // will still point to the correct data.
1430 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1431 UnicodeString *us=(UnicodeString *)ut->context;
1432
1433 // Update to the latest length.
1434 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1435 int32_t newLength = us->length();
1436
1437 // Update the chunk description.
1438 // The buffer may have switched between stack- and heap-based.
1439 ut->chunkContents = us->getBuffer();
1440 ut->chunkLength = newLength;
1441 ut->chunkNativeLimit = newLength;
1442 ut->nativeIndexingLimit = newLength;
1443 retVal = TRUE;
1444 }
1445
1446 return retVal;
1447}
1448
1449//--------------------------------------------------------------------------------
1450//
1451// lookingAt()
1452//
1453//--------------------------------------------------------------------------------
1454UBool RegexMatcher::lookingAt(UErrorCode &status) {
1455 if (U_FAILURE(status)) {
1456 return FALSE;
1457 }
1458 if (U_FAILURE(fDeferredStatus)) {
1459 status = fDeferredStatus;
1460 return FALSE;
1461 }
1462
1463 if (fInputUniStrMaybeMutable) {
1464 if (compat_SyncMutableUTextContents(fInputText)) {
1465 fInputLength = utext_nativeLength(fInputText);
1466 reset();
1467 }
1468 }
1469 else {
1470 resetPreserveRegion();
1471 }
1472 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1473 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1474 } else {
1475 MatchAt(fActiveStart, FALSE, status);
1476 }
1477 return fMatch;
1478}
1479
1480
1481UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1482 if (U_FAILURE(status)) {
1483 return FALSE;
1484 }
1485 if (U_FAILURE(fDeferredStatus)) {
1486 status = fDeferredStatus;
1487 return FALSE;
1488 }
1489 reset();
1490
1491 if (start < 0) {
1492 status = U_INDEX_OUTOFBOUNDS_ERROR;
1493 return FALSE;
1494 }
1495
1496 if (fInputUniStrMaybeMutable) {
1497 if (compat_SyncMutableUTextContents(fInputText)) {
1498 fInputLength = utext_nativeLength(fInputText);
1499 reset();
1500 }
1501 }
1502
1503 int64_t nativeStart;
1504 nativeStart = start;
1505 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1506 status = U_INDEX_OUTOFBOUNDS_ERROR;
1507 return FALSE;
1508 }
1509
1510 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1511 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1512 } else {
1513 MatchAt(nativeStart, FALSE, status);
1514 }
1515 return fMatch;
1516}
1517
1518
1519
1520//--------------------------------------------------------------------------------
1521//
1522// matches()
1523//
1524//--------------------------------------------------------------------------------
1525UBool RegexMatcher::matches(UErrorCode &status) {
1526 if (U_FAILURE(status)) {
1527 return FALSE;
1528 }
1529 if (U_FAILURE(fDeferredStatus)) {
1530 status = fDeferredStatus;
1531 return FALSE;
1532 }
1533
1534 if (fInputUniStrMaybeMutable) {
1535 if (compat_SyncMutableUTextContents(fInputText)) {
1536 fInputLength = utext_nativeLength(fInputText);
1537 reset();
1538 }
1539 }
1540 else {
1541 resetPreserveRegion();
1542 }
1543
1544 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1545 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1546 } else {
1547 MatchAt(fActiveStart, TRUE, status);
1548 }
1549 return fMatch;
1550}
1551
1552
1553UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1554 if (U_FAILURE(status)) {
1555 return FALSE;
1556 }
1557 if (U_FAILURE(fDeferredStatus)) {
1558 status = fDeferredStatus;
1559 return FALSE;
1560 }
1561 reset();
1562
1563 if (start < 0) {
1564 status = U_INDEX_OUTOFBOUNDS_ERROR;
1565 return FALSE;
1566 }
1567
1568 if (fInputUniStrMaybeMutable) {
1569 if (compat_SyncMutableUTextContents(fInputText)) {
1570 fInputLength = utext_nativeLength(fInputText);
1571 reset();
1572 }
1573 }
1574
1575 int64_t nativeStart;
1576 nativeStart = start;
1577 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1578 status = U_INDEX_OUTOFBOUNDS_ERROR;
1579 return FALSE;
1580 }
1581
1582 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1583 MatchChunkAt((int32_t)nativeStart, TRUE, status);
1584 } else {
1585 MatchAt(nativeStart, TRUE, status);
1586 }
1587 return fMatch;
1588}
1589
1590
1591
1592//--------------------------------------------------------------------------------
1593//
1594// pattern
1595//
1596//--------------------------------------------------------------------------------
1597const RegexPattern &RegexMatcher::pattern() const {
1598 return *fPattern;
1599}
1600
1601
1602
1603//--------------------------------------------------------------------------------
1604//
1605// region
1606//
1607//--------------------------------------------------------------------------------
1608RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1609 if (U_FAILURE(status)) {
1610 return *this;
1611 }
1612
1613 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1614 status = U_ILLEGAL_ARGUMENT_ERROR;
1615 }
1616
1617 int64_t nativeStart = regionStart;
1618 int64_t nativeLimit = regionLimit;
1619 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1620 status = U_ILLEGAL_ARGUMENT_ERROR;
1621 }
1622
1623 if (startIndex == -1)
1624 this->reset();
1625 else
1626 resetPreserveRegion();
1627
1628 fRegionStart = nativeStart;
1629 fRegionLimit = nativeLimit;
1630 fActiveStart = nativeStart;
1631 fActiveLimit = nativeLimit;
1632
1633 if (startIndex != -1) {
1634 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1635 status = U_INDEX_OUTOFBOUNDS_ERROR;
1636 }
1637 fMatchEnd = startIndex;
1638 }
1639
1640 if (!fTransparentBounds) {
1641 fLookStart = nativeStart;
1642 fLookLimit = nativeLimit;
1643 }
1644 if (fAnchoringBounds) {
1645 fAnchorStart = nativeStart;
1646 fAnchorLimit = nativeLimit;
1647 }
1648 return *this;
1649}
1650
1651RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1652 return region(start, limit, -1, status);
1653}
1654
1655//--------------------------------------------------------------------------------
1656//
1657// regionEnd
1658//
1659//--------------------------------------------------------------------------------
1660int32_t RegexMatcher::regionEnd() const {
1661 return (int32_t)fRegionLimit;
1662}
1663
1664int64_t RegexMatcher::regionEnd64() const {
1665 return fRegionLimit;
1666}
1667
1668//--------------------------------------------------------------------------------
1669//
1670// regionStart
1671//
1672//--------------------------------------------------------------------------------
1673int32_t RegexMatcher::regionStart() const {
1674 return (int32_t)fRegionStart;
1675}
1676
1677int64_t RegexMatcher::regionStart64() const {
1678 return fRegionStart;
1679}
1680
1681
1682//--------------------------------------------------------------------------------
1683//
1684// replaceAll
1685//
1686//--------------------------------------------------------------------------------
1687UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1688 UText replacementText = UTEXT_INITIALIZER;
1689 UText resultText = UTEXT_INITIALIZER;
1690 UnicodeString resultString;
1691 if (U_FAILURE(status)) {
1692 return resultString;
1693 }
1694
1695 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1696 utext_openUnicodeString(&resultText, &resultString, &status);
1697
1698 replaceAll(&replacementText, &resultText, status);
1699
1700 utext_close(&resultText);
1701 utext_close(&replacementText);
1702
1703 return resultString;
1704}
1705
1706
1707//
1708// replaceAll, UText mode
1709//
1710UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1711 if (U_FAILURE(status)) {
1712 return dest;
1713 }
1714 if (U_FAILURE(fDeferredStatus)) {
1715 status = fDeferredStatus;
1716 return dest;
1717 }
1718
1719 if (dest == NULL) {
1720 UnicodeString emptyString;
1721 UText empty = UTEXT_INITIALIZER;
1722
1723 utext_openUnicodeString(&empty, &emptyString, &status);
1724 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1725 utext_close(&empty);
1726 }
1727
1728 if (U_SUCCESS(status)) {
1729 reset();
1730 while (find()) {
1731 appendReplacement(dest, replacement, status);
1732 if (U_FAILURE(status)) {
1733 break;
1734 }
1735 }
1736 appendTail(dest, status);
1737 }
1738
1739 return dest;
1740}
1741
1742
1743//--------------------------------------------------------------------------------
1744//
1745// replaceFirst
1746//
1747//--------------------------------------------------------------------------------
1748UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1749 UText replacementText = UTEXT_INITIALIZER;
1750 UText resultText = UTEXT_INITIALIZER;
1751 UnicodeString resultString;
1752
1753 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1754 utext_openUnicodeString(&resultText, &resultString, &status);
1755
1756 replaceFirst(&replacementText, &resultText, status);
1757
1758 utext_close(&resultText);
1759 utext_close(&replacementText);
1760
1761 return resultString;
1762}
1763
1764//
1765// replaceFirst, UText mode
1766//
1767UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1768 if (U_FAILURE(status)) {
1769 return dest;
1770 }
1771 if (U_FAILURE(fDeferredStatus)) {
1772 status = fDeferredStatus;
1773 return dest;
1774 }
1775
1776 reset();
1777 if (!find()) {
1778 return getInput(dest, status);
1779 }
1780
1781 if (dest == NULL) {
1782 UnicodeString emptyString;
1783 UText empty = UTEXT_INITIALIZER;
1784
1785 utext_openUnicodeString(&empty, &emptyString, &status);
1786 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1787 utext_close(&empty);
1788 }
1789
1790 appendReplacement(dest, replacement, status);
1791 appendTail(dest, status);
1792
1793 return dest;
1794}
1795
1796
1797//--------------------------------------------------------------------------------
1798//
1799// requireEnd
1800//
1801//--------------------------------------------------------------------------------
1802UBool RegexMatcher::requireEnd() const {
1803 return fRequireEnd;
1804}
1805
1806
1807//--------------------------------------------------------------------------------
1808//
1809// reset
1810//
1811//--------------------------------------------------------------------------------
1812RegexMatcher &RegexMatcher::reset() {
1813 fRegionStart = 0;
1814 fRegionLimit = fInputLength;
1815 fActiveStart = 0;
1816 fActiveLimit = fInputLength;
1817 fAnchorStart = 0;
1818 fAnchorLimit = fInputLength;
1819 fLookStart = 0;
1820 fLookLimit = fInputLength;
1821 resetPreserveRegion();
1822 return *this;
1823}
1824
1825
1826
1827void RegexMatcher::resetPreserveRegion() {
1828 fMatchStart = 0;
1829 fMatchEnd = 0;
1830 fLastMatchEnd = -1;
1831 fAppendPosition = 0;
1832 fMatch = FALSE;
1833 fHitEnd = FALSE;
1834 fRequireEnd = FALSE;
1835 fTime = 0;
1836 fTickCounter = TIMER_INITIAL_VALUE;
1837 //resetStack(); // more expensive than it looks...
1838}
1839
1840
1841RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1842 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1843 if (fPattern->fNeedsAltInput) {
1844 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1845 }
1846 if (U_FAILURE(fDeferredStatus)) {
1847 return *this;
1848 }
1849 fInputLength = utext_nativeLength(fInputText);
1850
1851 reset();
1852 delete fInput;
1853 fInput = NULL;
1854
1855 // Do the following for any UnicodeString.
1856 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1857 fInputUniStrMaybeMutable = TRUE;
1858
1859#if UCONFIG_NO_BREAK_ITERATION==0
1860 if (fWordBreakItr) {
1861 fWordBreakItr->setText(fInputText, fDeferredStatus);
1862 }
1863 if (fGCBreakItr) {
1864 fGCBreakItr->setText(fInputText, fDeferredStatus);
1865 }
1866#endif
1867
1868 return *this;
1869}
1870
1871
1872RegexMatcher &RegexMatcher::reset(UText *input) {
1873 if (fInputText != input) {
1874 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1875 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1876 if (U_FAILURE(fDeferredStatus)) {
1877 return *this;
1878 }
1879 fInputLength = utext_nativeLength(fInputText);
1880
1881 delete fInput;
1882 fInput = NULL;
1883
1884#if UCONFIG_NO_BREAK_ITERATION==0
1885 if (fWordBreakItr) {
1886 fWordBreakItr->setText(input, fDeferredStatus);
1887 }
1888 if (fGCBreakItr) {
1889 fGCBreakItr->setText(fInputText, fDeferredStatus);
1890 }
1891#endif
1892 }
1893 reset();
1894 fInputUniStrMaybeMutable = FALSE;
1895
1896 return *this;
1897}
1898
1899/*RegexMatcher &RegexMatcher::reset(const UChar *) {
1900 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1901 return *this;
1902}*/
1903
1904RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1905 if (U_FAILURE(status)) {
1906 return *this;
1907 }
1908 reset(); // Reset also resets the region to be the entire string.
1909
1910 if (position < 0 || position > fActiveLimit) {
1911 status = U_INDEX_OUTOFBOUNDS_ERROR;
1912 return *this;
1913 }
1914 fMatchEnd = position;
1915 return *this;
1916}
1917
1918
1919//--------------------------------------------------------------------------------
1920//
1921// refresh
1922//
1923//--------------------------------------------------------------------------------
1924RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1925 if (U_FAILURE(status)) {
1926 return *this;
1927 }
1928 if (input == NULL) {
1929 status = U_ILLEGAL_ARGUMENT_ERROR;
1930 return *this;
1931 }
1932 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1933 status = U_ILLEGAL_ARGUMENT_ERROR;
1934 return *this;
1935 }
1936 int64_t pos = utext_getNativeIndex(fInputText);
1937 // Shallow read-only clone of the new UText into the existing input UText
1938 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1939 if (U_FAILURE(status)) {
1940 return *this;
1941 }
1942 utext_setNativeIndex(fInputText, pos);
1943
1944 if (fAltInputText != NULL) {
1945 pos = utext_getNativeIndex(fAltInputText);
1946 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1947 if (U_FAILURE(status)) {
1948 return *this;
1949 }
1950 utext_setNativeIndex(fAltInputText, pos);
1951 }
1952 return *this;
1953}
1954
1955
1956
1957//--------------------------------------------------------------------------------
1958//
1959// setTrace
1960//
1961//--------------------------------------------------------------------------------
1962void RegexMatcher::setTrace(UBool state) {
1963 fTraceDebug = state;
1964}
1965
1966
1967
1968/**
1969 * UText, replace entire contents of the destination UText with a substring of the source UText.
1970 *
1971 * @param src The source UText
1972 * @param dest The destination UText. Must be writable.
1973 * May be NULL, in which case a new UText will be allocated.
1974 * @param start Start index of source substring.
1975 * @param limit Limit index of source substring.
1976 * @param status An error code.
1977 */
1978static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1979 if (U_FAILURE(*status)) {
1980 return dest;
1981 }
1982 if (start == limit) {
1983 if (dest) {
1984 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
1985 return dest;
1986 } else {
1987 return utext_openUChars(NULL, NULL, 0, status);
1988 }
1989 }
1990 int32_t length = utext_extract(src, start, limit, NULL, 0, status);
1991 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
1992 return dest;
1993 }
1994 *status = U_ZERO_ERROR;
1995 MaybeStackArray<UChar, 40> buffer;
1996 if (length >= buffer.getCapacity()) {
1997 UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul.
1998 if (newBuf == NULL) {
1999 *status = U_MEMORY_ALLOCATION_ERROR;
2000 }
2001 }
2002 utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
2003 if (dest) {
2004 utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
2005 return dest;
2006 }
2007
2008 // Caller did not provide a prexisting UText.
2009 // Open a new one, and have it adopt the text buffer storage.
2010 if (U_FAILURE(*status)) {
2011 return NULL;
2012 }
2013 int32_t ownedLength = 0;
2014 UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2015 if (ownedBuf == NULL) {
2016 *status = U_MEMORY_ALLOCATION_ERROR;
2017 return NULL;
2018 }
2019 UText *result = utext_openUChars(NULL, ownedBuf, length, status);
2020 if (U_FAILURE(*status)) {
2021 uprv_free(ownedBuf);
2022 return NULL;
2023 }
2024 result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2025 return result;
2026}
2027
2028
2029//---------------------------------------------------------------------
2030//
2031// split
2032//
2033//---------------------------------------------------------------------
2034int32_t RegexMatcher::split(const UnicodeString &input,
2035 UnicodeString dest[],
2036 int32_t destCapacity,
2037 UErrorCode &status)
2038{
2039 UText inputText = UTEXT_INITIALIZER;
2040 utext_openConstUnicodeString(&inputText, &input, &status);
2041 if (U_FAILURE(status)) {
2042 return 0;
2043 }
2044
2045 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2046 if (destText == NULL) {
2047 status = U_MEMORY_ALLOCATION_ERROR;
2048 return 0;
2049 }
2050 int32_t i;
2051 for (i = 0; i < destCapacity; i++) {
2052 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2053 }
2054
2055 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2056
2057 for (i = 0; i < destCapacity; i++) {
2058 utext_close(destText[i]);
2059 }
2060
2061 uprv_free(destText);
2062 utext_close(&inputText);
2063 return fieldCount;
2064}
2065
2066//
2067// split, UText mode
2068//
2069int32_t RegexMatcher::split(UText *input,
2070 UText *dest[],
2071 int32_t destCapacity,
2072 UErrorCode &status)
2073{
2074 //
2075 // Check arguements for validity
2076 //
2077 if (U_FAILURE(status)) {
2078 return 0;
2079 }
2080
2081 if (destCapacity < 1) {
2082 status = U_ILLEGAL_ARGUMENT_ERROR;
2083 return 0;
2084 }
2085
2086 //
2087 // Reset for the input text
2088 //
2089 reset(input);
2090 int64_t nextOutputStringStart = 0;
2091 if (fActiveLimit == 0) {
2092 return 0;
2093 }
2094
2095 //
2096 // Loop through the input text, searching for the delimiter pattern
2097 //
2098 int32_t i;
2099 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2100 for (i=0; ; i++) {
2101 if (i>=destCapacity-1) {
2102 // There is one or zero output string left.
2103 // Fill the last output string with whatever is left from the input, then exit the loop.
2104 // ( i will be == destCapacity if we filled the output array while processing
2105 // capture groups of the delimiter expression, in which case we will discard the
2106 // last capture group saved in favor of the unprocessed remainder of the
2107 // input string.)
2108 i = destCapacity-1;
2109 if (fActiveLimit > nextOutputStringStart) {
2110 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2111 if (dest[i]) {
2112 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2113 input->chunkContents+nextOutputStringStart,
2114 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2115 } else {
2116 UText remainingText = UTEXT_INITIALIZER;
2117 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2118 fActiveLimit-nextOutputStringStart, &status);
2119 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2120 utext_close(&remainingText);
2121 }
2122 } else {
2123 UErrorCode lengthStatus = U_ZERO_ERROR;
2124 int32_t remaining16Length =
2125 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2126 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2127 if (remainingChars == NULL) {
2128 status = U_MEMORY_ALLOCATION_ERROR;
2129 break;
2130 }
2131
2132 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2133 if (dest[i]) {
2134 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2135 } else {
2136 UText remainingText = UTEXT_INITIALIZER;
2137 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2138 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2139 utext_close(&remainingText);
2140 }
2141
2142 uprv_free(remainingChars);
2143 }
2144 }
2145 break;
2146 }
2147 if (find()) {
2148 // We found another delimiter. Move everything from where we started looking
2149 // up until the start of the delimiter into the next output string.
2150 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2151 if (dest[i]) {
2152 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2153 input->chunkContents+nextOutputStringStart,
2154 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2155 } else {
2156 UText remainingText = UTEXT_INITIALIZER;
2157 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2158 fMatchStart-nextOutputStringStart, &status);
2159 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2160 utext_close(&remainingText);
2161 }
2162 } else {
2163 UErrorCode lengthStatus = U_ZERO_ERROR;
2164 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2165 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2166 if (remainingChars == NULL) {
2167 status = U_MEMORY_ALLOCATION_ERROR;
2168 break;
2169 }
2170 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2171 if (dest[i]) {
2172 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2173 } else {
2174 UText remainingText = UTEXT_INITIALIZER;
2175 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2176 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2177 utext_close(&remainingText);
2178 }
2179
2180 uprv_free(remainingChars);
2181 }
2182 nextOutputStringStart = fMatchEnd;
2183
2184 // If the delimiter pattern has capturing parentheses, the captured
2185 // text goes out into the next n destination strings.
2186 int32_t groupNum;
2187 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2188 if (i >= destCapacity-2) {
2189 // Never fill the last available output string with capture group text.
2190 // It will filled with the last field, the remainder of the
2191 // unsplit input text.
2192 break;
2193 }
2194 i++;
2195 dest[i] = utext_extract_replace(fInputText, dest[i],
2196 start64(groupNum, status), end64(groupNum, status), &status);
2197 }
2198
2199 if (nextOutputStringStart == fActiveLimit) {
2200 // The delimiter was at the end of the string. We're done, but first
2201 // we output one last empty string, for the empty field following
2202 // the delimiter at the end of input.
2203 if (i+1 < destCapacity) {
2204 ++i;
2205 if (dest[i] == NULL) {
2206 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2207 } else {
2208 static const UChar emptyString[] = {(UChar)0};
2209 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2210 }
2211 }
2212 break;
2213
2214 }
2215 }
2216 else
2217 {
2218 // We ran off the end of the input while looking for the next delimiter.
2219 // All the remaining text goes into the current output string.
2220 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2221 if (dest[i]) {
2222 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2223 input->chunkContents+nextOutputStringStart,
2224 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2225 } else {
2226 UText remainingText = UTEXT_INITIALIZER;
2227 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2228 fActiveLimit-nextOutputStringStart, &status);
2229 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2230 utext_close(&remainingText);
2231 }
2232 } else {
2233 UErrorCode lengthStatus = U_ZERO_ERROR;
2234 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2235 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2236 if (remainingChars == NULL) {
2237 status = U_MEMORY_ALLOCATION_ERROR;
2238 break;
2239 }
2240
2241 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2242 if (dest[i]) {
2243 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2244 } else {
2245 UText remainingText = UTEXT_INITIALIZER;
2246 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2247 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2248 utext_close(&remainingText);
2249 }
2250
2251 uprv_free(remainingChars);
2252 }
2253 break;
2254 }
2255 if (U_FAILURE(status)) {
2256 break;
2257 }
2258 } // end of for loop
2259 return i+1;
2260}
2261
2262
2263//--------------------------------------------------------------------------------
2264//
2265// start
2266//
2267//--------------------------------------------------------------------------------
2268int32_t RegexMatcher::start(UErrorCode &status) const {
2269 return start(0, status);
2270}
2271
2272int64_t RegexMatcher::start64(UErrorCode &status) const {
2273 return start64(0, status);
2274}
2275
2276//--------------------------------------------------------------------------------
2277//
2278// start(int32_t group, UErrorCode &status)
2279//
2280//--------------------------------------------------------------------------------
2281
2282int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2283 if (U_FAILURE(status)) {
2284 return -1;
2285 }
2286 if (U_FAILURE(fDeferredStatus)) {
2287 status = fDeferredStatus;
2288 return -1;
2289 }
2290 if (fMatch == FALSE) {
2291 status = U_REGEX_INVALID_STATE;
2292 return -1;
2293 }
2294 if (group < 0 || group > fPattern->fGroupMap->size()) {
2295 status = U_INDEX_OUTOFBOUNDS_ERROR;
2296 return -1;
2297 }
2298 int64_t s;
2299 if (group == 0) {
2300 s = fMatchStart;
2301 } else {
2302 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2303 U_ASSERT(groupOffset < fPattern->fFrameSize);
2304 U_ASSERT(groupOffset >= 0);
2305 s = fFrame->fExtra[groupOffset];
2306 }
2307
2308 return s;
2309}
2310
2311
2312int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2313 return (int32_t)start64(group, status);
2314}
2315
2316//--------------------------------------------------------------------------------
2317//
2318// useAnchoringBounds
2319//
2320//--------------------------------------------------------------------------------
2321RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2322 fAnchoringBounds = b;
2323 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2324 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2325 return *this;
2326}
2327
2328
2329//--------------------------------------------------------------------------------
2330//
2331// useTransparentBounds
2332//
2333//--------------------------------------------------------------------------------
2334RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2335 fTransparentBounds = b;
2336 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2337 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2338 return *this;
2339}
2340
2341//--------------------------------------------------------------------------------
2342//
2343// setTimeLimit
2344//
2345//--------------------------------------------------------------------------------
2346void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2347 if (U_FAILURE(status)) {
2348 return;
2349 }
2350 if (U_FAILURE(fDeferredStatus)) {
2351 status = fDeferredStatus;
2352 return;
2353 }
2354 if (limit < 0) {
2355 status = U_ILLEGAL_ARGUMENT_ERROR;
2356 return;
2357 }
2358 fTimeLimit = limit;
2359}
2360
2361
2362//--------------------------------------------------------------------------------
2363//
2364// getTimeLimit
2365//
2366//--------------------------------------------------------------------------------
2367int32_t RegexMatcher::getTimeLimit() const {
2368 return fTimeLimit;
2369}
2370
2371
2372//--------------------------------------------------------------------------------
2373//
2374// setStackLimit
2375//
2376//--------------------------------------------------------------------------------
2377void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2378 if (U_FAILURE(status)) {
2379 return;
2380 }
2381 if (U_FAILURE(fDeferredStatus)) {
2382 status = fDeferredStatus;
2383 return;
2384 }
2385 if (limit < 0) {
2386 status = U_ILLEGAL_ARGUMENT_ERROR;
2387 return;
2388 }
2389
2390 // Reset the matcher. This is needed here in case there is a current match
2391 // whose final stack frame (containing the match results, pointed to by fFrame)
2392 // would be lost by resizing to a smaller stack size.
2393 reset();
2394
2395 if (limit == 0) {
2396 // Unlimited stack expansion
2397 fStack->setMaxCapacity(0);
2398 } else {
2399 // Change the units of the limit from bytes to ints, and bump the size up
2400 // to be big enough to hold at least one stack frame for the pattern,
2401 // if it isn't there already.
2402 int32_t adjustedLimit = limit / sizeof(int32_t);
2403 if (adjustedLimit < fPattern->fFrameSize) {
2404 adjustedLimit = fPattern->fFrameSize;
2405 }
2406 fStack->setMaxCapacity(adjustedLimit);
2407 }
2408 fStackLimit = limit;
2409}
2410
2411
2412//--------------------------------------------------------------------------------
2413//
2414// getStackLimit
2415//
2416//--------------------------------------------------------------------------------
2417int32_t RegexMatcher::getStackLimit() const {
2418 return fStackLimit;
2419}
2420
2421
2422//--------------------------------------------------------------------------------
2423//
2424// setMatchCallback
2425//
2426//--------------------------------------------------------------------------------
2427void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2428 const void *context,
2429 UErrorCode &status) {
2430 if (U_FAILURE(status)) {
2431 return;
2432 }
2433 fCallbackFn = callback;
2434 fCallbackContext = context;
2435}
2436
2437
2438//--------------------------------------------------------------------------------
2439//
2440// getMatchCallback
2441//
2442//--------------------------------------------------------------------------------
2443void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2444 const void *&context,
2445 UErrorCode &status) {
2446 if (U_FAILURE(status)) {
2447 return;
2448 }
2449 callback = fCallbackFn;
2450 context = fCallbackContext;
2451}
2452
2453
2454//--------------------------------------------------------------------------------
2455//
2456// setMatchCallback
2457//
2458//--------------------------------------------------------------------------------
2459void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2460 const void *context,
2461 UErrorCode &status) {
2462 if (U_FAILURE(status)) {
2463 return;
2464 }
2465 fFindProgressCallbackFn = callback;
2466 fFindProgressCallbackContext = context;
2467}
2468
2469
2470//--------------------------------------------------------------------------------
2471//
2472// getMatchCallback
2473//
2474//--------------------------------------------------------------------------------
2475void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2476 const void *&context,
2477 UErrorCode &status) {
2478 if (U_FAILURE(status)) {
2479 return;
2480 }
2481 callback = fFindProgressCallbackFn;
2482 context = fFindProgressCallbackContext;
2483}
2484
2485
2486//================================================================================
2487//
2488// Code following this point in this file is the internal
2489// Match Engine Implementation.
2490//
2491//================================================================================
2492
2493
2494//--------------------------------------------------------------------------------
2495//
2496// resetStack
2497// Discard any previous contents of the state save stack, and initialize a
2498// new stack frame to all -1. The -1s are needed for capture group limits,
2499// where they indicate that a group has not yet matched anything.
2500//--------------------------------------------------------------------------------
2501REStackFrame *RegexMatcher::resetStack() {
2502 // Discard any previous contents of the state save stack, and initialize a
2503 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2504 // where they indicate that a group has not yet matched anything.
2505 fStack->removeAllElements();
2506
2507 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2508 if(U_FAILURE(fDeferredStatus)) {
2509 return NULL;
2510 }
2511
2512 int32_t i;
2513 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2514 iFrame->fExtra[i] = -1;
2515 }
2516 return iFrame;
2517}
2518
2519
2520
2521//--------------------------------------------------------------------------------
2522//
2523// isWordBoundary
2524// in perl, "xab..cd..", \b is true at positions 0,3,5,7
2525// For us,
2526// If the current char is a combining mark,
2527// \b is FALSE.
2528// Else Scan backwards to the first non-combining char.
2529// We are at a boundary if the this char and the original chars are
2530// opposite in membership in \w set
2531//
2532// parameters: pos - the current position in the input buffer
2533//
2534// TODO: double-check edge cases at region boundaries.
2535//
2536//--------------------------------------------------------------------------------
2537UBool RegexMatcher::isWordBoundary(int64_t pos) {
2538 UBool isBoundary = FALSE;
2539 UBool cIsWord = FALSE;
2540
2541 if (pos >= fLookLimit) {
2542 fHitEnd = TRUE;
2543 } else {
2544 // Determine whether char c at current position is a member of the word set of chars.
2545 // If we're off the end of the string, behave as though we're not at a word char.
2546 UTEXT_SETNATIVEINDEX(fInputText, pos);
2547 UChar32 c = UTEXT_CURRENT32(fInputText);
2548 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2549 // Current char is a combining one. Not a boundary.
2550 return FALSE;
2551 }
2552 cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
2553 }
2554
2555 // Back up until we come to a non-combining char, determine whether
2556 // that char is a word char.
2557 UBool prevCIsWord = FALSE;
2558 for (;;) {
2559 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2560 break;
2561 }
2562 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2563 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2564 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2565 prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
2566 break;
2567 }
2568 }
2569 isBoundary = cIsWord ^ prevCIsWord;
2570 return isBoundary;
2571}
2572
2573UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2574 UBool isBoundary = FALSE;
2575 UBool cIsWord = FALSE;
2576
2577 const UChar *inputBuf = fInputText->chunkContents;
2578
2579 if (pos >= fLookLimit) {
2580 fHitEnd = TRUE;
2581 } else {
2582 // Determine whether char c at current position is a member of the word set of chars.
2583 // If we're off the end of the string, behave as though we're not at a word char.
2584 UChar32 c;
2585 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2586 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2587 // Current char is a combining one. Not a boundary.
2588 return FALSE;
2589 }
2590 cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
2591 }
2592
2593 // Back up until we come to a non-combining char, determine whether
2594 // that char is a word char.
2595 UBool prevCIsWord = FALSE;
2596 for (;;) {
2597 if (pos <= fLookStart) {
2598 break;
2599 }
2600 UChar32 prevChar;
2601 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2602 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2603 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2604 prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
2605 break;
2606 }
2607 }
2608 isBoundary = cIsWord ^ prevCIsWord;
2609 return isBoundary;
2610}
2611
2612//--------------------------------------------------------------------------------
2613//
2614// isUWordBoundary
2615//
2616// Test for a word boundary using RBBI word break.
2617//
2618// parameters: pos - the current position in the input buffer
2619//
2620//--------------------------------------------------------------------------------
2621UBool RegexMatcher::isUWordBoundary(int64_t pos, UErrorCode &status) {
2622 UBool returnVal = FALSE;
2623
2624#if UCONFIG_NO_BREAK_ITERATION==0
2625 // Note: this point will never be reached if break iteration is configured out.
2626 // Regex patterns that would require this function will fail to compile.
2627
2628 // If we haven't yet created a break iterator for this matcher, do it now.
2629 if (fWordBreakItr == nullptr) {
2630 fWordBreakItr = BreakIterator::createWordInstance(Locale::getEnglish(), status);
2631 if (U_FAILURE(status)) {
2632 return FALSE;
2633 }
2634 fWordBreakItr->setText(fInputText, status);
2635 }
2636
2637 // Note: zero width boundary tests like \b see through transparent region bounds,
2638 // which is why fLookLimit is used here, rather than fActiveLimit.
2639 if (pos >= fLookLimit) {
2640 fHitEnd = TRUE;
2641 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
2642 // words are not boundaries. All non-word chars stand by themselves,
2643 // with word boundaries on both sides.
2644 } else {
2645 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2646 }
2647#endif
2648 return returnVal;
2649}
2650
2651
2652int64_t RegexMatcher::followingGCBoundary(int64_t pos, UErrorCode &status) {
2653 int64_t result = pos;
2654
2655#if UCONFIG_NO_BREAK_ITERATION==0
2656 // Note: this point will never be reached if break iteration is configured out.
2657 // Regex patterns that would require this function will fail to compile.
2658
2659 // If we haven't yet created a break iterator for this matcher, do it now.
2660 if (fGCBreakItr == nullptr) {
2661 fGCBreakItr = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2662 if (U_FAILURE(status)) {
2663 return pos;
2664 }
2665 fGCBreakItr->setText(fInputText, status);
2666 }
2667 result = fGCBreakItr->following(pos);
2668 if (result == BreakIterator::DONE) {
2669 result = pos;
2670 }
2671#endif
2672 return result;
2673}
2674
2675//--------------------------------------------------------------------------------
2676//
2677// IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2678// saves. Increment the "time" counter, and call the
2679// user callback function if there is one installed.
2680//
2681// If the match operation needs to be aborted, either for a time-out
2682// or because the user callback asked for it, just set an error status.
2683// The engine will pick that up and stop in its outer loop.
2684//
2685//--------------------------------------------------------------------------------
2686void RegexMatcher::IncrementTime(UErrorCode &status) {
2687 fTickCounter = TIMER_INITIAL_VALUE;
2688 fTime++;
2689 if (fCallbackFn != NULL) {
2690 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2691 status = U_REGEX_STOPPED_BY_CALLER;
2692 return;
2693 }
2694 }
2695 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2696 status = U_REGEX_TIME_OUT;
2697 }
2698}
2699
2700//--------------------------------------------------------------------------------
2701//
2702// StateSave
2703// Make a new stack frame, initialized as a copy of the current stack frame.
2704// Set the pattern index in the original stack frame from the operand value
2705// in the opcode. Execution of the engine continues with the state in
2706// the newly created stack frame
2707//
2708// Note that reserveBlock() may grow the stack, resulting in the
2709// whole thing being relocated in memory.
2710//
2711// Parameters:
2712// fp The top frame pointer when called. At return, a new
2713// fame will be present
2714// savePatIdx An index into the compiled pattern. Goes into the original
2715// (not new) frame. If execution ever back-tracks out of the
2716// new frame, this will be where we continue from in the pattern.
2717// Return
2718// The new frame pointer.
2719//
2720//--------------------------------------------------------------------------------
2721inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2722 if (U_FAILURE(status)) {
2723 return fp;
2724 }
2725 // push storage for a new frame.
2726 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2727 if (U_FAILURE(status)) {
2728 // Failure on attempted stack expansion.
2729 // Stack function set some other error code, change it to a more
2730 // specific one for regular expressions.
2731 status = U_REGEX_STACK_OVERFLOW;
2732 // We need to return a writable stack frame, so just return the
2733 // previous frame. The match operation will stop quickly
2734 // because of the error status, after which the frame will never
2735 // be looked at again.
2736 return fp;
2737 }
2738 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
2739
2740 // New stack frame = copy of old top frame.
2741 int64_t *source = (int64_t *)fp;
2742 int64_t *dest = newFP;
2743 for (;;) {
2744 *dest++ = *source++;
2745 if (source == newFP) {
2746 break;
2747 }
2748 }
2749
2750 fTickCounter--;
2751 if (fTickCounter <= 0) {
2752 IncrementTime(status); // Re-initializes fTickCounter
2753 }
2754 fp->fPatIdx = savePatIdx;
2755 return (REStackFrame *)newFP;
2756}
2757
2758#if defined(REGEX_DEBUG)
2759namespace {
2760UnicodeString StringFromUText(UText *ut) {
2761 UnicodeString result;
2762 for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
2763 result.append(c);
2764 }
2765 return result;
2766}
2767}
2768#endif // REGEX_DEBUG
2769
2770
2771//--------------------------------------------------------------------------------
2772//
2773// MatchAt This is the actual matching engine.
2774//
2775// startIdx: begin matching a this index.
2776// toEnd: if true, match must extend to end of the input region
2777//
2778//--------------------------------------------------------------------------------
2779void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2780 UBool isMatch = FALSE; // True if the we have a match.
2781
2782 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2783
2784 int32_t op; // Operation from the compiled pattern, split into
2785 int32_t opType; // the opcode
2786 int32_t opValue; // and the operand value.
2787
2788#ifdef REGEX_RUN_DEBUG
2789 if (fTraceDebug) {
2790 printf("MatchAt(startIdx=%ld)\n", startIdx);
2791 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2792 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
2793 }
2794#endif
2795
2796 if (U_FAILURE(status)) {
2797 return;
2798 }
2799
2800 // Cache frequently referenced items from the compiled pattern
2801 //
2802 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2803
2804 const UChar *litText = fPattern->fLiteralText.getBuffer();
2805 UVector *fSets = fPattern->fSets;
2806
2807 fFrameSize = fPattern->fFrameSize;
2808 REStackFrame *fp = resetStack();
2809 if (U_FAILURE(fDeferredStatus)) {
2810 status = fDeferredStatus;
2811 return;
2812 }
2813
2814 fp->fPatIdx = 0;
2815 fp->fInputIdx = startIdx;
2816
2817 // Zero out the pattern's static data
2818 int32_t i;
2819 for (i = 0; i<fPattern->fDataSize; i++) {
2820 fData[i] = 0;
2821 }
2822
2823 //
2824 // Main loop for interpreting the compiled pattern.
2825 // One iteration of the loop per pattern operation performed.
2826 //
2827 for (;;) {
2828 op = (int32_t)pat[fp->fPatIdx];
2829 opType = URX_TYPE(op);
2830 opValue = URX_VAL(op);
2831#ifdef REGEX_RUN_DEBUG
2832 if (fTraceDebug) {
2833 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2834 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
2835 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2836 fPattern->dumpOp(fp->fPatIdx);
2837 }
2838#endif
2839 fp->fPatIdx++;
2840
2841 switch (opType) {
2842
2843
2844 case URX_NOP:
2845 break;
2846
2847
2848 case URX_BACKTRACK:
2849 // Force a backtrack. In some circumstances, the pattern compiler
2850 // will notice that the pattern can't possibly match anything, and will
2851 // emit one of these at that point.
2852 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2853 break;
2854
2855
2856 case URX_ONECHAR:
2857 if (fp->fInputIdx < fActiveLimit) {
2858 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2859 UChar32 c = UTEXT_NEXT32(fInputText);
2860 if (c == opValue) {
2861 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2862 break;
2863 }
2864 } else {
2865 fHitEnd = TRUE;
2866 }
2867 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2868 break;
2869
2870
2871 case URX_STRING:
2872 {
2873 // Test input against a literal string.
2874 // Strings require two slots in the compiled pattern, one for the
2875 // offset to the string text, and one for the length.
2876
2877 int32_t stringStartIdx = opValue;
2878 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2879 fp->fPatIdx++;
2880 opType = URX_TYPE(op);
2881 int32_t stringLen = URX_VAL(op);
2882 U_ASSERT(opType == URX_STRING_LEN);
2883 U_ASSERT(stringLen >= 2);
2884
2885 const UChar *patternString = litText+stringStartIdx;
2886 int32_t patternStringIndex = 0;
2887 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2888 UChar32 inputChar;
2889 UChar32 patternChar;
2890 UBool success = TRUE;
2891 while (patternStringIndex < stringLen) {
2892 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2893 success = FALSE;
2894 fHitEnd = TRUE;
2895 break;
2896 }
2897 inputChar = UTEXT_NEXT32(fInputText);
2898 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2899 if (patternChar != inputChar) {
2900 success = FALSE;
2901 break;
2902 }
2903 }
2904
2905 if (success) {
2906 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2907 } else {
2908 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2909 }
2910 }
2911 break;
2912
2913
2914 case URX_STATE_SAVE:
2915 fp = StateSave(fp, opValue, status);
2916 break;
2917
2918
2919 case URX_END:
2920 // The match loop will exit via this path on a successful match,
2921 // when we reach the end of the pattern.
2922 if (toEnd && fp->fInputIdx != fActiveLimit) {
2923 // The pattern matched, but not to the end of input. Try some more.
2924 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2925 break;
2926 }
2927 isMatch = TRUE;
2928 goto breakFromLoop;
2929
2930 // Start and End Capture stack frame variables are laid out out like this:
2931 // fp->fExtra[opValue] - The start of a completed capture group
2932 // opValue+1 - The end of a completed capture group
2933 // opValue+2 - the start of a capture group whose end
2934 // has not yet been reached (and might not ever be).
2935 case URX_START_CAPTURE:
2936 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2937 fp->fExtra[opValue+2] = fp->fInputIdx;
2938 break;
2939
2940
2941 case URX_END_CAPTURE:
2942 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2943 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
2944 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2945 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
2946 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2947 break;
2948
2949
2950 case URX_DOLLAR: // $, test for End of line
2951 // or for position before new line at end of input
2952 {
2953 if (fp->fInputIdx >= fAnchorLimit) {
2954 // We really are at the end of input. Success.
2955 fHitEnd = TRUE;
2956 fRequireEnd = TRUE;
2957 break;
2958 }
2959
2960 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2961
2962 // If we are positioned just before a new-line that is located at the
2963 // end of input, succeed.
2964 UChar32 c = UTEXT_NEXT32(fInputText);
2965 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2966 if (isLineTerminator(c)) {
2967 // If not in the middle of a CR/LF sequence
2968 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2969 // At new-line at end of input. Success
2970 fHitEnd = TRUE;
2971 fRequireEnd = TRUE;
2972
2973 break;
2974 }
2975 }
2976 } else {
2977 UChar32 nextC = UTEXT_NEXT32(fInputText);
2978 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2979 fHitEnd = TRUE;
2980 fRequireEnd = TRUE;
2981 break; // At CR/LF at end of input. Success
2982 }
2983 }
2984
2985 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2986 }
2987 break;
2988
2989
2990 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
2991 if (fp->fInputIdx >= fAnchorLimit) {
2992 // Off the end of input. Success.
2993 fHitEnd = TRUE;
2994 fRequireEnd = TRUE;
2995 break;
2996 } else {
2997 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2998 UChar32 c = UTEXT_NEXT32(fInputText);
2999 // Either at the last character of input, or off the end.
3000 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
3001 fHitEnd = TRUE;
3002 fRequireEnd = TRUE;
3003 break;
3004 }
3005 }
3006
3007 // Not at end of input. Back-track out.
3008 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3009 break;
3010
3011
3012 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
3013 {
3014 if (fp->fInputIdx >= fAnchorLimit) {
3015 // We really are at the end of input. Success.
3016 fHitEnd = TRUE;
3017 fRequireEnd = TRUE;
3018 break;
3019 }
3020 // If we are positioned just before a new-line, succeed.
3021 // It makes no difference where the new-line is within the input.
3022 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3023 UChar32 c = UTEXT_CURRENT32(fInputText);
3024 if (isLineTerminator(c)) {
3025 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
3026 // In multi-line mode, hitting a new-line just before the end of input does not
3027 // set the hitEnd or requireEnd flags
3028 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3029 break;
3030 }
3031 }
3032 // not at a new line. Fail.
3033 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3034 }
3035 break;
3036
3037
3038 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
3039 {
3040 if (fp->fInputIdx >= fAnchorLimit) {
3041 // We really are at the end of input. Success.
3042 fHitEnd = TRUE;
3043 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
3044 break; // adding a new-line would not lose the match.
3045 }
3046 // If we are not positioned just before a new-line, the test fails; backtrack out.
3047 // It makes no difference where the new-line is within the input.
3048 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3049 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3050 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3051 }
3052 }
3053 break;
3054
3055
3056 case URX_CARET: // ^, test for start of line
3057 if (fp->fInputIdx != fAnchorStart) {
3058 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3059 }
3060 break;
3061
3062
3063 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3064 {
3065 if (fp->fInputIdx == fAnchorStart) {
3066 // We are at the start input. Success.
3067 break;
3068 }
3069 // Check whether character just before the current pos is a new-line
3070 // unless we are at the end of input
3071 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3072 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3073 if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3074 // It's a new-line. ^ is true. Success.
3075 // TODO: what should be done with positions between a CR and LF?
3076 break;
3077 }
3078 // Not at the start of a line. Fail.
3079 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3080 }
3081 break;
3082
3083
3084 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3085 {
3086 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3087 if (fp->fInputIdx <= fAnchorStart) {
3088 // We are at the start input. Success.
3089 break;
3090 }
3091 // Check whether character just before the current pos is a new-line
3092 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3093 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3094 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3095 if (c != 0x0a) {
3096 // Not at the start of a line. Back-track out.
3097 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3098 }
3099 }
3100 break;
3101
3102 case URX_BACKSLASH_B: // Test for word boundaries
3103 {
3104 UBool success = isWordBoundary(fp->fInputIdx);
3105 success ^= (UBool)(opValue != 0); // flip sense for \B
3106 if (!success) {
3107 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3108 }
3109 }
3110 break;
3111
3112
3113 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3114 {
3115 UBool success = isUWordBoundary(fp->fInputIdx, status);
3116 success ^= (UBool)(opValue != 0); // flip sense for \B
3117 if (!success) {
3118 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3119 }
3120 }
3121 break;
3122
3123
3124 case URX_BACKSLASH_D: // Test for decimal digit
3125 {
3126 if (fp->fInputIdx >= fActiveLimit) {
3127 fHitEnd = TRUE;
3128 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3129 break;
3130 }
3131
3132 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3133
3134 UChar32 c = UTEXT_NEXT32(fInputText);
3135 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3136 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3137 success ^= (UBool)(opValue != 0); // flip sense for \D
3138 if (success) {
3139 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3140 } else {
3141 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3142 }
3143 }
3144 break;
3145
3146
3147 case URX_BACKSLASH_G: // Test for position at end of previous match
3148 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3149 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3150 }
3151 break;
3152
3153
3154 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
3155 {
3156 if (fp->fInputIdx >= fActiveLimit) {
3157 fHitEnd = TRUE;
3158 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3159 break;
3160 }
3161 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3162 UChar32 c = UTEXT_NEXT32(fInputText);
3163 int8_t ctype = u_charType(c);
3164 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
3165 success ^= (UBool)(opValue != 0); // flip sense for \H
3166 if (success) {
3167 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3168 } else {
3169 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3170 }
3171 }
3172 break;
3173
3174
3175 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
3176 {
3177 if (fp->fInputIdx >= fActiveLimit) {
3178 fHitEnd = TRUE;
3179 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3180 break;
3181 }
3182 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3183 UChar32 c = UTEXT_NEXT32(fInputText);
3184 if (isLineTerminator(c)) {
3185 if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3186 utext_next32(fInputText);
3187 }
3188 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3189 } else {
3190 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3191 }
3192 }
3193 break;
3194
3195
3196 case URX_BACKSLASH_V: // \v, any single line ending character.
3197 {
3198 if (fp->fInputIdx >= fActiveLimit) {
3199 fHitEnd = TRUE;
3200 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3201 break;
3202 }
3203 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3204 UChar32 c = UTEXT_NEXT32(fInputText);
3205 UBool success = isLineTerminator(c);
3206 success ^= (UBool)(opValue != 0); // flip sense for \V
3207 if (success) {
3208 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3209 } else {
3210 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3211 }
3212 }
3213 break;
3214
3215
3216 case URX_BACKSLASH_X:
3217 // Match a Grapheme, as defined by Unicode UAX 29.
3218
3219 // Fail if at end of input
3220 if (fp->fInputIdx >= fActiveLimit) {
3221 fHitEnd = TRUE;
3222 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3223 break;
3224 }
3225
3226 fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
3227 if (fp->fInputIdx >= fActiveLimit) {
3228 fHitEnd = TRUE;
3229 fp->fInputIdx = fActiveLimit;
3230 }
3231 break;
3232
3233
3234 case URX_BACKSLASH_Z: // Test for end of Input
3235 if (fp->fInputIdx < fAnchorLimit) {
3236 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3237 } else {
3238 fHitEnd = TRUE;
3239 fRequireEnd = TRUE;
3240 }
3241 break;
3242
3243
3244
3245 case URX_STATIC_SETREF:
3246 {
3247 // Test input character against one of the predefined sets
3248 // (Word Characters, for example)
3249 // The high bit of the op value is a flag for the match polarity.
3250 // 0: success if input char is in set.
3251 // 1: success if input char is not in set.
3252 if (fp->fInputIdx >= fActiveLimit) {
3253 fHitEnd = TRUE;
3254 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3255 break;
3256 }
3257
3258 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3259 opValue &= ~URX_NEG_SET;
3260 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3261
3262 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3263 UChar32 c = UTEXT_NEXT32(fInputText);
3264 if (c < 256) {
3265 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
3266 if (s8.contains(c)) {
3267 success = !success;
3268 }
3269 } else {
3270 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
3271 if (s.contains(c)) {
3272 success = !success;
3273 }
3274 }
3275 if (success) {
3276 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3277 } else {
3278 // the character wasn't in the set.
3279 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3280 }
3281 }
3282 break;
3283
3284
3285 case URX_STAT_SETREF_N:
3286 {
3287 // Test input character for NOT being a member of one of
3288 // the predefined sets (Word Characters, for example)
3289 if (fp->fInputIdx >= fActiveLimit) {
3290 fHitEnd = TRUE;
3291 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3292 break;
3293 }
3294
3295 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3296
3297 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3298
3299 UChar32 c = UTEXT_NEXT32(fInputText);
3300 if (c < 256) {
3301 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
3302 if (s8.contains(c) == FALSE) {
3303 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3304 break;
3305 }
3306 } else {
3307 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
3308 if (s.contains(c) == FALSE) {
3309 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3310 break;
3311 }
3312 }
3313 // the character wasn't in the set.
3314 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3315 }
3316 break;
3317
3318
3319 case URX_SETREF:
3320 if (fp->fInputIdx >= fActiveLimit) {
3321 fHitEnd = TRUE;
3322 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3323 break;
3324 } else {
3325 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3326
3327 // There is input left. Pick up one char and test it for set membership.
3328 UChar32 c = UTEXT_NEXT32(fInputText);
3329 U_ASSERT(opValue > 0 && opValue < fSets->size());
3330 if (c<256) {
3331 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3332 if (s8->contains(c)) {
3333 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3334 break;
3335 }
3336 } else {
3337 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
3338 if (s->contains(c)) {
3339 // The character is in the set. A Match.
3340 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3341 break;
3342 }
3343 }
3344
3345 // the character wasn't in the set.
3346 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3347 }
3348 break;
3349
3350
3351 case URX_DOTANY:
3352 {
3353 // . matches anything, but stops at end-of-line.
3354 if (fp->fInputIdx >= fActiveLimit) {
3355 // At end of input. Match failed. Backtrack out.
3356 fHitEnd = TRUE;
3357 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3358 break;
3359 }
3360
3361 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3362
3363 // There is input left. Advance over one char, unless we've hit end-of-line
3364 UChar32 c = UTEXT_NEXT32(fInputText);
3365 if (isLineTerminator(c)) {
3366 // End of line in normal mode. . does not match.
3367 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3368 break;
3369 }
3370 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3371 }
3372 break;
3373
3374
3375 case URX_DOTANY_ALL:
3376 {
3377 // ., in dot-matches-all (including new lines) mode
3378 if (fp->fInputIdx >= fActiveLimit) {
3379 // At end of input. Match failed. Backtrack out.
3380 fHitEnd = TRUE;
3381 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3382 break;
3383 }
3384
3385 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3386
3387 // There is input left. Advance over one char, except if we are
3388 // at a cr/lf, advance over both of them.
3389 UChar32 c;
3390 c = UTEXT_NEXT32(fInputText);
3391 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3392 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3393 // In the case of a CR/LF, we need to advance over both.
3394 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3395 if (nextc == 0x0a) {
3396 (void)UTEXT_NEXT32(fInputText);
3397 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3398 }
3399 }
3400 }
3401 break;
3402
3403
3404 case URX_DOTANY_UNIX:
3405 {
3406 // '.' operator, matches all, but stops at end-of-line.
3407 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3408 if (fp->fInputIdx >= fActiveLimit) {
3409 // At end of input. Match failed. Backtrack out.
3410 fHitEnd = TRUE;
3411 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3412 break;
3413 }
3414
3415 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3416
3417 // There is input left. Advance over one char, unless we've hit end-of-line
3418 UChar32 c = UTEXT_NEXT32(fInputText);
3419 if (c == 0x0a) {
3420 // End of line in normal mode. '.' does not match the \n
3421 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3422 } else {
3423 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3424 }
3425 }
3426 break;
3427
3428
3429 case URX_JMP:
3430 fp->fPatIdx = opValue;
3431 break;
3432
3433 case URX_FAIL:
3434 isMatch = FALSE;
3435 goto breakFromLoop;
3436
3437 case URX_JMP_SAV:
3438 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3439 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3440 fp->fPatIdx = opValue; // Then JMP.
3441 break;
3442
3443 case URX_JMP_SAV_X:
3444 // This opcode is used with (x)+, when x can match a zero length string.
3445 // Same as JMP_SAV, except conditional on the match having made forward progress.
3446 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3447 // data address of the input position at the start of the loop.
3448 {
3449 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3450 int32_t stoOp = (int32_t)pat[opValue-1];
3451 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3452 int32_t frameLoc = URX_VAL(stoOp);
3453 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3454 int64_t prevInputIdx = fp->fExtra[frameLoc];
3455 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3456 if (prevInputIdx < fp->fInputIdx) {
3457 // The match did make progress. Repeat the loop.
3458 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3459 fp->fPatIdx = opValue;
3460 fp->fExtra[frameLoc] = fp->fInputIdx;
3461 }
3462 // If the input position did not advance, we do nothing here,
3463 // execution will fall out of the loop.
3464 }
3465 break;
3466
3467 case URX_CTR_INIT:
3468 {
3469 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3470 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3471
3472 // Pick up the three extra operands that CTR_INIT has, and
3473 // skip the pattern location counter past
3474 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3475 fp->fPatIdx += 3;
3476 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3477 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3478 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3479 U_ASSERT(minCount>=0);
3480 U_ASSERT(maxCount>=minCount || maxCount==-1);
3481 U_ASSERT(loopLoc>=fp->fPatIdx);
3482
3483 if (minCount == 0) {
3484 fp = StateSave(fp, loopLoc+1, status);
3485 }
3486 if (maxCount == -1) {
3487 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
3488 } else if (maxCount == 0) {
3489 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3490 }
3491 }
3492 break;
3493
3494 case URX_CTR_LOOP:
3495 {
3496 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3497 int32_t initOp = (int32_t)pat[opValue];
3498 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3499 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3500 int32_t minCount = (int32_t)pat[opValue+2];
3501 int32_t maxCount = (int32_t)pat[opValue+3];
3502 (*pCounter)++;
3503 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3504 U_ASSERT(*pCounter == maxCount);
3505 break;
3506 }
3507 if (*pCounter >= minCount) {
3508 if (maxCount == -1) {
3509 // Loop has no hard upper bound.
3510 // Check that it is progressing through the input, break if it is not.
3511 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3512 if (fp->fInputIdx == *pLastInputIdx) {
3513 break;
3514 } else {
3515 *pLastInputIdx = fp->fInputIdx;
3516 }
3517 }
3518 fp = StateSave(fp, fp->fPatIdx, status);
3519 } else {
3520 // Increment time-out counter. (StateSave() does it if count >= minCount)
3521 fTickCounter--;
3522 if (fTickCounter <= 0) {
3523 IncrementTime(status); // Re-initializes fTickCounter
3524 }
3525 }
3526
3527 fp->fPatIdx = opValue + 4; // Loop back.
3528 }
3529 break;
3530
3531 case URX_CTR_INIT_NG:
3532 {
3533 // Initialize a non-greedy loop
3534 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3535 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3536
3537 // Pick up the three extra operands that CTR_INIT_NG has, and
3538 // skip the pattern location counter past
3539 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3540 fp->fPatIdx += 3;
3541 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3542 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3543 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3544 U_ASSERT(minCount>=0);
3545 U_ASSERT(maxCount>=minCount || maxCount==-1);
3546 U_ASSERT(loopLoc>fp->fPatIdx);
3547 if (maxCount == -1) {
3548 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
3549 }
3550
3551 if (minCount == 0) {
3552 if (maxCount != 0) {
3553 fp = StateSave(fp, fp->fPatIdx, status);
3554 }
3555 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
3556 }
3557 }
3558 break;
3559
3560 case URX_CTR_LOOP_NG:
3561 {
3562 // Non-greedy {min, max} loops
3563 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3564 int32_t initOp = (int32_t)pat[opValue];
3565 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3566 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3567 int32_t minCount = (int32_t)pat[opValue+2];
3568 int32_t maxCount = (int32_t)pat[opValue+3];
3569
3570 (*pCounter)++;
3571 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3572 // The loop has matched the maximum permitted number of times.
3573 // Break out of here with no action. Matching will
3574 // continue with the following pattern.
3575 U_ASSERT(*pCounter == maxCount);
3576 break;
3577 }
3578
3579 if (*pCounter < minCount) {
3580 // We haven't met the minimum number of matches yet.
3581 // Loop back for another one.
3582 fp->fPatIdx = opValue + 4; // Loop back.
3583 // Increment time-out counter. (StateSave() does it if count >= minCount)
3584 fTickCounter--;
3585 if (fTickCounter <= 0) {
3586 IncrementTime(status); // Re-initializes fTickCounter
3587 }
3588 } else {
3589 // We do have the minimum number of matches.
3590
3591 // If there is no upper bound on the loop iterations, check that the input index
3592 // is progressing, and stop the loop if it is not.
3593 if (maxCount == -1) {
3594 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3595 if (fp->fInputIdx == *pLastInputIdx) {
3596 break;
3597 }
3598 *pLastInputIdx = fp->fInputIdx;
3599 }
3600
3601 // Loop Continuation: we will fall into the pattern following the loop
3602 // (non-greedy, don't execute loop body first), but first do
3603 // a state save to the top of the loop, so that a match failure
3604 // in the following pattern will try another iteration of the loop.
3605 fp = StateSave(fp, opValue + 4, status);
3606 }
3607 }
3608 break;
3609
3610 case URX_STO_SP:
3611 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3612 fData[opValue] = fStack->size();
3613 break;
3614
3615 case URX_LD_SP:
3616 {
3617 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3618 int32_t newStackSize = (int32_t)fData[opValue];
3619 U_ASSERT(newStackSize <= fStack->size());
3620 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3621 if (newFP == (int64_t *)fp) {
3622 break;
3623 }
3624 int32_t j;
3625 for (j=0; j<fFrameSize; j++) {
3626 newFP[j] = ((int64_t *)fp)[j];
3627 }
3628 fp = (REStackFrame *)newFP;
3629 fStack->setSize(newStackSize);
3630 }
3631 break;
3632
3633 case URX_BACKREF:
3634 {
3635 U_ASSERT(opValue < fFrameSize);
3636 int64_t groupStartIdx = fp->fExtra[opValue];
3637 int64_t groupEndIdx = fp->fExtra[opValue+1];
3638 U_ASSERT(groupStartIdx <= groupEndIdx);
3639 if (groupStartIdx < 0) {
3640 // This capture group has not participated in the match thus far,
3641 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3642 break;
3643 }
3644 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3645 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3646
3647 // Note: if the capture group match was of an empty string the backref
3648 // match succeeds. Verified by testing: Perl matches succeed
3649 // in this case, so we do too.
3650
3651 UBool success = TRUE;
3652 for (;;) {
3653 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3654 success = TRUE;
3655 break;
3656 }
3657 if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3658 success = FALSE;
3659 fHitEnd = TRUE;
3660 break;
3661 }
3662 UChar32 captureGroupChar = utext_next32(fAltInputText);
3663 UChar32 inputChar = utext_next32(fInputText);
3664 if (inputChar != captureGroupChar) {
3665 success = FALSE;
3666 break;
3667 }
3668 }
3669
3670 if (success) {
3671 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3672 } else {
3673 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3674 }
3675 }
3676 break;
3677
3678
3679
3680 case URX_BACKREF_I:
3681 {
3682 U_ASSERT(opValue < fFrameSize);
3683 int64_t groupStartIdx = fp->fExtra[opValue];
3684 int64_t groupEndIdx = fp->fExtra[opValue+1];
3685 U_ASSERT(groupStartIdx <= groupEndIdx);
3686 if (groupStartIdx < 0) {
3687 // This capture group has not participated in the match thus far,
3688 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3689 break;
3690 }
3691 utext_setNativeIndex(fAltInputText, groupStartIdx);
3692 utext_setNativeIndex(fInputText, fp->fInputIdx);
3693 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3694 CaseFoldingUTextIterator inputItr(*fInputText);
3695
3696 // Note: if the capture group match was of an empty string the backref
3697 // match succeeds. Verified by testing: Perl matches succeed
3698 // in this case, so we do too.
3699
3700 UBool success = TRUE;
3701 for (;;) {
3702 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3703 success = TRUE;
3704 break;
3705 }
3706 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3707 success = FALSE;
3708 fHitEnd = TRUE;
3709 break;
3710 }
3711 UChar32 captureGroupChar = captureGroupItr.next();
3712 UChar32 inputChar = inputItr.next();
3713 if (inputChar != captureGroupChar) {
3714 success = FALSE;
3715 break;
3716 }
3717 }
3718
3719 if (success && inputItr.inExpansion()) {
3720 // We otained a match by consuming part of a string obtained from
3721 // case-folding a single code point of the input text.
3722 // This does not count as an overall match.
3723 success = FALSE;
3724 }
3725
3726 if (success) {
3727 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3728 } else {
3729 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3730 }
3731
3732 }
3733 break;
3734
3735 case URX_STO_INP_LOC:
3736 {
3737 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3738 fp->fExtra[opValue] = fp->fInputIdx;
3739 }
3740 break;
3741
3742 case URX_JMPX:
3743 {
3744 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3745 fp->fPatIdx += 1;
3746 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3747 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3748 int64_t savedInputIdx = fp->fExtra[dataLoc];
3749 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3750 if (savedInputIdx < fp->fInputIdx) {
3751 fp->fPatIdx = opValue; // JMP
3752 } else {
3753 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3754 }
3755 }
3756 break;
3757
3758 case URX_LA_START:
3759 {
3760 // Entering a look around block.
3761 // Save Stack Ptr, Input Pos.
3762 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3763 fData[opValue] = fStack->size();
3764 fData[opValue+1] = fp->fInputIdx;
3765 fData[opValue+2] = fActiveStart;
3766 fData[opValue+3] = fActiveLimit;
3767 fActiveStart = fLookStart; // Set the match region change for
3768 fActiveLimit = fLookLimit; // transparent bounds.
3769 }
3770 break;
3771
3772 case URX_LA_END:
3773 {
3774 // Leaving a look-ahead block.
3775 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3776 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3777 int32_t stackSize = fStack->size();
3778 int32_t newStackSize =(int32_t)fData[opValue];
3779 U_ASSERT(stackSize >= newStackSize);
3780 if (stackSize > newStackSize) {
3781 // Copy the current top frame back to the new (cut back) top frame.
3782 // This makes the capture groups from within the look-ahead
3783 // expression available.
3784 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3785 int32_t j;
3786 for (j=0; j<fFrameSize; j++) {
3787 newFP[j] = ((int64_t *)fp)[j];
3788 }
3789 fp = (REStackFrame *)newFP;
3790 fStack->setSize(newStackSize);
3791 }
3792 fp->fInputIdx = fData[opValue+1];
3793
3794 // Restore the active region bounds in the input string; they may have
3795 // been changed because of transparent bounds on a Region.
3796 fActiveStart = fData[opValue+2];
3797 fActiveLimit = fData[opValue+3];
3798 U_ASSERT(fActiveStart >= 0);
3799 U_ASSERT(fActiveLimit <= fInputLength);
3800 }
3801 break;
3802
3803 case URX_ONECHAR_I:
3804 // Case insensitive one char. The char from the pattern is already case folded.
3805 // Input text is not, but case folding the input can not reduce two or more code
3806 // points to one.
3807 if (fp->fInputIdx < fActiveLimit) {
3808 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3809
3810 UChar32 c = UTEXT_NEXT32(fInputText);
3811 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3812 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3813 break;
3814 }
3815 } else {
3816 fHitEnd = TRUE;
3817 }
3818
3819 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3820 break;
3821
3822 case URX_STRING_I:
3823 {
3824 // Case-insensitive test input against a literal string.
3825 // Strings require two slots in the compiled pattern, one for the
3826 // offset to the string text, and one for the length.
3827 // The compiled string has already been case folded.
3828 {
3829 const UChar *patternString = litText + opValue;
3830 int32_t patternStringIdx = 0;
3831
3832 op = (int32_t)pat[fp->fPatIdx];
3833 fp->fPatIdx++;
3834 opType = URX_TYPE(op);
3835 opValue = URX_VAL(op);
3836 U_ASSERT(opType == URX_STRING_LEN);
3837 int32_t patternStringLen = opValue; // Length of the string from the pattern.
3838
3839
3840 UChar32 cPattern;
3841 UChar32 cText;
3842 UBool success = TRUE;
3843
3844 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3845 CaseFoldingUTextIterator inputIterator(*fInputText);
3846 while (patternStringIdx < patternStringLen) {
3847 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3848 success = FALSE;
3849 fHitEnd = TRUE;
3850 break;
3851 }
3852 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3853 cText = inputIterator.next();
3854 if (cText != cPattern) {
3855 success = FALSE;
3856 break;
3857 }
3858 }
3859 if (inputIterator.inExpansion()) {
3860 success = FALSE;
3861 }
3862
3863 if (success) {
3864 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3865 } else {
3866 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3867 }
3868 }
3869 }
3870 break;
3871
3872 case URX_LB_START:
3873 {
3874 // Entering a look-behind block.
3875 // Save Stack Ptr, Input Pos and active input region.
3876 // TODO: implement transparent bounds. Ticket #6067
3877 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3878 fData[opValue] = fStack->size();
3879 fData[opValue+1] = fp->fInputIdx;
3880 // Save input string length, then reset to pin any matches to end at
3881 // the current position.
3882 fData[opValue+2] = fActiveStart;
3883 fData[opValue+3] = fActiveLimit;
3884 fActiveStart = fRegionStart;
3885 fActiveLimit = fp->fInputIdx;
3886 // Init the variable containing the start index for attempted matches.
3887 fData[opValue+4] = -1;
3888 }
3889 break;
3890
3891
3892 case URX_LB_CONT:
3893 {
3894 // Positive Look-Behind, at top of loop checking for matches of LB expression
3895 // at all possible input starting positions.
3896
3897 // Fetch the min and max possible match lengths. They are the operands
3898 // of this op in the pattern.
3899 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3900 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3901 if (!UTEXT_USES_U16(fInputText)) {
3902 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3903 // The max length need not be exact; it just needs to be >= actual maximum.
3904 maxML *= 3;
3905 }
3906 U_ASSERT(minML <= maxML);
3907 U_ASSERT(minML >= 0);
3908
3909 // Fetch (from data) the last input index where a match was attempted.
3910 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3911 int64_t &lbStartIdx = fData[opValue+4];
3912 if (lbStartIdx < 0) {
3913 // First time through loop.
3914 lbStartIdx = fp->fInputIdx - minML;
3915 if (lbStartIdx > 0) {
3916 // move index to a code point boudary, if it's not on one already.
3917 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3918 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3919 }
3920 } else {
3921 // 2nd through nth time through the loop.
3922 // Back up start position for match by one.
3923 if (lbStartIdx == 0) {
3924 (lbStartIdx)--;
3925 } else {
3926 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3927 (void)UTEXT_PREVIOUS32(fInputText);
3928 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3929 }
3930 }
3931
3932 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
3933 // We have tried all potential match starting points without
3934 // getting a match. Backtrack out, and out of the
3935 // Look Behind altogether.
3936 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3937 fActiveStart = fData[opValue+2];
3938 fActiveLimit = fData[opValue+3];
3939 U_ASSERT(fActiveStart >= 0);
3940 U_ASSERT(fActiveLimit <= fInputLength);
3941 break;
3942 }
3943
3944 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3945 // (successful match will fall off the end of the loop.)
3946 fp = StateSave(fp, fp->fPatIdx-3, status);
3947 fp->fInputIdx = lbStartIdx;
3948 }
3949 break;
3950
3951 case URX_LB_END:
3952 // End of a look-behind block, after a successful match.
3953 {
3954 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3955 if (fp->fInputIdx != fActiveLimit) {
3956 // The look-behind expression matched, but the match did not
3957 // extend all the way to the point that we are looking behind from.
3958 // FAIL out of here, which will take us back to the LB_CONT, which
3959 // will retry the match starting at another position or fail
3960 // the look-behind altogether, whichever is appropriate.
3961 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3962 break;
3963 }
3964
3965 // Look-behind match is good. Restore the orignal input string region,
3966 // which had been truncated to pin the end of the lookbehind match to the
3967 // position being looked-behind.
3968 fActiveStart = fData[opValue+2];
3969 fActiveLimit = fData[opValue+3];
3970 U_ASSERT(fActiveStart >= 0);
3971 U_ASSERT(fActiveLimit <= fInputLength);
3972 }
3973 break;
3974
3975
3976 case URX_LBN_CONT:
3977 {
3978 // Negative Look-Behind, at top of loop checking for matches of LB expression
3979 // at all possible input starting positions.
3980
3981 // Fetch the extra parameters of this op.
3982 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3983 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3984 if (!UTEXT_USES_U16(fInputText)) {
3985 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3986 // The max length need not be exact; it just needs to be >= actual maximum.
3987 maxML *= 3;
3988 }
3989 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
3990 continueLoc = URX_VAL(continueLoc);
3991 U_ASSERT(minML <= maxML);
3992 U_ASSERT(minML >= 0);
3993 U_ASSERT(continueLoc > fp->fPatIdx);
3994
3995 // Fetch (from data) the last input index where a match was attempted.
3996 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3997 int64_t &lbStartIdx = fData[opValue+4];
3998 if (lbStartIdx < 0) {
3999 // First time through loop.
4000 lbStartIdx = fp->fInputIdx - minML;
4001 if (lbStartIdx > 0) {
4002 // move index to a code point boudary, if it's not on one already.
4003 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4004 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4005 }
4006 } else {
4007 // 2nd through nth time through the loop.
4008 // Back up start position for match by one.
4009 if (lbStartIdx == 0) {
4010 (lbStartIdx)--;
4011 } else {
4012 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4013 (void)UTEXT_PREVIOUS32(fInputText);
4014 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4015 }
4016 }
4017
4018 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
4019 // We have tried all potential match starting points without
4020 // getting a match, which means that the negative lookbehind as
4021 // a whole has succeeded. Jump forward to the continue location
4022 fActiveStart = fData[opValue+2];
4023 fActiveLimit = fData[opValue+3];
4024 U_ASSERT(fActiveStart >= 0);
4025 U_ASSERT(fActiveLimit <= fInputLength);
4026 fp->fPatIdx = continueLoc;
4027 break;
4028 }
4029
4030 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4031 // (successful match will cause a FAIL out of the loop altogether.)
4032 fp = StateSave(fp, fp->fPatIdx-4, status);
4033 fp->fInputIdx = lbStartIdx;
4034 }
4035 break;
4036
4037 case URX_LBN_END:
4038 // End of a negative look-behind block, after a successful match.
4039 {
4040 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4041 if (fp->fInputIdx != fActiveLimit) {
4042 // The look-behind expression matched, but the match did not
4043 // extend all the way to the point that we are looking behind from.
4044 // FAIL out of here, which will take us back to the LB_CONT, which
4045 // will retry the match starting at another position or succeed
4046 // the look-behind altogether, whichever is appropriate.
4047 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4048 break;
4049 }
4050
4051 // Look-behind expression matched, which means look-behind test as
4052 // a whole Fails
4053
4054 // Restore the orignal input string length, which had been truncated
4055 // inorder to pin the end of the lookbehind match
4056 // to the position being looked-behind.
4057 fActiveStart = fData[opValue+2];
4058 fActiveLimit = fData[opValue+3];
4059 U_ASSERT(fActiveStart >= 0);
4060 U_ASSERT(fActiveLimit <= fInputLength);
4061
4062 // Restore original stack position, discarding any state saved
4063 // by the successful pattern match.
4064 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4065 int32_t newStackSize = (int32_t)fData[opValue];
4066 U_ASSERT(fStack->size() > newStackSize);
4067 fStack->setSize(newStackSize);
4068
4069 // FAIL, which will take control back to someplace
4070 // prior to entering the look-behind test.
4071 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4072 }
4073 break;
4074
4075
4076 case URX_LOOP_SR_I:
4077 // Loop Initialization for the optimized implementation of
4078 // [some character set]*
4079 // This op scans through all matching input.
4080 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4081 {
4082 U_ASSERT(opValue > 0 && opValue < fSets->size());
4083 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4084 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
4085
4086 // Loop through input, until either the input is exhausted or
4087 // we reach a character that is not a member of the set.
4088 int64_t ix = fp->fInputIdx;
4089 UTEXT_SETNATIVEINDEX(fInputText, ix);
4090 for (;;) {
4091 if (ix >= fActiveLimit) {
4092 fHitEnd = TRUE;
4093 break;
4094 }
4095 UChar32 c = UTEXT_NEXT32(fInputText);
4096 if (c<256) {
4097 if (s8->contains(c) == FALSE) {
4098 break;
4099 }
4100 } else {
4101 if (s->contains(c) == FALSE) {
4102 break;
4103 }
4104 }
4105 ix = UTEXT_GETNATIVEINDEX(fInputText);
4106 }
4107
4108 // If there were no matching characters, skip over the loop altogether.
4109 // The loop doesn't run at all, a * op always succeeds.
4110 if (ix == fp->fInputIdx) {
4111 fp->fPatIdx++; // skip the URX_LOOP_C op.
4112 break;
4113 }
4114
4115 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4116 // must follow. It's operand is the stack location
4117 // that holds the starting input index for the match of this [set]*
4118 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4119 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4120 int32_t stackLoc = URX_VAL(loopcOp);
4121 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4122 fp->fExtra[stackLoc] = fp->fInputIdx;
4123 fp->fInputIdx = ix;
4124
4125 // Save State to the URX_LOOP_C op that follows this one,
4126 // so that match failures in the following code will return to there.
4127 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4128 fp = StateSave(fp, fp->fPatIdx, status);
4129 fp->fPatIdx++;
4130 }
4131 break;
4132
4133
4134 case URX_LOOP_DOT_I:
4135 // Loop Initialization for the optimized implementation of .*
4136 // This op scans through all remaining input.
4137 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4138 {
4139 // Loop through input until the input is exhausted (we reach an end-of-line)
4140 // In DOTALL mode, we can just go straight to the end of the input.
4141 int64_t ix;
4142 if ((opValue & 1) == 1) {
4143 // Dot-matches-All mode. Jump straight to the end of the string.
4144 ix = fActiveLimit;
4145 fHitEnd = TRUE;
4146 } else {
4147 // NOT DOT ALL mode. Line endings do not match '.'
4148 // Scan forward until a line ending or end of input.
4149 ix = fp->fInputIdx;
4150 UTEXT_SETNATIVEINDEX(fInputText, ix);
4151 for (;;) {
4152 if (ix >= fActiveLimit) {
4153 fHitEnd = TRUE;
4154 break;
4155 }
4156 UChar32 c = UTEXT_NEXT32(fInputText);
4157 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4158 if ((c == 0x0a) || // 0x0a is newline in both modes.
4159 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
4160 isLineTerminator(c))) {
4161 // char is a line ending. Exit the scanning loop.
4162 break;
4163 }
4164 }
4165 ix = UTEXT_GETNATIVEINDEX(fInputText);
4166 }
4167 }
4168
4169 // If there were no matching characters, skip over the loop altogether.
4170 // The loop doesn't run at all, a * op always succeeds.
4171 if (ix == fp->fInputIdx) {
4172 fp->fPatIdx++; // skip the URX_LOOP_C op.
4173 break;
4174 }
4175
4176 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4177 // must follow. It's operand is the stack location
4178 // that holds the starting input index for the match of this .*
4179 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4180 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4181 int32_t stackLoc = URX_VAL(loopcOp);
4182 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4183 fp->fExtra[stackLoc] = fp->fInputIdx;
4184 fp->fInputIdx = ix;
4185
4186 // Save State to the URX_LOOP_C op that follows this one,
4187 // so that match failures in the following code will return to there.
4188 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4189 fp = StateSave(fp, fp->fPatIdx, status);
4190 fp->fPatIdx++;
4191 }
4192 break;
4193
4194
4195 case URX_LOOP_C:
4196 {
4197 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4198 backSearchIndex = fp->fExtra[opValue];
4199 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4200 if (backSearchIndex == fp->fInputIdx) {
4201 // We've backed up the input idx to the point that the loop started.
4202 // The loop is done. Leave here without saving state.
4203 // Subsequent failures won't come back here.
4204 break;
4205 }
4206 // Set up for the next iteration of the loop, with input index
4207 // backed up by one from the last time through,
4208 // and a state save to this instruction in case the following code fails again.
4209 // (We're going backwards because this loop emulates stack unwinding, not
4210 // the initial scan forward.)
4211 U_ASSERT(fp->fInputIdx > 0);
4212 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4213 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4214 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4215
4216 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4217 if (prevC == 0x0a &&
4218 fp->fInputIdx > backSearchIndex &&
4219 twoPrevC == 0x0d) {
4220 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4221 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4222 // .*, stepping back over CRLF pair.
4223 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4224 }
4225 }
4226
4227
4228 fp = StateSave(fp, fp->fPatIdx-1, status);
4229 }
4230 break;
4231
4232
4233
4234 default:
4235 // Trouble. The compiled pattern contains an entry with an
4236 // unrecognized type tag.
4237 UPRV_UNREACHABLE;
4238 }
4239
4240 if (U_FAILURE(status)) {
4241 isMatch = FALSE;
4242 break;
4243 }
4244 }
4245
4246breakFromLoop:
4247 fMatch = isMatch;
4248 if (isMatch) {
4249 fLastMatchEnd = fMatchEnd;
4250 fMatchStart = startIdx;
4251 fMatchEnd = fp->fInputIdx;
4252 }
4253
4254#ifdef REGEX_RUN_DEBUG
4255 if (fTraceDebug) {
4256 if (isMatch) {
4257 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
4258 } else {
4259 printf("No match\n\n");
4260 }
4261 }
4262#endif
4263
4264 fFrame = fp; // The active stack frame when the engine stopped.
4265 // Contains the capture group results that we need to
4266 // access later.
4267 return;
4268}
4269
4270
4271//--------------------------------------------------------------------------------
4272//
4273// MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4274// assumption that the entire string is available in the UText's
4275// chunk buffer. For now, that means we can use int32_t indexes,
4276// except for anything that needs to be saved (like group starts
4277// and ends).
4278//
4279// startIdx: begin matching a this index.
4280// toEnd: if true, match must extend to end of the input region
4281//
4282//--------------------------------------------------------------------------------
4283void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4284 UBool isMatch = FALSE; // True if the we have a match.
4285
4286 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4287
4288 int32_t op; // Operation from the compiled pattern, split into
4289 int32_t opType; // the opcode
4290 int32_t opValue; // and the operand value.
4291
4292#ifdef REGEX_RUN_DEBUG
4293 if (fTraceDebug) {
4294 printf("MatchAt(startIdx=%d)\n", startIdx);
4295 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4296 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
4297 }
4298#endif
4299
4300 if (U_FAILURE(status)) {
4301 return;
4302 }
4303
4304 // Cache frequently referenced items from the compiled pattern
4305 //
4306 int64_t *pat = fPattern->fCompiledPat->getBuffer();
4307
4308 const UChar *litText = fPattern->fLiteralText.getBuffer();
4309 UVector *fSets = fPattern->fSets;
4310
4311 const UChar *inputBuf = fInputText->chunkContents;
4312
4313 fFrameSize = fPattern->fFrameSize;
4314 REStackFrame *fp = resetStack();
4315 if (U_FAILURE(fDeferredStatus)) {
4316 status = fDeferredStatus;
4317 return;
4318 }
4319
4320 fp->fPatIdx = 0;
4321 fp->fInputIdx = startIdx;
4322
4323 // Zero out the pattern's static data
4324 int32_t i;
4325 for (i = 0; i<fPattern->fDataSize; i++) {
4326 fData[i] = 0;
4327 }
4328
4329 //
4330 // Main loop for interpreting the compiled pattern.
4331 // One iteration of the loop per pattern operation performed.
4332 //
4333 for (;;) {
4334 op = (int32_t)pat[fp->fPatIdx];
4335 opType = URX_TYPE(op);
4336 opValue = URX_VAL(op);
4337#ifdef REGEX_RUN_DEBUG
4338 if (fTraceDebug) {
4339 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4340 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
4341 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4342 fPattern->dumpOp(fp->fPatIdx);
4343 }
4344#endif
4345 fp->fPatIdx++;
4346
4347 switch (opType) {
4348
4349
4350 case URX_NOP:
4351 break;
4352
4353
4354 case URX_BACKTRACK:
4355 // Force a backtrack. In some circumstances, the pattern compiler
4356 // will notice that the pattern can't possibly match anything, and will
4357 // emit one of these at that point.
4358 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4359 break;
4360
4361
4362 case URX_ONECHAR:
4363 if (fp->fInputIdx < fActiveLimit) {
4364 UChar32 c;
4365 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4366 if (c == opValue) {
4367 break;
4368 }
4369 } else {
4370 fHitEnd = TRUE;
4371 }
4372 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4373 break;
4374
4375
4376 case URX_STRING:
4377 {
4378 // Test input against a literal string.
4379 // Strings require two slots in the compiled pattern, one for the
4380 // offset to the string text, and one for the length.
4381 int32_t stringStartIdx = opValue;
4382 int32_t stringLen;
4383
4384 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
4385 fp->fPatIdx++;
4386 opType = URX_TYPE(op);
4387 stringLen = URX_VAL(op);
4388 U_ASSERT(opType == URX_STRING_LEN);
4389 U_ASSERT(stringLen >= 2);
4390
4391 const UChar * pInp = inputBuf + fp->fInputIdx;
4392 const UChar * pInpLimit = inputBuf + fActiveLimit;
4393 const UChar * pPat = litText+stringStartIdx;
4394 const UChar * pEnd = pInp + stringLen;
4395 UBool success = TRUE;
4396 while (pInp < pEnd) {
4397 if (pInp >= pInpLimit) {
4398 fHitEnd = TRUE;
4399 success = FALSE;
4400 break;
4401 }
4402 if (*pInp++ != *pPat++) {
4403 success = FALSE;
4404 break;
4405 }
4406 }
4407
4408 if (success) {
4409 fp->fInputIdx += stringLen;
4410 } else {
4411 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4412 }
4413 }
4414 break;
4415
4416
4417 case URX_STATE_SAVE:
4418 fp = StateSave(fp, opValue, status);
4419 break;
4420
4421
4422 case URX_END:
4423 // The match loop will exit via this path on a successful match,
4424 // when we reach the end of the pattern.
4425 if (toEnd && fp->fInputIdx != fActiveLimit) {
4426 // The pattern matched, but not to the end of input. Try some more.
4427 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4428 break;
4429 }
4430 isMatch = TRUE;
4431 goto breakFromLoop;
4432
4433 // Start and End Capture stack frame variables are laid out out like this:
4434 // fp->fExtra[opValue] - The start of a completed capture group
4435 // opValue+1 - The end of a completed capture group
4436 // opValue+2 - the start of a capture group whose end
4437 // has not yet been reached (and might not ever be).
4438 case URX_START_CAPTURE:
4439 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4440 fp->fExtra[opValue+2] = fp->fInputIdx;
4441 break;
4442
4443
4444 case URX_END_CAPTURE:
4445 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4446 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4447 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4448 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4449 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4450 break;
4451
4452
4453 case URX_DOLLAR: // $, test for End of line
4454 // or for position before new line at end of input
4455 if (fp->fInputIdx < fAnchorLimit-2) {
4456 // We are no where near the end of input. Fail.
4457 // This is the common case. Keep it first.
4458 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4459 break;
4460 }
4461 if (fp->fInputIdx >= fAnchorLimit) {
4462 // We really are at the end of input. Success.
4463 fHitEnd = TRUE;
4464 fRequireEnd = TRUE;
4465 break;
4466 }
4467
4468 // If we are positioned just before a new-line that is located at the
4469 // end of input, succeed.
4470 if (fp->fInputIdx == fAnchorLimit-1) {
4471 UChar32 c;
4472 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4473
4474 if (isLineTerminator(c)) {
4475 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4476 // At new-line at end of input. Success
4477 fHitEnd = TRUE;
4478 fRequireEnd = TRUE;
4479 break;
4480 }
4481 }
4482 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4483 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4484 fHitEnd = TRUE;
4485 fRequireEnd = TRUE;
4486 break; // At CR/LF at end of input. Success
4487 }
4488
4489 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4490
4491 break;
4492
4493
4494 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
4495 if (fp->fInputIdx >= fAnchorLimit-1) {
4496 // Either at the last character of input, or off the end.
4497 if (fp->fInputIdx == fAnchorLimit-1) {
4498 // At last char of input. Success if it's a new line.
4499 if (inputBuf[fp->fInputIdx] == 0x0a) {
4500 fHitEnd = TRUE;
4501 fRequireEnd = TRUE;
4502 break;
4503 }
4504 } else {
4505 // Off the end of input. Success.
4506 fHitEnd = TRUE;
4507 fRequireEnd = TRUE;
4508 break;
4509 }
4510 }
4511
4512 // Not at end of input. Back-track out.
4513 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4514 break;
4515
4516
4517 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4518 {
4519 if (fp->fInputIdx >= fAnchorLimit) {
4520 // We really are at the end of input. Success.
4521 fHitEnd = TRUE;
4522 fRequireEnd = TRUE;
4523 break;
4524 }
4525 // If we are positioned just before a new-line, succeed.
4526 // It makes no difference where the new-line is within the input.
4527 UChar32 c = inputBuf[fp->fInputIdx];
4528 if (isLineTerminator(c)) {
4529 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4530 // In multi-line mode, hitting a new-line just before the end of input does not
4531 // set the hitEnd or requireEnd flags
4532 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4533 break;
4534 }
4535 }
4536 // not at a new line. Fail.
4537 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4538 }
4539 break;
4540
4541
4542 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4543 {
4544 if (fp->fInputIdx >= fAnchorLimit) {
4545 // We really are at the end of input. Success.
4546 fHitEnd = TRUE;
4547 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
4548 break; // adding a new-line would not lose the match.
4549 }
4550 // If we are not positioned just before a new-line, the test fails; backtrack out.
4551 // It makes no difference where the new-line is within the input.
4552 if (inputBuf[fp->fInputIdx] != 0x0a) {
4553 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4554 }
4555 }
4556 break;
4557
4558
4559 case URX_CARET: // ^, test for start of line
4560 if (fp->fInputIdx != fAnchorStart) {
4561 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4562 }
4563 break;
4564
4565
4566 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4567 {
4568 if (fp->fInputIdx == fAnchorStart) {
4569 // We are at the start input. Success.
4570 break;
4571 }
4572 // Check whether character just before the current pos is a new-line
4573 // unless we are at the end of input
4574 UChar c = inputBuf[fp->fInputIdx - 1];
4575 if ((fp->fInputIdx < fAnchorLimit) &&
4576 isLineTerminator(c)) {
4577 // It's a new-line. ^ is true. Success.
4578 // TODO: what should be done with positions between a CR and LF?
4579 break;
4580 }
4581 // Not at the start of a line. Fail.
4582 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4583 }
4584 break;
4585
4586
4587 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4588 {
4589 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4590 if (fp->fInputIdx <= fAnchorStart) {
4591 // We are at the start input. Success.
4592 break;
4593 }
4594 // Check whether character just before the current pos is a new-line
4595 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4596 UChar c = inputBuf[fp->fInputIdx - 1];
4597 if (c != 0x0a) {
4598 // Not at the start of a line. Back-track out.
4599 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4600 }
4601 }
4602 break;
4603
4604 case URX_BACKSLASH_B: // Test for word boundaries
4605 {
4606 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4607 success ^= (UBool)(opValue != 0); // flip sense for \B
4608 if (!success) {
4609 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4610 }
4611 }
4612 break;
4613
4614
4615 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4616 {
4617 UBool success = isUWordBoundary(fp->fInputIdx, status);
4618 success ^= (UBool)(opValue != 0); // flip sense for \B
4619 if (!success) {
4620 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4621 }
4622 }
4623 break;
4624
4625
4626 case URX_BACKSLASH_D: // Test for decimal digit
4627 {
4628 if (fp->fInputIdx >= fActiveLimit) {
4629 fHitEnd = TRUE;
4630 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4631 break;
4632 }
4633
4634 UChar32 c;
4635 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4636 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
4637 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4638 success ^= (UBool)(opValue != 0); // flip sense for \D
4639 if (!success) {
4640 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4641 }
4642 }
4643 break;
4644
4645
4646 case URX_BACKSLASH_G: // Test for position at end of previous match
4647 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4648 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4649 }
4650 break;
4651
4652
4653 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
4654 {
4655 if (fp->fInputIdx >= fActiveLimit) {
4656 fHitEnd = TRUE;
4657 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4658 break;
4659 }
4660 UChar32 c;
4661 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4662 int8_t ctype = u_charType(c);
4663 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
4664 success ^= (UBool)(opValue != 0); // flip sense for \H
4665 if (!success) {
4666 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4667 }
4668 }
4669 break;
4670
4671
4672 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
4673 {
4674 if (fp->fInputIdx >= fActiveLimit) {
4675 fHitEnd = TRUE;
4676 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4677 break;
4678 }
4679 UChar32 c;
4680 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4681 if (isLineTerminator(c)) {
4682 if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4683 // Check for CR/LF sequence. Consume both together when found.
4684 UChar c2;
4685 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4686 if (c2 != 0x0a) {
4687 U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4688 }
4689 }
4690 } else {
4691 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4692 }
4693 }
4694 break;
4695
4696
4697 case URX_BACKSLASH_V: // Any single code point line ending.
4698 {
4699 if (fp->fInputIdx >= fActiveLimit) {
4700 fHitEnd = TRUE;
4701 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4702 break;
4703 }
4704 UChar32 c;
4705 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4706 UBool success = isLineTerminator(c);
4707 success ^= (UBool)(opValue != 0); // flip sense for \V
4708 if (!success) {
4709 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4710 }
4711 }
4712 break;
4713
4714
4715 case URX_BACKSLASH_X:
4716 // Match a Grapheme, as defined by Unicode UAX 29.
4717
4718 // Fail if at end of input
4719 if (fp->fInputIdx >= fActiveLimit) {
4720 fHitEnd = TRUE;
4721 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4722 break;
4723 }
4724
4725 fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
4726 if (fp->fInputIdx >= fActiveLimit) {
4727 fHitEnd = TRUE;
4728 fp->fInputIdx = fActiveLimit;
4729 }
4730 break;
4731
4732
4733 case URX_BACKSLASH_Z: // Test for end of Input
4734 if (fp->fInputIdx < fAnchorLimit) {
4735 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4736 } else {
4737 fHitEnd = TRUE;
4738 fRequireEnd = TRUE;
4739 }
4740 break;
4741
4742
4743
4744 case URX_STATIC_SETREF:
4745 {
4746 // Test input character against one of the predefined sets
4747 // (Word Characters, for example)
4748 // The high bit of the op value is a flag for the match polarity.
4749 // 0: success if input char is in set.
4750 // 1: success if input char is not in set.
4751 if (fp->fInputIdx >= fActiveLimit) {
4752 fHitEnd = TRUE;
4753 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4754 break;
4755 }
4756
4757 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4758 opValue &= ~URX_NEG_SET;
4759 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4760
4761 UChar32 c;
4762 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4763 if (c < 256) {
4764 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
4765 if (s8.contains(c)) {
4766 success = !success;
4767 }
4768 } else {
4769 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
4770 if (s.contains(c)) {
4771 success = !success;
4772 }
4773 }
4774 if (!success) {
4775 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4776 }
4777 }
4778 break;
4779
4780
4781 case URX_STAT_SETREF_N:
4782 {
4783 // Test input character for NOT being a member of one of
4784 // the predefined sets (Word Characters, for example)
4785 if (fp->fInputIdx >= fActiveLimit) {
4786 fHitEnd = TRUE;
4787 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4788 break;
4789 }
4790
4791 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4792
4793 UChar32 c;
4794 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4795 if (c < 256) {
4796 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
4797 if (s8.contains(c) == FALSE) {
4798 break;
4799 }
4800 } else {
4801 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
4802 if (s.contains(c) == FALSE) {
4803 break;
4804 }
4805 }
4806 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4807 }
4808 break;
4809
4810
4811 case URX_SETREF:
4812 {
4813 if (fp->fInputIdx >= fActiveLimit) {
4814 fHitEnd = TRUE;
4815 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4816 break;
4817 }
4818
4819 U_ASSERT(opValue > 0 && opValue < fSets->size());
4820
4821 // There is input left. Pick up one char and test it for set membership.
4822 UChar32 c;
4823 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4824 if (c<256) {
4825 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4826 if (s8->contains(c)) {
4827 // The character is in the set. A Match.
4828 break;
4829 }
4830 } else {
4831 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
4832 if (s->contains(c)) {
4833 // The character is in the set. A Match.
4834 break;
4835 }
4836 }
4837
4838 // the character wasn't in the set.
4839 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4840 }
4841 break;
4842
4843
4844 case URX_DOTANY:
4845 {
4846 // . matches anything, but stops at end-of-line.
4847 if (fp->fInputIdx >= fActiveLimit) {
4848 // At end of input. Match failed. Backtrack out.
4849 fHitEnd = TRUE;
4850 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4851 break;
4852 }
4853
4854 // There is input left. Advance over one char, unless we've hit end-of-line
4855 UChar32 c;
4856 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4857 if (isLineTerminator(c)) {
4858 // End of line in normal mode. . does not match.
4859 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4860 break;
4861 }
4862 }
4863 break;
4864
4865
4866 case URX_DOTANY_ALL:
4867 {
4868 // . in dot-matches-all (including new lines) mode
4869 if (fp->fInputIdx >= fActiveLimit) {
4870 // At end of input. Match failed. Backtrack out.
4871 fHitEnd = TRUE;
4872 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4873 break;
4874 }
4875
4876 // There is input left. Advance over one char, except if we are
4877 // at a cr/lf, advance over both of them.
4878 UChar32 c;
4879 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4880 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4881 // In the case of a CR/LF, we need to advance over both.
4882 if (inputBuf[fp->fInputIdx] == 0x0a) {
4883 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4884 }
4885 }
4886 }
4887 break;
4888
4889
4890 case URX_DOTANY_UNIX:
4891 {
4892 // '.' operator, matches all, but stops at end-of-line.
4893 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
4894 if (fp->fInputIdx >= fActiveLimit) {
4895 // At end of input. Match failed. Backtrack out.
4896 fHitEnd = TRUE;
4897 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4898 break;
4899 }
4900
4901 // There is input left. Advance over one char, unless we've hit end-of-line
4902 UChar32 c;
4903 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4904 if (c == 0x0a) {
4905 // End of line in normal mode. '.' does not match the \n
4906 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4907 }
4908 }
4909 break;
4910
4911
4912 case URX_JMP:
4913 fp->fPatIdx = opValue;
4914 break;
4915
4916 case URX_FAIL:
4917 isMatch = FALSE;
4918 goto breakFromLoop;
4919
4920 case URX_JMP_SAV:
4921 U_ASSERT(opValue < fPattern->fCompiledPat->size());
4922 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
4923 fp->fPatIdx = opValue; // Then JMP.
4924 break;
4925
4926 case URX_JMP_SAV_X:
4927 // This opcode is used with (x)+, when x can match a zero length string.
4928 // Same as JMP_SAV, except conditional on the match having made forward progress.
4929 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
4930 // data address of the input position at the start of the loop.
4931 {
4932 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
4933 int32_t stoOp = (int32_t)pat[opValue-1];
4934 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
4935 int32_t frameLoc = URX_VAL(stoOp);
4936 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
4937 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
4938 U_ASSERT(prevInputIdx <= fp->fInputIdx);
4939 if (prevInputIdx < fp->fInputIdx) {
4940 // The match did make progress. Repeat the loop.
4941 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
4942 fp->fPatIdx = opValue;
4943 fp->fExtra[frameLoc] = fp->fInputIdx;
4944 }
4945 // If the input position did not advance, we do nothing here,
4946 // execution will fall out of the loop.
4947 }
4948 break;
4949
4950 case URX_CTR_INIT:
4951 {
4952 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4953 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
4954
4955 // Pick up the three extra operands that CTR_INIT has, and
4956 // skip the pattern location counter past
4957 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
4958 fp->fPatIdx += 3;
4959 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
4960 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
4961 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
4962 U_ASSERT(minCount>=0);
4963 U_ASSERT(maxCount>=minCount || maxCount==-1);
4964 U_ASSERT(loopLoc>=fp->fPatIdx);
4965
4966 if (minCount == 0) {
4967 fp = StateSave(fp, loopLoc+1, status);
4968 }
4969 if (maxCount == -1) {
4970 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
4971 } else if (maxCount == 0) {
4972 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4973 }
4974 }
4975 break;
4976
4977 case URX_CTR_LOOP:
4978 {
4979 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
4980 int32_t initOp = (int32_t)pat[opValue];
4981 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
4982 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
4983 int32_t minCount = (int32_t)pat[opValue+2];
4984 int32_t maxCount = (int32_t)pat[opValue+3];
4985 (*pCounter)++;
4986 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
4987 U_ASSERT(*pCounter == maxCount);
4988 break;
4989 }
4990 if (*pCounter >= minCount) {
4991 if (maxCount == -1) {
4992 // Loop has no hard upper bound.
4993 // Check that it is progressing through the input, break if it is not.
4994 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
4995 if (fp->fInputIdx == *pLastInputIdx) {
4996 break;
4997 } else {
4998 *pLastInputIdx = fp->fInputIdx;
4999 }
5000 }
5001 fp = StateSave(fp, fp->fPatIdx, status);
5002 } else {
5003 // Increment time-out counter. (StateSave() does it if count >= minCount)
5004 fTickCounter--;
5005 if (fTickCounter <= 0) {
5006 IncrementTime(status); // Re-initializes fTickCounter
5007 }
5008 }
5009 fp->fPatIdx = opValue + 4; // Loop back.
5010 }
5011 break;
5012
5013 case URX_CTR_INIT_NG:
5014 {
5015 // Initialize a non-greedy loop
5016 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5017 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5018
5019 // Pick up the three extra operands that CTR_INIT_NG has, and
5020 // skip the pattern location counter past
5021 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5022 fp->fPatIdx += 3;
5023 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5024 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5025 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5026 U_ASSERT(minCount>=0);
5027 U_ASSERT(maxCount>=minCount || maxCount==-1);
5028 U_ASSERT(loopLoc>fp->fPatIdx);
5029 if (maxCount == -1) {
5030 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
5031 }
5032
5033 if (minCount == 0) {
5034 if (maxCount != 0) {
5035 fp = StateSave(fp, fp->fPatIdx, status);
5036 }
5037 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
5038 }
5039 }
5040 break;
5041
5042 case URX_CTR_LOOP_NG:
5043 {
5044 // Non-greedy {min, max} loops
5045 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5046 int32_t initOp = (int32_t)pat[opValue];
5047 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5048 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5049 int32_t minCount = (int32_t)pat[opValue+2];
5050 int32_t maxCount = (int32_t)pat[opValue+3];
5051
5052 (*pCounter)++;
5053 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5054 // The loop has matched the maximum permitted number of times.
5055 // Break out of here with no action. Matching will
5056 // continue with the following pattern.
5057 U_ASSERT(*pCounter == maxCount);
5058 break;
5059 }
5060
5061 if (*pCounter < minCount) {
5062 // We haven't met the minimum number of matches yet.
5063 // Loop back for another one.
5064 fp->fPatIdx = opValue + 4; // Loop back.
5065 fTickCounter--;
5066 if (fTickCounter <= 0) {
5067 IncrementTime(status); // Re-initializes fTickCounter
5068 }
5069 } else {
5070 // We do have the minimum number of matches.
5071
5072 // If there is no upper bound on the loop iterations, check that the input index
5073 // is progressing, and stop the loop if it is not.
5074 if (maxCount == -1) {
5075 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5076 if (fp->fInputIdx == *pLastInputIdx) {
5077 break;
5078 }
5079 *pLastInputIdx = fp->fInputIdx;
5080 }
5081
5082 // Loop Continuation: we will fall into the pattern following the loop
5083 // (non-greedy, don't execute loop body first), but first do
5084 // a state save to the top of the loop, so that a match failure
5085 // in the following pattern will try another iteration of the loop.
5086 fp = StateSave(fp, opValue + 4, status);
5087 }
5088 }
5089 break;
5090
5091 case URX_STO_SP:
5092 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5093 fData[opValue] = fStack->size();
5094 break;
5095
5096 case URX_LD_SP:
5097 {
5098 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5099 int32_t newStackSize = (int32_t)fData[opValue];
5100 U_ASSERT(newStackSize <= fStack->size());
5101 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5102 if (newFP == (int64_t *)fp) {
5103 break;
5104 }
5105 int32_t j;
5106 for (j=0; j<fFrameSize; j++) {
5107 newFP[j] = ((int64_t *)fp)[j];
5108 }
5109 fp = (REStackFrame *)newFP;
5110 fStack->setSize(newStackSize);
5111 }
5112 break;
5113
5114 case URX_BACKREF:
5115 {
5116 U_ASSERT(opValue < fFrameSize);
5117 int64_t groupStartIdx = fp->fExtra[opValue];
5118 int64_t groupEndIdx = fp->fExtra[opValue+1];
5119 U_ASSERT(groupStartIdx <= groupEndIdx);
5120 int64_t inputIndex = fp->fInputIdx;
5121 if (groupStartIdx < 0) {
5122 // This capture group has not participated in the match thus far,
5123 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5124 break;
5125 }
5126 UBool success = TRUE;
5127 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5128 if (inputIndex >= fActiveLimit) {
5129 success = FALSE;
5130 fHitEnd = TRUE;
5131 break;
5132 }
5133 if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5134 success = FALSE;
5135 break;
5136 }
5137 }
5138 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
5139 inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
5140 // Capture group ended with an unpaired lead surrogate.
5141 // Back reference is not permitted to match lead only of a surrogatge pair.
5142 success = FALSE;
5143 }
5144 if (success) {
5145 fp->fInputIdx = inputIndex;
5146 } else {
5147 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5148 }
5149 }
5150 break;
5151
5152 case URX_BACKREF_I:
5153 {
5154 U_ASSERT(opValue < fFrameSize);
5155 int64_t groupStartIdx = fp->fExtra[opValue];
5156 int64_t groupEndIdx = fp->fExtra[opValue+1];
5157 U_ASSERT(groupStartIdx <= groupEndIdx);
5158 if (groupStartIdx < 0) {
5159 // This capture group has not participated in the match thus far,
5160 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5161 break;
5162 }
5163 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5164 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5165
5166 // Note: if the capture group match was of an empty string the backref
5167 // match succeeds. Verified by testing: Perl matches succeed
5168 // in this case, so we do too.
5169
5170 UBool success = TRUE;
5171 for (;;) {
5172 UChar32 captureGroupChar = captureGroupItr.next();
5173 if (captureGroupChar == U_SENTINEL) {
5174 success = TRUE;
5175 break;
5176 }
5177 UChar32 inputChar = inputItr.next();
5178 if (inputChar == U_SENTINEL) {
5179 success = FALSE;
5180 fHitEnd = TRUE;
5181 break;
5182 }
5183 if (inputChar != captureGroupChar) {
5184 success = FALSE;
5185 break;
5186 }
5187 }
5188
5189 if (success && inputItr.inExpansion()) {
5190 // We otained a match by consuming part of a string obtained from
5191 // case-folding a single code point of the input text.
5192 // This does not count as an overall match.
5193 success = FALSE;
5194 }
5195
5196 if (success) {
5197 fp->fInputIdx = inputItr.getIndex();
5198 } else {
5199 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5200 }
5201 }
5202 break;
5203
5204 case URX_STO_INP_LOC:
5205 {
5206 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5207 fp->fExtra[opValue] = fp->fInputIdx;
5208 }
5209 break;
5210
5211 case URX_JMPX:
5212 {
5213 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5214 fp->fPatIdx += 1;
5215 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
5216 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5217 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5218 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5219 if (savedInputIdx < fp->fInputIdx) {
5220 fp->fPatIdx = opValue; // JMP
5221 } else {
5222 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
5223 }
5224 }
5225 break;
5226
5227 case URX_LA_START:
5228 {
5229 // Entering a look around block.
5230 // Save Stack Ptr, Input Pos.
5231 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5232 fData[opValue] = fStack->size();
5233 fData[opValue+1] = fp->fInputIdx;
5234 fData[opValue+2] = fActiveStart;
5235 fData[opValue+3] = fActiveLimit;
5236 fActiveStart = fLookStart; // Set the match region change for
5237 fActiveLimit = fLookLimit; // transparent bounds.
5238 }
5239 break;
5240
5241 case URX_LA_END:
5242 {
5243 // Leaving a look around block.
5244 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5245 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5246 int32_t stackSize = fStack->size();
5247 int32_t newStackSize = (int32_t)fData[opValue];
5248 U_ASSERT(stackSize >= newStackSize);
5249 if (stackSize > newStackSize) {
5250 // Copy the current top frame back to the new (cut back) top frame.
5251 // This makes the capture groups from within the look-ahead
5252 // expression available.
5253 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5254 int32_t j;
5255 for (j=0; j<fFrameSize; j++) {
5256 newFP[j] = ((int64_t *)fp)[j];
5257 }
5258 fp = (REStackFrame *)newFP;
5259 fStack->setSize(newStackSize);
5260 }
5261 fp->fInputIdx = fData[opValue+1];
5262
5263 // Restore the active region bounds in the input string; they may have
5264 // been changed because of transparent bounds on a Region.
5265 fActiveStart = fData[opValue+2];
5266 fActiveLimit = fData[opValue+3];
5267 U_ASSERT(fActiveStart >= 0);
5268 U_ASSERT(fActiveLimit <= fInputLength);
5269 }
5270 break;
5271
5272 case URX_ONECHAR_I:
5273 if (fp->fInputIdx < fActiveLimit) {
5274 UChar32 c;
5275 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5276 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5277 break;
5278 }
5279 } else {
5280 fHitEnd = TRUE;
5281 }
5282 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5283 break;
5284
5285 case URX_STRING_I:
5286 // Case-insensitive test input against a literal string.
5287 // Strings require two slots in the compiled pattern, one for the
5288 // offset to the string text, and one for the length.
5289 // The compiled string has already been case folded.
5290 {
5291 const UChar *patternString = litText + opValue;
5292
5293 op = (int32_t)pat[fp->fPatIdx];
5294 fp->fPatIdx++;
5295 opType = URX_TYPE(op);
5296 opValue = URX_VAL(op);
5297 U_ASSERT(opType == URX_STRING_LEN);
5298 int32_t patternStringLen = opValue; // Length of the string from the pattern.
5299
5300 UChar32 cText;
5301 UChar32 cPattern;
5302 UBool success = TRUE;
5303 int32_t patternStringIdx = 0;
5304 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5305 while (patternStringIdx < patternStringLen) {
5306 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5307 cText = inputIterator.next();
5308 if (cText != cPattern) {
5309 success = FALSE;
5310 if (cText == U_SENTINEL) {
5311 fHitEnd = TRUE;
5312 }
5313 break;
5314 }
5315 }
5316 if (inputIterator.inExpansion()) {
5317 success = FALSE;
5318 }
5319
5320 if (success) {
5321 fp->fInputIdx = inputIterator.getIndex();
5322 } else {
5323 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5324 }
5325 }
5326 break;
5327
5328 case URX_LB_START:
5329 {
5330 // Entering a look-behind block.
5331 // Save Stack Ptr, Input Pos and active input region.
5332 // TODO: implement transparent bounds. Ticket #6067
5333 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5334 fData[opValue] = fStack->size();
5335 fData[opValue+1] = fp->fInputIdx;
5336 // Save input string length, then reset to pin any matches to end at
5337 // the current position.
5338 fData[opValue+2] = fActiveStart;
5339 fData[opValue+3] = fActiveLimit;
5340 fActiveStart = fRegionStart;
5341 fActiveLimit = fp->fInputIdx;
5342 // Init the variable containing the start index for attempted matches.
5343 fData[opValue+4] = -1;
5344 }
5345 break;
5346
5347
5348 case URX_LB_CONT:
5349 {
5350 // Positive Look-Behind, at top of loop checking for matches of LB expression
5351 // at all possible input starting positions.
5352
5353 // Fetch the min and max possible match lengths. They are the operands
5354 // of this op in the pattern.
5355 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5356 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5357 U_ASSERT(minML <= maxML);
5358 U_ASSERT(minML >= 0);
5359
5360 // Fetch (from data) the last input index where a match was attempted.
5361 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5362 int64_t &lbStartIdx = fData[opValue+4];
5363 if (lbStartIdx < 0) {
5364 // First time through loop.
5365 lbStartIdx = fp->fInputIdx - minML;
5366 if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5367 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5368 }
5369 } else {
5370 // 2nd through nth time through the loop.
5371 // Back up start position for match by one.
5372 if (lbStartIdx == 0) {
5373 lbStartIdx--;
5374 } else {
5375 U16_BACK_1(inputBuf, 0, lbStartIdx);
5376 }
5377 }
5378
5379 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5380 // We have tried all potential match starting points without
5381 // getting a match. Backtrack out, and out of the
5382 // Look Behind altogether.
5383 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5384 fActiveStart = fData[opValue+2];
5385 fActiveLimit = fData[opValue+3];
5386 U_ASSERT(fActiveStart >= 0);
5387 U_ASSERT(fActiveLimit <= fInputLength);
5388 break;
5389 }
5390
5391 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5392 // (successful match will fall off the end of the loop.)
5393 fp = StateSave(fp, fp->fPatIdx-3, status);
5394 fp->fInputIdx = lbStartIdx;
5395 }
5396 break;
5397
5398 case URX_LB_END:
5399 // End of a look-behind block, after a successful match.
5400 {
5401 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5402 if (fp->fInputIdx != fActiveLimit) {
5403 // The look-behind expression matched, but the match did not
5404 // extend all the way to the point that we are looking behind from.
5405 // FAIL out of here, which will take us back to the LB_CONT, which
5406 // will retry the match starting at another position or fail
5407 // the look-behind altogether, whichever is appropriate.
5408 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5409 break;
5410 }
5411
5412 // Look-behind match is good. Restore the orignal input string region,
5413 // which had been truncated to pin the end of the lookbehind match to the
5414 // position being looked-behind.
5415 fActiveStart = fData[opValue+2];
5416 fActiveLimit = fData[opValue+3];
5417 U_ASSERT(fActiveStart >= 0);
5418 U_ASSERT(fActiveLimit <= fInputLength);
5419 }
5420 break;
5421
5422
5423 case URX_LBN_CONT:
5424 {
5425 // Negative Look-Behind, at top of loop checking for matches of LB expression
5426 // at all possible input starting positions.
5427
5428 // Fetch the extra parameters of this op.
5429 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5430 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5431 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5432 continueLoc = URX_VAL(continueLoc);
5433 U_ASSERT(minML <= maxML);
5434 U_ASSERT(minML >= 0);
5435 U_ASSERT(continueLoc > fp->fPatIdx);
5436
5437 // Fetch (from data) the last input index where a match was attempted.
5438 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5439 int64_t &lbStartIdx = fData[opValue+4];
5440 if (lbStartIdx < 0) {
5441 // First time through loop.
5442 lbStartIdx = fp->fInputIdx - minML;
5443 if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5444 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5445 }
5446 } else {
5447 // 2nd through nth time through the loop.
5448 // Back up start position for match by one.
5449 if (lbStartIdx == 0) {
5450 lbStartIdx--; // Because U16_BACK is unsafe starting at 0.
5451 } else {
5452 U16_BACK_1(inputBuf, 0, lbStartIdx);
5453 }
5454 }
5455
5456 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5457 // We have tried all potential match starting points without
5458 // getting a match, which means that the negative lookbehind as
5459 // a whole has succeeded. Jump forward to the continue location
5460 fActiveStart = fData[opValue+2];
5461 fActiveLimit = fData[opValue+3];
5462 U_ASSERT(fActiveStart >= 0);
5463 U_ASSERT(fActiveLimit <= fInputLength);
5464 fp->fPatIdx = continueLoc;
5465 break;
5466 }
5467
5468 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5469 // (successful match will cause a FAIL out of the loop altogether.)
5470 fp = StateSave(fp, fp->fPatIdx-4, status);
5471 fp->fInputIdx = lbStartIdx;
5472 }
5473 break;
5474
5475 case URX_LBN_END:
5476 // End of a negative look-behind block, after a successful match.
5477 {
5478 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5479 if (fp->fInputIdx != fActiveLimit) {
5480 // The look-behind expression matched, but the match did not
5481 // extend all the way to the point that we are looking behind from.
5482 // FAIL out of here, which will take us back to the LB_CONT, which
5483 // will retry the match starting at another position or succeed
5484 // the look-behind altogether, whichever is appropriate.
5485 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5486 break;
5487 }
5488
5489 // Look-behind expression matched, which means look-behind test as
5490 // a whole Fails
5491
5492 // Restore the orignal input string length, which had been truncated
5493 // inorder to pin the end of the lookbehind match
5494 // to the position being looked-behind.
5495 fActiveStart = fData[opValue+2];
5496 fActiveLimit = fData[opValue+3];
5497 U_ASSERT(fActiveStart >= 0);
5498 U_ASSERT(fActiveLimit <= fInputLength);
5499
5500 // Restore original stack position, discarding any state saved
5501 // by the successful pattern match.
5502 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5503 int32_t newStackSize = (int32_t)fData[opValue];
5504 U_ASSERT(fStack->size() > newStackSize);
5505 fStack->setSize(newStackSize);
5506
5507 // FAIL, which will take control back to someplace
5508 // prior to entering the look-behind test.
5509 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5510 }
5511 break;
5512
5513
5514 case URX_LOOP_SR_I:
5515 // Loop Initialization for the optimized implementation of
5516 // [some character set]*
5517 // This op scans through all matching input.
5518 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5519 {
5520 U_ASSERT(opValue > 0 && opValue < fSets->size());
5521 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5522 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
5523
5524 // Loop through input, until either the input is exhausted or
5525 // we reach a character that is not a member of the set.
5526 int32_t ix = (int32_t)fp->fInputIdx;
5527 for (;;) {
5528 if (ix >= fActiveLimit) {
5529 fHitEnd = TRUE;
5530 break;
5531 }
5532 UChar32 c;
5533 U16_NEXT(inputBuf, ix, fActiveLimit, c);
5534 if (c<256) {
5535 if (s8->contains(c) == FALSE) {
5536 U16_BACK_1(inputBuf, 0, ix);
5537 break;
5538 }
5539 } else {
5540 if (s->contains(c) == FALSE) {
5541 U16_BACK_1(inputBuf, 0, ix);
5542 break;
5543 }
5544 }
5545 }
5546
5547 // If there were no matching characters, skip over the loop altogether.
5548 // The loop doesn't run at all, a * op always succeeds.
5549 if (ix == fp->fInputIdx) {
5550 fp->fPatIdx++; // skip the URX_LOOP_C op.
5551 break;
5552 }
5553
5554 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5555 // must follow. It's operand is the stack location
5556 // that holds the starting input index for the match of this [set]*
5557 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5558 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5559 int32_t stackLoc = URX_VAL(loopcOp);
5560 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5561 fp->fExtra[stackLoc] = fp->fInputIdx;
5562 fp->fInputIdx = ix;
5563
5564 // Save State to the URX_LOOP_C op that follows this one,
5565 // so that match failures in the following code will return to there.
5566 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5567 fp = StateSave(fp, fp->fPatIdx, status);
5568 fp->fPatIdx++;
5569 }
5570 break;
5571
5572
5573 case URX_LOOP_DOT_I:
5574 // Loop Initialization for the optimized implementation of .*
5575 // This op scans through all remaining input.
5576 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5577 {
5578 // Loop through input until the input is exhausted (we reach an end-of-line)
5579 // In DOTALL mode, we can just go straight to the end of the input.
5580 int32_t ix;
5581 if ((opValue & 1) == 1) {
5582 // Dot-matches-All mode. Jump straight to the end of the string.
5583 ix = (int32_t)fActiveLimit;
5584 fHitEnd = TRUE;
5585 } else {
5586 // NOT DOT ALL mode. Line endings do not match '.'
5587 // Scan forward until a line ending or end of input.
5588 ix = (int32_t)fp->fInputIdx;
5589 for (;;) {
5590 if (ix >= fActiveLimit) {
5591 fHitEnd = TRUE;
5592 break;
5593 }
5594 UChar32 c;
5595 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
5596 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5597 if ((c == 0x0a) || // 0x0a is newline in both modes.
5598 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
5599 isLineTerminator(c))) {
5600 // char is a line ending. Put the input pos back to the
5601 // line ending char, and exit the scanning loop.
5602 U16_BACK_1(inputBuf, 0, ix);
5603 break;
5604 }
5605 }
5606 }
5607 }
5608
5609 // If there were no matching characters, skip over the loop altogether.
5610 // The loop doesn't run at all, a * op always succeeds.
5611 if (ix == fp->fInputIdx) {
5612 fp->fPatIdx++; // skip the URX_LOOP_C op.
5613 break;
5614 }
5615
5616 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5617 // must follow. It's operand is the stack location
5618 // that holds the starting input index for the match of this .*
5619 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5620 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5621 int32_t stackLoc = URX_VAL(loopcOp);
5622 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5623 fp->fExtra[stackLoc] = fp->fInputIdx;
5624 fp->fInputIdx = ix;
5625
5626 // Save State to the URX_LOOP_C op that follows this one,
5627 // so that match failures in the following code will return to there.
5628 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5629 fp = StateSave(fp, fp->fPatIdx, status);
5630 fp->fPatIdx++;
5631 }
5632 break;
5633
5634
5635 case URX_LOOP_C:
5636 {
5637 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5638 backSearchIndex = (int32_t)fp->fExtra[opValue];
5639 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5640 if (backSearchIndex == fp->fInputIdx) {
5641 // We've backed up the input idx to the point that the loop started.
5642 // The loop is done. Leave here without saving state.
5643 // Subsequent failures won't come back here.
5644 break;
5645 }
5646 // Set up for the next iteration of the loop, with input index
5647 // backed up by one from the last time through,
5648 // and a state save to this instruction in case the following code fails again.
5649 // (We're going backwards because this loop emulates stack unwinding, not
5650 // the initial scan forward.)
5651 U_ASSERT(fp->fInputIdx > 0);
5652 UChar32 prevC;
5653 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5654
5655 if (prevC == 0x0a &&
5656 fp->fInputIdx > backSearchIndex &&
5657 inputBuf[fp->fInputIdx-1] == 0x0d) {
5658 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5659 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5660 // .*, stepping back over CRLF pair.
5661 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5662 }
5663 }
5664
5665
5666 fp = StateSave(fp, fp->fPatIdx-1, status);
5667 }
5668 break;
5669
5670
5671
5672 default:
5673 // Trouble. The compiled pattern contains an entry with an
5674 // unrecognized type tag.
5675 UPRV_UNREACHABLE;
5676 }
5677
5678 if (U_FAILURE(status)) {
5679 isMatch = FALSE;
5680 break;
5681 }
5682 }
5683
5684breakFromLoop:
5685 fMatch = isMatch;
5686 if (isMatch) {
5687 fLastMatchEnd = fMatchEnd;
5688 fMatchStart = startIdx;
5689 fMatchEnd = fp->fInputIdx;
5690 }
5691
5692#ifdef REGEX_RUN_DEBUG
5693 if (fTraceDebug) {
5694 if (isMatch) {
5695 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
5696 } else {
5697 printf("No match\n\n");
5698 }
5699 }
5700#endif
5701
5702 fFrame = fp; // The active stack frame when the engine stopped.
5703 // Contains the capture group results that we need to
5704 // access later.
5705
5706 return;
5707}
5708
5709
5710UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5711
5712U_NAMESPACE_END
5713
5714#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
5715
5716