1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**************************************************************************
5* Copyright (C) 2002-2016 International Business Machines Corporation
6* and others. All rights reserved.
7**************************************************************************
8*/
9//
10// file: rematch.cpp
11//
12// Contains the implementation of class RegexMatcher,
13// which is one of the main API classes for the ICU regular expression package.
14//
15
16#include "unicode/utypes.h"
17#if !UCONFIG_NO_REGULAR_EXPRESSIONS
18
19#include "unicode/regex.h"
20#include "unicode/uniset.h"
21#include "unicode/uchar.h"
22#include "unicode/ustring.h"
23#include "unicode/rbbi.h"
24#include "unicode/utf.h"
25#include "unicode/utf16.h"
26#include "uassert.h"
27#include "cmemory.h"
28#include "cstr.h"
29#include "uvector.h"
30#include "uvectr32.h"
31#include "uvectr64.h"
32#include "regeximp.h"
33#include "regexst.h"
34#include "regextxt.h"
35#include "ucase.h"
36
37// #include <malloc.h> // Needed for heapcheck testing
38
39
40U_NAMESPACE_BEGIN
41
42// Default limit for the size of the back track stack, to avoid system
43// failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
44// This value puts ICU's limits higher than most other regexp implementations,
45// which use recursion rather than the heap, and take more storage per
46// backtrack point.
47//
48static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
49
50// Time limit counter constant.
51// Time limits for expression evaluation are in terms of quanta of work by
52// the engine, each of which is 10,000 state saves.
53// This constant determines that state saves per tick number.
54static const int32_t TIMER_INITIAL_VALUE = 10000;
55
56
57// Test for any of the Unicode line terminating characters.
58static inline UBool isLineTerminator(UChar32 c) {
59 if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
60 return false;
61 }
62 return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
63}
64
65//-----------------------------------------------------------------------------
66//
67// Constructor and Destructor
68//
69//-----------------------------------------------------------------------------
70RegexMatcher::RegexMatcher(const RegexPattern *pat) {
71 fDeferredStatus = U_ZERO_ERROR;
72 init(fDeferredStatus);
73 if (U_FAILURE(fDeferredStatus)) {
74 return;
75 }
76 if (pat==NULL) {
77 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
78 return;
79 }
80 fPattern = pat;
81 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
82}
83
84
85
86RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
87 uint32_t flags, UErrorCode &status) {
88 init(status);
89 if (U_FAILURE(status)) {
90 return;
91 }
92 UParseError pe;
93 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
94 fPattern = fPatternOwned;
95
96 UText inputText = UTEXT_INITIALIZER;
97 utext_openConstUnicodeString(&inputText, &input, &status);
98 init2(&inputText, status);
99 utext_close(&inputText);
100
101 fInputUniStrMaybeMutable = TRUE;
102}
103
104
105RegexMatcher::RegexMatcher(UText *regexp, UText *input,
106 uint32_t flags, UErrorCode &status) {
107 init(status);
108 if (U_FAILURE(status)) {
109 return;
110 }
111 UParseError pe;
112 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
113 if (U_FAILURE(status)) {
114 return;
115 }
116
117 fPattern = fPatternOwned;
118 init2(input, status);
119}
120
121
122RegexMatcher::RegexMatcher(const UnicodeString &regexp,
123 uint32_t flags, UErrorCode &status) {
124 init(status);
125 if (U_FAILURE(status)) {
126 return;
127 }
128 UParseError pe;
129 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
130 if (U_FAILURE(status)) {
131 return;
132 }
133 fPattern = fPatternOwned;
134 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
135}
136
137RegexMatcher::RegexMatcher(UText *regexp,
138 uint32_t flags, UErrorCode &status) {
139 init(status);
140 if (U_FAILURE(status)) {
141 return;
142 }
143 UParseError pe;
144 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
145 if (U_FAILURE(status)) {
146 return;
147 }
148
149 fPattern = fPatternOwned;
150 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
151}
152
153
154
155
156RegexMatcher::~RegexMatcher() {
157 delete fStack;
158 if (fData != fSmallData) {
159 uprv_free(fData);
160 fData = NULL;
161 }
162 if (fPatternOwned) {
163 delete fPatternOwned;
164 fPatternOwned = NULL;
165 fPattern = NULL;
166 }
167
168 if (fInput) {
169 delete fInput;
170 }
171 if (fInputText) {
172 utext_close(fInputText);
173 }
174 if (fAltInputText) {
175 utext_close(fAltInputText);
176 }
177
178 #if UCONFIG_NO_BREAK_ITERATION==0
179 delete fWordBreakItr;
180 #endif
181}
182
183//
184// init() common initialization for use by all constructors.
185// Initialize all fields, get the object into a consistent state.
186// This must be done even when the initial status shows an error,
187// so that the object is initialized sufficiently well for the destructor
188// to run safely.
189//
190void RegexMatcher::init(UErrorCode &status) {
191 fPattern = NULL;
192 fPatternOwned = NULL;
193 fFrameSize = 0;
194 fRegionStart = 0;
195 fRegionLimit = 0;
196 fAnchorStart = 0;
197 fAnchorLimit = 0;
198 fLookStart = 0;
199 fLookLimit = 0;
200 fActiveStart = 0;
201 fActiveLimit = 0;
202 fTransparentBounds = FALSE;
203 fAnchoringBounds = TRUE;
204 fMatch = FALSE;
205 fMatchStart = 0;
206 fMatchEnd = 0;
207 fLastMatchEnd = -1;
208 fAppendPosition = 0;
209 fHitEnd = FALSE;
210 fRequireEnd = FALSE;
211 fStack = NULL;
212 fFrame = NULL;
213 fTimeLimit = 0;
214 fTime = 0;
215 fTickCounter = 0;
216 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
217 fCallbackFn = NULL;
218 fCallbackContext = NULL;
219 fFindProgressCallbackFn = NULL;
220 fFindProgressCallbackContext = NULL;
221 fTraceDebug = FALSE;
222 fDeferredStatus = status;
223 fData = fSmallData;
224 fWordBreakItr = NULL;
225
226 fStack = NULL;
227 fInputText = NULL;
228 fAltInputText = NULL;
229 fInput = NULL;
230 fInputLength = 0;
231 fInputUniStrMaybeMutable = FALSE;
232}
233
234//
235// init2() Common initialization for use by RegexMatcher constructors, part 2.
236// This handles the common setup to be done after the Pattern is available.
237//
238void RegexMatcher::init2(UText *input, UErrorCode &status) {
239 if (U_FAILURE(status)) {
240 fDeferredStatus = status;
241 return;
242 }
243
244 if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
245 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
246 if (fData == NULL) {
247 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
248 return;
249 }
250 }
251
252 fStack = new UVector64(status);
253 if (fStack == NULL) {
254 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
255 return;
256 }
257
258 reset(input);
259 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
260 if (U_FAILURE(status)) {
261 fDeferredStatus = status;
262 return;
263 }
264}
265
266
267static const UChar BACKSLASH = 0x5c;
268static const UChar DOLLARSIGN = 0x24;
269static const UChar LEFTBRACKET = 0x7b;
270static const UChar RIGHTBRACKET = 0x7d;
271
272//--------------------------------------------------------------------------------
273//
274// appendReplacement
275//
276//--------------------------------------------------------------------------------
277RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
278 const UnicodeString &replacement,
279 UErrorCode &status) {
280 UText replacementText = UTEXT_INITIALIZER;
281
282 utext_openConstUnicodeString(&replacementText, &replacement, &status);
283 if (U_SUCCESS(status)) {
284 UText resultText = UTEXT_INITIALIZER;
285 utext_openUnicodeString(&resultText, &dest, &status);
286
287 if (U_SUCCESS(status)) {
288 appendReplacement(&resultText, &replacementText, status);
289 utext_close(&resultText);
290 }
291 utext_close(&replacementText);
292 }
293
294 return *this;
295}
296
297//
298// appendReplacement, UText mode
299//
300RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
301 UText *replacement,
302 UErrorCode &status) {
303 if (U_FAILURE(status)) {
304 return *this;
305 }
306 if (U_FAILURE(fDeferredStatus)) {
307 status = fDeferredStatus;
308 return *this;
309 }
310 if (fMatch == FALSE) {
311 status = U_REGEX_INVALID_STATE;
312 return *this;
313 }
314
315 // Copy input string from the end of previous match to start of current match
316 int64_t destLen = utext_nativeLength(dest);
317 if (fMatchStart > fAppendPosition) {
318 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
319 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
320 (int32_t)(fMatchStart-fAppendPosition), &status);
321 } else {
322 int32_t len16;
323 if (UTEXT_USES_U16(fInputText)) {
324 len16 = (int32_t)(fMatchStart-fAppendPosition);
325 } else {
326 UErrorCode lengthStatus = U_ZERO_ERROR;
327 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
328 }
329 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
330 if (inputChars == NULL) {
331 status = U_MEMORY_ALLOCATION_ERROR;
332 return *this;
333 }
334 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
335 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
336 uprv_free(inputChars);
337 }
338 }
339 fAppendPosition = fMatchEnd;
340
341
342 // scan the replacement text, looking for substitutions ($n) and \escapes.
343 // TODO: optimize this loop by efficiently scanning for '$' or '\',
344 // move entire ranges not containing substitutions.
345 UTEXT_SETNATIVEINDEX(replacement, 0);
346 for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) {
347 if (c == BACKSLASH) {
348 // Backslash Escape. Copy the following char out without further checks.
349 // Note: Surrogate pairs don't need any special handling
350 // The second half wont be a '$' or a '\', and
351 // will move to the dest normally on the next
352 // loop iteration.
353 c = UTEXT_CURRENT32(replacement);
354 if (c == U_SENTINEL) {
355 break;
356 }
357
358 if (c==0x55/*U*/ || c==0x75/*u*/) {
359 // We have a \udddd or \Udddddddd escape sequence.
360 int32_t offset = 0;
361 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
362 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
363 if (escapedChar != (UChar32)0xFFFFFFFF) {
364 if (U_IS_BMP(escapedChar)) {
365 UChar c16 = (UChar)escapedChar;
366 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
367 } else {
368 UChar surrogate[2];
369 surrogate[0] = U16_LEAD(escapedChar);
370 surrogate[1] = U16_TRAIL(escapedChar);
371 if (U_SUCCESS(status)) {
372 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
373 }
374 }
375 // TODO: Report errors for mal-formed \u escapes?
376 // As this is, the original sequence is output, which may be OK.
377 if (context.lastOffset == offset) {
378 (void)UTEXT_PREVIOUS32(replacement);
379 } else if (context.lastOffset != offset-1) {
380 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
381 }
382 }
383 } else {
384 (void)UTEXT_NEXT32(replacement);
385 // Plain backslash escape. Just put out the escaped character.
386 if (U_IS_BMP(c)) {
387 UChar c16 = (UChar)c;
388 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
389 } else {
390 UChar surrogate[2];
391 surrogate[0] = U16_LEAD(c);
392 surrogate[1] = U16_TRAIL(c);
393 if (U_SUCCESS(status)) {
394 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
395 }
396 }
397 }
398 } else if (c != DOLLARSIGN) {
399 // Normal char, not a $. Copy it out without further checks.
400 if (U_IS_BMP(c)) {
401 UChar c16 = (UChar)c;
402 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
403 } else {
404 UChar surrogate[2];
405 surrogate[0] = U16_LEAD(c);
406 surrogate[1] = U16_TRAIL(c);
407 if (U_SUCCESS(status)) {
408 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
409 }
410 }
411 } else {
412 // We've got a $. Pick up a capture group name or number if one follows.
413 // Consume digits so long as the resulting group number <= the number of
414 // number of capture groups in the pattern.
415
416 int32_t groupNum = 0;
417 int32_t numDigits = 0;
418 UChar32 nextChar = utext_current32(replacement);
419 if (nextChar == LEFTBRACKET) {
420 // Scan for a Named Capture Group, ${name}.
421 UnicodeString groupName;
422 utext_next32(replacement);
423 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
424 nextChar = utext_next32(replacement);
425 if (nextChar == U_SENTINEL) {
426 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
427 } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z
428 (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z
429 (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
430 groupName.append(nextChar);
431 } else if (nextChar == RIGHTBRACKET) {
432 groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0;
433 if (groupNum == 0) {
434 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
435 }
436 } else {
437 // Character was something other than a name char or a closing '}'
438 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
439 }
440 }
441
442 } else if (u_isdigit(nextChar)) {
443 // $n Scan for a capture group number
444 int32_t numCaptureGroups = fPattern->fGroupMap->size();
445 for (;;) {
446 nextChar = UTEXT_CURRENT32(replacement);
447 if (nextChar == U_SENTINEL) {
448 break;
449 }
450 if (u_isdigit(nextChar) == FALSE) {
451 break;
452 }
453 int32_t nextDigitVal = u_charDigitValue(nextChar);
454 if (groupNum*10 + nextDigitVal > numCaptureGroups) {
455 // Don't consume the next digit if it makes the capture group number too big.
456 if (numDigits == 0) {
457 status = U_INDEX_OUTOFBOUNDS_ERROR;
458 }
459 break;
460 }
461 (void)UTEXT_NEXT32(replacement);
462 groupNum=groupNum*10 + nextDigitVal;
463 ++numDigits;
464 }
465 } else {
466 // $ not followed by capture group name or number.
467 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
468 }
469
470 if (U_SUCCESS(status)) {
471 destLen += appendGroup(groupNum, dest, status);
472 }
473 } // End of $ capture group handling
474 } // End of per-character loop through the replacement string.
475
476 return *this;
477}
478
479
480
481//--------------------------------------------------------------------------------
482//
483// appendTail Intended to be used in conjunction with appendReplacement()
484// To the destination string, append everything following
485// the last match position from the input string.
486//
487// Note: Match ranges do not affect appendTail or appendReplacement
488//
489//--------------------------------------------------------------------------------
490UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
491 UErrorCode status = U_ZERO_ERROR;
492 UText resultText = UTEXT_INITIALIZER;
493 utext_openUnicodeString(&resultText, &dest, &status);
494
495 if (U_SUCCESS(status)) {
496 appendTail(&resultText, status);
497 utext_close(&resultText);
498 }
499
500 return dest;
501}
502
503//
504// appendTail, UText mode
505//
506UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
507 if (U_FAILURE(status)) {
508 return dest;
509 }
510 if (U_FAILURE(fDeferredStatus)) {
511 status = fDeferredStatus;
512 return dest;
513 }
514
515 if (fInputLength > fAppendPosition) {
516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
517 int64_t destLen = utext_nativeLength(dest);
518 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
519 (int32_t)(fInputLength-fAppendPosition), &status);
520 } else {
521 int32_t len16;
522 if (UTEXT_USES_U16(fInputText)) {
523 len16 = (int32_t)(fInputLength-fAppendPosition);
524 } else {
525 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
526 status = U_ZERO_ERROR; // buffer overflow
527 }
528
529 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
530 if (inputChars == NULL) {
531 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
532 } else {
533 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
534 int64_t destLen = utext_nativeLength(dest);
535 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
536 uprv_free(inputChars);
537 }
538 }
539 }
540 return dest;
541}
542
543
544
545//--------------------------------------------------------------------------------
546//
547// end
548//
549//--------------------------------------------------------------------------------
550int32_t RegexMatcher::end(UErrorCode &err) const {
551 return end(0, err);
552}
553
554int64_t RegexMatcher::end64(UErrorCode &err) const {
555 return end64(0, err);
556}
557
558int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
559 if (U_FAILURE(err)) {
560 return -1;
561 }
562 if (fMatch == FALSE) {
563 err = U_REGEX_INVALID_STATE;
564 return -1;
565 }
566 if (group < 0 || group > fPattern->fGroupMap->size()) {
567 err = U_INDEX_OUTOFBOUNDS_ERROR;
568 return -1;
569 }
570 int64_t e = -1;
571 if (group == 0) {
572 e = fMatchEnd;
573 } else {
574 // Get the position within the stack frame of the variables for
575 // this capture group.
576 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
577 U_ASSERT(groupOffset < fPattern->fFrameSize);
578 U_ASSERT(groupOffset >= 0);
579 e = fFrame->fExtra[groupOffset + 1];
580 }
581
582 return e;
583}
584
585int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
586 return (int32_t)end64(group, err);
587}
588
589//--------------------------------------------------------------------------------
590//
591// findProgressInterrupt This function is called once for each advance in the target
592// string from the find() function, and calls the user progress callback
593// function if there is one installed.
594//
595// Return: TRUE if the find operation is to be terminated.
596// FALSE if the find operation is to continue running.
597//
598//--------------------------------------------------------------------------------
599UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
600 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
601 status = U_REGEX_STOPPED_BY_CALLER;
602 return TRUE;
603 }
604 return FALSE;
605}
606
607//--------------------------------------------------------------------------------
608//
609// find()
610//
611//--------------------------------------------------------------------------------
612UBool RegexMatcher::find() {
613 if (U_FAILURE(fDeferredStatus)) {
614 return FALSE;
615 }
616 UErrorCode status = U_ZERO_ERROR;
617 UBool result = find(status);
618 return result;
619}
620
621//--------------------------------------------------------------------------------
622//
623// find()
624//
625//--------------------------------------------------------------------------------
626UBool RegexMatcher::find(UErrorCode &status) {
627 // Start at the position of the last match end. (Will be zero if the
628 // matcher has been reset.)
629 //
630 if (U_FAILURE(status)) {
631 return FALSE;
632 }
633 if (U_FAILURE(fDeferredStatus)) {
634 status = fDeferredStatus;
635 return FALSE;
636 }
637
638 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
639 return findUsingChunk(status);
640 }
641
642 int64_t startPos = fMatchEnd;
643 if (startPos==0) {
644 startPos = fActiveStart;
645 }
646
647 if (fMatch) {
648 // Save the position of any previous successful match.
649 fLastMatchEnd = fMatchEnd;
650
651 if (fMatchStart == fMatchEnd) {
652 // Previous match had zero length. Move start position up one position
653 // to avoid sending find() into a loop on zero-length matches.
654 if (startPos >= fActiveLimit) {
655 fMatch = FALSE;
656 fHitEnd = TRUE;
657 return FALSE;
658 }
659 UTEXT_SETNATIVEINDEX(fInputText, startPos);
660 (void)UTEXT_NEXT32(fInputText);
661 startPos = UTEXT_GETNATIVEINDEX(fInputText);
662 }
663 } else {
664 if (fLastMatchEnd >= 0) {
665 // A previous find() failed to match. Don't try again.
666 // (without this test, a pattern with a zero-length match
667 // could match again at the end of an input string.)
668 fHitEnd = TRUE;
669 return FALSE;
670 }
671 }
672
673
674 // Compute the position in the input string beyond which a match can not begin, because
675 // the minimum length match would extend past the end of the input.
676 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
677 // Be aware of possible overflows if making changes here.
678 int64_t testStartLimit;
679 if (UTEXT_USES_U16(fInputText)) {
680 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
681 if (startPos > testStartLimit) {
682 fMatch = FALSE;
683 fHitEnd = TRUE;
684 return FALSE;
685 }
686 } else {
687 // We don't know exactly how long the minimum match length is in native characters.
688 // Treat anything > 0 as 1.
689 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
690 }
691
692 UChar32 c;
693 U_ASSERT(startPos >= 0);
694
695 switch (fPattern->fStartType) {
696 case START_NO_INFO:
697 // No optimization was found.
698 // Try a match at each input position.
699 for (;;) {
700 MatchAt(startPos, FALSE, status);
701 if (U_FAILURE(status)) {
702 return FALSE;
703 }
704 if (fMatch) {
705 return TRUE;
706 }
707 if (startPos >= testStartLimit) {
708 fHitEnd = TRUE;
709 return FALSE;
710 }
711 UTEXT_SETNATIVEINDEX(fInputText, startPos);
712 (void)UTEXT_NEXT32(fInputText);
713 startPos = UTEXT_GETNATIVEINDEX(fInputText);
714 // Note that it's perfectly OK for a pattern to have a zero-length
715 // match at the end of a string, so we must make sure that the loop
716 // runs with startPos == testStartLimit the last time through.
717 if (findProgressInterrupt(startPos, status))
718 return FALSE;
719 }
720 UPRV_UNREACHABLE;
721
722 case START_START:
723 // Matches are only possible at the start of the input string
724 // (pattern begins with ^ or \A)
725 if (startPos > fActiveStart) {
726 fMatch = FALSE;
727 return FALSE;
728 }
729 MatchAt(startPos, FALSE, status);
730 if (U_FAILURE(status)) {
731 return FALSE;
732 }
733 return fMatch;
734
735
736 case START_SET:
737 {
738 // Match may start on any char from a pre-computed set.
739 U_ASSERT(fPattern->fMinMatchLen > 0);
740 UTEXT_SETNATIVEINDEX(fInputText, startPos);
741 for (;;) {
742 int64_t pos = startPos;
743 c = UTEXT_NEXT32(fInputText);
744 startPos = UTEXT_GETNATIVEINDEX(fInputText);
745 // c will be -1 (U_SENTINEL) at end of text, in which case we
746 // skip this next block (so we don't have a negative array index)
747 // and handle end of text in the following block.
748 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
749 (c>=256 && fPattern->fInitialChars->contains(c)))) {
750 MatchAt(pos, FALSE, status);
751 if (U_FAILURE(status)) {
752 return FALSE;
753 }
754 if (fMatch) {
755 return TRUE;
756 }
757 UTEXT_SETNATIVEINDEX(fInputText, pos);
758 }
759 if (startPos > testStartLimit) {
760 fMatch = FALSE;
761 fHitEnd = TRUE;
762 return FALSE;
763 }
764 if (findProgressInterrupt(startPos, status))
765 return FALSE;
766 }
767 }
768 UPRV_UNREACHABLE;
769
770 case START_STRING:
771 case START_CHAR:
772 {
773 // Match starts on exactly one char.
774 U_ASSERT(fPattern->fMinMatchLen > 0);
775 UChar32 theChar = fPattern->fInitialChar;
776 UTEXT_SETNATIVEINDEX(fInputText, startPos);
777 for (;;) {
778 int64_t pos = startPos;
779 c = UTEXT_NEXT32(fInputText);
780 startPos = UTEXT_GETNATIVEINDEX(fInputText);
781 if (c == theChar) {
782 MatchAt(pos, FALSE, status);
783 if (U_FAILURE(status)) {
784 return FALSE;
785 }
786 if (fMatch) {
787 return TRUE;
788 }
789 UTEXT_SETNATIVEINDEX(fInputText, startPos);
790 }
791 if (startPos > testStartLimit) {
792 fMatch = FALSE;
793 fHitEnd = TRUE;
794 return FALSE;
795 }
796 if (findProgressInterrupt(startPos, status))
797 return FALSE;
798 }
799 }
800 UPRV_UNREACHABLE;
801
802 case START_LINE:
803 {
804 UChar32 ch;
805 if (startPos == fAnchorStart) {
806 MatchAt(startPos, FALSE, status);
807 if (U_FAILURE(status)) {
808 return FALSE;
809 }
810 if (fMatch) {
811 return TRUE;
812 }
813 UTEXT_SETNATIVEINDEX(fInputText, startPos);
814 ch = UTEXT_NEXT32(fInputText);
815 startPos = UTEXT_GETNATIVEINDEX(fInputText);
816 } else {
817 UTEXT_SETNATIVEINDEX(fInputText, startPos);
818 ch = UTEXT_PREVIOUS32(fInputText);
819 UTEXT_SETNATIVEINDEX(fInputText, startPos);
820 }
821
822 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
823 for (;;) {
824 if (ch == 0x0a) {
825 MatchAt(startPos, FALSE, status);
826 if (U_FAILURE(status)) {
827 return FALSE;
828 }
829 if (fMatch) {
830 return TRUE;
831 }
832 UTEXT_SETNATIVEINDEX(fInputText, startPos);
833 }
834 if (startPos >= testStartLimit) {
835 fMatch = FALSE;
836 fHitEnd = TRUE;
837 return FALSE;
838 }
839 ch = UTEXT_NEXT32(fInputText);
840 startPos = UTEXT_GETNATIVEINDEX(fInputText);
841 // Note that it's perfectly OK for a pattern to have a zero-length
842 // match at the end of a string, so we must make sure that the loop
843 // runs with startPos == testStartLimit the last time through.
844 if (findProgressInterrupt(startPos, status))
845 return FALSE;
846 }
847 } else {
848 for (;;) {
849 if (isLineTerminator(ch)) {
850 if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
851 (void)UTEXT_NEXT32(fInputText);
852 startPos = UTEXT_GETNATIVEINDEX(fInputText);
853 }
854 MatchAt(startPos, FALSE, status);
855 if (U_FAILURE(status)) {
856 return FALSE;
857 }
858 if (fMatch) {
859 return TRUE;
860 }
861 UTEXT_SETNATIVEINDEX(fInputText, startPos);
862 }
863 if (startPos >= testStartLimit) {
864 fMatch = FALSE;
865 fHitEnd = TRUE;
866 return FALSE;
867 }
868 ch = UTEXT_NEXT32(fInputText);
869 startPos = UTEXT_GETNATIVEINDEX(fInputText);
870 // Note that it's perfectly OK for a pattern to have a zero-length
871 // match at the end of a string, so we must make sure that the loop
872 // runs with startPos == testStartLimit the last time through.
873 if (findProgressInterrupt(startPos, status))
874 return FALSE;
875 }
876 }
877 }
878
879 default:
880 UPRV_UNREACHABLE;
881 }
882
883 UPRV_UNREACHABLE;
884}
885
886
887
888UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
889 if (U_FAILURE(status)) {
890 return FALSE;
891 }
892 if (U_FAILURE(fDeferredStatus)) {
893 status = fDeferredStatus;
894 return FALSE;
895 }
896 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
897 // This will reset the region to be the full input length.
898 if (start < 0) {
899 status = U_INDEX_OUTOFBOUNDS_ERROR;
900 return FALSE;
901 }
902
903 int64_t nativeStart = start;
904 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
905 status = U_INDEX_OUTOFBOUNDS_ERROR;
906 return FALSE;
907 }
908 fMatchEnd = nativeStart;
909 return find(status);
910}
911
912
913//--------------------------------------------------------------------------------
914//
915// findUsingChunk() -- like find(), but with the advance knowledge that the
916// entire string is available in the UText's chunk buffer.
917//
918//--------------------------------------------------------------------------------
919UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
920 // Start at the position of the last match end. (Will be zero if the
921 // matcher has been reset.
922 //
923
924 int32_t startPos = (int32_t)fMatchEnd;
925 if (startPos==0) {
926 startPos = (int32_t)fActiveStart;
927 }
928
929 const UChar *inputBuf = fInputText->chunkContents;
930
931 if (fMatch) {
932 // Save the position of any previous successful match.
933 fLastMatchEnd = fMatchEnd;
934
935 if (fMatchStart == fMatchEnd) {
936 // Previous match had zero length. Move start position up one position
937 // to avoid sending find() into a loop on zero-length matches.
938 if (startPos >= fActiveLimit) {
939 fMatch = FALSE;
940 fHitEnd = TRUE;
941 return FALSE;
942 }
943 U16_FWD_1(inputBuf, startPos, fInputLength);
944 }
945 } else {
946 if (fLastMatchEnd >= 0) {
947 // A previous find() failed to match. Don't try again.
948 // (without this test, a pattern with a zero-length match
949 // could match again at the end of an input string.)
950 fHitEnd = TRUE;
951 return FALSE;
952 }
953 }
954
955
956 // Compute the position in the input string beyond which a match can not begin, because
957 // the minimum length match would extend past the end of the input.
958 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
959 // Be aware of possible overflows if making changes here.
960 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
961 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
962 if (startPos > testLen) {
963 fMatch = FALSE;
964 fHitEnd = TRUE;
965 return FALSE;
966 }
967
968 UChar32 c;
969 U_ASSERT(startPos >= 0);
970
971 switch (fPattern->fStartType) {
972 case START_NO_INFO:
973 // No optimization was found.
974 // Try a match at each input position.
975 for (;;) {
976 MatchChunkAt(startPos, FALSE, status);
977 if (U_FAILURE(status)) {
978 return FALSE;
979 }
980 if (fMatch) {
981 return TRUE;
982 }
983 if (startPos >= testLen) {
984 fHitEnd = TRUE;
985 return FALSE;
986 }
987 U16_FWD_1(inputBuf, startPos, fActiveLimit);
988 // Note that it's perfectly OK for a pattern to have a zero-length
989 // match at the end of a string, so we must make sure that the loop
990 // runs with startPos == testLen the last time through.
991 if (findProgressInterrupt(startPos, status))
992 return FALSE;
993 }
994 UPRV_UNREACHABLE;
995
996 case START_START:
997 // Matches are only possible at the start of the input string
998 // (pattern begins with ^ or \A)
999 if (startPos > fActiveStart) {
1000 fMatch = FALSE;
1001 return FALSE;
1002 }
1003 MatchChunkAt(startPos, FALSE, status);
1004 if (U_FAILURE(status)) {
1005 return FALSE;
1006 }
1007 return fMatch;
1008
1009
1010 case START_SET:
1011 {
1012 // Match may start on any char from a pre-computed set.
1013 U_ASSERT(fPattern->fMinMatchLen > 0);
1014 for (;;) {
1015 int32_t pos = startPos;
1016 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1017 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1018 (c>=256 && fPattern->fInitialChars->contains(c))) {
1019 MatchChunkAt(pos, FALSE, status);
1020 if (U_FAILURE(status)) {
1021 return FALSE;
1022 }
1023 if (fMatch) {
1024 return TRUE;
1025 }
1026 }
1027 if (startPos > testLen) {
1028 fMatch = FALSE;
1029 fHitEnd = TRUE;
1030 return FALSE;
1031 }
1032 if (findProgressInterrupt(startPos, status))
1033 return FALSE;
1034 }
1035 }
1036 UPRV_UNREACHABLE;
1037
1038 case START_STRING:
1039 case START_CHAR:
1040 {
1041 // Match starts on exactly one char.
1042 U_ASSERT(fPattern->fMinMatchLen > 0);
1043 UChar32 theChar = fPattern->fInitialChar;
1044 for (;;) {
1045 int32_t pos = startPos;
1046 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1047 if (c == theChar) {
1048 MatchChunkAt(pos, FALSE, status);
1049 if (U_FAILURE(status)) {
1050 return FALSE;
1051 }
1052 if (fMatch) {
1053 return TRUE;
1054 }
1055 }
1056 if (startPos > testLen) {
1057 fMatch = FALSE;
1058 fHitEnd = TRUE;
1059 return FALSE;
1060 }
1061 if (findProgressInterrupt(startPos, status))
1062 return FALSE;
1063 }
1064 }
1065 UPRV_UNREACHABLE;
1066
1067 case START_LINE:
1068 {
1069 UChar32 ch;
1070 if (startPos == fAnchorStart) {
1071 MatchChunkAt(startPos, FALSE, status);
1072 if (U_FAILURE(status)) {
1073 return FALSE;
1074 }
1075 if (fMatch) {
1076 return TRUE;
1077 }
1078 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1079 }
1080
1081 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1082 for (;;) {
1083 ch = inputBuf[startPos-1];
1084 if (ch == 0x0a) {
1085 MatchChunkAt(startPos, FALSE, status);
1086 if (U_FAILURE(status)) {
1087 return FALSE;
1088 }
1089 if (fMatch) {
1090 return TRUE;
1091 }
1092 }
1093 if (startPos >= testLen) {
1094 fMatch = FALSE;
1095 fHitEnd = TRUE;
1096 return FALSE;
1097 }
1098 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1099 // Note that it's perfectly OK for a pattern to have a zero-length
1100 // match at the end of a string, so we must make sure that the loop
1101 // runs with startPos == testLen the last time through.
1102 if (findProgressInterrupt(startPos, status))
1103 return FALSE;
1104 }
1105 } else {
1106 for (;;) {
1107 ch = inputBuf[startPos-1];
1108 if (isLineTerminator(ch)) {
1109 if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1110 startPos++;
1111 }
1112 MatchChunkAt(startPos, FALSE, status);
1113 if (U_FAILURE(status)) {
1114 return FALSE;
1115 }
1116 if (fMatch) {
1117 return TRUE;
1118 }
1119 }
1120 if (startPos >= testLen) {
1121 fMatch = FALSE;
1122 fHitEnd = TRUE;
1123 return FALSE;
1124 }
1125 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1126 // Note that it's perfectly OK for a pattern to have a zero-length
1127 // match at the end of a string, so we must make sure that the loop
1128 // runs with startPos == testLen the last time through.
1129 if (findProgressInterrupt(startPos, status))
1130 return FALSE;
1131 }
1132 }
1133 }
1134
1135 default:
1136 UPRV_UNREACHABLE;
1137 }
1138
1139 UPRV_UNREACHABLE;
1140}
1141
1142
1143
1144//--------------------------------------------------------------------------------
1145//
1146// group()
1147//
1148//--------------------------------------------------------------------------------
1149UnicodeString RegexMatcher::group(UErrorCode &status) const {
1150 return group(0, status);
1151}
1152
1153// Return immutable shallow clone
1154UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1155 return group(0, dest, group_len, status);
1156}
1157
1158// Return immutable shallow clone
1159UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1160 group_len = 0;
1161 if (U_FAILURE(status)) {
1162 return dest;
1163 }
1164 if (U_FAILURE(fDeferredStatus)) {
1165 status = fDeferredStatus;
1166 } else if (fMatch == FALSE) {
1167 status = U_REGEX_INVALID_STATE;
1168 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1169 status = U_INDEX_OUTOFBOUNDS_ERROR;
1170 }
1171
1172 if (U_FAILURE(status)) {
1173 return dest;
1174 }
1175
1176 int64_t s, e;
1177 if (groupNum == 0) {
1178 s = fMatchStart;
1179 e = fMatchEnd;
1180 } else {
1181 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1182 U_ASSERT(groupOffset < fPattern->fFrameSize);
1183 U_ASSERT(groupOffset >= 0);
1184 s = fFrame->fExtra[groupOffset];
1185 e = fFrame->fExtra[groupOffset+1];
1186 }
1187
1188 if (s < 0) {
1189 // A capture group wasn't part of the match
1190 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1191 }
1192 U_ASSERT(s <= e);
1193 group_len = e - s;
1194
1195 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1196 if (dest)
1197 UTEXT_SETNATIVEINDEX(dest, s);
1198 return dest;
1199}
1200
1201UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1202 UnicodeString result;
1203 int64_t groupStart = start64(groupNum, status);
1204 int64_t groupEnd = end64(groupNum, status);
1205 if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1206 return result;
1207 }
1208
1209 // Get the group length using a utext_extract preflight.
1210 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1211 int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
1212 if (status != U_BUFFER_OVERFLOW_ERROR) {
1213 return result;
1214 }
1215
1216 status = U_ZERO_ERROR;
1217 UChar *buf = result.getBuffer(length);
1218 if (buf == NULL) {
1219 status = U_MEMORY_ALLOCATION_ERROR;
1220 } else {
1221 int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1222 result.releaseBuffer(extractLength);
1223 U_ASSERT(length == extractLength);
1224 }
1225 return result;
1226}
1227
1228
1229//--------------------------------------------------------------------------------
1230//
1231// appendGroup() -- currently internal only, appends a group to a UText rather
1232// than replacing its contents
1233//
1234//--------------------------------------------------------------------------------
1235
1236int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1237 if (U_FAILURE(status)) {
1238 return 0;
1239 }
1240 if (U_FAILURE(fDeferredStatus)) {
1241 status = fDeferredStatus;
1242 return 0;
1243 }
1244 int64_t destLen = utext_nativeLength(dest);
1245
1246 if (fMatch == FALSE) {
1247 status = U_REGEX_INVALID_STATE;
1248 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1249 }
1250 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1251 status = U_INDEX_OUTOFBOUNDS_ERROR;
1252 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1253 }
1254
1255 int64_t s, e;
1256 if (groupNum == 0) {
1257 s = fMatchStart;
1258 e = fMatchEnd;
1259 } else {
1260 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1261 U_ASSERT(groupOffset < fPattern->fFrameSize);
1262 U_ASSERT(groupOffset >= 0);
1263 s = fFrame->fExtra[groupOffset];
1264 e = fFrame->fExtra[groupOffset+1];
1265 }
1266
1267 if (s < 0) {
1268 // A capture group wasn't part of the match
1269 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1270 }
1271 U_ASSERT(s <= e);
1272
1273 int64_t deltaLen;
1274 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1275 U_ASSERT(e <= fInputLength);
1276 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1277 } else {
1278 int32_t len16;
1279 if (UTEXT_USES_U16(fInputText)) {
1280 len16 = (int32_t)(e-s);
1281 } else {
1282 UErrorCode lengthStatus = U_ZERO_ERROR;
1283 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1284 }
1285 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1286 if (groupChars == NULL) {
1287 status = U_MEMORY_ALLOCATION_ERROR;
1288 return 0;
1289 }
1290 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1291
1292 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1293 uprv_free(groupChars);
1294 }
1295 return deltaLen;
1296}
1297
1298
1299
1300//--------------------------------------------------------------------------------
1301//
1302// groupCount()
1303//
1304//--------------------------------------------------------------------------------
1305int32_t RegexMatcher::groupCount() const {
1306 return fPattern->fGroupMap->size();
1307}
1308
1309//--------------------------------------------------------------------------------
1310//
1311// hasAnchoringBounds()
1312//
1313//--------------------------------------------------------------------------------
1314UBool RegexMatcher::hasAnchoringBounds() const {
1315 return fAnchoringBounds;
1316}
1317
1318
1319//--------------------------------------------------------------------------------
1320//
1321// hasTransparentBounds()
1322//
1323//--------------------------------------------------------------------------------
1324UBool RegexMatcher::hasTransparentBounds() const {
1325 return fTransparentBounds;
1326}
1327
1328
1329
1330//--------------------------------------------------------------------------------
1331//
1332// hitEnd()
1333//
1334//--------------------------------------------------------------------------------
1335UBool RegexMatcher::hitEnd() const {
1336 return fHitEnd;
1337}
1338
1339
1340//--------------------------------------------------------------------------------
1341//
1342// input()
1343//
1344//--------------------------------------------------------------------------------
1345const UnicodeString &RegexMatcher::input() const {
1346 if (!fInput) {
1347 UErrorCode status = U_ZERO_ERROR;
1348 int32_t len16;
1349 if (UTEXT_USES_U16(fInputText)) {
1350 len16 = (int32_t)fInputLength;
1351 } else {
1352 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1353 status = U_ZERO_ERROR; // overflow, length status
1354 }
1355 UnicodeString *result = new UnicodeString(len16, 0, 0);
1356
1357 UChar *inputChars = result->getBuffer(len16);
1358 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1359 result->releaseBuffer(len16);
1360
1361 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1362 }
1363
1364 return *fInput;
1365}
1366
1367//--------------------------------------------------------------------------------
1368//
1369// inputText()
1370//
1371//--------------------------------------------------------------------------------
1372UText *RegexMatcher::inputText() const {
1373 return fInputText;
1374}
1375
1376
1377//--------------------------------------------------------------------------------
1378//
1379// getInput() -- like inputText(), but makes a clone or copies into another UText
1380//
1381//--------------------------------------------------------------------------------
1382UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1383 if (U_FAILURE(status)) {
1384 return dest;
1385 }
1386 if (U_FAILURE(fDeferredStatus)) {
1387 status = fDeferredStatus;
1388 return dest;
1389 }
1390
1391 if (dest) {
1392 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1393 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1394 } else {
1395 int32_t input16Len;
1396 if (UTEXT_USES_U16(fInputText)) {
1397 input16Len = (int32_t)fInputLength;
1398 } else {
1399 UErrorCode lengthStatus = U_ZERO_ERROR;
1400 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1401 }
1402 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1403 if (inputChars == NULL) {
1404 return dest;
1405 }
1406
1407 status = U_ZERO_ERROR;
1408 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1409 status = U_ZERO_ERROR;
1410 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1411
1412 uprv_free(inputChars);
1413 }
1414 return dest;
1415 } else {
1416 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1417 }
1418}
1419
1420
1421static UBool compat_SyncMutableUTextContents(UText *ut);
1422static UBool compat_SyncMutableUTextContents(UText *ut) {
1423 UBool retVal = FALSE;
1424
1425 // In the following test, we're really only interested in whether the UText should switch
1426 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1427 // will still point to the correct data.
1428 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1429 UnicodeString *us=(UnicodeString *)ut->context;
1430
1431 // Update to the latest length.
1432 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1433 int32_t newLength = us->length();
1434
1435 // Update the chunk description.
1436 // The buffer may have switched between stack- and heap-based.
1437 ut->chunkContents = us->getBuffer();
1438 ut->chunkLength = newLength;
1439 ut->chunkNativeLimit = newLength;
1440 ut->nativeIndexingLimit = newLength;
1441 retVal = TRUE;
1442 }
1443
1444 return retVal;
1445}
1446
1447//--------------------------------------------------------------------------------
1448//
1449// lookingAt()
1450//
1451//--------------------------------------------------------------------------------
1452UBool RegexMatcher::lookingAt(UErrorCode &status) {
1453 if (U_FAILURE(status)) {
1454 return FALSE;
1455 }
1456 if (U_FAILURE(fDeferredStatus)) {
1457 status = fDeferredStatus;
1458 return FALSE;
1459 }
1460
1461 if (fInputUniStrMaybeMutable) {
1462 if (compat_SyncMutableUTextContents(fInputText)) {
1463 fInputLength = utext_nativeLength(fInputText);
1464 reset();
1465 }
1466 }
1467 else {
1468 resetPreserveRegion();
1469 }
1470 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1471 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1472 } else {
1473 MatchAt(fActiveStart, FALSE, status);
1474 }
1475 return fMatch;
1476}
1477
1478
1479UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1480 if (U_FAILURE(status)) {
1481 return FALSE;
1482 }
1483 if (U_FAILURE(fDeferredStatus)) {
1484 status = fDeferredStatus;
1485 return FALSE;
1486 }
1487 reset();
1488
1489 if (start < 0) {
1490 status = U_INDEX_OUTOFBOUNDS_ERROR;
1491 return FALSE;
1492 }
1493
1494 if (fInputUniStrMaybeMutable) {
1495 if (compat_SyncMutableUTextContents(fInputText)) {
1496 fInputLength = utext_nativeLength(fInputText);
1497 reset();
1498 }
1499 }
1500
1501 int64_t nativeStart;
1502 nativeStart = start;
1503 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1504 status = U_INDEX_OUTOFBOUNDS_ERROR;
1505 return FALSE;
1506 }
1507
1508 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1509 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1510 } else {
1511 MatchAt(nativeStart, FALSE, status);
1512 }
1513 return fMatch;
1514}
1515
1516
1517
1518//--------------------------------------------------------------------------------
1519//
1520// matches()
1521//
1522//--------------------------------------------------------------------------------
1523UBool RegexMatcher::matches(UErrorCode &status) {
1524 if (U_FAILURE(status)) {
1525 return FALSE;
1526 }
1527 if (U_FAILURE(fDeferredStatus)) {
1528 status = fDeferredStatus;
1529 return FALSE;
1530 }
1531
1532 if (fInputUniStrMaybeMutable) {
1533 if (compat_SyncMutableUTextContents(fInputText)) {
1534 fInputLength = utext_nativeLength(fInputText);
1535 reset();
1536 }
1537 }
1538 else {
1539 resetPreserveRegion();
1540 }
1541
1542 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1543 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1544 } else {
1545 MatchAt(fActiveStart, TRUE, status);
1546 }
1547 return fMatch;
1548}
1549
1550
1551UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1552 if (U_FAILURE(status)) {
1553 return FALSE;
1554 }
1555 if (U_FAILURE(fDeferredStatus)) {
1556 status = fDeferredStatus;
1557 return FALSE;
1558 }
1559 reset();
1560
1561 if (start < 0) {
1562 status = U_INDEX_OUTOFBOUNDS_ERROR;
1563 return FALSE;
1564 }
1565
1566 if (fInputUniStrMaybeMutable) {
1567 if (compat_SyncMutableUTextContents(fInputText)) {
1568 fInputLength = utext_nativeLength(fInputText);
1569 reset();
1570 }
1571 }
1572
1573 int64_t nativeStart;
1574 nativeStart = start;
1575 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1576 status = U_INDEX_OUTOFBOUNDS_ERROR;
1577 return FALSE;
1578 }
1579
1580 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1581 MatchChunkAt((int32_t)nativeStart, TRUE, status);
1582 } else {
1583 MatchAt(nativeStart, TRUE, status);
1584 }
1585 return fMatch;
1586}
1587
1588
1589
1590//--------------------------------------------------------------------------------
1591//
1592// pattern
1593//
1594//--------------------------------------------------------------------------------
1595const RegexPattern &RegexMatcher::pattern() const {
1596 return *fPattern;
1597}
1598
1599
1600
1601//--------------------------------------------------------------------------------
1602//
1603// region
1604//
1605//--------------------------------------------------------------------------------
1606RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1607 if (U_FAILURE(status)) {
1608 return *this;
1609 }
1610
1611 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1612 status = U_ILLEGAL_ARGUMENT_ERROR;
1613 }
1614
1615 int64_t nativeStart = regionStart;
1616 int64_t nativeLimit = regionLimit;
1617 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1618 status = U_ILLEGAL_ARGUMENT_ERROR;
1619 }
1620
1621 if (startIndex == -1)
1622 this->reset();
1623 else
1624 resetPreserveRegion();
1625
1626 fRegionStart = nativeStart;
1627 fRegionLimit = nativeLimit;
1628 fActiveStart = nativeStart;
1629 fActiveLimit = nativeLimit;
1630
1631 if (startIndex != -1) {
1632 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1633 status = U_INDEX_OUTOFBOUNDS_ERROR;
1634 }
1635 fMatchEnd = startIndex;
1636 }
1637
1638 if (!fTransparentBounds) {
1639 fLookStart = nativeStart;
1640 fLookLimit = nativeLimit;
1641 }
1642 if (fAnchoringBounds) {
1643 fAnchorStart = nativeStart;
1644 fAnchorLimit = nativeLimit;
1645 }
1646 return *this;
1647}
1648
1649RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1650 return region(start, limit, -1, status);
1651}
1652
1653//--------------------------------------------------------------------------------
1654//
1655// regionEnd
1656//
1657//--------------------------------------------------------------------------------
1658int32_t RegexMatcher::regionEnd() const {
1659 return (int32_t)fRegionLimit;
1660}
1661
1662int64_t RegexMatcher::regionEnd64() const {
1663 return fRegionLimit;
1664}
1665
1666//--------------------------------------------------------------------------------
1667//
1668// regionStart
1669//
1670//--------------------------------------------------------------------------------
1671int32_t RegexMatcher::regionStart() const {
1672 return (int32_t)fRegionStart;
1673}
1674
1675int64_t RegexMatcher::regionStart64() const {
1676 return fRegionStart;
1677}
1678
1679
1680//--------------------------------------------------------------------------------
1681//
1682// replaceAll
1683//
1684//--------------------------------------------------------------------------------
1685UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1686 UText replacementText = UTEXT_INITIALIZER;
1687 UText resultText = UTEXT_INITIALIZER;
1688 UnicodeString resultString;
1689 if (U_FAILURE(status)) {
1690 return resultString;
1691 }
1692
1693 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1694 utext_openUnicodeString(&resultText, &resultString, &status);
1695
1696 replaceAll(&replacementText, &resultText, status);
1697
1698 utext_close(&resultText);
1699 utext_close(&replacementText);
1700
1701 return resultString;
1702}
1703
1704
1705//
1706// replaceAll, UText mode
1707//
1708UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1709 if (U_FAILURE(status)) {
1710 return dest;
1711 }
1712 if (U_FAILURE(fDeferredStatus)) {
1713 status = fDeferredStatus;
1714 return dest;
1715 }
1716
1717 if (dest == NULL) {
1718 UnicodeString emptyString;
1719 UText empty = UTEXT_INITIALIZER;
1720
1721 utext_openUnicodeString(&empty, &emptyString, &status);
1722 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1723 utext_close(&empty);
1724 }
1725
1726 if (U_SUCCESS(status)) {
1727 reset();
1728 while (find()) {
1729 appendReplacement(dest, replacement, status);
1730 if (U_FAILURE(status)) {
1731 break;
1732 }
1733 }
1734 appendTail(dest, status);
1735 }
1736
1737 return dest;
1738}
1739
1740
1741//--------------------------------------------------------------------------------
1742//
1743// replaceFirst
1744//
1745//--------------------------------------------------------------------------------
1746UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1747 UText replacementText = UTEXT_INITIALIZER;
1748 UText resultText = UTEXT_INITIALIZER;
1749 UnicodeString resultString;
1750
1751 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1752 utext_openUnicodeString(&resultText, &resultString, &status);
1753
1754 replaceFirst(&replacementText, &resultText, status);
1755
1756 utext_close(&resultText);
1757 utext_close(&replacementText);
1758
1759 return resultString;
1760}
1761
1762//
1763// replaceFirst, UText mode
1764//
1765UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1766 if (U_FAILURE(status)) {
1767 return dest;
1768 }
1769 if (U_FAILURE(fDeferredStatus)) {
1770 status = fDeferredStatus;
1771 return dest;
1772 }
1773
1774 reset();
1775 if (!find()) {
1776 return getInput(dest, status);
1777 }
1778
1779 if (dest == NULL) {
1780 UnicodeString emptyString;
1781 UText empty = UTEXT_INITIALIZER;
1782
1783 utext_openUnicodeString(&empty, &emptyString, &status);
1784 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1785 utext_close(&empty);
1786 }
1787
1788 appendReplacement(dest, replacement, status);
1789 appendTail(dest, status);
1790
1791 return dest;
1792}
1793
1794
1795//--------------------------------------------------------------------------------
1796//
1797// requireEnd
1798//
1799//--------------------------------------------------------------------------------
1800UBool RegexMatcher::requireEnd() const {
1801 return fRequireEnd;
1802}
1803
1804
1805//--------------------------------------------------------------------------------
1806//
1807// reset
1808//
1809//--------------------------------------------------------------------------------
1810RegexMatcher &RegexMatcher::reset() {
1811 fRegionStart = 0;
1812 fRegionLimit = fInputLength;
1813 fActiveStart = 0;
1814 fActiveLimit = fInputLength;
1815 fAnchorStart = 0;
1816 fAnchorLimit = fInputLength;
1817 fLookStart = 0;
1818 fLookLimit = fInputLength;
1819 resetPreserveRegion();
1820 return *this;
1821}
1822
1823
1824
1825void RegexMatcher::resetPreserveRegion() {
1826 fMatchStart = 0;
1827 fMatchEnd = 0;
1828 fLastMatchEnd = -1;
1829 fAppendPosition = 0;
1830 fMatch = FALSE;
1831 fHitEnd = FALSE;
1832 fRequireEnd = FALSE;
1833 fTime = 0;
1834 fTickCounter = TIMER_INITIAL_VALUE;
1835 //resetStack(); // more expensive than it looks...
1836}
1837
1838
1839RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1840 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1841 if (fPattern->fNeedsAltInput) {
1842 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1843 }
1844 if (U_FAILURE(fDeferredStatus)) {
1845 return *this;
1846 }
1847 fInputLength = utext_nativeLength(fInputText);
1848
1849 reset();
1850 delete fInput;
1851 fInput = NULL;
1852
1853 // Do the following for any UnicodeString.
1854 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1855 fInputUniStrMaybeMutable = TRUE;
1856
1857 if (fWordBreakItr != NULL) {
1858#if UCONFIG_NO_BREAK_ITERATION==0
1859 UErrorCode status = U_ZERO_ERROR;
1860 fWordBreakItr->setText(fInputText, status);
1861#endif
1862 }
1863 return *this;
1864}
1865
1866
1867RegexMatcher &RegexMatcher::reset(UText *input) {
1868 if (fInputText != input) {
1869 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1870 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1871 if (U_FAILURE(fDeferredStatus)) {
1872 return *this;
1873 }
1874 fInputLength = utext_nativeLength(fInputText);
1875
1876 delete fInput;
1877 fInput = NULL;
1878
1879 if (fWordBreakItr != NULL) {
1880#if UCONFIG_NO_BREAK_ITERATION==0
1881 UErrorCode status = U_ZERO_ERROR;
1882 fWordBreakItr->setText(input, status);
1883#endif
1884 }
1885 }
1886 reset();
1887 fInputUniStrMaybeMutable = FALSE;
1888
1889 return *this;
1890}
1891
1892/*RegexMatcher &RegexMatcher::reset(const UChar *) {
1893 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1894 return *this;
1895}*/
1896
1897RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1898 if (U_FAILURE(status)) {
1899 return *this;
1900 }
1901 reset(); // Reset also resets the region to be the entire string.
1902
1903 if (position < 0 || position > fActiveLimit) {
1904 status = U_INDEX_OUTOFBOUNDS_ERROR;
1905 return *this;
1906 }
1907 fMatchEnd = position;
1908 return *this;
1909}
1910
1911
1912//--------------------------------------------------------------------------------
1913//
1914// refresh
1915//
1916//--------------------------------------------------------------------------------
1917RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1918 if (U_FAILURE(status)) {
1919 return *this;
1920 }
1921 if (input == NULL) {
1922 status = U_ILLEGAL_ARGUMENT_ERROR;
1923 return *this;
1924 }
1925 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1926 status = U_ILLEGAL_ARGUMENT_ERROR;
1927 return *this;
1928 }
1929 int64_t pos = utext_getNativeIndex(fInputText);
1930 // Shallow read-only clone of the new UText into the existing input UText
1931 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1932 if (U_FAILURE(status)) {
1933 return *this;
1934 }
1935 utext_setNativeIndex(fInputText, pos);
1936
1937 if (fAltInputText != NULL) {
1938 pos = utext_getNativeIndex(fAltInputText);
1939 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1940 if (U_FAILURE(status)) {
1941 return *this;
1942 }
1943 utext_setNativeIndex(fAltInputText, pos);
1944 }
1945 return *this;
1946}
1947
1948
1949
1950//--------------------------------------------------------------------------------
1951//
1952// setTrace
1953//
1954//--------------------------------------------------------------------------------
1955void RegexMatcher::setTrace(UBool state) {
1956 fTraceDebug = state;
1957}
1958
1959
1960
1961/**
1962 * UText, replace entire contents of the destination UText with a substring of the source UText.
1963 *
1964 * @param src The source UText
1965 * @param dest The destination UText. Must be writable.
1966 * May be NULL, in which case a new UText will be allocated.
1967 * @param start Start index of source substring.
1968 * @param limit Limit index of source substring.
1969 * @param status An error code.
1970 */
1971static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1972 if (U_FAILURE(*status)) {
1973 return dest;
1974 }
1975 if (start == limit) {
1976 if (dest) {
1977 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
1978 return dest;
1979 } else {
1980 return utext_openUChars(NULL, NULL, 0, status);
1981 }
1982 }
1983 int32_t length = utext_extract(src, start, limit, NULL, 0, status);
1984 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
1985 return dest;
1986 }
1987 *status = U_ZERO_ERROR;
1988 MaybeStackArray<UChar, 40> buffer;
1989 if (length >= buffer.getCapacity()) {
1990 UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul.
1991 if (newBuf == NULL) {
1992 *status = U_MEMORY_ALLOCATION_ERROR;
1993 }
1994 }
1995 utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
1996 if (dest) {
1997 utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
1998 return dest;
1999 }
2000
2001 // Caller did not provide a prexisting UText.
2002 // Open a new one, and have it adopt the text buffer storage.
2003 if (U_FAILURE(*status)) {
2004 return NULL;
2005 }
2006 int32_t ownedLength = 0;
2007 UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2008 if (ownedBuf == NULL) {
2009 *status = U_MEMORY_ALLOCATION_ERROR;
2010 return NULL;
2011 }
2012 UText *result = utext_openUChars(NULL, ownedBuf, length, status);
2013 if (U_FAILURE(*status)) {
2014 uprv_free(ownedBuf);
2015 return NULL;
2016 }
2017 result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2018 return result;
2019}
2020
2021
2022//---------------------------------------------------------------------
2023//
2024// split
2025//
2026//---------------------------------------------------------------------
2027int32_t RegexMatcher::split(const UnicodeString &input,
2028 UnicodeString dest[],
2029 int32_t destCapacity,
2030 UErrorCode &status)
2031{
2032 UText inputText = UTEXT_INITIALIZER;
2033 utext_openConstUnicodeString(&inputText, &input, &status);
2034 if (U_FAILURE(status)) {
2035 return 0;
2036 }
2037
2038 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2039 if (destText == NULL) {
2040 status = U_MEMORY_ALLOCATION_ERROR;
2041 return 0;
2042 }
2043 int32_t i;
2044 for (i = 0; i < destCapacity; i++) {
2045 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2046 }
2047
2048 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2049
2050 for (i = 0; i < destCapacity; i++) {
2051 utext_close(destText[i]);
2052 }
2053
2054 uprv_free(destText);
2055 utext_close(&inputText);
2056 return fieldCount;
2057}
2058
2059//
2060// split, UText mode
2061//
2062int32_t RegexMatcher::split(UText *input,
2063 UText *dest[],
2064 int32_t destCapacity,
2065 UErrorCode &status)
2066{
2067 //
2068 // Check arguements for validity
2069 //
2070 if (U_FAILURE(status)) {
2071 return 0;
2072 }
2073
2074 if (destCapacity < 1) {
2075 status = U_ILLEGAL_ARGUMENT_ERROR;
2076 return 0;
2077 }
2078
2079 //
2080 // Reset for the input text
2081 //
2082 reset(input);
2083 int64_t nextOutputStringStart = 0;
2084 if (fActiveLimit == 0) {
2085 return 0;
2086 }
2087
2088 //
2089 // Loop through the input text, searching for the delimiter pattern
2090 //
2091 int32_t i;
2092 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2093 for (i=0; ; i++) {
2094 if (i>=destCapacity-1) {
2095 // There is one or zero output string left.
2096 // Fill the last output string with whatever is left from the input, then exit the loop.
2097 // ( i will be == destCapacity if we filled the output array while processing
2098 // capture groups of the delimiter expression, in which case we will discard the
2099 // last capture group saved in favor of the unprocessed remainder of the
2100 // input string.)
2101 i = destCapacity-1;
2102 if (fActiveLimit > nextOutputStringStart) {
2103 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2104 if (dest[i]) {
2105 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2106 input->chunkContents+nextOutputStringStart,
2107 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2108 } else {
2109 UText remainingText = UTEXT_INITIALIZER;
2110 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2111 fActiveLimit-nextOutputStringStart, &status);
2112 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2113 utext_close(&remainingText);
2114 }
2115 } else {
2116 UErrorCode lengthStatus = U_ZERO_ERROR;
2117 int32_t remaining16Length =
2118 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2119 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2120 if (remainingChars == NULL) {
2121 status = U_MEMORY_ALLOCATION_ERROR;
2122 break;
2123 }
2124
2125 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2126 if (dest[i]) {
2127 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2128 } else {
2129 UText remainingText = UTEXT_INITIALIZER;
2130 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2131 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2132 utext_close(&remainingText);
2133 }
2134
2135 uprv_free(remainingChars);
2136 }
2137 }
2138 break;
2139 }
2140 if (find()) {
2141 // We found another delimiter. Move everything from where we started looking
2142 // up until the start of the delimiter into the next output string.
2143 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2144 if (dest[i]) {
2145 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2146 input->chunkContents+nextOutputStringStart,
2147 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2148 } else {
2149 UText remainingText = UTEXT_INITIALIZER;
2150 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2151 fMatchStart-nextOutputStringStart, &status);
2152 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2153 utext_close(&remainingText);
2154 }
2155 } else {
2156 UErrorCode lengthStatus = U_ZERO_ERROR;
2157 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2158 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2159 if (remainingChars == NULL) {
2160 status = U_MEMORY_ALLOCATION_ERROR;
2161 break;
2162 }
2163 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2164 if (dest[i]) {
2165 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2166 } else {
2167 UText remainingText = UTEXT_INITIALIZER;
2168 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2169 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2170 utext_close(&remainingText);
2171 }
2172
2173 uprv_free(remainingChars);
2174 }
2175 nextOutputStringStart = fMatchEnd;
2176
2177 // If the delimiter pattern has capturing parentheses, the captured
2178 // text goes out into the next n destination strings.
2179 int32_t groupNum;
2180 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2181 if (i >= destCapacity-2) {
2182 // Never fill the last available output string with capture group text.
2183 // It will filled with the last field, the remainder of the
2184 // unsplit input text.
2185 break;
2186 }
2187 i++;
2188 dest[i] = utext_extract_replace(fInputText, dest[i],
2189 start64(groupNum, status), end64(groupNum, status), &status);
2190 }
2191
2192 if (nextOutputStringStart == fActiveLimit) {
2193 // The delimiter was at the end of the string. We're done, but first
2194 // we output one last empty string, for the empty field following
2195 // the delimiter at the end of input.
2196 if (i+1 < destCapacity) {
2197 ++i;
2198 if (dest[i] == NULL) {
2199 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2200 } else {
2201 static const UChar emptyString[] = {(UChar)0};
2202 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2203 }
2204 }
2205 break;
2206
2207 }
2208 }
2209 else
2210 {
2211 // We ran off the end of the input while looking for the next delimiter.
2212 // All the remaining text goes into the current output string.
2213 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2214 if (dest[i]) {
2215 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2216 input->chunkContents+nextOutputStringStart,
2217 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2218 } else {
2219 UText remainingText = UTEXT_INITIALIZER;
2220 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2221 fActiveLimit-nextOutputStringStart, &status);
2222 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2223 utext_close(&remainingText);
2224 }
2225 } else {
2226 UErrorCode lengthStatus = U_ZERO_ERROR;
2227 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2228 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2229 if (remainingChars == NULL) {
2230 status = U_MEMORY_ALLOCATION_ERROR;
2231 break;
2232 }
2233
2234 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2235 if (dest[i]) {
2236 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2237 } else {
2238 UText remainingText = UTEXT_INITIALIZER;
2239 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2240 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2241 utext_close(&remainingText);
2242 }
2243
2244 uprv_free(remainingChars);
2245 }
2246 break;
2247 }
2248 if (U_FAILURE(status)) {
2249 break;
2250 }
2251 } // end of for loop
2252 return i+1;
2253}
2254
2255
2256//--------------------------------------------------------------------------------
2257//
2258// start
2259//
2260//--------------------------------------------------------------------------------
2261int32_t RegexMatcher::start(UErrorCode &status) const {
2262 return start(0, status);
2263}
2264
2265int64_t RegexMatcher::start64(UErrorCode &status) const {
2266 return start64(0, status);
2267}
2268
2269//--------------------------------------------------------------------------------
2270//
2271// start(int32_t group, UErrorCode &status)
2272//
2273//--------------------------------------------------------------------------------
2274
2275int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2276 if (U_FAILURE(status)) {
2277 return -1;
2278 }
2279 if (U_FAILURE(fDeferredStatus)) {
2280 status = fDeferredStatus;
2281 return -1;
2282 }
2283 if (fMatch == FALSE) {
2284 status = U_REGEX_INVALID_STATE;
2285 return -1;
2286 }
2287 if (group < 0 || group > fPattern->fGroupMap->size()) {
2288 status = U_INDEX_OUTOFBOUNDS_ERROR;
2289 return -1;
2290 }
2291 int64_t s;
2292 if (group == 0) {
2293 s = fMatchStart;
2294 } else {
2295 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2296 U_ASSERT(groupOffset < fPattern->fFrameSize);
2297 U_ASSERT(groupOffset >= 0);
2298 s = fFrame->fExtra[groupOffset];
2299 }
2300
2301 return s;
2302}
2303
2304
2305int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2306 return (int32_t)start64(group, status);
2307}
2308
2309//--------------------------------------------------------------------------------
2310//
2311// useAnchoringBounds
2312//
2313//--------------------------------------------------------------------------------
2314RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2315 fAnchoringBounds = b;
2316 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2317 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2318 return *this;
2319}
2320
2321
2322//--------------------------------------------------------------------------------
2323//
2324// useTransparentBounds
2325//
2326//--------------------------------------------------------------------------------
2327RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2328 fTransparentBounds = b;
2329 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2330 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2331 return *this;
2332}
2333
2334//--------------------------------------------------------------------------------
2335//
2336// setTimeLimit
2337//
2338//--------------------------------------------------------------------------------
2339void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2340 if (U_FAILURE(status)) {
2341 return;
2342 }
2343 if (U_FAILURE(fDeferredStatus)) {
2344 status = fDeferredStatus;
2345 return;
2346 }
2347 if (limit < 0) {
2348 status = U_ILLEGAL_ARGUMENT_ERROR;
2349 return;
2350 }
2351 fTimeLimit = limit;
2352}
2353
2354
2355//--------------------------------------------------------------------------------
2356//
2357// getTimeLimit
2358//
2359//--------------------------------------------------------------------------------
2360int32_t RegexMatcher::getTimeLimit() const {
2361 return fTimeLimit;
2362}
2363
2364
2365//--------------------------------------------------------------------------------
2366//
2367// setStackLimit
2368//
2369//--------------------------------------------------------------------------------
2370void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2371 if (U_FAILURE(status)) {
2372 return;
2373 }
2374 if (U_FAILURE(fDeferredStatus)) {
2375 status = fDeferredStatus;
2376 return;
2377 }
2378 if (limit < 0) {
2379 status = U_ILLEGAL_ARGUMENT_ERROR;
2380 return;
2381 }
2382
2383 // Reset the matcher. This is needed here in case there is a current match
2384 // whose final stack frame (containing the match results, pointed to by fFrame)
2385 // would be lost by resizing to a smaller stack size.
2386 reset();
2387
2388 if (limit == 0) {
2389 // Unlimited stack expansion
2390 fStack->setMaxCapacity(0);
2391 } else {
2392 // Change the units of the limit from bytes to ints, and bump the size up
2393 // to be big enough to hold at least one stack frame for the pattern,
2394 // if it isn't there already.
2395 int32_t adjustedLimit = limit / sizeof(int32_t);
2396 if (adjustedLimit < fPattern->fFrameSize) {
2397 adjustedLimit = fPattern->fFrameSize;
2398 }
2399 fStack->setMaxCapacity(adjustedLimit);
2400 }
2401 fStackLimit = limit;
2402}
2403
2404
2405//--------------------------------------------------------------------------------
2406//
2407// getStackLimit
2408//
2409//--------------------------------------------------------------------------------
2410int32_t RegexMatcher::getStackLimit() const {
2411 return fStackLimit;
2412}
2413
2414
2415//--------------------------------------------------------------------------------
2416//
2417// setMatchCallback
2418//
2419//--------------------------------------------------------------------------------
2420void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2421 const void *context,
2422 UErrorCode &status) {
2423 if (U_FAILURE(status)) {
2424 return;
2425 }
2426 fCallbackFn = callback;
2427 fCallbackContext = context;
2428}
2429
2430
2431//--------------------------------------------------------------------------------
2432//
2433// getMatchCallback
2434//
2435//--------------------------------------------------------------------------------
2436void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2437 const void *&context,
2438 UErrorCode &status) {
2439 if (U_FAILURE(status)) {
2440 return;
2441 }
2442 callback = fCallbackFn;
2443 context = fCallbackContext;
2444}
2445
2446
2447//--------------------------------------------------------------------------------
2448//
2449// setMatchCallback
2450//
2451//--------------------------------------------------------------------------------
2452void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2453 const void *context,
2454 UErrorCode &status) {
2455 if (U_FAILURE(status)) {
2456 return;
2457 }
2458 fFindProgressCallbackFn = callback;
2459 fFindProgressCallbackContext = context;
2460}
2461
2462
2463//--------------------------------------------------------------------------------
2464//
2465// getMatchCallback
2466//
2467//--------------------------------------------------------------------------------
2468void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2469 const void *&context,
2470 UErrorCode &status) {
2471 if (U_FAILURE(status)) {
2472 return;
2473 }
2474 callback = fFindProgressCallbackFn;
2475 context = fFindProgressCallbackContext;
2476}
2477
2478
2479//================================================================================
2480//
2481// Code following this point in this file is the internal
2482// Match Engine Implementation.
2483//
2484//================================================================================
2485
2486
2487//--------------------------------------------------------------------------------
2488//
2489// resetStack
2490// Discard any previous contents of the state save stack, and initialize a
2491// new stack frame to all -1. The -1s are needed for capture group limits,
2492// where they indicate that a group has not yet matched anything.
2493//--------------------------------------------------------------------------------
2494REStackFrame *RegexMatcher::resetStack() {
2495 // Discard any previous contents of the state save stack, and initialize a
2496 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2497 // where they indicate that a group has not yet matched anything.
2498 fStack->removeAllElements();
2499
2500 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2501 if(U_FAILURE(fDeferredStatus)) {
2502 return NULL;
2503 }
2504
2505 int32_t i;
2506 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2507 iFrame->fExtra[i] = -1;
2508 }
2509 return iFrame;
2510}
2511
2512
2513
2514//--------------------------------------------------------------------------------
2515//
2516// isWordBoundary
2517// in perl, "xab..cd..", \b is true at positions 0,3,5,7
2518// For us,
2519// If the current char is a combining mark,
2520// \b is FALSE.
2521// Else Scan backwards to the first non-combining char.
2522// We are at a boundary if the this char and the original chars are
2523// opposite in membership in \w set
2524//
2525// parameters: pos - the current position in the input buffer
2526//
2527// TODO: double-check edge cases at region boundaries.
2528//
2529//--------------------------------------------------------------------------------
2530UBool RegexMatcher::isWordBoundary(int64_t pos) {
2531 UBool isBoundary = FALSE;
2532 UBool cIsWord = FALSE;
2533
2534 if (pos >= fLookLimit) {
2535 fHitEnd = TRUE;
2536 } else {
2537 // Determine whether char c at current position is a member of the word set of chars.
2538 // If we're off the end of the string, behave as though we're not at a word char.
2539 UTEXT_SETNATIVEINDEX(fInputText, pos);
2540 UChar32 c = UTEXT_CURRENT32(fInputText);
2541 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2542 // Current char is a combining one. Not a boundary.
2543 return FALSE;
2544 }
2545 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2546 }
2547
2548 // Back up until we come to a non-combining char, determine whether
2549 // that char is a word char.
2550 UBool prevCIsWord = FALSE;
2551 for (;;) {
2552 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2553 break;
2554 }
2555 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2556 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2557 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2558 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2559 break;
2560 }
2561 }
2562 isBoundary = cIsWord ^ prevCIsWord;
2563 return isBoundary;
2564}
2565
2566UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2567 UBool isBoundary = FALSE;
2568 UBool cIsWord = FALSE;
2569
2570 const UChar *inputBuf = fInputText->chunkContents;
2571
2572 if (pos >= fLookLimit) {
2573 fHitEnd = TRUE;
2574 } else {
2575 // Determine whether char c at current position is a member of the word set of chars.
2576 // If we're off the end of the string, behave as though we're not at a word char.
2577 UChar32 c;
2578 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2579 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2580 // Current char is a combining one. Not a boundary.
2581 return FALSE;
2582 }
2583 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2584 }
2585
2586 // Back up until we come to a non-combining char, determine whether
2587 // that char is a word char.
2588 UBool prevCIsWord = FALSE;
2589 for (;;) {
2590 if (pos <= fLookStart) {
2591 break;
2592 }
2593 UChar32 prevChar;
2594 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2595 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2596 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2597 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2598 break;
2599 }
2600 }
2601 isBoundary = cIsWord ^ prevCIsWord;
2602 return isBoundary;
2603}
2604
2605//--------------------------------------------------------------------------------
2606//
2607// isUWordBoundary
2608//
2609// Test for a word boundary using RBBI word break.
2610//
2611// parameters: pos - the current position in the input buffer
2612//
2613//--------------------------------------------------------------------------------
2614UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2615 UBool returnVal = FALSE;
2616#if UCONFIG_NO_BREAK_ITERATION==0
2617
2618 // If we haven't yet created a break iterator for this matcher, do it now.
2619 if (fWordBreakItr == NULL) {
2620 fWordBreakItr =
2621 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2622 if (U_FAILURE(fDeferredStatus)) {
2623 return FALSE;
2624 }
2625 fWordBreakItr->setText(fInputText, fDeferredStatus);
2626 }
2627
2628 if (pos >= fLookLimit) {
2629 fHitEnd = TRUE;
2630 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
2631 // words are not boundaries. All non-word chars stand by themselves,
2632 // with word boundaries on both sides.
2633 } else {
2634 if (!UTEXT_USES_U16(fInputText)) {
2635 // !!!: Would like a better way to do this!
2636 UErrorCode status = U_ZERO_ERROR;
2637 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2638 }
2639 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2640 }
2641#endif
2642 return returnVal;
2643}
2644
2645//--------------------------------------------------------------------------------
2646//
2647// IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2648// saves. Increment the "time" counter, and call the
2649// user callback function if there is one installed.
2650//
2651// If the match operation needs to be aborted, either for a time-out
2652// or because the user callback asked for it, just set an error status.
2653// The engine will pick that up and stop in its outer loop.
2654//
2655//--------------------------------------------------------------------------------
2656void RegexMatcher::IncrementTime(UErrorCode &status) {
2657 fTickCounter = TIMER_INITIAL_VALUE;
2658 fTime++;
2659 if (fCallbackFn != NULL) {
2660 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2661 status = U_REGEX_STOPPED_BY_CALLER;
2662 return;
2663 }
2664 }
2665 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2666 status = U_REGEX_TIME_OUT;
2667 }
2668}
2669
2670//--------------------------------------------------------------------------------
2671//
2672// StateSave
2673// Make a new stack frame, initialized as a copy of the current stack frame.
2674// Set the pattern index in the original stack frame from the operand value
2675// in the opcode. Execution of the engine continues with the state in
2676// the newly created stack frame
2677//
2678// Note that reserveBlock() may grow the stack, resulting in the
2679// whole thing being relocated in memory.
2680//
2681// Parameters:
2682// fp The top frame pointer when called. At return, a new
2683// fame will be present
2684// savePatIdx An index into the compiled pattern. Goes into the original
2685// (not new) frame. If execution ever back-tracks out of the
2686// new frame, this will be where we continue from in the pattern.
2687// Return
2688// The new frame pointer.
2689//
2690//--------------------------------------------------------------------------------
2691inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2692 if (U_FAILURE(status)) {
2693 return fp;
2694 }
2695 // push storage for a new frame.
2696 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2697 if (U_FAILURE(status)) {
2698 // Failure on attempted stack expansion.
2699 // Stack function set some other error code, change it to a more
2700 // specific one for regular expressions.
2701 status = U_REGEX_STACK_OVERFLOW;
2702 // We need to return a writable stack frame, so just return the
2703 // previous frame. The match operation will stop quickly
2704 // because of the error status, after which the frame will never
2705 // be looked at again.
2706 return fp;
2707 }
2708 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
2709
2710 // New stack frame = copy of old top frame.
2711 int64_t *source = (int64_t *)fp;
2712 int64_t *dest = newFP;
2713 for (;;) {
2714 *dest++ = *source++;
2715 if (source == newFP) {
2716 break;
2717 }
2718 }
2719
2720 fTickCounter--;
2721 if (fTickCounter <= 0) {
2722 IncrementTime(status); // Re-initializes fTickCounter
2723 }
2724 fp->fPatIdx = savePatIdx;
2725 return (REStackFrame *)newFP;
2726}
2727
2728#if defined(REGEX_DEBUG)
2729namespace {
2730UnicodeString StringFromUText(UText *ut) {
2731 UnicodeString result;
2732 for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
2733 result.append(c);
2734 }
2735 return result;
2736}
2737}
2738#endif // REGEX_DEBUG
2739
2740
2741//--------------------------------------------------------------------------------
2742//
2743// MatchAt This is the actual matching engine.
2744//
2745// startIdx: begin matching a this index.
2746// toEnd: if true, match must extend to end of the input region
2747//
2748//--------------------------------------------------------------------------------
2749void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2750 UBool isMatch = FALSE; // True if the we have a match.
2751
2752 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2753
2754 int32_t op; // Operation from the compiled pattern, split into
2755 int32_t opType; // the opcode
2756 int32_t opValue; // and the operand value.
2757
2758#ifdef REGEX_RUN_DEBUG
2759 if (fTraceDebug) {
2760 printf("MatchAt(startIdx=%ld)\n", startIdx);
2761 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2762 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
2763 }
2764#endif
2765
2766 if (U_FAILURE(status)) {
2767 return;
2768 }
2769
2770 // Cache frequently referenced items from the compiled pattern
2771 //
2772 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2773
2774 const UChar *litText = fPattern->fLiteralText.getBuffer();
2775 UVector *fSets = fPattern->fSets;
2776
2777 fFrameSize = fPattern->fFrameSize;
2778 REStackFrame *fp = resetStack();
2779 if (U_FAILURE(fDeferredStatus)) {
2780 status = fDeferredStatus;
2781 return;
2782 }
2783
2784 fp->fPatIdx = 0;
2785 fp->fInputIdx = startIdx;
2786
2787 // Zero out the pattern's static data
2788 int32_t i;
2789 for (i = 0; i<fPattern->fDataSize; i++) {
2790 fData[i] = 0;
2791 }
2792
2793 //
2794 // Main loop for interpreting the compiled pattern.
2795 // One iteration of the loop per pattern operation performed.
2796 //
2797 for (;;) {
2798 op = (int32_t)pat[fp->fPatIdx];
2799 opType = URX_TYPE(op);
2800 opValue = URX_VAL(op);
2801#ifdef REGEX_RUN_DEBUG
2802 if (fTraceDebug) {
2803 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2804 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
2805 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2806 fPattern->dumpOp(fp->fPatIdx);
2807 }
2808#endif
2809 fp->fPatIdx++;
2810
2811 switch (opType) {
2812
2813
2814 case URX_NOP:
2815 break;
2816
2817
2818 case URX_BACKTRACK:
2819 // Force a backtrack. In some circumstances, the pattern compiler
2820 // will notice that the pattern can't possibly match anything, and will
2821 // emit one of these at that point.
2822 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2823 break;
2824
2825
2826 case URX_ONECHAR:
2827 if (fp->fInputIdx < fActiveLimit) {
2828 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2829 UChar32 c = UTEXT_NEXT32(fInputText);
2830 if (c == opValue) {
2831 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2832 break;
2833 }
2834 } else {
2835 fHitEnd = TRUE;
2836 }
2837 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2838 break;
2839
2840
2841 case URX_STRING:
2842 {
2843 // Test input against a literal string.
2844 // Strings require two slots in the compiled pattern, one for the
2845 // offset to the string text, and one for the length.
2846
2847 int32_t stringStartIdx = opValue;
2848 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2849 fp->fPatIdx++;
2850 opType = URX_TYPE(op);
2851 int32_t stringLen = URX_VAL(op);
2852 U_ASSERT(opType == URX_STRING_LEN);
2853 U_ASSERT(stringLen >= 2);
2854
2855 const UChar *patternString = litText+stringStartIdx;
2856 int32_t patternStringIndex = 0;
2857 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2858 UChar32 inputChar;
2859 UChar32 patternChar;
2860 UBool success = TRUE;
2861 while (patternStringIndex < stringLen) {
2862 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2863 success = FALSE;
2864 fHitEnd = TRUE;
2865 break;
2866 }
2867 inputChar = UTEXT_NEXT32(fInputText);
2868 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2869 if (patternChar != inputChar) {
2870 success = FALSE;
2871 break;
2872 }
2873 }
2874
2875 if (success) {
2876 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2877 } else {
2878 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2879 }
2880 }
2881 break;
2882
2883
2884 case URX_STATE_SAVE:
2885 fp = StateSave(fp, opValue, status);
2886 break;
2887
2888
2889 case URX_END:
2890 // The match loop will exit via this path on a successful match,
2891 // when we reach the end of the pattern.
2892 if (toEnd && fp->fInputIdx != fActiveLimit) {
2893 // The pattern matched, but not to the end of input. Try some more.
2894 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2895 break;
2896 }
2897 isMatch = TRUE;
2898 goto breakFromLoop;
2899
2900 // Start and End Capture stack frame variables are laid out out like this:
2901 // fp->fExtra[opValue] - The start of a completed capture group
2902 // opValue+1 - The end of a completed capture group
2903 // opValue+2 - the start of a capture group whose end
2904 // has not yet been reached (and might not ever be).
2905 case URX_START_CAPTURE:
2906 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2907 fp->fExtra[opValue+2] = fp->fInputIdx;
2908 break;
2909
2910
2911 case URX_END_CAPTURE:
2912 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2913 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
2914 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2915 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
2916 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2917 break;
2918
2919
2920 case URX_DOLLAR: // $, test for End of line
2921 // or for position before new line at end of input
2922 {
2923 if (fp->fInputIdx >= fAnchorLimit) {
2924 // We really are at the end of input. Success.
2925 fHitEnd = TRUE;
2926 fRequireEnd = TRUE;
2927 break;
2928 }
2929
2930 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2931
2932 // If we are positioned just before a new-line that is located at the
2933 // end of input, succeed.
2934 UChar32 c = UTEXT_NEXT32(fInputText);
2935 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2936 if (isLineTerminator(c)) {
2937 // If not in the middle of a CR/LF sequence
2938 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2939 // At new-line at end of input. Success
2940 fHitEnd = TRUE;
2941 fRequireEnd = TRUE;
2942
2943 break;
2944 }
2945 }
2946 } else {
2947 UChar32 nextC = UTEXT_NEXT32(fInputText);
2948 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2949 fHitEnd = TRUE;
2950 fRequireEnd = TRUE;
2951 break; // At CR/LF at end of input. Success
2952 }
2953 }
2954
2955 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2956 }
2957 break;
2958
2959
2960 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
2961 if (fp->fInputIdx >= fAnchorLimit) {
2962 // Off the end of input. Success.
2963 fHitEnd = TRUE;
2964 fRequireEnd = TRUE;
2965 break;
2966 } else {
2967 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2968 UChar32 c = UTEXT_NEXT32(fInputText);
2969 // Either at the last character of input, or off the end.
2970 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2971 fHitEnd = TRUE;
2972 fRequireEnd = TRUE;
2973 break;
2974 }
2975 }
2976
2977 // Not at end of input. Back-track out.
2978 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2979 break;
2980
2981
2982 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
2983 {
2984 if (fp->fInputIdx >= fAnchorLimit) {
2985 // We really are at the end of input. Success.
2986 fHitEnd = TRUE;
2987 fRequireEnd = TRUE;
2988 break;
2989 }
2990 // If we are positioned just before a new-line, succeed.
2991 // It makes no difference where the new-line is within the input.
2992 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2993 UChar32 c = UTEXT_CURRENT32(fInputText);
2994 if (isLineTerminator(c)) {
2995 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
2996 // In multi-line mode, hitting a new-line just before the end of input does not
2997 // set the hitEnd or requireEnd flags
2998 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
2999 break;
3000 }
3001 }
3002 // not at a new line. Fail.
3003 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3004 }
3005 break;
3006
3007
3008 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
3009 {
3010 if (fp->fInputIdx >= fAnchorLimit) {
3011 // We really are at the end of input. Success.
3012 fHitEnd = TRUE;
3013 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
3014 break; // adding a new-line would not lose the match.
3015 }
3016 // If we are not positioned just before a new-line, the test fails; backtrack out.
3017 // It makes no difference where the new-line is within the input.
3018 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3019 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3020 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3021 }
3022 }
3023 break;
3024
3025
3026 case URX_CARET: // ^, test for start of line
3027 if (fp->fInputIdx != fAnchorStart) {
3028 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3029 }
3030 break;
3031
3032
3033 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3034 {
3035 if (fp->fInputIdx == fAnchorStart) {
3036 // We are at the start input. Success.
3037 break;
3038 }
3039 // Check whether character just before the current pos is a new-line
3040 // unless we are at the end of input
3041 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3042 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3043 if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3044 // It's a new-line. ^ is true. Success.
3045 // TODO: what should be done with positions between a CR and LF?
3046 break;
3047 }
3048 // Not at the start of a line. Fail.
3049 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3050 }
3051 break;
3052
3053
3054 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3055 {
3056 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3057 if (fp->fInputIdx <= fAnchorStart) {
3058 // We are at the start input. Success.
3059 break;
3060 }
3061 // Check whether character just before the current pos is a new-line
3062 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3063 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3064 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3065 if (c != 0x0a) {
3066 // Not at the start of a line. Back-track out.
3067 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3068 }
3069 }
3070 break;
3071
3072 case URX_BACKSLASH_B: // Test for word boundaries
3073 {
3074 UBool success = isWordBoundary(fp->fInputIdx);
3075 success ^= (UBool)(opValue != 0); // flip sense for \B
3076 if (!success) {
3077 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3078 }
3079 }
3080 break;
3081
3082
3083 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3084 {
3085 UBool success = isUWordBoundary(fp->fInputIdx);
3086 success ^= (UBool)(opValue != 0); // flip sense for \B
3087 if (!success) {
3088 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3089 }
3090 }
3091 break;
3092
3093
3094 case URX_BACKSLASH_D: // Test for decimal digit
3095 {
3096 if (fp->fInputIdx >= fActiveLimit) {
3097 fHitEnd = TRUE;
3098 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3099 break;
3100 }
3101
3102 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3103
3104 UChar32 c = UTEXT_NEXT32(fInputText);
3105 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3106 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3107 success ^= (UBool)(opValue != 0); // flip sense for \D
3108 if (success) {
3109 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3110 } else {
3111 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3112 }
3113 }
3114 break;
3115
3116
3117 case URX_BACKSLASH_G: // Test for position at end of previous match
3118 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3119 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3120 }
3121 break;
3122
3123
3124 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
3125 {
3126 if (fp->fInputIdx >= fActiveLimit) {
3127 fHitEnd = TRUE;
3128 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3129 break;
3130 }
3131 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3132 UChar32 c = UTEXT_NEXT32(fInputText);
3133 int8_t ctype = u_charType(c);
3134 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
3135 success ^= (UBool)(opValue != 0); // flip sense for \H
3136 if (success) {
3137 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3138 } else {
3139 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3140 }
3141 }
3142 break;
3143
3144
3145 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
3146 {
3147 if (fp->fInputIdx >= fActiveLimit) {
3148 fHitEnd = TRUE;
3149 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3150 break;
3151 }
3152 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3153 UChar32 c = UTEXT_NEXT32(fInputText);
3154 if (isLineTerminator(c)) {
3155 if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3156 utext_next32(fInputText);
3157 }
3158 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3159 } else {
3160 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3161 }
3162 }
3163 break;
3164
3165
3166 case URX_BACKSLASH_V: // \v, any single line ending character.
3167 {
3168 if (fp->fInputIdx >= fActiveLimit) {
3169 fHitEnd = TRUE;
3170 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3171 break;
3172 }
3173 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3174 UChar32 c = UTEXT_NEXT32(fInputText);
3175 UBool success = isLineTerminator(c);
3176 success ^= (UBool)(opValue != 0); // flip sense for \V
3177 if (success) {
3178 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3179 } else {
3180 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3181 }
3182 }
3183 break;
3184
3185
3186 case URX_BACKSLASH_X:
3187 // Match a Grapheme, as defined by Unicode TR 29.
3188 // Differs slightly from Perl, which consumes combining marks independently
3189 // of context.
3190 {
3191
3192 // Fail if at end of input
3193 if (fp->fInputIdx >= fActiveLimit) {
3194 fHitEnd = TRUE;
3195 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3196 break;
3197 }
3198
3199 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3200
3201 // Examine (and consume) the current char.
3202 // Dispatch into a little state machine, based on the char.
3203 UChar32 c;
3204 c = UTEXT_NEXT32(fInputText);
3205 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3206 UnicodeSet **sets = fPattern->fStaticSets;
3207 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
3208 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3209 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3210 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3211 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3212 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3213 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3214 goto GC_Extend;
3215
3216
3217
3218GC_L:
3219 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3220 c = UTEXT_NEXT32(fInputText);
3221 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3222 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3223 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3224 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3225 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3226 (void)UTEXT_PREVIOUS32(fInputText);
3227 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3228 goto GC_Extend;
3229
3230GC_V:
3231 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3232 c = UTEXT_NEXT32(fInputText);
3233 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3234 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3235 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3236 (void)UTEXT_PREVIOUS32(fInputText);
3237 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3238 goto GC_Extend;
3239
3240GC_T:
3241 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3242 c = UTEXT_NEXT32(fInputText);
3243 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3244 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3245 (void)UTEXT_PREVIOUS32(fInputText);
3246 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3247 goto GC_Extend;
3248
3249GC_Extend:
3250 // Combining characters are consumed here
3251 for (;;) {
3252 if (fp->fInputIdx >= fActiveLimit) {
3253 break;
3254 }
3255 c = UTEXT_CURRENT32(fInputText);
3256 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3257 break;
3258 }
3259 (void)UTEXT_NEXT32(fInputText);
3260 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3261 }
3262 goto GC_Done;
3263
3264GC_Control:
3265 // Most control chars stand alone (don't combine with combining chars),
3266 // except for that CR/LF sequence is a single grapheme cluster.
3267 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3268 c = UTEXT_NEXT32(fInputText);
3269 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3270 }
3271
3272GC_Done:
3273 if (fp->fInputIdx >= fActiveLimit) {
3274 fHitEnd = TRUE;
3275 }
3276 break;
3277 }
3278
3279
3280
3281
3282 case URX_BACKSLASH_Z: // Test for end of Input
3283 if (fp->fInputIdx < fAnchorLimit) {
3284 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3285 } else {
3286 fHitEnd = TRUE;
3287 fRequireEnd = TRUE;
3288 }
3289 break;
3290
3291
3292
3293 case URX_STATIC_SETREF:
3294 {
3295 // Test input character against one of the predefined sets
3296 // (Word Characters, for example)
3297 // The high bit of the op value is a flag for the match polarity.
3298 // 0: success if input char is in set.
3299 // 1: success if input char is not in set.
3300 if (fp->fInputIdx >= fActiveLimit) {
3301 fHitEnd = TRUE;
3302 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3303 break;
3304 }
3305
3306 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3307 opValue &= ~URX_NEG_SET;
3308 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3309
3310 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3311 UChar32 c = UTEXT_NEXT32(fInputText);
3312 if (c < 256) {
3313 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3314 if (s8->contains(c)) {
3315 success = !success;
3316 }
3317 } else {
3318 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3319 if (s->contains(c)) {
3320 success = !success;
3321 }
3322 }
3323 if (success) {
3324 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3325 } else {
3326 // the character wasn't in the set.
3327 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3328 }
3329 }
3330 break;
3331
3332
3333 case URX_STAT_SETREF_N:
3334 {
3335 // Test input character for NOT being a member of one of
3336 // the predefined sets (Word Characters, for example)
3337 if (fp->fInputIdx >= fActiveLimit) {
3338 fHitEnd = TRUE;
3339 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3340 break;
3341 }
3342
3343 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3344
3345 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3346
3347 UChar32 c = UTEXT_NEXT32(fInputText);
3348 if (c < 256) {
3349 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3350 if (s8->contains(c) == FALSE) {
3351 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3352 break;
3353 }
3354 } else {
3355 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3356 if (s->contains(c) == FALSE) {
3357 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3358 break;
3359 }
3360 }
3361 // the character wasn't in the set.
3362 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3363 }
3364 break;
3365
3366
3367 case URX_SETREF:
3368 if (fp->fInputIdx >= fActiveLimit) {
3369 fHitEnd = TRUE;
3370 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3371 break;
3372 } else {
3373 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3374
3375 // There is input left. Pick up one char and test it for set membership.
3376 UChar32 c = UTEXT_NEXT32(fInputText);
3377 U_ASSERT(opValue > 0 && opValue < fSets->size());
3378 if (c<256) {
3379 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3380 if (s8->contains(c)) {
3381 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3382 break;
3383 }
3384 } else {
3385 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
3386 if (s->contains(c)) {
3387 // The character is in the set. A Match.
3388 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3389 break;
3390 }
3391 }
3392
3393 // the character wasn't in the set.
3394 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3395 }
3396 break;
3397
3398
3399 case URX_DOTANY:
3400 {
3401 // . matches anything, but stops at end-of-line.
3402 if (fp->fInputIdx >= fActiveLimit) {
3403 // At end of input. Match failed. Backtrack out.
3404 fHitEnd = TRUE;
3405 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3406 break;
3407 }
3408
3409 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3410
3411 // There is input left. Advance over one char, unless we've hit end-of-line
3412 UChar32 c = UTEXT_NEXT32(fInputText);
3413 if (isLineTerminator(c)) {
3414 // End of line in normal mode. . does not match.
3415 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3416 break;
3417 }
3418 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3419 }
3420 break;
3421
3422
3423 case URX_DOTANY_ALL:
3424 {
3425 // ., in dot-matches-all (including new lines) mode
3426 if (fp->fInputIdx >= fActiveLimit) {
3427 // At end of input. Match failed. Backtrack out.
3428 fHitEnd = TRUE;
3429 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3430 break;
3431 }
3432
3433 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3434
3435 // There is input left. Advance over one char, except if we are
3436 // at a cr/lf, advance over both of them.
3437 UChar32 c;
3438 c = UTEXT_NEXT32(fInputText);
3439 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3440 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3441 // In the case of a CR/LF, we need to advance over both.
3442 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3443 if (nextc == 0x0a) {
3444 (void)UTEXT_NEXT32(fInputText);
3445 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3446 }
3447 }
3448 }
3449 break;
3450
3451
3452 case URX_DOTANY_UNIX:
3453 {
3454 // '.' operator, matches all, but stops at end-of-line.
3455 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3456 if (fp->fInputIdx >= fActiveLimit) {
3457 // At end of input. Match failed. Backtrack out.
3458 fHitEnd = TRUE;
3459 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3460 break;
3461 }
3462
3463 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3464
3465 // There is input left. Advance over one char, unless we've hit end-of-line
3466 UChar32 c = UTEXT_NEXT32(fInputText);
3467 if (c == 0x0a) {
3468 // End of line in normal mode. '.' does not match the \n
3469 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3470 } else {
3471 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3472 }
3473 }
3474 break;
3475
3476
3477 case URX_JMP:
3478 fp->fPatIdx = opValue;
3479 break;
3480
3481 case URX_FAIL:
3482 isMatch = FALSE;
3483 goto breakFromLoop;
3484
3485 case URX_JMP_SAV:
3486 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3487 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3488 fp->fPatIdx = opValue; // Then JMP.
3489 break;
3490
3491 case URX_JMP_SAV_X:
3492 // This opcode is used with (x)+, when x can match a zero length string.
3493 // Same as JMP_SAV, except conditional on the match having made forward progress.
3494 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3495 // data address of the input position at the start of the loop.
3496 {
3497 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3498 int32_t stoOp = (int32_t)pat[opValue-1];
3499 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3500 int32_t frameLoc = URX_VAL(stoOp);
3501 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3502 int64_t prevInputIdx = fp->fExtra[frameLoc];
3503 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3504 if (prevInputIdx < fp->fInputIdx) {
3505 // The match did make progress. Repeat the loop.
3506 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3507 fp->fPatIdx = opValue;
3508 fp->fExtra[frameLoc] = fp->fInputIdx;
3509 }
3510 // If the input position did not advance, we do nothing here,
3511 // execution will fall out of the loop.
3512 }
3513 break;
3514
3515 case URX_CTR_INIT:
3516 {
3517 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3518 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3519
3520 // Pick up the three extra operands that CTR_INIT has, and
3521 // skip the pattern location counter past
3522 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3523 fp->fPatIdx += 3;
3524 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3525 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3526 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3527 U_ASSERT(minCount>=0);
3528 U_ASSERT(maxCount>=minCount || maxCount==-1);
3529 U_ASSERT(loopLoc>=fp->fPatIdx);
3530
3531 if (minCount == 0) {
3532 fp = StateSave(fp, loopLoc+1, status);
3533 }
3534 if (maxCount == -1) {
3535 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
3536 } else if (maxCount == 0) {
3537 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3538 }
3539 }
3540 break;
3541
3542 case URX_CTR_LOOP:
3543 {
3544 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3545 int32_t initOp = (int32_t)pat[opValue];
3546 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3547 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3548 int32_t minCount = (int32_t)pat[opValue+2];
3549 int32_t maxCount = (int32_t)pat[opValue+3];
3550 (*pCounter)++;
3551 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3552 U_ASSERT(*pCounter == maxCount);
3553 break;
3554 }
3555 if (*pCounter >= minCount) {
3556 if (maxCount == -1) {
3557 // Loop has no hard upper bound.
3558 // Check that it is progressing through the input, break if it is not.
3559 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3560 if (fp->fInputIdx == *pLastInputIdx) {
3561 break;
3562 } else {
3563 *pLastInputIdx = fp->fInputIdx;
3564 }
3565 }
3566 fp = StateSave(fp, fp->fPatIdx, status);
3567 } else {
3568 // Increment time-out counter. (StateSave() does it if count >= minCount)
3569 fTickCounter--;
3570 if (fTickCounter <= 0) {
3571 IncrementTime(status); // Re-initializes fTickCounter
3572 }
3573 }
3574
3575 fp->fPatIdx = opValue + 4; // Loop back.
3576 }
3577 break;
3578
3579 case URX_CTR_INIT_NG:
3580 {
3581 // Initialize a non-greedy loop
3582 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3583 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3584
3585 // Pick up the three extra operands that CTR_INIT_NG has, and
3586 // skip the pattern location counter past
3587 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3588 fp->fPatIdx += 3;
3589 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3590 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3591 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3592 U_ASSERT(minCount>=0);
3593 U_ASSERT(maxCount>=minCount || maxCount==-1);
3594 U_ASSERT(loopLoc>fp->fPatIdx);
3595 if (maxCount == -1) {
3596 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
3597 }
3598
3599 if (minCount == 0) {
3600 if (maxCount != 0) {
3601 fp = StateSave(fp, fp->fPatIdx, status);
3602 }
3603 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
3604 }
3605 }
3606 break;
3607
3608 case URX_CTR_LOOP_NG:
3609 {
3610 // Non-greedy {min, max} loops
3611 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3612 int32_t initOp = (int32_t)pat[opValue];
3613 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3614 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3615 int32_t minCount = (int32_t)pat[opValue+2];
3616 int32_t maxCount = (int32_t)pat[opValue+3];
3617
3618 (*pCounter)++;
3619 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3620 // The loop has matched the maximum permitted number of times.
3621 // Break out of here with no action. Matching will
3622 // continue with the following pattern.
3623 U_ASSERT(*pCounter == maxCount);
3624 break;
3625 }
3626
3627 if (*pCounter < minCount) {
3628 // We haven't met the minimum number of matches yet.
3629 // Loop back for another one.
3630 fp->fPatIdx = opValue + 4; // Loop back.
3631 // Increment time-out counter. (StateSave() does it if count >= minCount)
3632 fTickCounter--;
3633 if (fTickCounter <= 0) {
3634 IncrementTime(status); // Re-initializes fTickCounter
3635 }
3636 } else {
3637 // We do have the minimum number of matches.
3638
3639 // If there is no upper bound on the loop iterations, check that the input index
3640 // is progressing, and stop the loop if it is not.
3641 if (maxCount == -1) {
3642 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3643 if (fp->fInputIdx == *pLastInputIdx) {
3644 break;
3645 }
3646 *pLastInputIdx = fp->fInputIdx;
3647 }
3648
3649 // Loop Continuation: we will fall into the pattern following the loop
3650 // (non-greedy, don't execute loop body first), but first do
3651 // a state save to the top of the loop, so that a match failure
3652 // in the following pattern will try another iteration of the loop.
3653 fp = StateSave(fp, opValue + 4, status);
3654 }
3655 }
3656 break;
3657
3658 case URX_STO_SP:
3659 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3660 fData[opValue] = fStack->size();
3661 break;
3662
3663 case URX_LD_SP:
3664 {
3665 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3666 int32_t newStackSize = (int32_t)fData[opValue];
3667 U_ASSERT(newStackSize <= fStack->size());
3668 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3669 if (newFP == (int64_t *)fp) {
3670 break;
3671 }
3672 int32_t j;
3673 for (j=0; j<fFrameSize; j++) {
3674 newFP[j] = ((int64_t *)fp)[j];
3675 }
3676 fp = (REStackFrame *)newFP;
3677 fStack->setSize(newStackSize);
3678 }
3679 break;
3680
3681 case URX_BACKREF:
3682 {
3683 U_ASSERT(opValue < fFrameSize);
3684 int64_t groupStartIdx = fp->fExtra[opValue];
3685 int64_t groupEndIdx = fp->fExtra[opValue+1];
3686 U_ASSERT(groupStartIdx <= groupEndIdx);
3687 if (groupStartIdx < 0) {
3688 // This capture group has not participated in the match thus far,
3689 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3690 break;
3691 }
3692 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3693 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3694
3695 // Note: if the capture group match was of an empty string the backref
3696 // match succeeds. Verified by testing: Perl matches succeed
3697 // in this case, so we do too.
3698
3699 UBool success = TRUE;
3700 for (;;) {
3701 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3702 success = TRUE;
3703 break;
3704 }
3705 if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3706 success = FALSE;
3707 fHitEnd = TRUE;
3708 break;
3709 }
3710 UChar32 captureGroupChar = utext_next32(fAltInputText);
3711 UChar32 inputChar = utext_next32(fInputText);
3712 if (inputChar != captureGroupChar) {
3713 success = FALSE;
3714 break;
3715 }
3716 }
3717
3718 if (success) {
3719 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3720 } else {
3721 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3722 }
3723 }
3724 break;
3725
3726
3727
3728 case URX_BACKREF_I:
3729 {
3730 U_ASSERT(opValue < fFrameSize);
3731 int64_t groupStartIdx = fp->fExtra[opValue];
3732 int64_t groupEndIdx = fp->fExtra[opValue+1];
3733 U_ASSERT(groupStartIdx <= groupEndIdx);
3734 if (groupStartIdx < 0) {
3735 // This capture group has not participated in the match thus far,
3736 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3737 break;
3738 }
3739 utext_setNativeIndex(fAltInputText, groupStartIdx);
3740 utext_setNativeIndex(fInputText, fp->fInputIdx);
3741 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3742 CaseFoldingUTextIterator inputItr(*fInputText);
3743
3744 // Note: if the capture group match was of an empty string the backref
3745 // match succeeds. Verified by testing: Perl matches succeed
3746 // in this case, so we do too.
3747
3748 UBool success = TRUE;
3749 for (;;) {
3750 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3751 success = TRUE;
3752 break;
3753 }
3754 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3755 success = FALSE;
3756 fHitEnd = TRUE;
3757 break;
3758 }
3759 UChar32 captureGroupChar = captureGroupItr.next();
3760 UChar32 inputChar = inputItr.next();
3761 if (inputChar != captureGroupChar) {
3762 success = FALSE;
3763 break;
3764 }
3765 }
3766
3767 if (success && inputItr.inExpansion()) {
3768 // We otained a match by consuming part of a string obtained from
3769 // case-folding a single code point of the input text.
3770 // This does not count as an overall match.
3771 success = FALSE;
3772 }
3773
3774 if (success) {
3775 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3776 } else {
3777 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3778 }
3779
3780 }
3781 break;
3782
3783 case URX_STO_INP_LOC:
3784 {
3785 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3786 fp->fExtra[opValue] = fp->fInputIdx;
3787 }
3788 break;
3789
3790 case URX_JMPX:
3791 {
3792 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3793 fp->fPatIdx += 1;
3794 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3795 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3796 int64_t savedInputIdx = fp->fExtra[dataLoc];
3797 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3798 if (savedInputIdx < fp->fInputIdx) {
3799 fp->fPatIdx = opValue; // JMP
3800 } else {
3801 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3802 }
3803 }
3804 break;
3805
3806 case URX_LA_START:
3807 {
3808 // Entering a look around block.
3809 // Save Stack Ptr, Input Pos.
3810 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3811 fData[opValue] = fStack->size();
3812 fData[opValue+1] = fp->fInputIdx;
3813 fData[opValue+2] = fActiveStart;
3814 fData[opValue+3] = fActiveLimit;
3815 fActiveStart = fLookStart; // Set the match region change for
3816 fActiveLimit = fLookLimit; // transparent bounds.
3817 }
3818 break;
3819
3820 case URX_LA_END:
3821 {
3822 // Leaving a look-ahead block.
3823 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3824 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3825 int32_t stackSize = fStack->size();
3826 int32_t newStackSize =(int32_t)fData[opValue];
3827 U_ASSERT(stackSize >= newStackSize);
3828 if (stackSize > newStackSize) {
3829 // Copy the current top frame back to the new (cut back) top frame.
3830 // This makes the capture groups from within the look-ahead
3831 // expression available.
3832 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3833 int32_t j;
3834 for (j=0; j<fFrameSize; j++) {
3835 newFP[j] = ((int64_t *)fp)[j];
3836 }
3837 fp = (REStackFrame *)newFP;
3838 fStack->setSize(newStackSize);
3839 }
3840 fp->fInputIdx = fData[opValue+1];
3841
3842 // Restore the active region bounds in the input string; they may have
3843 // been changed because of transparent bounds on a Region.
3844 fActiveStart = fData[opValue+2];
3845 fActiveLimit = fData[opValue+3];
3846 U_ASSERT(fActiveStart >= 0);
3847 U_ASSERT(fActiveLimit <= fInputLength);
3848 }
3849 break;
3850
3851 case URX_ONECHAR_I:
3852 // Case insensitive one char. The char from the pattern is already case folded.
3853 // Input text is not, but case folding the input can not reduce two or more code
3854 // points to one.
3855 if (fp->fInputIdx < fActiveLimit) {
3856 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3857
3858 UChar32 c = UTEXT_NEXT32(fInputText);
3859 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3860 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3861 break;
3862 }
3863 } else {
3864 fHitEnd = TRUE;
3865 }
3866
3867 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3868 break;
3869
3870 case URX_STRING_I:
3871 {
3872 // Case-insensitive test input against a literal string.
3873 // Strings require two slots in the compiled pattern, one for the
3874 // offset to the string text, and one for the length.
3875 // The compiled string has already been case folded.
3876 {
3877 const UChar *patternString = litText + opValue;
3878 int32_t patternStringIdx = 0;
3879
3880 op = (int32_t)pat[fp->fPatIdx];
3881 fp->fPatIdx++;
3882 opType = URX_TYPE(op);
3883 opValue = URX_VAL(op);
3884 U_ASSERT(opType == URX_STRING_LEN);
3885 int32_t patternStringLen = opValue; // Length of the string from the pattern.
3886
3887
3888 UChar32 cPattern;
3889 UChar32 cText;
3890 UBool success = TRUE;
3891
3892 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3893 CaseFoldingUTextIterator inputIterator(*fInputText);
3894 while (patternStringIdx < patternStringLen) {
3895 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3896 success = FALSE;
3897 fHitEnd = TRUE;
3898 break;
3899 }
3900 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3901 cText = inputIterator.next();
3902 if (cText != cPattern) {
3903 success = FALSE;
3904 break;
3905 }
3906 }
3907 if (inputIterator.inExpansion()) {
3908 success = FALSE;
3909 }
3910
3911 if (success) {
3912 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3913 } else {
3914 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3915 }
3916 }
3917 }
3918 break;
3919
3920 case URX_LB_START:
3921 {
3922 // Entering a look-behind block.
3923 // Save Stack Ptr, Input Pos and active input region.
3924 // TODO: implement transparent bounds. Ticket #6067
3925 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3926 fData[opValue] = fStack->size();
3927 fData[opValue+1] = fp->fInputIdx;
3928 // Save input string length, then reset to pin any matches to end at
3929 // the current position.
3930 fData[opValue+2] = fActiveStart;
3931 fData[opValue+3] = fActiveLimit;
3932 fActiveStart = fRegionStart;
3933 fActiveLimit = fp->fInputIdx;
3934 // Init the variable containing the start index for attempted matches.
3935 fData[opValue+4] = -1;
3936 }
3937 break;
3938
3939
3940 case URX_LB_CONT:
3941 {
3942 // Positive Look-Behind, at top of loop checking for matches of LB expression
3943 // at all possible input starting positions.
3944
3945 // Fetch the min and max possible match lengths. They are the operands
3946 // of this op in the pattern.
3947 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3948 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3949 if (!UTEXT_USES_U16(fInputText)) {
3950 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3951 // The max length need not be exact; it just needs to be >= actual maximum.
3952 maxML *= 3;
3953 }
3954 U_ASSERT(minML <= maxML);
3955 U_ASSERT(minML >= 0);
3956
3957 // Fetch (from data) the last input index where a match was attempted.
3958 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3959 int64_t &lbStartIdx = fData[opValue+4];
3960 if (lbStartIdx < 0) {
3961 // First time through loop.
3962 lbStartIdx = fp->fInputIdx - minML;
3963 if (lbStartIdx > 0) {
3964 // move index to a code point boudary, if it's not on one already.
3965 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3966 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3967 }
3968 } else {
3969 // 2nd through nth time through the loop.
3970 // Back up start position for match by one.
3971 if (lbStartIdx == 0) {
3972 (lbStartIdx)--;
3973 } else {
3974 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3975 (void)UTEXT_PREVIOUS32(fInputText);
3976 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3977 }
3978 }
3979
3980 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
3981 // We have tried all potential match starting points without
3982 // getting a match. Backtrack out, and out of the
3983 // Look Behind altogether.
3984 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3985 fActiveStart = fData[opValue+2];
3986 fActiveLimit = fData[opValue+3];
3987 U_ASSERT(fActiveStart >= 0);
3988 U_ASSERT(fActiveLimit <= fInputLength);
3989 break;
3990 }
3991
3992 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3993 // (successful match will fall off the end of the loop.)
3994 fp = StateSave(fp, fp->fPatIdx-3, status);
3995 fp->fInputIdx = lbStartIdx;
3996 }
3997 break;
3998
3999 case URX_LB_END:
4000 // End of a look-behind block, after a successful match.
4001 {
4002 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4003 if (fp->fInputIdx != fActiveLimit) {
4004 // The look-behind expression matched, but the match did not
4005 // extend all the way to the point that we are looking behind from.
4006 // FAIL out of here, which will take us back to the LB_CONT, which
4007 // will retry the match starting at another position or fail
4008 // the look-behind altogether, whichever is appropriate.
4009 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4010 break;
4011 }
4012
4013 // Look-behind match is good. Restore the orignal input string region,
4014 // which had been truncated to pin the end of the lookbehind match to the
4015 // position being looked-behind.
4016 fActiveStart = fData[opValue+2];
4017 fActiveLimit = fData[opValue+3];
4018 U_ASSERT(fActiveStart >= 0);
4019 U_ASSERT(fActiveLimit <= fInputLength);
4020 }
4021 break;
4022
4023
4024 case URX_LBN_CONT:
4025 {
4026 // Negative Look-Behind, at top of loop checking for matches of LB expression
4027 // at all possible input starting positions.
4028
4029 // Fetch the extra parameters of this op.
4030 int32_t minML = (int32_t)pat[fp->fPatIdx++];
4031 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
4032 if (!UTEXT_USES_U16(fInputText)) {
4033 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
4034 // The max length need not be exact; it just needs to be >= actual maximum.
4035 maxML *= 3;
4036 }
4037 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4038 continueLoc = URX_VAL(continueLoc);
4039 U_ASSERT(minML <= maxML);
4040 U_ASSERT(minML >= 0);
4041 U_ASSERT(continueLoc > fp->fPatIdx);
4042
4043 // Fetch (from data) the last input index where a match was attempted.
4044 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4045 int64_t &lbStartIdx = fData[opValue+4];
4046 if (lbStartIdx < 0) {
4047 // First time through loop.
4048 lbStartIdx = fp->fInputIdx - minML;
4049 if (lbStartIdx > 0) {
4050 // move index to a code point boudary, if it's not on one already.
4051 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4052 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4053 }
4054 } else {
4055 // 2nd through nth time through the loop.
4056 // Back up start position for match by one.
4057 if (lbStartIdx == 0) {
4058 (lbStartIdx)--;
4059 } else {
4060 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4061 (void)UTEXT_PREVIOUS32(fInputText);
4062 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4063 }
4064 }
4065
4066 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
4067 // We have tried all potential match starting points without
4068 // getting a match, which means that the negative lookbehind as
4069 // a whole has succeeded. Jump forward to the continue location
4070 fActiveStart = fData[opValue+2];
4071 fActiveLimit = fData[opValue+3];
4072 U_ASSERT(fActiveStart >= 0);
4073 U_ASSERT(fActiveLimit <= fInputLength);
4074 fp->fPatIdx = continueLoc;
4075 break;
4076 }
4077
4078 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4079 // (successful match will cause a FAIL out of the loop altogether.)
4080 fp = StateSave(fp, fp->fPatIdx-4, status);
4081 fp->fInputIdx = lbStartIdx;
4082 }
4083 break;
4084
4085 case URX_LBN_END:
4086 // End of a negative look-behind block, after a successful match.
4087 {
4088 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4089 if (fp->fInputIdx != fActiveLimit) {
4090 // The look-behind expression matched, but the match did not
4091 // extend all the way to the point that we are looking behind from.
4092 // FAIL out of here, which will take us back to the LB_CONT, which
4093 // will retry the match starting at another position or succeed
4094 // the look-behind altogether, whichever is appropriate.
4095 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4096 break;
4097 }
4098
4099 // Look-behind expression matched, which means look-behind test as
4100 // a whole Fails
4101
4102 // Restore the orignal input string length, which had been truncated
4103 // inorder to pin the end of the lookbehind match
4104 // to the position being looked-behind.
4105 fActiveStart = fData[opValue+2];
4106 fActiveLimit = fData[opValue+3];
4107 U_ASSERT(fActiveStart >= 0);
4108 U_ASSERT(fActiveLimit <= fInputLength);
4109
4110 // Restore original stack position, discarding any state saved
4111 // by the successful pattern match.
4112 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4113 int32_t newStackSize = (int32_t)fData[opValue];
4114 U_ASSERT(fStack->size() > newStackSize);
4115 fStack->setSize(newStackSize);
4116
4117 // FAIL, which will take control back to someplace
4118 // prior to entering the look-behind test.
4119 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4120 }
4121 break;
4122
4123
4124 case URX_LOOP_SR_I:
4125 // Loop Initialization for the optimized implementation of
4126 // [some character set]*
4127 // This op scans through all matching input.
4128 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4129 {
4130 U_ASSERT(opValue > 0 && opValue < fSets->size());
4131 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4132 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
4133
4134 // Loop through input, until either the input is exhausted or
4135 // we reach a character that is not a member of the set.
4136 int64_t ix = fp->fInputIdx;
4137 UTEXT_SETNATIVEINDEX(fInputText, ix);
4138 for (;;) {
4139 if (ix >= fActiveLimit) {
4140 fHitEnd = TRUE;
4141 break;
4142 }
4143 UChar32 c = UTEXT_NEXT32(fInputText);
4144 if (c<256) {
4145 if (s8->contains(c) == FALSE) {
4146 break;
4147 }
4148 } else {
4149 if (s->contains(c) == FALSE) {
4150 break;
4151 }
4152 }
4153 ix = UTEXT_GETNATIVEINDEX(fInputText);
4154 }
4155
4156 // If there were no matching characters, skip over the loop altogether.
4157 // The loop doesn't run at all, a * op always succeeds.
4158 if (ix == fp->fInputIdx) {
4159 fp->fPatIdx++; // skip the URX_LOOP_C op.
4160 break;
4161 }
4162
4163 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4164 // must follow. It's operand is the stack location
4165 // that holds the starting input index for the match of this [set]*
4166 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4167 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4168 int32_t stackLoc = URX_VAL(loopcOp);
4169 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4170 fp->fExtra[stackLoc] = fp->fInputIdx;
4171 fp->fInputIdx = ix;
4172
4173 // Save State to the URX_LOOP_C op that follows this one,
4174 // so that match failures in the following code will return to there.
4175 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4176 fp = StateSave(fp, fp->fPatIdx, status);
4177 fp->fPatIdx++;
4178 }
4179 break;
4180
4181
4182 case URX_LOOP_DOT_I:
4183 // Loop Initialization for the optimized implementation of .*
4184 // This op scans through all remaining input.
4185 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4186 {
4187 // Loop through input until the input is exhausted (we reach an end-of-line)
4188 // In DOTALL mode, we can just go straight to the end of the input.
4189 int64_t ix;
4190 if ((opValue & 1) == 1) {
4191 // Dot-matches-All mode. Jump straight to the end of the string.
4192 ix = fActiveLimit;
4193 fHitEnd = TRUE;
4194 } else {
4195 // NOT DOT ALL mode. Line endings do not match '.'
4196 // Scan forward until a line ending or end of input.
4197 ix = fp->fInputIdx;
4198 UTEXT_SETNATIVEINDEX(fInputText, ix);
4199 for (;;) {
4200 if (ix >= fActiveLimit) {
4201 fHitEnd = TRUE;
4202 break;
4203 }
4204 UChar32 c = UTEXT_NEXT32(fInputText);
4205 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4206 if ((c == 0x0a) || // 0x0a is newline in both modes.
4207 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
4208 isLineTerminator(c))) {
4209 // char is a line ending. Exit the scanning loop.
4210 break;
4211 }
4212 }
4213 ix = UTEXT_GETNATIVEINDEX(fInputText);
4214 }
4215 }
4216
4217 // If there were no matching characters, skip over the loop altogether.
4218 // The loop doesn't run at all, a * op always succeeds.
4219 if (ix == fp->fInputIdx) {
4220 fp->fPatIdx++; // skip the URX_LOOP_C op.
4221 break;
4222 }
4223
4224 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4225 // must follow. It's operand is the stack location
4226 // that holds the starting input index for the match of this .*
4227 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4228 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4229 int32_t stackLoc = URX_VAL(loopcOp);
4230 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4231 fp->fExtra[stackLoc] = fp->fInputIdx;
4232 fp->fInputIdx = ix;
4233
4234 // Save State to the URX_LOOP_C op that follows this one,
4235 // so that match failures in the following code will return to there.
4236 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4237 fp = StateSave(fp, fp->fPatIdx, status);
4238 fp->fPatIdx++;
4239 }
4240 break;
4241
4242
4243 case URX_LOOP_C:
4244 {
4245 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4246 backSearchIndex = fp->fExtra[opValue];
4247 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4248 if (backSearchIndex == fp->fInputIdx) {
4249 // We've backed up the input idx to the point that the loop started.
4250 // The loop is done. Leave here without saving state.
4251 // Subsequent failures won't come back here.
4252 break;
4253 }
4254 // Set up for the next iteration of the loop, with input index
4255 // backed up by one from the last time through,
4256 // and a state save to this instruction in case the following code fails again.
4257 // (We're going backwards because this loop emulates stack unwinding, not
4258 // the initial scan forward.)
4259 U_ASSERT(fp->fInputIdx > 0);
4260 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4261 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4262 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4263
4264 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4265 if (prevC == 0x0a &&
4266 fp->fInputIdx > backSearchIndex &&
4267 twoPrevC == 0x0d) {
4268 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4269 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4270 // .*, stepping back over CRLF pair.
4271 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4272 }
4273 }
4274
4275
4276 fp = StateSave(fp, fp->fPatIdx-1, status);
4277 }
4278 break;
4279
4280
4281
4282 default:
4283 // Trouble. The compiled pattern contains an entry with an
4284 // unrecognized type tag.
4285 UPRV_UNREACHABLE;
4286 }
4287
4288 if (U_FAILURE(status)) {
4289 isMatch = FALSE;
4290 break;
4291 }
4292 }
4293
4294breakFromLoop:
4295 fMatch = isMatch;
4296 if (isMatch) {
4297 fLastMatchEnd = fMatchEnd;
4298 fMatchStart = startIdx;
4299 fMatchEnd = fp->fInputIdx;
4300 }
4301
4302#ifdef REGEX_RUN_DEBUG
4303 if (fTraceDebug) {
4304 if (isMatch) {
4305 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
4306 } else {
4307 printf("No match\n\n");
4308 }
4309 }
4310#endif
4311
4312 fFrame = fp; // The active stack frame when the engine stopped.
4313 // Contains the capture group results that we need to
4314 // access later.
4315 return;
4316}
4317
4318
4319//--------------------------------------------------------------------------------
4320//
4321// MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4322// assumption that the entire string is available in the UText's
4323// chunk buffer. For now, that means we can use int32_t indexes,
4324// except for anything that needs to be saved (like group starts
4325// and ends).
4326//
4327// startIdx: begin matching a this index.
4328// toEnd: if true, match must extend to end of the input region
4329//
4330//--------------------------------------------------------------------------------
4331void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4332 UBool isMatch = FALSE; // True if the we have a match.
4333
4334 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4335
4336 int32_t op; // Operation from the compiled pattern, split into
4337 int32_t opType; // the opcode
4338 int32_t opValue; // and the operand value.
4339
4340#ifdef REGEX_RUN_DEBUG
4341 if (fTraceDebug) {
4342 printf("MatchAt(startIdx=%d)\n", startIdx);
4343 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4344 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
4345 }
4346#endif
4347
4348 if (U_FAILURE(status)) {
4349 return;
4350 }
4351
4352 // Cache frequently referenced items from the compiled pattern
4353 //
4354 int64_t *pat = fPattern->fCompiledPat->getBuffer();
4355
4356 const UChar *litText = fPattern->fLiteralText.getBuffer();
4357 UVector *fSets = fPattern->fSets;
4358
4359 const UChar *inputBuf = fInputText->chunkContents;
4360
4361 fFrameSize = fPattern->fFrameSize;
4362 REStackFrame *fp = resetStack();
4363 if (U_FAILURE(fDeferredStatus)) {
4364 status = fDeferredStatus;
4365 return;
4366 }
4367
4368 fp->fPatIdx = 0;
4369 fp->fInputIdx = startIdx;
4370
4371 // Zero out the pattern's static data
4372 int32_t i;
4373 for (i = 0; i<fPattern->fDataSize; i++) {
4374 fData[i] = 0;
4375 }
4376
4377 //
4378 // Main loop for interpreting the compiled pattern.
4379 // One iteration of the loop per pattern operation performed.
4380 //
4381 for (;;) {
4382 op = (int32_t)pat[fp->fPatIdx];
4383 opType = URX_TYPE(op);
4384 opValue = URX_VAL(op);
4385#ifdef REGEX_RUN_DEBUG
4386 if (fTraceDebug) {
4387 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4388 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
4389 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4390 fPattern->dumpOp(fp->fPatIdx);
4391 }
4392#endif
4393 fp->fPatIdx++;
4394
4395 switch (opType) {
4396
4397
4398 case URX_NOP:
4399 break;
4400
4401
4402 case URX_BACKTRACK:
4403 // Force a backtrack. In some circumstances, the pattern compiler
4404 // will notice that the pattern can't possibly match anything, and will
4405 // emit one of these at that point.
4406 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4407 break;
4408
4409
4410 case URX_ONECHAR:
4411 if (fp->fInputIdx < fActiveLimit) {
4412 UChar32 c;
4413 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4414 if (c == opValue) {
4415 break;
4416 }
4417 } else {
4418 fHitEnd = TRUE;
4419 }
4420 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4421 break;
4422
4423
4424 case URX_STRING:
4425 {
4426 // Test input against a literal string.
4427 // Strings require two slots in the compiled pattern, one for the
4428 // offset to the string text, and one for the length.
4429 int32_t stringStartIdx = opValue;
4430 int32_t stringLen;
4431
4432 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
4433 fp->fPatIdx++;
4434 opType = URX_TYPE(op);
4435 stringLen = URX_VAL(op);
4436 U_ASSERT(opType == URX_STRING_LEN);
4437 U_ASSERT(stringLen >= 2);
4438
4439 const UChar * pInp = inputBuf + fp->fInputIdx;
4440 const UChar * pInpLimit = inputBuf + fActiveLimit;
4441 const UChar * pPat = litText+stringStartIdx;
4442 const UChar * pEnd = pInp + stringLen;
4443 UBool success = TRUE;
4444 while (pInp < pEnd) {
4445 if (pInp >= pInpLimit) {
4446 fHitEnd = TRUE;
4447 success = FALSE;
4448 break;
4449 }
4450 if (*pInp++ != *pPat++) {
4451 success = FALSE;
4452 break;
4453 }
4454 }
4455
4456 if (success) {
4457 fp->fInputIdx += stringLen;
4458 } else {
4459 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4460 }
4461 }
4462 break;
4463
4464
4465 case URX_STATE_SAVE:
4466 fp = StateSave(fp, opValue, status);
4467 break;
4468
4469
4470 case URX_END:
4471 // The match loop will exit via this path on a successful match,
4472 // when we reach the end of the pattern.
4473 if (toEnd && fp->fInputIdx != fActiveLimit) {
4474 // The pattern matched, but not to the end of input. Try some more.
4475 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4476 break;
4477 }
4478 isMatch = TRUE;
4479 goto breakFromLoop;
4480
4481 // Start and End Capture stack frame variables are laid out out like this:
4482 // fp->fExtra[opValue] - The start of a completed capture group
4483 // opValue+1 - The end of a completed capture group
4484 // opValue+2 - the start of a capture group whose end
4485 // has not yet been reached (and might not ever be).
4486 case URX_START_CAPTURE:
4487 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4488 fp->fExtra[opValue+2] = fp->fInputIdx;
4489 break;
4490
4491
4492 case URX_END_CAPTURE:
4493 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4494 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4495 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4496 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4497 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4498 break;
4499
4500
4501 case URX_DOLLAR: // $, test for End of line
4502 // or for position before new line at end of input
4503 if (fp->fInputIdx < fAnchorLimit-2) {
4504 // We are no where near the end of input. Fail.
4505 // This is the common case. Keep it first.
4506 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4507 break;
4508 }
4509 if (fp->fInputIdx >= fAnchorLimit) {
4510 // We really are at the end of input. Success.
4511 fHitEnd = TRUE;
4512 fRequireEnd = TRUE;
4513 break;
4514 }
4515
4516 // If we are positioned just before a new-line that is located at the
4517 // end of input, succeed.
4518 if (fp->fInputIdx == fAnchorLimit-1) {
4519 UChar32 c;
4520 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4521
4522 if (isLineTerminator(c)) {
4523 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4524 // At new-line at end of input. Success
4525 fHitEnd = TRUE;
4526 fRequireEnd = TRUE;
4527 break;
4528 }
4529 }
4530 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4531 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4532 fHitEnd = TRUE;
4533 fRequireEnd = TRUE;
4534 break; // At CR/LF at end of input. Success
4535 }
4536
4537 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4538
4539 break;
4540
4541
4542 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
4543 if (fp->fInputIdx >= fAnchorLimit-1) {
4544 // Either at the last character of input, or off the end.
4545 if (fp->fInputIdx == fAnchorLimit-1) {
4546 // At last char of input. Success if it's a new line.
4547 if (inputBuf[fp->fInputIdx] == 0x0a) {
4548 fHitEnd = TRUE;
4549 fRequireEnd = TRUE;
4550 break;
4551 }
4552 } else {
4553 // Off the end of input. Success.
4554 fHitEnd = TRUE;
4555 fRequireEnd = TRUE;
4556 break;
4557 }
4558 }
4559
4560 // Not at end of input. Back-track out.
4561 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4562 break;
4563
4564
4565 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4566 {
4567 if (fp->fInputIdx >= fAnchorLimit) {
4568 // We really are at the end of input. Success.
4569 fHitEnd = TRUE;
4570 fRequireEnd = TRUE;
4571 break;
4572 }
4573 // If we are positioned just before a new-line, succeed.
4574 // It makes no difference where the new-line is within the input.
4575 UChar32 c = inputBuf[fp->fInputIdx];
4576 if (isLineTerminator(c)) {
4577 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4578 // In multi-line mode, hitting a new-line just before the end of input does not
4579 // set the hitEnd or requireEnd flags
4580 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4581 break;
4582 }
4583 }
4584 // not at a new line. Fail.
4585 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4586 }
4587 break;
4588
4589
4590 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4591 {
4592 if (fp->fInputIdx >= fAnchorLimit) {
4593 // We really are at the end of input. Success.
4594 fHitEnd = TRUE;
4595 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
4596 break; // adding a new-line would not lose the match.
4597 }
4598 // If we are not positioned just before a new-line, the test fails; backtrack out.
4599 // It makes no difference where the new-line is within the input.
4600 if (inputBuf[fp->fInputIdx] != 0x0a) {
4601 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4602 }
4603 }
4604 break;
4605
4606
4607 case URX_CARET: // ^, test for start of line
4608 if (fp->fInputIdx != fAnchorStart) {
4609 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4610 }
4611 break;
4612
4613
4614 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4615 {
4616 if (fp->fInputIdx == fAnchorStart) {
4617 // We are at the start input. Success.
4618 break;
4619 }
4620 // Check whether character just before the current pos is a new-line
4621 // unless we are at the end of input
4622 UChar c = inputBuf[fp->fInputIdx - 1];
4623 if ((fp->fInputIdx < fAnchorLimit) &&
4624 isLineTerminator(c)) {
4625 // It's a new-line. ^ is true. Success.
4626 // TODO: what should be done with positions between a CR and LF?
4627 break;
4628 }
4629 // Not at the start of a line. Fail.
4630 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4631 }
4632 break;
4633
4634
4635 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4636 {
4637 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4638 if (fp->fInputIdx <= fAnchorStart) {
4639 // We are at the start input. Success.
4640 break;
4641 }
4642 // Check whether character just before the current pos is a new-line
4643 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4644 UChar c = inputBuf[fp->fInputIdx - 1];
4645 if (c != 0x0a) {
4646 // Not at the start of a line. Back-track out.
4647 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4648 }
4649 }
4650 break;
4651
4652 case URX_BACKSLASH_B: // Test for word boundaries
4653 {
4654 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4655 success ^= (UBool)(opValue != 0); // flip sense for \B
4656 if (!success) {
4657 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4658 }
4659 }
4660 break;
4661
4662
4663 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4664 {
4665 UBool success = isUWordBoundary(fp->fInputIdx);
4666 success ^= (UBool)(opValue != 0); // flip sense for \B
4667 if (!success) {
4668 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4669 }
4670 }
4671 break;
4672
4673
4674 case URX_BACKSLASH_D: // Test for decimal digit
4675 {
4676 if (fp->fInputIdx >= fActiveLimit) {
4677 fHitEnd = TRUE;
4678 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4679 break;
4680 }
4681
4682 UChar32 c;
4683 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4684 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
4685 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4686 success ^= (UBool)(opValue != 0); // flip sense for \D
4687 if (!success) {
4688 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4689 }
4690 }
4691 break;
4692
4693
4694 case URX_BACKSLASH_G: // Test for position at end of previous match
4695 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4696 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4697 }
4698 break;
4699
4700
4701 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
4702 {
4703 if (fp->fInputIdx >= fActiveLimit) {
4704 fHitEnd = TRUE;
4705 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4706 break;
4707 }
4708 UChar32 c;
4709 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4710 int8_t ctype = u_charType(c);
4711 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
4712 success ^= (UBool)(opValue != 0); // flip sense for \H
4713 if (!success) {
4714 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4715 }
4716 }
4717 break;
4718
4719
4720 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
4721 {
4722 if (fp->fInputIdx >= fActiveLimit) {
4723 fHitEnd = TRUE;
4724 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4725 break;
4726 }
4727 UChar32 c;
4728 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4729 if (isLineTerminator(c)) {
4730 if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4731 // Check for CR/LF sequence. Consume both together when found.
4732 UChar c2;
4733 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4734 if (c2 != 0x0a) {
4735 U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4736 }
4737 }
4738 } else {
4739 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4740 }
4741 }
4742 break;
4743
4744
4745 case URX_BACKSLASH_V: // Any single code point line ending.
4746 {
4747 if (fp->fInputIdx >= fActiveLimit) {
4748 fHitEnd = TRUE;
4749 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4750 break;
4751 }
4752 UChar32 c;
4753 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4754 UBool success = isLineTerminator(c);
4755 success ^= (UBool)(opValue != 0); // flip sense for \V
4756 if (!success) {
4757 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4758 }
4759 }
4760 break;
4761
4762
4763
4764 case URX_BACKSLASH_X:
4765 // Match a Grapheme, as defined by Unicode TR 29.
4766 // Differs slightly from Perl, which consumes combining marks independently
4767 // of context.
4768 {
4769
4770 // Fail if at end of input
4771 if (fp->fInputIdx >= fActiveLimit) {
4772 fHitEnd = TRUE;
4773 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4774 break;
4775 }
4776
4777 // Examine (and consume) the current char.
4778 // Dispatch into a little state machine, based on the char.
4779 UChar32 c;
4780 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4781 UnicodeSet **sets = fPattern->fStaticSets;
4782 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
4783 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4784 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4785 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4786 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4787 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4788 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4789 goto GC_Extend;
4790
4791
4792
4793GC_L:
4794 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4795 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4796 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4797 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4798 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4799 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4800 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4801 goto GC_Extend;
4802
4803GC_V:
4804 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4805 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4806 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4807 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4808 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4809 goto GC_Extend;
4810
4811GC_T:
4812 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4813 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4814 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4815 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4816 goto GC_Extend;
4817
4818GC_Extend:
4819 // Combining characters are consumed here
4820 for (;;) {
4821 if (fp->fInputIdx >= fActiveLimit) {
4822 break;
4823 }
4824 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4825 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4826 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4827 break;
4828 }
4829 }
4830 goto GC_Done;
4831
4832GC_Control:
4833 // Most control chars stand alone (don't combine with combining chars),
4834 // except for that CR/LF sequence is a single grapheme cluster.
4835 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4836 fp->fInputIdx++;
4837 }
4838
4839GC_Done:
4840 if (fp->fInputIdx >= fActiveLimit) {
4841 fHitEnd = TRUE;
4842 }
4843 break;
4844 }
4845
4846
4847
4848
4849 case URX_BACKSLASH_Z: // Test for end of Input
4850 if (fp->fInputIdx < fAnchorLimit) {
4851 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4852 } else {
4853 fHitEnd = TRUE;
4854 fRequireEnd = TRUE;
4855 }
4856 break;
4857
4858
4859
4860 case URX_STATIC_SETREF:
4861 {
4862 // Test input character against one of the predefined sets
4863 // (Word Characters, for example)
4864 // The high bit of the op value is a flag for the match polarity.
4865 // 0: success if input char is in set.
4866 // 1: success if input char is not in set.
4867 if (fp->fInputIdx >= fActiveLimit) {
4868 fHitEnd = TRUE;
4869 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4870 break;
4871 }
4872
4873 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4874 opValue &= ~URX_NEG_SET;
4875 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4876
4877 UChar32 c;
4878 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4879 if (c < 256) {
4880 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4881 if (s8->contains(c)) {
4882 success = !success;
4883 }
4884 } else {
4885 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4886 if (s->contains(c)) {
4887 success = !success;
4888 }
4889 }
4890 if (!success) {
4891 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4892 }
4893 }
4894 break;
4895
4896
4897 case URX_STAT_SETREF_N:
4898 {
4899 // Test input character for NOT being a member of one of
4900 // the predefined sets (Word Characters, for example)
4901 if (fp->fInputIdx >= fActiveLimit) {
4902 fHitEnd = TRUE;
4903 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4904 break;
4905 }
4906
4907 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4908
4909 UChar32 c;
4910 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4911 if (c < 256) {
4912 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4913 if (s8->contains(c) == FALSE) {
4914 break;
4915 }
4916 } else {
4917 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4918 if (s->contains(c) == FALSE) {
4919 break;
4920 }
4921 }
4922 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4923 }
4924 break;
4925
4926
4927 case URX_SETREF:
4928 {
4929 if (fp->fInputIdx >= fActiveLimit) {
4930 fHitEnd = TRUE;
4931 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4932 break;
4933 }
4934
4935 U_ASSERT(opValue > 0 && opValue < fSets->size());
4936
4937 // There is input left. Pick up one char and test it for set membership.
4938 UChar32 c;
4939 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4940 if (c<256) {
4941 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4942 if (s8->contains(c)) {
4943 // The character is in the set. A Match.
4944 break;
4945 }
4946 } else {
4947 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
4948 if (s->contains(c)) {
4949 // The character is in the set. A Match.
4950 break;
4951 }
4952 }
4953
4954 // the character wasn't in the set.
4955 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4956 }
4957 break;
4958
4959
4960 case URX_DOTANY:
4961 {
4962 // . matches anything, but stops at end-of-line.
4963 if (fp->fInputIdx >= fActiveLimit) {
4964 // At end of input. Match failed. Backtrack out.
4965 fHitEnd = TRUE;
4966 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4967 break;
4968 }
4969
4970 // There is input left. Advance over one char, unless we've hit end-of-line
4971 UChar32 c;
4972 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4973 if (isLineTerminator(c)) {
4974 // End of line in normal mode. . does not match.
4975 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4976 break;
4977 }
4978 }
4979 break;
4980
4981
4982 case URX_DOTANY_ALL:
4983 {
4984 // . in dot-matches-all (including new lines) mode
4985 if (fp->fInputIdx >= fActiveLimit) {
4986 // At end of input. Match failed. Backtrack out.
4987 fHitEnd = TRUE;
4988 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4989 break;
4990 }
4991
4992 // There is input left. Advance over one char, except if we are
4993 // at a cr/lf, advance over both of them.
4994 UChar32 c;
4995 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4996 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4997 // In the case of a CR/LF, we need to advance over both.
4998 if (inputBuf[fp->fInputIdx] == 0x0a) {
4999 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
5000 }
5001 }
5002 }
5003 break;
5004
5005
5006 case URX_DOTANY_UNIX:
5007 {
5008 // '.' operator, matches all, but stops at end-of-line.
5009 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
5010 if (fp->fInputIdx >= fActiveLimit) {
5011 // At end of input. Match failed. Backtrack out.
5012 fHitEnd = TRUE;
5013 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5014 break;
5015 }
5016
5017 // There is input left. Advance over one char, unless we've hit end-of-line
5018 UChar32 c;
5019 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5020 if (c == 0x0a) {
5021 // End of line in normal mode. '.' does not match the \n
5022 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5023 }
5024 }
5025 break;
5026
5027
5028 case URX_JMP:
5029 fp->fPatIdx = opValue;
5030 break;
5031
5032 case URX_FAIL:
5033 isMatch = FALSE;
5034 goto breakFromLoop;
5035
5036 case URX_JMP_SAV:
5037 U_ASSERT(opValue < fPattern->fCompiledPat->size());
5038 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5039 fp->fPatIdx = opValue; // Then JMP.
5040 break;
5041
5042 case URX_JMP_SAV_X:
5043 // This opcode is used with (x)+, when x can match a zero length string.
5044 // Same as JMP_SAV, except conditional on the match having made forward progress.
5045 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5046 // data address of the input position at the start of the loop.
5047 {
5048 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5049 int32_t stoOp = (int32_t)pat[opValue-1];
5050 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5051 int32_t frameLoc = URX_VAL(stoOp);
5052 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5053 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5054 U_ASSERT(prevInputIdx <= fp->fInputIdx);
5055 if (prevInputIdx < fp->fInputIdx) {
5056 // The match did make progress. Repeat the loop.
5057 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5058 fp->fPatIdx = opValue;
5059 fp->fExtra[frameLoc] = fp->fInputIdx;
5060 }
5061 // If the input position did not advance, we do nothing here,
5062 // execution will fall out of the loop.
5063 }
5064 break;
5065
5066 case URX_CTR_INIT:
5067 {
5068 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5069 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5070
5071 // Pick up the three extra operands that CTR_INIT has, and
5072 // skip the pattern location counter past
5073 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5074 fp->fPatIdx += 3;
5075 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5076 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5077 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5078 U_ASSERT(minCount>=0);
5079 U_ASSERT(maxCount>=minCount || maxCount==-1);
5080 U_ASSERT(loopLoc>=fp->fPatIdx);
5081
5082 if (minCount == 0) {
5083 fp = StateSave(fp, loopLoc+1, status);
5084 }
5085 if (maxCount == -1) {
5086 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
5087 } else if (maxCount == 0) {
5088 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5089 }
5090 }
5091 break;
5092
5093 case URX_CTR_LOOP:
5094 {
5095 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5096 int32_t initOp = (int32_t)pat[opValue];
5097 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5098 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5099 int32_t minCount = (int32_t)pat[opValue+2];
5100 int32_t maxCount = (int32_t)pat[opValue+3];
5101 (*pCounter)++;
5102 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5103 U_ASSERT(*pCounter == maxCount);
5104 break;
5105 }
5106 if (*pCounter >= minCount) {
5107 if (maxCount == -1) {
5108 // Loop has no hard upper bound.
5109 // Check that it is progressing through the input, break if it is not.
5110 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5111 if (fp->fInputIdx == *pLastInputIdx) {
5112 break;
5113 } else {
5114 *pLastInputIdx = fp->fInputIdx;
5115 }
5116 }
5117 fp = StateSave(fp, fp->fPatIdx, status);
5118 } else {
5119 // Increment time-out counter. (StateSave() does it if count >= minCount)
5120 fTickCounter--;
5121 if (fTickCounter <= 0) {
5122 IncrementTime(status); // Re-initializes fTickCounter
5123 }
5124 }
5125 fp->fPatIdx = opValue + 4; // Loop back.
5126 }
5127 break;
5128
5129 case URX_CTR_INIT_NG:
5130 {
5131 // Initialize a non-greedy loop
5132 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5133 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5134
5135 // Pick up the three extra operands that CTR_INIT_NG has, and
5136 // skip the pattern location counter past
5137 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5138 fp->fPatIdx += 3;
5139 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5140 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5141 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5142 U_ASSERT(minCount>=0);
5143 U_ASSERT(maxCount>=minCount || maxCount==-1);
5144 U_ASSERT(loopLoc>fp->fPatIdx);
5145 if (maxCount == -1) {
5146 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
5147 }
5148
5149 if (minCount == 0) {
5150 if (maxCount != 0) {
5151 fp = StateSave(fp, fp->fPatIdx, status);
5152 }
5153 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
5154 }
5155 }
5156 break;
5157
5158 case URX_CTR_LOOP_NG:
5159 {
5160 // Non-greedy {min, max} loops
5161 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5162 int32_t initOp = (int32_t)pat[opValue];
5163 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5164 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5165 int32_t minCount = (int32_t)pat[opValue+2];
5166 int32_t maxCount = (int32_t)pat[opValue+3];
5167
5168 (*pCounter)++;
5169 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5170 // The loop has matched the maximum permitted number of times.
5171 // Break out of here with no action. Matching will
5172 // continue with the following pattern.
5173 U_ASSERT(*pCounter == maxCount);
5174 break;
5175 }
5176
5177 if (*pCounter < minCount) {
5178 // We haven't met the minimum number of matches yet.
5179 // Loop back for another one.
5180 fp->fPatIdx = opValue + 4; // Loop back.
5181 fTickCounter--;
5182 if (fTickCounter <= 0) {
5183 IncrementTime(status); // Re-initializes fTickCounter
5184 }
5185 } else {
5186 // We do have the minimum number of matches.
5187
5188 // If there is no upper bound on the loop iterations, check that the input index
5189 // is progressing, and stop the loop if it is not.
5190 if (maxCount == -1) {
5191 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5192 if (fp->fInputIdx == *pLastInputIdx) {
5193 break;
5194 }
5195 *pLastInputIdx = fp->fInputIdx;
5196 }
5197
5198 // Loop Continuation: we will fall into the pattern following the loop
5199 // (non-greedy, don't execute loop body first), but first do
5200 // a state save to the top of the loop, so that a match failure
5201 // in the following pattern will try another iteration of the loop.
5202 fp = StateSave(fp, opValue + 4, status);
5203 }
5204 }
5205 break;
5206
5207 case URX_STO_SP:
5208 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5209 fData[opValue] = fStack->size();
5210 break;
5211
5212 case URX_LD_SP:
5213 {
5214 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5215 int32_t newStackSize = (int32_t)fData[opValue];
5216 U_ASSERT(newStackSize <= fStack->size());
5217 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5218 if (newFP == (int64_t *)fp) {
5219 break;
5220 }
5221 int32_t j;
5222 for (j=0; j<fFrameSize; j++) {
5223 newFP[j] = ((int64_t *)fp)[j];
5224 }
5225 fp = (REStackFrame *)newFP;
5226 fStack->setSize(newStackSize);
5227 }
5228 break;
5229
5230 case URX_BACKREF:
5231 {
5232 U_ASSERT(opValue < fFrameSize);
5233 int64_t groupStartIdx = fp->fExtra[opValue];
5234 int64_t groupEndIdx = fp->fExtra[opValue+1];
5235 U_ASSERT(groupStartIdx <= groupEndIdx);
5236 int64_t inputIndex = fp->fInputIdx;
5237 if (groupStartIdx < 0) {
5238 // This capture group has not participated in the match thus far,
5239 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5240 break;
5241 }
5242 UBool success = TRUE;
5243 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5244 if (inputIndex >= fActiveLimit) {
5245 success = FALSE;
5246 fHitEnd = TRUE;
5247 break;
5248 }
5249 if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5250 success = FALSE;
5251 break;
5252 }
5253 }
5254 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
5255 inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
5256 // Capture group ended with an unpaired lead surrogate.
5257 // Back reference is not permitted to match lead only of a surrogatge pair.
5258 success = FALSE;
5259 }
5260 if (success) {
5261 fp->fInputIdx = inputIndex;
5262 } else {
5263 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5264 }
5265 }
5266 break;
5267
5268 case URX_BACKREF_I:
5269 {
5270 U_ASSERT(opValue < fFrameSize);
5271 int64_t groupStartIdx = fp->fExtra[opValue];
5272 int64_t groupEndIdx = fp->fExtra[opValue+1];
5273 U_ASSERT(groupStartIdx <= groupEndIdx);
5274 if (groupStartIdx < 0) {
5275 // This capture group has not participated in the match thus far,
5276 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5277 break;
5278 }
5279 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5280 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5281
5282 // Note: if the capture group match was of an empty string the backref
5283 // match succeeds. Verified by testing: Perl matches succeed
5284 // in this case, so we do too.
5285
5286 UBool success = TRUE;
5287 for (;;) {
5288 UChar32 captureGroupChar = captureGroupItr.next();
5289 if (captureGroupChar == U_SENTINEL) {
5290 success = TRUE;
5291 break;
5292 }
5293 UChar32 inputChar = inputItr.next();
5294 if (inputChar == U_SENTINEL) {
5295 success = FALSE;
5296 fHitEnd = TRUE;
5297 break;
5298 }
5299 if (inputChar != captureGroupChar) {
5300 success = FALSE;
5301 break;
5302 }
5303 }
5304
5305 if (success && inputItr.inExpansion()) {
5306 // We otained a match by consuming part of a string obtained from
5307 // case-folding a single code point of the input text.
5308 // This does not count as an overall match.
5309 success = FALSE;
5310 }
5311
5312 if (success) {
5313 fp->fInputIdx = inputItr.getIndex();
5314 } else {
5315 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5316 }
5317 }
5318 break;
5319
5320 case URX_STO_INP_LOC:
5321 {
5322 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5323 fp->fExtra[opValue] = fp->fInputIdx;
5324 }
5325 break;
5326
5327 case URX_JMPX:
5328 {
5329 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5330 fp->fPatIdx += 1;
5331 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
5332 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5333 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5334 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5335 if (savedInputIdx < fp->fInputIdx) {
5336 fp->fPatIdx = opValue; // JMP
5337 } else {
5338 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
5339 }
5340 }
5341 break;
5342
5343 case URX_LA_START:
5344 {
5345 // Entering a look around block.
5346 // Save Stack Ptr, Input Pos.
5347 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5348 fData[opValue] = fStack->size();
5349 fData[opValue+1] = fp->fInputIdx;
5350 fData[opValue+2] = fActiveStart;
5351 fData[opValue+3] = fActiveLimit;
5352 fActiveStart = fLookStart; // Set the match region change for
5353 fActiveLimit = fLookLimit; // transparent bounds.
5354 }
5355 break;
5356
5357 case URX_LA_END:
5358 {
5359 // Leaving a look around block.
5360 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5361 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5362 int32_t stackSize = fStack->size();
5363 int32_t newStackSize = (int32_t)fData[opValue];
5364 U_ASSERT(stackSize >= newStackSize);
5365 if (stackSize > newStackSize) {
5366 // Copy the current top frame back to the new (cut back) top frame.
5367 // This makes the capture groups from within the look-ahead
5368 // expression available.
5369 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5370 int32_t j;
5371 for (j=0; j<fFrameSize; j++) {
5372 newFP[j] = ((int64_t *)fp)[j];
5373 }
5374 fp = (REStackFrame *)newFP;
5375 fStack->setSize(newStackSize);
5376 }
5377 fp->fInputIdx = fData[opValue+1];
5378
5379 // Restore the active region bounds in the input string; they may have
5380 // been changed because of transparent bounds on a Region.
5381 fActiveStart = fData[opValue+2];
5382 fActiveLimit = fData[opValue+3];
5383 U_ASSERT(fActiveStart >= 0);
5384 U_ASSERT(fActiveLimit <= fInputLength);
5385 }
5386 break;
5387
5388 case URX_ONECHAR_I:
5389 if (fp->fInputIdx < fActiveLimit) {
5390 UChar32 c;
5391 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5392 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5393 break;
5394 }
5395 } else {
5396 fHitEnd = TRUE;
5397 }
5398 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5399 break;
5400
5401 case URX_STRING_I:
5402 // Case-insensitive test input against a literal string.
5403 // Strings require two slots in the compiled pattern, one for the
5404 // offset to the string text, and one for the length.
5405 // The compiled string has already been case folded.
5406 {
5407 const UChar *patternString = litText + opValue;
5408
5409 op = (int32_t)pat[fp->fPatIdx];
5410 fp->fPatIdx++;
5411 opType = URX_TYPE(op);
5412 opValue = URX_VAL(op);
5413 U_ASSERT(opType == URX_STRING_LEN);
5414 int32_t patternStringLen = opValue; // Length of the string from the pattern.
5415
5416 UChar32 cText;
5417 UChar32 cPattern;
5418 UBool success = TRUE;
5419 int32_t patternStringIdx = 0;
5420 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5421 while (patternStringIdx < patternStringLen) {
5422 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5423 cText = inputIterator.next();
5424 if (cText != cPattern) {
5425 success = FALSE;
5426 if (cText == U_SENTINEL) {
5427 fHitEnd = TRUE;
5428 }
5429 break;
5430 }
5431 }
5432 if (inputIterator.inExpansion()) {
5433 success = FALSE;
5434 }
5435
5436 if (success) {
5437 fp->fInputIdx = inputIterator.getIndex();
5438 } else {
5439 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5440 }
5441 }
5442 break;
5443
5444 case URX_LB_START:
5445 {
5446 // Entering a look-behind block.
5447 // Save Stack Ptr, Input Pos and active input region.
5448 // TODO: implement transparent bounds. Ticket #6067
5449 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5450 fData[opValue] = fStack->size();
5451 fData[opValue+1] = fp->fInputIdx;
5452 // Save input string length, then reset to pin any matches to end at
5453 // the current position.
5454 fData[opValue+2] = fActiveStart;
5455 fData[opValue+3] = fActiveLimit;
5456 fActiveStart = fRegionStart;
5457 fActiveLimit = fp->fInputIdx;
5458 // Init the variable containing the start index for attempted matches.
5459 fData[opValue+4] = -1;
5460 }
5461 break;
5462
5463
5464 case URX_LB_CONT:
5465 {
5466 // Positive Look-Behind, at top of loop checking for matches of LB expression
5467 // at all possible input starting positions.
5468
5469 // Fetch the min and max possible match lengths. They are the operands
5470 // of this op in the pattern.
5471 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5472 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5473 U_ASSERT(minML <= maxML);
5474 U_ASSERT(minML >= 0);
5475
5476 // Fetch (from data) the last input index where a match was attempted.
5477 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5478 int64_t &lbStartIdx = fData[opValue+4];
5479 if (lbStartIdx < 0) {
5480 // First time through loop.
5481 lbStartIdx = fp->fInputIdx - minML;
5482 if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5483 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5484 }
5485 } else {
5486 // 2nd through nth time through the loop.
5487 // Back up start position for match by one.
5488 if (lbStartIdx == 0) {
5489 lbStartIdx--;
5490 } else {
5491 U16_BACK_1(inputBuf, 0, lbStartIdx);
5492 }
5493 }
5494
5495 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5496 // We have tried all potential match starting points without
5497 // getting a match. Backtrack out, and out of the
5498 // Look Behind altogether.
5499 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5500 fActiveStart = fData[opValue+2];
5501 fActiveLimit = fData[opValue+3];
5502 U_ASSERT(fActiveStart >= 0);
5503 U_ASSERT(fActiveLimit <= fInputLength);
5504 break;
5505 }
5506
5507 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5508 // (successful match will fall off the end of the loop.)
5509 fp = StateSave(fp, fp->fPatIdx-3, status);
5510 fp->fInputIdx = lbStartIdx;
5511 }
5512 break;
5513
5514 case URX_LB_END:
5515 // End of a look-behind block, after a successful match.
5516 {
5517 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5518 if (fp->fInputIdx != fActiveLimit) {
5519 // The look-behind expression matched, but the match did not
5520 // extend all the way to the point that we are looking behind from.
5521 // FAIL out of here, which will take us back to the LB_CONT, which
5522 // will retry the match starting at another position or fail
5523 // the look-behind altogether, whichever is appropriate.
5524 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5525 break;
5526 }
5527
5528 // Look-behind match is good. Restore the orignal input string region,
5529 // which had been truncated to pin the end of the lookbehind match to the
5530 // position being looked-behind.
5531 fActiveStart = fData[opValue+2];
5532 fActiveLimit = fData[opValue+3];
5533 U_ASSERT(fActiveStart >= 0);
5534 U_ASSERT(fActiveLimit <= fInputLength);
5535 }
5536 break;
5537
5538
5539 case URX_LBN_CONT:
5540 {
5541 // Negative Look-Behind, at top of loop checking for matches of LB expression
5542 // at all possible input starting positions.
5543
5544 // Fetch the extra parameters of this op.
5545 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5546 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5547 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5548 continueLoc = URX_VAL(continueLoc);
5549 U_ASSERT(minML <= maxML);
5550 U_ASSERT(minML >= 0);
5551 U_ASSERT(continueLoc > fp->fPatIdx);
5552
5553 // Fetch (from data) the last input index where a match was attempted.
5554 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5555 int64_t &lbStartIdx = fData[opValue+4];
5556 if (lbStartIdx < 0) {
5557 // First time through loop.
5558 lbStartIdx = fp->fInputIdx - minML;
5559 if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5560 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5561 }
5562 } else {
5563 // 2nd through nth time through the loop.
5564 // Back up start position for match by one.
5565 if (lbStartIdx == 0) {
5566 lbStartIdx--; // Because U16_BACK is unsafe starting at 0.
5567 } else {
5568 U16_BACK_1(inputBuf, 0, lbStartIdx);
5569 }
5570 }
5571
5572 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5573 // We have tried all potential match starting points without
5574 // getting a match, which means that the negative lookbehind as
5575 // a whole has succeeded. Jump forward to the continue location
5576 fActiveStart = fData[opValue+2];
5577 fActiveLimit = fData[opValue+3];
5578 U_ASSERT(fActiveStart >= 0);
5579 U_ASSERT(fActiveLimit <= fInputLength);
5580 fp->fPatIdx = continueLoc;
5581 break;
5582 }
5583
5584 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5585 // (successful match will cause a FAIL out of the loop altogether.)
5586 fp = StateSave(fp, fp->fPatIdx-4, status);
5587 fp->fInputIdx = lbStartIdx;
5588 }
5589 break;
5590
5591 case URX_LBN_END:
5592 // End of a negative look-behind block, after a successful match.
5593 {
5594 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5595 if (fp->fInputIdx != fActiveLimit) {
5596 // The look-behind expression matched, but the match did not
5597 // extend all the way to the point that we are looking behind from.
5598 // FAIL out of here, which will take us back to the LB_CONT, which
5599 // will retry the match starting at another position or succeed
5600 // the look-behind altogether, whichever is appropriate.
5601 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5602 break;
5603 }
5604
5605 // Look-behind expression matched, which means look-behind test as
5606 // a whole Fails
5607
5608 // Restore the orignal input string length, which had been truncated
5609 // inorder to pin the end of the lookbehind match
5610 // to the position being looked-behind.
5611 fActiveStart = fData[opValue+2];
5612 fActiveLimit = fData[opValue+3];
5613 U_ASSERT(fActiveStart >= 0);
5614 U_ASSERT(fActiveLimit <= fInputLength);
5615
5616 // Restore original stack position, discarding any state saved
5617 // by the successful pattern match.
5618 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5619 int32_t newStackSize = (int32_t)fData[opValue];
5620 U_ASSERT(fStack->size() > newStackSize);
5621 fStack->setSize(newStackSize);
5622
5623 // FAIL, which will take control back to someplace
5624 // prior to entering the look-behind test.
5625 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5626 }
5627 break;
5628
5629
5630 case URX_LOOP_SR_I:
5631 // Loop Initialization for the optimized implementation of
5632 // [some character set]*
5633 // This op scans through all matching input.
5634 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5635 {
5636 U_ASSERT(opValue > 0 && opValue < fSets->size());
5637 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5638 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
5639
5640 // Loop through input, until either the input is exhausted or
5641 // we reach a character that is not a member of the set.
5642 int32_t ix = (int32_t)fp->fInputIdx;
5643 for (;;) {
5644 if (ix >= fActiveLimit) {
5645 fHitEnd = TRUE;
5646 break;
5647 }
5648 UChar32 c;
5649 U16_NEXT(inputBuf, ix, fActiveLimit, c);
5650 if (c<256) {
5651 if (s8->contains(c) == FALSE) {
5652 U16_BACK_1(inputBuf, 0, ix);
5653 break;
5654 }
5655 } else {
5656 if (s->contains(c) == FALSE) {
5657 U16_BACK_1(inputBuf, 0, ix);
5658 break;
5659 }
5660 }
5661 }
5662
5663 // If there were no matching characters, skip over the loop altogether.
5664 // The loop doesn't run at all, a * op always succeeds.
5665 if (ix == fp->fInputIdx) {
5666 fp->fPatIdx++; // skip the URX_LOOP_C op.
5667 break;
5668 }
5669
5670 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5671 // must follow. It's operand is the stack location
5672 // that holds the starting input index for the match of this [set]*
5673 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5674 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5675 int32_t stackLoc = URX_VAL(loopcOp);
5676 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5677 fp->fExtra[stackLoc] = fp->fInputIdx;
5678 fp->fInputIdx = ix;
5679
5680 // Save State to the URX_LOOP_C op that follows this one,
5681 // so that match failures in the following code will return to there.
5682 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5683 fp = StateSave(fp, fp->fPatIdx, status);
5684 fp->fPatIdx++;
5685 }
5686 break;
5687
5688
5689 case URX_LOOP_DOT_I:
5690 // Loop Initialization for the optimized implementation of .*
5691 // This op scans through all remaining input.
5692 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5693 {
5694 // Loop through input until the input is exhausted (we reach an end-of-line)
5695 // In DOTALL mode, we can just go straight to the end of the input.
5696 int32_t ix;
5697 if ((opValue & 1) == 1) {
5698 // Dot-matches-All mode. Jump straight to the end of the string.
5699 ix = (int32_t)fActiveLimit;
5700 fHitEnd = TRUE;
5701 } else {
5702 // NOT DOT ALL mode. Line endings do not match '.'
5703 // Scan forward until a line ending or end of input.
5704 ix = (int32_t)fp->fInputIdx;
5705 for (;;) {
5706 if (ix >= fActiveLimit) {
5707 fHitEnd = TRUE;
5708 break;
5709 }
5710 UChar32 c;
5711 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
5712 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5713 if ((c == 0x0a) || // 0x0a is newline in both modes.
5714 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
5715 isLineTerminator(c))) {
5716 // char is a line ending. Put the input pos back to the
5717 // line ending char, and exit the scanning loop.
5718 U16_BACK_1(inputBuf, 0, ix);
5719 break;
5720 }
5721 }
5722 }
5723 }
5724
5725 // If there were no matching characters, skip over the loop altogether.
5726 // The loop doesn't run at all, a * op always succeeds.
5727 if (ix == fp->fInputIdx) {
5728 fp->fPatIdx++; // skip the URX_LOOP_C op.
5729 break;
5730 }
5731
5732 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5733 // must follow. It's operand is the stack location
5734 // that holds the starting input index for the match of this .*
5735 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5736 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5737 int32_t stackLoc = URX_VAL(loopcOp);
5738 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5739 fp->fExtra[stackLoc] = fp->fInputIdx;
5740 fp->fInputIdx = ix;
5741
5742 // Save State to the URX_LOOP_C op that follows this one,
5743 // so that match failures in the following code will return to there.
5744 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5745 fp = StateSave(fp, fp->fPatIdx, status);
5746 fp->fPatIdx++;
5747 }
5748 break;
5749
5750
5751 case URX_LOOP_C:
5752 {
5753 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5754 backSearchIndex = (int32_t)fp->fExtra[opValue];
5755 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5756 if (backSearchIndex == fp->fInputIdx) {
5757 // We've backed up the input idx to the point that the loop started.
5758 // The loop is done. Leave here without saving state.
5759 // Subsequent failures won't come back here.
5760 break;
5761 }
5762 // Set up for the next iteration of the loop, with input index
5763 // backed up by one from the last time through,
5764 // and a state save to this instruction in case the following code fails again.
5765 // (We're going backwards because this loop emulates stack unwinding, not
5766 // the initial scan forward.)
5767 U_ASSERT(fp->fInputIdx > 0);
5768 UChar32 prevC;
5769 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5770
5771 if (prevC == 0x0a &&
5772 fp->fInputIdx > backSearchIndex &&
5773 inputBuf[fp->fInputIdx-1] == 0x0d) {
5774 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5775 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5776 // .*, stepping back over CRLF pair.
5777 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5778 }
5779 }
5780
5781
5782 fp = StateSave(fp, fp->fPatIdx-1, status);
5783 }
5784 break;
5785
5786
5787
5788 default:
5789 // Trouble. The compiled pattern contains an entry with an
5790 // unrecognized type tag.
5791 UPRV_UNREACHABLE;
5792 }
5793
5794 if (U_FAILURE(status)) {
5795 isMatch = FALSE;
5796 break;
5797 }
5798 }
5799
5800breakFromLoop:
5801 fMatch = isMatch;
5802 if (isMatch) {
5803 fLastMatchEnd = fMatchEnd;
5804 fMatchStart = startIdx;
5805 fMatchEnd = fp->fInputIdx;
5806 }
5807
5808#ifdef REGEX_RUN_DEBUG
5809 if (fTraceDebug) {
5810 if (isMatch) {
5811 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
5812 } else {
5813 printf("No match\n\n");
5814 }
5815 }
5816#endif
5817
5818 fFrame = fp; // The active stack frame when the engine stopped.
5819 // Contains the capture group results that we need to
5820 // access later.
5821
5822 return;
5823}
5824
5825
5826UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5827
5828U_NAMESPACE_END
5829
5830#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
5831
5832