1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3//
4// file: regexcmp.cpp
5//
6// Copyright (C) 2002-2016 International Business Machines Corporation and others.
7// All Rights Reserved.
8//
9// This file contains the ICU regular expression compiler, which is responsible
10// for processing a regular expression pattern into the compiled form that
11// is used by the match finding engine.
12//
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_REGULAR_EXPRESSIONS
17
18#include "unicode/ustring.h"
19#include "unicode/unistr.h"
20#include "unicode/uniset.h"
21#include "unicode/uchar.h"
22#include "unicode/uchriter.h"
23#include "unicode/parsepos.h"
24#include "unicode/parseerr.h"
25#include "unicode/regex.h"
26#include "unicode/utf.h"
27#include "unicode/utf16.h"
28#include "patternprops.h"
29#include "putilimp.h"
30#include "cmemory.h"
31#include "cstr.h"
32#include "cstring.h"
33#include "uvectr32.h"
34#include "uvectr64.h"
35#include "uassert.h"
36#include "uinvchar.h"
37
38#include "regeximp.h"
39#include "regexcst.h" // Contains state table for the regex pattern parser.
40 // generated by a Perl script.
41#include "regexcmp.h"
42#include "regexst.h"
43#include "regextxt.h"
44
45
46
47U_NAMESPACE_BEGIN
48
49
50//------------------------------------------------------------------------------
51//
52// Constructor.
53//
54//------------------------------------------------------------------------------
55RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
56 fParenStack(status), fSetStack(status), fSetOpStack(status)
57{
58 // Lazy init of all shared global sets (needed for init()'s empty text)
59 RegexStaticSets::initGlobals(&status);
60
61 fStatus = &status;
62
63 fRXPat = rxp;
64 fScanIndex = 0;
65 fLastChar = -1;
66 fPeekChar = -1;
67 fLineNum = 1;
68 fCharNum = 0;
69 fQuoteMode = FALSE;
70 fInBackslashQuote = FALSE;
71 fModeFlags = fRXPat->fFlags | 0x80000000;
72 fEOLComments = TRUE;
73
74 fMatchOpenParen = -1;
75 fMatchCloseParen = -1;
76 fCaptureName = NULL;
77 fLastSetLiteral = U_SENTINEL;
78
79 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
80 status = rxp->fDeferredStatus;
81 }
82}
83
84static const UChar chAmp = 0x26; // '&'
85static const UChar chDash = 0x2d; // '-'
86
87
88//------------------------------------------------------------------------------
89//
90// Destructor
91//
92//------------------------------------------------------------------------------
93RegexCompile::~RegexCompile() {
94 delete fCaptureName; // Normally will be NULL, but can exist if pattern
95 // compilation stops with a syntax error.
96}
97
98static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
99 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
100}
101
102//------------------------------------------------------------------------------
103//
104// Compile regex pattern. The state machine for rexexp pattern parsing is here.
105// The state tables are hand-written in the file regexcst.txt,
106// and converted to the form used here by a perl
107// script regexcst.pl
108//
109//------------------------------------------------------------------------------
110void RegexCompile::compile(
111 const UnicodeString &pat, // Source pat to be compiled.
112 UParseError &pp, // Error position info
113 UErrorCode &e) // Error Code
114{
115 fRXPat->fPatternString = new UnicodeString(pat);
116 UText patternText = UTEXT_INITIALIZER;
117 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
118
119 if (U_SUCCESS(e)) {
120 compile(&patternText, pp, e);
121 utext_close(&patternText);
122 }
123}
124
125//
126// compile, UText mode
127// All the work is actually done here.
128//
129void RegexCompile::compile(
130 UText *pat, // Source pat to be compiled.
131 UParseError &pp, // Error position info
132 UErrorCode &e) // Error Code
133{
134 fStatus = &e;
135 fParseErr = &pp;
136 fStackPtr = 0;
137 fStack[fStackPtr] = 0;
138
139 if (U_FAILURE(*fStatus)) {
140 return;
141 }
142
143 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
144 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
145
146 // Prepare the RegexPattern object to receive the compiled pattern.
147 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
148 if (U_FAILURE(*fStatus)) {
149 return;
150 }
151 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets;
152 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8;
153
154
155 // Initialize the pattern scanning state machine
156 fPatternLength = utext_nativeLength(pat);
157 uint16_t state = 1;
158 const RegexTableEl *tableEl;
159
160 // UREGEX_LITERAL force entire pattern to be treated as a literal string.
161 if (fModeFlags & UREGEX_LITERAL) {
162 fQuoteMode = TRUE;
163 }
164
165 nextChar(fC); // Fetch the first char from the pattern string.
166
167 //
168 // Main loop for the regex pattern parsing state machine.
169 // Runs once per state transition.
170 // Each time through optionally performs, depending on the state table,
171 // - an advance to the the next pattern char
172 // - an action to be performed.
173 // - pushing or popping a state to/from the local state return stack.
174 // file regexcst.txt is the source for the state table. The logic behind
175 // recongizing the pattern syntax is there, not here.
176 //
177 for (;;) {
178 // Bail out if anything has gone wrong.
179 // Regex pattern parsing stops on the first error encountered.
180 if (U_FAILURE(*fStatus)) {
181 break;
182 }
183
184 U_ASSERT(state != 0);
185
186 // Find the state table element that matches the input char from the pattern, or the
187 // class of the input character. Start with the first table row for this
188 // state, then linearly scan forward until we find a row that matches the
189 // character. The last row for each state always matches all characters, so
190 // the search will stop there, if not before.
191 //
192 tableEl = &gRuleParseStateTable[state];
193 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
194 fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
195
196 for (;;) { // loop through table rows belonging to this state, looking for one
197 // that matches the current input char.
198 REGEX_SCAN_DEBUG_PRINTF(("."));
199 if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
200 // Table row specified an individual character, not a set, and
201 // the input character is not quoted, and
202 // the input character matched it.
203 break;
204 }
205 if (tableEl->fCharClass == 255) {
206 // Table row specified default, match anything character class.
207 break;
208 }
209 if (tableEl->fCharClass == 254 && fC.fQuoted) {
210 // Table row specified "quoted" and the char was quoted.
211 break;
212 }
213 if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) {
214 // Table row specified eof and we hit eof on the input.
215 break;
216 }
217
218 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
219 fC.fQuoted == FALSE && // char is not escaped &&
220 fC.fChar != (UChar32)-1) { // char is not EOF
221 U_ASSERT(tableEl->fCharClass <= 137);
222 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
223 // Table row specified a character class, or set of characters,
224 // and the current char matches it.
225 break;
226 }
227 }
228
229 // No match on this row, advance to the next row for this state,
230 tableEl++;
231 }
232 REGEX_SCAN_DEBUG_PRINTF(("\n"));
233
234 //
235 // We've found the row of the state table that matches the current input
236 // character from the rules string.
237 // Perform any action specified by this row in the state table.
238 if (doParseActions(tableEl->fAction) == FALSE) {
239 // Break out of the state machine loop if the
240 // the action signalled some kind of error, or
241 // the action was to exit, occurs on normal end-of-rules-input.
242 break;
243 }
244
245 if (tableEl->fPushState != 0) {
246 fStackPtr++;
247 if (fStackPtr >= kStackSize) {
248 error(U_REGEX_INTERNAL_ERROR);
249 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
250 fStackPtr--;
251 }
252 fStack[fStackPtr] = tableEl->fPushState;
253 }
254
255 //
256 // NextChar. This is where characters are actually fetched from the pattern.
257 // Happens under control of the 'n' tag in the state table.
258 //
259 if (tableEl->fNextChar) {
260 nextChar(fC);
261 }
262
263 // Get the next state from the table entry, or from the
264 // state stack if the next state was specified as "pop".
265 if (tableEl->fNextState != 255) {
266 state = tableEl->fNextState;
267 } else {
268 state = fStack[fStackPtr];
269 fStackPtr--;
270 if (fStackPtr < 0) {
271 // state stack underflow
272 // This will occur if the user pattern has mis-matched parentheses,
273 // with extra close parens.
274 //
275 fStackPtr++;
276 error(U_REGEX_MISMATCHED_PAREN);
277 }
278 }
279
280 }
281
282 if (U_FAILURE(*fStatus)) {
283 // Bail out if the pattern had errors.
284 // Set stack cleanup: a successful compile would have left it empty,
285 // but errors can leave temporary sets hanging around.
286 while (!fSetStack.empty()) {
287 delete (UnicodeSet *)fSetStack.pop();
288 }
289 return;
290 }
291
292 //
293 // The pattern has now been read and processed, and the compiled code generated.
294 //
295
296 //
297 // The pattern's fFrameSize so far has accumulated the requirements for
298 // storage for capture parentheses, counters, etc. that are encountered
299 // in the pattern. Add space for the two variables that are always
300 // present in the saved state: the input string position (int64_t) and
301 // the position in the compiled pattern.
302 //
303 allocateStackData(RESTACKFRAME_HDRCOUNT);
304
305 //
306 // Optimization pass 1: NOPs, back-references, and case-folding
307 //
308 stripNOPs();
309
310 //
311 // Get bounds for the minimum and maximum length of a string that this
312 // pattern can match. Used to avoid looking for matches in strings that
313 // are too short.
314 //
315 fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
316
317 //
318 // Optimization pass 2: match start type
319 //
320 matchStartType();
321
322 //
323 // Set up fast latin-1 range sets
324 //
325 int32_t numSets = fRXPat->fSets->size();
326 fRXPat->fSets8 = new Regex8BitSet[numSets];
327 // Null pointer check.
328 if (fRXPat->fSets8 == NULL) {
329 e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
330 return;
331 }
332 int32_t i;
333 for (i=0; i<numSets; i++) {
334 UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
335 fRXPat->fSets8[i].init(s);
336 }
337
338}
339
340
341
342
343
344//------------------------------------------------------------------------------
345//
346// doParseAction Do some action during regex pattern parsing.
347// Called by the parse state machine.
348//
349// Generation of the match engine PCode happens here, or
350// in functions called from the parse actions defined here.
351//
352//
353//------------------------------------------------------------------------------
354UBool RegexCompile::doParseActions(int32_t action)
355{
356 UBool returnVal = TRUE;
357
358 switch ((Regex_PatternParseAction)action) {
359
360 case doPatStart:
361 // Start of pattern compiles to:
362 //0 SAVE 2 Fall back to position of FAIL
363 //1 jmp 3
364 //2 FAIL Stop if we ever reach here.
365 //3 NOP Dummy, so start of pattern looks the same as
366 // the start of an ( grouping.
367 //4 NOP Resreved, will be replaced by a save if there are
368 // OR | operators at the top level
369 appendOp(URX_STATE_SAVE, 2);
370 appendOp(URX_JMP, 3);
371 appendOp(URX_FAIL, 0);
372
373 // Standard open nonCapture paren action emits the two NOPs and
374 // sets up the paren stack frame.
375 doParseActions(doOpenNonCaptureParen);
376 break;
377
378 case doPatFinish:
379 // We've scanned to the end of the pattern
380 // The end of pattern compiles to:
381 // URX_END
382 // which will stop the runtime match engine.
383 // Encountering end of pattern also behaves like a close paren,
384 // and forces fixups of the State Save at the beginning of the compiled pattern
385 // and of any OR operations at the top level.
386 //
387 handleCloseParen();
388 if (fParenStack.size() > 0) {
389 // Missing close paren in pattern.
390 error(U_REGEX_MISMATCHED_PAREN);
391 }
392
393 // add the END operation to the compiled pattern.
394 appendOp(URX_END, 0);
395
396 // Terminate the pattern compilation state machine.
397 returnVal = FALSE;
398 break;
399
400
401
402 case doOrOperator:
403 // Scanning a '|', as in (A|B)
404 {
405 // Generate code for any pending literals preceding the '|'
406 fixLiterals(FALSE);
407
408 // Insert a SAVE operation at the start of the pattern section preceding
409 // this OR at this level. This SAVE will branch the match forward
410 // to the right hand side of the OR in the event that the left hand
411 // side fails to match and backtracks. Locate the position for the
412 // save from the location on the top of the parentheses stack.
413 int32_t savePosition = fParenStack.popi();
414 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
415 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location
416 op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
417 fRXPat->fCompiledPat->setElementAt(op, savePosition);
418
419 // Append an JMP operation into the compiled pattern. The operand for
420 // the JMP will eventually be the location following the ')' for the
421 // group. This will be patched in later, when the ')' is encountered.
422 appendOp(URX_JMP, 0);
423
424 // Push the position of the newly added JMP op onto the parentheses stack.
425 // This registers if for fixup when this block's close paren is encountered.
426 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
427
428 // Append a NOP to the compiled pattern. This is the slot reserved
429 // for a SAVE in the event that there is yet another '|' following
430 // this one.
431 appendOp(URX_NOP, 0);
432 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
433 }
434 break;
435
436
437 case doBeginNamedCapture:
438 // Scanning (?<letter.
439 // The first letter of the name will come through again under doConinueNamedCapture.
440 fCaptureName = new UnicodeString();
441 if (fCaptureName == NULL) {
442 error(U_MEMORY_ALLOCATION_ERROR);
443 }
444 break;
445
446 case doContinueNamedCapture:
447 fCaptureName->append(fC.fChar);
448 break;
449
450 case doBadNamedCapture:
451 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
452 break;
453
454 case doOpenCaptureParen:
455 // Open Capturing Paren, possibly named.
456 // Compile to a
457 // - NOP, which later may be replaced by a save-state if the
458 // parenthesized group gets a * quantifier, followed by
459 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
460 // - NOP, which may later be replaced by a save-state if there
461 // is an '|' alternation within the parens.
462 //
463 // Each capture group gets three slots in the save stack frame:
464 // 0: Capture Group start position (in input string being matched.)
465 // 1: Capture Group end position.
466 // 2: Start of Match-in-progress.
467 // The first two locations are for a completed capture group, and are
468 // referred to by back references and the like.
469 // The third location stores the capture start position when an START_CAPTURE is
470 // encountered. This will be promoted to a completed capture when (and if) the corresponding
471 // END_CAPTURE is encountered.
472 {
473 fixLiterals();
474 appendOp(URX_NOP, 0);
475 int32_t varsLoc = allocateStackData(3); // Reserve three slots in match stack frame.
476 appendOp(URX_START_CAPTURE, varsLoc);
477 appendOp(URX_NOP, 0);
478
479 // On the Parentheses stack, start a new frame and add the postions
480 // of the two NOPs. Depending on what follows in the pattern, the
481 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
482 // address of the end of the parenthesized group.
483 fParenStack.push(fModeFlags, *fStatus); // Match mode state
484 fParenStack.push(capturing, *fStatus); // Frame type.
485 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location
486 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
487
488 // Save the mapping from group number to stack frame variable position.
489 fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
490
491 // If this is a named capture group, add the name->group number mapping.
492 if (fCaptureName != NULL) {
493 if (!fRXPat->initNamedCaptureMap()) {
494 if (U_SUCCESS(*fStatus)) {
495 error(fRXPat->fDeferredStatus);
496 }
497 break;
498 }
499 int32_t groupNumber = fRXPat->fGroupMap->size();
500 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
501 fCaptureName = NULL; // hash table takes ownership of the name (key) string.
502 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
503 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
504 }
505 }
506 }
507 break;
508
509 case doOpenNonCaptureParen:
510 // Open non-caputuring (grouping only) Paren.
511 // Compile to a
512 // - NOP, which later may be replaced by a save-state if the
513 // parenthesized group gets a * quantifier, followed by
514 // - NOP, which may later be replaced by a save-state if there
515 // is an '|' alternation within the parens.
516 {
517 fixLiterals();
518 appendOp(URX_NOP, 0);
519 appendOp(URX_NOP, 0);
520
521 // On the Parentheses stack, start a new frame and add the postions
522 // of the two NOPs.
523 fParenStack.push(fModeFlags, *fStatus); // Match mode state
524 fParenStack.push(plain, *fStatus); // Begin a new frame.
525 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
526 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
527 }
528 break;
529
530
531 case doOpenAtomicParen:
532 // Open Atomic Paren. (?>
533 // Compile to a
534 // - NOP, which later may be replaced if the parenthesized group
535 // has a quantifier, followed by
536 // - STO_SP save state stack position, so it can be restored at the ")"
537 // - NOP, which may later be replaced by a save-state if there
538 // is an '|' alternation within the parens.
539 {
540 fixLiterals();
541 appendOp(URX_NOP, 0);
542 int32_t varLoc = allocateData(1); // Reserve a data location for saving the state stack ptr.
543 appendOp(URX_STO_SP, varLoc);
544 appendOp(URX_NOP, 0);
545
546 // On the Parentheses stack, start a new frame and add the postions
547 // of the two NOPs. Depending on what follows in the pattern, the
548 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
549 // address of the end of the parenthesized group.
550 fParenStack.push(fModeFlags, *fStatus); // Match mode state
551 fParenStack.push(atomic, *fStatus); // Frame type.
552 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
553 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
554 }
555 break;
556
557
558 case doOpenLookAhead:
559 // Positive Look-ahead (?= stuff )
560 //
561 // Note: Addition of transparent input regions, with the need to
562 // restore the original regions when failing out of a lookahead
563 // block, complicated this sequence. Some conbined opcodes
564 // might make sense - or might not, lookahead aren't that common.
565 //
566 // Caution: min match length optimization knows about this
567 // sequence; don't change without making updates there too.
568 //
569 // Compiles to
570 // 1 LA_START dataLoc Saves SP, Input Pos, Active input region.
571 // 2. STATE_SAVE 4 on failure of lookahead, goto 4
572 // 3 JMP 6 continue ...
573 //
574 // 4. LA_END Look Ahead failed. Restore regions.
575 // 5. BACKTRACK and back track again.
576 //
577 // 6. NOP reserved for use by quantifiers on the block.
578 // Look-ahead can't have quantifiers, but paren stack
579 // compile time conventions require the slot anyhow.
580 // 7. NOP may be replaced if there is are '|' ops in the block.
581 // 8. code for parenthesized stuff.
582 // 9. LA_END
583 //
584 // Four data slots are reserved, for saving state on entry to the look-around
585 // 0: stack pointer on entry.
586 // 1: input position on entry.
587 // 2: fActiveStart, the active bounds start on entry.
588 // 3: fActiveLimit, the active bounds limit on entry.
589 {
590 fixLiterals();
591 int32_t dataLoc = allocateData(4);
592 appendOp(URX_LA_START, dataLoc);
593 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
594 appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
595 appendOp(URX_LA_END, dataLoc);
596 appendOp(URX_BACKTRACK, 0);
597 appendOp(URX_NOP, 0);
598 appendOp(URX_NOP, 0);
599
600 // On the Parentheses stack, start a new frame and add the postions
601 // of the NOPs.
602 fParenStack.push(fModeFlags, *fStatus); // Match mode state
603 fParenStack.push(lookAhead, *fStatus); // Frame type.
604 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
605 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
606 }
607 break;
608
609 case doOpenLookAheadNeg:
610 // Negated Lookahead. (?! stuff )
611 // Compiles to
612 // 1. LA_START dataloc
613 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
614 // // which continues with the match.
615 // 3. NOP // Std. Open Paren sequence, for possible '|'
616 // 4. code for parenthesized stuff.
617 // 5. LA_END // Cut back stack, remove saved state from step 2.
618 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
619 // 7. END_LA // Restore match region, in case look-ahead was using
620 // an alternate (transparent) region.
621 // Four data slots are reserved, for saving state on entry to the look-around
622 // 0: stack pointer on entry.
623 // 1: input position on entry.
624 // 2: fActiveStart, the active bounds start on entry.
625 // 3: fActiveLimit, the active bounds limit on entry.
626 {
627 fixLiterals();
628 int32_t dataLoc = allocateData(4);
629 appendOp(URX_LA_START, dataLoc);
630 appendOp(URX_STATE_SAVE, 0); // dest address will be patched later.
631 appendOp(URX_NOP, 0);
632
633 // On the Parentheses stack, start a new frame and add the postions
634 // of the StateSave and NOP.
635 fParenStack.push(fModeFlags, *fStatus); // Match mode state
636 fParenStack.push(negLookAhead, *fStatus); // Frame type
637 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location
638 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
639
640 // Instructions #5 - #7 will be added when the ')' is encountered.
641 }
642 break;
643
644 case doOpenLookBehind:
645 {
646 // Compile a (?<= look-behind open paren.
647 //
648 // Compiles to
649 // 0 URX_LB_START dataLoc
650 // 1 URX_LB_CONT dataLoc
651 // 2 MinMatchLen
652 // 3 MaxMatchLen
653 // 4 URX_NOP Standard '(' boilerplate.
654 // 5 URX_NOP Reserved slot for use with '|' ops within (block).
655 // 6 <code for LookBehind expression>
656 // 7 URX_LB_END dataLoc # Check match len, restore input len
657 // 8 URX_LA_END dataLoc # Restore stack, input pos
658 //
659 // Allocate a block of matcher data, to contain (when running a match)
660 // 0: Stack ptr on entry
661 // 1: Input Index on entry
662 // 2: fActiveStart, the active bounds start on entry.
663 // 3: fActiveLimit, the active bounds limit on entry.
664 // 4: Start index of match current match attempt.
665 // The first four items must match the layout of data for LA_START / LA_END
666
667 // Generate match code for any pending literals.
668 fixLiterals();
669
670 // Allocate data space
671 int32_t dataLoc = allocateData(5);
672
673 // Emit URX_LB_START
674 appendOp(URX_LB_START, dataLoc);
675
676 // Emit URX_LB_CONT
677 appendOp(URX_LB_CONT, dataLoc);
678 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
679 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
680
681 // Emit the NOPs
682 appendOp(URX_NOP, 0);
683 appendOp(URX_NOP, 0);
684
685 // On the Parentheses stack, start a new frame and add the postions
686 // of the URX_LB_CONT and the NOP.
687 fParenStack.push(fModeFlags, *fStatus); // Match mode state
688 fParenStack.push(lookBehind, *fStatus); // Frame type
689 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
690 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
691
692 // The final two instructions will be added when the ')' is encountered.
693 }
694
695 break;
696
697 case doOpenLookBehindNeg:
698 {
699 // Compile a (?<! negated look-behind open paren.
700 //
701 // Compiles to
702 // 0 URX_LB_START dataLoc # Save entry stack, input len
703 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions
704 // 2 MinMatchLen
705 // 3 MaxMatchLen
706 // 4 continueLoc (9)
707 // 5 URX_NOP Standard '(' boilerplate.
708 // 6 URX_NOP Reserved slot for use with '|' ops within (block).
709 // 7 <code for LookBehind expression>
710 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
711 // 9 ...
712 //
713 // Allocate a block of matcher data, to contain (when running a match)
714 // 0: Stack ptr on entry
715 // 1: Input Index on entry
716 // 2: fActiveStart, the active bounds start on entry.
717 // 3: fActiveLimit, the active bounds limit on entry.
718 // 4: Start index of match current match attempt.
719 // The first four items must match the layout of data for LA_START / LA_END
720
721 // Generate match code for any pending literals.
722 fixLiterals();
723
724 // Allocate data space
725 int32_t dataLoc = allocateData(5);
726
727 // Emit URX_LB_START
728 appendOp(URX_LB_START, dataLoc);
729
730 // Emit URX_LBN_CONT
731 appendOp(URX_LBN_CONT, dataLoc);
732 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
733 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
734 appendOp(URX_RESERVED_OP, 0); // Continue Loc. To be filled later.
735
736 // Emit the NOPs
737 appendOp(URX_NOP, 0);
738 appendOp(URX_NOP, 0);
739
740 // On the Parentheses stack, start a new frame and add the postions
741 // of the URX_LB_CONT and the NOP.
742 fParenStack.push(fModeFlags, *fStatus); // Match mode state
743 fParenStack.push(lookBehindN, *fStatus); // Frame type
744 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
745 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
746
747 // The final two instructions will be added when the ')' is encountered.
748 }
749 break;
750
751 case doConditionalExpr:
752 // Conditionals such as (?(1)a:b)
753 case doPerlInline:
754 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them.
755 error(U_REGEX_UNIMPLEMENTED);
756 break;
757
758
759 case doCloseParen:
760 handleCloseParen();
761 if (fParenStack.size() <= 0) {
762 // Extra close paren, or missing open paren.
763 error(U_REGEX_MISMATCHED_PAREN);
764 }
765 break;
766
767 case doNOP:
768 break;
769
770
771 case doBadOpenParenType:
772 case doRuleError:
773 error(U_REGEX_RULE_SYNTAX);
774 break;
775
776
777 case doMismatchedParenErr:
778 error(U_REGEX_MISMATCHED_PAREN);
779 break;
780
781 case doPlus:
782 // Normal '+' compiles to
783 // 1. stuff to be repeated (already built)
784 // 2. jmp-sav 1
785 // 3. ...
786 //
787 // Or, if the item to be repeated can match a zero length string,
788 // 1. STO_INP_LOC data-loc
789 // 2. body of stuff to be repeated
790 // 3. JMP_SAV_X 2
791 // 4. ...
792
793 //
794 // Or, if the item to be repeated is simple
795 // 1. Item to be repeated.
796 // 2. LOOP_SR_I set number (assuming repeated item is a set ref)
797 // 3. LOOP_C stack location
798 {
799 int32_t topLoc = blockTopLoc(FALSE); // location of item #1
800 int32_t frameLoc;
801
802 // Check for simple constructs, which may get special optimized code.
803 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
804 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
805
806 if (URX_TYPE(repeatedOp) == URX_SETREF) {
807 // Emit optimized code for [char set]+
808 appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
809 frameLoc = allocateStackData(1);
810 appendOp(URX_LOOP_C, frameLoc);
811 break;
812 }
813
814 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
815 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
816 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
817 // Emit Optimized code for .+ operations.
818 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
819 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
820 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
821 loopOpI |= 1;
822 }
823 if (fModeFlags & UREGEX_UNIX_LINES) {
824 loopOpI |= 2;
825 }
826 appendOp(loopOpI);
827 frameLoc = allocateStackData(1);
828 appendOp(URX_LOOP_C, frameLoc);
829 break;
830 }
831
832 }
833
834 // General case.
835
836 // Check for minimum match length of zero, which requires
837 // extra loop-breaking code.
838 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
839 // Zero length match is possible.
840 // Emit the code sequence that can handle it.
841 insertOp(topLoc);
842 frameLoc = allocateStackData(1);
843
844 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
845 fRXPat->fCompiledPat->setElementAt(op, topLoc);
846
847 appendOp(URX_JMP_SAV_X, topLoc+1);
848 } else {
849 // Simpler code when the repeated body must match something non-empty
850 appendOp(URX_JMP_SAV, topLoc);
851 }
852 }
853 break;
854
855 case doNGPlus:
856 // Non-greedy '+?' compiles to
857 // 1. stuff to be repeated (already built)
858 // 2. state-save 1
859 // 3. ...
860 {
861 int32_t topLoc = blockTopLoc(FALSE);
862 appendOp(URX_STATE_SAVE, topLoc);
863 }
864 break;
865
866
867 case doOpt:
868 // Normal (greedy) ? quantifier.
869 // Compiles to
870 // 1. state save 3
871 // 2. body of optional block
872 // 3. ...
873 // Insert the state save into the compiled pattern, and we're done.
874 {
875 int32_t saveStateLoc = blockTopLoc(TRUE);
876 int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
877 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
878 }
879 break;
880
881 case doNGOpt:
882 // Non-greedy ?? quantifier
883 // compiles to
884 // 1. jmp 4
885 // 2. body of optional block
886 // 3 jmp 5
887 // 4. state save 2
888 // 5 ...
889 // This code is less than ideal, with two jmps instead of one, because we can only
890 // insert one instruction at the top of the block being iterated.
891 {
892 int32_t jmp1_loc = blockTopLoc(TRUE);
893 int32_t jmp2_loc = fRXPat->fCompiledPat->size();
894
895 int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+1);
896 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
897
898 appendOp(URX_JMP, jmp2_loc+2);
899
900 appendOp(URX_STATE_SAVE, jmp1_loc+1);
901 }
902 break;
903
904
905 case doStar:
906 // Normal (greedy) * quantifier.
907 // Compiles to
908 // 1. STATE_SAVE 4
909 // 2. body of stuff being iterated over
910 // 3. JMP_SAV 2
911 // 4. ...
912 //
913 // Or, if the body is a simple [Set],
914 // 1. LOOP_SR_I set number
915 // 2. LOOP_C stack location
916 // ...
917 //
918 // Or if this is a .*
919 // 1. LOOP_DOT_I (. matches all mode flag)
920 // 2. LOOP_C stack location
921 //
922 // Or, if the body can match a zero-length string, to inhibit infinite loops,
923 // 1. STATE_SAVE 5
924 // 2. STO_INP_LOC data-loc
925 // 3. body of stuff
926 // 4. JMP_SAV_X 2
927 // 5. ...
928 {
929 // location of item #1, the STATE_SAVE
930 int32_t topLoc = blockTopLoc(FALSE);
931 int32_t dataLoc = -1;
932
933 // Check for simple *, where the construct being repeated
934 // compiled to single opcode, and might be optimizable.
935 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
936 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
937
938 if (URX_TYPE(repeatedOp) == URX_SETREF) {
939 // Emit optimized code for a [char set]*
940 int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
941 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
942 dataLoc = allocateStackData(1);
943 appendOp(URX_LOOP_C, dataLoc);
944 break;
945 }
946
947 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
948 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
949 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
950 // Emit Optimized code for .* operations.
951 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
952 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
953 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
954 loopOpI |= 1;
955 }
956 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
957 loopOpI |= 2;
958 }
959 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
960 dataLoc = allocateStackData(1);
961 appendOp(URX_LOOP_C, dataLoc);
962 break;
963 }
964 }
965
966 // Emit general case code for this *
967 // The optimizations did not apply.
968
969 int32_t saveStateLoc = blockTopLoc(TRUE);
970 int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+1);
971
972 // Check for minimum match length of zero, which requires
973 // extra loop-breaking code.
974 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
975 insertOp(saveStateLoc);
976 dataLoc = allocateStackData(1);
977
978 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
979 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
980 jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
981 }
982
983 // Locate the position in the compiled pattern where the match will continue
984 // after completing the *. (4 or 5 in the comment above)
985 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
986
987 // Put together the save state op and store it into the compiled code.
988 int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
989 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
990
991 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
992 appendOp(jmpOp);
993 }
994 break;
995
996 case doNGStar:
997 // Non-greedy *? quantifier
998 // compiles to
999 // 1. JMP 3
1000 // 2. body of stuff being iterated over
1001 // 3. STATE_SAVE 2
1002 // 4 ...
1003 {
1004 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
1005 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
1006 int32_t jmpOp = buildOp(URX_JMP, saveLoc);
1007 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1008 appendOp(URX_STATE_SAVE, jmpLoc+1);
1009 }
1010 break;
1011
1012
1013 case doIntervalInit:
1014 // The '{' opening an interval quantifier was just scanned.
1015 // Init the counter varaiables that will accumulate the values as the digits
1016 // are scanned.
1017 fIntervalLow = 0;
1018 fIntervalUpper = -1;
1019 break;
1020
1021 case doIntevalLowerDigit:
1022 // Scanned a digit from the lower value of an {lower,upper} interval
1023 {
1024 int32_t digitValue = u_charDigitValue(fC.fChar);
1025 U_ASSERT(digitValue >= 0);
1026 int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1027 if (val > INT32_MAX) {
1028 error(U_REGEX_NUMBER_TOO_BIG);
1029 } else {
1030 fIntervalLow = (int32_t)val;
1031 }
1032 }
1033 break;
1034
1035 case doIntervalUpperDigit:
1036 // Scanned a digit from the upper value of an {lower,upper} interval
1037 {
1038 if (fIntervalUpper < 0) {
1039 fIntervalUpper = 0;
1040 }
1041 int32_t digitValue = u_charDigitValue(fC.fChar);
1042 U_ASSERT(digitValue >= 0);
1043 int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1044 if (val > INT32_MAX) {
1045 error(U_REGEX_NUMBER_TOO_BIG);
1046 } else {
1047 fIntervalUpper = (int32_t)val;
1048 }
1049 }
1050 break;
1051
1052 case doIntervalSame:
1053 // Scanned a single value interval like {27}. Upper = Lower.
1054 fIntervalUpper = fIntervalLow;
1055 break;
1056
1057 case doInterval:
1058 // Finished scanning a normal {lower,upper} interval. Generate the code for it.
1059 if (compileInlineInterval() == FALSE) {
1060 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1061 }
1062 break;
1063
1064 case doPossessiveInterval:
1065 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
1066 {
1067 // Remember the loc for the top of the block being looped over.
1068 // (Can not reserve a slot in the compiled pattern at this time, because
1069 // compileInterval needs to reserve also, and blockTopLoc can only reserve
1070 // once per block.)
1071 int32_t topLoc = blockTopLoc(FALSE);
1072
1073 // Produce normal looping code.
1074 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1075
1076 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1077 // just as if the loop was inclosed in atomic parentheses.
1078
1079 // First the STO_SP before the start of the loop
1080 insertOp(topLoc);
1081
1082 int32_t varLoc = allocateData(1); // Reserve a data location for saving the
1083 int32_t op = buildOp(URX_STO_SP, varLoc);
1084 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1085
1086 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1087 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1088 loopOp++; // point LoopOp after the just-inserted STO_SP
1089 fRXPat->fCompiledPat->push(loopOp, *fStatus);
1090
1091 // Then the LD_SP after the end of the loop
1092 appendOp(URX_LD_SP, varLoc);
1093 }
1094
1095 break;
1096
1097 case doNGInterval:
1098 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1099 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1100 break;
1101
1102 case doIntervalError:
1103 error(U_REGEX_BAD_INTERVAL);
1104 break;
1105
1106 case doLiteralChar:
1107 // We've just scanned a "normal" character from the pattern,
1108 literalChar(fC.fChar);
1109 break;
1110
1111
1112 case doEscapedLiteralChar:
1113 // We've just scanned an backslashed escaped character with no
1114 // special meaning. It represents itself.
1115 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1116 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
1117 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1118 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1119 }
1120 literalChar(fC.fChar);
1121 break;
1122
1123
1124 case doDotAny:
1125 // scanned a ".", match any single character.
1126 {
1127 fixLiterals(FALSE);
1128 if (fModeFlags & UREGEX_DOTALL) {
1129 appendOp(URX_DOTANY_ALL, 0);
1130 } else if (fModeFlags & UREGEX_UNIX_LINES) {
1131 appendOp(URX_DOTANY_UNIX, 0);
1132 } else {
1133 appendOp(URX_DOTANY, 0);
1134 }
1135 }
1136 break;
1137
1138 case doCaret:
1139 {
1140 fixLiterals(FALSE);
1141 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1142 appendOp(URX_CARET, 0);
1143 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1144 appendOp(URX_CARET_M, 0);
1145 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1146 appendOp(URX_CARET, 0); // Only testing true start of input.
1147 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1148 appendOp(URX_CARET_M_UNIX, 0);
1149 }
1150 }
1151 break;
1152
1153 case doDollar:
1154 {
1155 fixLiterals(FALSE);
1156 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1157 appendOp(URX_DOLLAR, 0);
1158 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1159 appendOp(URX_DOLLAR_M, 0);
1160 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1161 appendOp(URX_DOLLAR_D, 0);
1162 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1163 appendOp(URX_DOLLAR_MD, 0);
1164 }
1165 }
1166 break;
1167
1168 case doBackslashA:
1169 fixLiterals(FALSE);
1170 appendOp(URX_CARET, 0);
1171 break;
1172
1173 case doBackslashB:
1174 {
1175 #if UCONFIG_NO_BREAK_ITERATION==1
1176 if (fModeFlags & UREGEX_UWORD) {
1177 error(U_UNSUPPORTED_ERROR);
1178 }
1179 #endif
1180 fixLiterals(FALSE);
1181 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1182 appendOp(op, 1);
1183 }
1184 break;
1185
1186 case doBackslashb:
1187 {
1188 #if UCONFIG_NO_BREAK_ITERATION==1
1189 if (fModeFlags & UREGEX_UWORD) {
1190 error(U_UNSUPPORTED_ERROR);
1191 }
1192 #endif
1193 fixLiterals(FALSE);
1194 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1195 appendOp(op, 0);
1196 }
1197 break;
1198
1199 case doBackslashD:
1200 fixLiterals(FALSE);
1201 appendOp(URX_BACKSLASH_D, 1);
1202 break;
1203
1204 case doBackslashd:
1205 fixLiterals(FALSE);
1206 appendOp(URX_BACKSLASH_D, 0);
1207 break;
1208
1209 case doBackslashG:
1210 fixLiterals(FALSE);
1211 appendOp(URX_BACKSLASH_G, 0);
1212 break;
1213
1214 case doBackslashH:
1215 fixLiterals(FALSE);
1216 appendOp(URX_BACKSLASH_H, 1);
1217 break;
1218
1219 case doBackslashh:
1220 fixLiterals(FALSE);
1221 appendOp(URX_BACKSLASH_H, 0);
1222 break;
1223
1224 case doBackslashR:
1225 fixLiterals(FALSE);
1226 appendOp(URX_BACKSLASH_R, 0);
1227 break;
1228
1229 case doBackslashS:
1230 fixLiterals(FALSE);
1231 appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1232 break;
1233
1234 case doBackslashs:
1235 fixLiterals(FALSE);
1236 appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1237 break;
1238
1239 case doBackslashV:
1240 fixLiterals(FALSE);
1241 appendOp(URX_BACKSLASH_V, 1);
1242 break;
1243
1244 case doBackslashv:
1245 fixLiterals(FALSE);
1246 appendOp(URX_BACKSLASH_V, 0);
1247 break;
1248
1249 case doBackslashW:
1250 fixLiterals(FALSE);
1251 appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1252 break;
1253
1254 case doBackslashw:
1255 fixLiterals(FALSE);
1256 appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1257 break;
1258
1259 case doBackslashX:
1260 fixLiterals(FALSE);
1261 appendOp(URX_BACKSLASH_X, 0);
1262 break;
1263
1264
1265 case doBackslashZ:
1266 fixLiterals(FALSE);
1267 appendOp(URX_DOLLAR, 0);
1268 break;
1269
1270 case doBackslashz:
1271 fixLiterals(FALSE);
1272 appendOp(URX_BACKSLASH_Z, 0);
1273 break;
1274
1275 case doEscapeError:
1276 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1277 break;
1278
1279 case doExit:
1280 fixLiterals(FALSE);
1281 returnVal = FALSE;
1282 break;
1283
1284 case doProperty:
1285 {
1286 fixLiterals(FALSE);
1287 UnicodeSet *theSet = scanProp();
1288 compileSet(theSet);
1289 }
1290 break;
1291
1292 case doNamedChar:
1293 {
1294 UChar32 c = scanNamedChar();
1295 literalChar(c);
1296 }
1297 break;
1298
1299
1300 case doBackRef:
1301 // BackReference. Somewhat unusual in that the front-end can not completely parse
1302 // the regular expression, because the number of digits to be consumed
1303 // depends on the number of capture groups that have been defined. So
1304 // we have to do it here instead.
1305 {
1306 int32_t numCaptureGroups = fRXPat->fGroupMap->size();
1307 int32_t groupNum = 0;
1308 UChar32 c = fC.fChar;
1309
1310 for (;;) {
1311 // Loop once per digit, for max allowed number of digits in a back reference.
1312 int32_t digit = u_charDigitValue(c);
1313 groupNum = groupNum * 10 + digit;
1314 if (groupNum >= numCaptureGroups) {
1315 break;
1316 }
1317 c = peekCharLL();
1318 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1319 break;
1320 }
1321 nextCharLL();
1322 }
1323
1324 // Scan of the back reference in the source regexp is complete. Now generate
1325 // the compiled code for it.
1326 // Because capture groups can be forward-referenced by back-references,
1327 // we fill the operand with the capture group number. At the end
1328 // of compilation, it will be changed to the variable's location.
1329 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal escape sequence,
1330 // and shouldn't enter this code path at all.
1331 fixLiterals(FALSE);
1332 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1333 appendOp(URX_BACKREF_I, groupNum);
1334 } else {
1335 appendOp(URX_BACKREF, groupNum);
1336 }
1337 }
1338 break;
1339
1340 case doBeginNamedBackRef:
1341 U_ASSERT(fCaptureName == NULL);
1342 fCaptureName = new UnicodeString;
1343 if (fCaptureName == NULL) {
1344 error(U_MEMORY_ALLOCATION_ERROR);
1345 }
1346 break;
1347
1348 case doContinueNamedBackRef:
1349 fCaptureName->append(fC.fChar);
1350 break;
1351
1352 case doCompleteNamedBackRef:
1353 {
1354 int32_t groupNumber =
1355 fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0;
1356 if (groupNumber == 0) {
1357 // Group name has not been defined.
1358 // Could be a forward reference. If we choose to support them at some
1359 // future time, extra mechanism will be required at this point.
1360 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1361 } else {
1362 // Given the number, handle identically to a \n numbered back reference.
1363 // See comments above, under doBackRef
1364 fixLiterals(FALSE);
1365 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1366 appendOp(URX_BACKREF_I, groupNumber);
1367 } else {
1368 appendOp(URX_BACKREF, groupNumber);
1369 }
1370 }
1371 delete fCaptureName;
1372 fCaptureName = NULL;
1373 break;
1374 }
1375
1376 case doPossessivePlus:
1377 // Possessive ++ quantifier.
1378 // Compiles to
1379 // 1. STO_SP
1380 // 2. body of stuff being iterated over
1381 // 3. STATE_SAVE 5
1382 // 4. JMP 2
1383 // 5. LD_SP
1384 // 6. ...
1385 //
1386 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
1387 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056
1388 //
1389 {
1390 // Emit the STO_SP
1391 int32_t topLoc = blockTopLoc(TRUE);
1392 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1393 int32_t op = buildOp(URX_STO_SP, stoLoc);
1394 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1395
1396 // Emit the STATE_SAVE
1397 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1398
1399 // Emit the JMP
1400 appendOp(URX_JMP, topLoc+1);
1401
1402 // Emit the LD_SP
1403 appendOp(URX_LD_SP, stoLoc);
1404 }
1405 break;
1406
1407 case doPossessiveStar:
1408 // Possessive *+ quantifier.
1409 // Compiles to
1410 // 1. STO_SP loc
1411 // 2. STATE_SAVE 5
1412 // 3. body of stuff being iterated over
1413 // 4. JMP 2
1414 // 5. LD_SP loc
1415 // 6 ...
1416 // TODO: do something to cut back the state stack each time through the loop.
1417 {
1418 // Reserve two slots at the top of the block.
1419 int32_t topLoc = blockTopLoc(TRUE);
1420 insertOp(topLoc);
1421
1422 // emit STO_SP loc
1423 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1424 int32_t op = buildOp(URX_STO_SP, stoLoc);
1425 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1426
1427 // Emit the SAVE_STATE 5
1428 int32_t L7 = fRXPat->fCompiledPat->size()+1;
1429 op = buildOp(URX_STATE_SAVE, L7);
1430 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1431
1432 // Append the JMP operation.
1433 appendOp(URX_JMP, topLoc+1);
1434
1435 // Emit the LD_SP loc
1436 appendOp(URX_LD_SP, stoLoc);
1437 }
1438 break;
1439
1440 case doPossessiveOpt:
1441 // Possessive ?+ quantifier.
1442 // Compiles to
1443 // 1. STO_SP loc
1444 // 2. SAVE_STATE 5
1445 // 3. body of optional block
1446 // 4. LD_SP loc
1447 // 5. ...
1448 //
1449 {
1450 // Reserve two slots at the top of the block.
1451 int32_t topLoc = blockTopLoc(TRUE);
1452 insertOp(topLoc);
1453
1454 // Emit the STO_SP
1455 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1456 int32_t op = buildOp(URX_STO_SP, stoLoc);
1457 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1458
1459 // Emit the SAVE_STATE
1460 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
1461 op = buildOp(URX_STATE_SAVE, continueLoc);
1462 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1463
1464 // Emit the LD_SP
1465 appendOp(URX_LD_SP, stoLoc);
1466 }
1467 break;
1468
1469
1470 case doBeginMatchMode:
1471 fNewModeFlags = fModeFlags;
1472 fSetModeFlag = TRUE;
1473 break;
1474
1475 case doMatchMode: // (?i) and similar
1476 {
1477 int32_t bit = 0;
1478 switch (fC.fChar) {
1479 case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break;
1480 case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break;
1481 case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break;
1482 case 0x73: /* 's' */ bit = UREGEX_DOTALL; break;
1483 case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break;
1484 case 0x77: /* 'w' */ bit = UREGEX_UWORD; break;
1485 case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break;
1486 case 0x2d: /* '-' */ fSetModeFlag = FALSE; break;
1487 default:
1488 UPRV_UNREACHABLE; // Should never happen. Other chars are filtered out
1489 // by the scanner.
1490 }
1491 if (fSetModeFlag) {
1492 fNewModeFlags |= bit;
1493 } else {
1494 fNewModeFlags &= ~bit;
1495 }
1496 }
1497 break;
1498
1499 case doSetMatchMode:
1500 // Emit code to match any pending literals, using the not-yet changed match mode.
1501 fixLiterals();
1502
1503 // We've got a (?i) or similar. The match mode is being changed, but
1504 // the change is not scoped to a parenthesized block.
1505 U_ASSERT(fNewModeFlags < 0);
1506 fModeFlags = fNewModeFlags;
1507
1508 break;
1509
1510
1511 case doMatchModeParen:
1512 // We've got a (?i: or similar. Begin a parenthesized block, save old
1513 // mode flags so they can be restored at the close of the block.
1514 //
1515 // Compile to a
1516 // - NOP, which later may be replaced by a save-state if the
1517 // parenthesized group gets a * quantifier, followed by
1518 // - NOP, which may later be replaced by a save-state if there
1519 // is an '|' alternation within the parens.
1520 {
1521 fixLiterals(FALSE);
1522 appendOp(URX_NOP, 0);
1523 appendOp(URX_NOP, 0);
1524
1525 // On the Parentheses stack, start a new frame and add the postions
1526 // of the two NOPs (a normal non-capturing () frame, except for the
1527 // saving of the orignal mode flags.)
1528 fParenStack.push(fModeFlags, *fStatus);
1529 fParenStack.push(flags, *fStatus); // Frame Marker
1530 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
1531 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
1532
1533 // Set the current mode flags to the new values.
1534 U_ASSERT(fNewModeFlags < 0);
1535 fModeFlags = fNewModeFlags;
1536 }
1537 break;
1538
1539 case doBadModeFlag:
1540 error(U_REGEX_INVALID_FLAG);
1541 break;
1542
1543 case doSuppressComments:
1544 // We have just scanned a '(?'. We now need to prevent the character scanner from
1545 // treating a '#' as a to-the-end-of-line comment.
1546 // (This Perl compatibility just gets uglier and uglier to do...)
1547 fEOLComments = FALSE;
1548 break;
1549
1550
1551 case doSetAddAmp:
1552 {
1553 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1554 set->add(chAmp);
1555 }
1556 break;
1557
1558 case doSetAddDash:
1559 {
1560 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1561 set->add(chDash);
1562 }
1563 break;
1564
1565 case doSetBackslash_s:
1566 {
1567 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1568 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1569 break;
1570 }
1571
1572 case doSetBackslash_S:
1573 {
1574 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1575 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1576 SSet.complement();
1577 set->addAll(SSet);
1578 break;
1579 }
1580
1581 case doSetBackslash_d:
1582 {
1583 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1584 // TODO - make a static set, ticket 6058.
1585 addCategory(set, U_GC_ND_MASK, *fStatus);
1586 break;
1587 }
1588
1589 case doSetBackslash_D:
1590 {
1591 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1592 UnicodeSet digits;
1593 // TODO - make a static set, ticket 6058.
1594 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1595 digits.complement();
1596 set->addAll(digits);
1597 break;
1598 }
1599
1600 case doSetBackslash_h:
1601 {
1602 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1603 UnicodeSet h;
1604 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1605 h.add((UChar32)9); // Tab
1606 set->addAll(h);
1607 break;
1608 }
1609
1610 case doSetBackslash_H:
1611 {
1612 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1613 UnicodeSet h;
1614 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1615 h.add((UChar32)9); // Tab
1616 h.complement();
1617 set->addAll(h);
1618 break;
1619 }
1620
1621 case doSetBackslash_v:
1622 {
1623 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1624 set->add((UChar32)0x0a, (UChar32)0x0d); // add range
1625 set->add((UChar32)0x85);
1626 set->add((UChar32)0x2028, (UChar32)0x2029);
1627 break;
1628 }
1629
1630 case doSetBackslash_V:
1631 {
1632 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1633 UnicodeSet v;
1634 v.add((UChar32)0x0a, (UChar32)0x0d); // add range
1635 v.add((UChar32)0x85);
1636 v.add((UChar32)0x2028, (UChar32)0x2029);
1637 v.complement();
1638 set->addAll(v);
1639 break;
1640 }
1641
1642 case doSetBackslash_w:
1643 {
1644 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1645 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1646 break;
1647 }
1648
1649 case doSetBackslash_W:
1650 {
1651 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1652 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1653 SSet.complement();
1654 set->addAll(SSet);
1655 break;
1656 }
1657
1658 case doSetBegin:
1659 fixLiterals(FALSE);
1660 fSetStack.push(new UnicodeSet(), *fStatus);
1661 fSetOpStack.push(setStart, *fStatus);
1662 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1663 fSetOpStack.push(setCaseClose, *fStatus);
1664 }
1665 break;
1666
1667 case doSetBeginDifference1:
1668 // We have scanned something like [[abc]-[
1669 // Set up a new UnicodeSet for the set beginning with the just-scanned '['
1670 // Push a Difference operator, which will cause the new set to be subtracted from what
1671 // went before once it is created.
1672 setPushOp(setDifference1);
1673 fSetOpStack.push(setStart, *fStatus);
1674 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1675 fSetOpStack.push(setCaseClose, *fStatus);
1676 }
1677 break;
1678
1679 case doSetBeginIntersection1:
1680 // We have scanned something like [[abc]&[
1681 // Need both the '&' operator and the open '[' operator.
1682 setPushOp(setIntersection1);
1683 fSetOpStack.push(setStart, *fStatus);
1684 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1685 fSetOpStack.push(setCaseClose, *fStatus);
1686 }
1687 break;
1688
1689 case doSetBeginUnion:
1690 // We have scanned something like [[abc][
1691 // Need to handle the union operation explicitly [[abc] | [
1692 setPushOp(setUnion);
1693 fSetOpStack.push(setStart, *fStatus);
1694 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1695 fSetOpStack.push(setCaseClose, *fStatus);
1696 }
1697 break;
1698
1699 case doSetDifference2:
1700 // We have scanned something like [abc--
1701 // Consider this to unambiguously be a set difference operator.
1702 setPushOp(setDifference2);
1703 break;
1704
1705 case doSetEnd:
1706 // Have encountered the ']' that closes a set.
1707 // Force the evaluation of any pending operations within this set,
1708 // leave the completed set on the top of the set stack.
1709 setEval(setEnd);
1710 U_ASSERT(fSetOpStack.peeki()==setStart);
1711 fSetOpStack.popi();
1712 break;
1713
1714 case doSetFinish:
1715 {
1716 // Finished a complete set expression, including all nested sets.
1717 // The close bracket has already triggered clearing out pending set operators,
1718 // the operator stack should be empty and the operand stack should have just
1719 // one entry, the result set.
1720 U_ASSERT(fSetOpStack.empty());
1721 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1722 U_ASSERT(fSetStack.empty());
1723 compileSet(theSet);
1724 break;
1725 }
1726
1727 case doSetIntersection2:
1728 // Have scanned something like [abc&&
1729 setPushOp(setIntersection2);
1730 break;
1731
1732 case doSetLiteral:
1733 // Union the just-scanned literal character into the set being built.
1734 // This operation is the highest precedence set operation, so we can always do
1735 // it immediately, without waiting to see what follows. It is necessary to perform
1736 // any pending '-' or '&' operation first, because these have the same precedence
1737 // as union-ing in a literal'
1738 {
1739 setEval(setUnion);
1740 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1741 s->add(fC.fChar);
1742 fLastSetLiteral = fC.fChar;
1743 break;
1744 }
1745
1746 case doSetLiteralEscaped:
1747 // A back-slash escaped literal character was encountered.
1748 // Processing is the same as with setLiteral, above, with the addition of
1749 // the optional check for errors on escaped ASCII letters.
1750 {
1751 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1752 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
1753 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1754 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1755 }
1756 setEval(setUnion);
1757 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1758 s->add(fC.fChar);
1759 fLastSetLiteral = fC.fChar;
1760 break;
1761 }
1762
1763 case doSetNamedChar:
1764 // Scanning a \N{UNICODE CHARACTER NAME}
1765 // Aside from the source of the character, the processing is identical to doSetLiteral,
1766 // above.
1767 {
1768 UChar32 c = scanNamedChar();
1769 setEval(setUnion);
1770 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1771 s->add(c);
1772 fLastSetLiteral = c;
1773 break;
1774 }
1775
1776 case doSetNamedRange:
1777 // We have scanned literal-\N{CHAR NAME}. Add the range to the set.
1778 // The left character is already in the set, and is saved in fLastSetLiteral.
1779 // The right side needs to be picked up, the scan is at the 'N'.
1780 // Lower Limit > Upper limit being an error matches both Java
1781 // and ICU UnicodeSet behavior.
1782 {
1783 UChar32 c = scanNamedChar();
1784 if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
1785 error(U_REGEX_INVALID_RANGE);
1786 }
1787 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1788 s->add(fLastSetLiteral, c);
1789 fLastSetLiteral = c;
1790 break;
1791 }
1792
1793
1794 case doSetNegate:
1795 // Scanned a '^' at the start of a set.
1796 // Push the negation operator onto the set op stack.
1797 // A twist for case-insensitive matching:
1798 // the case closure operation must happen _before_ negation.
1799 // But the case closure operation will already be on the stack if it's required.
1800 // This requires checking for case closure, and swapping the stack order
1801 // if it is present.
1802 {
1803 int32_t tosOp = fSetOpStack.peeki();
1804 if (tosOp == setCaseClose) {
1805 fSetOpStack.popi();
1806 fSetOpStack.push(setNegation, *fStatus);
1807 fSetOpStack.push(setCaseClose, *fStatus);
1808 } else {
1809 fSetOpStack.push(setNegation, *fStatus);
1810 }
1811 }
1812 break;
1813
1814 case doSetNoCloseError:
1815 error(U_REGEX_MISSING_CLOSE_BRACKET);
1816 break;
1817
1818 case doSetOpError:
1819 error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal.
1820 break;
1821
1822 case doSetPosixProp:
1823 {
1824 UnicodeSet *s = scanPosixProp();
1825 if (s != NULL) {
1826 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1827 tos->addAll(*s);
1828 delete s;
1829 } // else error. scanProp() reported the error status already.
1830 }
1831 break;
1832
1833 case doSetProp:
1834 // Scanned a \p \P within [brackets].
1835 {
1836 UnicodeSet *s = scanProp();
1837 if (s != NULL) {
1838 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1839 tos->addAll(*s);
1840 delete s;
1841 } // else error. scanProp() reported the error status already.
1842 }
1843 break;
1844
1845
1846 case doSetRange:
1847 // We have scanned literal-literal. Add the range to the set.
1848 // The left character is already in the set, and is saved in fLastSetLiteral.
1849 // The right side is the current character.
1850 // Lower Limit > Upper limit being an error matches both Java
1851 // and ICU UnicodeSet behavior.
1852 {
1853
1854 if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
1855 error(U_REGEX_INVALID_RANGE);
1856 }
1857 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1858 s->add(fLastSetLiteral, fC.fChar);
1859 break;
1860 }
1861
1862 default:
1863 UPRV_UNREACHABLE;
1864 }
1865
1866 if (U_FAILURE(*fStatus)) {
1867 returnVal = FALSE;
1868 }
1869
1870 return returnVal;
1871}
1872
1873
1874
1875//------------------------------------------------------------------------------
1876//
1877// literalChar We've encountered a literal character from the pattern,
1878// or an escape sequence that reduces to a character.
1879// Add it to the string containing all literal chars/strings from
1880// the pattern.
1881//
1882//------------------------------------------------------------------------------
1883void RegexCompile::literalChar(UChar32 c) {
1884 fLiteralChars.append(c);
1885}
1886
1887
1888//------------------------------------------------------------------------------
1889//
1890// fixLiterals When compiling something that can follow a literal
1891// string in a pattern, emit the code to match the
1892// accumulated literal string.
1893//
1894// Optionally, split the last char of the string off into
1895// a single "ONE_CHAR" operation, so that quantifiers can
1896// apply to that char alone. Example: abc*
1897// The * must apply to the 'c' only.
1898//
1899//------------------------------------------------------------------------------
1900void RegexCompile::fixLiterals(UBool split) {
1901
1902 // If no literal characters have been scanned but not yet had code generated
1903 // for them, nothing needs to be done.
1904 if (fLiteralChars.length() == 0) {
1905 return;
1906 }
1907
1908 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1909 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1910
1911 // Split: We need to ensure that the last item in the compiled pattern
1912 // refers only to the last literal scanned in the pattern, so that
1913 // quantifiers (*, +, etc.) affect only it, and not a longer string.
1914 // Split before case folding for case insensitive matches.
1915
1916 if (split) {
1917 fLiteralChars.truncate(indexOfLastCodePoint);
1918 fixLiterals(FALSE); // Recursive call, emit code to match the first part of the string.
1919 // Note that the truncated literal string may be empty, in which case
1920 // nothing will be emitted.
1921
1922 literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal.
1923 fixLiterals(FALSE); // Second recursive call, code for the final code point.
1924 return;
1925 }
1926
1927 // If we are doing case-insensitive matching, case fold the string. This may expand
1928 // the string, e.g. the German sharp-s turns into "ss"
1929 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1930 fLiteralChars.foldCase();
1931 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1932 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1933 }
1934
1935 if (indexOfLastCodePoint == 0) {
1936 // Single character, emit a URX_ONECHAR op to match it.
1937 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1938 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1939 appendOp(URX_ONECHAR_I, lastCodePoint);
1940 } else {
1941 appendOp(URX_ONECHAR, lastCodePoint);
1942 }
1943 } else {
1944 // Two or more chars, emit a URX_STRING to match them.
1945 if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1946 error(U_REGEX_PATTERN_TOO_BIG);
1947 }
1948 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1949 appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1950 } else {
1951 // TODO here: add optimization to split case sensitive strings of length two
1952 // into two single char ops, for efficiency.
1953 appendOp(URX_STRING, fRXPat->fLiteralText.length());
1954 }
1955 appendOp(URX_STRING_LEN, fLiteralChars.length());
1956
1957 // Add this string into the accumulated strings of the compiled pattern.
1958 fRXPat->fLiteralText.append(fLiteralChars);
1959 }
1960
1961 fLiteralChars.remove();
1962}
1963
1964
1965int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1966 if (U_FAILURE(*fStatus)) {
1967 return 0;
1968 }
1969 if (type < 0 || type > 255) {
1970 UPRV_UNREACHABLE;
1971 }
1972 if (val > 0x00ffffff) {
1973 UPRV_UNREACHABLE;
1974 }
1975 if (val < 0) {
1976 if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1977 UPRV_UNREACHABLE;
1978 }
1979 if (URX_TYPE(val) != 0xff) {
1980 UPRV_UNREACHABLE;
1981 }
1982 type = URX_RESERVED_OP_N;
1983 }
1984 return (type << 24) | val;
1985}
1986
1987
1988//------------------------------------------------------------------------------
1989//
1990// appendOp() Append a new instruction onto the compiled pattern
1991// Includes error checking, limiting the size of the
1992// pattern to lengths that can be represented in the
1993// 24 bit operand field of an instruction.
1994//
1995//------------------------------------------------------------------------------
1996void RegexCompile::appendOp(int32_t op) {
1997 if (U_FAILURE(*fStatus)) {
1998 return;
1999 }
2000 fRXPat->fCompiledPat->addElement(op, *fStatus);
2001 if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
2002 error(U_REGEX_PATTERN_TOO_BIG);
2003 }
2004}
2005
2006void RegexCompile::appendOp(int32_t type, int32_t val) {
2007 appendOp(buildOp(type, val));
2008}
2009
2010
2011//------------------------------------------------------------------------------
2012//
2013// insertOp() Insert a slot for a new opcode into the already
2014// compiled pattern code.
2015//
2016// Fill the slot with a NOP. Our caller will replace it
2017// with what they really wanted.
2018//
2019//------------------------------------------------------------------------------
2020void RegexCompile::insertOp(int32_t where) {
2021 UVector64 *code = fRXPat->fCompiledPat;
2022 U_ASSERT(where>0 && where < code->size());
2023
2024 int32_t nop = buildOp(URX_NOP, 0);
2025 code->insertElementAt(nop, where, *fStatus);
2026
2027 // Walk through the pattern, looking for any ops with targets that
2028 // were moved down by the insert. Fix them.
2029 int32_t loc;
2030 for (loc=0; loc<code->size(); loc++) {
2031 int32_t op = (int32_t)code->elementAti(loc);
2032 int32_t opType = URX_TYPE(op);
2033 int32_t opValue = URX_VAL(op);
2034 if ((opType == URX_JMP ||
2035 opType == URX_JMPX ||
2036 opType == URX_STATE_SAVE ||
2037 opType == URX_CTR_LOOP ||
2038 opType == URX_CTR_LOOP_NG ||
2039 opType == URX_JMP_SAV ||
2040 opType == URX_JMP_SAV_X ||
2041 opType == URX_RELOC_OPRND) && opValue > where) {
2042 // Target location for this opcode is after the insertion point and
2043 // needs to be incremented to adjust for the insertion.
2044 opValue++;
2045 op = buildOp(opType, opValue);
2046 code->setElementAt(op, loc);
2047 }
2048 }
2049
2050 // Now fix up the parentheses stack. All positive values in it are locations in
2051 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
2052 for (loc=0; loc<fParenStack.size(); loc++) {
2053 int32_t x = fParenStack.elementAti(loc);
2054 U_ASSERT(x < code->size());
2055 if (x>where) {
2056 x++;
2057 fParenStack.setElementAt(x, loc);
2058 }
2059 }
2060
2061 if (fMatchCloseParen > where) {
2062 fMatchCloseParen++;
2063 }
2064 if (fMatchOpenParen > where) {
2065 fMatchOpenParen++;
2066 }
2067}
2068
2069
2070//------------------------------------------------------------------------------
2071//
2072// allocateData() Allocate storage in the matcher's static data area.
2073// Return the index for the newly allocated data.
2074// The storage won't actually exist until we are running a match
2075// operation, but the storage indexes are inserted into various
2076// opcodes while compiling the pattern.
2077//
2078//------------------------------------------------------------------------------
2079int32_t RegexCompile::allocateData(int32_t size) {
2080 if (U_FAILURE(*fStatus)) {
2081 return 0;
2082 }
2083 if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2084 error(U_REGEX_INTERNAL_ERROR);
2085 return 0;
2086 }
2087 int32_t dataIndex = fRXPat->fDataSize;
2088 fRXPat->fDataSize += size;
2089 if (fRXPat->fDataSize >= 0x00fffff0) {
2090 error(U_REGEX_INTERNAL_ERROR);
2091 }
2092 return dataIndex;
2093}
2094
2095
2096//------------------------------------------------------------------------------
2097//
2098// allocateStackData() Allocate space in the back-tracking stack frame.
2099// Return the index for the newly allocated data.
2100// The frame indexes are inserted into various
2101// opcodes while compiling the pattern, meaning that frame
2102// size must be restricted to the size that will fit
2103// as an operand (24 bits).
2104//
2105//------------------------------------------------------------------------------
2106int32_t RegexCompile::allocateStackData(int32_t size) {
2107 if (U_FAILURE(*fStatus)) {
2108 return 0;
2109 }
2110 if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2111 error(U_REGEX_INTERNAL_ERROR);
2112 return 0;
2113 }
2114 int32_t dataIndex = fRXPat->fFrameSize;
2115 fRXPat->fFrameSize += size;
2116 if (fRXPat->fFrameSize >= 0x00fffff0) {
2117 error(U_REGEX_PATTERN_TOO_BIG);
2118 }
2119 return dataIndex;
2120}
2121
2122
2123//------------------------------------------------------------------------------
2124//
2125// blockTopLoc() Find or create a location in the compiled pattern
2126// at the start of the operation or block that has
2127// just been compiled. Needed when a quantifier (* or
2128// whatever) appears, and we need to add an operation
2129// at the start of the thing being quantified.
2130//
2131// (Parenthesized Blocks) have a slot with a NOP that
2132// is reserved for this purpose. .* or similar don't
2133// and a slot needs to be added.
2134//
2135// parameter reserveLoc : TRUE - ensure that there is space to add an opcode
2136// at the returned location.
2137// FALSE - just return the address,
2138// do not reserve a location there.
2139//
2140//------------------------------------------------------------------------------
2141int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
2142 int32_t theLoc;
2143 fixLiterals(TRUE); // Emit code for any pending literals.
2144 // If last item was a string, emit separate op for the its last char.
2145 if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2146 {
2147 // The item just processed is a parenthesized block.
2148 theLoc = fMatchOpenParen; // A slot is already reserved for us.
2149 U_ASSERT(theLoc > 0);
2150 U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2151 }
2152 else {
2153 // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2154 // No slot for STATE_SAVE was pre-reserved in the compiled code.
2155 // We need to make space now.
2156 theLoc = fRXPat->fCompiledPat->size()-1;
2157 int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2158 if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2159 // Strings take two opcode, we want the position of the first one.
2160 // We can have a string at this point if a single character case-folded to two.
2161 theLoc--;
2162 }
2163 if (reserveLoc) {
2164 int32_t nop = buildOp(URX_NOP, 0);
2165 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2166 }
2167 }
2168 return theLoc;
2169}
2170
2171
2172
2173//------------------------------------------------------------------------------
2174//
2175// handleCloseParen When compiling a close paren, we need to go back
2176// and fix up any JMP or SAVE operations within the
2177// parenthesized block that need to target the end
2178// of the block. The locations of these are kept on
2179// the paretheses stack.
2180//
2181// This function is called both when encountering a
2182// real ) and at the end of the pattern.
2183//
2184//------------------------------------------------------------------------------
2185void RegexCompile::handleCloseParen() {
2186 int32_t patIdx;
2187 int32_t patOp;
2188 if (fParenStack.size() <= 0) {
2189 error(U_REGEX_MISMATCHED_PAREN);
2190 return;
2191 }
2192
2193 // Emit code for any pending literals.
2194 fixLiterals(FALSE);
2195
2196 // Fixup any operations within the just-closed parenthesized group
2197 // that need to reference the end of the (block).
2198 // (The first one popped from the stack is an unused slot for
2199 // alternation (OR) state save, but applying the fixup to it does no harm.)
2200 for (;;) {
2201 patIdx = fParenStack.popi();
2202 if (patIdx < 0) {
2203 // value < 0 flags the start of the frame on the paren stack.
2204 break;
2205 }
2206 U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2207 patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2208 U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set.
2209 patOp |= fRXPat->fCompiledPat->size(); // Set it now.
2210 fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2211 fMatchOpenParen = patIdx;
2212 }
2213
2214 // At the close of any parenthesized block, restore the match mode flags to
2215 // the value they had at the open paren. Saved value is
2216 // at the top of the paren stack.
2217 fModeFlags = fParenStack.popi();
2218 U_ASSERT(fModeFlags < 0);
2219
2220 // DO any additional fixups, depending on the specific kind of
2221 // parentesized grouping this is
2222
2223 switch (patIdx) {
2224 case plain:
2225 case flags:
2226 // No additional fixups required.
2227 // (Grouping-only parentheses)
2228 break;
2229 case capturing:
2230 // Capturing Parentheses.
2231 // Insert a End Capture op into the pattern.
2232 // The frame offset of the variables for this cg is obtained from the
2233 // start capture op and put it into the end-capture op.
2234 {
2235 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2236 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2237
2238 int32_t frameVarLocation = URX_VAL(captureOp);
2239 appendOp(URX_END_CAPTURE, frameVarLocation);
2240 }
2241 break;
2242 case atomic:
2243 // Atomic Parenthesis.
2244 // Insert a LD_SP operation to restore the state stack to the position
2245 // it was when the atomic parens were entered.
2246 {
2247 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2248 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2249 int32_t stoLoc = URX_VAL(stoOp);
2250 appendOp(URX_LD_SP, stoLoc);
2251 }
2252 break;
2253
2254 case lookAhead:
2255 {
2256 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2257 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2258 int32_t dataLoc = URX_VAL(startOp);
2259 appendOp(URX_LA_END, dataLoc);
2260 }
2261 break;
2262
2263 case negLookAhead:
2264 {
2265 // See comment at doOpenLookAheadNeg
2266 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2267 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2268 int32_t dataLoc = URX_VAL(startOp);
2269 appendOp(URX_LA_END, dataLoc);
2270 appendOp(URX_BACKTRACK, 0);
2271 appendOp(URX_LA_END, dataLoc);
2272
2273 // Patch the URX_SAVE near the top of the block.
2274 // The destination of the SAVE is the final LA_END that was just added.
2275 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2276 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2277 int32_t dest = fRXPat->fCompiledPat->size()-1;
2278 saveOp = buildOp(URX_STATE_SAVE, dest);
2279 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2280 }
2281 break;
2282
2283 case lookBehind:
2284 {
2285 // See comment at doOpenLookBehind.
2286
2287 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2288 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2289 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2290 int32_t dataLoc = URX_VAL(startOp);
2291 appendOp(URX_LB_END, dataLoc);
2292 appendOp(URX_LA_END, dataLoc);
2293
2294 // Determine the min and max bounds for the length of the
2295 // string that the pattern can match.
2296 // An unbounded upper limit is an error.
2297 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2298 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2299 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2300 if (URX_TYPE(maxML) != 0) {
2301 error(U_REGEX_LOOK_BEHIND_LIMIT);
2302 break;
2303 }
2304 if (maxML == INT32_MAX) {
2305 error(U_REGEX_LOOK_BEHIND_LIMIT);
2306 break;
2307 }
2308 if (minML == INT32_MAX) {
2309 // This condition happens when no match is possible, such as with a
2310 // [set] expression containing no elements.
2311 // In principle, the generated code to evaluate the expression could be deleted,
2312 // but it's probably not worth the complication.
2313 minML = 0;
2314 }
2315 U_ASSERT(minML <= maxML);
2316
2317 // Insert the min and max match len bounds into the URX_LB_CONT op that
2318 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2319 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2);
2320 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1);
2321
2322 }
2323 break;
2324
2325
2326
2327 case lookBehindN:
2328 {
2329 // See comment at doOpenLookBehindNeg.
2330
2331 // Append the URX_LBN_END to the compiled pattern.
2332 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2333 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2334 int32_t dataLoc = URX_VAL(startOp);
2335 appendOp(URX_LBN_END, dataLoc);
2336
2337 // Determine the min and max bounds for the length of the
2338 // string that the pattern can match.
2339 // An unbounded upper limit is an error.
2340 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2341 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2342 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2343 if (URX_TYPE(maxML) != 0) {
2344 error(U_REGEX_LOOK_BEHIND_LIMIT);
2345 break;
2346 }
2347 if (maxML == INT32_MAX) {
2348 error(U_REGEX_LOOK_BEHIND_LIMIT);
2349 break;
2350 }
2351 if (minML == INT32_MAX) {
2352 // This condition happens when no match is possible, such as with a
2353 // [set] expression containing no elements.
2354 // In principle, the generated code to evaluate the expression could be deleted,
2355 // but it's probably not worth the complication.
2356 minML = 0;
2357 }
2358
2359 U_ASSERT(minML <= maxML);
2360
2361 // Insert the min and max match len bounds into the URX_LB_CONT op that
2362 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2363 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3);
2364 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2);
2365
2366 // Insert the pattern location to continue at after a successful match
2367 // as the last operand of the URX_LBN_CONT
2368 int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2369 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1);
2370 }
2371 break;
2372
2373
2374
2375 default:
2376 UPRV_UNREACHABLE;
2377 }
2378
2379 // remember the next location in the compiled pattern.
2380 // The compilation of Quantifiers will look at this to see whether its looping
2381 // over a parenthesized block or a single item
2382 fMatchCloseParen = fRXPat->fCompiledPat->size();
2383}
2384
2385
2386
2387//------------------------------------------------------------------------------
2388//
2389// compileSet Compile the pattern operations for a reference to a
2390// UnicodeSet.
2391//
2392//------------------------------------------------------------------------------
2393void RegexCompile::compileSet(UnicodeSet *theSet)
2394{
2395 if (theSet == NULL) {
2396 return;
2397 }
2398 // Remove any strings from the set.
2399 // There shoudn't be any, but just in case.
2400 // (Case Closure can add them; if we had a simple case closure avaialble that
2401 // ignored strings, that would be better.)
2402 theSet->removeAllStrings();
2403 int32_t setSize = theSet->size();
2404
2405 switch (setSize) {
2406 case 0:
2407 {
2408 // Set of no elements. Always fails to match.
2409 appendOp(URX_BACKTRACK, 0);
2410 delete theSet;
2411 }
2412 break;
2413
2414 case 1:
2415 {
2416 // The set contains only a single code point. Put it into
2417 // the compiled pattern as a single char operation rather
2418 // than a set, and discard the set itself.
2419 literalChar(theSet->charAt(0));
2420 delete theSet;
2421 }
2422 break;
2423
2424 default:
2425 {
2426 // The set contains two or more chars. (the normal case)
2427 // Put it into the compiled pattern as a set.
2428 int32_t setNumber = fRXPat->fSets->size();
2429 fRXPat->fSets->addElement(theSet, *fStatus);
2430 appendOp(URX_SETREF, setNumber);
2431 }
2432 }
2433}
2434
2435
2436//------------------------------------------------------------------------------
2437//
2438// compileInterval Generate the code for a {min, max} style interval quantifier.
2439// Except for the specific opcodes used, the code is the same
2440// for all three types (greedy, non-greedy, possessive) of
2441// intervals. The opcodes are supplied as parameters.
2442// (There are two sets of opcodes - greedy & possessive use the
2443// same ones, while non-greedy has it's own.)
2444//
2445// The code for interval loops has this form:
2446// 0 CTR_INIT counter loc (in stack frame)
2447// 1 5 patt address of CTR_LOOP at bottom of block
2448// 2 min count
2449// 3 max count (-1 for unbounded)
2450// 4 ... block to be iterated over
2451// 5 CTR_LOOP
2452//
2453// In
2454//------------------------------------------------------------------------------
2455void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
2456{
2457 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2458 // four slots in the compiled code. Reserve them.
2459 int32_t topOfBlock = blockTopLoc(TRUE);
2460 insertOp(topOfBlock);
2461 insertOp(topOfBlock);
2462 insertOp(topOfBlock);
2463
2464 // The operands for the CTR_INIT opcode include the index in the matcher data
2465 // of the counter. Allocate it now. There are two data items
2466 // counterLoc --> Loop counter
2467 // +1 --> Input index (for breaking non-progressing loops)
2468 // (Only present if unbounded upper limit on loop)
2469 int32_t dataSize = fIntervalUpper < 0 ? 2 : 1;
2470 int32_t counterLoc = allocateStackData(dataSize);
2471
2472 int32_t op = buildOp(InitOp, counterLoc);
2473 fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2474
2475 // The second operand of CTR_INIT is the location following the end of the loop.
2476 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2477 // compilation of something later on causes the code to grow and the target
2478 // position to move.
2479 int32_t loopEnd = fRXPat->fCompiledPat->size();
2480 op = buildOp(URX_RELOC_OPRND, loopEnd);
2481 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2482
2483 // Followed by the min and max counts.
2484 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2485 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2486
2487 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
2488 // Goes at end of the block being looped over, so just append to the code so far.
2489 appendOp(LoopOp, topOfBlock);
2490
2491 if ((fIntervalLow & 0xff000000) != 0 ||
2492 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2493 error(U_REGEX_NUMBER_TOO_BIG);
2494 }
2495
2496 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2497 error(U_REGEX_MAX_LT_MIN);
2498 }
2499}
2500
2501
2502
2503UBool RegexCompile::compileInlineInterval() {
2504 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2505 // Too big to inline. Fail, which will cause looping code to be generated.
2506 // (Upper < Lower picks up unbounded upper and errors, both.)
2507 return FALSE;
2508 }
2509
2510 int32_t topOfBlock = blockTopLoc(FALSE);
2511 if (fIntervalUpper == 0) {
2512 // Pathological case. Attempt no matches, as if the block doesn't exist.
2513 // Discard the generated code for the block.
2514 // If the block included parens, discard the info pertaining to them as well.
2515 fRXPat->fCompiledPat->setSize(topOfBlock);
2516 if (fMatchOpenParen >= topOfBlock) {
2517 fMatchOpenParen = -1;
2518 }
2519 if (fMatchCloseParen >= topOfBlock) {
2520 fMatchCloseParen = -1;
2521 }
2522 return TRUE;
2523 }
2524
2525 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2526 // The thing being repeated is not a single op, but some
2527 // more complex block. Do it as a loop, not inlines.
2528 // Note that things "repeated" a max of once are handled as inline, because
2529 // the one copy of the code already generated is just fine.
2530 return FALSE;
2531 }
2532
2533 // Pick up the opcode that is to be repeated
2534 //
2535 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2536
2537 // Compute the pattern location where the inline sequence
2538 // will end, and set up the state save op that will be needed.
2539 //
2540 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2541 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2542 int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2543 if (fIntervalLow == 0) {
2544 insertOp(topOfBlock);
2545 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2546 }
2547
2548
2549
2550 // Loop, emitting the op for the thing being repeated each time.
2551 // Loop starts at 1 because one instance of the op already exists in the pattern,
2552 // it was put there when it was originally encountered.
2553 int32_t i;
2554 for (i=1; i<fIntervalUpper; i++ ) {
2555 if (i >= fIntervalLow) {
2556 appendOp(saveOp);
2557 }
2558 appendOp(op);
2559 }
2560 return TRUE;
2561}
2562
2563
2564
2565//------------------------------------------------------------------------------
2566//
2567// caseInsensitiveStart given a single code point from a pattern string, determine the
2568// set of characters that could potentially begin a case-insensitive
2569// match of a string beginning with that character, using full Unicode
2570// case insensitive matching.
2571//
2572// This is used in optimizing find().
2573//
2574// closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2575// misses cases like this:
2576// A string from the pattern begins with 'ss' (although all we know
2577// in this context is that it begins with 's')
2578// The pattern could match a string beginning with a German sharp-s
2579//
2580// To the ordinary case closure for a character c, we add all other
2581// characters cx where the case closure of cx incudes a string form that begins
2582// with the original character c.
2583//
2584// This function could be made smarter. The full pattern string is available
2585// and it would be possible to verify that the extra characters being added
2586// to the starting set fully match, rather than having just a first-char of the
2587// folded form match.
2588//
2589//------------------------------------------------------------------------------
2590void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2591
2592// Machine Generated below.
2593// It may need updating with new versions of Unicode.
2594// Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2595// The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2596
2597// Machine Generated Data. Do not hand edit.
2598 static const UChar32 RECaseFixCodePoints[] = {
2599 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2600 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2601 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2602 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2603 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2604
2605 static const int16_t RECaseFixStringOffsets[] = {
2606 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2607 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2608 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2609 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2610 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2611
2612 static const int16_t RECaseFixCounts[] = {
2613 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2614 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2615 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2616 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2617 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2618
2619 static const UChar RECaseFixData[] = {
2620 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2621 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2622 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2623 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2624 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2625 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2626 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2627 0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2628 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2629 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2630 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2631
2632// End of machine generated data.
2633
2634 if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
2635 // This function should never be called with an invalid input character.
2636 UPRV_UNREACHABLE;
2637 } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2638 UChar32 caseFoldedC = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2639 starterChars->set(caseFoldedC, caseFoldedC);
2640
2641 int32_t i;
2642 for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2643 // Simple linear search through the sorted list of interesting code points.
2644 }
2645
2646 if (RECaseFixCodePoints[i] == c) {
2647 int32_t dataIndex = RECaseFixStringOffsets[i];
2648 int32_t numCharsToAdd = RECaseFixCounts[i];
2649 UChar32 cpToAdd = 0;
2650 for (int32_t j=0; j<numCharsToAdd; j++) {
2651 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2652 starterChars->add(cpToAdd);
2653 }
2654 }
2655
2656 starterChars->closeOver(USET_CASE_INSENSITIVE);
2657 starterChars->removeAllStrings();
2658 } else {
2659 // Not a cased character. Just return it alone.
2660 starterChars->set(c, c);
2661 }
2662}
2663
2664
2665// Increment with overflow check.
2666// val and delta will both be positive.
2667
2668static int32_t safeIncrement(int32_t val, int32_t delta) {
2669 if (INT32_MAX - val > delta) {
2670 return val + delta;
2671 } else {
2672 return INT32_MAX;
2673 }
2674}
2675
2676
2677//------------------------------------------------------------------------------
2678//
2679// matchStartType Determine how a match can start.
2680// Used to optimize find() operations.
2681//
2682// Operation is very similar to minMatchLength(). Walk the compiled
2683// pattern, keeping an on-going minimum-match-length. For any
2684// op where the min match coming in is zero, add that ops possible
2685// starting matches to the possible starts for the overall pattern.
2686//
2687//------------------------------------------------------------------------------
2688void RegexCompile::matchStartType() {
2689 if (U_FAILURE(*fStatus)) {
2690 return;
2691 }
2692
2693
2694 int32_t loc; // Location in the pattern of the current op being processed.
2695 int32_t op; // The op being processed
2696 int32_t opType; // The opcode type of the op
2697 int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern
2698 int32_t numInitialStrings = 0; // Number of strings encountered that could match at start.
2699
2700 UBool atStart = TRUE; // True if no part of the pattern yet encountered
2701 // could have advanced the position in a match.
2702 // (Maximum match length so far == 0)
2703
2704 // forwardedLength is a vector holding minimum-match-length values that
2705 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2706 // It must be one longer than the pattern being checked because some ops
2707 // will jmp to a end-of-block+1 location from within a block, and we must
2708 // count those when checking the block.
2709 int32_t end = fRXPat->fCompiledPat->size();
2710 UVector32 forwardedLength(end+1, *fStatus);
2711 forwardedLength.setSize(end+1);
2712 for (loc=3; loc<end; loc++) {
2713 forwardedLength.setElementAt(INT32_MAX, loc);
2714 }
2715
2716 for (loc = 3; loc<end; loc++) {
2717 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2718 opType = URX_TYPE(op);
2719
2720 // The loop is advancing linearly through the pattern.
2721 // If the op we are now at was the destination of a branch in the pattern,
2722 // and that path has a shorter minimum length than the current accumulated value,
2723 // replace the current accumulated value.
2724 if (forwardedLength.elementAti(loc) < currentLen) {
2725 currentLen = forwardedLength.elementAti(loc);
2726 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2727 }
2728
2729 switch (opType) {
2730 // Ops that don't change the total length matched
2731 case URX_RESERVED_OP:
2732 case URX_END:
2733 case URX_FAIL:
2734 case URX_STRING_LEN:
2735 case URX_NOP:
2736 case URX_START_CAPTURE:
2737 case URX_END_CAPTURE:
2738 case URX_BACKSLASH_B:
2739 case URX_BACKSLASH_BU:
2740 case URX_BACKSLASH_G:
2741 case URX_BACKSLASH_Z:
2742 case URX_DOLLAR:
2743 case URX_DOLLAR_M:
2744 case URX_DOLLAR_D:
2745 case URX_DOLLAR_MD:
2746 case URX_RELOC_OPRND:
2747 case URX_STO_INP_LOC:
2748 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2749 case URX_BACKREF_I:
2750
2751 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2752 case URX_LD_SP:
2753 break;
2754
2755 case URX_CARET:
2756 if (atStart) {
2757 fRXPat->fStartType = START_START;
2758 }
2759 break;
2760
2761 case URX_CARET_M:
2762 case URX_CARET_M_UNIX:
2763 if (atStart) {
2764 fRXPat->fStartType = START_LINE;
2765 }
2766 break;
2767
2768 case URX_ONECHAR:
2769 if (currentLen == 0) {
2770 // This character could appear at the start of a match.
2771 // Add it to the set of possible starting characters.
2772 fRXPat->fInitialChars->add(URX_VAL(op));
2773 numInitialStrings += 2;
2774 }
2775 currentLen = safeIncrement(currentLen, 1);
2776 atStart = FALSE;
2777 break;
2778
2779
2780 case URX_SETREF:
2781 if (currentLen == 0) {
2782 int32_t sn = URX_VAL(op);
2783 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2784 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2785 fRXPat->fInitialChars->addAll(*s);
2786 numInitialStrings += 2;
2787 }
2788 currentLen = safeIncrement(currentLen, 1);
2789 atStart = FALSE;
2790 break;
2791
2792 case URX_LOOP_SR_I:
2793 // [Set]*, like a SETREF, above, in what it can match,
2794 // but may not match at all, so currentLen is not incremented.
2795 if (currentLen == 0) {
2796 int32_t sn = URX_VAL(op);
2797 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2798 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2799 fRXPat->fInitialChars->addAll(*s);
2800 numInitialStrings += 2;
2801 }
2802 atStart = FALSE;
2803 break;
2804
2805 case URX_LOOP_DOT_I:
2806 if (currentLen == 0) {
2807 // .* at the start of a pattern.
2808 // Any character can begin the match.
2809 fRXPat->fInitialChars->clear();
2810 fRXPat->fInitialChars->complement();
2811 numInitialStrings += 2;
2812 }
2813 atStart = FALSE;
2814 break;
2815
2816
2817 case URX_STATIC_SETREF:
2818 if (currentLen == 0) {
2819 int32_t sn = URX_VAL(op);
2820 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2821 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2822 fRXPat->fInitialChars->addAll(*s);
2823 numInitialStrings += 2;
2824 }
2825 currentLen = safeIncrement(currentLen, 1);
2826 atStart = FALSE;
2827 break;
2828
2829
2830
2831 case URX_STAT_SETREF_N:
2832 if (currentLen == 0) {
2833 int32_t sn = URX_VAL(op);
2834 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2835 UnicodeSet sc(*s);
2836 sc.complement();
2837 fRXPat->fInitialChars->addAll(sc);
2838 numInitialStrings += 2;
2839 }
2840 currentLen = safeIncrement(currentLen, 1);
2841 atStart = FALSE;
2842 break;
2843
2844
2845
2846 case URX_BACKSLASH_D:
2847 // Digit Char
2848 if (currentLen == 0) {
2849 UnicodeSet s;
2850 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2851 if (URX_VAL(op) != 0) {
2852 s.complement();
2853 }
2854 fRXPat->fInitialChars->addAll(s);
2855 numInitialStrings += 2;
2856 }
2857 currentLen = safeIncrement(currentLen, 1);
2858 atStart = FALSE;
2859 break;
2860
2861
2862 case URX_BACKSLASH_H:
2863 // Horiz white space
2864 if (currentLen == 0) {
2865 UnicodeSet s;
2866 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2867 s.add((UChar32)9); // Tab
2868 if (URX_VAL(op) != 0) {
2869 s.complement();
2870 }
2871 fRXPat->fInitialChars->addAll(s);
2872 numInitialStrings += 2;
2873 }
2874 currentLen = safeIncrement(currentLen, 1);
2875 atStart = FALSE;
2876 break;
2877
2878
2879 case URX_BACKSLASH_R: // Any line ending sequence
2880 case URX_BACKSLASH_V: // Any line ending code point, with optional negation
2881 if (currentLen == 0) {
2882 UnicodeSet s;
2883 s.add((UChar32)0x0a, (UChar32)0x0d); // add range
2884 s.add((UChar32)0x85);
2885 s.add((UChar32)0x2028, (UChar32)0x2029);
2886 if (URX_VAL(op) != 0) {
2887 // Complement option applies to URX_BACKSLASH_V only.
2888 s.complement();
2889 }
2890 fRXPat->fInitialChars->addAll(s);
2891 numInitialStrings += 2;
2892 }
2893 currentLen = safeIncrement(currentLen, 1);
2894 atStart = FALSE;
2895 break;
2896
2897
2898
2899 case URX_ONECHAR_I:
2900 // Case Insensitive Single Character.
2901 if (currentLen == 0) {
2902 UChar32 c = URX_VAL(op);
2903 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2904 UnicodeSet starters(c, c);
2905 starters.closeOver(USET_CASE_INSENSITIVE);
2906 // findCaseInsensitiveStarters(c, &starters);
2907 // For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2908 // The expanded folding can't match the pattern.
2909 fRXPat->fInitialChars->addAll(starters);
2910 } else {
2911 // Char has no case variants. Just add it as-is to the
2912 // set of possible starting chars.
2913 fRXPat->fInitialChars->add(c);
2914 }
2915 numInitialStrings += 2;
2916 }
2917 currentLen = safeIncrement(currentLen, 1);
2918 atStart = FALSE;
2919 break;
2920
2921
2922 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
2923 case URX_DOTANY_ALL: // . matches one or two.
2924 case URX_DOTANY:
2925 case URX_DOTANY_UNIX:
2926 if (currentLen == 0) {
2927 // These constructs are all bad news when they appear at the start
2928 // of a match. Any character can begin the match.
2929 fRXPat->fInitialChars->clear();
2930 fRXPat->fInitialChars->complement();
2931 numInitialStrings += 2;
2932 }
2933 currentLen = safeIncrement(currentLen, 1);
2934 atStart = FALSE;
2935 break;
2936
2937
2938 case URX_JMPX:
2939 loc++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2940 U_FALLTHROUGH;
2941 case URX_JMP:
2942 {
2943 int32_t jmpDest = URX_VAL(op);
2944 if (jmpDest < loc) {
2945 // Loop of some kind. Can safely ignore, the worst that will happen
2946 // is that we understate the true minimum length
2947 currentLen = forwardedLength.elementAti(loc+1);
2948
2949 } else {
2950 // Forward jump. Propagate the current min length to the target loc of the jump.
2951 U_ASSERT(jmpDest <= end+1);
2952 if (forwardedLength.elementAti(jmpDest) > currentLen) {
2953 forwardedLength.setElementAt(currentLen, jmpDest);
2954 }
2955 }
2956 }
2957 atStart = FALSE;
2958 break;
2959
2960 case URX_JMP_SAV:
2961 case URX_JMP_SAV_X:
2962 // Combo of state save to the next loc, + jmp backwards.
2963 // Net effect on min. length computation is nothing.
2964 atStart = FALSE;
2965 break;
2966
2967 case URX_BACKTRACK:
2968 // Fails are kind of like a branch, except that the min length was
2969 // propagated already, by the state save.
2970 currentLen = forwardedLength.elementAti(loc+1);
2971 atStart = FALSE;
2972 break;
2973
2974
2975 case URX_STATE_SAVE:
2976 {
2977 // State Save, for forward jumps, propagate the current minimum.
2978 // of the state save.
2979 int32_t jmpDest = URX_VAL(op);
2980 if (jmpDest > loc) {
2981 if (currentLen < forwardedLength.elementAti(jmpDest)) {
2982 forwardedLength.setElementAt(currentLen, jmpDest);
2983 }
2984 }
2985 }
2986 atStart = FALSE;
2987 break;
2988
2989
2990
2991
2992 case URX_STRING:
2993 {
2994 loc++;
2995 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2996 int32_t stringLen = URX_VAL(stringLenOp);
2997 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2998 U_ASSERT(stringLenOp >= 2);
2999 if (currentLen == 0) {
3000 // Add the starting character of this string to the set of possible starting
3001 // characters for this pattern.
3002 int32_t stringStartIdx = URX_VAL(op);
3003 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
3004 fRXPat->fInitialChars->add(c);
3005
3006 // Remember this string. After the entire pattern has been checked,
3007 // if nothing else is identified that can start a match, we'll use it.
3008 numInitialStrings++;
3009 fRXPat->fInitialStringIdx = stringStartIdx;
3010 fRXPat->fInitialStringLen = stringLen;
3011 }
3012
3013 currentLen = safeIncrement(currentLen, stringLen);
3014 atStart = FALSE;
3015 }
3016 break;
3017
3018 case URX_STRING_I:
3019 {
3020 // Case-insensitive string. Unlike exact-match strings, we won't
3021 // attempt a string search for possible match positions. But we
3022 // do update the set of possible starting characters.
3023 loc++;
3024 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3025 int32_t stringLen = URX_VAL(stringLenOp);
3026 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3027 U_ASSERT(stringLenOp >= 2);
3028 if (currentLen == 0) {
3029 // Add the starting character of this string to the set of possible starting
3030 // characters for this pattern.
3031 int32_t stringStartIdx = URX_VAL(op);
3032 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
3033 UnicodeSet s;
3034 findCaseInsensitiveStarters(c, &s);
3035 fRXPat->fInitialChars->addAll(s);
3036 numInitialStrings += 2; // Matching on an initial string not possible.
3037 }
3038 currentLen = safeIncrement(currentLen, stringLen);
3039 atStart = FALSE;
3040 }
3041 break;
3042
3043 case URX_CTR_INIT:
3044 case URX_CTR_INIT_NG:
3045 {
3046 // Loop Init Ops. These don't change the min length, but they are 4 word ops
3047 // so location must be updated accordingly.
3048 // Loop Init Ops.
3049 // If the min loop count == 0
3050 // move loc forwards to the end of the loop, skipping over the body.
3051 // If the min count is > 0,
3052 // continue normal processing of the body of the loop.
3053 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3054 loopEndLoc = URX_VAL(loopEndLoc);
3055 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3056 if (minLoopCount == 0) {
3057 // Min Loop Count of 0, treat like a forward branch and
3058 // move the current minimum length up to the target
3059 // (end of loop) location.
3060 U_ASSERT(loopEndLoc <= end+1);
3061 if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3062 forwardedLength.setElementAt(currentLen, loopEndLoc);
3063 }
3064 }
3065 loc+=3; // Skips over operands of CTR_INIT
3066 }
3067 atStart = FALSE;
3068 break;
3069
3070
3071 case URX_CTR_LOOP:
3072 case URX_CTR_LOOP_NG:
3073 // Loop ops.
3074 // The jump is conditional, backwards only.
3075 atStart = FALSE;
3076 break;
3077
3078 case URX_LOOP_C:
3079 // More loop ops. These state-save to themselves.
3080 // don't change the minimum match
3081 atStart = FALSE;
3082 break;
3083
3084
3085 case URX_LA_START:
3086 case URX_LB_START:
3087 {
3088 // Look-around. Scan forward until the matching look-ahead end,
3089 // without processing the look-around block. This is overly pessimistic.
3090
3091 // Keep track of the nesting depth of look-around blocks. Boilerplate code for
3092 // lookahead contains two LA_END instructions, so count goes up by two
3093 // for each LA_START.
3094 int32_t depth = (opType == URX_LA_START? 2: 1);
3095 for (;;) {
3096 loc++;
3097 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3098 if (URX_TYPE(op) == URX_LA_START) {
3099 depth+=2;
3100 }
3101 if (URX_TYPE(op) == URX_LB_START) {
3102 depth++;
3103 }
3104 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3105 depth--;
3106 if (depth == 0) {
3107 break;
3108 }
3109 }
3110 if (URX_TYPE(op) == URX_STATE_SAVE) {
3111 // Need this because neg lookahead blocks will FAIL to outside
3112 // of the block.
3113 int32_t jmpDest = URX_VAL(op);
3114 if (jmpDest > loc) {
3115 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3116 forwardedLength.setElementAt(currentLen, jmpDest);
3117 }
3118 }
3119 }
3120 U_ASSERT(loc <= end);
3121 }
3122 }
3123 break;
3124
3125 case URX_LA_END:
3126 case URX_LB_CONT:
3127 case URX_LB_END:
3128 case URX_LBN_CONT:
3129 case URX_LBN_END:
3130 UPRV_UNREACHABLE; // Shouldn't get here. These ops should be
3131 // consumed by the scan in URX_LA_START and LB_START
3132 default:
3133 UPRV_UNREACHABLE;
3134 }
3135
3136 }
3137
3138
3139 // We have finished walking through the ops. Check whether some forward jump
3140 // propagated a shorter length to location end+1.
3141 if (forwardedLength.elementAti(end+1) < currentLen) {
3142 currentLen = forwardedLength.elementAti(end+1);
3143 }
3144
3145
3146 fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3147
3148
3149 // Sort out what we should check for when looking for candidate match start positions.
3150 // In order of preference,
3151 // 1. Start of input text buffer.
3152 // 2. A literal string.
3153 // 3. Start of line in multi-line mode.
3154 // 4. A single literal character.
3155 // 5. A character from a set of characters.
3156 //
3157 if (fRXPat->fStartType == START_START) {
3158 // Match only at the start of an input text string.
3159 // start type is already set. We're done.
3160 } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3161 // Match beginning only with a literal string.
3162 UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3163 U_ASSERT(fRXPat->fInitialChars->contains(c));
3164 fRXPat->fStartType = START_STRING;
3165 fRXPat->fInitialChar = c;
3166 } else if (fRXPat->fStartType == START_LINE) {
3167 // Match at start of line in Multi-Line mode.
3168 // Nothing to do here; everything is already set.
3169 } else if (fRXPat->fMinMatchLen == 0) {
3170 // Zero length match possible. We could start anywhere.
3171 fRXPat->fStartType = START_NO_INFO;
3172 } else if (fRXPat->fInitialChars->size() == 1) {
3173 // All matches begin with the same char.
3174 fRXPat->fStartType = START_CHAR;
3175 fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3176 U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3177 } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
3178 fRXPat->fMinMatchLen > 0) {
3179 // Matches start with a set of character smaller than the set of all chars.
3180 fRXPat->fStartType = START_SET;
3181 } else {
3182 // Matches can start with anything
3183 fRXPat->fStartType = START_NO_INFO;
3184 }
3185
3186 return;
3187}
3188
3189
3190
3191//------------------------------------------------------------------------------
3192//
3193// minMatchLength Calculate the length of the shortest string that could
3194// match the specified pattern.
3195// Length is in 16 bit code units, not code points.
3196//
3197// The calculated length may not be exact. The returned
3198// value may be shorter than the actual minimum; it must
3199// never be longer.
3200//
3201// start and end are the range of p-code operations to be
3202// examined. The endpoints are included in the range.
3203//
3204//------------------------------------------------------------------------------
3205int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
3206 if (U_FAILURE(*fStatus)) {
3207 return 0;
3208 }
3209
3210 U_ASSERT(start <= end);
3211 U_ASSERT(end < fRXPat->fCompiledPat->size());
3212
3213
3214 int32_t loc;
3215 int32_t op;
3216 int32_t opType;
3217 int32_t currentLen = 0;
3218
3219
3220 // forwardedLength is a vector holding minimum-match-length values that
3221 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
3222 // It must be one longer than the pattern being checked because some ops
3223 // will jmp to a end-of-block+1 location from within a block, and we must
3224 // count those when checking the block.
3225 UVector32 forwardedLength(end+2, *fStatus);
3226 forwardedLength.setSize(end+2);
3227 for (loc=start; loc<=end+1; loc++) {
3228 forwardedLength.setElementAt(INT32_MAX, loc);
3229 }
3230
3231 for (loc = start; loc<=end; loc++) {
3232 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3233 opType = URX_TYPE(op);
3234
3235 // The loop is advancing linearly through the pattern.
3236 // If the op we are now at was the destination of a branch in the pattern,
3237 // and that path has a shorter minimum length than the current accumulated value,
3238 // replace the current accumulated value.
3239 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
3240 // no-match-possible cases.
3241 if (forwardedLength.elementAti(loc) < currentLen) {
3242 currentLen = forwardedLength.elementAti(loc);
3243 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3244 }
3245
3246 switch (opType) {
3247 // Ops that don't change the total length matched
3248 case URX_RESERVED_OP:
3249 case URX_END:
3250 case URX_STRING_LEN:
3251 case URX_NOP:
3252 case URX_START_CAPTURE:
3253 case URX_END_CAPTURE:
3254 case URX_BACKSLASH_B:
3255 case URX_BACKSLASH_BU:
3256 case URX_BACKSLASH_G:
3257 case URX_BACKSLASH_Z:
3258 case URX_CARET:
3259 case URX_DOLLAR:
3260 case URX_DOLLAR_M:
3261 case URX_DOLLAR_D:
3262 case URX_DOLLAR_MD:
3263 case URX_RELOC_OPRND:
3264 case URX_STO_INP_LOC:
3265 case URX_CARET_M:
3266 case URX_CARET_M_UNIX:
3267 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3268 case URX_BACKREF_I:
3269
3270 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3271 case URX_LD_SP:
3272
3273 case URX_JMP_SAV:
3274 case URX_JMP_SAV_X:
3275 break;
3276
3277
3278 // Ops that match a minimum of one character (one or two 16 bit code units.)
3279 //
3280 case URX_ONECHAR:
3281 case URX_STATIC_SETREF:
3282 case URX_STAT_SETREF_N:
3283 case URX_SETREF:
3284 case URX_BACKSLASH_D:
3285 case URX_BACKSLASH_H:
3286 case URX_BACKSLASH_R:
3287 case URX_BACKSLASH_V:
3288 case URX_ONECHAR_I:
3289 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3290 case URX_DOTANY_ALL: // . matches one or two.
3291 case URX_DOTANY:
3292 case URX_DOTANY_UNIX:
3293 currentLen = safeIncrement(currentLen, 1);
3294 break;
3295
3296
3297 case URX_JMPX:
3298 loc++; // URX_JMPX has an extra operand, ignored here,
3299 // otherwise processed identically to URX_JMP.
3300 U_FALLTHROUGH;
3301 case URX_JMP:
3302 {
3303 int32_t jmpDest = URX_VAL(op);
3304 if (jmpDest < loc) {
3305 // Loop of some kind. Can safely ignore, the worst that will happen
3306 // is that we understate the true minimum length
3307 currentLen = forwardedLength.elementAti(loc+1);
3308 } else {
3309 // Forward jump. Propagate the current min length to the target loc of the jump.
3310 U_ASSERT(jmpDest <= end+1);
3311 if (forwardedLength.elementAti(jmpDest) > currentLen) {
3312 forwardedLength.setElementAt(currentLen, jmpDest);
3313 }
3314 }
3315 }
3316 break;
3317
3318 case URX_BACKTRACK:
3319 {
3320 // Back-tracks are kind of like a branch, except that the min length was
3321 // propagated already, by the state save.
3322 currentLen = forwardedLength.elementAti(loc+1);
3323 }
3324 break;
3325
3326
3327 case URX_STATE_SAVE:
3328 {
3329 // State Save, for forward jumps, propagate the current minimum.
3330 // of the state save.
3331 int32_t jmpDest = URX_VAL(op);
3332 if (jmpDest > loc) {
3333 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3334 forwardedLength.setElementAt(currentLen, jmpDest);
3335 }
3336 }
3337 }
3338 break;
3339
3340
3341 case URX_STRING:
3342 {
3343 loc++;
3344 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3345 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3346 }
3347 break;
3348
3349
3350 case URX_STRING_I:
3351 {
3352 loc++;
3353 // TODO: with full case folding, matching input text may be shorter than
3354 // the string we have here. More smarts could put some bounds on it.
3355 // Assume a min length of one for now. A min length of zero causes
3356 // optimization failures for a pattern like "string"+
3357 // currentLen += URX_VAL(stringLenOp);
3358 currentLen = safeIncrement(currentLen, 1);
3359 }
3360 break;
3361
3362 case URX_CTR_INIT:
3363 case URX_CTR_INIT_NG:
3364 {
3365 // Loop Init Ops.
3366 // If the min loop count == 0
3367 // move loc forwards to the end of the loop, skipping over the body.
3368 // If the min count is > 0,
3369 // continue normal processing of the body of the loop.
3370 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3371 loopEndLoc = URX_VAL(loopEndLoc);
3372 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3373 if (minLoopCount == 0) {
3374 loc = loopEndLoc;
3375 } else {
3376 loc+=3; // Skips over operands of CTR_INIT
3377 }
3378 }
3379 break;
3380
3381
3382 case URX_CTR_LOOP:
3383 case URX_CTR_LOOP_NG:
3384 // Loop ops.
3385 // The jump is conditional, backwards only.
3386 break;
3387
3388 case URX_LOOP_SR_I:
3389 case URX_LOOP_DOT_I:
3390 case URX_LOOP_C:
3391 // More loop ops. These state-save to themselves.
3392 // don't change the minimum match - could match nothing at all.
3393 break;
3394
3395
3396 case URX_LA_START:
3397 case URX_LB_START:
3398 {
3399 // Look-around. Scan forward until the matching look-ahead end,
3400 // without processing the look-around block. This is overly pessimistic for look-ahead,
3401 // it assumes that the look-ahead match might be zero-length.
3402 // TODO: Positive lookahead could recursively do the block, then continue
3403 // with the longer of the block or the value coming in. Ticket 6060
3404 int32_t depth = (opType == URX_LA_START? 2: 1);
3405 for (;;) {
3406 loc++;
3407 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3408 if (URX_TYPE(op) == URX_LA_START) {
3409 // The boilerplate for look-ahead includes two LA_END insturctions,
3410 // Depth will be decremented by each one when it is seen.
3411 depth += 2;
3412 }
3413 if (URX_TYPE(op) == URX_LB_START) {
3414 depth++;
3415 }
3416 if (URX_TYPE(op) == URX_LA_END) {
3417 depth--;
3418 if (depth == 0) {
3419 break;
3420 }
3421 }
3422 if (URX_TYPE(op)==URX_LBN_END) {
3423 depth--;
3424 if (depth == 0) {
3425 break;
3426 }
3427 }
3428 if (URX_TYPE(op) == URX_STATE_SAVE) {
3429 // Need this because neg lookahead blocks will FAIL to outside
3430 // of the block.
3431 int32_t jmpDest = URX_VAL(op);
3432 if (jmpDest > loc) {
3433 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3434 forwardedLength.setElementAt(currentLen, jmpDest);
3435 }
3436 }
3437 }
3438 U_ASSERT(loc <= end);
3439 }
3440 }
3441 break;
3442
3443 case URX_LA_END:
3444 case URX_LB_CONT:
3445 case URX_LB_END:
3446 case URX_LBN_CONT:
3447 case URX_LBN_END:
3448 // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3449 // range being sized, which happens when measuring size of look-behind blocks.
3450 break;
3451
3452 default:
3453 UPRV_UNREACHABLE;
3454 }
3455
3456 }
3457
3458 // We have finished walking through the ops. Check whether some forward jump
3459 // propagated a shorter length to location end+1.
3460 if (forwardedLength.elementAti(end+1) < currentLen) {
3461 currentLen = forwardedLength.elementAti(end+1);
3462 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3463 }
3464
3465 return currentLen;
3466}
3467
3468//------------------------------------------------------------------------------
3469//
3470// maxMatchLength Calculate the length of the longest string that could
3471// match the specified pattern.
3472// Length is in 16 bit code units, not code points.
3473//
3474// The calculated length may not be exact. The returned
3475// value may be longer than the actual maximum; it must
3476// never be shorter.
3477//
3478//------------------------------------------------------------------------------
3479int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3480 if (U_FAILURE(*fStatus)) {
3481 return 0;
3482 }
3483 U_ASSERT(start <= end);
3484 U_ASSERT(end < fRXPat->fCompiledPat->size());
3485
3486 int32_t loc;
3487 int32_t op;
3488 int32_t opType;
3489 int32_t currentLen = 0;
3490 UVector32 forwardedLength(end+1, *fStatus);
3491 forwardedLength.setSize(end+1);
3492
3493 for (loc=start; loc<=end; loc++) {
3494 forwardedLength.setElementAt(0, loc);
3495 }
3496
3497 for (loc = start; loc<=end; loc++) {
3498 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3499 opType = URX_TYPE(op);
3500
3501 // The loop is advancing linearly through the pattern.
3502 // If the op we are now at was the destination of a branch in the pattern,
3503 // and that path has a longer maximum length than the current accumulated value,
3504 // replace the current accumulated value.
3505 if (forwardedLength.elementAti(loc) > currentLen) {
3506 currentLen = forwardedLength.elementAti(loc);
3507 }
3508
3509 switch (opType) {
3510 // Ops that don't change the total length matched
3511 case URX_RESERVED_OP:
3512 case URX_END:
3513 case URX_STRING_LEN:
3514 case URX_NOP:
3515 case URX_START_CAPTURE:
3516 case URX_END_CAPTURE:
3517 case URX_BACKSLASH_B:
3518 case URX_BACKSLASH_BU:
3519 case URX_BACKSLASH_G:
3520 case URX_BACKSLASH_Z:
3521 case URX_CARET:
3522 case URX_DOLLAR:
3523 case URX_DOLLAR_M:
3524 case URX_DOLLAR_D:
3525 case URX_DOLLAR_MD:
3526 case URX_RELOC_OPRND:
3527 case URX_STO_INP_LOC:
3528 case URX_CARET_M:
3529 case URX_CARET_M_UNIX:
3530
3531 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3532 case URX_LD_SP:
3533
3534 case URX_LB_END:
3535 case URX_LB_CONT:
3536 case URX_LBN_CONT:
3537 case URX_LBN_END:
3538 break;
3539
3540
3541 // Ops that increase that cause an unbounded increase in the length
3542 // of a matched string, or that increase it a hard to characterize way.
3543 // Call the max length unbounded, and stop further checking.
3544 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3545 case URX_BACKREF_I:
3546 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3547 currentLen = INT32_MAX;
3548 break;
3549
3550
3551 // Ops that match a max of one character (possibly two 16 bit code units.)
3552 //
3553 case URX_STATIC_SETREF:
3554 case URX_STAT_SETREF_N:
3555 case URX_SETREF:
3556 case URX_BACKSLASH_D:
3557 case URX_BACKSLASH_H:
3558 case URX_BACKSLASH_R:
3559 case URX_BACKSLASH_V:
3560 case URX_ONECHAR_I:
3561 case URX_DOTANY_ALL:
3562 case URX_DOTANY:
3563 case URX_DOTANY_UNIX:
3564 currentLen = safeIncrement(currentLen, 2);
3565 break;
3566
3567 // Single literal character. Increase current max length by one or two,
3568 // depending on whether the char is in the supplementary range.
3569 case URX_ONECHAR:
3570 currentLen = safeIncrement(currentLen, 1);
3571 if (URX_VAL(op) > 0x10000) {
3572 currentLen = safeIncrement(currentLen, 1);
3573 }
3574 break;
3575
3576 // Jumps.
3577 //
3578 case URX_JMP:
3579 case URX_JMPX:
3580 case URX_JMP_SAV:
3581 case URX_JMP_SAV_X:
3582 {
3583 int32_t jmpDest = URX_VAL(op);
3584 if (jmpDest < loc) {
3585 // Loop of some kind. Max match length is unbounded.
3586 currentLen = INT32_MAX;
3587 } else {
3588 // Forward jump. Propagate the current min length to the target loc of the jump.
3589 if (forwardedLength.elementAti(jmpDest) < currentLen) {
3590 forwardedLength.setElementAt(currentLen, jmpDest);
3591 }
3592 currentLen = 0;
3593 }
3594 }
3595 break;
3596
3597 case URX_BACKTRACK:
3598 // back-tracks are kind of like a branch, except that the max length was
3599 // propagated already, by the state save.
3600 currentLen = forwardedLength.elementAti(loc+1);
3601 break;
3602
3603
3604 case URX_STATE_SAVE:
3605 {
3606 // State Save, for forward jumps, propagate the current minimum.
3607 // of the state save.
3608 // For backwards jumps, they create a loop, maximum
3609 // match length is unbounded.
3610 int32_t jmpDest = URX_VAL(op);
3611 if (jmpDest > loc) {
3612 if (currentLen > forwardedLength.elementAti(jmpDest)) {
3613 forwardedLength.setElementAt(currentLen, jmpDest);
3614 }
3615 } else {
3616 currentLen = INT32_MAX;
3617 }
3618 }
3619 break;
3620
3621
3622
3623
3624 case URX_STRING:
3625 {
3626 loc++;
3627 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3628 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3629 break;
3630 }
3631
3632 case URX_STRING_I:
3633 // TODO: This code assumes that any user string that matches will be no longer
3634 // than our compiled string, with case insensitive matching.
3635 // Our compiled string has been case-folded already.
3636 //
3637 // Any matching user string will have no more code points than our
3638 // compiled (folded) string. Folding may add code points, but
3639 // not remove them.
3640 //
3641 // There is a potential problem if a supplemental code point
3642 // case-folds to a BMP code point. In this case our compiled string
3643 // could be shorter (in code units) than a matching user string.
3644 //
3645 // At this time (Unicode 6.1) there are no such characters, and this case
3646 // is not being handled. A test, intltest regex/Bug9283, will fail if
3647 // any problematic characters are added to Unicode.
3648 //
3649 // If this happens, we can make a set of the BMP chars that the
3650 // troublesome supplementals fold to, scan our string, and bump the
3651 // currentLen one extra for each that is found.
3652 //
3653 {
3654 loc++;
3655 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3656 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3657 }
3658 break;
3659
3660 case URX_CTR_INIT:
3661 case URX_CTR_INIT_NG:
3662 // For Loops, recursively call this function on the pattern for the loop body,
3663 // then multiply the result by the maximum loop count.
3664 {
3665 int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3666 if (loopEndLoc == loc+4) {
3667 // Loop has an empty body. No affect on max match length.
3668 // Continue processing with code after the loop end.
3669 loc = loopEndLoc;
3670 break;
3671 }
3672
3673 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3674 if (maxLoopCount == -1) {
3675 // Unbounded Loop. No upper bound on match length.
3676 currentLen = INT32_MAX;
3677 break;
3678 }
3679
3680 U_ASSERT(loopEndLoc >= loc+4);
3681 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call.
3682 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3683 if (updatedLen >= INT32_MAX) {
3684 currentLen = INT32_MAX;
3685 break;
3686 }
3687 currentLen = (int32_t)updatedLen;
3688 loc = loopEndLoc;
3689 break;
3690 }
3691
3692 case URX_CTR_LOOP:
3693 case URX_CTR_LOOP_NG:
3694 // These opcodes will be skipped over by code for URX_CTR_INIT.
3695 // We shouldn't encounter them here.
3696 UPRV_UNREACHABLE;
3697
3698 case URX_LOOP_SR_I:
3699 case URX_LOOP_DOT_I:
3700 case URX_LOOP_C:
3701 // For anything to do with loops, make the match length unbounded.
3702 currentLen = INT32_MAX;
3703 break;
3704
3705
3706
3707 case URX_LA_START:
3708 case URX_LA_END:
3709 // Look-ahead. Just ignore, treat the look-ahead block as if
3710 // it were normal pattern. Gives a too-long match length,
3711 // but good enough for now.
3712 break;
3713
3714 // End of look-ahead ops should always be consumed by the processing at
3715 // the URX_LA_START op.
3716 // UPRV_UNREACHABLE;
3717
3718 case URX_LB_START:
3719 {
3720 // Look-behind. Scan forward until the matching look-around end,
3721 // without processing the look-behind block.
3722 int32_t dataLoc = URX_VAL(op);
3723 for (loc = loc + 1; loc < end; ++loc) {
3724 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3725 int32_t opType = URX_TYPE(op);
3726 if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
3727 break;
3728 }
3729 }
3730 U_ASSERT(loc < end);
3731 }
3732 break;
3733
3734 default:
3735 UPRV_UNREACHABLE;
3736 }
3737
3738
3739 if (currentLen == INT32_MAX) {
3740 // The maximum length is unbounded.
3741 // Stop further processing of the pattern.
3742 break;
3743 }
3744
3745 }
3746 return currentLen;
3747
3748}
3749
3750
3751//------------------------------------------------------------------------------
3752//
3753// stripNOPs Remove any NOP operations from the compiled pattern code.
3754// Extra NOPs are inserted for some constructs during the initial
3755// code generation to provide locations that may be patched later.
3756// Many end up unneeded, and are removed by this function.
3757//
3758// In order to minimize the number of passes through the pattern,
3759// back-reference fixup is also performed here (adjusting
3760// back-reference operands to point to the correct frame offsets).
3761//
3762//------------------------------------------------------------------------------
3763void RegexCompile::stripNOPs() {
3764
3765 if (U_FAILURE(*fStatus)) {
3766 return;
3767 }
3768
3769 int32_t end = fRXPat->fCompiledPat->size();
3770 UVector32 deltas(end, *fStatus);
3771
3772 // Make a first pass over the code, computing the amount that things
3773 // will be offset at each location in the original code.
3774 int32_t loc;
3775 int32_t d = 0;
3776 for (loc=0; loc<end; loc++) {
3777 deltas.addElement(d, *fStatus);
3778 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3779 if (URX_TYPE(op) == URX_NOP) {
3780 d++;
3781 }
3782 }
3783
3784 UnicodeString caseStringBuffer;
3785
3786 // Make a second pass over the code, removing the NOPs by moving following
3787 // code up, and patching operands that refer to code locations that
3788 // are being moved. The array of offsets from the first step is used
3789 // to compute the new operand values.
3790 int32_t src;
3791 int32_t dst = 0;
3792 for (src=0; src<end; src++) {
3793 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3794 int32_t opType = URX_TYPE(op);
3795 switch (opType) {
3796 case URX_NOP:
3797 break;
3798
3799 case URX_STATE_SAVE:
3800 case URX_JMP:
3801 case URX_CTR_LOOP:
3802 case URX_CTR_LOOP_NG:
3803 case URX_RELOC_OPRND:
3804 case URX_JMPX:
3805 case URX_JMP_SAV:
3806 case URX_JMP_SAV_X:
3807 // These are instructions with operands that refer to code locations.
3808 {
3809 int32_t operandAddress = URX_VAL(op);
3810 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3811 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3812 op = buildOp(opType, fixedOperandAddress);
3813 fRXPat->fCompiledPat->setElementAt(op, dst);
3814 dst++;
3815 break;
3816 }
3817
3818 case URX_BACKREF:
3819 case URX_BACKREF_I:
3820 {
3821 int32_t where = URX_VAL(op);
3822 if (where > fRXPat->fGroupMap->size()) {
3823 error(U_REGEX_INVALID_BACK_REF);
3824 break;
3825 }
3826 where = fRXPat->fGroupMap->elementAti(where-1);
3827 op = buildOp(opType, where);
3828 fRXPat->fCompiledPat->setElementAt(op, dst);
3829 dst++;
3830
3831 fRXPat->fNeedsAltInput = TRUE;
3832 break;
3833 }
3834 case URX_RESERVED_OP:
3835 case URX_RESERVED_OP_N:
3836 case URX_BACKTRACK:
3837 case URX_END:
3838 case URX_ONECHAR:
3839 case URX_STRING:
3840 case URX_STRING_LEN:
3841 case URX_START_CAPTURE:
3842 case URX_END_CAPTURE:
3843 case URX_STATIC_SETREF:
3844 case URX_STAT_SETREF_N:
3845 case URX_SETREF:
3846 case URX_DOTANY:
3847 case URX_FAIL:
3848 case URX_BACKSLASH_B:
3849 case URX_BACKSLASH_BU:
3850 case URX_BACKSLASH_G:
3851 case URX_BACKSLASH_X:
3852 case URX_BACKSLASH_Z:
3853 case URX_DOTANY_ALL:
3854 case URX_BACKSLASH_D:
3855 case URX_CARET:
3856 case URX_DOLLAR:
3857 case URX_CTR_INIT:
3858 case URX_CTR_INIT_NG:
3859 case URX_DOTANY_UNIX:
3860 case URX_STO_SP:
3861 case URX_LD_SP:
3862 case URX_STO_INP_LOC:
3863 case URX_LA_START:
3864 case URX_LA_END:
3865 case URX_ONECHAR_I:
3866 case URX_STRING_I:
3867 case URX_DOLLAR_M:
3868 case URX_CARET_M:
3869 case URX_CARET_M_UNIX:
3870 case URX_LB_START:
3871 case URX_LB_CONT:
3872 case URX_LB_END:
3873 case URX_LBN_CONT:
3874 case URX_LBN_END:
3875 case URX_LOOP_SR_I:
3876 case URX_LOOP_DOT_I:
3877 case URX_LOOP_C:
3878 case URX_DOLLAR_D:
3879 case URX_DOLLAR_MD:
3880 case URX_BACKSLASH_H:
3881 case URX_BACKSLASH_R:
3882 case URX_BACKSLASH_V:
3883 // These instructions are unaltered by the relocation.
3884 fRXPat->fCompiledPat->setElementAt(op, dst);
3885 dst++;
3886 break;
3887
3888 default:
3889 // Some op is unaccounted for.
3890 UPRV_UNREACHABLE;
3891 }
3892 }
3893
3894 fRXPat->fCompiledPat->setSize(dst);
3895}
3896
3897
3898
3899
3900//------------------------------------------------------------------------------
3901//
3902// Error Report a rule parse error.
3903// Only report it if no previous error has been recorded.
3904//
3905//------------------------------------------------------------------------------
3906void RegexCompile::error(UErrorCode e) {
3907 if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) {
3908 *fStatus = e;
3909 // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3910 // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3911 // int64_t. If the values of the latter are out of range for the former,
3912 // set them to the appropriate "field not supported" values.
3913 if (fLineNum > 0x7FFFFFFF) {
3914 fParseErr->line = 0;
3915 fParseErr->offset = -1;
3916 } else if (fCharNum > 0x7FFFFFFF) {
3917 fParseErr->line = (int32_t)fLineNum;
3918 fParseErr->offset = -1;
3919 } else {
3920 fParseErr->line = (int32_t)fLineNum;
3921 fParseErr->offset = (int32_t)fCharNum;
3922 }
3923
3924 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3925
3926 // Fill in the context.
3927 // Note: extractBetween() pins supplied indicies to the string bounds.
3928 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext));
3929 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3930 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3931 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3932 }
3933}
3934
3935
3936//
3937// Assorted Unicode character constants.
3938// Numeric because there is no portable way to enter them as literals.
3939// (Think EBCDIC).
3940//
3941static const UChar chCR = 0x0d; // New lines, for terminating comments.
3942static const UChar chLF = 0x0a; // Line Feed
3943static const UChar chPound = 0x23; // '#', introduces a comment.
3944static const UChar chDigit0 = 0x30; // '0'
3945static const UChar chDigit7 = 0x37; // '9'
3946static const UChar chColon = 0x3A; // ':'
3947static const UChar chE = 0x45; // 'E'
3948static const UChar chQ = 0x51; // 'Q'
3949//static const UChar chN = 0x4E; // 'N'
3950static const UChar chP = 0x50; // 'P'
3951static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
3952//static const UChar chLBracket = 0x5b; // '['
3953static const UChar chRBracket = 0x5d; // ']'
3954static const UChar chUp = 0x5e; // '^'
3955static const UChar chLowerP = 0x70;
3956static const UChar chLBrace = 0x7b; // '{'
3957static const UChar chRBrace = 0x7d; // '}'
3958static const UChar chNEL = 0x85; // NEL newline variant
3959static const UChar chLS = 0x2028; // Unicode Line Separator
3960
3961
3962//------------------------------------------------------------------------------
3963//
3964// nextCharLL Low Level Next Char from the regex pattern.
3965// Get a char from the string, keep track of input position
3966// for error reporting.
3967//
3968//------------------------------------------------------------------------------
3969UChar32 RegexCompile::nextCharLL() {
3970 UChar32 ch;
3971
3972 if (fPeekChar != -1) {
3973 ch = fPeekChar;
3974 fPeekChar = -1;
3975 return ch;
3976 }
3977
3978 // assume we're already in the right place
3979 ch = UTEXT_NEXT32(fRXPat->fPattern);
3980 if (ch == U_SENTINEL) {
3981 return ch;
3982 }
3983
3984 if (ch == chCR ||
3985 ch == chNEL ||
3986 ch == chLS ||
3987 (ch == chLF && fLastChar != chCR)) {
3988 // Character is starting a new line. Bump up the line number, and
3989 // reset the column to 0.
3990 fLineNum++;
3991 fCharNum=0;
3992 }
3993 else {
3994 // Character is not starting a new line. Except in the case of a
3995 // LF following a CR, increment the column position.
3996 if (ch != chLF) {
3997 fCharNum++;
3998 }
3999 }
4000 fLastChar = ch;
4001 return ch;
4002}
4003
4004//------------------------------------------------------------------------------
4005//
4006// peekCharLL Low Level Character Scanning, sneak a peek at the next
4007// character without actually getting it.
4008//
4009//------------------------------------------------------------------------------
4010UChar32 RegexCompile::peekCharLL() {
4011 if (fPeekChar == -1) {
4012 fPeekChar = nextCharLL();
4013 }
4014 return fPeekChar;
4015}
4016
4017
4018//------------------------------------------------------------------------------
4019//
4020// nextChar for pattern scanning. At this level, we handle stripping
4021// out comments and processing some backslash character escapes.
4022// The rest of the pattern grammar is handled at the next level up.
4023//
4024//------------------------------------------------------------------------------
4025void RegexCompile::nextChar(RegexPatternChar &c) {
4026 tailRecursion:
4027 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4028 c.fChar = nextCharLL();
4029 c.fQuoted = FALSE;
4030
4031 if (fQuoteMode) {
4032 c.fQuoted = TRUE;
4033 if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4034 c.fChar == (UChar32)-1) {
4035 fQuoteMode = FALSE; // Exit quote mode,
4036 nextCharLL(); // discard the E
4037 // nextChar(c); // recurse to get the real next char
4038 goto tailRecursion; // Note: fuzz testing produced testcases that
4039 // resulted in stack overflow here.
4040 }
4041 }
4042 else if (fInBackslashQuote) {
4043 // The current character immediately follows a '\'
4044 // Don't check for any further escapes, just return it as-is.
4045 // Don't set c.fQuoted, because that would prevent the state machine from
4046 // dispatching on the character.
4047 fInBackslashQuote = FALSE;
4048 }
4049 else
4050 {
4051 // We are not in a \Q quoted region \E of the source.
4052 //
4053 if (fModeFlags & UREGEX_COMMENTS) {
4054 //
4055 // We are in free-spacing and comments mode.
4056 // Scan through any white space and comments, until we
4057 // reach a significant character or the end of inut.
4058 for (;;) {
4059 if (c.fChar == (UChar32)-1) {
4060 break; // End of Input
4061 }
4062 if (c.fChar == chPound && fEOLComments == TRUE) {
4063 // Start of a comment. Consume the rest of it, until EOF or a new line
4064 for (;;) {
4065 c.fChar = nextCharLL();
4066 if (c.fChar == (UChar32)-1 || // EOF
4067 c.fChar == chCR ||
4068 c.fChar == chLF ||
4069 c.fChar == chNEL ||
4070 c.fChar == chLS) {
4071 break;
4072 }
4073 }
4074 }
4075 // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061.
4076 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4077 break;
4078 }
4079 c.fChar = nextCharLL();
4080 }
4081 }
4082
4083 //
4084 // check for backslash escaped characters.
4085 //
4086 if (c.fChar == chBackSlash) {
4087 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4088 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4089 //
4090 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4091 // Includes \uxxxx, \n, \r, many others.
4092 // Return the single equivalent character.
4093 //
4094 nextCharLL(); // get & discard the peeked char.
4095 c.fQuoted = TRUE;
4096
4097 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4098 int32_t endIndex = (int32_t)pos;
4099 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4100
4101 if (endIndex == pos) {
4102 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4103 }
4104 fCharNum += endIndex - pos;
4105 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4106 } else {
4107 int32_t offset = 0;
4108 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4109
4110 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4111 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4112
4113 if (offset == 0) {
4114 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4115 } else if (context.lastOffset == offset) {
4116 UTEXT_PREVIOUS32(fRXPat->fPattern);
4117 } else if (context.lastOffset != offset-1) {
4118 utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4119 }
4120 fCharNum += offset;
4121 }
4122 }
4123 else if (peekCharLL() == chDigit0) {
4124 // Octal Escape, using Java Regexp Conventions
4125 // which are \0 followed by 1-3 octal digits.
4126 // Different from ICU Unescape handling of Octal, which does not
4127 // require the leading 0.
4128 // Java also has the convention of only consuming 2 octal digits if
4129 // the three digit number would be > 0xff
4130 //
4131 c.fChar = 0;
4132 nextCharLL(); // Consume the initial 0.
4133 int index;
4134 for (index=0; index<3; index++) {
4135 int32_t ch = peekCharLL();
4136 if (ch<chDigit0 || ch>chDigit7) {
4137 if (index==0) {
4138 // \0 is not followed by any octal digits.
4139 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4140 }
4141 break;
4142 }
4143 c.fChar <<= 3;
4144 c.fChar += ch&7;
4145 if (c.fChar <= 255) {
4146 nextCharLL();
4147 } else {
4148 // The last digit made the number too big. Forget we saw it.
4149 c.fChar >>= 3;
4150 }
4151 }
4152 c.fQuoted = TRUE;
4153 }
4154 else if (peekCharLL() == chQ) {
4155 // "\Q" enter quote mode, which will continue until "\E"
4156 fQuoteMode = TRUE;
4157 nextCharLL(); // discard the 'Q'.
4158 // nextChar(c); // recurse to get the real next char.
4159 goto tailRecursion; // Note: fuzz testing produced test cases that
4160 // resulted in stack overflow here.
4161 }
4162 else
4163 {
4164 // We are in a '\' escape that will be handled by the state table scanner.
4165 // Just return the backslash, but remember that the following char is to
4166 // be taken literally.
4167 fInBackslashQuote = TRUE;
4168 }
4169 }
4170 }
4171
4172 // re-enable # to end-of-line comments, in case they were disabled.
4173 // They are disabled by the parser upon seeing '(?', but this lasts for
4174 // the fetching of the next character only.
4175 fEOLComments = TRUE;
4176
4177 // putc(c.fChar, stdout);
4178}
4179
4180
4181
4182//------------------------------------------------------------------------------
4183//
4184// scanNamedChar
4185// Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4186//
4187// The scan position will be at the 'N'. On return
4188// the scan position should be just after the '}'
4189//
4190// Return the UChar32
4191//
4192//------------------------------------------------------------------------------
4193UChar32 RegexCompile::scanNamedChar() {
4194 if (U_FAILURE(*fStatus)) {
4195 return 0;
4196 }
4197
4198 nextChar(fC);
4199 if (fC.fChar != chLBrace) {
4200 error(U_REGEX_PROPERTY_SYNTAX);
4201 return 0;
4202 }
4203
4204 UnicodeString charName;
4205 for (;;) {
4206 nextChar(fC);
4207 if (fC.fChar == chRBrace) {
4208 break;
4209 }
4210 if (fC.fChar == -1) {
4211 error(U_REGEX_PROPERTY_SYNTAX);
4212 return 0;
4213 }
4214 charName.append(fC.fChar);
4215 }
4216
4217 char name[100];
4218 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4219 (uint32_t)charName.length()>=sizeof(name)) {
4220 // All Unicode character names have only invariant characters.
4221 // The API to get a character, given a name, accepts only char *, forcing us to convert,
4222 // which requires this error check
4223 error(U_REGEX_PROPERTY_SYNTAX);
4224 return 0;
4225 }
4226 charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4227
4228 UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4229 if (U_FAILURE(*fStatus)) {
4230 error(U_REGEX_PROPERTY_SYNTAX);
4231 }
4232
4233 nextChar(fC); // Continue overall regex pattern processing with char after the '}'
4234 return theChar;
4235}
4236
4237//------------------------------------------------------------------------------
4238//
4239// scanProp Construct a UnicodeSet from the text at the current scan
4240// position, which will be of the form \p{whaterver}
4241//
4242// The scan position will be at the 'p' or 'P'. On return
4243// the scan position should be just after the '}'
4244//
4245// Return a UnicodeSet, constructed from the \P pattern,
4246// or NULL if the pattern is invalid.
4247//
4248//------------------------------------------------------------------------------
4249UnicodeSet *RegexCompile::scanProp() {
4250 UnicodeSet *uset = NULL;
4251
4252 if (U_FAILURE(*fStatus)) {
4253 return NULL;
4254 }
4255 (void)chLowerP; // Suppress compiler unused variable warning.
4256 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4257 UBool negated = (fC.fChar == chP);
4258
4259 UnicodeString propertyName;
4260 nextChar(fC);
4261 if (fC.fChar != chLBrace) {
4262 error(U_REGEX_PROPERTY_SYNTAX);
4263 return NULL;
4264 }
4265 for (;;) {
4266 nextChar(fC);
4267 if (fC.fChar == chRBrace) {
4268 break;
4269 }
4270 if (fC.fChar == -1) {
4271 // Hit the end of the input string without finding the closing '}'
4272 error(U_REGEX_PROPERTY_SYNTAX);
4273 return NULL;
4274 }
4275 propertyName.append(fC.fChar);
4276 }
4277 uset = createSetForProperty(propertyName, negated);
4278 nextChar(fC); // Move input scan to position following the closing '}'
4279 return uset;
4280}
4281
4282//------------------------------------------------------------------------------
4283//
4284// scanPosixProp Construct a UnicodeSet from the text at the current scan
4285// position, which is expected be of the form [:property expression:]
4286//
4287// The scan position will be at the opening ':'. On return
4288// the scan position must be on the closing ']'
4289//
4290// Return a UnicodeSet constructed from the pattern,
4291// or NULL if this is not a valid POSIX-style set expression.
4292// If not a property expression, restore the initial scan position
4293// (to the opening ':')
4294//
4295// Note: the opening '[:' is not sufficient to guarantee that
4296// this is a [:property:] expression.
4297// [:'+=,] is a perfectly good ordinary set expression that
4298// happens to include ':' as one of its characters.
4299//
4300//------------------------------------------------------------------------------
4301UnicodeSet *RegexCompile::scanPosixProp() {
4302 UnicodeSet *uset = NULL;
4303
4304 if (U_FAILURE(*fStatus)) {
4305 return NULL;
4306 }
4307
4308 U_ASSERT(fC.fChar == chColon);
4309
4310 // Save the scanner state.
4311 // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
4312 int64_t savedScanIndex = fScanIndex;
4313 int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4314 UBool savedQuoteMode = fQuoteMode;
4315 UBool savedInBackslashQuote = fInBackslashQuote;
4316 UBool savedEOLComments = fEOLComments;
4317 int64_t savedLineNum = fLineNum;
4318 int64_t savedCharNum = fCharNum;
4319 UChar32 savedLastChar = fLastChar;
4320 UChar32 savedPeekChar = fPeekChar;
4321 RegexPatternChar savedfC = fC;
4322
4323 // Scan for a closing ]. A little tricky because there are some perverse
4324 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
4325 // ending on the second closing ].
4326
4327 UnicodeString propName;
4328 UBool negated = FALSE;
4329
4330 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
4331 nextChar(fC);
4332 if (fC.fChar == chUp) {
4333 negated = TRUE;
4334 nextChar(fC);
4335 }
4336
4337 // Scan for the closing ":]", collecting the property name along the way.
4338 UBool sawPropSetTerminator = FALSE;
4339 for (;;) {
4340 propName.append(fC.fChar);
4341 nextChar(fC);
4342 if (fC.fQuoted || fC.fChar == -1) {
4343 // Escaped characters or end of input - either says this isn't a [:Property:]
4344 break;
4345 }
4346 if (fC.fChar == chColon) {
4347 nextChar(fC);
4348 if (fC.fChar == chRBracket) {
4349 sawPropSetTerminator = TRUE;
4350 }
4351 break;
4352 }
4353 }
4354
4355 if (sawPropSetTerminator) {
4356 uset = createSetForProperty(propName, negated);
4357 }
4358 else
4359 {
4360 // No closing ":]".
4361 // Restore the original scan position.
4362 // The main scanner will retry the input as a normal set expression,
4363 // not a [:Property:] expression.
4364 fScanIndex = savedScanIndex;
4365 fQuoteMode = savedQuoteMode;
4366 fInBackslashQuote = savedInBackslashQuote;
4367 fEOLComments = savedEOLComments;
4368 fLineNum = savedLineNum;
4369 fCharNum = savedCharNum;
4370 fLastChar = savedLastChar;
4371 fPeekChar = savedPeekChar;
4372 fC = savedfC;
4373 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4374 }
4375 return uset;
4376}
4377
4378static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4379 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4380 addCategory(set, U_GC_CF_MASK, ec);
4381}
4382
4383//
4384// Create a Unicode Set from a Unicode Property expression.
4385// This is common code underlying both \p{...} ane [:...:] expressions.
4386// Includes trying the Java "properties" that aren't supported as
4387// normal ICU UnicodeSet properties
4388//
4389UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4390
4391 if (U_FAILURE(*fStatus)) {
4392 return nullptr;
4393 }
4394 LocalPointer<UnicodeSet> set;
4395 UErrorCode status = U_ZERO_ERROR;
4396
4397 do { // non-loop, exists to allow breaks from the block.
4398 //
4399 // First try the property as we received it
4400 //
4401 UnicodeString setExpr;
4402 uint32_t usetFlags = 0;
4403 setExpr.append(u"[\\p{", -1);
4404 setExpr.append(propName);
4405 setExpr.append(u"}]", -1);
4406 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4407 usetFlags |= USET_CASE_INSENSITIVE;
4408 }
4409 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, NULL, status), status);
4410 if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
4411 break;
4412 }
4413
4414 //
4415 // The incoming property wasn't directly recognized by ICU.
4416
4417 // Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4418 // Java accepts 'word' with mixed case.
4419 // Java accepts 'all' only in all lower case.
4420
4421 status = U_ZERO_ERROR;
4422 if (propName.caseCompare(u"word", -1, 0) == 0) {
4423 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])), status);
4424 break;
4425 }
4426 if (propName.compare(u"all", -1) == 0) {
4427 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
4428 break;
4429 }
4430
4431
4432 // Do Java InBlock expressions
4433 //
4434 UnicodeString mPropName = propName;
4435 if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
4436 status = U_ZERO_ERROR;
4437 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4438 if (U_FAILURE(status)) {
4439 break;
4440 }
4441 UnicodeString blockName(mPropName, 2); // Property with the leading "In" removed.
4442 set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
4443 break;
4444 }
4445
4446 // Check for the Java form "IsBooleanPropertyValue", which we will recast
4447 // as "BooleanPropertyValue". The property value can be either a
4448 // a General Category or a Script Name.
4449
4450 if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
4451 mPropName.remove(0, 2); // Strip the "Is"
4452 if (mPropName.indexOf(u'=') >= 0) {
4453 // Reject any "Is..." property expression containing an '=', that is,
4454 // any non-binary property expression.
4455 status = U_REGEX_PROPERTY_SYNTAX;
4456 break;
4457 }
4458
4459 if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
4460 mPropName.setTo(u"unassigned", -1);
4461 negated = !negated;
4462 } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
4463 mPropName.setTo(u"Titlecase_Letter", -1);
4464 }
4465
4466 mPropName.insert(0, u"[\\p{", -1);
4467 mPropName.append(u"}]", -1);
4468 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
4469
4470 if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4471 set->closeOver(USET_CASE_INSENSITIVE);
4472 }
4473 break;
4474
4475 }
4476
4477 if (propName.startsWith(u"java", -1)) {
4478 status = U_ZERO_ERROR;
4479 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4480 if (U_FAILURE(status)) {
4481 break;
4482 }
4483 //
4484 // Try the various Java specific properties.
4485 // These all begin with "java"
4486 //
4487 if (propName.compare(u"javaDefined", -1) == 0) {
4488 addCategory(set.getAlias(), U_GC_CN_MASK, status);
4489 set->complement();
4490 }
4491 else if (propName.compare(u"javaDigit", -1) == 0) {
4492 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4493 }
4494 else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
4495 addIdentifierIgnorable(set.getAlias(), status);
4496 }
4497 else if (propName.compare(u"javaISOControl", -1) == 0) {
4498 set->add(0, 0x1F).add(0x7F, 0x9F);
4499 }
4500 else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
4501 addCategory(set.getAlias(), U_GC_L_MASK, status);
4502 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4503 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4504 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4505 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4506 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4507 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4508 addIdentifierIgnorable(set.getAlias(), status);
4509 }
4510 else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
4511 addCategory(set.getAlias(), U_GC_L_MASK, status);
4512 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4513 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4514 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4515 }
4516 else if (propName.compare(u"javaLetter", -1) == 0) {
4517 addCategory(set.getAlias(), U_GC_L_MASK, status);
4518 }
4519 else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
4520 addCategory(set.getAlias(), U_GC_L_MASK, status);
4521 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4522 }
4523 else if (propName.compare(u"javaLowerCase", -1) == 0) {
4524 addCategory(set.getAlias(), U_GC_LL_MASK, status);
4525 }
4526 else if (propName.compare(u"javaMirrored", -1) == 0) {
4527 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
4528 }
4529 else if (propName.compare(u"javaSpaceChar", -1) == 0) {
4530 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4531 }
4532 else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
4533 set->add(0x10000, UnicodeSet::MAX_VALUE);
4534 }
4535 else if (propName.compare(u"javaTitleCase", -1) == 0) {
4536 addCategory(set.getAlias(), U_GC_LT_MASK, status);
4537 }
4538 else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
4539 addCategory(set.getAlias(), U_GC_L_MASK, status);
4540 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4541 }
4542 else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
4543 addCategory(set.getAlias(), U_GC_L_MASK, status);
4544 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4545 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4546 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4547 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4548 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4549 addIdentifierIgnorable(set.getAlias(), status);
4550 }
4551 else if (propName.compare(u"javaUpperCase", -1) == 0) {
4552 addCategory(set.getAlias(), U_GC_LU_MASK, status);
4553 }
4554 else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
4555 set->add(0, UnicodeSet::MAX_VALUE);
4556 }
4557 else if (propName.compare(u"javaWhitespace", -1) == 0) {
4558 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4559 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4560 set->add(9, 0x0d).add(0x1c, 0x1f);
4561 } else {
4562 status = U_REGEX_PROPERTY_SYNTAX;
4563 }
4564
4565 if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4566 set->closeOver(USET_CASE_INSENSITIVE);
4567 }
4568 break;
4569 }
4570
4571 // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4572 // extensions matched it.
4573 status = U_REGEX_PROPERTY_SYNTAX;
4574 } while (false); // End of do loop block. Code above breaks out of the block on success or hard failure.
4575
4576 if (U_SUCCESS(status)) {
4577 U_ASSERT(set.isValid());
4578 if (negated) {
4579 set->complement();
4580 }
4581 return set.orphan();
4582 } else {
4583 if (status == U_ILLEGAL_ARGUMENT_ERROR) {
4584 status = U_REGEX_PROPERTY_SYNTAX;
4585 }
4586 error(status);
4587 return nullptr;
4588 }
4589}
4590
4591
4592//
4593// SetEval Part of the evaluation of [set expressions].
4594// Perform any pending (stacked) operations with precedence
4595// equal or greater to that of the next operator encountered
4596// in the expression.
4597//
4598void RegexCompile::setEval(int32_t nextOp) {
4599 UnicodeSet *rightOperand = NULL;
4600 UnicodeSet *leftOperand = NULL;
4601 for (;;) {
4602 U_ASSERT(fSetOpStack.empty()==FALSE);
4603 int32_t pendingSetOperation = fSetOpStack.peeki();
4604 if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4605 break;
4606 }
4607 fSetOpStack.popi();
4608 U_ASSERT(fSetStack.empty() == FALSE);
4609 rightOperand = (UnicodeSet *)fSetStack.peek();
4610 switch (pendingSetOperation) {
4611 case setNegation:
4612 rightOperand->complement();
4613 break;
4614 case setCaseClose:
4615 // TODO: need a simple close function. Ticket 6065
4616 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4617 rightOperand->removeAllStrings();
4618 break;
4619 case setDifference1:
4620 case setDifference2:
4621 fSetStack.pop();
4622 leftOperand = (UnicodeSet *)fSetStack.peek();
4623 leftOperand->removeAll(*rightOperand);
4624 delete rightOperand;
4625 break;
4626 case setIntersection1:
4627 case setIntersection2:
4628 fSetStack.pop();
4629 leftOperand = (UnicodeSet *)fSetStack.peek();
4630 leftOperand->retainAll(*rightOperand);
4631 delete rightOperand;
4632 break;
4633 case setUnion:
4634 fSetStack.pop();
4635 leftOperand = (UnicodeSet *)fSetStack.peek();
4636 leftOperand->addAll(*rightOperand);
4637 delete rightOperand;
4638 break;
4639 default:
4640 UPRV_UNREACHABLE;
4641 }
4642 }
4643 }
4644
4645void RegexCompile::setPushOp(int32_t op) {
4646 setEval(op);
4647 fSetOpStack.push(op, *fStatus);
4648 fSetStack.push(new UnicodeSet(), *fStatus);
4649}
4650
4651U_NAMESPACE_END
4652#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
4653
4654