1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3//
4// file: regexcmp.cpp
5//
6// Copyright (C) 2002-2016 International Business Machines Corporation and others.
7// All Rights Reserved.
8//
9// This file contains the ICU regular expression compiler, which is responsible
10// for processing a regular expression pattern into the compiled form that
11// is used by the match finding engine.
12//
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_REGULAR_EXPRESSIONS
17
18#include "unicode/ustring.h"
19#include "unicode/unistr.h"
20#include "unicode/uniset.h"
21#include "unicode/uchar.h"
22#include "unicode/uchriter.h"
23#include "unicode/parsepos.h"
24#include "unicode/parseerr.h"
25#include "unicode/regex.h"
26#include "unicode/utf.h"
27#include "unicode/utf16.h"
28#include "patternprops.h"
29#include "putilimp.h"
30#include "cmemory.h"
31#include "cstr.h"
32#include "cstring.h"
33#include "uvectr32.h"
34#include "uvectr64.h"
35#include "uassert.h"
36#include "uinvchar.h"
37
38#include "regeximp.h"
39#include "regexcst.h" // Contains state table for the regex pattern parser.
40 // generated by a Perl script.
41#include "regexcmp.h"
42#include "regexst.h"
43#include "regextxt.h"
44
45
46
47U_NAMESPACE_BEGIN
48
49
50//------------------------------------------------------------------------------
51//
52// Constructor.
53//
54//------------------------------------------------------------------------------
55RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
56 fParenStack(status), fSetStack(status), fSetOpStack(status)
57{
58 // Lazy init of all shared global sets (needed for init()'s empty text)
59 RegexStaticSets::initGlobals(&status);
60
61 fStatus = &status;
62
63 fRXPat = rxp;
64 fScanIndex = 0;
65 fLastChar = -1;
66 fPeekChar = -1;
67 fLineNum = 1;
68 fCharNum = 0;
69 fQuoteMode = FALSE;
70 fInBackslashQuote = FALSE;
71 fModeFlags = fRXPat->fFlags | 0x80000000;
72 fEOLComments = TRUE;
73
74 fMatchOpenParen = -1;
75 fMatchCloseParen = -1;
76 fCaptureName = NULL;
77 fLastSetLiteral = U_SENTINEL;
78
79 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
80 status = rxp->fDeferredStatus;
81 }
82}
83
84static const UChar chAmp = 0x26; // '&'
85static const UChar chDash = 0x2d; // '-'
86
87
88//------------------------------------------------------------------------------
89//
90// Destructor
91//
92//------------------------------------------------------------------------------
93RegexCompile::~RegexCompile() {
94 delete fCaptureName; // Normally will be NULL, but can exist if pattern
95 // compilation stops with a syntax error.
96}
97
98static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
99 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
100}
101
102//------------------------------------------------------------------------------
103//
104// Compile regex pattern. The state machine for rexexp pattern parsing is here.
105// The state tables are hand-written in the file regexcst.txt,
106// and converted to the form used here by a perl
107// script regexcst.pl
108//
109//------------------------------------------------------------------------------
110void RegexCompile::compile(
111 const UnicodeString &pat, // Source pat to be compiled.
112 UParseError &pp, // Error position info
113 UErrorCode &e) // Error Code
114{
115 fRXPat->fPatternString = new UnicodeString(pat);
116 UText patternText = UTEXT_INITIALIZER;
117 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
118
119 if (U_SUCCESS(e)) {
120 compile(&patternText, pp, e);
121 utext_close(&patternText);
122 }
123}
124
125//
126// compile, UText mode
127// All the work is actually done here.
128//
129void RegexCompile::compile(
130 UText *pat, // Source pat to be compiled.
131 UParseError &pp, // Error position info
132 UErrorCode &e) // Error Code
133{
134 fStatus = &e;
135 fParseErr = &pp;
136 fStackPtr = 0;
137 fStack[fStackPtr] = 0;
138
139 if (U_FAILURE(*fStatus)) {
140 return;
141 }
142
143 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
144 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
145
146 // Prepare the RegexPattern object to receive the compiled pattern.
147 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
148 if (U_FAILURE(*fStatus)) {
149 return;
150 }
151 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets;
152 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8;
153
154
155 // Initialize the pattern scanning state machine
156 fPatternLength = utext_nativeLength(pat);
157 uint16_t state = 1;
158 const RegexTableEl *tableEl;
159
160 // UREGEX_LITERAL force entire pattern to be treated as a literal string.
161 if (fModeFlags & UREGEX_LITERAL) {
162 fQuoteMode = TRUE;
163 }
164
165 nextChar(fC); // Fetch the first char from the pattern string.
166
167 //
168 // Main loop for the regex pattern parsing state machine.
169 // Runs once per state transition.
170 // Each time through optionally performs, depending on the state table,
171 // - an advance to the the next pattern char
172 // - an action to be performed.
173 // - pushing or popping a state to/from the local state return stack.
174 // file regexcst.txt is the source for the state table. The logic behind
175 // recongizing the pattern syntax is there, not here.
176 //
177 for (;;) {
178 // Bail out if anything has gone wrong.
179 // Regex pattern parsing stops on the first error encountered.
180 if (U_FAILURE(*fStatus)) {
181 break;
182 }
183
184 U_ASSERT(state != 0);
185
186 // Find the state table element that matches the input char from the pattern, or the
187 // class of the input character. Start with the first table row for this
188 // state, then linearly scan forward until we find a row that matches the
189 // character. The last row for each state always matches all characters, so
190 // the search will stop there, if not before.
191 //
192 tableEl = &gRuleParseStateTable[state];
193 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
194 fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
195
196 for (;;) { // loop through table rows belonging to this state, looking for one
197 // that matches the current input char.
198 REGEX_SCAN_DEBUG_PRINTF(("."));
199 if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
200 // Table row specified an individual character, not a set, and
201 // the input character is not quoted, and
202 // the input character matched it.
203 break;
204 }
205 if (tableEl->fCharClass == 255) {
206 // Table row specified default, match anything character class.
207 break;
208 }
209 if (tableEl->fCharClass == 254 && fC.fQuoted) {
210 // Table row specified "quoted" and the char was quoted.
211 break;
212 }
213 if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) {
214 // Table row specified eof and we hit eof on the input.
215 break;
216 }
217
218 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
219 fC.fQuoted == FALSE && // char is not escaped &&
220 fC.fChar != (UChar32)-1) { // char is not EOF
221 U_ASSERT(tableEl->fCharClass <= 137);
222 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
223 // Table row specified a character class, or set of characters,
224 // and the current char matches it.
225 break;
226 }
227 }
228
229 // No match on this row, advance to the next row for this state,
230 tableEl++;
231 }
232 REGEX_SCAN_DEBUG_PRINTF(("\n"));
233
234 //
235 // We've found the row of the state table that matches the current input
236 // character from the rules string.
237 // Perform any action specified by this row in the state table.
238 if (doParseActions(tableEl->fAction) == FALSE) {
239 // Break out of the state machine loop if the
240 // the action signalled some kind of error, or
241 // the action was to exit, occurs on normal end-of-rules-input.
242 break;
243 }
244
245 if (tableEl->fPushState != 0) {
246 fStackPtr++;
247 if (fStackPtr >= kStackSize) {
248 error(U_REGEX_INTERNAL_ERROR);
249 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
250 fStackPtr--;
251 }
252 fStack[fStackPtr] = tableEl->fPushState;
253 }
254
255 //
256 // NextChar. This is where characters are actually fetched from the pattern.
257 // Happens under control of the 'n' tag in the state table.
258 //
259 if (tableEl->fNextChar) {
260 nextChar(fC);
261 }
262
263 // Get the next state from the table entry, or from the
264 // state stack if the next state was specified as "pop".
265 if (tableEl->fNextState != 255) {
266 state = tableEl->fNextState;
267 } else {
268 state = fStack[fStackPtr];
269 fStackPtr--;
270 if (fStackPtr < 0) {
271 // state stack underflow
272 // This will occur if the user pattern has mis-matched parentheses,
273 // with extra close parens.
274 //
275 fStackPtr++;
276 error(U_REGEX_MISMATCHED_PAREN);
277 }
278 }
279
280 }
281
282 if (U_FAILURE(*fStatus)) {
283 // Bail out if the pattern had errors.
284 // Set stack cleanup: a successful compile would have left it empty,
285 // but errors can leave temporary sets hanging around.
286 while (!fSetStack.empty()) {
287 delete (UnicodeSet *)fSetStack.pop();
288 }
289 return;
290 }
291
292 //
293 // The pattern has now been read and processed, and the compiled code generated.
294 //
295
296 //
297 // The pattern's fFrameSize so far has accumulated the requirements for
298 // storage for capture parentheses, counters, etc. that are encountered
299 // in the pattern. Add space for the two variables that are always
300 // present in the saved state: the input string position (int64_t) and
301 // the position in the compiled pattern.
302 //
303 allocateStackData(RESTACKFRAME_HDRCOUNT);
304
305 //
306 // Optimization pass 1: NOPs, back-references, and case-folding
307 //
308 stripNOPs();
309
310 //
311 // Get bounds for the minimum and maximum length of a string that this
312 // pattern can match. Used to avoid looking for matches in strings that
313 // are too short.
314 //
315 fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
316
317 //
318 // Optimization pass 2: match start type
319 //
320 matchStartType();
321
322 //
323 // Set up fast latin-1 range sets
324 //
325 int32_t numSets = fRXPat->fSets->size();
326 fRXPat->fSets8 = new Regex8BitSet[numSets];
327 // Null pointer check.
328 if (fRXPat->fSets8 == NULL) {
329 e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
330 return;
331 }
332 int32_t i;
333 for (i=0; i<numSets; i++) {
334 UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
335 fRXPat->fSets8[i].init(s);
336 }
337
338}
339
340
341
342
343
344//------------------------------------------------------------------------------
345//
346// doParseAction Do some action during regex pattern parsing.
347// Called by the parse state machine.
348//
349// Generation of the match engine PCode happens here, or
350// in functions called from the parse actions defined here.
351//
352//
353//------------------------------------------------------------------------------
354UBool RegexCompile::doParseActions(int32_t action)
355{
356 UBool returnVal = TRUE;
357
358 switch ((Regex_PatternParseAction)action) {
359
360 case doPatStart:
361 // Start of pattern compiles to:
362 //0 SAVE 2 Fall back to position of FAIL
363 //1 jmp 3
364 //2 FAIL Stop if we ever reach here.
365 //3 NOP Dummy, so start of pattern looks the same as
366 // the start of an ( grouping.
367 //4 NOP Resreved, will be replaced by a save if there are
368 // OR | operators at the top level
369 appendOp(URX_STATE_SAVE, 2);
370 appendOp(URX_JMP, 3);
371 appendOp(URX_FAIL, 0);
372
373 // Standard open nonCapture paren action emits the two NOPs and
374 // sets up the paren stack frame.
375 doParseActions(doOpenNonCaptureParen);
376 break;
377
378 case doPatFinish:
379 // We've scanned to the end of the pattern
380 // The end of pattern compiles to:
381 // URX_END
382 // which will stop the runtime match engine.
383 // Encountering end of pattern also behaves like a close paren,
384 // and forces fixups of the State Save at the beginning of the compiled pattern
385 // and of any OR operations at the top level.
386 //
387 handleCloseParen();
388 if (fParenStack.size() > 0) {
389 // Missing close paren in pattern.
390 error(U_REGEX_MISMATCHED_PAREN);
391 }
392
393 // add the END operation to the compiled pattern.
394 appendOp(URX_END, 0);
395
396 // Terminate the pattern compilation state machine.
397 returnVal = FALSE;
398 break;
399
400
401
402 case doOrOperator:
403 // Scanning a '|', as in (A|B)
404 {
405 // Generate code for any pending literals preceding the '|'
406 fixLiterals(FALSE);
407
408 // Insert a SAVE operation at the start of the pattern section preceding
409 // this OR at this level. This SAVE will branch the match forward
410 // to the right hand side of the OR in the event that the left hand
411 // side fails to match and backtracks. Locate the position for the
412 // save from the location on the top of the parentheses stack.
413 int32_t savePosition = fParenStack.popi();
414 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
415 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location
416 op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
417 fRXPat->fCompiledPat->setElementAt(op, savePosition);
418
419 // Append an JMP operation into the compiled pattern. The operand for
420 // the JMP will eventually be the location following the ')' for the
421 // group. This will be patched in later, when the ')' is encountered.
422 appendOp(URX_JMP, 0);
423
424 // Push the position of the newly added JMP op onto the parentheses stack.
425 // This registers if for fixup when this block's close paren is encountered.
426 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
427
428 // Append a NOP to the compiled pattern. This is the slot reserved
429 // for a SAVE in the event that there is yet another '|' following
430 // this one.
431 appendOp(URX_NOP, 0);
432 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
433 }
434 break;
435
436
437 case doBeginNamedCapture:
438 // Scanning (?<letter.
439 // The first letter of the name will come through again under doConinueNamedCapture.
440 fCaptureName = new UnicodeString();
441 if (fCaptureName == NULL) {
442 error(U_MEMORY_ALLOCATION_ERROR);
443 }
444 break;
445
446 case doContinueNamedCapture:
447 fCaptureName->append(fC.fChar);
448 break;
449
450 case doBadNamedCapture:
451 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
452 break;
453
454 case doOpenCaptureParen:
455 // Open Capturing Paren, possibly named.
456 // Compile to a
457 // - NOP, which later may be replaced by a save-state if the
458 // parenthesized group gets a * quantifier, followed by
459 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
460 // - NOP, which may later be replaced by a save-state if there
461 // is an '|' alternation within the parens.
462 //
463 // Each capture group gets three slots in the save stack frame:
464 // 0: Capture Group start position (in input string being matched.)
465 // 1: Capture Group end position.
466 // 2: Start of Match-in-progress.
467 // The first two locations are for a completed capture group, and are
468 // referred to by back references and the like.
469 // The third location stores the capture start position when an START_CAPTURE is
470 // encountered. This will be promoted to a completed capture when (and if) the corresponding
471 // END_CAPTURE is encountered.
472 {
473 fixLiterals();
474 appendOp(URX_NOP, 0);
475 int32_t varsLoc = allocateStackData(3); // Reserve three slots in match stack frame.
476 appendOp(URX_START_CAPTURE, varsLoc);
477 appendOp(URX_NOP, 0);
478
479 // On the Parentheses stack, start a new frame and add the postions
480 // of the two NOPs. Depending on what follows in the pattern, the
481 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
482 // address of the end of the parenthesized group.
483 fParenStack.push(fModeFlags, *fStatus); // Match mode state
484 fParenStack.push(capturing, *fStatus); // Frame type.
485 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location
486 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
487
488 // Save the mapping from group number to stack frame variable position.
489 fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
490
491 // If this is a named capture group, add the name->group number mapping.
492 if (fCaptureName != NULL) {
493 int32_t groupNumber = fRXPat->fGroupMap->size();
494 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
495 fCaptureName = NULL; // hash table takes ownership of the name (key) string.
496 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
497 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
498 }
499 }
500 }
501 break;
502
503 case doOpenNonCaptureParen:
504 // Open non-caputuring (grouping only) Paren.
505 // Compile to a
506 // - NOP, which later may be replaced by a save-state if the
507 // parenthesized group gets a * quantifier, followed by
508 // - NOP, which may later be replaced by a save-state if there
509 // is an '|' alternation within the parens.
510 {
511 fixLiterals();
512 appendOp(URX_NOP, 0);
513 appendOp(URX_NOP, 0);
514
515 // On the Parentheses stack, start a new frame and add the postions
516 // of the two NOPs.
517 fParenStack.push(fModeFlags, *fStatus); // Match mode state
518 fParenStack.push(plain, *fStatus); // Begin a new frame.
519 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
520 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
521 }
522 break;
523
524
525 case doOpenAtomicParen:
526 // Open Atomic Paren. (?>
527 // Compile to a
528 // - NOP, which later may be replaced if the parenthesized group
529 // has a quantifier, followed by
530 // - STO_SP save state stack position, so it can be restored at the ")"
531 // - NOP, which may later be replaced by a save-state if there
532 // is an '|' alternation within the parens.
533 {
534 fixLiterals();
535 appendOp(URX_NOP, 0);
536 int32_t varLoc = allocateData(1); // Reserve a data location for saving the state stack ptr.
537 appendOp(URX_STO_SP, varLoc);
538 appendOp(URX_NOP, 0);
539
540 // On the Parentheses stack, start a new frame and add the postions
541 // of the two NOPs. Depending on what follows in the pattern, the
542 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
543 // address of the end of the parenthesized group.
544 fParenStack.push(fModeFlags, *fStatus); // Match mode state
545 fParenStack.push(atomic, *fStatus); // Frame type.
546 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
547 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
548 }
549 break;
550
551
552 case doOpenLookAhead:
553 // Positive Look-ahead (?= stuff )
554 //
555 // Note: Addition of transparent input regions, with the need to
556 // restore the original regions when failing out of a lookahead
557 // block, complicated this sequence. Some conbined opcodes
558 // might make sense - or might not, lookahead aren't that common.
559 //
560 // Caution: min match length optimization knows about this
561 // sequence; don't change without making updates there too.
562 //
563 // Compiles to
564 // 1 LA_START dataLoc Saves SP, Input Pos, Active input region.
565 // 2. STATE_SAVE 4 on failure of lookahead, goto 4
566 // 3 JMP 6 continue ...
567 //
568 // 4. LA_END Look Ahead failed. Restore regions.
569 // 5. BACKTRACK and back track again.
570 //
571 // 6. NOP reserved for use by quantifiers on the block.
572 // Look-ahead can't have quantifiers, but paren stack
573 // compile time conventions require the slot anyhow.
574 // 7. NOP may be replaced if there is are '|' ops in the block.
575 // 8. code for parenthesized stuff.
576 // 9. LA_END
577 //
578 // Four data slots are reserved, for saving state on entry to the look-around
579 // 0: stack pointer on entry.
580 // 1: input position on entry.
581 // 2: fActiveStart, the active bounds start on entry.
582 // 3: fActiveLimit, the active bounds limit on entry.
583 {
584 fixLiterals();
585 int32_t dataLoc = allocateData(4);
586 appendOp(URX_LA_START, dataLoc);
587 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
588 appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
589 appendOp(URX_LA_END, dataLoc);
590 appendOp(URX_BACKTRACK, 0);
591 appendOp(URX_NOP, 0);
592 appendOp(URX_NOP, 0);
593
594 // On the Parentheses stack, start a new frame and add the postions
595 // of the NOPs.
596 fParenStack.push(fModeFlags, *fStatus); // Match mode state
597 fParenStack.push(lookAhead, *fStatus); // Frame type.
598 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
599 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
600 }
601 break;
602
603 case doOpenLookAheadNeg:
604 // Negated Lookahead. (?! stuff )
605 // Compiles to
606 // 1. LA_START dataloc
607 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
608 // // which continues with the match.
609 // 3. NOP // Std. Open Paren sequence, for possible '|'
610 // 4. code for parenthesized stuff.
611 // 5. LA_END // Cut back stack, remove saved state from step 2.
612 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
613 // 7. END_LA // Restore match region, in case look-ahead was using
614 // an alternate (transparent) region.
615 // Four data slots are reserved, for saving state on entry to the look-around
616 // 0: stack pointer on entry.
617 // 1: input position on entry.
618 // 2: fActiveStart, the active bounds start on entry.
619 // 3: fActiveLimit, the active bounds limit on entry.
620 {
621 fixLiterals();
622 int32_t dataLoc = allocateData(4);
623 appendOp(URX_LA_START, dataLoc);
624 appendOp(URX_STATE_SAVE, 0); // dest address will be patched later.
625 appendOp(URX_NOP, 0);
626
627 // On the Parentheses stack, start a new frame and add the postions
628 // of the StateSave and NOP.
629 fParenStack.push(fModeFlags, *fStatus); // Match mode state
630 fParenStack.push(negLookAhead, *fStatus); // Frame type
631 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location
632 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
633
634 // Instructions #5 - #7 will be added when the ')' is encountered.
635 }
636 break;
637
638 case doOpenLookBehind:
639 {
640 // Compile a (?<= look-behind open paren.
641 //
642 // Compiles to
643 // 0 URX_LB_START dataLoc
644 // 1 URX_LB_CONT dataLoc
645 // 2 MinMatchLen
646 // 3 MaxMatchLen
647 // 4 URX_NOP Standard '(' boilerplate.
648 // 5 URX_NOP Reserved slot for use with '|' ops within (block).
649 // 6 <code for LookBehind expression>
650 // 7 URX_LB_END dataLoc # Check match len, restore input len
651 // 8 URX_LA_END dataLoc # Restore stack, input pos
652 //
653 // Allocate a block of matcher data, to contain (when running a match)
654 // 0: Stack ptr on entry
655 // 1: Input Index on entry
656 // 2: fActiveStart, the active bounds start on entry.
657 // 3: fActiveLimit, the active bounds limit on entry.
658 // 4: Start index of match current match attempt.
659 // The first four items must match the layout of data for LA_START / LA_END
660
661 // Generate match code for any pending literals.
662 fixLiterals();
663
664 // Allocate data space
665 int32_t dataLoc = allocateData(5);
666
667 // Emit URX_LB_START
668 appendOp(URX_LB_START, dataLoc);
669
670 // Emit URX_LB_CONT
671 appendOp(URX_LB_CONT, dataLoc);
672 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
673 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
674
675 // Emit the NOPs
676 appendOp(URX_NOP, 0);
677 appendOp(URX_NOP, 0);
678
679 // On the Parentheses stack, start a new frame and add the postions
680 // of the URX_LB_CONT and the NOP.
681 fParenStack.push(fModeFlags, *fStatus); // Match mode state
682 fParenStack.push(lookBehind, *fStatus); // Frame type
683 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
684 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
685
686 // The final two instructions will be added when the ')' is encountered.
687 }
688
689 break;
690
691 case doOpenLookBehindNeg:
692 {
693 // Compile a (?<! negated look-behind open paren.
694 //
695 // Compiles to
696 // 0 URX_LB_START dataLoc # Save entry stack, input len
697 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions
698 // 2 MinMatchLen
699 // 3 MaxMatchLen
700 // 4 continueLoc (9)
701 // 5 URX_NOP Standard '(' boilerplate.
702 // 6 URX_NOP Reserved slot for use with '|' ops within (block).
703 // 7 <code for LookBehind expression>
704 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
705 // 9 ...
706 //
707 // Allocate a block of matcher data, to contain (when running a match)
708 // 0: Stack ptr on entry
709 // 1: Input Index on entry
710 // 2: fActiveStart, the active bounds start on entry.
711 // 3: fActiveLimit, the active bounds limit on entry.
712 // 4: Start index of match current match attempt.
713 // The first four items must match the layout of data for LA_START / LA_END
714
715 // Generate match code for any pending literals.
716 fixLiterals();
717
718 // Allocate data space
719 int32_t dataLoc = allocateData(5);
720
721 // Emit URX_LB_START
722 appendOp(URX_LB_START, dataLoc);
723
724 // Emit URX_LBN_CONT
725 appendOp(URX_LBN_CONT, dataLoc);
726 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
727 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
728 appendOp(URX_RESERVED_OP, 0); // Continue Loc. To be filled later.
729
730 // Emit the NOPs
731 appendOp(URX_NOP, 0);
732 appendOp(URX_NOP, 0);
733
734 // On the Parentheses stack, start a new frame and add the postions
735 // of the URX_LB_CONT and the NOP.
736 fParenStack.push(fModeFlags, *fStatus); // Match mode state
737 fParenStack.push(lookBehindN, *fStatus); // Frame type
738 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
739 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
740
741 // The final two instructions will be added when the ')' is encountered.
742 }
743 break;
744
745 case doConditionalExpr:
746 // Conditionals such as (?(1)a:b)
747 case doPerlInline:
748 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them.
749 error(U_REGEX_UNIMPLEMENTED);
750 break;
751
752
753 case doCloseParen:
754 handleCloseParen();
755 if (fParenStack.size() <= 0) {
756 // Extra close paren, or missing open paren.
757 error(U_REGEX_MISMATCHED_PAREN);
758 }
759 break;
760
761 case doNOP:
762 break;
763
764
765 case doBadOpenParenType:
766 case doRuleError:
767 error(U_REGEX_RULE_SYNTAX);
768 break;
769
770
771 case doMismatchedParenErr:
772 error(U_REGEX_MISMATCHED_PAREN);
773 break;
774
775 case doPlus:
776 // Normal '+' compiles to
777 // 1. stuff to be repeated (already built)
778 // 2. jmp-sav 1
779 // 3. ...
780 //
781 // Or, if the item to be repeated can match a zero length string,
782 // 1. STO_INP_LOC data-loc
783 // 2. body of stuff to be repeated
784 // 3. JMP_SAV_X 2
785 // 4. ...
786
787 //
788 // Or, if the item to be repeated is simple
789 // 1. Item to be repeated.
790 // 2. LOOP_SR_I set number (assuming repeated item is a set ref)
791 // 3. LOOP_C stack location
792 {
793 int32_t topLoc = blockTopLoc(FALSE); // location of item #1
794 int32_t frameLoc;
795
796 // Check for simple constructs, which may get special optimized code.
797 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
798 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
799
800 if (URX_TYPE(repeatedOp) == URX_SETREF) {
801 // Emit optimized code for [char set]+
802 appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
803 frameLoc = allocateStackData(1);
804 appendOp(URX_LOOP_C, frameLoc);
805 break;
806 }
807
808 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
809 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
810 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
811 // Emit Optimized code for .+ operations.
812 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
813 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
814 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
815 loopOpI |= 1;
816 }
817 if (fModeFlags & UREGEX_UNIX_LINES) {
818 loopOpI |= 2;
819 }
820 appendOp(loopOpI);
821 frameLoc = allocateStackData(1);
822 appendOp(URX_LOOP_C, frameLoc);
823 break;
824 }
825
826 }
827
828 // General case.
829
830 // Check for minimum match length of zero, which requires
831 // extra loop-breaking code.
832 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
833 // Zero length match is possible.
834 // Emit the code sequence that can handle it.
835 insertOp(topLoc);
836 frameLoc = allocateStackData(1);
837
838 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
839 fRXPat->fCompiledPat->setElementAt(op, topLoc);
840
841 appendOp(URX_JMP_SAV_X, topLoc+1);
842 } else {
843 // Simpler code when the repeated body must match something non-empty
844 appendOp(URX_JMP_SAV, topLoc);
845 }
846 }
847 break;
848
849 case doNGPlus:
850 // Non-greedy '+?' compiles to
851 // 1. stuff to be repeated (already built)
852 // 2. state-save 1
853 // 3. ...
854 {
855 int32_t topLoc = blockTopLoc(FALSE);
856 appendOp(URX_STATE_SAVE, topLoc);
857 }
858 break;
859
860
861 case doOpt:
862 // Normal (greedy) ? quantifier.
863 // Compiles to
864 // 1. state save 3
865 // 2. body of optional block
866 // 3. ...
867 // Insert the state save into the compiled pattern, and we're done.
868 {
869 int32_t saveStateLoc = blockTopLoc(TRUE);
870 int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
871 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
872 }
873 break;
874
875 case doNGOpt:
876 // Non-greedy ?? quantifier
877 // compiles to
878 // 1. jmp 4
879 // 2. body of optional block
880 // 3 jmp 5
881 // 4. state save 2
882 // 5 ...
883 // This code is less than ideal, with two jmps instead of one, because we can only
884 // insert one instruction at the top of the block being iterated.
885 {
886 int32_t jmp1_loc = blockTopLoc(TRUE);
887 int32_t jmp2_loc = fRXPat->fCompiledPat->size();
888
889 int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+1);
890 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
891
892 appendOp(URX_JMP, jmp2_loc+2);
893
894 appendOp(URX_STATE_SAVE, jmp1_loc+1);
895 }
896 break;
897
898
899 case doStar:
900 // Normal (greedy) * quantifier.
901 // Compiles to
902 // 1. STATE_SAVE 4
903 // 2. body of stuff being iterated over
904 // 3. JMP_SAV 2
905 // 4. ...
906 //
907 // Or, if the body is a simple [Set],
908 // 1. LOOP_SR_I set number
909 // 2. LOOP_C stack location
910 // ...
911 //
912 // Or if this is a .*
913 // 1. LOOP_DOT_I (. matches all mode flag)
914 // 2. LOOP_C stack location
915 //
916 // Or, if the body can match a zero-length string, to inhibit infinite loops,
917 // 1. STATE_SAVE 5
918 // 2. STO_INP_LOC data-loc
919 // 3. body of stuff
920 // 4. JMP_SAV_X 2
921 // 5. ...
922 {
923 // location of item #1, the STATE_SAVE
924 int32_t topLoc = blockTopLoc(FALSE);
925 int32_t dataLoc = -1;
926
927 // Check for simple *, where the construct being repeated
928 // compiled to single opcode, and might be optimizable.
929 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
930 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
931
932 if (URX_TYPE(repeatedOp) == URX_SETREF) {
933 // Emit optimized code for a [char set]*
934 int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
935 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
936 dataLoc = allocateStackData(1);
937 appendOp(URX_LOOP_C, dataLoc);
938 break;
939 }
940
941 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
942 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
943 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
944 // Emit Optimized code for .* operations.
945 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
946 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
947 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
948 loopOpI |= 1;
949 }
950 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
951 loopOpI |= 2;
952 }
953 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
954 dataLoc = allocateStackData(1);
955 appendOp(URX_LOOP_C, dataLoc);
956 break;
957 }
958 }
959
960 // Emit general case code for this *
961 // The optimizations did not apply.
962
963 int32_t saveStateLoc = blockTopLoc(TRUE);
964 int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+1);
965
966 // Check for minimum match length of zero, which requires
967 // extra loop-breaking code.
968 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
969 insertOp(saveStateLoc);
970 dataLoc = allocateStackData(1);
971
972 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
973 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
974 jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
975 }
976
977 // Locate the position in the compiled pattern where the match will continue
978 // after completing the *. (4 or 5 in the comment above)
979 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
980
981 // Put together the save state op and store it into the compiled code.
982 int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
983 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
984
985 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
986 appendOp(jmpOp);
987 }
988 break;
989
990 case doNGStar:
991 // Non-greedy *? quantifier
992 // compiles to
993 // 1. JMP 3
994 // 2. body of stuff being iterated over
995 // 3. STATE_SAVE 2
996 // 4 ...
997 {
998 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
999 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
1000 int32_t jmpOp = buildOp(URX_JMP, saveLoc);
1001 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1002 appendOp(URX_STATE_SAVE, jmpLoc+1);
1003 }
1004 break;
1005
1006
1007 case doIntervalInit:
1008 // The '{' opening an interval quantifier was just scanned.
1009 // Init the counter varaiables that will accumulate the values as the digits
1010 // are scanned.
1011 fIntervalLow = 0;
1012 fIntervalUpper = -1;
1013 break;
1014
1015 case doIntevalLowerDigit:
1016 // Scanned a digit from the lower value of an {lower,upper} interval
1017 {
1018 int32_t digitValue = u_charDigitValue(fC.fChar);
1019 U_ASSERT(digitValue >= 0);
1020 int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1021 if (val > INT32_MAX) {
1022 error(U_REGEX_NUMBER_TOO_BIG);
1023 } else {
1024 fIntervalLow = (int32_t)val;
1025 }
1026 }
1027 break;
1028
1029 case doIntervalUpperDigit:
1030 // Scanned a digit from the upper value of an {lower,upper} interval
1031 {
1032 if (fIntervalUpper < 0) {
1033 fIntervalUpper = 0;
1034 }
1035 int32_t digitValue = u_charDigitValue(fC.fChar);
1036 U_ASSERT(digitValue >= 0);
1037 int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1038 if (val > INT32_MAX) {
1039 error(U_REGEX_NUMBER_TOO_BIG);
1040 } else {
1041 fIntervalUpper = (int32_t)val;
1042 }
1043 }
1044 break;
1045
1046 case doIntervalSame:
1047 // Scanned a single value interval like {27}. Upper = Lower.
1048 fIntervalUpper = fIntervalLow;
1049 break;
1050
1051 case doInterval:
1052 // Finished scanning a normal {lower,upper} interval. Generate the code for it.
1053 if (compileInlineInterval() == FALSE) {
1054 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1055 }
1056 break;
1057
1058 case doPossessiveInterval:
1059 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
1060 {
1061 // Remember the loc for the top of the block being looped over.
1062 // (Can not reserve a slot in the compiled pattern at this time, because
1063 // compileInterval needs to reserve also, and blockTopLoc can only reserve
1064 // once per block.)
1065 int32_t topLoc = blockTopLoc(FALSE);
1066
1067 // Produce normal looping code.
1068 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1069
1070 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1071 // just as if the loop was inclosed in atomic parentheses.
1072
1073 // First the STO_SP before the start of the loop
1074 insertOp(topLoc);
1075
1076 int32_t varLoc = allocateData(1); // Reserve a data location for saving the
1077 int32_t op = buildOp(URX_STO_SP, varLoc);
1078 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1079
1080 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1081 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1082 loopOp++; // point LoopOp after the just-inserted STO_SP
1083 fRXPat->fCompiledPat->push(loopOp, *fStatus);
1084
1085 // Then the LD_SP after the end of the loop
1086 appendOp(URX_LD_SP, varLoc);
1087 }
1088
1089 break;
1090
1091 case doNGInterval:
1092 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1093 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1094 break;
1095
1096 case doIntervalError:
1097 error(U_REGEX_BAD_INTERVAL);
1098 break;
1099
1100 case doLiteralChar:
1101 // We've just scanned a "normal" character from the pattern,
1102 literalChar(fC.fChar);
1103 break;
1104
1105
1106 case doEscapedLiteralChar:
1107 // We've just scanned an backslashed escaped character with no
1108 // special meaning. It represents itself.
1109 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1110 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
1111 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1112 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1113 }
1114 literalChar(fC.fChar);
1115 break;
1116
1117
1118 case doDotAny:
1119 // scanned a ".", match any single character.
1120 {
1121 fixLiterals(FALSE);
1122 if (fModeFlags & UREGEX_DOTALL) {
1123 appendOp(URX_DOTANY_ALL, 0);
1124 } else if (fModeFlags & UREGEX_UNIX_LINES) {
1125 appendOp(URX_DOTANY_UNIX, 0);
1126 } else {
1127 appendOp(URX_DOTANY, 0);
1128 }
1129 }
1130 break;
1131
1132 case doCaret:
1133 {
1134 fixLiterals(FALSE);
1135 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1136 appendOp(URX_CARET, 0);
1137 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1138 appendOp(URX_CARET_M, 0);
1139 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1140 appendOp(URX_CARET, 0); // Only testing true start of input.
1141 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1142 appendOp(URX_CARET_M_UNIX, 0);
1143 }
1144 }
1145 break;
1146
1147 case doDollar:
1148 {
1149 fixLiterals(FALSE);
1150 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1151 appendOp(URX_DOLLAR, 0);
1152 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1153 appendOp(URX_DOLLAR_M, 0);
1154 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1155 appendOp(URX_DOLLAR_D, 0);
1156 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1157 appendOp(URX_DOLLAR_MD, 0);
1158 }
1159 }
1160 break;
1161
1162 case doBackslashA:
1163 fixLiterals(FALSE);
1164 appendOp(URX_CARET, 0);
1165 break;
1166
1167 case doBackslashB:
1168 {
1169 #if UCONFIG_NO_BREAK_ITERATION==1
1170 if (fModeFlags & UREGEX_UWORD) {
1171 error(U_UNSUPPORTED_ERROR);
1172 }
1173 #endif
1174 fixLiterals(FALSE);
1175 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1176 appendOp(op, 1);
1177 }
1178 break;
1179
1180 case doBackslashb:
1181 {
1182 #if UCONFIG_NO_BREAK_ITERATION==1
1183 if (fModeFlags & UREGEX_UWORD) {
1184 error(U_UNSUPPORTED_ERROR);
1185 }
1186 #endif
1187 fixLiterals(FALSE);
1188 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1189 appendOp(op, 0);
1190 }
1191 break;
1192
1193 case doBackslashD:
1194 fixLiterals(FALSE);
1195 appendOp(URX_BACKSLASH_D, 1);
1196 break;
1197
1198 case doBackslashd:
1199 fixLiterals(FALSE);
1200 appendOp(URX_BACKSLASH_D, 0);
1201 break;
1202
1203 case doBackslashG:
1204 fixLiterals(FALSE);
1205 appendOp(URX_BACKSLASH_G, 0);
1206 break;
1207
1208 case doBackslashH:
1209 fixLiterals(FALSE);
1210 appendOp(URX_BACKSLASH_H, 1);
1211 break;
1212
1213 case doBackslashh:
1214 fixLiterals(FALSE);
1215 appendOp(URX_BACKSLASH_H, 0);
1216 break;
1217
1218 case doBackslashR:
1219 fixLiterals(FALSE);
1220 appendOp(URX_BACKSLASH_R, 0);
1221 break;
1222
1223 case doBackslashS:
1224 fixLiterals(FALSE);
1225 appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1226 break;
1227
1228 case doBackslashs:
1229 fixLiterals(FALSE);
1230 appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1231 break;
1232
1233 case doBackslashV:
1234 fixLiterals(FALSE);
1235 appendOp(URX_BACKSLASH_V, 1);
1236 break;
1237
1238 case doBackslashv:
1239 fixLiterals(FALSE);
1240 appendOp(URX_BACKSLASH_V, 0);
1241 break;
1242
1243 case doBackslashW:
1244 fixLiterals(FALSE);
1245 appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1246 break;
1247
1248 case doBackslashw:
1249 fixLiterals(FALSE);
1250 appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1251 break;
1252
1253 case doBackslashX:
1254 fixLiterals(FALSE);
1255 appendOp(URX_BACKSLASH_X, 0);
1256 break;
1257
1258
1259 case doBackslashZ:
1260 fixLiterals(FALSE);
1261 appendOp(URX_DOLLAR, 0);
1262 break;
1263
1264 case doBackslashz:
1265 fixLiterals(FALSE);
1266 appendOp(URX_BACKSLASH_Z, 0);
1267 break;
1268
1269 case doEscapeError:
1270 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1271 break;
1272
1273 case doExit:
1274 fixLiterals(FALSE);
1275 returnVal = FALSE;
1276 break;
1277
1278 case doProperty:
1279 {
1280 fixLiterals(FALSE);
1281 UnicodeSet *theSet = scanProp();
1282 compileSet(theSet);
1283 }
1284 break;
1285
1286 case doNamedChar:
1287 {
1288 UChar32 c = scanNamedChar();
1289 literalChar(c);
1290 }
1291 break;
1292
1293
1294 case doBackRef:
1295 // BackReference. Somewhat unusual in that the front-end can not completely parse
1296 // the regular expression, because the number of digits to be consumed
1297 // depends on the number of capture groups that have been defined. So
1298 // we have to do it here instead.
1299 {
1300 int32_t numCaptureGroups = fRXPat->fGroupMap->size();
1301 int32_t groupNum = 0;
1302 UChar32 c = fC.fChar;
1303
1304 for (;;) {
1305 // Loop once per digit, for max allowed number of digits in a back reference.
1306 int32_t digit = u_charDigitValue(c);
1307 groupNum = groupNum * 10 + digit;
1308 if (groupNum >= numCaptureGroups) {
1309 break;
1310 }
1311 c = peekCharLL();
1312 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1313 break;
1314 }
1315 nextCharLL();
1316 }
1317
1318 // Scan of the back reference in the source regexp is complete. Now generate
1319 // the compiled code for it.
1320 // Because capture groups can be forward-referenced by back-references,
1321 // we fill the operand with the capture group number. At the end
1322 // of compilation, it will be changed to the variable's location.
1323 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal escape sequence,
1324 // and shouldn't enter this code path at all.
1325 fixLiterals(FALSE);
1326 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1327 appendOp(URX_BACKREF_I, groupNum);
1328 } else {
1329 appendOp(URX_BACKREF, groupNum);
1330 }
1331 }
1332 break;
1333
1334 case doBeginNamedBackRef:
1335 U_ASSERT(fCaptureName == NULL);
1336 fCaptureName = new UnicodeString;
1337 if (fCaptureName == NULL) {
1338 error(U_MEMORY_ALLOCATION_ERROR);
1339 }
1340 break;
1341
1342 case doContinueNamedBackRef:
1343 fCaptureName->append(fC.fChar);
1344 break;
1345
1346 case doCompleteNamedBackRef:
1347 {
1348 int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
1349 if (groupNumber == 0) {
1350 // Group name has not been defined.
1351 // Could be a forward reference. If we choose to support them at some
1352 // future time, extra mechanism will be required at this point.
1353 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1354 } else {
1355 // Given the number, handle identically to a \n numbered back reference.
1356 // See comments above, under doBackRef
1357 fixLiterals(FALSE);
1358 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1359 appendOp(URX_BACKREF_I, groupNumber);
1360 } else {
1361 appendOp(URX_BACKREF, groupNumber);
1362 }
1363 }
1364 delete fCaptureName;
1365 fCaptureName = NULL;
1366 break;
1367 }
1368
1369 case doPossessivePlus:
1370 // Possessive ++ quantifier.
1371 // Compiles to
1372 // 1. STO_SP
1373 // 2. body of stuff being iterated over
1374 // 3. STATE_SAVE 5
1375 // 4. JMP 2
1376 // 5. LD_SP
1377 // 6. ...
1378 //
1379 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
1380 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056
1381 //
1382 {
1383 // Emit the STO_SP
1384 int32_t topLoc = blockTopLoc(TRUE);
1385 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1386 int32_t op = buildOp(URX_STO_SP, stoLoc);
1387 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1388
1389 // Emit the STATE_SAVE
1390 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1391
1392 // Emit the JMP
1393 appendOp(URX_JMP, topLoc+1);
1394
1395 // Emit the LD_SP
1396 appendOp(URX_LD_SP, stoLoc);
1397 }
1398 break;
1399
1400 case doPossessiveStar:
1401 // Possessive *+ quantifier.
1402 // Compiles to
1403 // 1. STO_SP loc
1404 // 2. STATE_SAVE 5
1405 // 3. body of stuff being iterated over
1406 // 4. JMP 2
1407 // 5. LD_SP loc
1408 // 6 ...
1409 // TODO: do something to cut back the state stack each time through the loop.
1410 {
1411 // Reserve two slots at the top of the block.
1412 int32_t topLoc = blockTopLoc(TRUE);
1413 insertOp(topLoc);
1414
1415 // emit STO_SP loc
1416 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1417 int32_t op = buildOp(URX_STO_SP, stoLoc);
1418 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1419
1420 // Emit the SAVE_STATE 5
1421 int32_t L7 = fRXPat->fCompiledPat->size()+1;
1422 op = buildOp(URX_STATE_SAVE, L7);
1423 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1424
1425 // Append the JMP operation.
1426 appendOp(URX_JMP, topLoc+1);
1427
1428 // Emit the LD_SP loc
1429 appendOp(URX_LD_SP, stoLoc);
1430 }
1431 break;
1432
1433 case doPossessiveOpt:
1434 // Possessive ?+ quantifier.
1435 // Compiles to
1436 // 1. STO_SP loc
1437 // 2. SAVE_STATE 5
1438 // 3. body of optional block
1439 // 4. LD_SP loc
1440 // 5. ...
1441 //
1442 {
1443 // Reserve two slots at the top of the block.
1444 int32_t topLoc = blockTopLoc(TRUE);
1445 insertOp(topLoc);
1446
1447 // Emit the STO_SP
1448 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1449 int32_t op = buildOp(URX_STO_SP, stoLoc);
1450 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1451
1452 // Emit the SAVE_STATE
1453 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
1454 op = buildOp(URX_STATE_SAVE, continueLoc);
1455 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1456
1457 // Emit the LD_SP
1458 appendOp(URX_LD_SP, stoLoc);
1459 }
1460 break;
1461
1462
1463 case doBeginMatchMode:
1464 fNewModeFlags = fModeFlags;
1465 fSetModeFlag = TRUE;
1466 break;
1467
1468 case doMatchMode: // (?i) and similar
1469 {
1470 int32_t bit = 0;
1471 switch (fC.fChar) {
1472 case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break;
1473 case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break;
1474 case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break;
1475 case 0x73: /* 's' */ bit = UREGEX_DOTALL; break;
1476 case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break;
1477 case 0x77: /* 'w' */ bit = UREGEX_UWORD; break;
1478 case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break;
1479 case 0x2d: /* '-' */ fSetModeFlag = FALSE; break;
1480 default:
1481 UPRV_UNREACHABLE; // Should never happen. Other chars are filtered out
1482 // by the scanner.
1483 }
1484 if (fSetModeFlag) {
1485 fNewModeFlags |= bit;
1486 } else {
1487 fNewModeFlags &= ~bit;
1488 }
1489 }
1490 break;
1491
1492 case doSetMatchMode:
1493 // Emit code to match any pending literals, using the not-yet changed match mode.
1494 fixLiterals();
1495
1496 // We've got a (?i) or similar. The match mode is being changed, but
1497 // the change is not scoped to a parenthesized block.
1498 U_ASSERT(fNewModeFlags < 0);
1499 fModeFlags = fNewModeFlags;
1500
1501 break;
1502
1503
1504 case doMatchModeParen:
1505 // We've got a (?i: or similar. Begin a parenthesized block, save old
1506 // mode flags so they can be restored at the close of the block.
1507 //
1508 // Compile to a
1509 // - NOP, which later may be replaced by a save-state if the
1510 // parenthesized group gets a * quantifier, followed by
1511 // - NOP, which may later be replaced by a save-state if there
1512 // is an '|' alternation within the parens.
1513 {
1514 fixLiterals(FALSE);
1515 appendOp(URX_NOP, 0);
1516 appendOp(URX_NOP, 0);
1517
1518 // On the Parentheses stack, start a new frame and add the postions
1519 // of the two NOPs (a normal non-capturing () frame, except for the
1520 // saving of the orignal mode flags.)
1521 fParenStack.push(fModeFlags, *fStatus);
1522 fParenStack.push(flags, *fStatus); // Frame Marker
1523 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
1524 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
1525
1526 // Set the current mode flags to the new values.
1527 U_ASSERT(fNewModeFlags < 0);
1528 fModeFlags = fNewModeFlags;
1529 }
1530 break;
1531
1532 case doBadModeFlag:
1533 error(U_REGEX_INVALID_FLAG);
1534 break;
1535
1536 case doSuppressComments:
1537 // We have just scanned a '(?'. We now need to prevent the character scanner from
1538 // treating a '#' as a to-the-end-of-line comment.
1539 // (This Perl compatibility just gets uglier and uglier to do...)
1540 fEOLComments = FALSE;
1541 break;
1542
1543
1544 case doSetAddAmp:
1545 {
1546 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1547 set->add(chAmp);
1548 }
1549 break;
1550
1551 case doSetAddDash:
1552 {
1553 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1554 set->add(chDash);
1555 }
1556 break;
1557
1558 case doSetBackslash_s:
1559 {
1560 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1561 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1562 break;
1563 }
1564
1565 case doSetBackslash_S:
1566 {
1567 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1568 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1569 SSet.complement();
1570 set->addAll(SSet);
1571 break;
1572 }
1573
1574 case doSetBackslash_d:
1575 {
1576 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1577 // TODO - make a static set, ticket 6058.
1578 addCategory(set, U_GC_ND_MASK, *fStatus);
1579 break;
1580 }
1581
1582 case doSetBackslash_D:
1583 {
1584 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1585 UnicodeSet digits;
1586 // TODO - make a static set, ticket 6058.
1587 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1588 digits.complement();
1589 set->addAll(digits);
1590 break;
1591 }
1592
1593 case doSetBackslash_h:
1594 {
1595 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1596 UnicodeSet h;
1597 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1598 h.add((UChar32)9); // Tab
1599 set->addAll(h);
1600 break;
1601 }
1602
1603 case doSetBackslash_H:
1604 {
1605 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1606 UnicodeSet h;
1607 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1608 h.add((UChar32)9); // Tab
1609 h.complement();
1610 set->addAll(h);
1611 break;
1612 }
1613
1614 case doSetBackslash_v:
1615 {
1616 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1617 set->add((UChar32)0x0a, (UChar32)0x0d); // add range
1618 set->add((UChar32)0x85);
1619 set->add((UChar32)0x2028, (UChar32)0x2029);
1620 break;
1621 }
1622
1623 case doSetBackslash_V:
1624 {
1625 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1626 UnicodeSet v;
1627 v.add((UChar32)0x0a, (UChar32)0x0d); // add range
1628 v.add((UChar32)0x85);
1629 v.add((UChar32)0x2028, (UChar32)0x2029);
1630 v.complement();
1631 set->addAll(v);
1632 break;
1633 }
1634
1635 case doSetBackslash_w:
1636 {
1637 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1638 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1639 break;
1640 }
1641
1642 case doSetBackslash_W:
1643 {
1644 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1645 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1646 SSet.complement();
1647 set->addAll(SSet);
1648 break;
1649 }
1650
1651 case doSetBegin:
1652 fixLiterals(FALSE);
1653 fSetStack.push(new UnicodeSet(), *fStatus);
1654 fSetOpStack.push(setStart, *fStatus);
1655 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1656 fSetOpStack.push(setCaseClose, *fStatus);
1657 }
1658 break;
1659
1660 case doSetBeginDifference1:
1661 // We have scanned something like [[abc]-[
1662 // Set up a new UnicodeSet for the set beginning with the just-scanned '['
1663 // Push a Difference operator, which will cause the new set to be subtracted from what
1664 // went before once it is created.
1665 setPushOp(setDifference1);
1666 fSetOpStack.push(setStart, *fStatus);
1667 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1668 fSetOpStack.push(setCaseClose, *fStatus);
1669 }
1670 break;
1671
1672 case doSetBeginIntersection1:
1673 // We have scanned something like [[abc]&[
1674 // Need both the '&' operator and the open '[' operator.
1675 setPushOp(setIntersection1);
1676 fSetOpStack.push(setStart, *fStatus);
1677 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1678 fSetOpStack.push(setCaseClose, *fStatus);
1679 }
1680 break;
1681
1682 case doSetBeginUnion:
1683 // We have scanned something like [[abc][
1684 // Need to handle the union operation explicitly [[abc] | [
1685 setPushOp(setUnion);
1686 fSetOpStack.push(setStart, *fStatus);
1687 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1688 fSetOpStack.push(setCaseClose, *fStatus);
1689 }
1690 break;
1691
1692 case doSetDifference2:
1693 // We have scanned something like [abc--
1694 // Consider this to unambiguously be a set difference operator.
1695 setPushOp(setDifference2);
1696 break;
1697
1698 case doSetEnd:
1699 // Have encountered the ']' that closes a set.
1700 // Force the evaluation of any pending operations within this set,
1701 // leave the completed set on the top of the set stack.
1702 setEval(setEnd);
1703 U_ASSERT(fSetOpStack.peeki()==setStart);
1704 fSetOpStack.popi();
1705 break;
1706
1707 case doSetFinish:
1708 {
1709 // Finished a complete set expression, including all nested sets.
1710 // The close bracket has already triggered clearing out pending set operators,
1711 // the operator stack should be empty and the operand stack should have just
1712 // one entry, the result set.
1713 U_ASSERT(fSetOpStack.empty());
1714 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1715 U_ASSERT(fSetStack.empty());
1716 compileSet(theSet);
1717 break;
1718 }
1719
1720 case doSetIntersection2:
1721 // Have scanned something like [abc&&
1722 setPushOp(setIntersection2);
1723 break;
1724
1725 case doSetLiteral:
1726 // Union the just-scanned literal character into the set being built.
1727 // This operation is the highest precedence set operation, so we can always do
1728 // it immediately, without waiting to see what follows. It is necessary to perform
1729 // any pending '-' or '&' operation first, because these have the same precedence
1730 // as union-ing in a literal'
1731 {
1732 setEval(setUnion);
1733 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1734 s->add(fC.fChar);
1735 fLastSetLiteral = fC.fChar;
1736 break;
1737 }
1738
1739 case doSetLiteralEscaped:
1740 // A back-slash escaped literal character was encountered.
1741 // Processing is the same as with setLiteral, above, with the addition of
1742 // the optional check for errors on escaped ASCII letters.
1743 {
1744 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1745 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
1746 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1747 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1748 }
1749 setEval(setUnion);
1750 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1751 s->add(fC.fChar);
1752 fLastSetLiteral = fC.fChar;
1753 break;
1754 }
1755
1756 case doSetNamedChar:
1757 // Scanning a \N{UNICODE CHARACTER NAME}
1758 // Aside from the source of the character, the processing is identical to doSetLiteral,
1759 // above.
1760 {
1761 UChar32 c = scanNamedChar();
1762 setEval(setUnion);
1763 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1764 s->add(c);
1765 fLastSetLiteral = c;
1766 break;
1767 }
1768
1769 case doSetNamedRange:
1770 // We have scanned literal-\N{CHAR NAME}. Add the range to the set.
1771 // The left character is already in the set, and is saved in fLastSetLiteral.
1772 // The right side needs to be picked up, the scan is at the 'N'.
1773 // Lower Limit > Upper limit being an error matches both Java
1774 // and ICU UnicodeSet behavior.
1775 {
1776 UChar32 c = scanNamedChar();
1777 if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
1778 error(U_REGEX_INVALID_RANGE);
1779 }
1780 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1781 s->add(fLastSetLiteral, c);
1782 fLastSetLiteral = c;
1783 break;
1784 }
1785
1786
1787 case doSetNegate:
1788 // Scanned a '^' at the start of a set.
1789 // Push the negation operator onto the set op stack.
1790 // A twist for case-insensitive matching:
1791 // the case closure operation must happen _before_ negation.
1792 // But the case closure operation will already be on the stack if it's required.
1793 // This requires checking for case closure, and swapping the stack order
1794 // if it is present.
1795 {
1796 int32_t tosOp = fSetOpStack.peeki();
1797 if (tosOp == setCaseClose) {
1798 fSetOpStack.popi();
1799 fSetOpStack.push(setNegation, *fStatus);
1800 fSetOpStack.push(setCaseClose, *fStatus);
1801 } else {
1802 fSetOpStack.push(setNegation, *fStatus);
1803 }
1804 }
1805 break;
1806
1807 case doSetNoCloseError:
1808 error(U_REGEX_MISSING_CLOSE_BRACKET);
1809 break;
1810
1811 case doSetOpError:
1812 error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal.
1813 break;
1814
1815 case doSetPosixProp:
1816 {
1817 UnicodeSet *s = scanPosixProp();
1818 if (s != NULL) {
1819 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1820 tos->addAll(*s);
1821 delete s;
1822 } // else error. scanProp() reported the error status already.
1823 }
1824 break;
1825
1826 case doSetProp:
1827 // Scanned a \p \P within [brackets].
1828 {
1829 UnicodeSet *s = scanProp();
1830 if (s != NULL) {
1831 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1832 tos->addAll(*s);
1833 delete s;
1834 } // else error. scanProp() reported the error status already.
1835 }
1836 break;
1837
1838
1839 case doSetRange:
1840 // We have scanned literal-literal. Add the range to the set.
1841 // The left character is already in the set, and is saved in fLastSetLiteral.
1842 // The right side is the current character.
1843 // Lower Limit > Upper limit being an error matches both Java
1844 // and ICU UnicodeSet behavior.
1845 {
1846
1847 if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
1848 error(U_REGEX_INVALID_RANGE);
1849 }
1850 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1851 s->add(fLastSetLiteral, fC.fChar);
1852 break;
1853 }
1854
1855 default:
1856 UPRV_UNREACHABLE;
1857 }
1858
1859 if (U_FAILURE(*fStatus)) {
1860 returnVal = FALSE;
1861 }
1862
1863 return returnVal;
1864}
1865
1866
1867
1868//------------------------------------------------------------------------------
1869//
1870// literalChar We've encountered a literal character from the pattern,
1871// or an escape sequence that reduces to a character.
1872// Add it to the string containing all literal chars/strings from
1873// the pattern.
1874//
1875//------------------------------------------------------------------------------
1876void RegexCompile::literalChar(UChar32 c) {
1877 fLiteralChars.append(c);
1878}
1879
1880
1881//------------------------------------------------------------------------------
1882//
1883// fixLiterals When compiling something that can follow a literal
1884// string in a pattern, emit the code to match the
1885// accumulated literal string.
1886//
1887// Optionally, split the last char of the string off into
1888// a single "ONE_CHAR" operation, so that quantifiers can
1889// apply to that char alone. Example: abc*
1890// The * must apply to the 'c' only.
1891//
1892//------------------------------------------------------------------------------
1893void RegexCompile::fixLiterals(UBool split) {
1894
1895 // If no literal characters have been scanned but not yet had code generated
1896 // for them, nothing needs to be done.
1897 if (fLiteralChars.length() == 0) {
1898 return;
1899 }
1900
1901 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1902 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1903
1904 // Split: We need to ensure that the last item in the compiled pattern
1905 // refers only to the last literal scanned in the pattern, so that
1906 // quantifiers (*, +, etc.) affect only it, and not a longer string.
1907 // Split before case folding for case insensitive matches.
1908
1909 if (split) {
1910 fLiteralChars.truncate(indexOfLastCodePoint);
1911 fixLiterals(FALSE); // Recursive call, emit code to match the first part of the string.
1912 // Note that the truncated literal string may be empty, in which case
1913 // nothing will be emitted.
1914
1915 literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal.
1916 fixLiterals(FALSE); // Second recursive call, code for the final code point.
1917 return;
1918 }
1919
1920 // If we are doing case-insensitive matching, case fold the string. This may expand
1921 // the string, e.g. the German sharp-s turns into "ss"
1922 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1923 fLiteralChars.foldCase();
1924 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1925 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1926 }
1927
1928 if (indexOfLastCodePoint == 0) {
1929 // Single character, emit a URX_ONECHAR op to match it.
1930 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1931 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1932 appendOp(URX_ONECHAR_I, lastCodePoint);
1933 } else {
1934 appendOp(URX_ONECHAR, lastCodePoint);
1935 }
1936 } else {
1937 // Two or more chars, emit a URX_STRING to match them.
1938 if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1939 error(U_REGEX_PATTERN_TOO_BIG);
1940 }
1941 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1942 appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1943 } else {
1944 // TODO here: add optimization to split case sensitive strings of length two
1945 // into two single char ops, for efficiency.
1946 appendOp(URX_STRING, fRXPat->fLiteralText.length());
1947 }
1948 appendOp(URX_STRING_LEN, fLiteralChars.length());
1949
1950 // Add this string into the accumulated strings of the compiled pattern.
1951 fRXPat->fLiteralText.append(fLiteralChars);
1952 }
1953
1954 fLiteralChars.remove();
1955}
1956
1957
1958int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1959 if (U_FAILURE(*fStatus)) {
1960 return 0;
1961 }
1962 if (type < 0 || type > 255) {
1963 UPRV_UNREACHABLE;
1964 }
1965 if (val > 0x00ffffff) {
1966 UPRV_UNREACHABLE;
1967 }
1968 if (val < 0) {
1969 if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1970 UPRV_UNREACHABLE;
1971 }
1972 if (URX_TYPE(val) != 0xff) {
1973 UPRV_UNREACHABLE;
1974 }
1975 type = URX_RESERVED_OP_N;
1976 }
1977 return (type << 24) | val;
1978}
1979
1980
1981//------------------------------------------------------------------------------
1982//
1983// appendOp() Append a new instruction onto the compiled pattern
1984// Includes error checking, limiting the size of the
1985// pattern to lengths that can be represented in the
1986// 24 bit operand field of an instruction.
1987//
1988//------------------------------------------------------------------------------
1989void RegexCompile::appendOp(int32_t op) {
1990 if (U_FAILURE(*fStatus)) {
1991 return;
1992 }
1993 fRXPat->fCompiledPat->addElement(op, *fStatus);
1994 if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
1995 error(U_REGEX_PATTERN_TOO_BIG);
1996 }
1997}
1998
1999void RegexCompile::appendOp(int32_t type, int32_t val) {
2000 appendOp(buildOp(type, val));
2001}
2002
2003
2004//------------------------------------------------------------------------------
2005//
2006// insertOp() Insert a slot for a new opcode into the already
2007// compiled pattern code.
2008//
2009// Fill the slot with a NOP. Our caller will replace it
2010// with what they really wanted.
2011//
2012//------------------------------------------------------------------------------
2013void RegexCompile::insertOp(int32_t where) {
2014 UVector64 *code = fRXPat->fCompiledPat;
2015 U_ASSERT(where>0 && where < code->size());
2016
2017 int32_t nop = buildOp(URX_NOP, 0);
2018 code->insertElementAt(nop, where, *fStatus);
2019
2020 // Walk through the pattern, looking for any ops with targets that
2021 // were moved down by the insert. Fix them.
2022 int32_t loc;
2023 for (loc=0; loc<code->size(); loc++) {
2024 int32_t op = (int32_t)code->elementAti(loc);
2025 int32_t opType = URX_TYPE(op);
2026 int32_t opValue = URX_VAL(op);
2027 if ((opType == URX_JMP ||
2028 opType == URX_JMPX ||
2029 opType == URX_STATE_SAVE ||
2030 opType == URX_CTR_LOOP ||
2031 opType == URX_CTR_LOOP_NG ||
2032 opType == URX_JMP_SAV ||
2033 opType == URX_JMP_SAV_X ||
2034 opType == URX_RELOC_OPRND) && opValue > where) {
2035 // Target location for this opcode is after the insertion point and
2036 // needs to be incremented to adjust for the insertion.
2037 opValue++;
2038 op = buildOp(opType, opValue);
2039 code->setElementAt(op, loc);
2040 }
2041 }
2042
2043 // Now fix up the parentheses stack. All positive values in it are locations in
2044 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
2045 for (loc=0; loc<fParenStack.size(); loc++) {
2046 int32_t x = fParenStack.elementAti(loc);
2047 U_ASSERT(x < code->size());
2048 if (x>where) {
2049 x++;
2050 fParenStack.setElementAt(x, loc);
2051 }
2052 }
2053
2054 if (fMatchCloseParen > where) {
2055 fMatchCloseParen++;
2056 }
2057 if (fMatchOpenParen > where) {
2058 fMatchOpenParen++;
2059 }
2060}
2061
2062
2063//------------------------------------------------------------------------------
2064//
2065// allocateData() Allocate storage in the matcher's static data area.
2066// Return the index for the newly allocated data.
2067// The storage won't actually exist until we are running a match
2068// operation, but the storage indexes are inserted into various
2069// opcodes while compiling the pattern.
2070//
2071//------------------------------------------------------------------------------
2072int32_t RegexCompile::allocateData(int32_t size) {
2073 if (U_FAILURE(*fStatus)) {
2074 return 0;
2075 }
2076 if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2077 error(U_REGEX_INTERNAL_ERROR);
2078 return 0;
2079 }
2080 int32_t dataIndex = fRXPat->fDataSize;
2081 fRXPat->fDataSize += size;
2082 if (fRXPat->fDataSize >= 0x00fffff0) {
2083 error(U_REGEX_INTERNAL_ERROR);
2084 }
2085 return dataIndex;
2086}
2087
2088
2089//------------------------------------------------------------------------------
2090//
2091// allocateStackData() Allocate space in the back-tracking stack frame.
2092// Return the index for the newly allocated data.
2093// The frame indexes are inserted into various
2094// opcodes while compiling the pattern, meaning that frame
2095// size must be restricted to the size that will fit
2096// as an operand (24 bits).
2097//
2098//------------------------------------------------------------------------------
2099int32_t RegexCompile::allocateStackData(int32_t size) {
2100 if (U_FAILURE(*fStatus)) {
2101 return 0;
2102 }
2103 if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2104 error(U_REGEX_INTERNAL_ERROR);
2105 return 0;
2106 }
2107 int32_t dataIndex = fRXPat->fFrameSize;
2108 fRXPat->fFrameSize += size;
2109 if (fRXPat->fFrameSize >= 0x00fffff0) {
2110 error(U_REGEX_PATTERN_TOO_BIG);
2111 }
2112 return dataIndex;
2113}
2114
2115
2116//------------------------------------------------------------------------------
2117//
2118// blockTopLoc() Find or create a location in the compiled pattern
2119// at the start of the operation or block that has
2120// just been compiled. Needed when a quantifier (* or
2121// whatever) appears, and we need to add an operation
2122// at the start of the thing being quantified.
2123//
2124// (Parenthesized Blocks) have a slot with a NOP that
2125// is reserved for this purpose. .* or similar don't
2126// and a slot needs to be added.
2127//
2128// parameter reserveLoc : TRUE - ensure that there is space to add an opcode
2129// at the returned location.
2130// FALSE - just return the address,
2131// do not reserve a location there.
2132//
2133//------------------------------------------------------------------------------
2134int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
2135 int32_t theLoc;
2136 fixLiterals(TRUE); // Emit code for any pending literals.
2137 // If last item was a string, emit separate op for the its last char.
2138 if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2139 {
2140 // The item just processed is a parenthesized block.
2141 theLoc = fMatchOpenParen; // A slot is already reserved for us.
2142 U_ASSERT(theLoc > 0);
2143 U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2144 }
2145 else {
2146 // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2147 // No slot for STATE_SAVE was pre-reserved in the compiled code.
2148 // We need to make space now.
2149 theLoc = fRXPat->fCompiledPat->size()-1;
2150 int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2151 if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2152 // Strings take two opcode, we want the position of the first one.
2153 // We can have a string at this point if a single character case-folded to two.
2154 theLoc--;
2155 }
2156 if (reserveLoc) {
2157 int32_t nop = buildOp(URX_NOP, 0);
2158 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2159 }
2160 }
2161 return theLoc;
2162}
2163
2164
2165
2166//------------------------------------------------------------------------------
2167//
2168// handleCloseParen When compiling a close paren, we need to go back
2169// and fix up any JMP or SAVE operations within the
2170// parenthesized block that need to target the end
2171// of the block. The locations of these are kept on
2172// the paretheses stack.
2173//
2174// This function is called both when encountering a
2175// real ) and at the end of the pattern.
2176//
2177//------------------------------------------------------------------------------
2178void RegexCompile::handleCloseParen() {
2179 int32_t patIdx;
2180 int32_t patOp;
2181 if (fParenStack.size() <= 0) {
2182 error(U_REGEX_MISMATCHED_PAREN);
2183 return;
2184 }
2185
2186 // Emit code for any pending literals.
2187 fixLiterals(FALSE);
2188
2189 // Fixup any operations within the just-closed parenthesized group
2190 // that need to reference the end of the (block).
2191 // (The first one popped from the stack is an unused slot for
2192 // alternation (OR) state save, but applying the fixup to it does no harm.)
2193 for (;;) {
2194 patIdx = fParenStack.popi();
2195 if (patIdx < 0) {
2196 // value < 0 flags the start of the frame on the paren stack.
2197 break;
2198 }
2199 U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2200 patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2201 U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set.
2202 patOp |= fRXPat->fCompiledPat->size(); // Set it now.
2203 fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2204 fMatchOpenParen = patIdx;
2205 }
2206
2207 // At the close of any parenthesized block, restore the match mode flags to
2208 // the value they had at the open paren. Saved value is
2209 // at the top of the paren stack.
2210 fModeFlags = fParenStack.popi();
2211 U_ASSERT(fModeFlags < 0);
2212
2213 // DO any additional fixups, depending on the specific kind of
2214 // parentesized grouping this is
2215
2216 switch (patIdx) {
2217 case plain:
2218 case flags:
2219 // No additional fixups required.
2220 // (Grouping-only parentheses)
2221 break;
2222 case capturing:
2223 // Capturing Parentheses.
2224 // Insert a End Capture op into the pattern.
2225 // The frame offset of the variables for this cg is obtained from the
2226 // start capture op and put it into the end-capture op.
2227 {
2228 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2229 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2230
2231 int32_t frameVarLocation = URX_VAL(captureOp);
2232 appendOp(URX_END_CAPTURE, frameVarLocation);
2233 }
2234 break;
2235 case atomic:
2236 // Atomic Parenthesis.
2237 // Insert a LD_SP operation to restore the state stack to the position
2238 // it was when the atomic parens were entered.
2239 {
2240 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2241 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2242 int32_t stoLoc = URX_VAL(stoOp);
2243 appendOp(URX_LD_SP, stoLoc);
2244 }
2245 break;
2246
2247 case lookAhead:
2248 {
2249 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2250 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2251 int32_t dataLoc = URX_VAL(startOp);
2252 appendOp(URX_LA_END, dataLoc);
2253 }
2254 break;
2255
2256 case negLookAhead:
2257 {
2258 // See comment at doOpenLookAheadNeg
2259 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2260 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2261 int32_t dataLoc = URX_VAL(startOp);
2262 appendOp(URX_LA_END, dataLoc);
2263 appendOp(URX_BACKTRACK, 0);
2264 appendOp(URX_LA_END, dataLoc);
2265
2266 // Patch the URX_SAVE near the top of the block.
2267 // The destination of the SAVE is the final LA_END that was just added.
2268 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2269 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2270 int32_t dest = fRXPat->fCompiledPat->size()-1;
2271 saveOp = buildOp(URX_STATE_SAVE, dest);
2272 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2273 }
2274 break;
2275
2276 case lookBehind:
2277 {
2278 // See comment at doOpenLookBehind.
2279
2280 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2281 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2282 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2283 int32_t dataLoc = URX_VAL(startOp);
2284 appendOp(URX_LB_END, dataLoc);
2285 appendOp(URX_LA_END, dataLoc);
2286
2287 // Determine the min and max bounds for the length of the
2288 // string that the pattern can match.
2289 // An unbounded upper limit is an error.
2290 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2291 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2292 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2293 if (URX_TYPE(maxML) != 0) {
2294 error(U_REGEX_LOOK_BEHIND_LIMIT);
2295 break;
2296 }
2297 if (maxML == INT32_MAX) {
2298 error(U_REGEX_LOOK_BEHIND_LIMIT);
2299 break;
2300 }
2301 if (minML == INT32_MAX) {
2302 // This condition happens when no match is possible, such as with a
2303 // [set] expression containing no elements.
2304 // In principle, the generated code to evaluate the expression could be deleted,
2305 // but it's probably not worth the complication.
2306 minML = 0;
2307 }
2308 U_ASSERT(minML <= maxML);
2309
2310 // Insert the min and max match len bounds into the URX_LB_CONT op that
2311 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2312 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2);
2313 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1);
2314
2315 }
2316 break;
2317
2318
2319
2320 case lookBehindN:
2321 {
2322 // See comment at doOpenLookBehindNeg.
2323
2324 // Append the URX_LBN_END to the compiled pattern.
2325 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2326 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2327 int32_t dataLoc = URX_VAL(startOp);
2328 appendOp(URX_LBN_END, dataLoc);
2329
2330 // Determine the min and max bounds for the length of the
2331 // string that the pattern can match.
2332 // An unbounded upper limit is an error.
2333 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2334 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2335 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2336 if (URX_TYPE(maxML) != 0) {
2337 error(U_REGEX_LOOK_BEHIND_LIMIT);
2338 break;
2339 }
2340 if (maxML == INT32_MAX) {
2341 error(U_REGEX_LOOK_BEHIND_LIMIT);
2342 break;
2343 }
2344 if (minML == INT32_MAX) {
2345 // This condition happens when no match is possible, such as with a
2346 // [set] expression containing no elements.
2347 // In principle, the generated code to evaluate the expression could be deleted,
2348 // but it's probably not worth the complication.
2349 minML = 0;
2350 }
2351
2352 U_ASSERT(minML <= maxML);
2353
2354 // Insert the min and max match len bounds into the URX_LB_CONT op that
2355 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2356 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3);
2357 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2);
2358
2359 // Insert the pattern location to continue at after a successful match
2360 // as the last operand of the URX_LBN_CONT
2361 int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2362 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1);
2363 }
2364 break;
2365
2366
2367
2368 default:
2369 UPRV_UNREACHABLE;
2370 }
2371
2372 // remember the next location in the compiled pattern.
2373 // The compilation of Quantifiers will look at this to see whether its looping
2374 // over a parenthesized block or a single item
2375 fMatchCloseParen = fRXPat->fCompiledPat->size();
2376}
2377
2378
2379
2380//------------------------------------------------------------------------------
2381//
2382// compileSet Compile the pattern operations for a reference to a
2383// UnicodeSet.
2384//
2385//------------------------------------------------------------------------------
2386void RegexCompile::compileSet(UnicodeSet *theSet)
2387{
2388 if (theSet == NULL) {
2389 return;
2390 }
2391 // Remove any strings from the set.
2392 // There shoudn't be any, but just in case.
2393 // (Case Closure can add them; if we had a simple case closure avaialble that
2394 // ignored strings, that would be better.)
2395 theSet->removeAllStrings();
2396 int32_t setSize = theSet->size();
2397
2398 switch (setSize) {
2399 case 0:
2400 {
2401 // Set of no elements. Always fails to match.
2402 appendOp(URX_BACKTRACK, 0);
2403 delete theSet;
2404 }
2405 break;
2406
2407 case 1:
2408 {
2409 // The set contains only a single code point. Put it into
2410 // the compiled pattern as a single char operation rather
2411 // than a set, and discard the set itself.
2412 literalChar(theSet->charAt(0));
2413 delete theSet;
2414 }
2415 break;
2416
2417 default:
2418 {
2419 // The set contains two or more chars. (the normal case)
2420 // Put it into the compiled pattern as a set.
2421 int32_t setNumber = fRXPat->fSets->size();
2422 fRXPat->fSets->addElement(theSet, *fStatus);
2423 appendOp(URX_SETREF, setNumber);
2424 }
2425 }
2426}
2427
2428
2429//------------------------------------------------------------------------------
2430//
2431// compileInterval Generate the code for a {min, max} style interval quantifier.
2432// Except for the specific opcodes used, the code is the same
2433// for all three types (greedy, non-greedy, possessive) of
2434// intervals. The opcodes are supplied as parameters.
2435// (There are two sets of opcodes - greedy & possessive use the
2436// same ones, while non-greedy has it's own.)
2437//
2438// The code for interval loops has this form:
2439// 0 CTR_INIT counter loc (in stack frame)
2440// 1 5 patt address of CTR_LOOP at bottom of block
2441// 2 min count
2442// 3 max count (-1 for unbounded)
2443// 4 ... block to be iterated over
2444// 5 CTR_LOOP
2445//
2446// In
2447//------------------------------------------------------------------------------
2448void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
2449{
2450 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2451 // four slots in the compiled code. Reserve them.
2452 int32_t topOfBlock = blockTopLoc(TRUE);
2453 insertOp(topOfBlock);
2454 insertOp(topOfBlock);
2455 insertOp(topOfBlock);
2456
2457 // The operands for the CTR_INIT opcode include the index in the matcher data
2458 // of the counter. Allocate it now. There are two data items
2459 // counterLoc --> Loop counter
2460 // +1 --> Input index (for breaking non-progressing loops)
2461 // (Only present if unbounded upper limit on loop)
2462 int32_t dataSize = fIntervalUpper < 0 ? 2 : 1;
2463 int32_t counterLoc = allocateStackData(dataSize);
2464
2465 int32_t op = buildOp(InitOp, counterLoc);
2466 fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2467
2468 // The second operand of CTR_INIT is the location following the end of the loop.
2469 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2470 // compilation of something later on causes the code to grow and the target
2471 // position to move.
2472 int32_t loopEnd = fRXPat->fCompiledPat->size();
2473 op = buildOp(URX_RELOC_OPRND, loopEnd);
2474 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2475
2476 // Followed by the min and max counts.
2477 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2478 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2479
2480 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
2481 // Goes at end of the block being looped over, so just append to the code so far.
2482 appendOp(LoopOp, topOfBlock);
2483
2484 if ((fIntervalLow & 0xff000000) != 0 ||
2485 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2486 error(U_REGEX_NUMBER_TOO_BIG);
2487 }
2488
2489 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2490 error(U_REGEX_MAX_LT_MIN);
2491 }
2492}
2493
2494
2495
2496UBool RegexCompile::compileInlineInterval() {
2497 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2498 // Too big to inline. Fail, which will cause looping code to be generated.
2499 // (Upper < Lower picks up unbounded upper and errors, both.)
2500 return FALSE;
2501 }
2502
2503 int32_t topOfBlock = blockTopLoc(FALSE);
2504 if (fIntervalUpper == 0) {
2505 // Pathological case. Attempt no matches, as if the block doesn't exist.
2506 // Discard the generated code for the block.
2507 // If the block included parens, discard the info pertaining to them as well.
2508 fRXPat->fCompiledPat->setSize(topOfBlock);
2509 if (fMatchOpenParen >= topOfBlock) {
2510 fMatchOpenParen = -1;
2511 }
2512 if (fMatchCloseParen >= topOfBlock) {
2513 fMatchCloseParen = -1;
2514 }
2515 return TRUE;
2516 }
2517
2518 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2519 // The thing being repeated is not a single op, but some
2520 // more complex block. Do it as a loop, not inlines.
2521 // Note that things "repeated" a max of once are handled as inline, because
2522 // the one copy of the code already generated is just fine.
2523 return FALSE;
2524 }
2525
2526 // Pick up the opcode that is to be repeated
2527 //
2528 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2529
2530 // Compute the pattern location where the inline sequence
2531 // will end, and set up the state save op that will be needed.
2532 //
2533 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2534 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2535 int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2536 if (fIntervalLow == 0) {
2537 insertOp(topOfBlock);
2538 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2539 }
2540
2541
2542
2543 // Loop, emitting the op for the thing being repeated each time.
2544 // Loop starts at 1 because one instance of the op already exists in the pattern,
2545 // it was put there when it was originally encountered.
2546 int32_t i;
2547 for (i=1; i<fIntervalUpper; i++ ) {
2548 if (i >= fIntervalLow) {
2549 appendOp(saveOp);
2550 }
2551 appendOp(op);
2552 }
2553 return TRUE;
2554}
2555
2556
2557
2558//------------------------------------------------------------------------------
2559//
2560// caseInsensitiveStart given a single code point from a pattern string, determine the
2561// set of characters that could potentially begin a case-insensitive
2562// match of a string beginning with that character, using full Unicode
2563// case insensitive matching.
2564//
2565// This is used in optimizing find().
2566//
2567// closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2568// misses cases like this:
2569// A string from the pattern begins with 'ss' (although all we know
2570// in this context is that it begins with 's')
2571// The pattern could match a string beginning with a German sharp-s
2572//
2573// To the ordinary case closure for a character c, we add all other
2574// characters cx where the case closure of cx incudes a string form that begins
2575// with the original character c.
2576//
2577// This function could be made smarter. The full pattern string is available
2578// and it would be possible to verify that the extra characters being added
2579// to the starting set fully match, rather than having just a first-char of the
2580// folded form match.
2581//
2582//------------------------------------------------------------------------------
2583void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2584
2585// Machine Generated below.
2586// It may need updating with new versions of Unicode.
2587// Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2588// The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2589
2590// Machine Generated Data. Do not hand edit.
2591 static const UChar32 RECaseFixCodePoints[] = {
2592 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2593 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2594 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2595 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2596 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2597
2598 static const int16_t RECaseFixStringOffsets[] = {
2599 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2600 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2601 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2602 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2603 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2604
2605 static const int16_t RECaseFixCounts[] = {
2606 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2607 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2608 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2609 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2610 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2611
2612 static const UChar RECaseFixData[] = {
2613 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2614 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2615 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2616 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2617 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2618 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2619 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2620 0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2621 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2622 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2623 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2624
2625// End of machine generated data.
2626
2627 if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
2628 // This function should never be called with an invalid input character.
2629 UPRV_UNREACHABLE;
2630 } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2631 UChar32 caseFoldedC = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2632 starterChars->set(caseFoldedC, caseFoldedC);
2633
2634 int32_t i;
2635 for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2636 // Simple linear search through the sorted list of interesting code points.
2637 }
2638
2639 if (RECaseFixCodePoints[i] == c) {
2640 int32_t dataIndex = RECaseFixStringOffsets[i];
2641 int32_t numCharsToAdd = RECaseFixCounts[i];
2642 UChar32 cpToAdd = 0;
2643 for (int32_t j=0; j<numCharsToAdd; j++) {
2644 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2645 starterChars->add(cpToAdd);
2646 }
2647 }
2648
2649 starterChars->closeOver(USET_CASE_INSENSITIVE);
2650 starterChars->removeAllStrings();
2651 } else {
2652 // Not a cased character. Just return it alone.
2653 starterChars->set(c, c);
2654 }
2655}
2656
2657
2658// Increment with overflow check.
2659// val and delta will both be positive.
2660
2661static int32_t safeIncrement(int32_t val, int32_t delta) {
2662 if (INT32_MAX - val > delta) {
2663 return val + delta;
2664 } else {
2665 return INT32_MAX;
2666 }
2667}
2668
2669
2670//------------------------------------------------------------------------------
2671//
2672// matchStartType Determine how a match can start.
2673// Used to optimize find() operations.
2674//
2675// Operation is very similar to minMatchLength(). Walk the compiled
2676// pattern, keeping an on-going minimum-match-length. For any
2677// op where the min match coming in is zero, add that ops possible
2678// starting matches to the possible starts for the overall pattern.
2679//
2680//------------------------------------------------------------------------------
2681void RegexCompile::matchStartType() {
2682 if (U_FAILURE(*fStatus)) {
2683 return;
2684 }
2685
2686
2687 int32_t loc; // Location in the pattern of the current op being processed.
2688 int32_t op; // The op being processed
2689 int32_t opType; // The opcode type of the op
2690 int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern
2691 int32_t numInitialStrings = 0; // Number of strings encountered that could match at start.
2692
2693 UBool atStart = TRUE; // True if no part of the pattern yet encountered
2694 // could have advanced the position in a match.
2695 // (Maximum match length so far == 0)
2696
2697 // forwardedLength is a vector holding minimum-match-length values that
2698 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2699 // It must be one longer than the pattern being checked because some ops
2700 // will jmp to a end-of-block+1 location from within a block, and we must
2701 // count those when checking the block.
2702 int32_t end = fRXPat->fCompiledPat->size();
2703 UVector32 forwardedLength(end+1, *fStatus);
2704 forwardedLength.setSize(end+1);
2705 for (loc=3; loc<end; loc++) {
2706 forwardedLength.setElementAt(INT32_MAX, loc);
2707 }
2708
2709 for (loc = 3; loc<end; loc++) {
2710 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2711 opType = URX_TYPE(op);
2712
2713 // The loop is advancing linearly through the pattern.
2714 // If the op we are now at was the destination of a branch in the pattern,
2715 // and that path has a shorter minimum length than the current accumulated value,
2716 // replace the current accumulated value.
2717 if (forwardedLength.elementAti(loc) < currentLen) {
2718 currentLen = forwardedLength.elementAti(loc);
2719 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2720 }
2721
2722 switch (opType) {
2723 // Ops that don't change the total length matched
2724 case URX_RESERVED_OP:
2725 case URX_END:
2726 case URX_FAIL:
2727 case URX_STRING_LEN:
2728 case URX_NOP:
2729 case URX_START_CAPTURE:
2730 case URX_END_CAPTURE:
2731 case URX_BACKSLASH_B:
2732 case URX_BACKSLASH_BU:
2733 case URX_BACKSLASH_G:
2734 case URX_BACKSLASH_Z:
2735 case URX_DOLLAR:
2736 case URX_DOLLAR_M:
2737 case URX_DOLLAR_D:
2738 case URX_DOLLAR_MD:
2739 case URX_RELOC_OPRND:
2740 case URX_STO_INP_LOC:
2741 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2742 case URX_BACKREF_I:
2743
2744 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2745 case URX_LD_SP:
2746 break;
2747
2748 case URX_CARET:
2749 if (atStart) {
2750 fRXPat->fStartType = START_START;
2751 }
2752 break;
2753
2754 case URX_CARET_M:
2755 case URX_CARET_M_UNIX:
2756 if (atStart) {
2757 fRXPat->fStartType = START_LINE;
2758 }
2759 break;
2760
2761 case URX_ONECHAR:
2762 if (currentLen == 0) {
2763 // This character could appear at the start of a match.
2764 // Add it to the set of possible starting characters.
2765 fRXPat->fInitialChars->add(URX_VAL(op));
2766 numInitialStrings += 2;
2767 }
2768 currentLen = safeIncrement(currentLen, 1);
2769 atStart = FALSE;
2770 break;
2771
2772
2773 case URX_SETREF:
2774 if (currentLen == 0) {
2775 int32_t sn = URX_VAL(op);
2776 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2777 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2778 fRXPat->fInitialChars->addAll(*s);
2779 numInitialStrings += 2;
2780 }
2781 currentLen = safeIncrement(currentLen, 1);
2782 atStart = FALSE;
2783 break;
2784
2785 case URX_LOOP_SR_I:
2786 // [Set]*, like a SETREF, above, in what it can match,
2787 // but may not match at all, so currentLen is not incremented.
2788 if (currentLen == 0) {
2789 int32_t sn = URX_VAL(op);
2790 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2791 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2792 fRXPat->fInitialChars->addAll(*s);
2793 numInitialStrings += 2;
2794 }
2795 atStart = FALSE;
2796 break;
2797
2798 case URX_LOOP_DOT_I:
2799 if (currentLen == 0) {
2800 // .* at the start of a pattern.
2801 // Any character can begin the match.
2802 fRXPat->fInitialChars->clear();
2803 fRXPat->fInitialChars->complement();
2804 numInitialStrings += 2;
2805 }
2806 atStart = FALSE;
2807 break;
2808
2809
2810 case URX_STATIC_SETREF:
2811 if (currentLen == 0) {
2812 int32_t sn = URX_VAL(op);
2813 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2814 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2815 fRXPat->fInitialChars->addAll(*s);
2816 numInitialStrings += 2;
2817 }
2818 currentLen = safeIncrement(currentLen, 1);
2819 atStart = FALSE;
2820 break;
2821
2822
2823
2824 case URX_STAT_SETREF_N:
2825 if (currentLen == 0) {
2826 int32_t sn = URX_VAL(op);
2827 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2828 UnicodeSet sc(*s);
2829 sc.complement();
2830 fRXPat->fInitialChars->addAll(sc);
2831 numInitialStrings += 2;
2832 }
2833 currentLen = safeIncrement(currentLen, 1);
2834 atStart = FALSE;
2835 break;
2836
2837
2838
2839 case URX_BACKSLASH_D:
2840 // Digit Char
2841 if (currentLen == 0) {
2842 UnicodeSet s;
2843 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2844 if (URX_VAL(op) != 0) {
2845 s.complement();
2846 }
2847 fRXPat->fInitialChars->addAll(s);
2848 numInitialStrings += 2;
2849 }
2850 currentLen = safeIncrement(currentLen, 1);
2851 atStart = FALSE;
2852 break;
2853
2854
2855 case URX_BACKSLASH_H:
2856 // Horiz white space
2857 if (currentLen == 0) {
2858 UnicodeSet s;
2859 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2860 s.add((UChar32)9); // Tab
2861 if (URX_VAL(op) != 0) {
2862 s.complement();
2863 }
2864 fRXPat->fInitialChars->addAll(s);
2865 numInitialStrings += 2;
2866 }
2867 currentLen = safeIncrement(currentLen, 1);
2868 atStart = FALSE;
2869 break;
2870
2871
2872 case URX_BACKSLASH_R: // Any line ending sequence
2873 case URX_BACKSLASH_V: // Any line ending code point, with optional negation
2874 if (currentLen == 0) {
2875 UnicodeSet s;
2876 s.add((UChar32)0x0a, (UChar32)0x0d); // add range
2877 s.add((UChar32)0x85);
2878 s.add((UChar32)0x2028, (UChar32)0x2029);
2879 if (URX_VAL(op) != 0) {
2880 // Complement option applies to URX_BACKSLASH_V only.
2881 s.complement();
2882 }
2883 fRXPat->fInitialChars->addAll(s);
2884 numInitialStrings += 2;
2885 }
2886 currentLen = safeIncrement(currentLen, 1);
2887 atStart = FALSE;
2888 break;
2889
2890
2891
2892 case URX_ONECHAR_I:
2893 // Case Insensitive Single Character.
2894 if (currentLen == 0) {
2895 UChar32 c = URX_VAL(op);
2896 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2897 UnicodeSet starters(c, c);
2898 starters.closeOver(USET_CASE_INSENSITIVE);
2899 // findCaseInsensitiveStarters(c, &starters);
2900 // For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2901 // The expanded folding can't match the pattern.
2902 fRXPat->fInitialChars->addAll(starters);
2903 } else {
2904 // Char has no case variants. Just add it as-is to the
2905 // set of possible starting chars.
2906 fRXPat->fInitialChars->add(c);
2907 }
2908 numInitialStrings += 2;
2909 }
2910 currentLen = safeIncrement(currentLen, 1);
2911 atStart = FALSE;
2912 break;
2913
2914
2915 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
2916 case URX_DOTANY_ALL: // . matches one or two.
2917 case URX_DOTANY:
2918 case URX_DOTANY_UNIX:
2919 if (currentLen == 0) {
2920 // These constructs are all bad news when they appear at the start
2921 // of a match. Any character can begin the match.
2922 fRXPat->fInitialChars->clear();
2923 fRXPat->fInitialChars->complement();
2924 numInitialStrings += 2;
2925 }
2926 currentLen = safeIncrement(currentLen, 1);
2927 atStart = FALSE;
2928 break;
2929
2930
2931 case URX_JMPX:
2932 loc++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2933 U_FALLTHROUGH;
2934 case URX_JMP:
2935 {
2936 int32_t jmpDest = URX_VAL(op);
2937 if (jmpDest < loc) {
2938 // Loop of some kind. Can safely ignore, the worst that will happen
2939 // is that we understate the true minimum length
2940 currentLen = forwardedLength.elementAti(loc+1);
2941
2942 } else {
2943 // Forward jump. Propagate the current min length to the target loc of the jump.
2944 U_ASSERT(jmpDest <= end+1);
2945 if (forwardedLength.elementAti(jmpDest) > currentLen) {
2946 forwardedLength.setElementAt(currentLen, jmpDest);
2947 }
2948 }
2949 }
2950 atStart = FALSE;
2951 break;
2952
2953 case URX_JMP_SAV:
2954 case URX_JMP_SAV_X:
2955 // Combo of state save to the next loc, + jmp backwards.
2956 // Net effect on min. length computation is nothing.
2957 atStart = FALSE;
2958 break;
2959
2960 case URX_BACKTRACK:
2961 // Fails are kind of like a branch, except that the min length was
2962 // propagated already, by the state save.
2963 currentLen = forwardedLength.elementAti(loc+1);
2964 atStart = FALSE;
2965 break;
2966
2967
2968 case URX_STATE_SAVE:
2969 {
2970 // State Save, for forward jumps, propagate the current minimum.
2971 // of the state save.
2972 int32_t jmpDest = URX_VAL(op);
2973 if (jmpDest > loc) {
2974 if (currentLen < forwardedLength.elementAti(jmpDest)) {
2975 forwardedLength.setElementAt(currentLen, jmpDest);
2976 }
2977 }
2978 }
2979 atStart = FALSE;
2980 break;
2981
2982
2983
2984
2985 case URX_STRING:
2986 {
2987 loc++;
2988 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2989 int32_t stringLen = URX_VAL(stringLenOp);
2990 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2991 U_ASSERT(stringLenOp >= 2);
2992 if (currentLen == 0) {
2993 // Add the starting character of this string to the set of possible starting
2994 // characters for this pattern.
2995 int32_t stringStartIdx = URX_VAL(op);
2996 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
2997 fRXPat->fInitialChars->add(c);
2998
2999 // Remember this string. After the entire pattern has been checked,
3000 // if nothing else is identified that can start a match, we'll use it.
3001 numInitialStrings++;
3002 fRXPat->fInitialStringIdx = stringStartIdx;
3003 fRXPat->fInitialStringLen = stringLen;
3004 }
3005
3006 currentLen = safeIncrement(currentLen, stringLen);
3007 atStart = FALSE;
3008 }
3009 break;
3010
3011 case URX_STRING_I:
3012 {
3013 // Case-insensitive string. Unlike exact-match strings, we won't
3014 // attempt a string search for possible match positions. But we
3015 // do update the set of possible starting characters.
3016 loc++;
3017 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3018 int32_t stringLen = URX_VAL(stringLenOp);
3019 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3020 U_ASSERT(stringLenOp >= 2);
3021 if (currentLen == 0) {
3022 // Add the starting character of this string to the set of possible starting
3023 // characters for this pattern.
3024 int32_t stringStartIdx = URX_VAL(op);
3025 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
3026 UnicodeSet s;
3027 findCaseInsensitiveStarters(c, &s);
3028 fRXPat->fInitialChars->addAll(s);
3029 numInitialStrings += 2; // Matching on an initial string not possible.
3030 }
3031 currentLen = safeIncrement(currentLen, stringLen);
3032 atStart = FALSE;
3033 }
3034 break;
3035
3036 case URX_CTR_INIT:
3037 case URX_CTR_INIT_NG:
3038 {
3039 // Loop Init Ops. These don't change the min length, but they are 4 word ops
3040 // so location must be updated accordingly.
3041 // Loop Init Ops.
3042 // If the min loop count == 0
3043 // move loc forwards to the end of the loop, skipping over the body.
3044 // If the min count is > 0,
3045 // continue normal processing of the body of the loop.
3046 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3047 loopEndLoc = URX_VAL(loopEndLoc);
3048 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3049 if (minLoopCount == 0) {
3050 // Min Loop Count of 0, treat like a forward branch and
3051 // move the current minimum length up to the target
3052 // (end of loop) location.
3053 U_ASSERT(loopEndLoc <= end+1);
3054 if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3055 forwardedLength.setElementAt(currentLen, loopEndLoc);
3056 }
3057 }
3058 loc+=3; // Skips over operands of CTR_INIT
3059 }
3060 atStart = FALSE;
3061 break;
3062
3063
3064 case URX_CTR_LOOP:
3065 case URX_CTR_LOOP_NG:
3066 // Loop ops.
3067 // The jump is conditional, backwards only.
3068 atStart = FALSE;
3069 break;
3070
3071 case URX_LOOP_C:
3072 // More loop ops. These state-save to themselves.
3073 // don't change the minimum match
3074 atStart = FALSE;
3075 break;
3076
3077
3078 case URX_LA_START:
3079 case URX_LB_START:
3080 {
3081 // Look-around. Scan forward until the matching look-ahead end,
3082 // without processing the look-around block. This is overly pessimistic.
3083
3084 // Keep track of the nesting depth of look-around blocks. Boilerplate code for
3085 // lookahead contains two LA_END instructions, so count goes up by two
3086 // for each LA_START.
3087 int32_t depth = (opType == URX_LA_START? 2: 1);
3088 for (;;) {
3089 loc++;
3090 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3091 if (URX_TYPE(op) == URX_LA_START) {
3092 depth+=2;
3093 }
3094 if (URX_TYPE(op) == URX_LB_START) {
3095 depth++;
3096 }
3097 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3098 depth--;
3099 if (depth == 0) {
3100 break;
3101 }
3102 }
3103 if (URX_TYPE(op) == URX_STATE_SAVE) {
3104 // Need this because neg lookahead blocks will FAIL to outside
3105 // of the block.
3106 int32_t jmpDest = URX_VAL(op);
3107 if (jmpDest > loc) {
3108 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3109 forwardedLength.setElementAt(currentLen, jmpDest);
3110 }
3111 }
3112 }
3113 U_ASSERT(loc <= end);
3114 }
3115 }
3116 break;
3117
3118 case URX_LA_END:
3119 case URX_LB_CONT:
3120 case URX_LB_END:
3121 case URX_LBN_CONT:
3122 case URX_LBN_END:
3123 UPRV_UNREACHABLE; // Shouldn't get here. These ops should be
3124 // consumed by the scan in URX_LA_START and LB_START
3125 default:
3126 UPRV_UNREACHABLE;
3127 }
3128
3129 }
3130
3131
3132 // We have finished walking through the ops. Check whether some forward jump
3133 // propagated a shorter length to location end+1.
3134 if (forwardedLength.elementAti(end+1) < currentLen) {
3135 currentLen = forwardedLength.elementAti(end+1);
3136 }
3137
3138
3139 fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3140
3141
3142 // Sort out what we should check for when looking for candidate match start positions.
3143 // In order of preference,
3144 // 1. Start of input text buffer.
3145 // 2. A literal string.
3146 // 3. Start of line in multi-line mode.
3147 // 4. A single literal character.
3148 // 5. A character from a set of characters.
3149 //
3150 if (fRXPat->fStartType == START_START) {
3151 // Match only at the start of an input text string.
3152 // start type is already set. We're done.
3153 } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3154 // Match beginning only with a literal string.
3155 UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3156 U_ASSERT(fRXPat->fInitialChars->contains(c));
3157 fRXPat->fStartType = START_STRING;
3158 fRXPat->fInitialChar = c;
3159 } else if (fRXPat->fStartType == START_LINE) {
3160 // Match at start of line in Multi-Line mode.
3161 // Nothing to do here; everything is already set.
3162 } else if (fRXPat->fMinMatchLen == 0) {
3163 // Zero length match possible. We could start anywhere.
3164 fRXPat->fStartType = START_NO_INFO;
3165 } else if (fRXPat->fInitialChars->size() == 1) {
3166 // All matches begin with the same char.
3167 fRXPat->fStartType = START_CHAR;
3168 fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3169 U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3170 } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
3171 fRXPat->fMinMatchLen > 0) {
3172 // Matches start with a set of character smaller than the set of all chars.
3173 fRXPat->fStartType = START_SET;
3174 } else {
3175 // Matches can start with anything
3176 fRXPat->fStartType = START_NO_INFO;
3177 }
3178
3179 return;
3180}
3181
3182
3183
3184//------------------------------------------------------------------------------
3185//
3186// minMatchLength Calculate the length of the shortest string that could
3187// match the specified pattern.
3188// Length is in 16 bit code units, not code points.
3189//
3190// The calculated length may not be exact. The returned
3191// value may be shorter than the actual minimum; it must
3192// never be longer.
3193//
3194// start and end are the range of p-code operations to be
3195// examined. The endpoints are included in the range.
3196//
3197//------------------------------------------------------------------------------
3198int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
3199 if (U_FAILURE(*fStatus)) {
3200 return 0;
3201 }
3202
3203 U_ASSERT(start <= end);
3204 U_ASSERT(end < fRXPat->fCompiledPat->size());
3205
3206
3207 int32_t loc;
3208 int32_t op;
3209 int32_t opType;
3210 int32_t currentLen = 0;
3211
3212
3213 // forwardedLength is a vector holding minimum-match-length values that
3214 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
3215 // It must be one longer than the pattern being checked because some ops
3216 // will jmp to a end-of-block+1 location from within a block, and we must
3217 // count those when checking the block.
3218 UVector32 forwardedLength(end+2, *fStatus);
3219 forwardedLength.setSize(end+2);
3220 for (loc=start; loc<=end+1; loc++) {
3221 forwardedLength.setElementAt(INT32_MAX, loc);
3222 }
3223
3224 for (loc = start; loc<=end; loc++) {
3225 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3226 opType = URX_TYPE(op);
3227
3228 // The loop is advancing linearly through the pattern.
3229 // If the op we are now at was the destination of a branch in the pattern,
3230 // and that path has a shorter minimum length than the current accumulated value,
3231 // replace the current accumulated value.
3232 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
3233 // no-match-possible cases.
3234 if (forwardedLength.elementAti(loc) < currentLen) {
3235 currentLen = forwardedLength.elementAti(loc);
3236 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3237 }
3238
3239 switch (opType) {
3240 // Ops that don't change the total length matched
3241 case URX_RESERVED_OP:
3242 case URX_END:
3243 case URX_STRING_LEN:
3244 case URX_NOP:
3245 case URX_START_CAPTURE:
3246 case URX_END_CAPTURE:
3247 case URX_BACKSLASH_B:
3248 case URX_BACKSLASH_BU:
3249 case URX_BACKSLASH_G:
3250 case URX_BACKSLASH_Z:
3251 case URX_CARET:
3252 case URX_DOLLAR:
3253 case URX_DOLLAR_M:
3254 case URX_DOLLAR_D:
3255 case URX_DOLLAR_MD:
3256 case URX_RELOC_OPRND:
3257 case URX_STO_INP_LOC:
3258 case URX_CARET_M:
3259 case URX_CARET_M_UNIX:
3260 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3261 case URX_BACKREF_I:
3262
3263 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3264 case URX_LD_SP:
3265
3266 case URX_JMP_SAV:
3267 case URX_JMP_SAV_X:
3268 break;
3269
3270
3271 // Ops that match a minimum of one character (one or two 16 bit code units.)
3272 //
3273 case URX_ONECHAR:
3274 case URX_STATIC_SETREF:
3275 case URX_STAT_SETREF_N:
3276 case URX_SETREF:
3277 case URX_BACKSLASH_D:
3278 case URX_BACKSLASH_H:
3279 case URX_BACKSLASH_R:
3280 case URX_BACKSLASH_V:
3281 case URX_ONECHAR_I:
3282 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3283 case URX_DOTANY_ALL: // . matches one or two.
3284 case URX_DOTANY:
3285 case URX_DOTANY_UNIX:
3286 currentLen = safeIncrement(currentLen, 1);
3287 break;
3288
3289
3290 case URX_JMPX:
3291 loc++; // URX_JMPX has an extra operand, ignored here,
3292 // otherwise processed identically to URX_JMP.
3293 U_FALLTHROUGH;
3294 case URX_JMP:
3295 {
3296 int32_t jmpDest = URX_VAL(op);
3297 if (jmpDest < loc) {
3298 // Loop of some kind. Can safely ignore, the worst that will happen
3299 // is that we understate the true minimum length
3300 currentLen = forwardedLength.elementAti(loc+1);
3301 } else {
3302 // Forward jump. Propagate the current min length to the target loc of the jump.
3303 U_ASSERT(jmpDest <= end+1);
3304 if (forwardedLength.elementAti(jmpDest) > currentLen) {
3305 forwardedLength.setElementAt(currentLen, jmpDest);
3306 }
3307 }
3308 }
3309 break;
3310
3311 case URX_BACKTRACK:
3312 {
3313 // Back-tracks are kind of like a branch, except that the min length was
3314 // propagated already, by the state save.
3315 currentLen = forwardedLength.elementAti(loc+1);
3316 }
3317 break;
3318
3319
3320 case URX_STATE_SAVE:
3321 {
3322 // State Save, for forward jumps, propagate the current minimum.
3323 // of the state save.
3324 int32_t jmpDest = URX_VAL(op);
3325 if (jmpDest > loc) {
3326 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3327 forwardedLength.setElementAt(currentLen, jmpDest);
3328 }
3329 }
3330 }
3331 break;
3332
3333
3334 case URX_STRING:
3335 {
3336 loc++;
3337 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3338 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3339 }
3340 break;
3341
3342
3343 case URX_STRING_I:
3344 {
3345 loc++;
3346 // TODO: with full case folding, matching input text may be shorter than
3347 // the string we have here. More smarts could put some bounds on it.
3348 // Assume a min length of one for now. A min length of zero causes
3349 // optimization failures for a pattern like "string"+
3350 // currentLen += URX_VAL(stringLenOp);
3351 currentLen = safeIncrement(currentLen, 1);
3352 }
3353 break;
3354
3355 case URX_CTR_INIT:
3356 case URX_CTR_INIT_NG:
3357 {
3358 // Loop Init Ops.
3359 // If the min loop count == 0
3360 // move loc forwards to the end of the loop, skipping over the body.
3361 // If the min count is > 0,
3362 // continue normal processing of the body of the loop.
3363 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3364 loopEndLoc = URX_VAL(loopEndLoc);
3365 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3366 if (minLoopCount == 0) {
3367 loc = loopEndLoc;
3368 } else {
3369 loc+=3; // Skips over operands of CTR_INIT
3370 }
3371 }
3372 break;
3373
3374
3375 case URX_CTR_LOOP:
3376 case URX_CTR_LOOP_NG:
3377 // Loop ops.
3378 // The jump is conditional, backwards only.
3379 break;
3380
3381 case URX_LOOP_SR_I:
3382 case URX_LOOP_DOT_I:
3383 case URX_LOOP_C:
3384 // More loop ops. These state-save to themselves.
3385 // don't change the minimum match - could match nothing at all.
3386 break;
3387
3388
3389 case URX_LA_START:
3390 case URX_LB_START:
3391 {
3392 // Look-around. Scan forward until the matching look-ahead end,
3393 // without processing the look-around block. This is overly pessimistic for look-ahead,
3394 // it assumes that the look-ahead match might be zero-length.
3395 // TODO: Positive lookahead could recursively do the block, then continue
3396 // with the longer of the block or the value coming in. Ticket 6060
3397 int32_t depth = (opType == URX_LA_START? 2: 1);
3398 for (;;) {
3399 loc++;
3400 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3401 if (URX_TYPE(op) == URX_LA_START) {
3402 // The boilerplate for look-ahead includes two LA_END insturctions,
3403 // Depth will be decremented by each one when it is seen.
3404 depth += 2;
3405 }
3406 if (URX_TYPE(op) == URX_LB_START) {
3407 depth++;
3408 }
3409 if (URX_TYPE(op) == URX_LA_END) {
3410 depth--;
3411 if (depth == 0) {
3412 break;
3413 }
3414 }
3415 if (URX_TYPE(op)==URX_LBN_END) {
3416 depth--;
3417 if (depth == 0) {
3418 break;
3419 }
3420 }
3421 if (URX_TYPE(op) == URX_STATE_SAVE) {
3422 // Need this because neg lookahead blocks will FAIL to outside
3423 // of the block.
3424 int32_t jmpDest = URX_VAL(op);
3425 if (jmpDest > loc) {
3426 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3427 forwardedLength.setElementAt(currentLen, jmpDest);
3428 }
3429 }
3430 }
3431 U_ASSERT(loc <= end);
3432 }
3433 }
3434 break;
3435
3436 case URX_LA_END:
3437 case URX_LB_CONT:
3438 case URX_LB_END:
3439 case URX_LBN_CONT:
3440 case URX_LBN_END:
3441 // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3442 // range being sized, which happens when measuring size of look-behind blocks.
3443 break;
3444
3445 default:
3446 UPRV_UNREACHABLE;
3447 }
3448
3449 }
3450
3451 // We have finished walking through the ops. Check whether some forward jump
3452 // propagated a shorter length to location end+1.
3453 if (forwardedLength.elementAti(end+1) < currentLen) {
3454 currentLen = forwardedLength.elementAti(end+1);
3455 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3456 }
3457
3458 return currentLen;
3459}
3460
3461//------------------------------------------------------------------------------
3462//
3463// maxMatchLength Calculate the length of the longest string that could
3464// match the specified pattern.
3465// Length is in 16 bit code units, not code points.
3466//
3467// The calculated length may not be exact. The returned
3468// value may be longer than the actual maximum; it must
3469// never be shorter.
3470//
3471//------------------------------------------------------------------------------
3472int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3473 if (U_FAILURE(*fStatus)) {
3474 return 0;
3475 }
3476 U_ASSERT(start <= end);
3477 U_ASSERT(end < fRXPat->fCompiledPat->size());
3478
3479 int32_t loc;
3480 int32_t op;
3481 int32_t opType;
3482 int32_t currentLen = 0;
3483 UVector32 forwardedLength(end+1, *fStatus);
3484 forwardedLength.setSize(end+1);
3485
3486 for (loc=start; loc<=end; loc++) {
3487 forwardedLength.setElementAt(0, loc);
3488 }
3489
3490 for (loc = start; loc<=end; loc++) {
3491 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3492 opType = URX_TYPE(op);
3493
3494 // The loop is advancing linearly through the pattern.
3495 // If the op we are now at was the destination of a branch in the pattern,
3496 // and that path has a longer maximum length than the current accumulated value,
3497 // replace the current accumulated value.
3498 if (forwardedLength.elementAti(loc) > currentLen) {
3499 currentLen = forwardedLength.elementAti(loc);
3500 }
3501
3502 switch (opType) {
3503 // Ops that don't change the total length matched
3504 case URX_RESERVED_OP:
3505 case URX_END:
3506 case URX_STRING_LEN:
3507 case URX_NOP:
3508 case URX_START_CAPTURE:
3509 case URX_END_CAPTURE:
3510 case URX_BACKSLASH_B:
3511 case URX_BACKSLASH_BU:
3512 case URX_BACKSLASH_G:
3513 case URX_BACKSLASH_Z:
3514 case URX_CARET:
3515 case URX_DOLLAR:
3516 case URX_DOLLAR_M:
3517 case URX_DOLLAR_D:
3518 case URX_DOLLAR_MD:
3519 case URX_RELOC_OPRND:
3520 case URX_STO_INP_LOC:
3521 case URX_CARET_M:
3522 case URX_CARET_M_UNIX:
3523
3524 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3525 case URX_LD_SP:
3526
3527 case URX_LB_END:
3528 case URX_LB_CONT:
3529 case URX_LBN_CONT:
3530 case URX_LBN_END:
3531 break;
3532
3533
3534 // Ops that increase that cause an unbounded increase in the length
3535 // of a matched string, or that increase it a hard to characterize way.
3536 // Call the max length unbounded, and stop further checking.
3537 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3538 case URX_BACKREF_I:
3539 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3540 currentLen = INT32_MAX;
3541 break;
3542
3543
3544 // Ops that match a max of one character (possibly two 16 bit code units.)
3545 //
3546 case URX_STATIC_SETREF:
3547 case URX_STAT_SETREF_N:
3548 case URX_SETREF:
3549 case URX_BACKSLASH_D:
3550 case URX_BACKSLASH_H:
3551 case URX_BACKSLASH_R:
3552 case URX_BACKSLASH_V:
3553 case URX_ONECHAR_I:
3554 case URX_DOTANY_ALL:
3555 case URX_DOTANY:
3556 case URX_DOTANY_UNIX:
3557 currentLen = safeIncrement(currentLen, 2);
3558 break;
3559
3560 // Single literal character. Increase current max length by one or two,
3561 // depending on whether the char is in the supplementary range.
3562 case URX_ONECHAR:
3563 currentLen = safeIncrement(currentLen, 1);
3564 if (URX_VAL(op) > 0x10000) {
3565 currentLen = safeIncrement(currentLen, 1);
3566 }
3567 break;
3568
3569 // Jumps.
3570 //
3571 case URX_JMP:
3572 case URX_JMPX:
3573 case URX_JMP_SAV:
3574 case URX_JMP_SAV_X:
3575 {
3576 int32_t jmpDest = URX_VAL(op);
3577 if (jmpDest < loc) {
3578 // Loop of some kind. Max match length is unbounded.
3579 currentLen = INT32_MAX;
3580 } else {
3581 // Forward jump. Propagate the current min length to the target loc of the jump.
3582 if (forwardedLength.elementAti(jmpDest) < currentLen) {
3583 forwardedLength.setElementAt(currentLen, jmpDest);
3584 }
3585 currentLen = 0;
3586 }
3587 }
3588 break;
3589
3590 case URX_BACKTRACK:
3591 // back-tracks are kind of like a branch, except that the max length was
3592 // propagated already, by the state save.
3593 currentLen = forwardedLength.elementAti(loc+1);
3594 break;
3595
3596
3597 case URX_STATE_SAVE:
3598 {
3599 // State Save, for forward jumps, propagate the current minimum.
3600 // of the state save.
3601 // For backwards jumps, they create a loop, maximum
3602 // match length is unbounded.
3603 int32_t jmpDest = URX_VAL(op);
3604 if (jmpDest > loc) {
3605 if (currentLen > forwardedLength.elementAti(jmpDest)) {
3606 forwardedLength.setElementAt(currentLen, jmpDest);
3607 }
3608 } else {
3609 currentLen = INT32_MAX;
3610 }
3611 }
3612 break;
3613
3614
3615
3616
3617 case URX_STRING:
3618 {
3619 loc++;
3620 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3621 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3622 break;
3623 }
3624
3625 case URX_STRING_I:
3626 // TODO: This code assumes that any user string that matches will be no longer
3627 // than our compiled string, with case insensitive matching.
3628 // Our compiled string has been case-folded already.
3629 //
3630 // Any matching user string will have no more code points than our
3631 // compiled (folded) string. Folding may add code points, but
3632 // not remove them.
3633 //
3634 // There is a potential problem if a supplemental code point
3635 // case-folds to a BMP code point. In this case our compiled string
3636 // could be shorter (in code units) than a matching user string.
3637 //
3638 // At this time (Unicode 6.1) there are no such characters, and this case
3639 // is not being handled. A test, intltest regex/Bug9283, will fail if
3640 // any problematic characters are added to Unicode.
3641 //
3642 // If this happens, we can make a set of the BMP chars that the
3643 // troublesome supplementals fold to, scan our string, and bump the
3644 // currentLen one extra for each that is found.
3645 //
3646 {
3647 loc++;
3648 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3649 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3650 }
3651 break;
3652
3653 case URX_CTR_INIT:
3654 case URX_CTR_INIT_NG:
3655 // For Loops, recursively call this function on the pattern for the loop body,
3656 // then multiply the result by the maximum loop count.
3657 {
3658 int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3659 if (loopEndLoc == loc+4) {
3660 // Loop has an empty body. No affect on max match length.
3661 // Continue processing with code after the loop end.
3662 loc = loopEndLoc;
3663 break;
3664 }
3665
3666 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3667 if (maxLoopCount == -1) {
3668 // Unbounded Loop. No upper bound on match length.
3669 currentLen = INT32_MAX;
3670 break;
3671 }
3672
3673 U_ASSERT(loopEndLoc >= loc+4);
3674 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call.
3675 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3676 if (updatedLen >= INT32_MAX) {
3677 currentLen = INT32_MAX;
3678 break;
3679 }
3680 currentLen = (int32_t)updatedLen;
3681 loc = loopEndLoc;
3682 break;
3683 }
3684
3685 case URX_CTR_LOOP:
3686 case URX_CTR_LOOP_NG:
3687 // These opcodes will be skipped over by code for URX_CTR_INIT.
3688 // We shouldn't encounter them here.
3689 UPRV_UNREACHABLE;
3690
3691 case URX_LOOP_SR_I:
3692 case URX_LOOP_DOT_I:
3693 case URX_LOOP_C:
3694 // For anything to do with loops, make the match length unbounded.
3695 currentLen = INT32_MAX;
3696 break;
3697
3698
3699
3700 case URX_LA_START:
3701 case URX_LA_END:
3702 // Look-ahead. Just ignore, treat the look-ahead block as if
3703 // it were normal pattern. Gives a too-long match length,
3704 // but good enough for now.
3705 break;
3706
3707 // End of look-ahead ops should always be consumed by the processing at
3708 // the URX_LA_START op.
3709 // UPRV_UNREACHABLE;
3710
3711 case URX_LB_START:
3712 {
3713 // Look-behind. Scan forward until the matching look-around end,
3714 // without processing the look-behind block.
3715 int32_t dataLoc = URX_VAL(op);
3716 for (loc = loc + 1; loc < end; ++loc) {
3717 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3718 int32_t opType = URX_TYPE(op);
3719 if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
3720 break;
3721 }
3722 }
3723 U_ASSERT(loc < end);
3724 }
3725 break;
3726
3727 default:
3728 UPRV_UNREACHABLE;
3729 }
3730
3731
3732 if (currentLen == INT32_MAX) {
3733 // The maximum length is unbounded.
3734 // Stop further processing of the pattern.
3735 break;
3736 }
3737
3738 }
3739 return currentLen;
3740
3741}
3742
3743
3744//------------------------------------------------------------------------------
3745//
3746// stripNOPs Remove any NOP operations from the compiled pattern code.
3747// Extra NOPs are inserted for some constructs during the initial
3748// code generation to provide locations that may be patched later.
3749// Many end up unneeded, and are removed by this function.
3750//
3751// In order to minimize the number of passes through the pattern,
3752// back-reference fixup is also performed here (adjusting
3753// back-reference operands to point to the correct frame offsets).
3754//
3755//------------------------------------------------------------------------------
3756void RegexCompile::stripNOPs() {
3757
3758 if (U_FAILURE(*fStatus)) {
3759 return;
3760 }
3761
3762 int32_t end = fRXPat->fCompiledPat->size();
3763 UVector32 deltas(end, *fStatus);
3764
3765 // Make a first pass over the code, computing the amount that things
3766 // will be offset at each location in the original code.
3767 int32_t loc;
3768 int32_t d = 0;
3769 for (loc=0; loc<end; loc++) {
3770 deltas.addElement(d, *fStatus);
3771 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3772 if (URX_TYPE(op) == URX_NOP) {
3773 d++;
3774 }
3775 }
3776
3777 UnicodeString caseStringBuffer;
3778
3779 // Make a second pass over the code, removing the NOPs by moving following
3780 // code up, and patching operands that refer to code locations that
3781 // are being moved. The array of offsets from the first step is used
3782 // to compute the new operand values.
3783 int32_t src;
3784 int32_t dst = 0;
3785 for (src=0; src<end; src++) {
3786 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3787 int32_t opType = URX_TYPE(op);
3788 switch (opType) {
3789 case URX_NOP:
3790 break;
3791
3792 case URX_STATE_SAVE:
3793 case URX_JMP:
3794 case URX_CTR_LOOP:
3795 case URX_CTR_LOOP_NG:
3796 case URX_RELOC_OPRND:
3797 case URX_JMPX:
3798 case URX_JMP_SAV:
3799 case URX_JMP_SAV_X:
3800 // These are instructions with operands that refer to code locations.
3801 {
3802 int32_t operandAddress = URX_VAL(op);
3803 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3804 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3805 op = buildOp(opType, fixedOperandAddress);
3806 fRXPat->fCompiledPat->setElementAt(op, dst);
3807 dst++;
3808 break;
3809 }
3810
3811 case URX_BACKREF:
3812 case URX_BACKREF_I:
3813 {
3814 int32_t where = URX_VAL(op);
3815 if (where > fRXPat->fGroupMap->size()) {
3816 error(U_REGEX_INVALID_BACK_REF);
3817 break;
3818 }
3819 where = fRXPat->fGroupMap->elementAti(where-1);
3820 op = buildOp(opType, where);
3821 fRXPat->fCompiledPat->setElementAt(op, dst);
3822 dst++;
3823
3824 fRXPat->fNeedsAltInput = TRUE;
3825 break;
3826 }
3827 case URX_RESERVED_OP:
3828 case URX_RESERVED_OP_N:
3829 case URX_BACKTRACK:
3830 case URX_END:
3831 case URX_ONECHAR:
3832 case URX_STRING:
3833 case URX_STRING_LEN:
3834 case URX_START_CAPTURE:
3835 case URX_END_CAPTURE:
3836 case URX_STATIC_SETREF:
3837 case URX_STAT_SETREF_N:
3838 case URX_SETREF:
3839 case URX_DOTANY:
3840 case URX_FAIL:
3841 case URX_BACKSLASH_B:
3842 case URX_BACKSLASH_BU:
3843 case URX_BACKSLASH_G:
3844 case URX_BACKSLASH_X:
3845 case URX_BACKSLASH_Z:
3846 case URX_DOTANY_ALL:
3847 case URX_BACKSLASH_D:
3848 case URX_CARET:
3849 case URX_DOLLAR:
3850 case URX_CTR_INIT:
3851 case URX_CTR_INIT_NG:
3852 case URX_DOTANY_UNIX:
3853 case URX_STO_SP:
3854 case URX_LD_SP:
3855 case URX_STO_INP_LOC:
3856 case URX_LA_START:
3857 case URX_LA_END:
3858 case URX_ONECHAR_I:
3859 case URX_STRING_I:
3860 case URX_DOLLAR_M:
3861 case URX_CARET_M:
3862 case URX_CARET_M_UNIX:
3863 case URX_LB_START:
3864 case URX_LB_CONT:
3865 case URX_LB_END:
3866 case URX_LBN_CONT:
3867 case URX_LBN_END:
3868 case URX_LOOP_SR_I:
3869 case URX_LOOP_DOT_I:
3870 case URX_LOOP_C:
3871 case URX_DOLLAR_D:
3872 case URX_DOLLAR_MD:
3873 case URX_BACKSLASH_H:
3874 case URX_BACKSLASH_R:
3875 case URX_BACKSLASH_V:
3876 // These instructions are unaltered by the relocation.
3877 fRXPat->fCompiledPat->setElementAt(op, dst);
3878 dst++;
3879 break;
3880
3881 default:
3882 // Some op is unaccounted for.
3883 UPRV_UNREACHABLE;
3884 }
3885 }
3886
3887 fRXPat->fCompiledPat->setSize(dst);
3888}
3889
3890
3891
3892
3893//------------------------------------------------------------------------------
3894//
3895// Error Report a rule parse error.
3896// Only report it if no previous error has been recorded.
3897//
3898//------------------------------------------------------------------------------
3899void RegexCompile::error(UErrorCode e) {
3900 if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) {
3901 *fStatus = e;
3902 // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3903 // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3904 // int64_t. If the values of the latter are out of range for the former,
3905 // set them to the appropriate "field not supported" values.
3906 if (fLineNum > 0x7FFFFFFF) {
3907 fParseErr->line = 0;
3908 fParseErr->offset = -1;
3909 } else if (fCharNum > 0x7FFFFFFF) {
3910 fParseErr->line = (int32_t)fLineNum;
3911 fParseErr->offset = -1;
3912 } else {
3913 fParseErr->line = (int32_t)fLineNum;
3914 fParseErr->offset = (int32_t)fCharNum;
3915 }
3916
3917 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3918
3919 // Fill in the context.
3920 // Note: extractBetween() pins supplied indicies to the string bounds.
3921 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext));
3922 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3923 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3924 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3925 }
3926}
3927
3928
3929//
3930// Assorted Unicode character constants.
3931// Numeric because there is no portable way to enter them as literals.
3932// (Think EBCDIC).
3933//
3934static const UChar chCR = 0x0d; // New lines, for terminating comments.
3935static const UChar chLF = 0x0a; // Line Feed
3936static const UChar chPound = 0x23; // '#', introduces a comment.
3937static const UChar chDigit0 = 0x30; // '0'
3938static const UChar chDigit7 = 0x37; // '9'
3939static const UChar chColon = 0x3A; // ':'
3940static const UChar chE = 0x45; // 'E'
3941static const UChar chQ = 0x51; // 'Q'
3942//static const UChar chN = 0x4E; // 'N'
3943static const UChar chP = 0x50; // 'P'
3944static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
3945//static const UChar chLBracket = 0x5b; // '['
3946static const UChar chRBracket = 0x5d; // ']'
3947static const UChar chUp = 0x5e; // '^'
3948static const UChar chLowerP = 0x70;
3949static const UChar chLBrace = 0x7b; // '{'
3950static const UChar chRBrace = 0x7d; // '}'
3951static const UChar chNEL = 0x85; // NEL newline variant
3952static const UChar chLS = 0x2028; // Unicode Line Separator
3953
3954
3955//------------------------------------------------------------------------------
3956//
3957// nextCharLL Low Level Next Char from the regex pattern.
3958// Get a char from the string, keep track of input position
3959// for error reporting.
3960//
3961//------------------------------------------------------------------------------
3962UChar32 RegexCompile::nextCharLL() {
3963 UChar32 ch;
3964
3965 if (fPeekChar != -1) {
3966 ch = fPeekChar;
3967 fPeekChar = -1;
3968 return ch;
3969 }
3970
3971 // assume we're already in the right place
3972 ch = UTEXT_NEXT32(fRXPat->fPattern);
3973 if (ch == U_SENTINEL) {
3974 return ch;
3975 }
3976
3977 if (ch == chCR ||
3978 ch == chNEL ||
3979 ch == chLS ||
3980 (ch == chLF && fLastChar != chCR)) {
3981 // Character is starting a new line. Bump up the line number, and
3982 // reset the column to 0.
3983 fLineNum++;
3984 fCharNum=0;
3985 }
3986 else {
3987 // Character is not starting a new line. Except in the case of a
3988 // LF following a CR, increment the column position.
3989 if (ch != chLF) {
3990 fCharNum++;
3991 }
3992 }
3993 fLastChar = ch;
3994 return ch;
3995}
3996
3997//------------------------------------------------------------------------------
3998//
3999// peekCharLL Low Level Character Scanning, sneak a peek at the next
4000// character without actually getting it.
4001//
4002//------------------------------------------------------------------------------
4003UChar32 RegexCompile::peekCharLL() {
4004 if (fPeekChar == -1) {
4005 fPeekChar = nextCharLL();
4006 }
4007 return fPeekChar;
4008}
4009
4010
4011//------------------------------------------------------------------------------
4012//
4013// nextChar for pattern scanning. At this level, we handle stripping
4014// out comments and processing some backslash character escapes.
4015// The rest of the pattern grammar is handled at the next level up.
4016//
4017//------------------------------------------------------------------------------
4018void RegexCompile::nextChar(RegexPatternChar &c) {
4019 tailRecursion:
4020 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4021 c.fChar = nextCharLL();
4022 c.fQuoted = FALSE;
4023
4024 if (fQuoteMode) {
4025 c.fQuoted = TRUE;
4026 if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4027 c.fChar == (UChar32)-1) {
4028 fQuoteMode = FALSE; // Exit quote mode,
4029 nextCharLL(); // discard the E
4030 // nextChar(c); // recurse to get the real next char
4031 goto tailRecursion; // Note: fuzz testing produced testcases that
4032 // resulted in stack overflow here.
4033 }
4034 }
4035 else if (fInBackslashQuote) {
4036 // The current character immediately follows a '\'
4037 // Don't check for any further escapes, just return it as-is.
4038 // Don't set c.fQuoted, because that would prevent the state machine from
4039 // dispatching on the character.
4040 fInBackslashQuote = FALSE;
4041 }
4042 else
4043 {
4044 // We are not in a \Q quoted region \E of the source.
4045 //
4046 if (fModeFlags & UREGEX_COMMENTS) {
4047 //
4048 // We are in free-spacing and comments mode.
4049 // Scan through any white space and comments, until we
4050 // reach a significant character or the end of inut.
4051 for (;;) {
4052 if (c.fChar == (UChar32)-1) {
4053 break; // End of Input
4054 }
4055 if (c.fChar == chPound && fEOLComments == TRUE) {
4056 // Start of a comment. Consume the rest of it, until EOF or a new line
4057 for (;;) {
4058 c.fChar = nextCharLL();
4059 if (c.fChar == (UChar32)-1 || // EOF
4060 c.fChar == chCR ||
4061 c.fChar == chLF ||
4062 c.fChar == chNEL ||
4063 c.fChar == chLS) {
4064 break;
4065 }
4066 }
4067 }
4068 // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061.
4069 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4070 break;
4071 }
4072 c.fChar = nextCharLL();
4073 }
4074 }
4075
4076 //
4077 // check for backslash escaped characters.
4078 //
4079 if (c.fChar == chBackSlash) {
4080 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4081 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4082 //
4083 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4084 // Includes \uxxxx, \n, \r, many others.
4085 // Return the single equivalent character.
4086 //
4087 nextCharLL(); // get & discard the peeked char.
4088 c.fQuoted = TRUE;
4089
4090 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4091 int32_t endIndex = (int32_t)pos;
4092 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4093
4094 if (endIndex == pos) {
4095 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4096 }
4097 fCharNum += endIndex - pos;
4098 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4099 } else {
4100 int32_t offset = 0;
4101 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4102
4103 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4104 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4105
4106 if (offset == 0) {
4107 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4108 } else if (context.lastOffset == offset) {
4109 UTEXT_PREVIOUS32(fRXPat->fPattern);
4110 } else if (context.lastOffset != offset-1) {
4111 utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4112 }
4113 fCharNum += offset;
4114 }
4115 }
4116 else if (peekCharLL() == chDigit0) {
4117 // Octal Escape, using Java Regexp Conventions
4118 // which are \0 followed by 1-3 octal digits.
4119 // Different from ICU Unescape handling of Octal, which does not
4120 // require the leading 0.
4121 // Java also has the convention of only consuming 2 octal digits if
4122 // the three digit number would be > 0xff
4123 //
4124 c.fChar = 0;
4125 nextCharLL(); // Consume the initial 0.
4126 int index;
4127 for (index=0; index<3; index++) {
4128 int32_t ch = peekCharLL();
4129 if (ch<chDigit0 || ch>chDigit7) {
4130 if (index==0) {
4131 // \0 is not followed by any octal digits.
4132 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4133 }
4134 break;
4135 }
4136 c.fChar <<= 3;
4137 c.fChar += ch&7;
4138 if (c.fChar <= 255) {
4139 nextCharLL();
4140 } else {
4141 // The last digit made the number too big. Forget we saw it.
4142 c.fChar >>= 3;
4143 }
4144 }
4145 c.fQuoted = TRUE;
4146 }
4147 else if (peekCharLL() == chQ) {
4148 // "\Q" enter quote mode, which will continue until "\E"
4149 fQuoteMode = TRUE;
4150 nextCharLL(); // discard the 'Q'.
4151 // nextChar(c); // recurse to get the real next char.
4152 goto tailRecursion; // Note: fuzz testing produced test cases that
4153 // resulted in stack overflow here.
4154 }
4155 else
4156 {
4157 // We are in a '\' escape that will be handled by the state table scanner.
4158 // Just return the backslash, but remember that the following char is to
4159 // be taken literally.
4160 fInBackslashQuote = TRUE;
4161 }
4162 }
4163 }
4164
4165 // re-enable # to end-of-line comments, in case they were disabled.
4166 // They are disabled by the parser upon seeing '(?', but this lasts for
4167 // the fetching of the next character only.
4168 fEOLComments = TRUE;
4169
4170 // putc(c.fChar, stdout);
4171}
4172
4173
4174
4175//------------------------------------------------------------------------------
4176//
4177// scanNamedChar
4178// Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4179//
4180// The scan position will be at the 'N'. On return
4181// the scan position should be just after the '}'
4182//
4183// Return the UChar32
4184//
4185//------------------------------------------------------------------------------
4186UChar32 RegexCompile::scanNamedChar() {
4187 if (U_FAILURE(*fStatus)) {
4188 return 0;
4189 }
4190
4191 nextChar(fC);
4192 if (fC.fChar != chLBrace) {
4193 error(U_REGEX_PROPERTY_SYNTAX);
4194 return 0;
4195 }
4196
4197 UnicodeString charName;
4198 for (;;) {
4199 nextChar(fC);
4200 if (fC.fChar == chRBrace) {
4201 break;
4202 }
4203 if (fC.fChar == -1) {
4204 error(U_REGEX_PROPERTY_SYNTAX);
4205 return 0;
4206 }
4207 charName.append(fC.fChar);
4208 }
4209
4210 char name[100];
4211 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4212 (uint32_t)charName.length()>=sizeof(name)) {
4213 // All Unicode character names have only invariant characters.
4214 // The API to get a character, given a name, accepts only char *, forcing us to convert,
4215 // which requires this error check
4216 error(U_REGEX_PROPERTY_SYNTAX);
4217 return 0;
4218 }
4219 charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4220
4221 UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4222 if (U_FAILURE(*fStatus)) {
4223 error(U_REGEX_PROPERTY_SYNTAX);
4224 }
4225
4226 nextChar(fC); // Continue overall regex pattern processing with char after the '}'
4227 return theChar;
4228}
4229
4230//------------------------------------------------------------------------------
4231//
4232// scanProp Construct a UnicodeSet from the text at the current scan
4233// position, which will be of the form \p{whaterver}
4234//
4235// The scan position will be at the 'p' or 'P'. On return
4236// the scan position should be just after the '}'
4237//
4238// Return a UnicodeSet, constructed from the \P pattern,
4239// or NULL if the pattern is invalid.
4240//
4241//------------------------------------------------------------------------------
4242UnicodeSet *RegexCompile::scanProp() {
4243 UnicodeSet *uset = NULL;
4244
4245 if (U_FAILURE(*fStatus)) {
4246 return NULL;
4247 }
4248 (void)chLowerP; // Suppress compiler unused variable warning.
4249 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4250 UBool negated = (fC.fChar == chP);
4251
4252 UnicodeString propertyName;
4253 nextChar(fC);
4254 if (fC.fChar != chLBrace) {
4255 error(U_REGEX_PROPERTY_SYNTAX);
4256 return NULL;
4257 }
4258 for (;;) {
4259 nextChar(fC);
4260 if (fC.fChar == chRBrace) {
4261 break;
4262 }
4263 if (fC.fChar == -1) {
4264 // Hit the end of the input string without finding the closing '}'
4265 error(U_REGEX_PROPERTY_SYNTAX);
4266 return NULL;
4267 }
4268 propertyName.append(fC.fChar);
4269 }
4270 uset = createSetForProperty(propertyName, negated);
4271 nextChar(fC); // Move input scan to position following the closing '}'
4272 return uset;
4273}
4274
4275//------------------------------------------------------------------------------
4276//
4277// scanPosixProp Construct a UnicodeSet from the text at the current scan
4278// position, which is expected be of the form [:property expression:]
4279//
4280// The scan position will be at the opening ':'. On return
4281// the scan position must be on the closing ']'
4282//
4283// Return a UnicodeSet constructed from the pattern,
4284// or NULL if this is not a valid POSIX-style set expression.
4285// If not a property expression, restore the initial scan position
4286// (to the opening ':')
4287//
4288// Note: the opening '[:' is not sufficient to guarantee that
4289// this is a [:property:] expression.
4290// [:'+=,] is a perfectly good ordinary set expression that
4291// happens to include ':' as one of its characters.
4292//
4293//------------------------------------------------------------------------------
4294UnicodeSet *RegexCompile::scanPosixProp() {
4295 UnicodeSet *uset = NULL;
4296
4297 if (U_FAILURE(*fStatus)) {
4298 return NULL;
4299 }
4300
4301 U_ASSERT(fC.fChar == chColon);
4302
4303 // Save the scanner state.
4304 // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
4305 int64_t savedScanIndex = fScanIndex;
4306 int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4307 UBool savedQuoteMode = fQuoteMode;
4308 UBool savedInBackslashQuote = fInBackslashQuote;
4309 UBool savedEOLComments = fEOLComments;
4310 int64_t savedLineNum = fLineNum;
4311 int64_t savedCharNum = fCharNum;
4312 UChar32 savedLastChar = fLastChar;
4313 UChar32 savedPeekChar = fPeekChar;
4314 RegexPatternChar savedfC = fC;
4315
4316 // Scan for a closing ]. A little tricky because there are some perverse
4317 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
4318 // ending on the second closing ].
4319
4320 UnicodeString propName;
4321 UBool negated = FALSE;
4322
4323 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
4324 nextChar(fC);
4325 if (fC.fChar == chUp) {
4326 negated = TRUE;
4327 nextChar(fC);
4328 }
4329
4330 // Scan for the closing ":]", collecting the property name along the way.
4331 UBool sawPropSetTerminator = FALSE;
4332 for (;;) {
4333 propName.append(fC.fChar);
4334 nextChar(fC);
4335 if (fC.fQuoted || fC.fChar == -1) {
4336 // Escaped characters or end of input - either says this isn't a [:Property:]
4337 break;
4338 }
4339 if (fC.fChar == chColon) {
4340 nextChar(fC);
4341 if (fC.fChar == chRBracket) {
4342 sawPropSetTerminator = TRUE;
4343 }
4344 break;
4345 }
4346 }
4347
4348 if (sawPropSetTerminator) {
4349 uset = createSetForProperty(propName, negated);
4350 }
4351 else
4352 {
4353 // No closing ":]".
4354 // Restore the original scan position.
4355 // The main scanner will retry the input as a normal set expression,
4356 // not a [:Property:] expression.
4357 fScanIndex = savedScanIndex;
4358 fQuoteMode = savedQuoteMode;
4359 fInBackslashQuote = savedInBackslashQuote;
4360 fEOLComments = savedEOLComments;
4361 fLineNum = savedLineNum;
4362 fCharNum = savedCharNum;
4363 fLastChar = savedLastChar;
4364 fPeekChar = savedPeekChar;
4365 fC = savedfC;
4366 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4367 }
4368 return uset;
4369}
4370
4371static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4372 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4373 addCategory(set, U_GC_CF_MASK, ec);
4374}
4375
4376//
4377// Create a Unicode Set from a Unicode Property expression.
4378// This is common code underlying both \p{...} ane [:...:] expressions.
4379// Includes trying the Java "properties" that aren't supported as
4380// normal ICU UnicodeSet properties
4381//
4382UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4383
4384 if (U_FAILURE(*fStatus)) {
4385 return nullptr;
4386 }
4387 LocalPointer<UnicodeSet> set;
4388 UErrorCode status = U_ZERO_ERROR;
4389
4390 do { // non-loop, exists to allow breaks from the block.
4391 //
4392 // First try the property as we received it
4393 //
4394 UnicodeString setExpr;
4395 uint32_t usetFlags = 0;
4396 setExpr.append(u"[\\p{", -1);
4397 setExpr.append(propName);
4398 setExpr.append(u"}]", -1);
4399 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4400 usetFlags |= USET_CASE_INSENSITIVE;
4401 }
4402 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, NULL, status), status);
4403 if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
4404 break;
4405 }
4406
4407 //
4408 // The incoming property wasn't directly recognized by ICU.
4409
4410 // Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4411 // Java accepts 'word' with mixed case.
4412 // Java accepts 'all' only in all lower case.
4413
4414 status = U_ZERO_ERROR;
4415 if (propName.caseCompare(u"word", -1, 0) == 0) {
4416 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])), status);
4417 break;
4418 }
4419 if (propName.compare(u"all", -1) == 0) {
4420 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
4421 break;
4422 }
4423
4424
4425 // Do Java InBlock expressions
4426 //
4427 UnicodeString mPropName = propName;
4428 if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
4429 status = U_ZERO_ERROR;
4430 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4431 if (U_FAILURE(status)) {
4432 break;
4433 }
4434 UnicodeString blockName(mPropName, 2); // Property with the leading "In" removed.
4435 set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
4436 break;
4437 }
4438
4439 // Check for the Java form "IsBooleanPropertyValue", which we will recast
4440 // as "BooleanPropertyValue". The property value can be either a
4441 // a General Category or a Script Name.
4442
4443 if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
4444 mPropName.remove(0, 2); // Strip the "Is"
4445 if (mPropName.indexOf(u'=') >= 0) {
4446 // Reject any "Is..." property expression containing an '=', that is,
4447 // any non-binary property expression.
4448 status = U_REGEX_PROPERTY_SYNTAX;
4449 break;
4450 }
4451
4452 if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
4453 mPropName.setTo(u"unassigned", -1);
4454 negated = !negated;
4455 } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
4456 mPropName.setTo(u"Titlecase_Letter", -1);
4457 }
4458
4459 mPropName.insert(0, u"[\\p{", -1);
4460 mPropName.append(u"}]", -1);
4461 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
4462
4463 if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4464 set->closeOver(USET_CASE_INSENSITIVE);
4465 }
4466 break;
4467
4468 }
4469
4470 if (propName.startsWith(u"java", -1)) {
4471 status = U_ZERO_ERROR;
4472 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4473 if (U_FAILURE(status)) {
4474 break;
4475 }
4476 //
4477 // Try the various Java specific properties.
4478 // These all begin with "java"
4479 //
4480 if (propName.compare(u"javaDefined", -1) == 0) {
4481 addCategory(set.getAlias(), U_GC_CN_MASK, status);
4482 set->complement();
4483 }
4484 else if (propName.compare(u"javaDigit", -1) == 0) {
4485 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4486 }
4487 else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
4488 addIdentifierIgnorable(set.getAlias(), status);
4489 }
4490 else if (propName.compare(u"javaISOControl", -1) == 0) {
4491 set->add(0, 0x1F).add(0x7F, 0x9F);
4492 }
4493 else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
4494 addCategory(set.getAlias(), U_GC_L_MASK, status);
4495 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4496 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4497 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4498 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4499 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4500 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4501 addIdentifierIgnorable(set.getAlias(), status);
4502 }
4503 else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
4504 addCategory(set.getAlias(), U_GC_L_MASK, status);
4505 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4506 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4507 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4508 }
4509 else if (propName.compare(u"javaLetter", -1) == 0) {
4510 addCategory(set.getAlias(), U_GC_L_MASK, status);
4511 }
4512 else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
4513 addCategory(set.getAlias(), U_GC_L_MASK, status);
4514 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4515 }
4516 else if (propName.compare(u"javaLowerCase", -1) == 0) {
4517 addCategory(set.getAlias(), U_GC_LL_MASK, status);
4518 }
4519 else if (propName.compare(u"javaMirrored", -1) == 0) {
4520 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
4521 }
4522 else if (propName.compare(u"javaSpaceChar", -1) == 0) {
4523 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4524 }
4525 else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
4526 set->add(0x10000, UnicodeSet::MAX_VALUE);
4527 }
4528 else if (propName.compare(u"javaTitleCase", -1) == 0) {
4529 addCategory(set.getAlias(), U_GC_LT_MASK, status);
4530 }
4531 else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
4532 addCategory(set.getAlias(), U_GC_L_MASK, status);
4533 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4534 }
4535 else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
4536 addCategory(set.getAlias(), U_GC_L_MASK, status);
4537 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4538 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4539 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4540 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4541 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4542 addIdentifierIgnorable(set.getAlias(), status);
4543 }
4544 else if (propName.compare(u"javaUpperCase", -1) == 0) {
4545 addCategory(set.getAlias(), U_GC_LU_MASK, status);
4546 }
4547 else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
4548 set->add(0, UnicodeSet::MAX_VALUE);
4549 }
4550 else if (propName.compare(u"javaWhitespace", -1) == 0) {
4551 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4552 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4553 set->add(9, 0x0d).add(0x1c, 0x1f);
4554 } else {
4555 status = U_REGEX_PROPERTY_SYNTAX;
4556 }
4557
4558 if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4559 set->closeOver(USET_CASE_INSENSITIVE);
4560 }
4561 break;
4562 }
4563
4564 // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4565 // extensions matched it.
4566 status = U_REGEX_PROPERTY_SYNTAX;
4567 } while (false); // End of do loop block. Code above breaks out of the block on success or hard failure.
4568
4569 if (U_SUCCESS(status)) {
4570 U_ASSERT(set.isValid());
4571 if (negated) {
4572 set->complement();
4573 }
4574 return set.orphan();
4575 } else {
4576 if (status == U_ILLEGAL_ARGUMENT_ERROR) {
4577 status = U_REGEX_PROPERTY_SYNTAX;
4578 }
4579 error(status);
4580 return nullptr;
4581 }
4582}
4583
4584
4585//
4586// SetEval Part of the evaluation of [set expressions].
4587// Perform any pending (stacked) operations with precedence
4588// equal or greater to that of the next operator encountered
4589// in the expression.
4590//
4591void RegexCompile::setEval(int32_t nextOp) {
4592 UnicodeSet *rightOperand = NULL;
4593 UnicodeSet *leftOperand = NULL;
4594 for (;;) {
4595 U_ASSERT(fSetOpStack.empty()==FALSE);
4596 int32_t pendingSetOperation = fSetOpStack.peeki();
4597 if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4598 break;
4599 }
4600 fSetOpStack.popi();
4601 U_ASSERT(fSetStack.empty() == FALSE);
4602 rightOperand = (UnicodeSet *)fSetStack.peek();
4603 switch (pendingSetOperation) {
4604 case setNegation:
4605 rightOperand->complement();
4606 break;
4607 case setCaseClose:
4608 // TODO: need a simple close function. Ticket 6065
4609 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4610 rightOperand->removeAllStrings();
4611 break;
4612 case setDifference1:
4613 case setDifference2:
4614 fSetStack.pop();
4615 leftOperand = (UnicodeSet *)fSetStack.peek();
4616 leftOperand->removeAll(*rightOperand);
4617 delete rightOperand;
4618 break;
4619 case setIntersection1:
4620 case setIntersection2:
4621 fSetStack.pop();
4622 leftOperand = (UnicodeSet *)fSetStack.peek();
4623 leftOperand->retainAll(*rightOperand);
4624 delete rightOperand;
4625 break;
4626 case setUnion:
4627 fSetStack.pop();
4628 leftOperand = (UnicodeSet *)fSetStack.peek();
4629 leftOperand->addAll(*rightOperand);
4630 delete rightOperand;
4631 break;
4632 default:
4633 UPRV_UNREACHABLE;
4634 }
4635 }
4636 }
4637
4638void RegexCompile::setPushOp(int32_t op) {
4639 setEval(op);
4640 fSetOpStack.push(op, *fStatus);
4641 fSetStack.push(new UnicodeSet(), *fStatus);
4642}
4643
4644U_NAMESPACE_END
4645#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
4646
4647