regexcmp.cpp source code [engine/third_party/icu/source/i18n/regexcmp.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	//
4	// file: regexcmp.cpp
5	//
6	// Copyright (C) 2002-2016 International Business Machines Corporation and others.
7	// All Rights Reserved.
8	//
9	// This file contains the ICU regular expression compiler, which is responsible
10	// for processing a regular expression pattern into the compiled form that
11	// is used by the match finding engine.
12	//
13
14	#include "unicode/utypes.h"
15
16	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
17
18	#include "unicode/ustring.h"
19	#include "unicode/unistr.h"
20	#include "unicode/uniset.h"
21	#include "unicode/uchar.h"
22	#include "unicode/uchriter.h"
23	#include "unicode/parsepos.h"
24	#include "unicode/parseerr.h"
25	#include "unicode/regex.h"
26	#include "unicode/utf.h"
27	#include "unicode/utf16.h"
28	#include "patternprops.h"
29	#include "putilimp.h"
30	#include "cmemory.h"
31	#include "cstr.h"
32	#include "cstring.h"
33	#include "uvectr32.h"
34	#include "uvectr64.h"
35	#include "uassert.h"
36	#include "uinvchar.h"
37
38	#include "regeximp.h"
39	#include "regexcst.h" // Contains state table for the regex pattern parser.
40	// generated by a Perl script.
41	#include "regexcmp.h"
42	#include "regexst.h"
43	#include "regextxt.h"
44
45
46
47	U_NAMESPACE_BEGIN
48
49
50	//------------------------------------------------------------------------------
51	//
52	// Constructor.
53	//
54	//------------------------------------------------------------------------------
55	RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
56	fParenStack (status), fSetStack (status), fSetOpStack (status)
57	{
58	// Lazy init of all shared global sets (needed for init()'s empty text)
59	RegexStaticSets::initGlobals(&status);
60
61	fStatus = &status;
62
63	fRXPat = rxp;
64	fScanIndex = `0`;
65	fLastChar = -`1`;
66	fPeekChar = -`1`;
67	fLineNum = `1`;
68	fCharNum = `0`;
69	fQuoteMode = FALSE;
70	fInBackslashQuote = FALSE;
71	fModeFlags = fRXPat->fFlags \| `0x80000000`;
72	fEOLComments = TRUE;
73
74	fMatchOpenParen = -`1`;
75	fMatchCloseParen = -`1`;
76	fCaptureName = NULL;
77	fLastSetLiteral = U_SENTINEL;
78
79	if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
80	status = rxp->fDeferredStatus;
81	}
82	}
83
84	static const UChar chAmp = `0x26`; // '&'
85	static const UChar chDash = `0x2d`; // '-'
86
87
88	//------------------------------------------------------------------------------
89	//
90	// Destructor
91	//
92	//------------------------------------------------------------------------------
93	RegexCompile::~RegexCompile() {
94	delete fCaptureName; // Normally will be NULL, but can exist if pattern
95	// compilation stops with a syntax error.
96	}
97
98	static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
99	set->addAll(UnicodeSet ().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
100	}
101
102	//------------------------------------------------------------------------------
103	//
104	// Compile regex pattern. The state machine for rexexp pattern parsing is here.
105	// The state tables are hand-written in the file regexcst.txt,
106	// and converted to the form used here by a perl
107	// script regexcst.pl
108	//
109	//------------------------------------------------------------------------------
110	void RegexCompile::compile(
111	const UnicodeString &pat, // Source pat to be compiled.
112	UParseError &pp, // Error position info
113	UErrorCode &e) // Error Code
114	{
115	fRXPat->fPatternString = new UnicodeString (pat);
116	UText patternText = UTEXT_INITIALIZER;
117	utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
118
119	if (U_SUCCESS(e)) {
120	compile(&patternText, pp, e);
121	utext_close(&patternText);
122	}
123	}
124
125	//
126	// compile, UText mode
127	// All the work is actually done here.
128	//
129	void RegexCompile::compile(
130	UText pat, // Source pat to be compiled.*
131	UParseError &pp, // Error position info
132	UErrorCode &e) // Error Code
133	{
134	fStatus = &e;
135	fParseErr = &pp;
136	fStackPtr = `0`;
137	fStack[fStackPtr] = `0`;
138
139	if (U_FAILURE(*fStatus)) {
140	return;
141	}
142
143	// There should be no pattern stuff in the RegexPattern object. They can not be reused.
144	U_ASSERT(fRXPat->fPattern == NULL \|\| utext_nativeLength(fRXPat->fPattern) == `0`);
145
146	// Prepare the RegexPattern object to receive the compiled pattern.
147	fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
148	if (U_FAILURE(*fStatus)) {
149	return;
150	}
151
152	// Initialize the pattern scanning state machine
153	fPatternLength = utext_nativeLength(pat);
154	uint16_t state = `1`;
155	const RegexTableEl *tableEl;
156
157	// UREGEX_LITERAL force entire pattern to be treated as a literal string.
158	if (fModeFlags & UREGEX_LITERAL) {
159	fQuoteMode = TRUE;
160	}
161
162	nextChar(fC); // Fetch the first char from the pattern string.
163
164	//
165	// Main loop for the regex pattern parsing state machine.
166	// Runs once per state transition.
167	// Each time through optionally performs, depending on the state table,
168	// - an advance to the the next pattern char
169	// - an action to be performed.
170	// - pushing or popping a state to/from the local state return stack.
171	// file regexcst.txt is the source for the state table. The logic behind
172	// recongizing the pattern syntax is there, not here.
173	//
174	for (;;) {
175	// Bail out if anything has gone wrong.
176	// Regex pattern parsing stops on the first error encountered.
177	if (U_FAILURE(*fStatus)) {
178	break;
179	}
180
181	U_ASSERT(state != `0`);
182
183	// Find the state table element that matches the input char from the pattern, or the
184	// class of the input character. Start with the first table row for this
185	// state, then linearly scan forward until we find a row that matches the
186	// character. The last row for each state always matches all characters, so
187	// the search will stop there, if not before.
188	//
189	tableEl = &gRuleParseStateTable[state];
190	REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
191	fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
192
193	for (;;) { // loop through table rows belonging to this state, looking for one
194	// that matches the current input char.
195	REGEX_SCAN_DEBUG_PRINTF(("."));
196	if (tableEl->fCharClass < `127` && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
197	// Table row specified an individual character, not a set, and
198	// the input character is not quoted, and
199	// the input character matched it.
200	break;
201	}
202	if (tableEl->fCharClass == `255`) {
203	// Table row specified default, match anything character class.
204	break;
205	}
206	if (tableEl->fCharClass == `254` && fC.fQuoted) {
207	// Table row specified "quoted" and the char was quoted.
208	break;
209	}
210	if (tableEl->fCharClass == `253` && fC.fChar == (UChar32)-`1`) {
211	// Table row specified eof and we hit eof on the input.
212	break;
213	}
214
215	if (tableEl->fCharClass >= `128` && tableEl->fCharClass < `240` && // Table specs a char class &&
216	fC.fQuoted == FALSE && // char is not escaped &&
217	fC.fChar != (UChar32)-`1`) { // char is not EOF
218	U_ASSERT(tableEl->fCharClass <= `137`);
219	if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-`128`].contains(fC.fChar)) {
220	// Table row specified a character class, or set of characters,
221	// and the current char matches it.
222	break;
223	}
224	}
225
226	// No match on this row, advance to the next row for this state,
227	tableEl++;
228	}
229	REGEX_SCAN_DEBUG_PRINTF(("\n"));
230
231	//
232	// We've found the row of the state table that matches the current input
233	// character from the rules string.
234	// Perform any action specified by this row in the state table.
235	if (doParseActions(tableEl->fAction) == FALSE) {
236	// Break out of the state machine loop if the
237	// the action signalled some kind of error, or
238	// the action was to exit, occurs on normal end-of-rules-input.
239	break;
240	}
241
242	if (tableEl->fPushState != `0`) {
243	fStackPtr++;
244	if (fStackPtr >= kStackSize) {
245	error(U_REGEX_INTERNAL_ERROR);
246	REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
247	fStackPtr--;
248	}
249	fStack[fStackPtr] = tableEl->fPushState;
250	}
251
252	//
253	// NextChar. This is where characters are actually fetched from the pattern.
254	// Happens under control of the 'n' tag in the state table.
255	//
256	if (tableEl->fNextChar) {
257	nextChar(fC);
258	}
259
260	// Get the next state from the table entry, or from the
261	// state stack if the next state was specified as "pop".
262	if (tableEl->fNextState != `255`) {
263	state = tableEl->fNextState;
264	} else {
265	state = fStack[fStackPtr];
266	fStackPtr--;
267	if (fStackPtr < `0`) {
268	// state stack underflow
269	// This will occur if the user pattern has mis-matched parentheses,
270	// with extra close parens.
271	//
272	fStackPtr++;
273	error(U_REGEX_MISMATCHED_PAREN);
274	}
275	}
276
277	}
278
279	if (U_FAILURE(*fStatus)) {
280	// Bail out if the pattern had errors.
281	// Set stack cleanup: a successful compile would have left it empty,
282	// but errors can leave temporary sets hanging around.
283	while (!fSetStack.empty()) {
284	delete (UnicodeSet *)fSetStack.pop();
285	}
286	return;
287	}
288
289	//
290	// The pattern has now been read and processed, and the compiled code generated.
291	//
292
293	//
294	// The pattern's fFrameSize so far has accumulated the requirements for
295	// storage for capture parentheses, counters, etc. that are encountered
296	// in the pattern. Add space for the two variables that are always
297	// present in the saved state: the input string position (int64_t) and
298	// the position in the compiled pattern.
299	//
300	allocateStackData(RESTACKFRAME_HDRCOUNT);
301
302	//
303	// Optimization pass 1: NOPs, back-references, and case-folding
304	//
305	stripNOPs();
306
307	//
308	// Get bounds for the minimum and maximum length of a string that this
309	// pattern can match. Used to avoid looking for matches in strings that
310	// are too short.
311	//
312	fRXPat->fMinMatchLen = minMatchLength(`3`, fRXPat->fCompiledPat->size()-`1`);
313
314	//
315	// Optimization pass 2: match start type
316	//
317	matchStartType();
318
319	//
320	// Set up fast latin-1 range sets
321	//
322	int32_t numSets = fRXPat->fSets->size();
323	fRXPat->fSets8 = new Regex8BitSet[numSets];
324	// Null pointer check.
325	if (fRXPat->fSets8 == NULL) {
326	e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
327	return;
328	}
329	int32_t i;
330	for (i=`0`; i<numSets; i++) {
331	UnicodeSet s = (UnicodeSet )fRXPat->fSets->elementAt(i);
332	fRXPat->fSets8[i].init(s);
333	}
334
335	}
336
337
338
339
340
341	//------------------------------------------------------------------------------
342	//
343	// doParseAction Do some action during regex pattern parsing.
344	// Called by the parse state machine.
345	//
346	// Generation of the match engine PCode happens here, or
347	// in functions called from the parse actions defined here.
348	//
349	//
350	//------------------------------------------------------------------------------
351	UBool RegexCompile::doParseActions(int32_t action)
352	{
353	UBool returnVal = TRUE;
354
355	switch ((Regex_PatternParseAction)action) {
356
357	case doPatStart:
358	// Start of pattern compiles to:
359	//0 SAVE 2 Fall back to position of FAIL
360	//1 jmp 3
361	//2 FAIL Stop if we ever reach here.
362	//3 NOP Dummy, so start of pattern looks the same as
363	// the start of an ( grouping.
364	//4 NOP Resreved, will be replaced by a save if there are
365	// OR \| operators at the top level
366	appendOp(URX_STATE_SAVE, `2`);
367	appendOp(URX_JMP, `3`);
368	appendOp(URX_FAIL, `0`);
369
370	// Standard open nonCapture paren action emits the two NOPs and
371	// sets up the paren stack frame.
372	doParseActions(doOpenNonCaptureParen);
373	break;
374
375	case doPatFinish:
376	// We've scanned to the end of the pattern
377	// The end of pattern compiles to:
378	// URX_END
379	// which will stop the runtime match engine.
380	// Encountering end of pattern also behaves like a close paren,
381	// and forces fixups of the State Save at the beginning of the compiled pattern
382	// and of any OR operations at the top level.
383	//
384	handleCloseParen();
385	if (fParenStack.size() > `0`) {
386	// Missing close paren in pattern.
387	error(U_REGEX_MISMATCHED_PAREN);
388	}
389
390	// add the END operation to the compiled pattern.
391	appendOp(URX_END, `0`);
392
393	// Terminate the pattern compilation state machine.
394	returnVal = FALSE;
395	break;
396
397
398
399	case doOrOperator:
400	// Scanning a '\|', as in (A\|B)
401	{
402	// Generate code for any pending literals preceding the '\|'
403	fixLiterals(FALSE);
404
405	// Insert a SAVE operation at the start of the pattern section preceding
406	// this OR at this level. This SAVE will branch the match forward
407	// to the right hand side of the OR in the event that the left hand
408	// side fails to match and backtracks. Locate the position for the
409	// save from the location on the top of the parentheses stack.
410	int32_t savePosition = fParenStack.popi();
411	int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
412	U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location
413	op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+`1`);
414	fRXPat->fCompiledPat->setElementAt(op, savePosition);
415
416	// Append an JMP operation into the compiled pattern. The operand for
417	// the JMP will eventually be the location following the ')' for the
418	// group. This will be patched in later, when the ')' is encountered.
419	appendOp(URX_JMP, `0`);
420
421	// Push the position of the newly added JMP op onto the parentheses stack.
422	// This registers if for fixup when this block's close paren is encountered.
423	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, *fStatus);
424
425	// Append a NOP to the compiled pattern. This is the slot reserved
426	// for a SAVE in the event that there is yet another '\|' following
427	// this one.
428	appendOp(URX_NOP, `0`);
429	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, *fStatus);
430	}
431	break;
432
433
434	case doBeginNamedCapture:
435	// Scanning (?<letter.
436	// The first letter of the name will come through again under doConinueNamedCapture.
437	fCaptureName = new UnicodeString ();
438	if (fCaptureName == NULL) {
439	error(U_MEMORY_ALLOCATION_ERROR);
440	}
441	break;
442
443	case doContinueNamedCapture:
444	fCaptureName->append(fC.fChar);
445	break;
446
447	case doBadNamedCapture:
448	error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
449	break;
450
451	case doOpenCaptureParen:
452	// Open Capturing Paren, possibly named.
453	// Compile to a
454	// - NOP, which later may be replaced by a save-state if the
455	// parenthesized group gets a quantifier, followed by*
456	// - START_CAPTURE n where n is stack frame offset to the capture group variables.
457	// - NOP, which may later be replaced by a save-state if there
458	// is an '\|' alternation within the parens.
459	//
460	// Each capture group gets three slots in the save stack frame:
461	// 0: Capture Group start position (in input string being matched.)
462	// 1: Capture Group end position.
463	// 2: Start of Match-in-progress.
464	// The first two locations are for a completed capture group, and are
465	// referred to by back references and the like.
466	// The third location stores the capture start position when an START_CAPTURE is
467	// encountered. This will be promoted to a completed capture when (and if) the corresponding
468	// END_CAPTURE is encountered.
469	{
470	fixLiterals();
471	appendOp(URX_NOP, `0`);
472	int32_t varsLoc = allocateStackData(`3`); // Reserve three slots in match stack frame.
473	appendOp(URX_START_CAPTURE, varsLoc);
474	appendOp(URX_NOP, `0`);
475
476	// On the Parentheses stack, start a new frame and add the postions
477	// of the two NOPs. Depending on what follows in the pattern, the
478	// NOPs may be changed to SAVE_STATE or JMP ops, with a target
479	// address of the end of the parenthesized group.
480	fParenStack.push(fModeFlags, fStatus); // Match mode state*
481	fParenStack.push(capturing, fStatus); // Frame type.*
482	fParenStack.push(fRXPat->fCompiledPat->size()-`3`, fStatus); // The first NOP location*
483	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The second NOP loc*
484
485	// Save the mapping from group number to stack frame variable position.
486	fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
487
488	// If this is a named capture group, add the name->group number mapping.
489	if (fCaptureName != NULL) {
490	if (!fRXPat->initNamedCaptureMap()) {
491	if (U_SUCCESS(*fStatus)) {
492	error(fRXPat->fDeferredStatus);
493	}
494	break;
495	}
496	int32_t groupNumber = fRXPat->fGroupMap->size();
497	int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
498	fCaptureName = NULL; // hash table takes ownership of the name (key) string.
499	if (previousMapping > `0` && U_SUCCESS(*fStatus)) {
500	error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
501	}
502	}
503	}
504	break;
505
506	case doOpenNonCaptureParen:
507	// Open non-caputuring (grouping only) Paren.
508	// Compile to a
509	// - NOP, which later may be replaced by a save-state if the
510	// parenthesized group gets a quantifier, followed by*
511	// - NOP, which may later be replaced by a save-state if there
512	// is an '\|' alternation within the parens.
513	{
514	fixLiterals();
515	appendOp(URX_NOP, `0`);
516	appendOp(URX_NOP, `0`);
517
518	// On the Parentheses stack, start a new frame and add the postions
519	// of the two NOPs.
520	fParenStack.push(fModeFlags, fStatus); // Match mode state*
521	fParenStack.push(plain, fStatus); // Begin a new frame.*
522	fParenStack.push(fRXPat->fCompiledPat->size()-`2`, fStatus); // The first NOP location*
523	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The second NOP loc*
524	}
525	break;
526
527
528	case doOpenAtomicParen:
529	// Open Atomic Paren. (?>
530	// Compile to a
531	// - NOP, which later may be replaced if the parenthesized group
532	// has a quantifier, followed by
533	// - STO_SP save state stack position, so it can be restored at the ")"
534	// - NOP, which may later be replaced by a save-state if there
535	// is an '\|' alternation within the parens.
536	{
537	fixLiterals();
538	appendOp(URX_NOP, `0`);
539	int32_t varLoc = allocateData(`1`); // Reserve a data location for saving the state stack ptr.
540	appendOp(URX_STO_SP, varLoc);
541	appendOp(URX_NOP, `0`);
542
543	// On the Parentheses stack, start a new frame and add the postions
544	// of the two NOPs. Depending on what follows in the pattern, the
545	// NOPs may be changed to SAVE_STATE or JMP ops, with a target
546	// address of the end of the parenthesized group.
547	fParenStack.push(fModeFlags, fStatus); // Match mode state*
548	fParenStack.push(atomic, fStatus); // Frame type.*
549	fParenStack.push(fRXPat->fCompiledPat->size()-`3`, fStatus); // The first NOP*
550	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The second NOP*
551	}
552	break;
553
554
555	case doOpenLookAhead:
556	// Positive Look-ahead (?= stuff )
557	//
558	// Note: Addition of transparent input regions, with the need to
559	// restore the original regions when failing out of a lookahead
560	// block, complicated this sequence. Some conbined opcodes
561	// might make sense - or might not, lookahead aren't that common.
562	//
563	// Caution: min match length optimization knows about this
564	// sequence; don't change without making updates there too.
565	//
566	// Compiles to
567	// 1 LA_START dataLoc Saves SP, Input Pos, Active input region.
568	// 2. STATE_SAVE 4 on failure of lookahead, goto 4
569	// 3 JMP 6 continue ...
570	//
571	// 4. LA_END Look Ahead failed. Restore regions.
572	// 5. BACKTRACK and back track again.
573	//
574	// 6. NOP reserved for use by quantifiers on the block.
575	// Look-ahead can't have quantifiers, but paren stack
576	// compile time conventions require the slot anyhow.
577	// 7. NOP may be replaced if there is are '\|' ops in the block.
578	// 8. code for parenthesized stuff.
579	// 9. LA_END
580	//
581	// Four data slots are reserved, for saving state on entry to the look-around
582	// 0: stack pointer on entry.
583	// 1: input position on entry.
584	// 2: fActiveStart, the active bounds start on entry.
585	// 3: fActiveLimit, the active bounds limit on entry.
586	{
587	fixLiterals();
588	int32_t dataLoc = allocateData(`4`);
589	appendOp(URX_LA_START, dataLoc);
590	appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ `2`);
591	appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ `3`);
592	appendOp(URX_LA_END, dataLoc);
593	appendOp(URX_BACKTRACK, `0`);
594	appendOp(URX_NOP, `0`);
595	appendOp(URX_NOP, `0`);
596
597	// On the Parentheses stack, start a new frame and add the postions
598	// of the NOPs.
599	fParenStack.push(fModeFlags, fStatus); // Match mode state*
600	fParenStack.push(lookAhead, fStatus); // Frame type.*
601	fParenStack.push(fRXPat->fCompiledPat->size()-`2`, fStatus); // The first NOP location*
602	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The second NOP location*
603	}
604	break;
605
606	case doOpenLookAheadNeg:
607	// Negated Lookahead. (?! stuff )
608	// Compiles to
609	// 1. LA_START dataloc
610	// 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
611	// // which continues with the match.
612	// 3. NOP // Std. Open Paren sequence, for possible '\|'
613	// 4. code for parenthesized stuff.
614	// 5. LA_END // Cut back stack, remove saved state from step 2.
615	// 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
616	// 7. END_LA // Restore match region, in case look-ahead was using
617	// an alternate (transparent) region.
618	// Four data slots are reserved, for saving state on entry to the look-around
619	// 0: stack pointer on entry.
620	// 1: input position on entry.
621	// 2: fActiveStart, the active bounds start on entry.
622	// 3: fActiveLimit, the active bounds limit on entry.
623	{
624	fixLiterals();
625	int32_t dataLoc = allocateData(`4`);
626	appendOp(URX_LA_START, dataLoc);
627	appendOp(URX_STATE_SAVE, `0`); // dest address will be patched later.
628	appendOp(URX_NOP, `0`);
629
630	// On the Parentheses stack, start a new frame and add the postions
631	// of the StateSave and NOP.
632	fParenStack.push(fModeFlags, fStatus); // Match mode state*
633	fParenStack.push(negLookAhead, fStatus); // Frame type*
634	fParenStack.push(fRXPat->fCompiledPat->size()-`2`, fStatus); // The STATE_SAVE location*
635	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The second NOP location*
636
637	// Instructions #5 - #7 will be added when the ')' is encountered.
638	}
639	break;
640
641	case doOpenLookBehind:
642	{
643	// Compile a (?<= look-behind open paren.
644	//
645	// Compiles to
646	// 0 URX_LB_START dataLoc
647	// 1 URX_LB_CONT dataLoc
648	// 2 MinMatchLen
649	// 3 MaxMatchLen
650	// 4 URX_NOP Standard '(' boilerplate.
651	// 5 URX_NOP Reserved slot for use with '\|' ops within (block).
652	// 6 <code for LookBehind expression>
653	// 7 URX_LB_END dataLoc # Check match len, restore input len
654	// 8 URX_LA_END dataLoc # Restore stack, input pos
655	//
656	// Allocate a block of matcher data, to contain (when running a match)
657	// 0: Stack ptr on entry
658	// 1: Input Index on entry
659	// 2: fActiveStart, the active bounds start on entry.
660	// 3: fActiveLimit, the active bounds limit on entry.
661	// 4: Start index of match current match attempt.
662	// The first four items must match the layout of data for LA_START / LA_END
663
664	// Generate match code for any pending literals.
665	fixLiterals();
666
667	// Allocate data space
668	int32_t dataLoc = allocateData(`5`);
669
670	// Emit URX_LB_START
671	appendOp(URX_LB_START, dataLoc);
672
673	// Emit URX_LB_CONT
674	appendOp(URX_LB_CONT, dataLoc);
675	appendOp(URX_RESERVED_OP, `0`); // MinMatchLength. To be filled later.
676	appendOp(URX_RESERVED_OP, `0`); // MaxMatchLength. To be filled later.
677
678	// Emit the NOPs
679	appendOp(URX_NOP, `0`);
680	appendOp(URX_NOP, `0`);
681
682	// On the Parentheses stack, start a new frame and add the postions
683	// of the URX_LB_CONT and the NOP.
684	fParenStack.push(fModeFlags, fStatus); // Match mode state*
685	fParenStack.push(lookBehind, fStatus); // Frame type*
686	fParenStack.push(fRXPat->fCompiledPat->size()-`2`, fStatus); // The first NOP location*
687	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The 2nd NOP location*
688
689	// The final two instructions will be added when the ')' is encountered.
690	}
691
692	break;
693
694	case doOpenLookBehindNeg:
695	{
696	// Compile a (?<! negated look-behind open paren.
697	//
698	// Compiles to
699	// 0 URX_LB_START dataLoc # Save entry stack, input len
700	// 1 URX_LBN_CONT dataLoc # Iterate possible match positions
701	// 2 MinMatchLen
702	// 3 MaxMatchLen
703	// 4 continueLoc (9)
704	// 5 URX_NOP Standard '(' boilerplate.
705	// 6 URX_NOP Reserved slot for use with '\|' ops within (block).
706	// 7 <code for LookBehind expression>
707	// 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
708	// 9 ...
709	//
710	// Allocate a block of matcher data, to contain (when running a match)
711	// 0: Stack ptr on entry
712	// 1: Input Index on entry
713	// 2: fActiveStart, the active bounds start on entry.
714	// 3: fActiveLimit, the active bounds limit on entry.
715	// 4: Start index of match current match attempt.
716	// The first four items must match the layout of data for LA_START / LA_END
717
718	// Generate match code for any pending literals.
719	fixLiterals();
720
721	// Allocate data space
722	int32_t dataLoc = allocateData(`5`);
723
724	// Emit URX_LB_START
725	appendOp(URX_LB_START, dataLoc);
726
727	// Emit URX_LBN_CONT
728	appendOp(URX_LBN_CONT, dataLoc);
729	appendOp(URX_RESERVED_OP, `0`); // MinMatchLength. To be filled later.
730	appendOp(URX_RESERVED_OP, `0`); // MaxMatchLength. To be filled later.
731	appendOp(URX_RESERVED_OP, `0`); // Continue Loc. To be filled later.
732
733	// Emit the NOPs
734	appendOp(URX_NOP, `0`);
735	appendOp(URX_NOP, `0`);
736
737	// On the Parentheses stack, start a new frame and add the postions
738	// of the URX_LB_CONT and the NOP.
739	fParenStack.push(fModeFlags, fStatus); // Match mode state*
740	fParenStack.push(lookBehindN, fStatus); // Frame type*
741	fParenStack.push(fRXPat->fCompiledPat->size()-`2`, fStatus); // The first NOP location*
742	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The 2nd NOP location*
743
744	// The final two instructions will be added when the ')' is encountered.
745	}
746	break;
747
748	case doConditionalExpr:
749	// Conditionals such as (?(1)a:b)
750	case doPerlInline:
751	// Perl inline-condtionals. (?{perl code}a\|b) We're not perl, no way to do them.
752	error(U_REGEX_UNIMPLEMENTED);
753	break;
754
755
756	case doCloseParen:
757	handleCloseParen();
758	if (fParenStack.size() <= `0`) {
759	// Extra close paren, or missing open paren.
760	error(U_REGEX_MISMATCHED_PAREN);
761	}
762	break;
763
764	case doNOP:
765	break;
766
767
768	case doBadOpenParenType:
769	case doRuleError:
770	error(U_REGEX_RULE_SYNTAX);
771	break;
772
773
774	case doMismatchedParenErr:
775	error(U_REGEX_MISMATCHED_PAREN);
776	break;
777
778	case doPlus:
779	// Normal '+' compiles to
780	// 1. stuff to be repeated (already built)
781	// 2. jmp-sav 1
782	// 3. ...
783	//
784	// Or, if the item to be repeated can match a zero length string,
785	// 1. STO_INP_LOC data-loc
786	// 2. body of stuff to be repeated
787	// 3. JMP_SAV_X 2
788	// 4. ...
789
790	//
791	// Or, if the item to be repeated is simple
792	// 1. Item to be repeated.
793	// 2. LOOP_SR_I set number (assuming repeated item is a set ref)
794	// 3. LOOP_C stack location
795	{
796	int32_t topLoc = blockTopLoc(FALSE); // location of item #1
797	int32_t frameLoc;
798
799	// Check for simple constructs, which may get special optimized code.
800	if (topLoc == fRXPat->fCompiledPat->size() - `1`) {
801	int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
802
803	if (URX_TYPE(repeatedOp) == URX_SETREF) {
804	// Emit optimized code for [char set]+
805	appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
806	frameLoc = allocateStackData(`1`);
807	appendOp(URX_LOOP_C, frameLoc);
808	break;
809	}
810
811	if (URX_TYPE(repeatedOp) == URX_DOTANY \|\|
812	URX_TYPE(repeatedOp) == URX_DOTANY_ALL \|\|
813	URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
814	// Emit Optimized code for .+ operations.
815	int32_t loopOpI = buildOp(URX_LOOP_DOT_I, `0`);
816	if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
817	// URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
818	loopOpI \|= `1`;
819	}
820	if (fModeFlags & UREGEX_UNIX_LINES) {
821	loopOpI \|= `2`;
822	}
823	appendOp(loopOpI);
824	frameLoc = allocateStackData(`1`);
825	appendOp(URX_LOOP_C, frameLoc);
826	break;
827	}
828
829	}
830
831	// General case.
832
833	// Check for minimum match length of zero, which requires
834	// extra loop-breaking code.
835	if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-`1`) == `0`) {
836	// Zero length match is possible.
837	// Emit the code sequence that can handle it.
838	insertOp(topLoc);
839	frameLoc = allocateStackData(`1`);
840
841	int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
842	fRXPat->fCompiledPat->setElementAt(op, topLoc);
843
844	appendOp(URX_JMP_SAV_X, topLoc+`1`);
845	} else {
846	// Simpler code when the repeated body must match something non-empty
847	appendOp(URX_JMP_SAV, topLoc);
848	}
849	}
850	break;
851
852	case doNGPlus:
853	// Non-greedy '+?' compiles to
854	// 1. stuff to be repeated (already built)
855	// 2. state-save 1
856	// 3. ...
857	{
858	int32_t topLoc = blockTopLoc(FALSE);
859	appendOp(URX_STATE_SAVE, topLoc);
860	}
861	break;
862
863
864	case doOpt:
865	// Normal (greedy) ? quantifier.
866	// Compiles to
867	// 1. state save 3
868	// 2. body of optional block
869	// 3. ...
870	// Insert the state save into the compiled pattern, and we're done.
871	{
872	int32_t saveStateLoc = blockTopLoc(TRUE);
873	int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
874	fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
875	}
876	break;
877
878	case doNGOpt:
879	// Non-greedy ?? quantifier
880	// compiles to
881	// 1. jmp 4
882	// 2. body of optional block
883	// 3 jmp 5
884	// 4. state save 2
885	// 5 ...
886	// This code is less than ideal, with two jmps instead of one, because we can only
887	// insert one instruction at the top of the block being iterated.
888	{
889	int32_t jmp1_loc = blockTopLoc(TRUE);
890	int32_t jmp2_loc = fRXPat->fCompiledPat->size();
891
892	int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+`1`);
893	fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
894
895	appendOp(URX_JMP, jmp2_loc+`2`);
896
897	appendOp(URX_STATE_SAVE, jmp1_loc+`1`);
898	}
899	break;
900
901
902	case doStar:
903	// Normal (greedy) quantifier.*
904	// Compiles to
905	// 1. STATE_SAVE 4
906	// 2. body of stuff being iterated over
907	// 3. JMP_SAV 2
908	// 4. ...
909	//
910	// Or, if the body is a simple [Set],
911	// 1. LOOP_SR_I set number
912	// 2. LOOP_C stack location
913	// ...
914	//
915	// Or if this is a .*
916	// 1. LOOP_DOT_I (. matches all mode flag)
917	// 2. LOOP_C stack location
918	//
919	// Or, if the body can match a zero-length string, to inhibit infinite loops,
920	// 1. STATE_SAVE 5
921	// 2. STO_INP_LOC data-loc
922	// 3. body of stuff
923	// 4. JMP_SAV_X 2
924	// 5. ...
925	{
926	// location of item #1, the STATE_SAVE
927	int32_t topLoc = blockTopLoc(FALSE);
928	int32_t dataLoc = -`1`;
929
930	// Check for simple , where the construct being repeated*
931	// compiled to single opcode, and might be optimizable.
932	if (topLoc == fRXPat->fCompiledPat->size() - `1`) {
933	int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
934
935	if (URX_TYPE(repeatedOp) == URX_SETREF) {
936	// Emit optimized code for a [char set]*
937	int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
938	fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
939	dataLoc = allocateStackData(`1`);
940	appendOp(URX_LOOP_C, dataLoc);
941	break;
942	}
943
944	if (URX_TYPE(repeatedOp) == URX_DOTANY \|\|
945	URX_TYPE(repeatedOp) == URX_DOTANY_ALL \|\|
946	URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
947	// Emit Optimized code for . operations.*
948	int32_t loopOpI = buildOp(URX_LOOP_DOT_I, `0`);
949	if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
950	// URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
951	loopOpI \|= `1`;
952	}
953	if ((fModeFlags & UREGEX_UNIX_LINES) != `0`) {
954	loopOpI \|= `2`;
955	}
956	fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
957	dataLoc = allocateStackData(`1`);
958	appendOp(URX_LOOP_C, dataLoc);
959	break;
960	}
961	}
962
963	// Emit general case code for this *
964	// The optimizations did not apply.
965
966	int32_t saveStateLoc = blockTopLoc(TRUE);
967	int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+`1`);
968
969	// Check for minimum match length of zero, which requires
970	// extra loop-breaking code.
971	if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-`1`) == `0`) {
972	insertOp(saveStateLoc);
973	dataLoc = allocateStackData(`1`);
974
975	int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
976	fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+`1`);
977	jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+`2`);
978	}
979
980	// Locate the position in the compiled pattern where the match will continue
981	// after completing the . (4 or 5 in the comment above)*
982	int32_t continueLoc = fRXPat->fCompiledPat->size()+`1`;
983
984	// Put together the save state op and store it into the compiled code.
985	int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
986	fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
987
988	// Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
989	appendOp(jmpOp);
990	}
991	break;
992
993	case doNGStar:
994	// Non-greedy ? quantifier*
995	// compiles to
996	// 1. JMP 3
997	// 2. body of stuff being iterated over
998	// 3. STATE_SAVE 2
999	// 4 ...
1000	{
1001	int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
1002	int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
1003	int32_t jmpOp = buildOp(URX_JMP, saveLoc);
1004	fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1005	appendOp(URX_STATE_SAVE, jmpLoc+`1`);
1006	}
1007	break;
1008
1009
1010	case doIntervalInit:
1011	// The '{' opening an interval quantifier was just scanned.
1012	// Init the counter varaiables that will accumulate the values as the digits
1013	// are scanned.
1014	fIntervalLow = `0`;
1015	fIntervalUpper = -`1`;
1016	break;
1017
1018	case doIntevalLowerDigit:
1019	// Scanned a digit from the lower value of an {lower,upper} interval
1020	{
1021	int32_t digitValue = u_charDigitValue(fC.fChar);
1022	U_ASSERT(digitValue >= `0`);
1023	int64_t val = (int64_t)fIntervalLow*`10` + digitValue;
1024	if (val > INT32_MAX) {
1025	error(U_REGEX_NUMBER_TOO_BIG);
1026	} else {
1027	fIntervalLow = (int32_t)val;
1028	}
1029	}
1030	break;
1031
1032	case doIntervalUpperDigit:
1033	// Scanned a digit from the upper value of an {lower,upper} interval
1034	{
1035	if (fIntervalUpper < `0`) {
1036	fIntervalUpper = `0`;
1037	}
1038	int32_t digitValue = u_charDigitValue(fC.fChar);
1039	U_ASSERT(digitValue >= `0`);
1040	int64_t val = (int64_t)fIntervalUpper*`10` + digitValue;
1041	if (val > INT32_MAX) {
1042	error(U_REGEX_NUMBER_TOO_BIG);
1043	} else {
1044	fIntervalUpper = (int32_t)val;
1045	}
1046	}
1047	break;
1048
1049	case doIntervalSame:
1050	// Scanned a single value interval like {27}. Upper = Lower.
1051	fIntervalUpper = fIntervalLow;
1052	break;
1053
1054	case doInterval:
1055	// Finished scanning a normal {lower,upper} interval. Generate the code for it.
1056	if (compileInlineInterval() == FALSE) {
1057	compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1058	}
1059	break;
1060
1061	case doPossessiveInterval:
1062	// Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
1063	{
1064	// Remember the loc for the top of the block being looped over.
1065	// (Can not reserve a slot in the compiled pattern at this time, because
1066	// compileInterval needs to reserve also, and blockTopLoc can only reserve
1067	// once per block.)
1068	int32_t topLoc = blockTopLoc(FALSE);
1069
1070	// Produce normal looping code.
1071	compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1072
1073	// Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1074	// just as if the loop was inclosed in atomic parentheses.
1075
1076	// First the STO_SP before the start of the loop
1077	insertOp(topLoc);
1078
1079	int32_t varLoc = allocateData(`1`); // Reserve a data location for saving the
1080	int32_t op = buildOp(URX_STO_SP, varLoc);
1081	fRXPat->fCompiledPat->setElementAt(op, topLoc);
1082
1083	int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1084	U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1085	loopOp++; // point LoopOp after the just-inserted STO_SP
1086	fRXPat->fCompiledPat->push(loopOp, *fStatus);
1087
1088	// Then the LD_SP after the end of the loop
1089	appendOp(URX_LD_SP, varLoc);
1090	}
1091
1092	break;
1093
1094	case doNGInterval:
1095	// Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1096	compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1097	break;
1098
1099	case doIntervalError:
1100	error(U_REGEX_BAD_INTERVAL);
1101	break;
1102
1103	case doLiteralChar:
1104	// We've just scanned a "normal" character from the pattern,
1105	literalChar(fC.fChar);
1106	break;
1107
1108
1109	case doEscapedLiteralChar:
1110	// We've just scanned an backslashed escaped character with no
1111	// special meaning. It represents itself.
1112	if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != `0` &&
1113	((fC.fChar >= `0x41` && fC.fChar<= `0x5A`) \|\| // in [A-Z]
1114	(fC.fChar >= `0x61` && fC.fChar <= `0x7a`))) { // in [a-z]
1115	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1116	}
1117	literalChar(fC.fChar);
1118	break;
1119
1120
1121	case doDotAny:
1122	// scanned a ".", match any single character.
1123	{
1124	fixLiterals(FALSE);
1125	if (fModeFlags & UREGEX_DOTALL) {
1126	appendOp(URX_DOTANY_ALL, `0`);
1127	} else if (fModeFlags & UREGEX_UNIX_LINES) {
1128	appendOp(URX_DOTANY_UNIX, `0`);
1129	} else {
1130	appendOp(URX_DOTANY, `0`);
1131	}
1132	}
1133	break;
1134
1135	case doCaret:
1136	{
1137	fixLiterals(FALSE);
1138	if ( (fModeFlags & UREGEX_MULTILINE) == `0` && (fModeFlags & UREGEX_UNIX_LINES) == `0`) {
1139	appendOp(URX_CARET, `0`);
1140	} else if ((fModeFlags & UREGEX_MULTILINE) != `0` && (fModeFlags & UREGEX_UNIX_LINES) == `0`) {
1141	appendOp(URX_CARET_M, `0`);
1142	} else if ((fModeFlags & UREGEX_MULTILINE) == `0` && (fModeFlags & UREGEX_UNIX_LINES) != `0`) {
1143	appendOp(URX_CARET, `0`); // Only testing true start of input.
1144	} else if ((fModeFlags & UREGEX_MULTILINE) != `0` && (fModeFlags & UREGEX_UNIX_LINES) != `0`) {
1145	appendOp(URX_CARET_M_UNIX, `0`);
1146	}
1147	}
1148	break;
1149
1150	case doDollar:
1151	{
1152	fixLiterals(FALSE);
1153	if ( (fModeFlags & UREGEX_MULTILINE) == `0` && (fModeFlags & UREGEX_UNIX_LINES) == `0`) {
1154	appendOp(URX_DOLLAR, `0`);
1155	} else if ((fModeFlags & UREGEX_MULTILINE) != `0` && (fModeFlags & UREGEX_UNIX_LINES) == `0`) {
1156	appendOp(URX_DOLLAR_M, `0`);
1157	} else if ((fModeFlags & UREGEX_MULTILINE) == `0` && (fModeFlags & UREGEX_UNIX_LINES) != `0`) {
1158	appendOp(URX_DOLLAR_D, `0`);
1159	} else if ((fModeFlags & UREGEX_MULTILINE) != `0` && (fModeFlags & UREGEX_UNIX_LINES) != `0`) {
1160	appendOp(URX_DOLLAR_MD, `0`);
1161	}
1162	}
1163	break;
1164
1165	case doBackslashA:
1166	fixLiterals(FALSE);
1167	appendOp(URX_CARET, `0`);
1168	break;
1169
1170	case doBackslashB:
1171	{
1172	#if UCONFIG_NO_BREAK_ITERATION==1
1173	if (fModeFlags & UREGEX_UWORD) {
1174	error(U_UNSUPPORTED_ERROR);
1175	}
1176	#endif
1177	fixLiterals(FALSE);
1178	int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1179	appendOp(op, `1`);
1180	}
1181	break;
1182
1183	case doBackslashb:
1184	{
1185	#if UCONFIG_NO_BREAK_ITERATION==1
1186	if (fModeFlags & UREGEX_UWORD) {
1187	error(U_UNSUPPORTED_ERROR);
1188	}
1189	#endif
1190	fixLiterals(FALSE);
1191	int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1192	appendOp(op, `0`);
1193	}
1194	break;
1195
1196	case doBackslashD:
1197	fixLiterals(FALSE);
1198	appendOp(URX_BACKSLASH_D, `1`);
1199	break;
1200
1201	case doBackslashd:
1202	fixLiterals(FALSE);
1203	appendOp(URX_BACKSLASH_D, `0`);
1204	break;
1205
1206	case doBackslashG:
1207	fixLiterals(FALSE);
1208	appendOp(URX_BACKSLASH_G, `0`);
1209	break;
1210
1211	case doBackslashH:
1212	fixLiterals(FALSE);
1213	appendOp(URX_BACKSLASH_H, `1`);
1214	break;
1215
1216	case doBackslashh:
1217	fixLiterals(FALSE);
1218	appendOp(URX_BACKSLASH_H, `0`);
1219	break;
1220
1221	case doBackslashR:
1222	fixLiterals(FALSE);
1223	appendOp(URX_BACKSLASH_R, `0`);
1224	break;
1225
1226	case doBackslashS:
1227	fixLiterals(FALSE);
1228	appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1229	break;
1230
1231	case doBackslashs:
1232	fixLiterals(FALSE);
1233	appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1234	break;
1235
1236	case doBackslashV:
1237	fixLiterals(FALSE);
1238	appendOp(URX_BACKSLASH_V, `1`);
1239	break;
1240
1241	case doBackslashv:
1242	fixLiterals(FALSE);
1243	appendOp(URX_BACKSLASH_V, `0`);
1244	break;
1245
1246	case doBackslashW:
1247	fixLiterals(FALSE);
1248	appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1249	break;
1250
1251	case doBackslashw:
1252	fixLiterals(FALSE);
1253	appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1254	break;
1255
1256	case doBackslashX:
1257	#if UCONFIG_NO_BREAK_ITERATION==1
1258	// Grapheme Cluster Boundary requires ICU break iteration.
1259	error(U_UNSUPPORTED_ERROR);
1260	#endif
1261	fixLiterals(FALSE);
1262	appendOp(URX_BACKSLASH_X, `0`);
1263	break;
1264
1265	case doBackslashZ:
1266	fixLiterals(FALSE);
1267	appendOp(URX_DOLLAR, `0`);
1268	break;
1269
1270	case doBackslashz:
1271	fixLiterals(FALSE);
1272	appendOp(URX_BACKSLASH_Z, `0`);
1273	break;
1274
1275	case doEscapeError:
1276	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1277	break;
1278
1279	case doExit:
1280	fixLiterals(FALSE);
1281	returnVal = FALSE;
1282	break;
1283
1284	case doProperty:
1285	{
1286	fixLiterals(FALSE);
1287	UnicodeSet *theSet = scanProp();
1288	compileSet(theSet);
1289	}
1290	break;
1291
1292	case doNamedChar:
1293	{
1294	UChar32 c = scanNamedChar();
1295	literalChar(c);
1296	}
1297	break;
1298
1299
1300	case doBackRef:
1301	// BackReference. Somewhat unusual in that the front-end can not completely parse
1302	// the regular expression, because the number of digits to be consumed
1303	// depends on the number of capture groups that have been defined. So
1304	// we have to do it here instead.
1305	{
1306	int32_t numCaptureGroups = fRXPat->fGroupMap->size();
1307	int32_t groupNum = `0`;
1308	UChar32 c = fC.fChar;
1309
1310	for (;;) {
1311	// Loop once per digit, for max allowed number of digits in a back reference.
1312	int32_t digit = u_charDigitValue(c);
1313	groupNum = groupNum * `10` + digit;
1314	if (groupNum >= numCaptureGroups) {
1315	break;
1316	}
1317	c = peekCharLL();
1318	if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1319	break;
1320	}
1321	nextCharLL();
1322	}
1323
1324	// Scan of the back reference in the source regexp is complete. Now generate
1325	// the compiled code for it.
1326	// Because capture groups can be forward-referenced by back-references,
1327	// we fill the operand with the capture group number. At the end
1328	// of compilation, it will be changed to the variable's location.
1329	U_ASSERT(groupNum > `0`); // Shouldn't happen. '\0' begins an octal escape sequence,
1330	// and shouldn't enter this code path at all.
1331	fixLiterals(FALSE);
1332	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1333	appendOp(URX_BACKREF_I, groupNum);
1334	} else {
1335	appendOp(URX_BACKREF, groupNum);
1336	}
1337	}
1338	break;
1339
1340	case doBeginNamedBackRef:
1341	U_ASSERT(fCaptureName == NULL);
1342	fCaptureName = new UnicodeString;
1343	if (fCaptureName == NULL) {
1344	error(U_MEMORY_ALLOCATION_ERROR);
1345	}
1346	break;
1347
1348	case doContinueNamedBackRef:
1349	fCaptureName->append(fC.fChar);
1350	break;
1351
1352	case doCompleteNamedBackRef:
1353	{
1354	int32_t groupNumber =
1355	fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : `0`;
1356	if (groupNumber == `0`) {
1357	// Group name has not been defined.
1358	// Could be a forward reference. If we choose to support them at some
1359	// future time, extra mechanism will be required at this point.
1360	error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1361	} else {
1362	// Given the number, handle identically to a \n numbered back reference.
1363	// See comments above, under doBackRef
1364	fixLiterals(FALSE);
1365	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1366	appendOp(URX_BACKREF_I, groupNumber);
1367	} else {
1368	appendOp(URX_BACKREF, groupNumber);
1369	}
1370	}
1371	delete fCaptureName;
1372	fCaptureName = NULL;
1373	break;
1374	}
1375
1376	case doPossessivePlus:
1377	// Possessive ++ quantifier.
1378	// Compiles to
1379	// 1. STO_SP
1380	// 2. body of stuff being iterated over
1381	// 3. STATE_SAVE 5
1382	// 4. JMP 2
1383	// 5. LD_SP
1384	// 6. ...
1385	//
1386	// Note: TODO: This is pretty inefficient. A mass of saved state is built up
1387	// then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056
1388	//
1389	{
1390	// Emit the STO_SP
1391	int32_t topLoc = blockTopLoc(TRUE);
1392	int32_t stoLoc = allocateData(`1`); // Reserve the data location for storing save stack ptr.
1393	int32_t op = buildOp(URX_STO_SP, stoLoc);
1394	fRXPat->fCompiledPat->setElementAt(op, topLoc);
1395
1396	// Emit the STATE_SAVE
1397	appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+`2`);
1398
1399	// Emit the JMP
1400	appendOp(URX_JMP, topLoc+`1`);
1401
1402	// Emit the LD_SP
1403	appendOp(URX_LD_SP, stoLoc);
1404	}
1405	break;
1406
1407	case doPossessiveStar:
1408	// Possessive + quantifier.*
1409	// Compiles to
1410	// 1. STO_SP loc
1411	// 2. STATE_SAVE 5
1412	// 3. body of stuff being iterated over
1413	// 4. JMP 2
1414	// 5. LD_SP loc
1415	// 6 ...
1416	// TODO: do something to cut back the state stack each time through the loop.
1417	{
1418	// Reserve two slots at the top of the block.
1419	int32_t topLoc = blockTopLoc(TRUE);
1420	insertOp(topLoc);
1421
1422	// emit STO_SP loc
1423	int32_t stoLoc = allocateData(`1`); // Reserve the data location for storing save stack ptr.
1424	int32_t op = buildOp(URX_STO_SP, stoLoc);
1425	fRXPat->fCompiledPat->setElementAt(op, topLoc);
1426
1427	// Emit the SAVE_STATE 5
1428	int32_t L7 = fRXPat->fCompiledPat->size()+`1`;
1429	op = buildOp(URX_STATE_SAVE, L7);
1430	fRXPat->fCompiledPat->setElementAt(op, topLoc+`1`);
1431
1432	// Append the JMP operation.
1433	appendOp(URX_JMP, topLoc+`1`);
1434
1435	// Emit the LD_SP loc
1436	appendOp(URX_LD_SP, stoLoc);
1437	}
1438	break;
1439
1440	case doPossessiveOpt:
1441	// Possessive ?+ quantifier.
1442	// Compiles to
1443	// 1. STO_SP loc
1444	// 2. SAVE_STATE 5
1445	// 3. body of optional block
1446	// 4. LD_SP loc
1447	// 5. ...
1448	//
1449	{
1450	// Reserve two slots at the top of the block.
1451	int32_t topLoc = blockTopLoc(TRUE);
1452	insertOp(topLoc);
1453
1454	// Emit the STO_SP
1455	int32_t stoLoc = allocateData(`1`); // Reserve the data location for storing save stack ptr.
1456	int32_t op = buildOp(URX_STO_SP, stoLoc);
1457	fRXPat->fCompiledPat->setElementAt(op, topLoc);
1458
1459	// Emit the SAVE_STATE
1460	int32_t continueLoc = fRXPat->fCompiledPat->size()+`1`;
1461	op = buildOp(URX_STATE_SAVE, continueLoc);
1462	fRXPat->fCompiledPat->setElementAt(op, topLoc+`1`);
1463
1464	// Emit the LD_SP
1465	appendOp(URX_LD_SP, stoLoc);
1466	}
1467	break;
1468
1469
1470	case doBeginMatchMode:
1471	fNewModeFlags = fModeFlags;
1472	fSetModeFlag = TRUE;
1473	break;
1474
1475	case doMatchMode: // (?i) and similar
1476	{
1477	int32_t bit = `0`;
1478	switch (fC.fChar) {
1479	case `0x69`: / 'i' / bit = UREGEX_CASE_INSENSITIVE; break;
1480	case `0x64`: / 'd' / bit = UREGEX_UNIX_LINES; break;
1481	case `0x6d`: / 'm' / bit = UREGEX_MULTILINE; break;
1482	case `0x73`: / 's' / bit = UREGEX_DOTALL; break;
1483	case `0x75`: / 'u' / bit = `0`; / Unicode casing / break;
1484	case `0x77`: / 'w' / bit = UREGEX_UWORD; break;
1485	case `0x78`: / 'x' / bit = UREGEX_COMMENTS; break;
1486	case `0x2d`: / '-' / fSetModeFlag = FALSE; break;
1487	default:
1488	UPRV_UNREACHABLE; // Should never happen. Other chars are filtered out
1489	// by the scanner.
1490	}
1491	if (fSetModeFlag) {
1492	fNewModeFlags \|= bit;
1493	} else {
1494	fNewModeFlags &= ~bit;
1495	}
1496	}
1497	break;
1498
1499	case doSetMatchMode:
1500	// Emit code to match any pending literals, using the not-yet changed match mode.
1501	fixLiterals();
1502
1503	// We've got a (?i) or similar. The match mode is being changed, but
1504	// the change is not scoped to a parenthesized block.
1505	U_ASSERT(fNewModeFlags < `0`);
1506	fModeFlags = fNewModeFlags;
1507
1508	break;
1509
1510
1511	case doMatchModeParen:
1512	// We've got a (?i: or similar. Begin a parenthesized block, save old
1513	// mode flags so they can be restored at the close of the block.
1514	//
1515	// Compile to a
1516	// - NOP, which later may be replaced by a save-state if the
1517	// parenthesized group gets a quantifier, followed by*
1518	// - NOP, which may later be replaced by a save-state if there
1519	// is an '\|' alternation within the parens.
1520	{
1521	fixLiterals(FALSE);
1522	appendOp(URX_NOP, `0`);
1523	appendOp(URX_NOP, `0`);
1524
1525	// On the Parentheses stack, start a new frame and add the postions
1526	// of the two NOPs (a normal non-capturing () frame, except for the
1527	// saving of the orignal mode flags.)
1528	fParenStack.push(fModeFlags, *fStatus);
1529	fParenStack.push(flags, fStatus); // Frame Marker*
1530	fParenStack.push(fRXPat->fCompiledPat->size()-`2`, fStatus); // The first NOP*
1531	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The second NOP*
1532
1533	// Set the current mode flags to the new values.
1534	U_ASSERT(fNewModeFlags < `0`);
1535	fModeFlags = fNewModeFlags;
1536	}
1537	break;
1538
1539	case doBadModeFlag:
1540	error(U_REGEX_INVALID_FLAG);
1541	break;
1542
1543	case doSuppressComments:
1544	// We have just scanned a '(?'. We now need to prevent the character scanner from
1545	// treating a '#' as a to-the-end-of-line comment.
1546	// (This Perl compatibility just gets uglier and uglier to do...)
1547	fEOLComments = FALSE;
1548	break;
1549
1550
1551	case doSetAddAmp:
1552	{
1553	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1554	set->add(chAmp);
1555	}
1556	break;
1557
1558	case doSetAddDash:
1559	{
1560	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1561	set->add(chDash);
1562	}
1563	break;
1564
1565	case doSetBackslash_s:
1566	{
1567	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1568	set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1569	break;
1570	}
1571
1572	case doSetBackslash_S:
1573	{
1574	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1575	UnicodeSet SSet;
1576	SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]).complement();
1577	set->addAll(SSet);
1578	break;
1579	}
1580
1581	case doSetBackslash_d:
1582	{
1583	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1584	// TODO - make a static set, ticket 6058.
1585	addCategory(set, U_GC_ND_MASK, *fStatus);
1586	break;
1587	}
1588
1589	case doSetBackslash_D:
1590	{
1591	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1592	UnicodeSet digits;
1593	// TODO - make a static set, ticket 6058.
1594	digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1595	digits.complement();
1596	set->addAll(digits);
1597	break;
1598	}
1599
1600	case doSetBackslash_h:
1601	{
1602	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1603	UnicodeSet h;
1604	h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1605	h.add((UChar32)`9`); // Tab
1606	set->addAll(h);
1607	break;
1608	}
1609
1610	case doSetBackslash_H:
1611	{
1612	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1613	UnicodeSet h;
1614	h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1615	h.add((UChar32)`9`); // Tab
1616	h.complement();
1617	set->addAll(h);
1618	break;
1619	}
1620
1621	case doSetBackslash_v:
1622	{
1623	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1624	set->add((UChar32)`0x0a`, (UChar32)`0x0d`); // add range
1625	set->add((UChar32)`0x85`);
1626	set->add((UChar32)`0x2028`, (UChar32)`0x2029`);
1627	break;
1628	}
1629
1630	case doSetBackslash_V:
1631	{
1632	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1633	UnicodeSet v;
1634	v.add((UChar32)`0x0a`, (UChar32)`0x0d`); // add range
1635	v.add((UChar32)`0x85`);
1636	v.add((UChar32)`0x2028`, (UChar32)`0x2029`);
1637	v.complement();
1638	set->addAll(v);
1639	break;
1640	}
1641
1642	case doSetBackslash_w:
1643	{
1644	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1645	set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1646	break;
1647	}
1648
1649	case doSetBackslash_W:
1650	{
1651	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1652	UnicodeSet SSet;
1653	SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]).complement();
1654	set->addAll(SSet);
1655	break;
1656	}
1657
1658	case doSetBegin:
1659	fixLiterals(FALSE);
1660	fSetStack.push(new UnicodeSet (), *fStatus);
1661	fSetOpStack.push(setStart, *fStatus);
1662	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != `0`) {
1663	fSetOpStack.push(setCaseClose, *fStatus);
1664	}
1665	break;
1666
1667	case doSetBeginDifference1:
1668	// We have scanned something like [[abc]-[
1669	// Set up a new UnicodeSet for the set beginning with the just-scanned '['
1670	// Push a Difference operator, which will cause the new set to be subtracted from what
1671	// went before once it is created.
1672	setPushOp(setDifference1);
1673	fSetOpStack.push(setStart, *fStatus);
1674	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != `0`) {
1675	fSetOpStack.push(setCaseClose, *fStatus);
1676	}
1677	break;
1678
1679	case doSetBeginIntersection1:
1680	// We have scanned something like [[abc]&[
1681	// Need both the '&' operator and the open '[' operator.
1682	setPushOp(setIntersection1);
1683	fSetOpStack.push(setStart, *fStatus);
1684	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != `0`) {
1685	fSetOpStack.push(setCaseClose, *fStatus);
1686	}
1687	break;
1688
1689	case doSetBeginUnion:
1690	// We have scanned something like [[abc][
1691	// Need to handle the union operation explicitly [[abc] \| [
1692	setPushOp(setUnion);
1693	fSetOpStack.push(setStart, *fStatus);
1694	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != `0`) {
1695	fSetOpStack.push(setCaseClose, *fStatus);
1696	}
1697	break;
1698
1699	case doSetDifference2:
1700	// We have scanned something like [abc--
1701	// Consider this to unambiguously be a set difference operator.
1702	setPushOp(setDifference2);
1703	break;
1704
1705	case doSetEnd:
1706	// Have encountered the ']' that closes a set.
1707	// Force the evaluation of any pending operations within this set,
1708	// leave the completed set on the top of the set stack.
1709	setEval(setEnd);
1710	U_ASSERT(fSetOpStack.peeki()==setStart);
1711	fSetOpStack.popi();
1712	break;
1713
1714	case doSetFinish:
1715	{
1716	// Finished a complete set expression, including all nested sets.
1717	// The close bracket has already triggered clearing out pending set operators,
1718	// the operator stack should be empty and the operand stack should have just
1719	// one entry, the result set.
1720	U_ASSERT(fSetOpStack.empty());
1721	UnicodeSet theSet = (UnicodeSet )fSetStack.pop();
1722	U_ASSERT(fSetStack.empty());
1723	compileSet(theSet);
1724	break;
1725	}
1726
1727	case doSetIntersection2:
1728	// Have scanned something like [abc&&
1729	setPushOp(setIntersection2);
1730	break;
1731
1732	case doSetLiteral:
1733	// Union the just-scanned literal character into the set being built.
1734	// This operation is the highest precedence set operation, so we can always do
1735	// it immediately, without waiting to see what follows. It is necessary to perform
1736	// any pending '-' or '&' operation first, because these have the same precedence
1737	// as union-ing in a literal'
1738	{
1739	setEval(setUnion);
1740	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1741	s->add(fC.fChar);
1742	fLastSetLiteral = fC.fChar;
1743	break;
1744	}
1745
1746	case doSetLiteralEscaped:
1747	// A back-slash escaped literal character was encountered.
1748	// Processing is the same as with setLiteral, above, with the addition of
1749	// the optional check for errors on escaped ASCII letters.
1750	{
1751	if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != `0` &&
1752	((fC.fChar >= `0x41` && fC.fChar<= `0x5A`) \|\| // in [A-Z]
1753	(fC.fChar >= `0x61` && fC.fChar <= `0x7a`))) { // in [a-z]
1754	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1755	}
1756	setEval(setUnion);
1757	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1758	s->add(fC.fChar);
1759	fLastSetLiteral = fC.fChar;
1760	break;
1761	}
1762
1763	case doSetNamedChar:
1764	// Scanning a \N{UNICODE CHARACTER NAME}
1765	// Aside from the source of the character, the processing is identical to doSetLiteral,
1766	// above.
1767	{
1768	UChar32 c = scanNamedChar();
1769	setEval(setUnion);
1770	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1771	s->add(c);
1772	fLastSetLiteral = c;
1773	break;
1774	}
1775
1776	case doSetNamedRange:
1777	// We have scanned literal-\N{CHAR NAME}. Add the range to the set.
1778	// The left character is already in the set, and is saved in fLastSetLiteral.
1779	// The right side needs to be picked up, the scan is at the 'N'.
1780	// Lower Limit > Upper limit being an error matches both Java
1781	// and ICU UnicodeSet behavior.
1782	{
1783	UChar32 c = scanNamedChar();
1784	if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL \|\| fLastSetLiteral > c)) {
1785	error(U_REGEX_INVALID_RANGE);
1786	}
1787	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1788	s->add(fLastSetLiteral, c);
1789	fLastSetLiteral = c;
1790	break;
1791	}
1792
1793
1794	case doSetNegate:
1795	// Scanned a '^' at the start of a set.
1796	// Push the negation operator onto the set op stack.
1797	// A twist for case-insensitive matching:
1798	// the case closure operation must happen _before_ negation.
1799	// But the case closure operation will already be on the stack if it's required.
1800	// This requires checking for case closure, and swapping the stack order
1801	// if it is present.
1802	{
1803	int32_t tosOp = fSetOpStack.peeki();
1804	if (tosOp == setCaseClose) {
1805	fSetOpStack.popi();
1806	fSetOpStack.push(setNegation, *fStatus);
1807	fSetOpStack.push(setCaseClose, *fStatus);
1808	} else {
1809	fSetOpStack.push(setNegation, *fStatus);
1810	}
1811	}
1812	break;
1813
1814	case doSetNoCloseError:
1815	error(U_REGEX_MISSING_CLOSE_BRACKET);
1816	break;
1817
1818	case doSetOpError:
1819	error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal.
1820	break;
1821
1822	case doSetPosixProp:
1823	{
1824	UnicodeSet *s = scanPosixProp();
1825	if (s != NULL) {
1826	UnicodeSet tos = (UnicodeSet )fSetStack.peek();
1827	tos->addAll(*s);
1828	delete s;
1829	} // else error. scanProp() reported the error status already.
1830	}
1831	break;
1832
1833	case doSetProp:
1834	// Scanned a \p \P within [brackets].
1835	{
1836	UnicodeSet *s = scanProp();
1837	if (s != NULL) {
1838	UnicodeSet tos = (UnicodeSet )fSetStack.peek();
1839	tos->addAll(*s);
1840	delete s;
1841	} // else error. scanProp() reported the error status already.
1842	}
1843	break;
1844
1845
1846	case doSetRange:
1847	// We have scanned literal-literal. Add the range to the set.
1848	// The left character is already in the set, and is saved in fLastSetLiteral.
1849	// The right side is the current character.
1850	// Lower Limit > Upper limit being an error matches both Java
1851	// and ICU UnicodeSet behavior.
1852	{
1853
1854	if (fLastSetLiteral == U_SENTINEL \|\| fLastSetLiteral > fC.fChar) {
1855	error(U_REGEX_INVALID_RANGE);
1856	}
1857	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1858	s->add(fLastSetLiteral, fC.fChar);
1859	break;
1860	}
1861
1862	default:
1863	UPRV_UNREACHABLE;
1864	}
1865
1866	if (U_FAILURE(*fStatus)) {
1867	returnVal = FALSE;
1868	}
1869
1870	return returnVal;
1871	}
1872
1873
1874
1875	//------------------------------------------------------------------------------
1876	//
1877	// literalChar We've encountered a literal character from the pattern,
1878	// or an escape sequence that reduces to a character.
1879	// Add it to the string containing all literal chars/strings from
1880	// the pattern.
1881	//
1882	//------------------------------------------------------------------------------
1883	void RegexCompile::literalChar(UChar32 c) {
1884	fLiteralChars.append(c);
1885	}
1886
1887
1888	//------------------------------------------------------------------------------
1889	//
1890	// fixLiterals When compiling something that can follow a literal
1891	// string in a pattern, emit the code to match the
1892	// accumulated literal string.
1893	//
1894	// Optionally, split the last char of the string off into
1895	// a single "ONE_CHAR" operation, so that quantifiers can
1896	// apply to that char alone. Example: abc*
1897	// The must apply to the 'c' only.*
1898	//
1899	//------------------------------------------------------------------------------
1900	void RegexCompile::fixLiterals(UBool split) {
1901
1902	// If no literal characters have been scanned but not yet had code generated
1903	// for them, nothing needs to be done.
1904	if (fLiteralChars.length() == `0`) {
1905	return;
1906	}
1907
1908	int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -`1`);
1909	UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1910
1911	// Split: We need to ensure that the last item in the compiled pattern
1912	// refers only to the last literal scanned in the pattern, so that
1913	// quantifiers (, +, etc.) affect only it, and not a longer string.*
1914	// Split before case folding for case insensitive matches.
1915
1916	if (split) {
1917	fLiteralChars.truncate(indexOfLastCodePoint);
1918	fixLiterals(FALSE); // Recursive call, emit code to match the first part of the string.
1919	// Note that the truncated literal string may be empty, in which case
1920	// nothing will be emitted.
1921
1922	literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal.
1923	fixLiterals(FALSE); // Second recursive call, code for the final code point.
1924	return;
1925	}
1926
1927	// If we are doing case-insensitive matching, case fold the string. This may expand
1928	// the string, e.g. the German sharp-s turns into "ss"
1929	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1930	fLiteralChars.foldCase();
1931	indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -`1`);
1932	lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1933	}
1934
1935	if (indexOfLastCodePoint == `0`) {
1936	// Single character, emit a URX_ONECHAR op to match it.
1937	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1938	u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1939	appendOp(URX_ONECHAR_I, lastCodePoint);
1940	} else {
1941	appendOp(URX_ONECHAR, lastCodePoint);
1942	}
1943	} else {
1944	// Two or more chars, emit a URX_STRING to match them.
1945	if (fLiteralChars.length() > `0x00ffffff` \|\| fRXPat->fLiteralText.length() > `0x00ffffff`) {
1946	error(U_REGEX_PATTERN_TOO_BIG);
1947	}
1948	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1949	appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1950	} else {
1951	// TODO here: add optimization to split case sensitive strings of length two
1952	// into two single char ops, for efficiency.
1953	appendOp(URX_STRING, fRXPat->fLiteralText.length());
1954	}
1955	appendOp(URX_STRING_LEN, fLiteralChars.length());
1956
1957	// Add this string into the accumulated strings of the compiled pattern.
1958	fRXPat->fLiteralText.append(fLiteralChars);
1959	}
1960
1961	fLiteralChars.remove();
1962	}
1963
1964
1965	int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1966	if (U_FAILURE(*fStatus)) {
1967	return `0`;
1968	}
1969	if (type < `0` \|\| type > `255`) {
1970	UPRV_UNREACHABLE;
1971	}
1972	if (val > `0x00ffffff`) {
1973	UPRV_UNREACHABLE;
1974	}
1975	if (val < `0`) {
1976	if (!(type == URX_RESERVED_OP_N \|\| type == URX_RESERVED_OP)) {
1977	UPRV_UNREACHABLE;
1978	}
1979	if (URX_TYPE(val) != `0xff`) {
1980	UPRV_UNREACHABLE;
1981	}
1982	type = URX_RESERVED_OP_N;
1983	}
1984	return (type << `24`) \| val;
1985	}
1986
1987
1988	//------------------------------------------------------------------------------
1989	//
1990	// appendOp() Append a new instruction onto the compiled pattern
1991	// Includes error checking, limiting the size of the
1992	// pattern to lengths that can be represented in the
1993	// 24 bit operand field of an instruction.
1994	//
1995	//------------------------------------------------------------------------------
1996	void RegexCompile::appendOp(int32_t op) {
1997	if (U_FAILURE(*fStatus)) {
1998	return;
1999	}
2000	fRXPat->fCompiledPat->addElement(op, *fStatus);
2001	if ((fRXPat->fCompiledPat->size() > `0x00fffff0`) && U_SUCCESS(*fStatus)) {
2002	error(U_REGEX_PATTERN_TOO_BIG);
2003	}
2004	}
2005
2006	void RegexCompile::appendOp(int32_t type, int32_t val) {
2007	appendOp(buildOp(type, val));
2008	}
2009
2010
2011	//------------------------------------------------------------------------------
2012	//
2013	// insertOp() Insert a slot for a new opcode into the already
2014	// compiled pattern code.
2015	//
2016	// Fill the slot with a NOP. Our caller will replace it
2017	// with what they really wanted.
2018	//
2019	//------------------------------------------------------------------------------
2020	void RegexCompile::insertOp(int32_t where) {
2021	UVector64 *code = fRXPat->fCompiledPat;
2022	U_ASSERT(where>`0` && where < code->size());
2023
2024	int32_t nop = buildOp(URX_NOP, `0`);
2025	code->insertElementAt(nop, where, *fStatus);
2026
2027	// Walk through the pattern, looking for any ops with targets that
2028	// were moved down by the insert. Fix them.
2029	int32_t loc;
2030	for (loc=`0`; loc<code->size(); loc++) {
2031	int32_t op = (int32_t)code->elementAti(loc);
2032	int32_t opType = URX_TYPE(op);
2033	int32_t opValue = URX_VAL(op);
2034	if ((opType == URX_JMP \|\|
2035	opType == URX_JMPX \|\|
2036	opType == URX_STATE_SAVE \|\|
2037	opType == URX_CTR_LOOP \|\|
2038	opType == URX_CTR_LOOP_NG \|\|
2039	opType == URX_JMP_SAV \|\|
2040	opType == URX_JMP_SAV_X \|\|
2041	opType == URX_RELOC_OPRND) && opValue > where) {
2042	// Target location for this opcode is after the insertion point and
2043	// needs to be incremented to adjust for the insertion.
2044	opValue++;
2045	op = buildOp(opType, opValue);
2046	code->setElementAt(op, loc);
2047	}
2048	}
2049
2050	// Now fix up the parentheses stack. All positive values in it are locations in
2051	// the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
2052	for (loc=`0`; loc<fParenStack.size(); loc++) {
2053	int32_t x = fParenStack.elementAti(loc);
2054	U_ASSERT(x < code->size());
2055	if (x>where) {
2056	x++;
2057	fParenStack.setElementAt(x, loc);
2058	}
2059	}
2060
2061	if (fMatchCloseParen > where) {
2062	fMatchCloseParen++;
2063	}
2064	if (fMatchOpenParen > where) {
2065	fMatchOpenParen++;
2066	}
2067	}
2068
2069
2070	//------------------------------------------------------------------------------
2071	//
2072	// allocateData() Allocate storage in the matcher's static data area.
2073	// Return the index for the newly allocated data.
2074	// The storage won't actually exist until we are running a match
2075	// operation, but the storage indexes are inserted into various
2076	// opcodes while compiling the pattern.
2077	//
2078	//------------------------------------------------------------------------------
2079	int32_t RegexCompile::allocateData(int32_t size) {
2080	if (U_FAILURE(*fStatus)) {
2081	return `0`;
2082	}
2083	if (size <= `0` \|\| size > `0x100` \|\| fRXPat->fDataSize < `0`) {
2084	error(U_REGEX_INTERNAL_ERROR);
2085	return `0`;
2086	}
2087	int32_t dataIndex = fRXPat->fDataSize;
2088	fRXPat->fDataSize += size;
2089	if (fRXPat->fDataSize >= `0x00fffff0`) {
2090	error(U_REGEX_INTERNAL_ERROR);
2091	}
2092	return dataIndex;
2093	}
2094
2095
2096	//------------------------------------------------------------------------------
2097	//
2098	// allocateStackData() Allocate space in the back-tracking stack frame.
2099	// Return the index for the newly allocated data.
2100	// The frame indexes are inserted into various
2101	// opcodes while compiling the pattern, meaning that frame
2102	// size must be restricted to the size that will fit
2103	// as an operand (24 bits).
2104	//
2105	//------------------------------------------------------------------------------
2106	int32_t RegexCompile::allocateStackData(int32_t size) {
2107	if (U_FAILURE(*fStatus)) {
2108	return `0`;
2109	}
2110	if (size <= `0` \|\| size > `0x100` \|\| fRXPat->fFrameSize < `0`) {
2111	error(U_REGEX_INTERNAL_ERROR);
2112	return `0`;
2113	}
2114	int32_t dataIndex = fRXPat->fFrameSize;
2115	fRXPat->fFrameSize += size;
2116	if (fRXPat->fFrameSize >= `0x00fffff0`) {
2117	error(U_REGEX_PATTERN_TOO_BIG);
2118	}
2119	return dataIndex;
2120	}
2121
2122
2123	//------------------------------------------------------------------------------
2124	//
2125	// blockTopLoc() Find or create a location in the compiled pattern
2126	// at the start of the operation or block that has
2127	// just been compiled. Needed when a quantifier ( or*
2128	// whatever) appears, and we need to add an operation
2129	// at the start of the thing being quantified.
2130	//
2131	// (Parenthesized Blocks) have a slot with a NOP that
2132	// is reserved for this purpose. . or similar don't*
2133	// and a slot needs to be added.
2134	//
2135	// parameter reserveLoc : TRUE - ensure that there is space to add an opcode
2136	// at the returned location.
2137	// FALSE - just return the address,
2138	// do not reserve a location there.
2139	//
2140	//------------------------------------------------------------------------------
2141	int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
2142	int32_t theLoc;
2143	fixLiterals(TRUE); // Emit code for any pending literals.
2144	// If last item was a string, emit separate op for the its last char.
2145	if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2146	{
2147	// The item just processed is a parenthesized block.
2148	theLoc = fMatchOpenParen; // A slot is already reserved for us.
2149	U_ASSERT(theLoc > `0`);
2150	U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2151	}
2152	else {
2153	// Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2154	// No slot for STATE_SAVE was pre-reserved in the compiled code.
2155	// We need to make space now.
2156	theLoc = fRXPat->fCompiledPat->size()-`1`;
2157	int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2158	if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2159	// Strings take two opcode, we want the position of the first one.
2160	// We can have a string at this point if a single character case-folded to two.
2161	theLoc--;
2162	}
2163	if (reserveLoc) {
2164	int32_t nop = buildOp(URX_NOP, `0`);
2165	fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2166	}
2167	}
2168	return theLoc;
2169	}
2170
2171
2172
2173	//------------------------------------------------------------------------------
2174	//
2175	// handleCloseParen When compiling a close paren, we need to go back
2176	// and fix up any JMP or SAVE operations within the
2177	// parenthesized block that need to target the end
2178	// of the block. The locations of these are kept on
2179	// the paretheses stack.
2180	//
2181	// This function is called both when encountering a
2182	// real ) and at the end of the pattern.
2183	//
2184	//------------------------------------------------------------------------------
2185	void RegexCompile::handleCloseParen() {
2186	int32_t patIdx;
2187	int32_t patOp;
2188	if (fParenStack.size() <= `0`) {
2189	error(U_REGEX_MISMATCHED_PAREN);
2190	return;
2191	}
2192
2193	// Emit code for any pending literals.
2194	fixLiterals(FALSE);
2195
2196	// Fixup any operations within the just-closed parenthesized group
2197	// that need to reference the end of the (block).
2198	// (The first one popped from the stack is an unused slot for
2199	// alternation (OR) state save, but applying the fixup to it does no harm.)
2200	for (;;) {
2201	patIdx = fParenStack.popi();
2202	if (patIdx < `0`) {
2203	// value < 0 flags the start of the frame on the paren stack.
2204	break;
2205	}
2206	U_ASSERT(patIdx>`0` && patIdx <= fRXPat->fCompiledPat->size());
2207	patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2208	U_ASSERT(URX_VAL(patOp) == `0`); // Branch target for JMP should not be set.
2209	patOp \|= fRXPat->fCompiledPat->size(); // Set it now.
2210	fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2211	fMatchOpenParen = patIdx;
2212	}
2213
2214	// At the close of any parenthesized block, restore the match mode flags to
2215	// the value they had at the open paren. Saved value is
2216	// at the top of the paren stack.
2217	fModeFlags = fParenStack.popi();
2218	U_ASSERT(fModeFlags < `0`);
2219
2220	// DO any additional fixups, depending on the specific kind of
2221	// parentesized grouping this is
2222
2223	switch (patIdx) {
2224	case plain:
2225	case flags:
2226	// No additional fixups required.
2227	// (Grouping-only parentheses)
2228	break;
2229	case capturing:
2230	// Capturing Parentheses.
2231	// Insert a End Capture op into the pattern.
2232	// The frame offset of the variables for this cg is obtained from the
2233	// start capture op and put it into the end-capture op.
2234	{
2235	int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+`1`);
2236	U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2237
2238	int32_t frameVarLocation = URX_VAL(captureOp);
2239	appendOp(URX_END_CAPTURE, frameVarLocation);
2240	}
2241	break;
2242	case atomic:
2243	// Atomic Parenthesis.
2244	// Insert a LD_SP operation to restore the state stack to the position
2245	// it was when the atomic parens were entered.
2246	{
2247	int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+`1`);
2248	U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2249	int32_t stoLoc = URX_VAL(stoOp);
2250	appendOp(URX_LD_SP, stoLoc);
2251	}
2252	break;
2253
2254	case lookAhead:
2255	{
2256	int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-`5`);
2257	U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2258	int32_t dataLoc = URX_VAL(startOp);
2259	appendOp(URX_LA_END, dataLoc);
2260	}
2261	break;
2262
2263	case negLookAhead:
2264	{
2265	// See comment at doOpenLookAheadNeg
2266	int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-`1`);
2267	U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2268	int32_t dataLoc = URX_VAL(startOp);
2269	appendOp(URX_LA_END, dataLoc);
2270	appendOp(URX_BACKTRACK, `0`);
2271	appendOp(URX_LA_END, dataLoc);
2272
2273	// Patch the URX_SAVE near the top of the block.
2274	// The destination of the SAVE is the final LA_END that was just added.
2275	int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2276	U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2277	int32_t dest = fRXPat->fCompiledPat->size()-`1`;
2278	saveOp = buildOp(URX_STATE_SAVE, dest);
2279	fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2280	}
2281	break;
2282
2283	case lookBehind:
2284	{
2285	// See comment at doOpenLookBehind.
2286
2287	// Append the URX_LB_END and URX_LA_END to the compiled pattern.
2288	int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-`4`);
2289	U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2290	int32_t dataLoc = URX_VAL(startOp);
2291	appendOp(URX_LB_END, dataLoc);
2292	appendOp(URX_LA_END, dataLoc);
2293
2294	// Determine the min and max bounds for the length of the
2295	// string that the pattern can match.
2296	// An unbounded upper limit is an error.
2297	int32_t patEnd = fRXPat->fCompiledPat->size() - `1`;
2298	int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2299	int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2300	if (URX_TYPE(maxML) != `0`) {
2301	error(U_REGEX_LOOK_BEHIND_LIMIT);
2302	break;
2303	}
2304	if (maxML == INT32_MAX) {
2305	error(U_REGEX_LOOK_BEHIND_LIMIT);
2306	break;
2307	}
2308	if (minML == INT32_MAX) {
2309	// This condition happens when no match is possible, such as with a
2310	// [set] expression containing no elements.
2311	// In principle, the generated code to evaluate the expression could be deleted,
2312	// but it's probably not worth the complication.
2313	minML = `0`;
2314	}
2315	U_ASSERT(minML <= maxML);
2316
2317	// Insert the min and max match len bounds into the URX_LB_CONT op that
2318	// appears at the top of the look-behind block, at location fMatchOpenParen+1
2319	fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-`2`);
2320	fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-`1`);
2321
2322	}
2323	break;
2324
2325
2326
2327	case lookBehindN:
2328	{
2329	// See comment at doOpenLookBehindNeg.
2330
2331	// Append the URX_LBN_END to the compiled pattern.
2332	int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-`5`);
2333	U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2334	int32_t dataLoc = URX_VAL(startOp);
2335	appendOp(URX_LBN_END, dataLoc);
2336
2337	// Determine the min and max bounds for the length of the
2338	// string that the pattern can match.
2339	// An unbounded upper limit is an error.
2340	int32_t patEnd = fRXPat->fCompiledPat->size() - `1`;
2341	int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2342	int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2343	if (URX_TYPE(maxML) != `0`) {
2344	error(U_REGEX_LOOK_BEHIND_LIMIT);
2345	break;
2346	}
2347	if (maxML == INT32_MAX) {
2348	error(U_REGEX_LOOK_BEHIND_LIMIT);
2349	break;
2350	}
2351	if (minML == INT32_MAX) {
2352	// This condition happens when no match is possible, such as with a
2353	// [set] expression containing no elements.
2354	// In principle, the generated code to evaluate the expression could be deleted,
2355	// but it's probably not worth the complication.
2356	minML = `0`;
2357	}
2358
2359	U_ASSERT(minML <= maxML);
2360
2361	// Insert the min and max match len bounds into the URX_LB_CONT op that
2362	// appears at the top of the look-behind block, at location fMatchOpenParen+1
2363	fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-`3`);
2364	fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-`2`);
2365
2366	// Insert the pattern location to continue at after a successful match
2367	// as the last operand of the URX_LBN_CONT
2368	int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2369	fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-`1`);
2370	}
2371	break;
2372
2373
2374
2375	default:
2376	UPRV_UNREACHABLE;
2377	}
2378
2379	// remember the next location in the compiled pattern.
2380	// The compilation of Quantifiers will look at this to see whether its looping
2381	// over a parenthesized block or a single item
2382	fMatchCloseParen = fRXPat->fCompiledPat->size();
2383	}
2384
2385
2386
2387	//------------------------------------------------------------------------------
2388	//
2389	// compileSet Compile the pattern operations for a reference to a
2390	// UnicodeSet.
2391	//
2392	//------------------------------------------------------------------------------
2393	void RegexCompile::compileSet(UnicodeSet *theSet)
2394	{
2395	if (theSet == NULL) {
2396	return;
2397	}
2398	// Remove any strings from the set.
2399	// There shoudn't be any, but just in case.
2400	// (Case Closure can add them; if we had a simple case closure avaialble that
2401	// ignored strings, that would be better.)
2402	theSet->removeAllStrings();
2403	int32_t setSize = theSet->size();
2404
2405	switch (setSize) {
2406	case `0`:
2407	{
2408	// Set of no elements. Always fails to match.
2409	appendOp(URX_BACKTRACK, `0`);
2410	delete theSet;
2411	}
2412	break;
2413
2414	case `1`:
2415	{
2416	// The set contains only a single code point. Put it into
2417	// the compiled pattern as a single char operation rather
2418	// than a set, and discard the set itself.
2419	literalChar(theSet->charAt(`0`));
2420	delete theSet;
2421	}
2422	break;
2423
2424	default:
2425	{
2426	// The set contains two or more chars. (the normal case)
2427	// Put it into the compiled pattern as a set.
2428	theSet->freeze();
2429	int32_t setNumber = fRXPat->fSets->size();
2430	fRXPat->fSets->addElement(theSet, *fStatus);
2431	appendOp(URX_SETREF, setNumber);
2432	}
2433	}
2434	}
2435
2436
2437	//------------------------------------------------------------------------------
2438	//
2439	// compileInterval Generate the code for a {min, max} style interval quantifier.
2440	// Except for the specific opcodes used, the code is the same
2441	// for all three types (greedy, non-greedy, possessive) of
2442	// intervals. The opcodes are supplied as parameters.
2443	// (There are two sets of opcodes - greedy & possessive use the
2444	// same ones, while non-greedy has it's own.)
2445	//
2446	// The code for interval loops has this form:
2447	// 0 CTR_INIT counter loc (in stack frame)
2448	// 1 5 patt address of CTR_LOOP at bottom of block
2449	// 2 min count
2450	// 3 max count (-1 for unbounded)
2451	// 4 ... block to be iterated over
2452	// 5 CTR_LOOP
2453	//
2454	// In
2455	//------------------------------------------------------------------------------
2456	void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
2457	{
2458	// The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2459	// four slots in the compiled code. Reserve them.
2460	int32_t topOfBlock = blockTopLoc(TRUE);
2461	insertOp(topOfBlock);
2462	insertOp(topOfBlock);
2463	insertOp(topOfBlock);
2464
2465	// The operands for the CTR_INIT opcode include the index in the matcher data
2466	// of the counter. Allocate it now. There are two data items
2467	// counterLoc --> Loop counter
2468	// +1 --> Input index (for breaking non-progressing loops)
2469	// (Only present if unbounded upper limit on loop)
2470	int32_t dataSize = fIntervalUpper < `0` ? `2` : `1`;
2471	int32_t counterLoc = allocateStackData(dataSize);
2472
2473	int32_t op = buildOp(InitOp, counterLoc);
2474	fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2475
2476	// The second operand of CTR_INIT is the location following the end of the loop.
2477	// Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2478	// compilation of something later on causes the code to grow and the target
2479	// position to move.
2480	int32_t loopEnd = fRXPat->fCompiledPat->size();
2481	op = buildOp(URX_RELOC_OPRND, loopEnd);
2482	fRXPat->fCompiledPat->setElementAt(op, topOfBlock+`1`);
2483
2484	// Followed by the min and max counts.
2485	fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+`2`);
2486	fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+`3`);
2487
2488	// Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
2489	// Goes at end of the block being looped over, so just append to the code so far.
2490	appendOp(LoopOp, topOfBlock);
2491
2492	if ((fIntervalLow & `0xff000000`) != `0` \|\|
2493	(fIntervalUpper > `0` && (fIntervalUpper & `0xff000000`) != `0`)) {
2494	error(U_REGEX_NUMBER_TOO_BIG);
2495	}
2496
2497	if (fIntervalLow > fIntervalUpper && fIntervalUpper != -`1`) {
2498	error(U_REGEX_MAX_LT_MIN);
2499	}
2500	}
2501
2502
2503
2504	UBool RegexCompile::compileInlineInterval() {
2505	if (fIntervalUpper > `10` \|\| fIntervalUpper < fIntervalLow) {
2506	// Too big to inline. Fail, which will cause looping code to be generated.
2507	// (Upper < Lower picks up unbounded upper and errors, both.)
2508	return FALSE;
2509	}
2510
2511	int32_t topOfBlock = blockTopLoc(FALSE);
2512	if (fIntervalUpper == `0`) {
2513	// Pathological case. Attempt no matches, as if the block doesn't exist.
2514	// Discard the generated code for the block.
2515	// If the block included parens, discard the info pertaining to them as well.
2516	fRXPat->fCompiledPat->setSize(topOfBlock);
2517	if (fMatchOpenParen >= topOfBlock) {
2518	fMatchOpenParen = -`1`;
2519	}
2520	if (fMatchCloseParen >= topOfBlock) {
2521	fMatchCloseParen = -`1`;
2522	}
2523	return TRUE;
2524	}
2525
2526	if (topOfBlock != fRXPat->fCompiledPat->size()-`1` && fIntervalUpper != `1`) {
2527	// The thing being repeated is not a single op, but some
2528	// more complex block. Do it as a loop, not inlines.
2529	// Note that things "repeated" a max of once are handled as inline, because
2530	// the one copy of the code already generated is just fine.
2531	return FALSE;
2532	}
2533
2534	// Pick up the opcode that is to be repeated
2535	//
2536	int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2537
2538	// Compute the pattern location where the inline sequence
2539	// will end, and set up the state save op that will be needed.
2540	//
2541	int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-`1`
2542	+ fIntervalUpper + (fIntervalUpper-fIntervalLow);
2543	int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2544	if (fIntervalLow == `0`) {
2545	insertOp(topOfBlock);
2546	fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2547	}
2548
2549
2550
2551	// Loop, emitting the op for the thing being repeated each time.
2552	// Loop starts at 1 because one instance of the op already exists in the pattern,
2553	// it was put there when it was originally encountered.
2554	int32_t i;
2555	for (i=`1`; i<fIntervalUpper; i++ ) {
2556	if (i >= fIntervalLow) {
2557	appendOp(saveOp);
2558	}
2559	appendOp(op);
2560	}
2561	return TRUE;
2562	}
2563
2564
2565
2566	//------------------------------------------------------------------------------
2567	//
2568	// caseInsensitiveStart given a single code point from a pattern string, determine the
2569	// set of characters that could potentially begin a case-insensitive
2570	// match of a string beginning with that character, using full Unicode
2571	// case insensitive matching.
2572	//
2573	// This is used in optimizing find().
2574	//
2575	// closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2576	// misses cases like this:
2577	// A string from the pattern begins with 'ss' (although all we know
2578	// in this context is that it begins with 's')
2579	// The pattern could match a string beginning with a German sharp-s
2580	//
2581	// To the ordinary case closure for a character c, we add all other
2582	// characters cx where the case closure of cx incudes a string form that begins
2583	// with the original character c.
2584	//
2585	// This function could be made smarter. The full pattern string is available
2586	// and it would be possible to verify that the extra characters being added
2587	// to the starting set fully match, rather than having just a first-char of the
2588	// folded form match.
2589	//
2590	//------------------------------------------------------------------------------
2591	void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2592
2593	// Machine Generated below.
2594	// It may need updating with new versions of Unicode.
2595	// Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2596	// The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2597
2598	// Machine Generated Data. Do not hand edit.
2599	static const UChar32 RECaseFixCodePoints[] = {
2600	`0x61`, `0x66`, `0x68`, `0x69`, `0x6a`, `0x73`, `0x74`, `0x77`, `0x79`, `0x2bc`,
2601	`0x3ac`, `0x3ae`, `0x3b1`, `0x3b7`, `0x3b9`, `0x3c1`, `0x3c5`, `0x3c9`, `0x3ce`, `0x565`,
2602	`0x574`, `0x57e`, `0x1f00`, `0x1f01`, `0x1f02`, `0x1f03`, `0x1f04`, `0x1f05`, `0x1f06`, `0x1f07`,
2603	`0x1f20`, `0x1f21`, `0x1f22`, `0x1f23`, `0x1f24`, `0x1f25`, `0x1f26`, `0x1f27`, `0x1f60`, `0x1f61`,
2604	`0x1f62`, `0x1f63`, `0x1f64`, `0x1f65`, `0x1f66`, `0x1f67`, `0x1f70`, `0x1f74`, `0x1f7c`, `0x110000`};
2605
2606	static const int16_t RECaseFixStringOffsets[] = {
2607	`0x0`, `0x1`, `0x6`, `0x7`, `0x8`, `0x9`, `0xd`, `0xe`, `0xf`, `0x10`,
2608	`0x11`, `0x12`, `0x13`, `0x17`, `0x1b`, `0x20`, `0x21`, `0x2a`, `0x2e`, `0x2f`,
2609	`0x30`, `0x34`, `0x35`, `0x37`, `0x39`, `0x3b`, `0x3d`, `0x3f`, `0x41`, `0x43`,
2610	`0x45`, `0x47`, `0x49`, `0x4b`, `0x4d`, `0x4f`, `0x51`, `0x53`, `0x55`, `0x57`,
2611	`0x59`, `0x5b`, `0x5d`, `0x5f`, `0x61`, `0x63`, `0x65`, `0x66`, `0x67`, `0`};
2612
2613	static const int16_t RECaseFixCounts[] = {
2614	`0x1`, `0x5`, `0x1`, `0x1`, `0x1`, `0x4`, `0x1`, `0x1`, `0x1`, `0x1`,
2615	`0x1`, `0x1`, `0x4`, `0x4`, `0x5`, `0x1`, `0x9`, `0x4`, `0x1`, `0x1`,
2616	`0x4`, `0x1`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`,
2617	`0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`,
2618	`0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x1`, `0x1`, `0x1`, `0`};
2619
2620	static const UChar RECaseFixData[] = {
2621	`0x1e9a`, `0xfb00`, `0xfb01`, `0xfb02`, `0xfb03`, `0xfb04`, `0x1e96`, `0x130`, `0x1f0`, `0xdf`,
2622	`0x1e9e`, `0xfb05`, `0xfb06`, `0x1e97`, `0x1e98`, `0x1e99`, `0x149`, `0x1fb4`, `0x1fc4`, `0x1fb3`,
2623	`0x1fb6`, `0x1fb7`, `0x1fbc`, `0x1fc3`, `0x1fc6`, `0x1fc7`, `0x1fcc`, `0x390`, `0x1fd2`, `0x1fd3`,
2624	`0x1fd6`, `0x1fd7`, `0x1fe4`, `0x3b0`, `0x1f50`, `0x1f52`, `0x1f54`, `0x1f56`, `0x1fe2`, `0x1fe3`,
2625	`0x1fe6`, `0x1fe7`, `0x1ff3`, `0x1ff6`, `0x1ff7`, `0x1ffc`, `0x1ff4`, `0x587`, `0xfb13`, `0xfb14`,
2626	`0xfb15`, `0xfb17`, `0xfb16`, `0x1f80`, `0x1f88`, `0x1f81`, `0x1f89`, `0x1f82`, `0x1f8a`, `0x1f83`,
2627	`0x1f8b`, `0x1f84`, `0x1f8c`, `0x1f85`, `0x1f8d`, `0x1f86`, `0x1f8e`, `0x1f87`, `0x1f8f`, `0x1f90`,
2628	`0x1f98`, `0x1f91`, `0x1f99`, `0x1f92`, `0x1f9a`, `0x1f93`, `0x1f9b`, `0x1f94`, `0x1f9c`, `0x1f95`,
2629	`0x1f9d`, `0x1f96`, `0x1f9e`, `0x1f97`, `0x1f9f`, `0x1fa0`, `0x1fa8`, `0x1fa1`, `0x1fa9`, `0x1fa2`,
2630	`0x1faa`, `0x1fa3`, `0x1fab`, `0x1fa4`, `0x1fac`, `0x1fa5`, `0x1fad`, `0x1fa6`, `0x1fae`, `0x1fa7`,
2631	`0x1faf`, `0x1fb2`, `0x1fc2`, `0x1ff2`, `0`};
2632
2633	// End of machine generated data.
2634
2635	if (c < UCHAR_MIN_VALUE \|\| c > UCHAR_MAX_VALUE) {
2636	// This function should never be called with an invalid input character.
2637	UPRV_UNREACHABLE;
2638	} else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2639	UChar32 caseFoldedC = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2640	starterChars->set(caseFoldedC, caseFoldedC);
2641
2642	int32_t i;
2643	for (i=`0`; RECaseFixCodePoints[i]<c ; i++) {
2644	// Simple linear search through the sorted list of interesting code points.
2645	}
2646
2647	if (RECaseFixCodePoints[i] == c) {
2648	int32_t dataIndex = RECaseFixStringOffsets[i];
2649	int32_t numCharsToAdd = RECaseFixCounts[i];
2650	UChar32 cpToAdd = `0`;
2651	for (int32_t j=`0`; j<numCharsToAdd; j++) {
2652	U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2653	starterChars->add(cpToAdd);
2654	}
2655	}
2656
2657	starterChars->closeOver(USET_CASE_INSENSITIVE);
2658	starterChars->removeAllStrings();
2659	} else {
2660	// Not a cased character. Just return it alone.
2661	starterChars->set(c, c);
2662	}
2663	}
2664
2665
2666	// Increment with overflow check.
2667	// val and delta will both be positive.
2668
2669	static int32_t safeIncrement(int32_t val, int32_t delta) {
2670	if (INT32_MAX - val > delta) {
2671	return val + delta;
2672	} else {
2673	return INT32_MAX;
2674	}
2675	}
2676
2677
2678	//------------------------------------------------------------------------------
2679	//
2680	// matchStartType Determine how a match can start.
2681	// Used to optimize find() operations.
2682	//
2683	// Operation is very similar to minMatchLength(). Walk the compiled
2684	// pattern, keeping an on-going minimum-match-length. For any
2685	// op where the min match coming in is zero, add that ops possible
2686	// starting matches to the possible starts for the overall pattern.
2687	//
2688	//------------------------------------------------------------------------------
2689	void RegexCompile::matchStartType() {
2690	if (U_FAILURE(*fStatus)) {
2691	return;
2692	}
2693
2694
2695	int32_t loc; // Location in the pattern of the current op being processed.
2696	int32_t op; // The op being processed
2697	int32_t opType; // The opcode type of the op
2698	int32_t currentLen = `0`; // Minimum length of a match to this point (loc) in the pattern
2699	int32_t numInitialStrings = `0`; // Number of strings encountered that could match at start.
2700
2701	UBool atStart = TRUE; // True if no part of the pattern yet encountered
2702	// could have advanced the position in a match.
2703	// (Maximum match length so far == 0)
2704
2705	// forwardedLength is a vector holding minimum-match-length values that
2706	// are propagated forward in the pattern by JMP or STATE_SAVE operations.
2707	// It must be one longer than the pattern being checked because some ops
2708	// will jmp to a end-of-block+1 location from within a block, and we must
2709	// count those when checking the block.
2710	int32_t end = fRXPat->fCompiledPat->size();
2711	UVector32 forwardedLength(end+`1`, *fStatus);
2712	forwardedLength.setSize(end+`1`);
2713	for (loc=`3`; loc<end; loc++) {
2714	forwardedLength.setElementAt(INT32_MAX, loc);
2715	}
2716
2717	for (loc = `3`; loc<end; loc++) {
2718	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2719	opType = URX_TYPE(op);
2720
2721	// The loop is advancing linearly through the pattern.
2722	// If the op we are now at was the destination of a branch in the pattern,
2723	// and that path has a shorter minimum length than the current accumulated value,
2724	// replace the current accumulated value.
2725	if (forwardedLength.elementAti(loc) < currentLen) {
2726	currentLen = forwardedLength.elementAti(loc);
2727	U_ASSERT(currentLen>=`0` && currentLen < INT32_MAX);
2728	}
2729
2730	switch (opType) {
2731	// Ops that don't change the total length matched
2732	case URX_RESERVED_OP:
2733	case URX_END:
2734	case URX_FAIL:
2735	case URX_STRING_LEN:
2736	case URX_NOP:
2737	case URX_START_CAPTURE:
2738	case URX_END_CAPTURE:
2739	case URX_BACKSLASH_B:
2740	case URX_BACKSLASH_BU:
2741	case URX_BACKSLASH_G:
2742	case URX_BACKSLASH_Z:
2743	case URX_DOLLAR:
2744	case URX_DOLLAR_M:
2745	case URX_DOLLAR_D:
2746	case URX_DOLLAR_MD:
2747	case URX_RELOC_OPRND:
2748	case URX_STO_INP_LOC:
2749	case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2750	case URX_BACKREF_I:
2751
2752	case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2753	case URX_LD_SP:
2754	break;
2755
2756	case URX_CARET:
2757	if (atStart) {
2758	fRXPat->fStartType = START_START;
2759	}
2760	break;
2761
2762	case URX_CARET_M:
2763	case URX_CARET_M_UNIX:
2764	if (atStart) {
2765	fRXPat->fStartType = START_LINE;
2766	}
2767	break;
2768
2769	case URX_ONECHAR:
2770	if (currentLen == `0`) {
2771	// This character could appear at the start of a match.
2772	// Add it to the set of possible starting characters.
2773	fRXPat->fInitialChars->add(URX_VAL(op));
2774	numInitialStrings += `2`;
2775	}
2776	currentLen = safeIncrement(currentLen, `1`);
2777	atStart = FALSE;
2778	break;
2779
2780
2781	case URX_SETREF:
2782	if (currentLen == `0`) {
2783	int32_t sn = URX_VAL(op);
2784	U_ASSERT(sn > `0` && sn < fRXPat->fSets->size());
2785	const UnicodeSet s = (UnicodeSet )fRXPat->fSets->elementAt(sn);
2786	fRXPat->fInitialChars->addAll(*s);
2787	numInitialStrings += `2`;
2788	}
2789	currentLen = safeIncrement(currentLen, `1`);
2790	atStart = FALSE;
2791	break;
2792
2793	case URX_LOOP_SR_I:
2794	// [Set], like a SETREF, above, in what it can match,*
2795	// but may not match at all, so currentLen is not incremented.
2796	if (currentLen == `0`) {
2797	int32_t sn = URX_VAL(op);
2798	U_ASSERT(sn > `0` && sn < fRXPat->fSets->size());
2799	const UnicodeSet s = (UnicodeSet )fRXPat->fSets->elementAt(sn);
2800	fRXPat->fInitialChars->addAll(*s);
2801	numInitialStrings += `2`;
2802	}
2803	atStart = FALSE;
2804	break;
2805
2806	case URX_LOOP_DOT_I:
2807	if (currentLen == `0`) {
2808	// . at the start of a pattern.*
2809	// Any character can begin the match.
2810	fRXPat->fInitialChars->clear();
2811	fRXPat->fInitialChars->complement();
2812	numInitialStrings += `2`;
2813	}
2814	atStart = FALSE;
2815	break;
2816
2817
2818	case URX_STATIC_SETREF:
2819	if (currentLen == `0`) {
2820	int32_t sn = URX_VAL(op);
2821	U_ASSERT(sn>`0` && sn<URX_LAST_SET);
2822	const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[sn];
2823	fRXPat->fInitialChars->addAll(s);
2824	numInitialStrings += `2`;
2825	}
2826	currentLen = safeIncrement(currentLen, `1`);
2827	atStart = FALSE;
2828	break;
2829
2830
2831
2832	case URX_STAT_SETREF_N:
2833	if (currentLen == `0`) {
2834	int32_t sn = URX_VAL(op);
2835	UnicodeSet sc;
2836	sc.addAll(RegexStaticSets::gStaticSets->fPropSets[sn]).complement();
2837	fRXPat->fInitialChars->addAll(sc);
2838	numInitialStrings += `2`;
2839	}
2840	currentLen = safeIncrement(currentLen, `1`);
2841	atStart = FALSE;
2842	break;
2843
2844
2845
2846	case URX_BACKSLASH_D:
2847	// Digit Char
2848	if (currentLen == `0`) {
2849	UnicodeSet s;
2850	s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2851	if (URX_VAL(op) != `0`) {
2852	s.complement();
2853	}
2854	fRXPat->fInitialChars->addAll(s);
2855	numInitialStrings += `2`;
2856	}
2857	currentLen = safeIncrement(currentLen, `1`);
2858	atStart = FALSE;
2859	break;
2860
2861
2862	case URX_BACKSLASH_H:
2863	// Horiz white space
2864	if (currentLen == `0`) {
2865	UnicodeSet s;
2866	s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2867	s.add((UChar32)`9`); // Tab
2868	if (URX_VAL(op) != `0`) {
2869	s.complement();
2870	}
2871	fRXPat->fInitialChars->addAll(s);
2872	numInitialStrings += `2`;
2873	}
2874	currentLen = safeIncrement(currentLen, `1`);
2875	atStart = FALSE;
2876	break;
2877
2878
2879	case URX_BACKSLASH_R: // Any line ending sequence
2880	case URX_BACKSLASH_V: // Any line ending code point, with optional negation
2881	if (currentLen == `0`) {
2882	UnicodeSet s;
2883	s.add((UChar32)`0x0a`, (UChar32)`0x0d`); // add range
2884	s.add((UChar32)`0x85`);
2885	s.add((UChar32)`0x2028`, (UChar32)`0x2029`);
2886	if (URX_VAL(op) != `0`) {
2887	// Complement option applies to URX_BACKSLASH_V only.
2888	s.complement();
2889	}
2890	fRXPat->fInitialChars->addAll(s);
2891	numInitialStrings += `2`;
2892	}
2893	currentLen = safeIncrement(currentLen, `1`);
2894	atStart = FALSE;
2895	break;
2896
2897
2898
2899	case URX_ONECHAR_I:
2900	// Case Insensitive Single Character.
2901	if (currentLen == `0`) {
2902	UChar32 c = URX_VAL(op);
2903	if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2904	UnicodeSet starters(c, c);
2905	starters.closeOver(USET_CASE_INSENSITIVE);
2906	// findCaseInsensitiveStarters(c, &starters);
2907	// For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2908	// The expanded folding can't match the pattern.
2909	fRXPat->fInitialChars->addAll(starters);
2910	} else {
2911	// Char has no case variants. Just add it as-is to the
2912	// set of possible starting chars.
2913	fRXPat->fInitialChars->add(c);
2914	}
2915	numInitialStrings += `2`;
2916	}
2917	currentLen = safeIncrement(currentLen, `1`);
2918	atStart = FALSE;
2919	break;
2920
2921
2922	case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
2923	case URX_DOTANY_ALL: // . matches one or two.
2924	case URX_DOTANY:
2925	case URX_DOTANY_UNIX:
2926	if (currentLen == `0`) {
2927	// These constructs are all bad news when they appear at the start
2928	// of a match. Any character can begin the match.
2929	fRXPat->fInitialChars->clear();
2930	fRXPat->fInitialChars->complement();
2931	numInitialStrings += `2`;
2932	}
2933	currentLen = safeIncrement(currentLen, `1`);
2934	atStart = FALSE;
2935	break;
2936
2937
2938	case URX_JMPX:
2939	loc++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2940	U_FALLTHROUGH;
2941	case URX_JMP:
2942	{
2943	int32_t jmpDest = URX_VAL(op);
2944	if (jmpDest < loc) {
2945	// Loop of some kind. Can safely ignore, the worst that will happen
2946	// is that we understate the true minimum length
2947	currentLen = forwardedLength.elementAti(loc+`1`);
2948
2949	} else {
2950	// Forward jump. Propagate the current min length to the target loc of the jump.
2951	U_ASSERT(jmpDest <= end+`1`);
2952	if (forwardedLength.elementAti(jmpDest) > currentLen) {
2953	forwardedLength.setElementAt(currentLen, jmpDest);
2954	}
2955	}
2956	}
2957	atStart = FALSE;
2958	break;
2959
2960	case URX_JMP_SAV:
2961	case URX_JMP_SAV_X:
2962	// Combo of state save to the next loc, + jmp backwards.
2963	// Net effect on min. length computation is nothing.
2964	atStart = FALSE;
2965	break;
2966
2967	case URX_BACKTRACK:
2968	// Fails are kind of like a branch, except that the min length was
2969	// propagated already, by the state save.
2970	currentLen = forwardedLength.elementAti(loc+`1`);
2971	atStart = FALSE;
2972	break;
2973
2974
2975	case URX_STATE_SAVE:
2976	{
2977	// State Save, for forward jumps, propagate the current minimum.
2978	// of the state save.
2979	int32_t jmpDest = URX_VAL(op);
2980	if (jmpDest > loc) {
2981	if (currentLen < forwardedLength.elementAti(jmpDest)) {
2982	forwardedLength.setElementAt(currentLen, jmpDest);
2983	}
2984	}
2985	}
2986	atStart = FALSE;
2987	break;
2988
2989
2990
2991
2992	case URX_STRING:
2993	{
2994	loc++;
2995	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2996	int32_t stringLen = URX_VAL(stringLenOp);
2997	U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2998	U_ASSERT(stringLenOp >= `2`);
2999	if (currentLen == `0`) {
3000	// Add the starting character of this string to the set of possible starting
3001	// characters for this pattern.
3002	int32_t stringStartIdx = URX_VAL(op);
3003	UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
3004	fRXPat->fInitialChars->add(c);
3005
3006	// Remember this string. After the entire pattern has been checked,
3007	// if nothing else is identified that can start a match, we'll use it.
3008	numInitialStrings++;
3009	fRXPat->fInitialStringIdx = stringStartIdx;
3010	fRXPat->fInitialStringLen = stringLen;
3011	}
3012
3013	currentLen = safeIncrement(currentLen, stringLen);
3014	atStart = FALSE;
3015	}
3016	break;
3017
3018	case URX_STRING_I:
3019	{
3020	// Case-insensitive string. Unlike exact-match strings, we won't
3021	// attempt a string search for possible match positions. But we
3022	// do update the set of possible starting characters.
3023	loc++;
3024	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3025	int32_t stringLen = URX_VAL(stringLenOp);
3026	U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3027	U_ASSERT(stringLenOp >= `2`);
3028	if (currentLen == `0`) {
3029	// Add the starting character of this string to the set of possible starting
3030	// characters for this pattern.
3031	int32_t stringStartIdx = URX_VAL(op);
3032	UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
3033	UnicodeSet s;
3034	findCaseInsensitiveStarters(c, &s);
3035	fRXPat->fInitialChars->addAll(s);
3036	numInitialStrings += `2`; // Matching on an initial string not possible.
3037	}
3038	currentLen = safeIncrement(currentLen, stringLen);
3039	atStart = FALSE;
3040	}
3041	break;
3042
3043	case URX_CTR_INIT:
3044	case URX_CTR_INIT_NG:
3045	{
3046	// Loop Init Ops. These don't change the min length, but they are 4 word ops
3047	// so location must be updated accordingly.
3048	// Loop Init Ops.
3049	// If the min loop count == 0
3050	// move loc forwards to the end of the loop, skipping over the body.
3051	// If the min count is > 0,
3052	// continue normal processing of the body of the loop.
3053	int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+`1`);
3054	loopEndLoc = URX_VAL(loopEndLoc);
3055	int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+`2`);
3056	if (minLoopCount == `0`) {
3057	// Min Loop Count of 0, treat like a forward branch and
3058	// move the current minimum length up to the target
3059	// (end of loop) location.
3060	U_ASSERT(loopEndLoc <= end+`1`);
3061	if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3062	forwardedLength.setElementAt(currentLen, loopEndLoc);
3063	}
3064	}
3065	loc+=`3`; // Skips over operands of CTR_INIT
3066	}
3067	atStart = FALSE;
3068	break;
3069
3070
3071	case URX_CTR_LOOP:
3072	case URX_CTR_LOOP_NG:
3073	// Loop ops.
3074	// The jump is conditional, backwards only.
3075	atStart = FALSE;
3076	break;
3077
3078	case URX_LOOP_C:
3079	// More loop ops. These state-save to themselves.
3080	// don't change the minimum match
3081	atStart = FALSE;
3082	break;
3083
3084
3085	case URX_LA_START:
3086	case URX_LB_START:
3087	{
3088	// Look-around. Scan forward until the matching look-ahead end,
3089	// without processing the look-around block. This is overly pessimistic.
3090
3091	// Keep track of the nesting depth of look-around blocks. Boilerplate code for
3092	// lookahead contains two LA_END instructions, so count goes up by two
3093	// for each LA_START.
3094	int32_t depth = (opType == URX_LA_START? `2`: `1`);
3095	for (;;) {
3096	loc++;
3097	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3098	if (URX_TYPE(op) == URX_LA_START) {
3099	depth+=`2`;
3100	}
3101	if (URX_TYPE(op) == URX_LB_START) {
3102	depth++;
3103	}
3104	if (URX_TYPE(op) == URX_LA_END \|\| URX_TYPE(op)==URX_LBN_END) {
3105	depth--;
3106	if (depth == `0`) {
3107	break;
3108	}
3109	}
3110	if (URX_TYPE(op) == URX_STATE_SAVE) {
3111	// Need this because neg lookahead blocks will FAIL to outside
3112	// of the block.
3113	int32_t jmpDest = URX_VAL(op);
3114	if (jmpDest > loc) {
3115	if (currentLen < forwardedLength.elementAti(jmpDest)) {
3116	forwardedLength.setElementAt(currentLen, jmpDest);
3117	}
3118	}
3119	}
3120	U_ASSERT(loc <= end);
3121	}
3122	}
3123	break;
3124
3125	case URX_LA_END:
3126	case URX_LB_CONT:
3127	case URX_LB_END:
3128	case URX_LBN_CONT:
3129	case URX_LBN_END:
3130	UPRV_UNREACHABLE; // Shouldn't get here. These ops should be
3131	// consumed by the scan in URX_LA_START and LB_START
3132	default:
3133	UPRV_UNREACHABLE;
3134	}
3135
3136	}
3137
3138
3139	// We have finished walking through the ops. Check whether some forward jump
3140	// propagated a shorter length to location end+1.
3141	if (forwardedLength.elementAti(end+`1`) < currentLen) {
3142	currentLen = forwardedLength.elementAti(end+`1`);
3143	}
3144
3145
3146	fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3147
3148
3149	// Sort out what we should check for when looking for candidate match start positions.
3150	// In order of preference,
3151	// 1. Start of input text buffer.
3152	// 2. A literal string.
3153	// 3. Start of line in multi-line mode.
3154	// 4. A single literal character.
3155	// 5. A character from a set of characters.
3156	//
3157	if (fRXPat->fStartType == START_START) {
3158	// Match only at the start of an input text string.
3159	// start type is already set. We're done.
3160	} else if (numInitialStrings == `1` && fRXPat->fMinMatchLen > `0`) {
3161	// Match beginning only with a literal string.
3162	UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3163	U_ASSERT(fRXPat->fInitialChars->contains(c));
3164	fRXPat->fStartType = START_STRING;
3165	fRXPat->fInitialChar = c;
3166	} else if (fRXPat->fStartType == START_LINE) {
3167	// Match at start of line in Multi-Line mode.
3168	// Nothing to do here; everything is already set.
3169	} else if (fRXPat->fMinMatchLen == `0`) {
3170	// Zero length match possible. We could start anywhere.
3171	fRXPat->fStartType = START_NO_INFO;
3172	} else if (fRXPat->fInitialChars->size() == `1`) {
3173	// All matches begin with the same char.
3174	fRXPat->fStartType = START_CHAR;
3175	fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(`0`);
3176	U_ASSERT(fRXPat->fInitialChar != (UChar32)-`1`);
3177	} else if (fRXPat->fInitialChars->contains((UChar32)`0`, (UChar32)`0x10ffff`) == FALSE &&
3178	fRXPat->fMinMatchLen > `0`) {
3179	// Matches start with a set of character smaller than the set of all chars.
3180	fRXPat->fStartType = START_SET;
3181	} else {
3182	// Matches can start with anything
3183	fRXPat->fStartType = START_NO_INFO;
3184	}
3185
3186	return;
3187	}
3188
3189
3190
3191	//------------------------------------------------------------------------------
3192	//
3193	// minMatchLength Calculate the length of the shortest string that could
3194	// match the specified pattern.
3195	// Length is in 16 bit code units, not code points.
3196	//
3197	// The calculated length may not be exact. The returned
3198	// value may be shorter than the actual minimum; it must
3199	// never be longer.
3200	//
3201	// start and end are the range of p-code operations to be
3202	// examined. The endpoints are included in the range.
3203	//
3204	//------------------------------------------------------------------------------
3205	int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
3206	if (U_FAILURE(*fStatus)) {
3207	return `0`;
3208	}
3209
3210	U_ASSERT(start <= end);
3211	U_ASSERT(end < fRXPat->fCompiledPat->size());
3212
3213
3214	int32_t loc;
3215	int32_t op;
3216	int32_t opType;
3217	int32_t currentLen = `0`;
3218
3219
3220	// forwardedLength is a vector holding minimum-match-length values that
3221	// are propagated forward in the pattern by JMP or STATE_SAVE operations.
3222	// It must be one longer than the pattern being checked because some ops
3223	// will jmp to a end-of-block+1 location from within a block, and we must
3224	// count those when checking the block.
3225	UVector32 forwardedLength(end+`2`, *fStatus);
3226	forwardedLength.setSize(end+`2`);
3227	for (loc=start; loc<=end+`1`; loc++) {
3228	forwardedLength.setElementAt(INT32_MAX, loc);
3229	}
3230
3231	for (loc = start; loc<=end; loc++) {
3232	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3233	opType = URX_TYPE(op);
3234
3235	// The loop is advancing linearly through the pattern.
3236	// If the op we are now at was the destination of a branch in the pattern,
3237	// and that path has a shorter minimum length than the current accumulated value,
3238	// replace the current accumulated value.
3239	// U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
3240	// no-match-possible cases.
3241	if (forwardedLength.elementAti(loc) < currentLen) {
3242	currentLen = forwardedLength.elementAti(loc);
3243	U_ASSERT(currentLen>=`0` && currentLen < INT32_MAX);
3244	}
3245
3246	switch (opType) {
3247	// Ops that don't change the total length matched
3248	case URX_RESERVED_OP:
3249	case URX_END:
3250	case URX_STRING_LEN:
3251	case URX_NOP:
3252	case URX_START_CAPTURE:
3253	case URX_END_CAPTURE:
3254	case URX_BACKSLASH_B:
3255	case URX_BACKSLASH_BU:
3256	case URX_BACKSLASH_G:
3257	case URX_BACKSLASH_Z:
3258	case URX_CARET:
3259	case URX_DOLLAR:
3260	case URX_DOLLAR_M:
3261	case URX_DOLLAR_D:
3262	case URX_DOLLAR_MD:
3263	case URX_RELOC_OPRND:
3264	case URX_STO_INP_LOC:
3265	case URX_CARET_M:
3266	case URX_CARET_M_UNIX:
3267	case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3268	case URX_BACKREF_I:
3269
3270	case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3271	case URX_LD_SP:
3272
3273	case URX_JMP_SAV:
3274	case URX_JMP_SAV_X:
3275	break;
3276
3277
3278	// Ops that match a minimum of one character (one or two 16 bit code units.)
3279	//
3280	case URX_ONECHAR:
3281	case URX_STATIC_SETREF:
3282	case URX_STAT_SETREF_N:
3283	case URX_SETREF:
3284	case URX_BACKSLASH_D:
3285	case URX_BACKSLASH_H:
3286	case URX_BACKSLASH_R:
3287	case URX_BACKSLASH_V:
3288	case URX_ONECHAR_I:
3289	case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3290	case URX_DOTANY_ALL: // . matches one or two.
3291	case URX_DOTANY:
3292	case URX_DOTANY_UNIX:
3293	currentLen = safeIncrement(currentLen, `1`);
3294	break;
3295
3296
3297	case URX_JMPX:
3298	loc++; // URX_JMPX has an extra operand, ignored here,
3299	// otherwise processed identically to URX_JMP.
3300	U_FALLTHROUGH;
3301	case URX_JMP:
3302	{
3303	int32_t jmpDest = URX_VAL(op);
3304	if (jmpDest < loc) {
3305	// Loop of some kind. Can safely ignore, the worst that will happen
3306	// is that we understate the true minimum length
3307	currentLen = forwardedLength.elementAti(loc+`1`);
3308	} else {
3309	// Forward jump. Propagate the current min length to the target loc of the jump.
3310	U_ASSERT(jmpDest <= end+`1`);
3311	if (forwardedLength.elementAti(jmpDest) > currentLen) {
3312	forwardedLength.setElementAt(currentLen, jmpDest);
3313	}
3314	}
3315	}
3316	break;
3317
3318	case URX_BACKTRACK:
3319	{
3320	// Back-tracks are kind of like a branch, except that the min length was
3321	// propagated already, by the state save.
3322	currentLen = forwardedLength.elementAti(loc+`1`);
3323	}
3324	break;
3325
3326
3327	case URX_STATE_SAVE:
3328	{
3329	// State Save, for forward jumps, propagate the current minimum.
3330	// of the state save.
3331	int32_t jmpDest = URX_VAL(op);
3332	if (jmpDest > loc) {
3333	if (currentLen < forwardedLength.elementAti(jmpDest)) {
3334	forwardedLength.setElementAt(currentLen, jmpDest);
3335	}
3336	}
3337	}
3338	break;
3339
3340
3341	case URX_STRING:
3342	{
3343	loc++;
3344	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3345	currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3346	}
3347	break;
3348
3349
3350	case URX_STRING_I:
3351	{
3352	loc++;
3353	// TODO: with full case folding, matching input text may be shorter than
3354	// the string we have here. More smarts could put some bounds on it.
3355	// Assume a min length of one for now. A min length of zero causes
3356	// optimization failures for a pattern like "string"+
3357	// currentLen += URX_VAL(stringLenOp);
3358	currentLen = safeIncrement(currentLen, `1`);
3359	}
3360	break;
3361
3362	case URX_CTR_INIT:
3363	case URX_CTR_INIT_NG:
3364	{
3365	// Loop Init Ops.
3366	// If the min loop count == 0
3367	// move loc forwards to the end of the loop, skipping over the body.
3368	// If the min count is > 0,
3369	// continue normal processing of the body of the loop.
3370	int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+`1`);
3371	loopEndLoc = URX_VAL(loopEndLoc);
3372	int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+`2`);
3373	if (minLoopCount == `0`) {
3374	loc = loopEndLoc;
3375	} else {
3376	loc+=`3`; // Skips over operands of CTR_INIT
3377	}
3378	}
3379	break;
3380
3381
3382	case URX_CTR_LOOP:
3383	case URX_CTR_LOOP_NG:
3384	// Loop ops.
3385	// The jump is conditional, backwards only.
3386	break;
3387
3388	case URX_LOOP_SR_I:
3389	case URX_LOOP_DOT_I:
3390	case URX_LOOP_C:
3391	// More loop ops. These state-save to themselves.
3392	// don't change the minimum match - could match nothing at all.
3393	break;
3394
3395
3396	case URX_LA_START:
3397	case URX_LB_START:
3398	{
3399	// Look-around. Scan forward until the matching look-ahead end,
3400	// without processing the look-around block. This is overly pessimistic for look-ahead,
3401	// it assumes that the look-ahead match might be zero-length.
3402	// TODO: Positive lookahead could recursively do the block, then continue
3403	// with the longer of the block or the value coming in. Ticket 6060
3404	int32_t depth = (opType == URX_LA_START? `2`: `1`);
3405	for (;;) {
3406	loc++;
3407	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3408	if (URX_TYPE(op) == URX_LA_START) {
3409	// The boilerplate for look-ahead includes two LA_END insturctions,
3410	// Depth will be decremented by each one when it is seen.
3411	depth += `2`;
3412	}
3413	if (URX_TYPE(op) == URX_LB_START) {
3414	depth++;
3415	}
3416	if (URX_TYPE(op) == URX_LA_END) {
3417	depth--;
3418	if (depth == `0`) {
3419	break;
3420	}
3421	}
3422	if (URX_TYPE(op)==URX_LBN_END) {
3423	depth--;
3424	if (depth == `0`) {
3425	break;
3426	}
3427	}
3428	if (URX_TYPE(op) == URX_STATE_SAVE) {
3429	// Need this because neg lookahead blocks will FAIL to outside
3430	// of the block.
3431	int32_t jmpDest = URX_VAL(op);
3432	if (jmpDest > loc) {
3433	if (currentLen < forwardedLength.elementAti(jmpDest)) {
3434	forwardedLength.setElementAt(currentLen, jmpDest);
3435	}
3436	}
3437	}
3438	U_ASSERT(loc <= end);
3439	}
3440	}
3441	break;
3442
3443	case URX_LA_END:
3444	case URX_LB_CONT:
3445	case URX_LB_END:
3446	case URX_LBN_CONT:
3447	case URX_LBN_END:
3448	// Only come here if the matching URX_LA_START or URX_LB_START was not in the
3449	// range being sized, which happens when measuring size of look-behind blocks.
3450	break;
3451
3452	default:
3453	UPRV_UNREACHABLE;
3454	}
3455
3456	}
3457
3458	// We have finished walking through the ops. Check whether some forward jump
3459	// propagated a shorter length to location end+1.
3460	if (forwardedLength.elementAti(end+`1`) < currentLen) {
3461	currentLen = forwardedLength.elementAti(end+`1`);
3462	U_ASSERT(currentLen>=`0` && currentLen < INT32_MAX);
3463	}
3464
3465	return currentLen;
3466	}
3467
3468	//------------------------------------------------------------------------------
3469	//
3470	// maxMatchLength Calculate the length of the longest string that could
3471	// match the specified pattern.
3472	// Length is in 16 bit code units, not code points.
3473	//
3474	// The calculated length may not be exact. The returned
3475	// value may be longer than the actual maximum; it must
3476	// never be shorter.
3477	//
3478	//------------------------------------------------------------------------------
3479	int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3480	if (U_FAILURE(*fStatus)) {
3481	return `0`;
3482	}
3483	U_ASSERT(start <= end);
3484	U_ASSERT(end < fRXPat->fCompiledPat->size());
3485
3486	int32_t loc;
3487	int32_t op;
3488	int32_t opType;
3489	int32_t currentLen = `0`;
3490	UVector32 forwardedLength(end+`1`, *fStatus);
3491	forwardedLength.setSize(end+`1`);
3492
3493	for (loc=start; loc<=end; loc++) {
3494	forwardedLength.setElementAt(`0`, loc);
3495	}
3496
3497	for (loc = start; loc<=end; loc++) {
3498	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3499	opType = URX_TYPE(op);
3500
3501	// The loop is advancing linearly through the pattern.
3502	// If the op we are now at was the destination of a branch in the pattern,
3503	// and that path has a longer maximum length than the current accumulated value,
3504	// replace the current accumulated value.
3505	if (forwardedLength.elementAti(loc) > currentLen) {
3506	currentLen = forwardedLength.elementAti(loc);
3507	}
3508
3509	switch (opType) {
3510	// Ops that don't change the total length matched
3511	case URX_RESERVED_OP:
3512	case URX_END:
3513	case URX_STRING_LEN:
3514	case URX_NOP:
3515	case URX_START_CAPTURE:
3516	case URX_END_CAPTURE:
3517	case URX_BACKSLASH_B:
3518	case URX_BACKSLASH_BU:
3519	case URX_BACKSLASH_G:
3520	case URX_BACKSLASH_Z:
3521	case URX_CARET:
3522	case URX_DOLLAR:
3523	case URX_DOLLAR_M:
3524	case URX_DOLLAR_D:
3525	case URX_DOLLAR_MD:
3526	case URX_RELOC_OPRND:
3527	case URX_STO_INP_LOC:
3528	case URX_CARET_M:
3529	case URX_CARET_M_UNIX:
3530
3531	case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3532	case URX_LD_SP:
3533
3534	case URX_LB_END:
3535	case URX_LB_CONT:
3536	case URX_LBN_CONT:
3537	case URX_LBN_END:
3538	break;
3539
3540
3541	// Ops that increase that cause an unbounded increase in the length
3542	// of a matched string, or that increase it a hard to characterize way.
3543	// Call the max length unbounded, and stop further checking.
3544	case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3545	case URX_BACKREF_I:
3546	case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3547	currentLen = INT32_MAX;
3548	break;
3549
3550
3551	// Ops that match a max of one character (possibly two 16 bit code units.)
3552	//
3553	case URX_STATIC_SETREF:
3554	case URX_STAT_SETREF_N:
3555	case URX_SETREF:
3556	case URX_BACKSLASH_D:
3557	case URX_BACKSLASH_H:
3558	case URX_BACKSLASH_R:
3559	case URX_BACKSLASH_V:
3560	case URX_ONECHAR_I:
3561	case URX_DOTANY_ALL:
3562	case URX_DOTANY:
3563	case URX_DOTANY_UNIX:
3564	currentLen = safeIncrement(currentLen, `2`);
3565	break;
3566
3567	// Single literal character. Increase current max length by one or two,
3568	// depending on whether the char is in the supplementary range.
3569	case URX_ONECHAR:
3570	currentLen = safeIncrement(currentLen, `1`);
3571	if (URX_VAL(op) > `0x10000`) {
3572	currentLen = safeIncrement(currentLen, `1`);
3573	}
3574	break;
3575
3576	// Jumps.
3577	//
3578	case URX_JMP:
3579	case URX_JMPX:
3580	case URX_JMP_SAV:
3581	case URX_JMP_SAV_X:
3582	{
3583	int32_t jmpDest = URX_VAL(op);
3584	if (jmpDest < loc) {
3585	// Loop of some kind. Max match length is unbounded.
3586	currentLen = INT32_MAX;
3587	} else {
3588	// Forward jump. Propagate the current min length to the target loc of the jump.
3589	if (forwardedLength.elementAti(jmpDest) < currentLen) {
3590	forwardedLength.setElementAt(currentLen, jmpDest);
3591	}
3592	currentLen = `0`;
3593	}
3594	}
3595	break;
3596
3597	case URX_BACKTRACK:
3598	// back-tracks are kind of like a branch, except that the max length was
3599	// propagated already, by the state save.
3600	currentLen = forwardedLength.elementAti(loc+`1`);
3601	break;
3602
3603
3604	case URX_STATE_SAVE:
3605	{
3606	// State Save, for forward jumps, propagate the current minimum.
3607	// of the state save.
3608	// For backwards jumps, they create a loop, maximum
3609	// match length is unbounded.
3610	int32_t jmpDest = URX_VAL(op);
3611	if (jmpDest > loc) {
3612	if (currentLen > forwardedLength.elementAti(jmpDest)) {
3613	forwardedLength.setElementAt(currentLen, jmpDest);
3614	}
3615	} else {
3616	currentLen = INT32_MAX;
3617	}
3618	}
3619	break;
3620
3621
3622
3623
3624	case URX_STRING:
3625	{
3626	loc++;
3627	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3628	currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3629	break;
3630	}
3631
3632	case URX_STRING_I:
3633	// TODO: This code assumes that any user string that matches will be no longer
3634	// than our compiled string, with case insensitive matching.
3635	// Our compiled string has been case-folded already.
3636	//
3637	// Any matching user string will have no more code points than our
3638	// compiled (folded) string. Folding may add code points, but
3639	// not remove them.
3640	//
3641	// There is a potential problem if a supplemental code point
3642	// case-folds to a BMP code point. In this case our compiled string
3643	// could be shorter (in code units) than a matching user string.
3644	//
3645	// At this time (Unicode 6.1) there are no such characters, and this case
3646	// is not being handled. A test, intltest regex/Bug9283, will fail if
3647	// any problematic characters are added to Unicode.
3648	//
3649	// If this happens, we can make a set of the BMP chars that the
3650	// troublesome supplementals fold to, scan our string, and bump the
3651	// currentLen one extra for each that is found.
3652	//
3653	{
3654	loc++;
3655	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3656	currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3657	}
3658	break;
3659
3660	case URX_CTR_INIT:
3661	case URX_CTR_INIT_NG:
3662	// For Loops, recursively call this function on the pattern for the loop body,
3663	// then multiply the result by the maximum loop count.
3664	{
3665	int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+`1`));
3666	if (loopEndLoc == loc+`4`) {
3667	// Loop has an empty body. No affect on max match length.
3668	// Continue processing with code after the loop end.
3669	loc = loopEndLoc;
3670	break;
3671	}
3672
3673	int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+`3`));
3674	if (maxLoopCount == -`1`) {
3675	// Unbounded Loop. No upper bound on match length.
3676	currentLen = INT32_MAX;
3677	break;
3678	}
3679
3680	U_ASSERT(loopEndLoc >= loc+`4`);
3681	int64_t blockLen = maxMatchLength(loc+`4`, loopEndLoc-`1`); // Recursive call.
3682	int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3683	if (updatedLen >= INT32_MAX) {
3684	currentLen = INT32_MAX;
3685	break;
3686	}
3687	currentLen = (int32_t)updatedLen;
3688	loc = loopEndLoc;
3689	break;
3690	}
3691
3692	case URX_CTR_LOOP:
3693	case URX_CTR_LOOP_NG:
3694	// These opcodes will be skipped over by code for URX_CTR_INIT.
3695	// We shouldn't encounter them here.
3696	UPRV_UNREACHABLE;
3697
3698	case URX_LOOP_SR_I:
3699	case URX_LOOP_DOT_I:
3700	case URX_LOOP_C:
3701	// For anything to do with loops, make the match length unbounded.
3702	currentLen = INT32_MAX;
3703	break;
3704
3705
3706
3707	case URX_LA_START:
3708	case URX_LA_END:
3709	// Look-ahead. Just ignore, treat the look-ahead block as if
3710	// it were normal pattern. Gives a too-long match length,
3711	// but good enough for now.
3712	break;
3713
3714	// End of look-ahead ops should always be consumed by the processing at
3715	// the URX_LA_START op.
3716	// UPRV_UNREACHABLE;
3717
3718	case URX_LB_START:
3719	{
3720	// Look-behind. Scan forward until the matching look-around end,
3721	// without processing the look-behind block.
3722	int32_t dataLoc = URX_VAL(op);
3723	for (loc = loc + `1`; loc < end; ++loc) {
3724	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3725	int32_t opType = URX_TYPE(op);
3726	if ((opType == URX_LA_END \|\| opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
3727	break;
3728	}
3729	}
3730	U_ASSERT(loc < end);
3731	}
3732	break;
3733
3734	default:
3735	UPRV_UNREACHABLE;
3736	}
3737
3738
3739	if (currentLen == INT32_MAX) {
3740	// The maximum length is unbounded.
3741	// Stop further processing of the pattern.
3742	break;
3743	}
3744
3745	}
3746	return currentLen;
3747
3748	}
3749
3750
3751	//------------------------------------------------------------------------------
3752	//
3753	// stripNOPs Remove any NOP operations from the compiled pattern code.
3754	// Extra NOPs are inserted for some constructs during the initial
3755	// code generation to provide locations that may be patched later.
3756	// Many end up unneeded, and are removed by this function.
3757	//
3758	// In order to minimize the number of passes through the pattern,
3759	// back-reference fixup is also performed here (adjusting
3760	// back-reference operands to point to the correct frame offsets).
3761	//
3762	//------------------------------------------------------------------------------
3763	void RegexCompile::stripNOPs() {
3764
3765	if (U_FAILURE(*fStatus)) {
3766	return;
3767	}
3768
3769	int32_t end = fRXPat->fCompiledPat->size();
3770	UVector32 deltas(end, *fStatus);
3771
3772	// Make a first pass over the code, computing the amount that things
3773	// will be offset at each location in the original code.
3774	int32_t loc;
3775	int32_t d = `0`;
3776	for (loc=`0`; loc<end; loc++) {
3777	deltas.addElement(d, *fStatus);
3778	int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3779	if (URX_TYPE(op) == URX_NOP) {
3780	d++;
3781	}
3782	}
3783
3784	UnicodeString caseStringBuffer;
3785
3786	// Make a second pass over the code, removing the NOPs by moving following
3787	// code up, and patching operands that refer to code locations that
3788	// are being moved. The array of offsets from the first step is used
3789	// to compute the new operand values.
3790	int32_t src;
3791	int32_t dst = `0`;
3792	for (src=`0`; src<end; src++) {
3793	int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3794	int32_t opType = URX_TYPE(op);
3795	switch (opType) {
3796	case URX_NOP:
3797	break;
3798
3799	case URX_STATE_SAVE:
3800	case URX_JMP:
3801	case URX_CTR_LOOP:
3802	case URX_CTR_LOOP_NG:
3803	case URX_RELOC_OPRND:
3804	case URX_JMPX:
3805	case URX_JMP_SAV:
3806	case URX_JMP_SAV_X:
3807	// These are instructions with operands that refer to code locations.
3808	{
3809	int32_t operandAddress = URX_VAL(op);
3810	U_ASSERT(operandAddress>=`0` && operandAddress<deltas.size());
3811	int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3812	op = buildOp(opType, fixedOperandAddress);
3813	fRXPat->fCompiledPat->setElementAt(op, dst);
3814	dst++;
3815	break;
3816	}
3817
3818	case URX_BACKREF:
3819	case URX_BACKREF_I:
3820	{
3821	int32_t where = URX_VAL(op);
3822	if (where > fRXPat->fGroupMap->size()) {
3823	error(U_REGEX_INVALID_BACK_REF);
3824	break;
3825	}
3826	where = fRXPat->fGroupMap->elementAti(where-`1`);
3827	op = buildOp(opType, where);
3828	fRXPat->fCompiledPat->setElementAt(op, dst);
3829	dst++;
3830
3831	fRXPat->fNeedsAltInput = TRUE;
3832	break;
3833	}
3834	case URX_RESERVED_OP:
3835	case URX_RESERVED_OP_N:
3836	case URX_BACKTRACK:
3837	case URX_END:
3838	case URX_ONECHAR:
3839	case URX_STRING:
3840	case URX_STRING_LEN:
3841	case URX_START_CAPTURE:
3842	case URX_END_CAPTURE:
3843	case URX_STATIC_SETREF:
3844	case URX_STAT_SETREF_N:
3845	case URX_SETREF:
3846	case URX_DOTANY:
3847	case URX_FAIL:
3848	case URX_BACKSLASH_B:
3849	case URX_BACKSLASH_BU:
3850	case URX_BACKSLASH_G:
3851	case URX_BACKSLASH_X:
3852	case URX_BACKSLASH_Z:
3853	case URX_DOTANY_ALL:
3854	case URX_BACKSLASH_D:
3855	case URX_CARET:
3856	case URX_DOLLAR:
3857	case URX_CTR_INIT:
3858	case URX_CTR_INIT_NG:
3859	case URX_DOTANY_UNIX:
3860	case URX_STO_SP:
3861	case URX_LD_SP:
3862	case URX_STO_INP_LOC:
3863	case URX_LA_START:
3864	case URX_LA_END:
3865	case URX_ONECHAR_I:
3866	case URX_STRING_I:
3867	case URX_DOLLAR_M:
3868	case URX_CARET_M:
3869	case URX_CARET_M_UNIX:
3870	case URX_LB_START:
3871	case URX_LB_CONT:
3872	case URX_LB_END:
3873	case URX_LBN_CONT:
3874	case URX_LBN_END:
3875	case URX_LOOP_SR_I:
3876	case URX_LOOP_DOT_I:
3877	case URX_LOOP_C:
3878	case URX_DOLLAR_D:
3879	case URX_DOLLAR_MD:
3880	case URX_BACKSLASH_H:
3881	case URX_BACKSLASH_R:
3882	case URX_BACKSLASH_V:
3883	// These instructions are unaltered by the relocation.
3884	fRXPat->fCompiledPat->setElementAt(op, dst);
3885	dst++;
3886	break;
3887
3888	default:
3889	// Some op is unaccounted for.
3890	UPRV_UNREACHABLE;
3891	}
3892	}
3893
3894	fRXPat->fCompiledPat->setSize(dst);
3895	}
3896
3897
3898
3899
3900	//------------------------------------------------------------------------------
3901	//
3902	// Error Report a rule parse error.
3903	// Only report it if no previous error has been recorded.
3904	//
3905	//------------------------------------------------------------------------------
3906	void RegexCompile::error(UErrorCode e) {
3907	if (U_SUCCESS(*fStatus) \|\| e == U_MEMORY_ALLOCATION_ERROR) {
3908	*fStatus = e;
3909	// Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3910	// API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3911	// int64_t. If the values of the latter are out of range for the former,
3912	// set them to the appropriate "field not supported" values.
3913	if (fLineNum > `0x7FFFFFFF`) {
3914	fParseErr->line = `0`;
3915	fParseErr->offset = -`1`;
3916	} else if (fCharNum > `0x7FFFFFFF`) {
3917	fParseErr->line = (int32_t)fLineNum;
3918	fParseErr->offset = -`1`;
3919	} else {
3920	fParseErr->line = (int32_t)fLineNum;
3921	fParseErr->offset = (int32_t)fCharNum;
3922	}
3923
3924	UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3925
3926	// Fill in the context.
3927	// Note: extractBetween() pins supplied indicies to the string bounds.
3928	uprv_memset(fParseErr->preContext, `0`, sizeof(fParseErr->preContext));
3929	uprv_memset(fParseErr->postContext, `0`, sizeof(fParseErr->postContext));
3930	utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+`1`, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3931	utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-`1`, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3932	}
3933	}
3934
3935
3936	//
3937	// Assorted Unicode character constants.
3938	// Numeric because there is no portable way to enter them as literals.
3939	// (Think EBCDIC).
3940	//
3941	static const UChar chCR = `0x0d`; // New lines, for terminating comments.
3942	static const UChar chLF = `0x0a`; // Line Feed
3943	static const UChar chPound = `0x23`; // '#', introduces a comment.
3944	static const UChar chDigit0 = `0x30`; // '0'
3945	static const UChar chDigit7 = `0x37`; // '9'
3946	static const UChar chColon = `0x3A`; // ':'
3947	static const UChar chE = `0x45`; // 'E'
3948	static const UChar chQ = `0x51`; // 'Q'
3949	//static const UChar chN = 0x4E; // 'N'
3950	static const UChar chP = `0x50`; // 'P'
3951	static const UChar chBackSlash = `0x5c`; // '\' introduces a char escape
3952	//static const UChar chLBracket = 0x5b; // '['
3953	static const UChar chRBracket = `0x5d`; // ']'
3954	static const UChar chUp = `0x5e`; // '^'
3955	static const UChar chLowerP = `0x70`;
3956	static const UChar chLBrace = `0x7b`; // '{'
3957	static const UChar chRBrace = `0x7d`; // '}'
3958	static const UChar chNEL = `0x85`; // NEL newline variant
3959	static const UChar chLS = `0x2028`; // Unicode Line Separator
3960
3961
3962	//------------------------------------------------------------------------------
3963	//
3964	// nextCharLL Low Level Next Char from the regex pattern.
3965	// Get a char from the string, keep track of input position
3966	// for error reporting.
3967	//
3968	//------------------------------------------------------------------------------
3969	UChar32 RegexCompile::nextCharLL() {
3970	UChar32 ch;
3971
3972	if (fPeekChar != -`1`) {
3973	ch = fPeekChar;
3974	fPeekChar = -`1`;
3975	return ch;
3976	}
3977
3978	// assume we're already in the right place
3979	ch = UTEXT_NEXT32(fRXPat->fPattern);
3980	if (ch == U_SENTINEL) {
3981	return ch;
3982	}
3983
3984	if (ch == chCR \|\|
3985	ch == chNEL \|\|
3986	ch == chLS \|\|
3987	(ch == chLF && fLastChar != chCR)) {
3988	// Character is starting a new line. Bump up the line number, and
3989	// reset the column to 0.
3990	fLineNum++;
3991	fCharNum=`0`;
3992	}
3993	else {
3994	// Character is not starting a new line. Except in the case of a
3995	// LF following a CR, increment the column position.
3996	if (ch != chLF) {
3997	fCharNum++;
3998	}
3999	}
4000	fLastChar = ch;
4001	return ch;
4002	}
4003
4004	//------------------------------------------------------------------------------
4005	//
4006	// peekCharLL Low Level Character Scanning, sneak a peek at the next
4007	// character without actually getting it.
4008	//
4009	//------------------------------------------------------------------------------
4010	UChar32 RegexCompile::peekCharLL() {
4011	if (fPeekChar == -`1`) {
4012	fPeekChar = nextCharLL();
4013	}
4014	return fPeekChar;
4015	}
4016
4017
4018	//------------------------------------------------------------------------------
4019	//
4020	// nextChar for pattern scanning. At this level, we handle stripping
4021	// out comments and processing some backslash character escapes.
4022	// The rest of the pattern grammar is handled at the next level up.
4023	//
4024	//------------------------------------------------------------------------------
4025	void RegexCompile::nextChar(RegexPatternChar &c) {
4026	tailRecursion:
4027	fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4028	c.fChar = nextCharLL();
4029	c.fQuoted = FALSE;
4030
4031	if (fQuoteMode) {
4032	c.fQuoted = TRUE;
4033	if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == `0`)) \|\|
4034	c.fChar == (UChar32)-`1`) {
4035	fQuoteMode = FALSE; // Exit quote mode,
4036	nextCharLL(); // discard the E
4037	// nextChar(c); // recurse to get the real next char
4038	goto tailRecursion; // Note: fuzz testing produced testcases that
4039	// resulted in stack overflow here.
4040	}
4041	}
4042	else if (fInBackslashQuote) {
4043	// The current character immediately follows a '\'
4044	// Don't check for any further escapes, just return it as-is.
4045	// Don't set c.fQuoted, because that would prevent the state machine from
4046	// dispatching on the character.
4047	fInBackslashQuote = FALSE;
4048	}
4049	else
4050	{
4051	// We are not in a \Q quoted region \E of the source.
4052	//
4053	if (fModeFlags & UREGEX_COMMENTS) {
4054	//
4055	// We are in free-spacing and comments mode.
4056	// Scan through any white space and comments, until we
4057	// reach a significant character or the end of inut.
4058	for (;;) {
4059	if (c.fChar == (UChar32)-`1`) {
4060	break; // End of Input
4061	}
4062	if (c.fChar == chPound && fEOLComments == TRUE) {
4063	// Start of a comment. Consume the rest of it, until EOF or a new line
4064	for (;;) {
4065	c.fChar = nextCharLL();
4066	if (c.fChar == (UChar32)-`1` \|\| // EOF
4067	c.fChar == chCR \|\|
4068	c.fChar == chLF \|\|
4069	c.fChar == chNEL \|\|
4070	c.fChar == chLS) {
4071	break;
4072	}
4073	}
4074	}
4075	// TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061.
4076	if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4077	break;
4078	}
4079	c.fChar = nextCharLL();
4080	}
4081	}
4082
4083	//
4084	// check for backslash escaped characters.
4085	//
4086	if (c.fChar == chBackSlash) {
4087	int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4088	if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4089	//
4090	// A '\' sequence that is handled by ICU's standard unescapeAt function.
4091	// Includes \uxxxx, \n, \r, many others.
4092	// Return the single equivalent character.
4093	//
4094	nextCharLL(); // get & discard the peeked char.
4095	c.fQuoted = TRUE;
4096
4097	if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4098	int32_t endIndex = (int32_t)pos;
4099	c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4100
4101	if (endIndex == pos) {
4102	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4103	}
4104	fCharNum += endIndex - pos;
4105	UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4106	} else {
4107	int32_t offset = `0`;
4108	struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4109
4110	UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4111	c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4112
4113	if (offset == `0`) {
4114	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4115	} else if (context.lastOffset == offset) {
4116	UTEXT_PREVIOUS32(fRXPat->fPattern);
4117	} else if (context.lastOffset != offset-`1`) {
4118	utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - `1`);
4119	}
4120	fCharNum += offset;
4121	}
4122	}
4123	else if (peekCharLL() == chDigit0) {
4124	// Octal Escape, using Java Regexp Conventions
4125	// which are \0 followed by 1-3 octal digits.
4126	// Different from ICU Unescape handling of Octal, which does not
4127	// require the leading 0.
4128	// Java also has the convention of only consuming 2 octal digits if
4129	// the three digit number would be > 0xff
4130	//
4131	c.fChar = `0`;
4132	nextCharLL(); // Consume the initial 0.
4133	int index;
4134	for (index=`0`; index<`3`; index++) {
4135	int32_t ch = peekCharLL();
4136	if (ch<chDigit0 \|\| ch>chDigit7) {
4137	if (index==`0`) {
4138	// \0 is not followed by any octal digits.
4139	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4140	}
4141	break;
4142	}
4143	c.fChar <<= `3`;
4144	c.fChar += ch&`7`;
4145	if (c.fChar <= `255`) {
4146	nextCharLL();
4147	} else {
4148	// The last digit made the number too big. Forget we saw it.
4149	c.fChar >>= `3`;
4150	}
4151	}
4152	c.fQuoted = TRUE;
4153	}
4154	else if (peekCharLL() == chQ) {
4155	// "\Q" enter quote mode, which will continue until "\E"
4156	fQuoteMode = TRUE;
4157	nextCharLL(); // discard the 'Q'.
4158	// nextChar(c); // recurse to get the real next char.
4159	goto tailRecursion; // Note: fuzz testing produced test cases that
4160	// resulted in stack overflow here.
4161	}
4162	else
4163	{
4164	// We are in a '\' escape that will be handled by the state table scanner.
4165	// Just return the backslash, but remember that the following char is to
4166	// be taken literally.
4167	fInBackslashQuote = TRUE;
4168	}
4169	}
4170	}
4171
4172	// re-enable # to end-of-line comments, in case they were disabled.
4173	// They are disabled by the parser upon seeing '(?', but this lasts for
4174	// the fetching of the next character only.
4175	fEOLComments = TRUE;
4176
4177	// putc(c.fChar, stdout);
4178	}
4179
4180
4181
4182	//------------------------------------------------------------------------------
4183	//
4184	// scanNamedChar
4185	// Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4186	//
4187	// The scan position will be at the 'N'. On return
4188	// the scan position should be just after the '}'
4189	//
4190	// Return the UChar32
4191	//
4192	//------------------------------------------------------------------------------
4193	UChar32 RegexCompile::scanNamedChar() {
4194	if (U_FAILURE(*fStatus)) {
4195	return `0`;
4196	}
4197
4198	nextChar(fC);
4199	if (fC.fChar != chLBrace) {
4200	error(U_REGEX_PROPERTY_SYNTAX);
4201	return `0`;
4202	}
4203
4204	UnicodeString charName;
4205	for (;;) {
4206	nextChar(fC);
4207	if (fC.fChar == chRBrace) {
4208	break;
4209	}
4210	if (fC.fChar == -`1`) {
4211	error(U_REGEX_PROPERTY_SYNTAX);
4212	return `0`;
4213	}
4214	charName.append(fC.fChar);
4215	}
4216
4217	char name[`100`];
4218	if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) \|\|
4219	(uint32_t)charName.length()>=sizeof(name)) {
4220	// All Unicode character names have only invariant characters.
4221	// The API to get a character, given a name, accepts only char , forcing us to convert,*
4222	// which requires this error check
4223	error(U_REGEX_PROPERTY_SYNTAX);
4224	return `0`;
4225	}
4226	charName.extract(`0`, charName.length(), name, sizeof(name), US_INV);
4227
4228	UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4229	if (U_FAILURE(*fStatus)) {
4230	error(U_REGEX_PROPERTY_SYNTAX);
4231	}
4232
4233	nextChar(fC); // Continue overall regex pattern processing with char after the '}'
4234	return theChar;
4235	}
4236
4237	//------------------------------------------------------------------------------
4238	//
4239	// scanProp Construct a UnicodeSet from the text at the current scan
4240	// position, which will be of the form \p{whaterver}
4241	//
4242	// The scan position will be at the 'p' or 'P'. On return
4243	// the scan position should be just after the '}'
4244	//
4245	// Return a UnicodeSet, constructed from the \P pattern,
4246	// or NULL if the pattern is invalid.
4247	//
4248	//------------------------------------------------------------------------------
4249	UnicodeSet *RegexCompile::scanProp() {
4250	UnicodeSet *uset = NULL;
4251
4252	if (U_FAILURE(*fStatus)) {
4253	return NULL;
4254	}
4255	(void)chLowerP; // Suppress compiler unused variable warning.
4256	U_ASSERT(fC.fChar == chLowerP \|\| fC.fChar == chP);
4257	UBool negated = (fC.fChar == chP);
4258
4259	UnicodeString propertyName;
4260	nextChar(fC);
4261	if (fC.fChar != chLBrace) {
4262	error(U_REGEX_PROPERTY_SYNTAX);
4263	return NULL;
4264	}
4265	for (;;) {
4266	nextChar(fC);
4267	if (fC.fChar == chRBrace) {
4268	break;
4269	}
4270	if (fC.fChar == -`1`) {
4271	// Hit the end of the input string without finding the closing '}'
4272	error(U_REGEX_PROPERTY_SYNTAX);
4273	return NULL;
4274	}
4275	propertyName.append(fC.fChar);
4276	}
4277	uset = createSetForProperty(propertyName, negated);
4278	nextChar(fC); // Move input scan to position following the closing '}'
4279	return uset;
4280	}
4281
4282	//------------------------------------------------------------------------------
4283	//
4284	// scanPosixProp Construct a UnicodeSet from the text at the current scan
4285	// position, which is expected be of the form [:property expression:]
4286	//
4287	// The scan position will be at the opening ':'. On return
4288	// the scan position must be on the closing ']'
4289	//
4290	// Return a UnicodeSet constructed from the pattern,
4291	// or NULL if this is not a valid POSIX-style set expression.
4292	// If not a property expression, restore the initial scan position
4293	// (to the opening ':')
4294	//
4295	// Note: the opening '[:' is not sufficient to guarantee that
4296	// this is a [:property:] expression.
4297	// [:'+=,] is a perfectly good ordinary set expression that
4298	// happens to include ':' as one of its characters.
4299	//
4300	//------------------------------------------------------------------------------
4301	UnicodeSet *RegexCompile::scanPosixProp() {
4302	UnicodeSet *uset = NULL;
4303
4304	if (U_FAILURE(*fStatus)) {
4305	return NULL;
4306	}
4307
4308	U_ASSERT(fC.fChar == chColon);
4309
4310	// Save the scanner state.
4311	// TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
4312	int64_t savedScanIndex = fScanIndex;
4313	int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4314	UBool savedQuoteMode = fQuoteMode;
4315	UBool savedInBackslashQuote = fInBackslashQuote;
4316	UBool savedEOLComments = fEOLComments;
4317	int64_t savedLineNum = fLineNum;
4318	int64_t savedCharNum = fCharNum;
4319	UChar32 savedLastChar = fLastChar;
4320	UChar32 savedPeekChar = fPeekChar;
4321	RegexPatternChar savedfC = fC;
4322
4323	// Scan for a closing ]. A little tricky because there are some perverse
4324	// edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
4325	// ending on the second closing ].
4326
4327	UnicodeString propName;
4328	UBool negated = FALSE;
4329
4330	// Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
4331	nextChar(fC);
4332	if (fC.fChar == chUp) {
4333	negated = TRUE;
4334	nextChar(fC);
4335	}
4336
4337	// Scan for the closing ":]", collecting the property name along the way.
4338	UBool sawPropSetTerminator = FALSE;
4339	for (;;) {
4340	propName.append(fC.fChar);
4341	nextChar(fC);
4342	if (fC.fQuoted \|\| fC.fChar == -`1`) {
4343	// Escaped characters or end of input - either says this isn't a [:Property:]
4344	break;
4345	}
4346	if (fC.fChar == chColon) {
4347	nextChar(fC);
4348	if (fC.fChar == chRBracket) {
4349	sawPropSetTerminator = TRUE;
4350	}
4351	break;
4352	}
4353	}
4354
4355	if (sawPropSetTerminator) {
4356	uset = createSetForProperty(propName, negated);
4357	}
4358	else
4359	{
4360	// No closing ":]".
4361	// Restore the original scan position.
4362	// The main scanner will retry the input as a normal set expression,
4363	// not a [:Property:] expression.
4364	fScanIndex = savedScanIndex;
4365	fQuoteMode = savedQuoteMode;
4366	fInBackslashQuote = savedInBackslashQuote;
4367	fEOLComments = savedEOLComments;
4368	fLineNum = savedLineNum;
4369	fCharNum = savedCharNum;
4370	fLastChar = savedLastChar;
4371	fPeekChar = savedPeekChar;
4372	fC = savedfC;
4373	UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4374	}
4375	return uset;
4376	}
4377
4378	static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4379	set->add(`0`, `8`).add(`0x0e`, `0x1b`).add(`0x7f`, `0x9f`);
4380	addCategory(set, U_GC_CF_MASK, ec);
4381	}
4382
4383	//
4384	// Create a Unicode Set from a Unicode Property expression.
4385	// This is common code underlying both \p{...} ane [:...:] expressions.
4386	// Includes trying the Java "properties" that aren't supported as
4387	// normal ICU UnicodeSet properties
4388	//
4389	UnicodeSet RegexCompile::createSetForProperty(const* UnicodeString &propName, UBool negated) {
4390
4391	if (U_FAILURE(*fStatus)) {
4392	return nullptr;
4393	}
4394	LocalPointer<UnicodeSet> set;
4395	UErrorCode status = U_ZERO_ERROR;
4396
4397	do { // non-loop, exists to allow breaks from the block.
4398	//
4399	// First try the property as we received it
4400	//
4401	UnicodeString setExpr;
4402	uint32_t usetFlags = `0`;
4403	setExpr.append(u"[\\p{", -`1`);
4404	setExpr.append(propName);
4405	setExpr.append(u"}]", -`1`);
4406	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4407	usetFlags \|= USET_CASE_INSENSITIVE;
4408	}
4409	set.adoptInsteadAndCheckErrorCode(new UnicodeSet (setExpr, usetFlags, NULL, status), status);
4410	if (U_SUCCESS(status) \|\| status == U_MEMORY_ALLOCATION_ERROR) {
4411	break;
4412	}
4413
4414	//
4415	// The incoming property wasn't directly recognized by ICU.
4416
4417	// Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4418	// Java accepts 'word' with mixed case.
4419	// Java accepts 'all' only in all lower case.
4420
4421	status = U_ZERO_ERROR;
4422	if (propName.caseCompare(u"word", -`1`, `0`) == `0`) {
4423	set.adoptInsteadAndCheckErrorCode(
4424	RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].cloneAsThawed(), status);
4425	break;
4426	}
4427	if (propName.compare(u"all", -`1`) == `0`) {
4428	set.adoptInsteadAndCheckErrorCode(new UnicodeSet (`0`, `0x10ffff`), status);
4429	break;
4430	}
4431
4432
4433	// Do Java InBlock expressions
4434	//
4435	UnicodeString mPropName = propName;
4436	if (mPropName.startsWith(u"In", `2`) && mPropName.length() >= `3`) {
4437	status = U_ZERO_ERROR;
4438	set.adoptInsteadAndCheckErrorCode(new UnicodeSet (), status);
4439	if (U_FAILURE(status)) {
4440	break;
4441	}
4442	UnicodeString blockName(mPropName, `2`); // Property with the leading "In" removed.
4443	set ->applyPropertyAlias(UnicodeString (u"Block"), blockName, status);
4444	break;
4445	}
4446
4447	// Check for the Java form "IsBooleanPropertyValue", which we will recast
4448	// as "BooleanPropertyValue". The property value can be either a
4449	// a General Category or a Script Name.
4450
4451	if (propName.startsWith(u"Is", `2`) && propName.length()>=`3`) {
4452	mPropName.remove(`0`, `2`); // Strip the "Is"
4453	if (mPropName.indexOf(u`'='`) >= `0`) {
4454	// Reject any "Is..." property expression containing an '=', that is,
4455	// any non-binary property expression.
4456	status = U_REGEX_PROPERTY_SYNTAX;
4457	break;
4458	}
4459
4460	if (mPropName.caseCompare(u"assigned", -`1`, `0`) == `0`) {
4461	mPropName.setTo(u"unassigned", -`1`);
4462	negated = !negated;
4463	} else if (mPropName.caseCompare(u"TitleCase", -`1`, `0`) == `0`) {
4464	mPropName.setTo(u"Titlecase_Letter", -`1`);
4465	}
4466
4467	mPropName.insert(`0`, u"[\\p{", -`1`);
4468	mPropName.append(u"}]", -`1`);
4469	set.adoptInsteadAndCheckErrorCode(new UnicodeSet (mPropName, *fStatus), status);
4470
4471	if (U_SUCCESS(status) && !set ->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4472	set ->closeOver(USET_CASE_INSENSITIVE);
4473	}
4474	break;
4475
4476	}
4477
4478	if (propName.startsWith(u"java", -`1`)) {
4479	status = U_ZERO_ERROR;
4480	set.adoptInsteadAndCheckErrorCode(new UnicodeSet (), status);
4481	if (U_FAILURE(status)) {
4482	break;
4483	}
4484	//
4485	// Try the various Java specific properties.
4486	// These all begin with "java"
4487	//
4488	if (propName.compare(u"javaDefined", -`1`) == `0`) {
4489	addCategory(set.getAlias(), U_GC_CN_MASK, status);
4490	set ->complement();
4491	}
4492	else if (propName.compare(u"javaDigit", -`1`) == `0`) {
4493	addCategory(set.getAlias(), U_GC_ND_MASK, status);
4494	}
4495	else if (propName.compare(u"javaIdentifierIgnorable", -`1`) == `0`) {
4496	addIdentifierIgnorable(set.getAlias(), status);
4497	}
4498	else if (propName.compare(u"javaISOControl", -`1`) == `0`) {
4499	set ->add(`0`, `0x1F`).add(`0x7F`, `0x9F`);
4500	}
4501	else if (propName.compare(u"javaJavaIdentifierPart", -`1`) == `0`) {
4502	addCategory(set.getAlias(), U_GC_L_MASK, status);
4503	addCategory(set.getAlias(), U_GC_SC_MASK, status);
4504	addCategory(set.getAlias(), U_GC_PC_MASK, status);
4505	addCategory(set.getAlias(), U_GC_ND_MASK, status);
4506	addCategory(set.getAlias(), U_GC_NL_MASK, status);
4507	addCategory(set.getAlias(), U_GC_MC_MASK, status);
4508	addCategory(set.getAlias(), U_GC_MN_MASK, status);
4509	addIdentifierIgnorable(set.getAlias(), status);
4510	}
4511	else if (propName.compare(u"javaJavaIdentifierStart", -`1`) == `0`) {
4512	addCategory(set.getAlias(), U_GC_L_MASK, status);
4513	addCategory(set.getAlias(), U_GC_NL_MASK, status);
4514	addCategory(set.getAlias(), U_GC_SC_MASK, status);
4515	addCategory(set.getAlias(), U_GC_PC_MASK, status);
4516	}
4517	else if (propName.compare(u"javaLetter", -`1`) == `0`) {
4518	addCategory(set.getAlias(), U_GC_L_MASK, status);
4519	}
4520	else if (propName.compare(u"javaLetterOrDigit", -`1`) == `0`) {
4521	addCategory(set.getAlias(), U_GC_L_MASK, status);
4522	addCategory(set.getAlias(), U_GC_ND_MASK, status);
4523	}
4524	else if (propName.compare(u"javaLowerCase", -`1`) == `0`) {
4525	addCategory(set.getAlias(), U_GC_LL_MASK, status);
4526	}
4527	else if (propName.compare(u"javaMirrored", -`1`) == `0`) {
4528	set ->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, `1`, status);
4529	}
4530	else if (propName.compare(u"javaSpaceChar", -`1`) == `0`) {
4531	addCategory(set.getAlias(), U_GC_Z_MASK, status);
4532	}
4533	else if (propName.compare(u"javaSupplementaryCodePoint", -`1`) == `0`) {
4534	set ->add(`0x10000`, UnicodeSet::MAX_VALUE);
4535	}
4536	else if (propName.compare(u"javaTitleCase", -`1`) == `0`) {
4537	addCategory(set.getAlias(), U_GC_LT_MASK, status);
4538	}
4539	else if (propName.compare(u"javaUnicodeIdentifierStart", -`1`) == `0`) {
4540	addCategory(set.getAlias(), U_GC_L_MASK, status);
4541	addCategory(set.getAlias(), U_GC_NL_MASK, status);
4542	}
4543	else if (propName.compare(u"javaUnicodeIdentifierPart", -`1`) == `0`) {
4544	addCategory(set.getAlias(), U_GC_L_MASK, status);
4545	addCategory(set.getAlias(), U_GC_PC_MASK, status);
4546	addCategory(set.getAlias(), U_GC_ND_MASK, status);
4547	addCategory(set.getAlias(), U_GC_NL_MASK, status);
4548	addCategory(set.getAlias(), U_GC_MC_MASK, status);
4549	addCategory(set.getAlias(), U_GC_MN_MASK, status);
4550	addIdentifierIgnorable(set.getAlias(), status);
4551	}
4552	else if (propName.compare(u"javaUpperCase", -`1`) == `0`) {
4553	addCategory(set.getAlias(), U_GC_LU_MASK, status);
4554	}
4555	else if (propName.compare(u"javaValidCodePoint", -`1`) == `0`) {
4556	set ->add(`0`, UnicodeSet::MAX_VALUE);
4557	}
4558	else if (propName.compare(u"javaWhitespace", -`1`) == `0`) {
4559	addCategory(set.getAlias(), U_GC_Z_MASK, status);
4560	set ->removeAll(UnicodeSet ().add(`0xa0`).add(`0x2007`).add(`0x202f`));
4561	set ->add(`9`, `0x0d`).add(`0x1c`, `0x1f`);
4562	} else {
4563	status = U_REGEX_PROPERTY_SYNTAX;
4564	}
4565
4566	if (U_SUCCESS(status) && !set ->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4567	set ->closeOver(USET_CASE_INSENSITIVE);
4568	}
4569	break;
4570	}
4571
4572	// Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4573	// extensions matched it.
4574	status = U_REGEX_PROPERTY_SYNTAX;
4575	} while (false); // End of do loop block. Code above breaks out of the block on success or hard failure.
4576
4577	if (U_SUCCESS(status)) {
4578	U_ASSERT(set.isValid());
4579	if (negated) {
4580	set ->complement();
4581	}
4582	return set.orphan();
4583	} else {
4584	if (status == U_ILLEGAL_ARGUMENT_ERROR) {
4585	status = U_REGEX_PROPERTY_SYNTAX;
4586	}
4587	error(status);
4588	return nullptr;
4589	}
4590	}
4591
4592
4593	//
4594	// SetEval Part of the evaluation of [set expressions].
4595	// Perform any pending (stacked) operations with precedence
4596	// equal or greater to that of the next operator encountered
4597	// in the expression.
4598	//
4599	void RegexCompile::setEval(int32_t nextOp) {
4600	UnicodeSet *rightOperand = NULL;
4601	UnicodeSet *leftOperand = NULL;
4602	for (;;) {
4603	U_ASSERT(fSetOpStack.empty()==FALSE);
4604	int32_t pendingSetOperation = fSetOpStack.peeki();
4605	if ((pendingSetOperation&`0xffff0000`) < (nextOp&`0xffff0000`)) {
4606	break;
4607	}
4608	fSetOpStack.popi();
4609	U_ASSERT(fSetStack.empty() == FALSE);
4610	rightOperand = (UnicodeSet *)fSetStack.peek();
4611	switch (pendingSetOperation) {
4612	case setNegation:
4613	rightOperand->complement();
4614	break;
4615	case setCaseClose:
4616	// TODO: need a simple close function. Ticket 6065
4617	rightOperand->closeOver(USET_CASE_INSENSITIVE);
4618	rightOperand->removeAllStrings();
4619	break;
4620	case setDifference1:
4621	case setDifference2:
4622	fSetStack.pop();
4623	leftOperand = (UnicodeSet *)fSetStack.peek();
4624	leftOperand->removeAll(*rightOperand);
4625	delete rightOperand;
4626	break;
4627	case setIntersection1:
4628	case setIntersection2:
4629	fSetStack.pop();
4630	leftOperand = (UnicodeSet *)fSetStack.peek();
4631	leftOperand->retainAll(*rightOperand);
4632	delete rightOperand;
4633	break;
4634	case setUnion:
4635	fSetStack.pop();
4636	leftOperand = (UnicodeSet *)fSetStack.peek();
4637	leftOperand->addAll(*rightOperand);
4638	delete rightOperand;
4639	break;
4640	default:
4641	UPRV_UNREACHABLE;
4642	}
4643	}
4644	}
4645
4646	void RegexCompile::setPushOp(int32_t op) {
4647	setEval(op);
4648	fSetOpStack.push(op, *fStatus);
4649	fSetStack.push(new UnicodeSet (), *fStatus);
4650	}
4651
4652	U_NAMESPACE_END
4653	#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
4654
4655

Browse the source code of engine/third_party/icu/source/i18n/regexcmp.cpp