regexcmp.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/regexcmp.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	//
4	// file: regexcmp.cpp
5	//
6	// Copyright (C) 2002-2016 International Business Machines Corporation and others.
7	// All Rights Reserved.
8	//
9	// This file contains the ICU regular expression compiler, which is responsible
10	// for processing a regular expression pattern into the compiled form that
11	// is used by the match finding engine.
12	//
13
14	#include "unicode/utypes.h"
15
16	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
17
18	#include "unicode/ustring.h"
19	#include "unicode/unistr.h"
20	#include "unicode/uniset.h"
21	#include "unicode/uchar.h"
22	#include "unicode/uchriter.h"
23	#include "unicode/parsepos.h"
24	#include "unicode/parseerr.h"
25	#include "unicode/regex.h"
26	#include "unicode/utf.h"
27	#include "unicode/utf16.h"
28	#include "patternprops.h"
29	#include "putilimp.h"
30	#include "cmemory.h"
31	#include "cstr.h"
32	#include "cstring.h"
33	#include "uvectr32.h"
34	#include "uvectr64.h"
35	#include "uassert.h"
36	#include "uinvchar.h"
37
38	#include "regeximp.h"
39	#include "regexcst.h" // Contains state table for the regex pattern parser.
40	// generated by a Perl script.
41	#include "regexcmp.h"
42	#include "regexst.h"
43	#include "regextxt.h"
44
45
46
47	U_NAMESPACE_BEGIN
48
49
50	//------------------------------------------------------------------------------
51	//
52	// Constructor.
53	//
54	//------------------------------------------------------------------------------
55	RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
56	fParenStack (status), fSetStack (status), fSetOpStack (status)
57	{
58	// Lazy init of all shared global sets (needed for init()'s empty text)
59	RegexStaticSets::initGlobals(&status);
60
61	fStatus = &status;
62
63	fRXPat = rxp;
64	fScanIndex = `0`;
65	fLastChar = -`1`;
66	fPeekChar = -`1`;
67	fLineNum = `1`;
68	fCharNum = `0`;
69	fQuoteMode = FALSE;
70	fInBackslashQuote = FALSE;
71	fModeFlags = fRXPat->fFlags \| `0x80000000`;
72	fEOLComments = TRUE;
73
74	fMatchOpenParen = -`1`;
75	fMatchCloseParen = -`1`;
76	fCaptureName = NULL;
77	fLastSetLiteral = U_SENTINEL;
78
79	if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
80	status = rxp->fDeferredStatus;
81	}
82	}
83
84	static const UChar chAmp = `0x26`; // '&'
85	static const UChar chDash = `0x2d`; // '-'
86
87
88	//------------------------------------------------------------------------------
89	//
90	// Destructor
91	//
92	//------------------------------------------------------------------------------
93	RegexCompile::~RegexCompile() {
94	delete fCaptureName; // Normally will be NULL, but can exist if pattern
95	// compilation stops with a syntax error.
96	}
97
98	static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
99	set->addAll(UnicodeSet ().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
100	}
101
102	//------------------------------------------------------------------------------
103	//
104	// Compile regex pattern. The state machine for rexexp pattern parsing is here.
105	// The state tables are hand-written in the file regexcst.txt,
106	// and converted to the form used here by a perl
107	// script regexcst.pl
108	//
109	//------------------------------------------------------------------------------
110	void RegexCompile::compile(
111	const UnicodeString &pat, // Source pat to be compiled.
112	UParseError &pp, // Error position info
113	UErrorCode &e) // Error Code
114	{
115	fRXPat->fPatternString = new UnicodeString (pat);
116	UText patternText = UTEXT_INITIALIZER;
117	utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
118
119	if (U_SUCCESS(e)) {
120	compile(&patternText, pp, e);
121	utext_close(&patternText);
122	}
123	}
124
125	//
126	// compile, UText mode
127	// All the work is actually done here.
128	//
129	void RegexCompile::compile(
130	UText pat, // Source pat to be compiled.*
131	UParseError &pp, // Error position info
132	UErrorCode &e) // Error Code
133	{
134	fStatus = &e;
135	fParseErr = &pp;
136	fStackPtr = `0`;
137	fStack[fStackPtr] = `0`;
138
139	if (U_FAILURE(*fStatus)) {
140	return;
141	}
142
143	// There should be no pattern stuff in the RegexPattern object. They can not be reused.
144	U_ASSERT(fRXPat->fPattern == NULL \|\| utext_nativeLength(fRXPat->fPattern) == `0`);
145
146	// Prepare the RegexPattern object to receive the compiled pattern.
147	fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
148	if (U_FAILURE(*fStatus)) {
149	return;
150	}
151	fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets;
152	fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8;
153
154
155	// Initialize the pattern scanning state machine
156	fPatternLength = utext_nativeLength(pat);
157	uint16_t state = `1`;
158	const RegexTableEl *tableEl;
159
160	// UREGEX_LITERAL force entire pattern to be treated as a literal string.
161	if (fModeFlags & UREGEX_LITERAL) {
162	fQuoteMode = TRUE;
163	}
164
165	nextChar(fC); // Fetch the first char from the pattern string.
166
167	//
168	// Main loop for the regex pattern parsing state machine.
169	// Runs once per state transition.
170	// Each time through optionally performs, depending on the state table,
171	// - an advance to the the next pattern char
172	// - an action to be performed.
173	// - pushing or popping a state to/from the local state return stack.
174	// file regexcst.txt is the source for the state table. The logic behind
175	// recongizing the pattern syntax is there, not here.
176	//
177	for (;;) {
178	// Bail out if anything has gone wrong.
179	// Regex pattern parsing stops on the first error encountered.
180	if (U_FAILURE(*fStatus)) {
181	break;
182	}
183
184	U_ASSERT(state != `0`);
185
186	// Find the state table element that matches the input char from the pattern, or the
187	// class of the input character. Start with the first table row for this
188	// state, then linearly scan forward until we find a row that matches the
189	// character. The last row for each state always matches all characters, so
190	// the search will stop there, if not before.
191	//
192	tableEl = &gRuleParseStateTable[state];
193	REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
194	fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
195
196	for (;;) { // loop through table rows belonging to this state, looking for one
197	// that matches the current input char.
198	REGEX_SCAN_DEBUG_PRINTF(("."));
199	if (tableEl->fCharClass < `127` && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
200	// Table row specified an individual character, not a set, and
201	// the input character is not quoted, and
202	// the input character matched it.
203	break;
204	}
205	if (tableEl->fCharClass == `255`) {
206	// Table row specified default, match anything character class.
207	break;
208	}
209	if (tableEl->fCharClass == `254` && fC.fQuoted) {
210	// Table row specified "quoted" and the char was quoted.
211	break;
212	}
213	if (tableEl->fCharClass == `253` && fC.fChar == (UChar32)-`1`) {
214	// Table row specified eof and we hit eof on the input.
215	break;
216	}
217
218	if (tableEl->fCharClass >= `128` && tableEl->fCharClass < `240` && // Table specs a char class &&
219	fC.fQuoted == FALSE && // char is not escaped &&
220	fC.fChar != (UChar32)-`1`) { // char is not EOF
221	U_ASSERT(tableEl->fCharClass <= `137`);
222	if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-`128`].contains(fC.fChar)) {
223	// Table row specified a character class, or set of characters,
224	// and the current char matches it.
225	break;
226	}
227	}
228
229	// No match on this row, advance to the next row for this state,
230	tableEl++;
231	}
232	REGEX_SCAN_DEBUG_PRINTF(("\n"));
233
234	//
235	// We've found the row of the state table that matches the current input
236	// character from the rules string.
237	// Perform any action specified by this row in the state table.
238	if (doParseActions(tableEl->fAction) == FALSE) {
239	// Break out of the state machine loop if the
240	// the action signalled some kind of error, or
241	// the action was to exit, occurs on normal end-of-rules-input.
242	break;
243	}
244
245	if (tableEl->fPushState != `0`) {
246	fStackPtr++;
247	if (fStackPtr >= kStackSize) {
248	error(U_REGEX_INTERNAL_ERROR);
249	REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
250	fStackPtr--;
251	}
252	fStack[fStackPtr] = tableEl->fPushState;
253	}
254
255	//
256	// NextChar. This is where characters are actually fetched from the pattern.
257	// Happens under control of the 'n' tag in the state table.
258	//
259	if (tableEl->fNextChar) {
260	nextChar(fC);
261	}
262
263	// Get the next state from the table entry, or from the
264	// state stack if the next state was specified as "pop".
265	if (tableEl->fNextState != `255`) {
266	state = tableEl->fNextState;
267	} else {
268	state = fStack[fStackPtr];
269	fStackPtr--;
270	if (fStackPtr < `0`) {
271	// state stack underflow
272	// This will occur if the user pattern has mis-matched parentheses,
273	// with extra close parens.
274	//
275	fStackPtr++;
276	error(U_REGEX_MISMATCHED_PAREN);
277	}
278	}
279
280	}
281
282	if (U_FAILURE(*fStatus)) {
283	// Bail out if the pattern had errors.
284	// Set stack cleanup: a successful compile would have left it empty,
285	// but errors can leave temporary sets hanging around.
286	while (!fSetStack.empty()) {
287	delete (UnicodeSet *)fSetStack.pop();
288	}
289	return;
290	}
291
292	//
293	// The pattern has now been read and processed, and the compiled code generated.
294	//
295
296	//
297	// The pattern's fFrameSize so far has accumulated the requirements for
298	// storage for capture parentheses, counters, etc. that are encountered
299	// in the pattern. Add space for the two variables that are always
300	// present in the saved state: the input string position (int64_t) and
301	// the position in the compiled pattern.
302	//
303	allocateStackData(RESTACKFRAME_HDRCOUNT);
304
305	//
306	// Optimization pass 1: NOPs, back-references, and case-folding
307	//
308	stripNOPs();
309
310	//
311	// Get bounds for the minimum and maximum length of a string that this
312	// pattern can match. Used to avoid looking for matches in strings that
313	// are too short.
314	//
315	fRXPat->fMinMatchLen = minMatchLength(`3`, fRXPat->fCompiledPat->size()-`1`);
316
317	//
318	// Optimization pass 2: match start type
319	//
320	matchStartType();
321
322	//
323	// Set up fast latin-1 range sets
324	//
325	int32_t numSets = fRXPat->fSets->size();
326	fRXPat->fSets8 = new Regex8BitSet[numSets];
327	// Null pointer check.
328	if (fRXPat->fSets8 == NULL) {
329	e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
330	return;
331	}
332	int32_t i;
333	for (i=`0`; i<numSets; i++) {
334	UnicodeSet s = (UnicodeSet )fRXPat->fSets->elementAt(i);
335	fRXPat->fSets8[i].init(s);
336	}
337
338	}
339
340
341
342
343
344	//------------------------------------------------------------------------------
345	//
346	// doParseAction Do some action during regex pattern parsing.
347	// Called by the parse state machine.
348	//
349	// Generation of the match engine PCode happens here, or
350	// in functions called from the parse actions defined here.
351	//
352	//
353	//------------------------------------------------------------------------------
354	UBool RegexCompile::doParseActions(int32_t action)
355	{
356	UBool returnVal = TRUE;
357
358	switch ((Regex_PatternParseAction)action) {
359
360	case doPatStart:
361	// Start of pattern compiles to:
362	//0 SAVE 2 Fall back to position of FAIL
363	//1 jmp 3
364	//2 FAIL Stop if we ever reach here.
365	//3 NOP Dummy, so start of pattern looks the same as
366	// the start of an ( grouping.
367	//4 NOP Resreved, will be replaced by a save if there are
368	// OR \| operators at the top level
369	appendOp(URX_STATE_SAVE, `2`);
370	appendOp(URX_JMP, `3`);
371	appendOp(URX_FAIL, `0`);
372
373	// Standard open nonCapture paren action emits the two NOPs and
374	// sets up the paren stack frame.
375	doParseActions(doOpenNonCaptureParen);
376	break;
377
378	case doPatFinish:
379	// We've scanned to the end of the pattern
380	// The end of pattern compiles to:
381	// URX_END
382	// which will stop the runtime match engine.
383	// Encountering end of pattern also behaves like a close paren,
384	// and forces fixups of the State Save at the beginning of the compiled pattern
385	// and of any OR operations at the top level.
386	//
387	handleCloseParen();
388	if (fParenStack.size() > `0`) {
389	// Missing close paren in pattern.
390	error(U_REGEX_MISMATCHED_PAREN);
391	}
392
393	// add the END operation to the compiled pattern.
394	appendOp(URX_END, `0`);
395
396	// Terminate the pattern compilation state machine.
397	returnVal = FALSE;
398	break;
399
400
401
402	case doOrOperator:
403	// Scanning a '\|', as in (A\|B)
404	{
405	// Generate code for any pending literals preceding the '\|'
406	fixLiterals(FALSE);
407
408	// Insert a SAVE operation at the start of the pattern section preceding
409	// this OR at this level. This SAVE will branch the match forward
410	// to the right hand side of the OR in the event that the left hand
411	// side fails to match and backtracks. Locate the position for the
412	// save from the location on the top of the parentheses stack.
413	int32_t savePosition = fParenStack.popi();
414	int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
415	U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location
416	op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+`1`);
417	fRXPat->fCompiledPat->setElementAt(op, savePosition);
418
419	// Append an JMP operation into the compiled pattern. The operand for
420	// the JMP will eventually be the location following the ')' for the
421	// group. This will be patched in later, when the ')' is encountered.
422	appendOp(URX_JMP, `0`);
423
424	// Push the position of the newly added JMP op onto the parentheses stack.
425	// This registers if for fixup when this block's close paren is encountered.
426	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, *fStatus);
427
428	// Append a NOP to the compiled pattern. This is the slot reserved
429	// for a SAVE in the event that there is yet another '\|' following
430	// this one.
431	appendOp(URX_NOP, `0`);
432	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, *fStatus);
433	}
434	break;
435
436
437	case doBeginNamedCapture:
438	// Scanning (?<letter.
439	// The first letter of the name will come through again under doConinueNamedCapture.
440	fCaptureName = new UnicodeString ();
441	if (fCaptureName == NULL) {
442	error(U_MEMORY_ALLOCATION_ERROR);
443	}
444	break;
445
446	case doContinueNamedCapture:
447	fCaptureName->append(fC.fChar);
448	break;
449
450	case doBadNamedCapture:
451	error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
452	break;
453
454	case doOpenCaptureParen:
455	// Open Capturing Paren, possibly named.
456	// Compile to a
457	// - NOP, which later may be replaced by a save-state if the
458	// parenthesized group gets a quantifier, followed by*
459	// - START_CAPTURE n where n is stack frame offset to the capture group variables.
460	// - NOP, which may later be replaced by a save-state if there
461	// is an '\|' alternation within the parens.
462	//
463	// Each capture group gets three slots in the save stack frame:
464	// 0: Capture Group start position (in input string being matched.)
465	// 1: Capture Group end position.
466	// 2: Start of Match-in-progress.
467	// The first two locations are for a completed capture group, and are
468	// referred to by back references and the like.
469	// The third location stores the capture start position when an START_CAPTURE is
470	// encountered. This will be promoted to a completed capture when (and if) the corresponding
471	// END_CAPTURE is encountered.
472	{
473	fixLiterals();
474	appendOp(URX_NOP, `0`);
475	int32_t varsLoc = allocateStackData(`3`); // Reserve three slots in match stack frame.
476	appendOp(URX_START_CAPTURE, varsLoc);
477	appendOp(URX_NOP, `0`);
478
479	// On the Parentheses stack, start a new frame and add the postions
480	// of the two NOPs. Depending on what follows in the pattern, the
481	// NOPs may be changed to SAVE_STATE or JMP ops, with a target
482	// address of the end of the parenthesized group.
483	fParenStack.push(fModeFlags, fStatus); // Match mode state*
484	fParenStack.push(capturing, fStatus); // Frame type.*
485	fParenStack.push(fRXPat->fCompiledPat->size()-`3`, fStatus); // The first NOP location*
486	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The second NOP loc*
487
488	// Save the mapping from group number to stack frame variable position.
489	fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
490
491	// If this is a named capture group, add the name->group number mapping.
492	if (fCaptureName != NULL) {
493	if (!fRXPat->initNamedCaptureMap()) {
494	if (U_SUCCESS(*fStatus)) {
495	error(fRXPat->fDeferredStatus);
496	}
497	break;
498	}
499	int32_t groupNumber = fRXPat->fGroupMap->size();
500	int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
501	fCaptureName = NULL; // hash table takes ownership of the name (key) string.
502	if (previousMapping > `0` && U_SUCCESS(*fStatus)) {
503	error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
504	}
505	}
506	}
507	break;
508
509	case doOpenNonCaptureParen:
510	// Open non-caputuring (grouping only) Paren.
511	// Compile to a
512	// - NOP, which later may be replaced by a save-state if the
513	// parenthesized group gets a quantifier, followed by*
514	// - NOP, which may later be replaced by a save-state if there
515	// is an '\|' alternation within the parens.
516	{
517	fixLiterals();
518	appendOp(URX_NOP, `0`);
519	appendOp(URX_NOP, `0`);
520
521	// On the Parentheses stack, start a new frame and add the postions
522	// of the two NOPs.
523	fParenStack.push(fModeFlags, fStatus); // Match mode state*
524	fParenStack.push(plain, fStatus); // Begin a new frame.*
525	fParenStack.push(fRXPat->fCompiledPat->size()-`2`, fStatus); // The first NOP location*
526	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The second NOP loc*
527	}
528	break;
529
530
531	case doOpenAtomicParen:
532	// Open Atomic Paren. (?>
533	// Compile to a
534	// - NOP, which later may be replaced if the parenthesized group
535	// has a quantifier, followed by
536	// - STO_SP save state stack position, so it can be restored at the ")"
537	// - NOP, which may later be replaced by a save-state if there
538	// is an '\|' alternation within the parens.
539	{
540	fixLiterals();
541	appendOp(URX_NOP, `0`);
542	int32_t varLoc = allocateData(`1`); // Reserve a data location for saving the state stack ptr.
543	appendOp(URX_STO_SP, varLoc);
544	appendOp(URX_NOP, `0`);
545
546	// On the Parentheses stack, start a new frame and add the postions
547	// of the two NOPs. Depending on what follows in the pattern, the
548	// NOPs may be changed to SAVE_STATE or JMP ops, with a target
549	// address of the end of the parenthesized group.
550	fParenStack.push(fModeFlags, fStatus); // Match mode state*
551	fParenStack.push(atomic, fStatus); // Frame type.*
552	fParenStack.push(fRXPat->fCompiledPat->size()-`3`, fStatus); // The first NOP*
553	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The second NOP*
554	}
555	break;
556
557
558	case doOpenLookAhead:
559	// Positive Look-ahead (?= stuff )
560	//
561	// Note: Addition of transparent input regions, with the need to
562	// restore the original regions when failing out of a lookahead
563	// block, complicated this sequence. Some conbined opcodes
564	// might make sense - or might not, lookahead aren't that common.
565	//
566	// Caution: min match length optimization knows about this
567	// sequence; don't change without making updates there too.
568	//
569	// Compiles to
570	// 1 LA_START dataLoc Saves SP, Input Pos, Active input region.
571	// 2. STATE_SAVE 4 on failure of lookahead, goto 4
572	// 3 JMP 6 continue ...
573	//
574	// 4. LA_END Look Ahead failed. Restore regions.
575	// 5. BACKTRACK and back track again.
576	//
577	// 6. NOP reserved for use by quantifiers on the block.
578	// Look-ahead can't have quantifiers, but paren stack
579	// compile time conventions require the slot anyhow.
580	// 7. NOP may be replaced if there is are '\|' ops in the block.
581	// 8. code for parenthesized stuff.
582	// 9. LA_END
583	//
584	// Four data slots are reserved, for saving state on entry to the look-around
585	// 0: stack pointer on entry.
586	// 1: input position on entry.
587	// 2: fActiveStart, the active bounds start on entry.
588	// 3: fActiveLimit, the active bounds limit on entry.
589	{
590	fixLiterals();
591	int32_t dataLoc = allocateData(`4`);
592	appendOp(URX_LA_START, dataLoc);
593	appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ `2`);
594	appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ `3`);
595	appendOp(URX_LA_END, dataLoc);
596	appendOp(URX_BACKTRACK, `0`);
597	appendOp(URX_NOP, `0`);
598	appendOp(URX_NOP, `0`);
599
600	// On the Parentheses stack, start a new frame and add the postions
601	// of the NOPs.
602	fParenStack.push(fModeFlags, fStatus); // Match mode state*
603	fParenStack.push(lookAhead, fStatus); // Frame type.*
604	fParenStack.push(fRXPat->fCompiledPat->size()-`2`, fStatus); // The first NOP location*
605	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The second NOP location*
606	}
607	break;
608
609	case doOpenLookAheadNeg:
610	// Negated Lookahead. (?! stuff )
611	// Compiles to
612	// 1. LA_START dataloc
613	// 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
614	// // which continues with the match.
615	// 3. NOP // Std. Open Paren sequence, for possible '\|'
616	// 4. code for parenthesized stuff.
617	// 5. LA_END // Cut back stack, remove saved state from step 2.
618	// 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
619	// 7. END_LA // Restore match region, in case look-ahead was using
620	// an alternate (transparent) region.
621	// Four data slots are reserved, for saving state on entry to the look-around
622	// 0: stack pointer on entry.
623	// 1: input position on entry.
624	// 2: fActiveStart, the active bounds start on entry.
625	// 3: fActiveLimit, the active bounds limit on entry.
626	{
627	fixLiterals();
628	int32_t dataLoc = allocateData(`4`);
629	appendOp(URX_LA_START, dataLoc);
630	appendOp(URX_STATE_SAVE, `0`); // dest address will be patched later.
631	appendOp(URX_NOP, `0`);
632
633	// On the Parentheses stack, start a new frame and add the postions
634	// of the StateSave and NOP.
635	fParenStack.push(fModeFlags, fStatus); // Match mode state*
636	fParenStack.push(negLookAhead, fStatus); // Frame type*
637	fParenStack.push(fRXPat->fCompiledPat->size()-`2`, fStatus); // The STATE_SAVE location*
638	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The second NOP location*
639
640	// Instructions #5 - #7 will be added when the ')' is encountered.
641	}
642	break;
643
644	case doOpenLookBehind:
645	{
646	// Compile a (?<= look-behind open paren.
647	//
648	// Compiles to
649	// 0 URX_LB_START dataLoc
650	// 1 URX_LB_CONT dataLoc
651	// 2 MinMatchLen
652	// 3 MaxMatchLen
653	// 4 URX_NOP Standard '(' boilerplate.
654	// 5 URX_NOP Reserved slot for use with '\|' ops within (block).
655	// 6 <code for LookBehind expression>
656	// 7 URX_LB_END dataLoc # Check match len, restore input len
657	// 8 URX_LA_END dataLoc # Restore stack, input pos
658	//
659	// Allocate a block of matcher data, to contain (when running a match)
660	// 0: Stack ptr on entry
661	// 1: Input Index on entry
662	// 2: fActiveStart, the active bounds start on entry.
663	// 3: fActiveLimit, the active bounds limit on entry.
664	// 4: Start index of match current match attempt.
665	// The first four items must match the layout of data for LA_START / LA_END
666
667	// Generate match code for any pending literals.
668	fixLiterals();
669
670	// Allocate data space
671	int32_t dataLoc = allocateData(`5`);
672
673	// Emit URX_LB_START
674	appendOp(URX_LB_START, dataLoc);
675
676	// Emit URX_LB_CONT
677	appendOp(URX_LB_CONT, dataLoc);
678	appendOp(URX_RESERVED_OP, `0`); // MinMatchLength. To be filled later.
679	appendOp(URX_RESERVED_OP, `0`); // MaxMatchLength. To be filled later.
680
681	// Emit the NOPs
682	appendOp(URX_NOP, `0`);
683	appendOp(URX_NOP, `0`);
684
685	// On the Parentheses stack, start a new frame and add the postions
686	// of the URX_LB_CONT and the NOP.
687	fParenStack.push(fModeFlags, fStatus); // Match mode state*
688	fParenStack.push(lookBehind, fStatus); // Frame type*
689	fParenStack.push(fRXPat->fCompiledPat->size()-`2`, fStatus); // The first NOP location*
690	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The 2nd NOP location*
691
692	// The final two instructions will be added when the ')' is encountered.
693	}
694
695	break;
696
697	case doOpenLookBehindNeg:
698	{
699	// Compile a (?<! negated look-behind open paren.
700	//
701	// Compiles to
702	// 0 URX_LB_START dataLoc # Save entry stack, input len
703	// 1 URX_LBN_CONT dataLoc # Iterate possible match positions
704	// 2 MinMatchLen
705	// 3 MaxMatchLen
706	// 4 continueLoc (9)
707	// 5 URX_NOP Standard '(' boilerplate.
708	// 6 URX_NOP Reserved slot for use with '\|' ops within (block).
709	// 7 <code for LookBehind expression>
710	// 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
711	// 9 ...
712	//
713	// Allocate a block of matcher data, to contain (when running a match)
714	// 0: Stack ptr on entry
715	// 1: Input Index on entry
716	// 2: fActiveStart, the active bounds start on entry.
717	// 3: fActiveLimit, the active bounds limit on entry.
718	// 4: Start index of match current match attempt.
719	// The first four items must match the layout of data for LA_START / LA_END
720
721	// Generate match code for any pending literals.
722	fixLiterals();
723
724	// Allocate data space
725	int32_t dataLoc = allocateData(`5`);
726
727	// Emit URX_LB_START
728	appendOp(URX_LB_START, dataLoc);
729
730	// Emit URX_LBN_CONT
731	appendOp(URX_LBN_CONT, dataLoc);
732	appendOp(URX_RESERVED_OP, `0`); // MinMatchLength. To be filled later.
733	appendOp(URX_RESERVED_OP, `0`); // MaxMatchLength. To be filled later.
734	appendOp(URX_RESERVED_OP, `0`); // Continue Loc. To be filled later.
735
736	// Emit the NOPs
737	appendOp(URX_NOP, `0`);
738	appendOp(URX_NOP, `0`);
739
740	// On the Parentheses stack, start a new frame and add the postions
741	// of the URX_LB_CONT and the NOP.
742	fParenStack.push(fModeFlags, fStatus); // Match mode state*
743	fParenStack.push(lookBehindN, fStatus); // Frame type*
744	fParenStack.push(fRXPat->fCompiledPat->size()-`2`, fStatus); // The first NOP location*
745	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The 2nd NOP location*
746
747	// The final two instructions will be added when the ')' is encountered.
748	}
749	break;
750
751	case doConditionalExpr:
752	// Conditionals such as (?(1)a:b)
753	case doPerlInline:
754	// Perl inline-condtionals. (?{perl code}a\|b) We're not perl, no way to do them.
755	error(U_REGEX_UNIMPLEMENTED);
756	break;
757
758
759	case doCloseParen:
760	handleCloseParen();
761	if (fParenStack.size() <= `0`) {
762	// Extra close paren, or missing open paren.
763	error(U_REGEX_MISMATCHED_PAREN);
764	}
765	break;
766
767	case doNOP:
768	break;
769
770
771	case doBadOpenParenType:
772	case doRuleError:
773	error(U_REGEX_RULE_SYNTAX);
774	break;
775
776
777	case doMismatchedParenErr:
778	error(U_REGEX_MISMATCHED_PAREN);
779	break;
780
781	case doPlus:
782	// Normal '+' compiles to
783	// 1. stuff to be repeated (already built)
784	// 2. jmp-sav 1
785	// 3. ...
786	//
787	// Or, if the item to be repeated can match a zero length string,
788	// 1. STO_INP_LOC data-loc
789	// 2. body of stuff to be repeated
790	// 3. JMP_SAV_X 2
791	// 4. ...
792
793	//
794	// Or, if the item to be repeated is simple
795	// 1. Item to be repeated.
796	// 2. LOOP_SR_I set number (assuming repeated item is a set ref)
797	// 3. LOOP_C stack location
798	{
799	int32_t topLoc = blockTopLoc(FALSE); // location of item #1
800	int32_t frameLoc;
801
802	// Check for simple constructs, which may get special optimized code.
803	if (topLoc == fRXPat->fCompiledPat->size() - `1`) {
804	int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
805
806	if (URX_TYPE(repeatedOp) == URX_SETREF) {
807	// Emit optimized code for [char set]+
808	appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
809	frameLoc = allocateStackData(`1`);
810	appendOp(URX_LOOP_C, frameLoc);
811	break;
812	}
813
814	if (URX_TYPE(repeatedOp) == URX_DOTANY \|\|
815	URX_TYPE(repeatedOp) == URX_DOTANY_ALL \|\|
816	URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
817	// Emit Optimized code for .+ operations.
818	int32_t loopOpI = buildOp(URX_LOOP_DOT_I, `0`);
819	if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
820	// URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
821	loopOpI \|= `1`;
822	}
823	if (fModeFlags & UREGEX_UNIX_LINES) {
824	loopOpI \|= `2`;
825	}
826	appendOp(loopOpI);
827	frameLoc = allocateStackData(`1`);
828	appendOp(URX_LOOP_C, frameLoc);
829	break;
830	}
831
832	}
833
834	// General case.
835
836	// Check for minimum match length of zero, which requires
837	// extra loop-breaking code.
838	if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-`1`) == `0`) {
839	// Zero length match is possible.
840	// Emit the code sequence that can handle it.
841	insertOp(topLoc);
842	frameLoc = allocateStackData(`1`);
843
844	int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
845	fRXPat->fCompiledPat->setElementAt(op, topLoc);
846
847	appendOp(URX_JMP_SAV_X, topLoc+`1`);
848	} else {
849	// Simpler code when the repeated body must match something non-empty
850	appendOp(URX_JMP_SAV, topLoc);
851	}
852	}
853	break;
854
855	case doNGPlus:
856	// Non-greedy '+?' compiles to
857	// 1. stuff to be repeated (already built)
858	// 2. state-save 1
859	// 3. ...
860	{
861	int32_t topLoc = blockTopLoc(FALSE);
862	appendOp(URX_STATE_SAVE, topLoc);
863	}
864	break;
865
866
867	case doOpt:
868	// Normal (greedy) ? quantifier.
869	// Compiles to
870	// 1. state save 3
871	// 2. body of optional block
872	// 3. ...
873	// Insert the state save into the compiled pattern, and we're done.
874	{
875	int32_t saveStateLoc = blockTopLoc(TRUE);
876	int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
877	fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
878	}
879	break;
880
881	case doNGOpt:
882	// Non-greedy ?? quantifier
883	// compiles to
884	// 1. jmp 4
885	// 2. body of optional block
886	// 3 jmp 5
887	// 4. state save 2
888	// 5 ...
889	// This code is less than ideal, with two jmps instead of one, because we can only
890	// insert one instruction at the top of the block being iterated.
891	{
892	int32_t jmp1_loc = blockTopLoc(TRUE);
893	int32_t jmp2_loc = fRXPat->fCompiledPat->size();
894
895	int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+`1`);
896	fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
897
898	appendOp(URX_JMP, jmp2_loc+`2`);
899
900	appendOp(URX_STATE_SAVE, jmp1_loc+`1`);
901	}
902	break;
903
904
905	case doStar:
906	// Normal (greedy) quantifier.*
907	// Compiles to
908	// 1. STATE_SAVE 4
909	// 2. body of stuff being iterated over
910	// 3. JMP_SAV 2
911	// 4. ...
912	//
913	// Or, if the body is a simple [Set],
914	// 1. LOOP_SR_I set number
915	// 2. LOOP_C stack location
916	// ...
917	//
918	// Or if this is a .*
919	// 1. LOOP_DOT_I (. matches all mode flag)
920	// 2. LOOP_C stack location
921	//
922	// Or, if the body can match a zero-length string, to inhibit infinite loops,
923	// 1. STATE_SAVE 5
924	// 2. STO_INP_LOC data-loc
925	// 3. body of stuff
926	// 4. JMP_SAV_X 2
927	// 5. ...
928	{
929	// location of item #1, the STATE_SAVE
930	int32_t topLoc = blockTopLoc(FALSE);
931	int32_t dataLoc = -`1`;
932
933	// Check for simple , where the construct being repeated*
934	// compiled to single opcode, and might be optimizable.
935	if (topLoc == fRXPat->fCompiledPat->size() - `1`) {
936	int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
937
938	if (URX_TYPE(repeatedOp) == URX_SETREF) {
939	// Emit optimized code for a [char set]*
940	int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
941	fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
942	dataLoc = allocateStackData(`1`);
943	appendOp(URX_LOOP_C, dataLoc);
944	break;
945	}
946
947	if (URX_TYPE(repeatedOp) == URX_DOTANY \|\|
948	URX_TYPE(repeatedOp) == URX_DOTANY_ALL \|\|
949	URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
950	// Emit Optimized code for . operations.*
951	int32_t loopOpI = buildOp(URX_LOOP_DOT_I, `0`);
952	if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
953	// URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
954	loopOpI \|= `1`;
955	}
956	if ((fModeFlags & UREGEX_UNIX_LINES) != `0`) {
957	loopOpI \|= `2`;
958	}
959	fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
960	dataLoc = allocateStackData(`1`);
961	appendOp(URX_LOOP_C, dataLoc);
962	break;
963	}
964	}
965
966	// Emit general case code for this *
967	// The optimizations did not apply.
968
969	int32_t saveStateLoc = blockTopLoc(TRUE);
970	int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+`1`);
971
972	// Check for minimum match length of zero, which requires
973	// extra loop-breaking code.
974	if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-`1`) == `0`) {
975	insertOp(saveStateLoc);
976	dataLoc = allocateStackData(`1`);
977
978	int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
979	fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+`1`);
980	jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+`2`);
981	}
982
983	// Locate the position in the compiled pattern where the match will continue
984	// after completing the . (4 or 5 in the comment above)*
985	int32_t continueLoc = fRXPat->fCompiledPat->size()+`1`;
986
987	// Put together the save state op and store it into the compiled code.
988	int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
989	fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
990
991	// Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
992	appendOp(jmpOp);
993	}
994	break;
995
996	case doNGStar:
997	// Non-greedy ? quantifier*
998	// compiles to
999	// 1. JMP 3
1000	// 2. body of stuff being iterated over
1001	// 3. STATE_SAVE 2
1002	// 4 ...
1003	{
1004	int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
1005	int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
1006	int32_t jmpOp = buildOp(URX_JMP, saveLoc);
1007	fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1008	appendOp(URX_STATE_SAVE, jmpLoc+`1`);
1009	}
1010	break;
1011
1012
1013	case doIntervalInit:
1014	// The '{' opening an interval quantifier was just scanned.
1015	// Init the counter varaiables that will accumulate the values as the digits
1016	// are scanned.
1017	fIntervalLow = `0`;
1018	fIntervalUpper = -`1`;
1019	break;
1020
1021	case doIntevalLowerDigit:
1022	// Scanned a digit from the lower value of an {lower,upper} interval
1023	{
1024	int32_t digitValue = u_charDigitValue(fC.fChar);
1025	U_ASSERT(digitValue >= `0`);
1026	int64_t val = (int64_t)fIntervalLow*`10` + digitValue;
1027	if (val > INT32_MAX) {
1028	error(U_REGEX_NUMBER_TOO_BIG);
1029	} else {
1030	fIntervalLow = (int32_t)val;
1031	}
1032	}
1033	break;
1034
1035	case doIntervalUpperDigit:
1036	// Scanned a digit from the upper value of an {lower,upper} interval
1037	{
1038	if (fIntervalUpper < `0`) {
1039	fIntervalUpper = `0`;
1040	}
1041	int32_t digitValue = u_charDigitValue(fC.fChar);
1042	U_ASSERT(digitValue >= `0`);
1043	int64_t val = (int64_t)fIntervalUpper*`10` + digitValue;
1044	if (val > INT32_MAX) {
1045	error(U_REGEX_NUMBER_TOO_BIG);
1046	} else {
1047	fIntervalUpper = (int32_t)val;
1048	}
1049	}
1050	break;
1051
1052	case doIntervalSame:
1053	// Scanned a single value interval like {27}. Upper = Lower.
1054	fIntervalUpper = fIntervalLow;
1055	break;
1056
1057	case doInterval:
1058	// Finished scanning a normal {lower,upper} interval. Generate the code for it.
1059	if (compileInlineInterval() == FALSE) {
1060	compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1061	}
1062	break;
1063
1064	case doPossessiveInterval:
1065	// Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
1066	{
1067	// Remember the loc for the top of the block being looped over.
1068	// (Can not reserve a slot in the compiled pattern at this time, because
1069	// compileInterval needs to reserve also, and blockTopLoc can only reserve
1070	// once per block.)
1071	int32_t topLoc = blockTopLoc(FALSE);
1072
1073	// Produce normal looping code.
1074	compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1075
1076	// Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1077	// just as if the loop was inclosed in atomic parentheses.
1078
1079	// First the STO_SP before the start of the loop
1080	insertOp(topLoc);
1081
1082	int32_t varLoc = allocateData(`1`); // Reserve a data location for saving the
1083	int32_t op = buildOp(URX_STO_SP, varLoc);
1084	fRXPat->fCompiledPat->setElementAt(op, topLoc);
1085
1086	int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1087	U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1088	loopOp++; // point LoopOp after the just-inserted STO_SP
1089	fRXPat->fCompiledPat->push(loopOp, *fStatus);
1090
1091	// Then the LD_SP after the end of the loop
1092	appendOp(URX_LD_SP, varLoc);
1093	}
1094
1095	break;
1096
1097	case doNGInterval:
1098	// Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1099	compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1100	break;
1101
1102	case doIntervalError:
1103	error(U_REGEX_BAD_INTERVAL);
1104	break;
1105
1106	case doLiteralChar:
1107	// We've just scanned a "normal" character from the pattern,
1108	literalChar(fC.fChar);
1109	break;
1110
1111
1112	case doEscapedLiteralChar:
1113	// We've just scanned an backslashed escaped character with no
1114	// special meaning. It represents itself.
1115	if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != `0` &&
1116	((fC.fChar >= `0x41` && fC.fChar<= `0x5A`) \|\| // in [A-Z]
1117	(fC.fChar >= `0x61` && fC.fChar <= `0x7a`))) { // in [a-z]
1118	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1119	}
1120	literalChar(fC.fChar);
1121	break;
1122
1123
1124	case doDotAny:
1125	// scanned a ".", match any single character.
1126	{
1127	fixLiterals(FALSE);
1128	if (fModeFlags & UREGEX_DOTALL) {
1129	appendOp(URX_DOTANY_ALL, `0`);
1130	} else if (fModeFlags & UREGEX_UNIX_LINES) {
1131	appendOp(URX_DOTANY_UNIX, `0`);
1132	} else {
1133	appendOp(URX_DOTANY, `0`);
1134	}
1135	}
1136	break;
1137
1138	case doCaret:
1139	{
1140	fixLiterals(FALSE);
1141	if ( (fModeFlags & UREGEX_MULTILINE) == `0` && (fModeFlags & UREGEX_UNIX_LINES) == `0`) {
1142	appendOp(URX_CARET, `0`);
1143	} else if ((fModeFlags & UREGEX_MULTILINE) != `0` && (fModeFlags & UREGEX_UNIX_LINES) == `0`) {
1144	appendOp(URX_CARET_M, `0`);
1145	} else if ((fModeFlags & UREGEX_MULTILINE) == `0` && (fModeFlags & UREGEX_UNIX_LINES) != `0`) {
1146	appendOp(URX_CARET, `0`); // Only testing true start of input.
1147	} else if ((fModeFlags & UREGEX_MULTILINE) != `0` && (fModeFlags & UREGEX_UNIX_LINES) != `0`) {
1148	appendOp(URX_CARET_M_UNIX, `0`);
1149	}
1150	}
1151	break;
1152
1153	case doDollar:
1154	{
1155	fixLiterals(FALSE);
1156	if ( (fModeFlags & UREGEX_MULTILINE) == `0` && (fModeFlags & UREGEX_UNIX_LINES) == `0`) {
1157	appendOp(URX_DOLLAR, `0`);
1158	} else if ((fModeFlags & UREGEX_MULTILINE) != `0` && (fModeFlags & UREGEX_UNIX_LINES) == `0`) {
1159	appendOp(URX_DOLLAR_M, `0`);
1160	} else if ((fModeFlags & UREGEX_MULTILINE) == `0` && (fModeFlags & UREGEX_UNIX_LINES) != `0`) {
1161	appendOp(URX_DOLLAR_D, `0`);
1162	} else if ((fModeFlags & UREGEX_MULTILINE) != `0` && (fModeFlags & UREGEX_UNIX_LINES) != `0`) {
1163	appendOp(URX_DOLLAR_MD, `0`);
1164	}
1165	}
1166	break;
1167
1168	case doBackslashA:
1169	fixLiterals(FALSE);
1170	appendOp(URX_CARET, `0`);
1171	break;
1172
1173	case doBackslashB:
1174	{
1175	#if UCONFIG_NO_BREAK_ITERATION==1
1176	if (fModeFlags & UREGEX_UWORD) {
1177	error(U_UNSUPPORTED_ERROR);
1178	}
1179	#endif
1180	fixLiterals(FALSE);
1181	int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1182	appendOp(op, `1`);
1183	}
1184	break;
1185
1186	case doBackslashb:
1187	{
1188	#if UCONFIG_NO_BREAK_ITERATION==1
1189	if (fModeFlags & UREGEX_UWORD) {
1190	error(U_UNSUPPORTED_ERROR);
1191	}
1192	#endif
1193	fixLiterals(FALSE);
1194	int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1195	appendOp(op, `0`);
1196	}
1197	break;
1198
1199	case doBackslashD:
1200	fixLiterals(FALSE);
1201	appendOp(URX_BACKSLASH_D, `1`);
1202	break;
1203
1204	case doBackslashd:
1205	fixLiterals(FALSE);
1206	appendOp(URX_BACKSLASH_D, `0`);
1207	break;
1208
1209	case doBackslashG:
1210	fixLiterals(FALSE);
1211	appendOp(URX_BACKSLASH_G, `0`);
1212	break;
1213
1214	case doBackslashH:
1215	fixLiterals(FALSE);
1216	appendOp(URX_BACKSLASH_H, `1`);
1217	break;
1218
1219	case doBackslashh:
1220	fixLiterals(FALSE);
1221	appendOp(URX_BACKSLASH_H, `0`);
1222	break;
1223
1224	case doBackslashR:
1225	fixLiterals(FALSE);
1226	appendOp(URX_BACKSLASH_R, `0`);
1227	break;
1228
1229	case doBackslashS:
1230	fixLiterals(FALSE);
1231	appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1232	break;
1233
1234	case doBackslashs:
1235	fixLiterals(FALSE);
1236	appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1237	break;
1238
1239	case doBackslashV:
1240	fixLiterals(FALSE);
1241	appendOp(URX_BACKSLASH_V, `1`);
1242	break;
1243
1244	case doBackslashv:
1245	fixLiterals(FALSE);
1246	appendOp(URX_BACKSLASH_V, `0`);
1247	break;
1248
1249	case doBackslashW:
1250	fixLiterals(FALSE);
1251	appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1252	break;
1253
1254	case doBackslashw:
1255	fixLiterals(FALSE);
1256	appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1257	break;
1258
1259	case doBackslashX:
1260	fixLiterals(FALSE);
1261	appendOp(URX_BACKSLASH_X, `0`);
1262	break;
1263
1264
1265	case doBackslashZ:
1266	fixLiterals(FALSE);
1267	appendOp(URX_DOLLAR, `0`);
1268	break;
1269
1270	case doBackslashz:
1271	fixLiterals(FALSE);
1272	appendOp(URX_BACKSLASH_Z, `0`);
1273	break;
1274
1275	case doEscapeError:
1276	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1277	break;
1278
1279	case doExit:
1280	fixLiterals(FALSE);
1281	returnVal = FALSE;
1282	break;
1283
1284	case doProperty:
1285	{
1286	fixLiterals(FALSE);
1287	UnicodeSet *theSet = scanProp();
1288	compileSet(theSet);
1289	}
1290	break;
1291
1292	case doNamedChar:
1293	{
1294	UChar32 c = scanNamedChar();
1295	literalChar(c);
1296	}
1297	break;
1298
1299
1300	case doBackRef:
1301	// BackReference. Somewhat unusual in that the front-end can not completely parse
1302	// the regular expression, because the number of digits to be consumed
1303	// depends on the number of capture groups that have been defined. So
1304	// we have to do it here instead.
1305	{
1306	int32_t numCaptureGroups = fRXPat->fGroupMap->size();
1307	int32_t groupNum = `0`;
1308	UChar32 c = fC.fChar;
1309
1310	for (;;) {
1311	// Loop once per digit, for max allowed number of digits in a back reference.
1312	int32_t digit = u_charDigitValue(c);
1313	groupNum = groupNum * `10` + digit;
1314	if (groupNum >= numCaptureGroups) {
1315	break;
1316	}
1317	c = peekCharLL();
1318	if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1319	break;
1320	}
1321	nextCharLL();
1322	}
1323
1324	// Scan of the back reference in the source regexp is complete. Now generate
1325	// the compiled code for it.
1326	// Because capture groups can be forward-referenced by back-references,
1327	// we fill the operand with the capture group number. At the end
1328	// of compilation, it will be changed to the variable's location.
1329	U_ASSERT(groupNum > `0`); // Shouldn't happen. '\0' begins an octal escape sequence,
1330	// and shouldn't enter this code path at all.
1331	fixLiterals(FALSE);
1332	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1333	appendOp(URX_BACKREF_I, groupNum);
1334	} else {
1335	appendOp(URX_BACKREF, groupNum);
1336	}
1337	}
1338	break;
1339
1340	case doBeginNamedBackRef:
1341	U_ASSERT(fCaptureName == NULL);
1342	fCaptureName = new UnicodeString;
1343	if (fCaptureName == NULL) {
1344	error(U_MEMORY_ALLOCATION_ERROR);
1345	}
1346	break;
1347
1348	case doContinueNamedBackRef:
1349	fCaptureName->append(fC.fChar);
1350	break;
1351
1352	case doCompleteNamedBackRef:
1353	{
1354	int32_t groupNumber =
1355	fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : `0`;
1356	if (groupNumber == `0`) {
1357	// Group name has not been defined.
1358	// Could be a forward reference. If we choose to support them at some
1359	// future time, extra mechanism will be required at this point.
1360	error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1361	} else {
1362	// Given the number, handle identically to a \n numbered back reference.
1363	// See comments above, under doBackRef
1364	fixLiterals(FALSE);
1365	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1366	appendOp(URX_BACKREF_I, groupNumber);
1367	} else {
1368	appendOp(URX_BACKREF, groupNumber);
1369	}
1370	}
1371	delete fCaptureName;
1372	fCaptureName = NULL;
1373	break;
1374	}
1375
1376	case doPossessivePlus:
1377	// Possessive ++ quantifier.
1378	// Compiles to
1379	// 1. STO_SP
1380	// 2. body of stuff being iterated over
1381	// 3. STATE_SAVE 5
1382	// 4. JMP 2
1383	// 5. LD_SP
1384	// 6. ...
1385	//
1386	// Note: TODO: This is pretty inefficient. A mass of saved state is built up
1387	// then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056
1388	//
1389	{
1390	// Emit the STO_SP
1391	int32_t topLoc = blockTopLoc(TRUE);
1392	int32_t stoLoc = allocateData(`1`); // Reserve the data location for storing save stack ptr.
1393	int32_t op = buildOp(URX_STO_SP, stoLoc);
1394	fRXPat->fCompiledPat->setElementAt(op, topLoc);
1395
1396	// Emit the STATE_SAVE
1397	appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+`2`);
1398
1399	// Emit the JMP
1400	appendOp(URX_JMP, topLoc+`1`);
1401
1402	// Emit the LD_SP
1403	appendOp(URX_LD_SP, stoLoc);
1404	}
1405	break;
1406
1407	case doPossessiveStar:
1408	// Possessive + quantifier.*
1409	// Compiles to
1410	// 1. STO_SP loc
1411	// 2. STATE_SAVE 5
1412	// 3. body of stuff being iterated over
1413	// 4. JMP 2
1414	// 5. LD_SP loc
1415	// 6 ...
1416	// TODO: do something to cut back the state stack each time through the loop.
1417	{
1418	// Reserve two slots at the top of the block.
1419	int32_t topLoc = blockTopLoc(TRUE);
1420	insertOp(topLoc);
1421
1422	// emit STO_SP loc
1423	int32_t stoLoc = allocateData(`1`); // Reserve the data location for storing save stack ptr.
1424	int32_t op = buildOp(URX_STO_SP, stoLoc);
1425	fRXPat->fCompiledPat->setElementAt(op, topLoc);
1426
1427	// Emit the SAVE_STATE 5
1428	int32_t L7 = fRXPat->fCompiledPat->size()+`1`;
1429	op = buildOp(URX_STATE_SAVE, L7);
1430	fRXPat->fCompiledPat->setElementAt(op, topLoc+`1`);
1431
1432	// Append the JMP operation.
1433	appendOp(URX_JMP, topLoc+`1`);
1434
1435	// Emit the LD_SP loc
1436	appendOp(URX_LD_SP, stoLoc);
1437	}
1438	break;
1439
1440	case doPossessiveOpt:
1441	// Possessive ?+ quantifier.
1442	// Compiles to
1443	// 1. STO_SP loc
1444	// 2. SAVE_STATE 5
1445	// 3. body of optional block
1446	// 4. LD_SP loc
1447	// 5. ...
1448	//
1449	{
1450	// Reserve two slots at the top of the block.
1451	int32_t topLoc = blockTopLoc(TRUE);
1452	insertOp(topLoc);
1453
1454	// Emit the STO_SP
1455	int32_t stoLoc = allocateData(`1`); // Reserve the data location for storing save stack ptr.
1456	int32_t op = buildOp(URX_STO_SP, stoLoc);
1457	fRXPat->fCompiledPat->setElementAt(op, topLoc);
1458
1459	// Emit the SAVE_STATE
1460	int32_t continueLoc = fRXPat->fCompiledPat->size()+`1`;
1461	op = buildOp(URX_STATE_SAVE, continueLoc);
1462	fRXPat->fCompiledPat->setElementAt(op, topLoc+`1`);
1463
1464	// Emit the LD_SP
1465	appendOp(URX_LD_SP, stoLoc);
1466	}
1467	break;
1468
1469
1470	case doBeginMatchMode:
1471	fNewModeFlags = fModeFlags;
1472	fSetModeFlag = TRUE;
1473	break;
1474
1475	case doMatchMode: // (?i) and similar
1476	{
1477	int32_t bit = `0`;
1478	switch (fC.fChar) {
1479	case `0x69`: / 'i' / bit = UREGEX_CASE_INSENSITIVE; break;
1480	case `0x64`: / 'd' / bit = UREGEX_UNIX_LINES; break;
1481	case `0x6d`: / 'm' / bit = UREGEX_MULTILINE; break;
1482	case `0x73`: / 's' / bit = UREGEX_DOTALL; break;
1483	case `0x75`: / 'u' / bit = `0`; / Unicode casing / break;
1484	case `0x77`: / 'w' / bit = UREGEX_UWORD; break;
1485	case `0x78`: / 'x' / bit = UREGEX_COMMENTS; break;
1486	case `0x2d`: / '-' / fSetModeFlag = FALSE; break;
1487	default:
1488	UPRV_UNREACHABLE; // Should never happen. Other chars are filtered out
1489	// by the scanner.
1490	}
1491	if (fSetModeFlag) {
1492	fNewModeFlags \|= bit;
1493	} else {
1494	fNewModeFlags &= ~bit;
1495	}
1496	}
1497	break;
1498
1499	case doSetMatchMode:
1500	// Emit code to match any pending literals, using the not-yet changed match mode.
1501	fixLiterals();
1502
1503	// We've got a (?i) or similar. The match mode is being changed, but
1504	// the change is not scoped to a parenthesized block.
1505	U_ASSERT(fNewModeFlags < `0`);
1506	fModeFlags = fNewModeFlags;
1507
1508	break;
1509
1510
1511	case doMatchModeParen:
1512	// We've got a (?i: or similar. Begin a parenthesized block, save old
1513	// mode flags so they can be restored at the close of the block.
1514	//
1515	// Compile to a
1516	// - NOP, which later may be replaced by a save-state if the
1517	// parenthesized group gets a quantifier, followed by*
1518	// - NOP, which may later be replaced by a save-state if there
1519	// is an '\|' alternation within the parens.
1520	{
1521	fixLiterals(FALSE);
1522	appendOp(URX_NOP, `0`);
1523	appendOp(URX_NOP, `0`);
1524
1525	// On the Parentheses stack, start a new frame and add the postions
1526	// of the two NOPs (a normal non-capturing () frame, except for the
1527	// saving of the orignal mode flags.)
1528	fParenStack.push(fModeFlags, *fStatus);
1529	fParenStack.push(flags, fStatus); // Frame Marker*
1530	fParenStack.push(fRXPat->fCompiledPat->size()-`2`, fStatus); // The first NOP*
1531	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The second NOP*
1532
1533	// Set the current mode flags to the new values.
1534	U_ASSERT(fNewModeFlags < `0`);
1535	fModeFlags = fNewModeFlags;
1536	}
1537	break;
1538
1539	case doBadModeFlag:
1540	error(U_REGEX_INVALID_FLAG);
1541	break;
1542
1543	case doSuppressComments:
1544	// We have just scanned a '(?'. We now need to prevent the character scanner from
1545	// treating a '#' as a to-the-end-of-line comment.
1546	// (This Perl compatibility just gets uglier and uglier to do...)
1547	fEOLComments = FALSE;
1548	break;
1549
1550
1551	case doSetAddAmp:
1552	{
1553	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1554	set->add(chAmp);
1555	}
1556	break;
1557
1558	case doSetAddDash:
1559	{
1560	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1561	set->add(chDash);
1562	}
1563	break;
1564
1565	case doSetBackslash_s:
1566	{
1567	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1568	set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1569	break;
1570	}
1571
1572	case doSetBackslash_S:
1573	{
1574	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1575	UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1576	SSet.complement();
1577	set->addAll(SSet);
1578	break;
1579	}
1580
1581	case doSetBackslash_d:
1582	{
1583	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1584	// TODO - make a static set, ticket 6058.
1585	addCategory(set, U_GC_ND_MASK, *fStatus);
1586	break;
1587	}
1588
1589	case doSetBackslash_D:
1590	{
1591	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1592	UnicodeSet digits;
1593	// TODO - make a static set, ticket 6058.
1594	digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1595	digits.complement();
1596	set->addAll(digits);
1597	break;
1598	}
1599
1600	case doSetBackslash_h:
1601	{
1602	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1603	UnicodeSet h;
1604	h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1605	h.add((UChar32)`9`); // Tab
1606	set->addAll(h);
1607	break;
1608	}
1609
1610	case doSetBackslash_H:
1611	{
1612	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1613	UnicodeSet h;
1614	h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1615	h.add((UChar32)`9`); // Tab
1616	h.complement();
1617	set->addAll(h);
1618	break;
1619	}
1620
1621	case doSetBackslash_v:
1622	{
1623	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1624	set->add((UChar32)`0x0a`, (UChar32)`0x0d`); // add range
1625	set->add((UChar32)`0x85`);
1626	set->add((UChar32)`0x2028`, (UChar32)`0x2029`);
1627	break;
1628	}
1629
1630	case doSetBackslash_V:
1631	{
1632	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1633	UnicodeSet v;
1634	v.add((UChar32)`0x0a`, (UChar32)`0x0d`); // add range
1635	v.add((UChar32)`0x85`);
1636	v.add((UChar32)`0x2028`, (UChar32)`0x2029`);
1637	v.complement();
1638	set->addAll(v);
1639	break;
1640	}
1641
1642	case doSetBackslash_w:
1643	{
1644	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1645	set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1646	break;
1647	}
1648
1649	case doSetBackslash_W:
1650	{
1651	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1652	UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1653	SSet.complement();
1654	set->addAll(SSet);
1655	break;
1656	}
1657
1658	case doSetBegin:
1659	fixLiterals(FALSE);
1660	fSetStack.push(new UnicodeSet (), *fStatus);
1661	fSetOpStack.push(setStart, *fStatus);
1662	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != `0`) {
1663	fSetOpStack.push(setCaseClose, *fStatus);
1664	}
1665	break;
1666
1667	case doSetBeginDifference1:
1668	// We have scanned something like [[abc]-[
1669	// Set up a new UnicodeSet for the set beginning with the just-scanned '['
1670	// Push a Difference operator, which will cause the new set to be subtracted from what
1671	// went before once it is created.
1672	setPushOp(setDifference1);
1673	fSetOpStack.push(setStart, *fStatus);
1674	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != `0`) {
1675	fSetOpStack.push(setCaseClose, *fStatus);
1676	}
1677	break;
1678
1679	case doSetBeginIntersection1:
1680	// We have scanned something like [[abc]&[
1681	// Need both the '&' operator and the open '[' operator.
1682	setPushOp(setIntersection1);
1683	fSetOpStack.push(setStart, *fStatus);
1684	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != `0`) {
1685	fSetOpStack.push(setCaseClose, *fStatus);
1686	}
1687	break;
1688
1689	case doSetBeginUnion:
1690	// We have scanned something like [[abc][
1691	// Need to handle the union operation explicitly [[abc] \| [
1692	setPushOp(setUnion);
1693	fSetOpStack.push(setStart, *fStatus);
1694	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != `0`) {
1695	fSetOpStack.push(setCaseClose, *fStatus);
1696	}
1697	break;
1698
1699	case doSetDifference2:
1700	// We have scanned something like [abc--
1701	// Consider this to unambiguously be a set difference operator.
1702	setPushOp(setDifference2);
1703	break;
1704
1705	case doSetEnd:
1706	// Have encountered the ']' that closes a set.
1707	// Force the evaluation of any pending operations within this set,
1708	// leave the completed set on the top of the set stack.
1709	setEval(setEnd);
1710	U_ASSERT(fSetOpStack.peeki()==setStart);
1711	fSetOpStack.popi();
1712	break;
1713
1714	case doSetFinish:
1715	{
1716	// Finished a complete set expression, including all nested sets.
1717	// The close bracket has already triggered clearing out pending set operators,
1718	// the operator stack should be empty and the operand stack should have just
1719	// one entry, the result set.
1720	U_ASSERT(fSetOpStack.empty());
1721	UnicodeSet theSet = (UnicodeSet )fSetStack.pop();
1722	U_ASSERT(fSetStack.empty());
1723	compileSet(theSet);
1724	break;
1725	}
1726
1727	case doSetIntersection2:
1728	// Have scanned something like [abc&&
1729	setPushOp(setIntersection2);
1730	break;
1731
1732	case doSetLiteral:
1733	// Union the just-scanned literal character into the set being built.
1734	// This operation is the highest precedence set operation, so we can always do
1735	// it immediately, without waiting to see what follows. It is necessary to perform
1736	// any pending '-' or '&' operation first, because these have the same precedence
1737	// as union-ing in a literal'
1738	{
1739	setEval(setUnion);
1740	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1741	s->add(fC.fChar);
1742	fLastSetLiteral = fC.fChar;
1743	break;
1744	}
1745
1746	case doSetLiteralEscaped:
1747	// A back-slash escaped literal character was encountered.
1748	// Processing is the same as with setLiteral, above, with the addition of
1749	// the optional check for errors on escaped ASCII letters.
1750	{
1751	if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != `0` &&
1752	((fC.fChar >= `0x41` && fC.fChar<= `0x5A`) \|\| // in [A-Z]
1753	(fC.fChar >= `0x61` && fC.fChar <= `0x7a`))) { // in [a-z]
1754	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1755	}
1756	setEval(setUnion);
1757	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1758	s->add(fC.fChar);
1759	fLastSetLiteral = fC.fChar;
1760	break;
1761	}
1762
1763	case doSetNamedChar:
1764	// Scanning a \N{UNICODE CHARACTER NAME}
1765	// Aside from the source of the character, the processing is identical to doSetLiteral,
1766	// above.
1767	{
1768	UChar32 c = scanNamedChar();
1769	setEval(setUnion);
1770	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1771	s->add(c);
1772	fLastSetLiteral = c;
1773	break;
1774	}
1775
1776	case doSetNamedRange:
1777	// We have scanned literal-\N{CHAR NAME}. Add the range to the set.
1778	// The left character is already in the set, and is saved in fLastSetLiteral.
1779	// The right side needs to be picked up, the scan is at the 'N'.
1780	// Lower Limit > Upper limit being an error matches both Java
1781	// and ICU UnicodeSet behavior.
1782	{
1783	UChar32 c = scanNamedChar();
1784	if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL \|\| fLastSetLiteral > c)) {
1785	error(U_REGEX_INVALID_RANGE);
1786	}
1787	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1788	s->add(fLastSetLiteral, c);
1789	fLastSetLiteral = c;
1790	break;
1791	}
1792
1793
1794	case doSetNegate:
1795	// Scanned a '^' at the start of a set.
1796	// Push the negation operator onto the set op stack.
1797	// A twist for case-insensitive matching:
1798	// the case closure operation must happen _before_ negation.
1799	// But the case closure operation will already be on the stack if it's required.
1800	// This requires checking for case closure, and swapping the stack order
1801	// if it is present.
1802	{
1803	int32_t tosOp = fSetOpStack.peeki();
1804	if (tosOp == setCaseClose) {
1805	fSetOpStack.popi();
1806	fSetOpStack.push(setNegation, *fStatus);
1807	fSetOpStack.push(setCaseClose, *fStatus);
1808	} else {
1809	fSetOpStack.push(setNegation, *fStatus);
1810	}
1811	}
1812	break;
1813
1814	case doSetNoCloseError:
1815	error(U_REGEX_MISSING_CLOSE_BRACKET);
1816	break;
1817
1818	case doSetOpError:
1819	error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal.
1820	break;
1821
1822	case doSetPosixProp:
1823	{
1824	UnicodeSet *s = scanPosixProp();
1825	if (s != NULL) {
1826	UnicodeSet tos = (UnicodeSet )fSetStack.peek();
1827	tos->addAll(*s);
1828	delete s;
1829	} // else error. scanProp() reported the error status already.
1830	}
1831	break;
1832
1833	case doSetProp:
1834	// Scanned a \p \P within [brackets].
1835	{
1836	UnicodeSet *s = scanProp();
1837	if (s != NULL) {
1838	UnicodeSet tos = (UnicodeSet )fSetStack.peek();
1839	tos->addAll(*s);
1840	delete s;
1841	} // else error. scanProp() reported the error status already.
1842	}
1843	break;
1844
1845
1846	case doSetRange:
1847	// We have scanned literal-literal. Add the range to the set.
1848	// The left character is already in the set, and is saved in fLastSetLiteral.
1849	// The right side is the current character.
1850	// Lower Limit > Upper limit being an error matches both Java
1851	// and ICU UnicodeSet behavior.
1852	{
1853
1854	if (fLastSetLiteral == U_SENTINEL \|\| fLastSetLiteral > fC.fChar) {
1855	error(U_REGEX_INVALID_RANGE);
1856	}
1857	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1858	s->add(fLastSetLiteral, fC.fChar);
1859	break;
1860	}
1861
1862	default:
1863	UPRV_UNREACHABLE;
1864	}
1865
1866	if (U_FAILURE(*fStatus)) {
1867	returnVal = FALSE;
1868	}
1869
1870	return returnVal;
1871	}
1872
1873
1874
1875	//------------------------------------------------------------------------------
1876	//
1877	// literalChar We've encountered a literal character from the pattern,
1878	// or an escape sequence that reduces to a character.
1879	// Add it to the string containing all literal chars/strings from
1880	// the pattern.
1881	//
1882	//------------------------------------------------------------------------------
1883	void RegexCompile::literalChar(UChar32 c) {
1884	fLiteralChars.append(c);
1885	}
1886
1887
1888	//------------------------------------------------------------------------------
1889	//
1890	// fixLiterals When compiling something that can follow a literal
1891	// string in a pattern, emit the code to match the
1892	// accumulated literal string.
1893	//
1894	// Optionally, split the last char of the string off into
1895	// a single "ONE_CHAR" operation, so that quantifiers can
1896	// apply to that char alone. Example: abc*
1897	// The must apply to the 'c' only.*
1898	//
1899	//------------------------------------------------------------------------------
1900	void RegexCompile::fixLiterals(UBool split) {
1901
1902	// If no literal characters have been scanned but not yet had code generated
1903	// for them, nothing needs to be done.
1904	if (fLiteralChars.length() == `0`) {
1905	return;
1906	}
1907
1908	int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -`1`);
1909	UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1910
1911	// Split: We need to ensure that the last item in the compiled pattern
1912	// refers only to the last literal scanned in the pattern, so that
1913	// quantifiers (, +, etc.) affect only it, and not a longer string.*
1914	// Split before case folding for case insensitive matches.
1915
1916	if (split) {
1917	fLiteralChars.truncate(indexOfLastCodePoint);
1918	fixLiterals(FALSE); // Recursive call, emit code to match the first part of the string.
1919	// Note that the truncated literal string may be empty, in which case
1920	// nothing will be emitted.
1921
1922	literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal.
1923	fixLiterals(FALSE); // Second recursive call, code for the final code point.
1924	return;
1925	}
1926
1927	// If we are doing case-insensitive matching, case fold the string. This may expand
1928	// the string, e.g. the German sharp-s turns into "ss"
1929	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1930	fLiteralChars.foldCase();
1931	indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -`1`);
1932	lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1933	}
1934
1935	if (indexOfLastCodePoint == `0`) {
1936	// Single character, emit a URX_ONECHAR op to match it.
1937	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1938	u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1939	appendOp(URX_ONECHAR_I, lastCodePoint);
1940	} else {
1941	appendOp(URX_ONECHAR, lastCodePoint);
1942	}
1943	} else {
1944	// Two or more chars, emit a URX_STRING to match them.
1945	if (fLiteralChars.length() > `0x00ffffff` \|\| fRXPat->fLiteralText.length() > `0x00ffffff`) {
1946	error(U_REGEX_PATTERN_TOO_BIG);
1947	}
1948	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1949	appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1950	} else {
1951	// TODO here: add optimization to split case sensitive strings of length two
1952	// into two single char ops, for efficiency.
1953	appendOp(URX_STRING, fRXPat->fLiteralText.length());
1954	}
1955	appendOp(URX_STRING_LEN, fLiteralChars.length());
1956
1957	// Add this string into the accumulated strings of the compiled pattern.
1958	fRXPat->fLiteralText.append(fLiteralChars);
1959	}
1960
1961	fLiteralChars.remove();
1962	}
1963
1964
1965	int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1966	if (U_FAILURE(*fStatus)) {
1967	return `0`;
1968	}
1969	if (type < `0` \|\| type > `255`) {
1970	UPRV_UNREACHABLE;
1971	}
1972	if (val > `0x00ffffff`) {
1973	UPRV_UNREACHABLE;
1974	}
1975	if (val < `0`) {
1976	if (!(type == URX_RESERVED_OP_N \|\| type == URX_RESERVED_OP)) {
1977	UPRV_UNREACHABLE;
1978	}
1979	if (URX_TYPE(val) != `0xff`) {
1980	UPRV_UNREACHABLE;
1981	}
1982	type = URX_RESERVED_OP_N;
1983	}
1984	return (type << `24`) \| val;
1985	}
1986
1987
1988	//------------------------------------------------------------------------------
1989	//
1990	// appendOp() Append a new instruction onto the compiled pattern
1991	// Includes error checking, limiting the size of the
1992	// pattern to lengths that can be represented in the
1993	// 24 bit operand field of an instruction.
1994	//
1995	//------------------------------------------------------------------------------
1996	void RegexCompile::appendOp(int32_t op) {
1997	if (U_FAILURE(*fStatus)) {
1998	return;
1999	}
2000	fRXPat->fCompiledPat->addElement(op, *fStatus);
2001	if ((fRXPat->fCompiledPat->size() > `0x00fffff0`) && U_SUCCESS(*fStatus)) {
2002	error(U_REGEX_PATTERN_TOO_BIG);
2003	}
2004	}
2005
2006	void RegexCompile::appendOp(int32_t type, int32_t val) {
2007	appendOp(buildOp(type, val));
2008	}
2009
2010
2011	//------------------------------------------------------------------------------
2012	//
2013	// insertOp() Insert a slot for a new opcode into the already
2014	// compiled pattern code.
2015	//
2016	// Fill the slot with a NOP. Our caller will replace it
2017	// with what they really wanted.
2018	//
2019	//------------------------------------------------------------------------------
2020	void RegexCompile::insertOp(int32_t where) {
2021	UVector64 *code = fRXPat->fCompiledPat;
2022	U_ASSERT(where>`0` && where < code->size());
2023
2024	int32_t nop = buildOp(URX_NOP, `0`);
2025	code->insertElementAt(nop, where, *fStatus);
2026
2027	// Walk through the pattern, looking for any ops with targets that
2028	// were moved down by the insert. Fix them.
2029	int32_t loc;
2030	for (loc=`0`; loc<code->size(); loc++) {
2031	int32_t op = (int32_t)code->elementAti(loc);
2032	int32_t opType = URX_TYPE(op);
2033	int32_t opValue = URX_VAL(op);
2034	if ((opType == URX_JMP \|\|
2035	opType == URX_JMPX \|\|
2036	opType == URX_STATE_SAVE \|\|
2037	opType == URX_CTR_LOOP \|\|
2038	opType == URX_CTR_LOOP_NG \|\|
2039	opType == URX_JMP_SAV \|\|
2040	opType == URX_JMP_SAV_X \|\|
2041	opType == URX_RELOC_OPRND) && opValue > where) {
2042	// Target location for this opcode is after the insertion point and
2043	// needs to be incremented to adjust for the insertion.
2044	opValue++;
2045	op = buildOp(opType, opValue);
2046	code->setElementAt(op, loc);
2047	}
2048	}
2049
2050	// Now fix up the parentheses stack. All positive values in it are locations in
2051	// the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
2052	for (loc=`0`; loc<fParenStack.size(); loc++) {
2053	int32_t x = fParenStack.elementAti(loc);
2054	U_ASSERT(x < code->size());
2055	if (x>where) {
2056	x++;
2057	fParenStack.setElementAt(x, loc);
2058	}
2059	}
2060
2061	if (fMatchCloseParen > where) {
2062	fMatchCloseParen++;
2063	}
2064	if (fMatchOpenParen > where) {
2065	fMatchOpenParen++;
2066	}
2067	}
2068
2069
2070	//------------------------------------------------------------------------------
2071	//
2072	// allocateData() Allocate storage in the matcher's static data area.
2073	// Return the index for the newly allocated data.
2074	// The storage won't actually exist until we are running a match
2075	// operation, but the storage indexes are inserted into various
2076	// opcodes while compiling the pattern.
2077	//
2078	//------------------------------------------------------------------------------
2079	int32_t RegexCompile::allocateData(int32_t size) {
2080	if (U_FAILURE(*fStatus)) {
2081	return `0`;
2082	}
2083	if (size <= `0` \|\| size > `0x100` \|\| fRXPat->fDataSize < `0`) {
2084	error(U_REGEX_INTERNAL_ERROR);
2085	return `0`;
2086	}
2087	int32_t dataIndex = fRXPat->fDataSize;
2088	fRXPat->fDataSize += size;
2089	if (fRXPat->fDataSize >= `0x00fffff0`) {
2090	error(U_REGEX_INTERNAL_ERROR);
2091	}
2092	return dataIndex;
2093	}
2094
2095
2096	//------------------------------------------------------------------------------
2097	//
2098	// allocateStackData() Allocate space in the back-tracking stack frame.
2099	// Return the index for the newly allocated data.
2100	// The frame indexes are inserted into various
2101	// opcodes while compiling the pattern, meaning that frame
2102	// size must be restricted to the size that will fit
2103	// as an operand (24 bits).
2104	//
2105	//------------------------------------------------------------------------------
2106	int32_t RegexCompile::allocateStackData(int32_t size) {
2107	if (U_FAILURE(*fStatus)) {
2108	return `0`;
2109	}
2110	if (size <= `0` \|\| size > `0x100` \|\| fRXPat->fFrameSize < `0`) {
2111	error(U_REGEX_INTERNAL_ERROR);
2112	return `0`;
2113	}
2114	int32_t dataIndex = fRXPat->fFrameSize;
2115	fRXPat->fFrameSize += size;
2116	if (fRXPat->fFrameSize >= `0x00fffff0`) {
2117	error(U_REGEX_PATTERN_TOO_BIG);
2118	}
2119	return dataIndex;
2120	}
2121
2122
2123	//------------------------------------------------------------------------------
2124	//
2125	// blockTopLoc() Find or create a location in the compiled pattern
2126	// at the start of the operation or block that has
2127	// just been compiled. Needed when a quantifier ( or*
2128	// whatever) appears, and we need to add an operation
2129	// at the start of the thing being quantified.
2130	//
2131	// (Parenthesized Blocks) have a slot with a NOP that
2132	// is reserved for this purpose. . or similar don't*
2133	// and a slot needs to be added.
2134	//
2135	// parameter reserveLoc : TRUE - ensure that there is space to add an opcode
2136	// at the returned location.
2137	// FALSE - just return the address,
2138	// do not reserve a location there.
2139	//
2140	//------------------------------------------------------------------------------
2141	int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
2142	int32_t theLoc;
2143	fixLiterals(TRUE); // Emit code for any pending literals.
2144	// If last item was a string, emit separate op for the its last char.
2145	if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2146	{
2147	// The item just processed is a parenthesized block.
2148	theLoc = fMatchOpenParen; // A slot is already reserved for us.
2149	U_ASSERT(theLoc > `0`);
2150	U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2151	}
2152	else {
2153	// Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2154	// No slot for STATE_SAVE was pre-reserved in the compiled code.
2155	// We need to make space now.
2156	theLoc = fRXPat->fCompiledPat->size()-`1`;
2157	int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2158	if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2159	// Strings take two opcode, we want the position of the first one.
2160	// We can have a string at this point if a single character case-folded to two.
2161	theLoc--;
2162	}
2163	if (reserveLoc) {
2164	int32_t nop = buildOp(URX_NOP, `0`);
2165	fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2166	}
2167	}
2168	return theLoc;
2169	}
2170
2171
2172
2173	//------------------------------------------------------------------------------
2174	//
2175	// handleCloseParen When compiling a close paren, we need to go back
2176	// and fix up any JMP or SAVE operations within the
2177	// parenthesized block that need to target the end
2178	// of the block. The locations of these are kept on
2179	// the paretheses stack.
2180	//
2181	// This function is called both when encountering a
2182	// real ) and at the end of the pattern.
2183	//
2184	//------------------------------------------------------------------------------
2185	void RegexCompile::handleCloseParen() {
2186	int32_t patIdx;
2187	int32_t patOp;
2188	if (fParenStack.size() <= `0`) {
2189	error(U_REGEX_MISMATCHED_PAREN);
2190	return;
2191	}
2192
2193	// Emit code for any pending literals.
2194	fixLiterals(FALSE);
2195
2196	// Fixup any operations within the just-closed parenthesized group
2197	// that need to reference the end of the (block).
2198	// (The first one popped from the stack is an unused slot for
2199	// alternation (OR) state save, but applying the fixup to it does no harm.)
2200	for (;;) {
2201	patIdx = fParenStack.popi();
2202	if (patIdx < `0`) {
2203	// value < 0 flags the start of the frame on the paren stack.
2204	break;
2205	}
2206	U_ASSERT(patIdx>`0` && patIdx <= fRXPat->fCompiledPat->size());
2207	patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2208	U_ASSERT(URX_VAL(patOp) == `0`); // Branch target for JMP should not be set.
2209	patOp \|= fRXPat->fCompiledPat->size(); // Set it now.
2210	fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2211	fMatchOpenParen = patIdx;
2212	}
2213
2214	// At the close of any parenthesized block, restore the match mode flags to
2215	// the value they had at the open paren. Saved value is
2216	// at the top of the paren stack.
2217	fModeFlags = fParenStack.popi();
2218	U_ASSERT(fModeFlags < `0`);
2219
2220	// DO any additional fixups, depending on the specific kind of
2221	// parentesized grouping this is
2222
2223	switch (patIdx) {
2224	case plain:
2225	case flags:
2226	// No additional fixups required.
2227	// (Grouping-only parentheses)
2228	break;
2229	case capturing:
2230	// Capturing Parentheses.
2231	// Insert a End Capture op into the pattern.
2232	// The frame offset of the variables for this cg is obtained from the
2233	// start capture op and put it into the end-capture op.
2234	{
2235	int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+`1`);
2236	U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2237
2238	int32_t frameVarLocation = URX_VAL(captureOp);
2239	appendOp(URX_END_CAPTURE, frameVarLocation);
2240	}
2241	break;
2242	case atomic:
2243	// Atomic Parenthesis.
2244	// Insert a LD_SP operation to restore the state stack to the position
2245	// it was when the atomic parens were entered.
2246	{
2247	int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+`1`);
2248	U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2249	int32_t stoLoc = URX_VAL(stoOp);
2250	appendOp(URX_LD_SP, stoLoc);
2251	}
2252	break;
2253
2254	case lookAhead:
2255	{
2256	int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-`5`);
2257	U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2258	int32_t dataLoc = URX_VAL(startOp);
2259	appendOp(URX_LA_END, dataLoc);
2260	}
2261	break;
2262
2263	case negLookAhead:
2264	{
2265	// See comment at doOpenLookAheadNeg
2266	int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-`1`);
2267	U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2268	int32_t dataLoc = URX_VAL(startOp);
2269	appendOp(URX_LA_END, dataLoc);
2270	appendOp(URX_BACKTRACK, `0`);
2271	appendOp(URX_LA_END, dataLoc);
2272
2273	// Patch the URX_SAVE near the top of the block.
2274	// The destination of the SAVE is the final LA_END that was just added.
2275	int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2276	U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2277	int32_t dest = fRXPat->fCompiledPat->size()-`1`;
2278	saveOp = buildOp(URX_STATE_SAVE, dest);
2279	fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2280	}
2281	break;
2282
2283	case lookBehind:
2284	{
2285	// See comment at doOpenLookBehind.
2286
2287	// Append the URX_LB_END and URX_LA_END to the compiled pattern.
2288	int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-`4`);
2289	U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2290	int32_t dataLoc = URX_VAL(startOp);
2291	appendOp(URX_LB_END, dataLoc);
2292	appendOp(URX_LA_END, dataLoc);
2293
2294	// Determine the min and max bounds for the length of the
2295	// string that the pattern can match.
2296	// An unbounded upper limit is an error.
2297	int32_t patEnd = fRXPat->fCompiledPat->size() - `1`;
2298	int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2299	int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2300	if (URX_TYPE(maxML) != `0`) {
2301	error(U_REGEX_LOOK_BEHIND_LIMIT);
2302	break;
2303	}
2304	if (maxML == INT32_MAX) {
2305	error(U_REGEX_LOOK_BEHIND_LIMIT);
2306	break;
2307	}
2308	if (minML == INT32_MAX) {
2309	// This condition happens when no match is possible, such as with a
2310	// [set] expression containing no elements.
2311	// In principle, the generated code to evaluate the expression could be deleted,
2312	// but it's probably not worth the complication.
2313	minML = `0`;
2314	}
2315	U_ASSERT(minML <= maxML);
2316
2317	// Insert the min and max match len bounds into the URX_LB_CONT op that
2318	// appears at the top of the look-behind block, at location fMatchOpenParen+1
2319	fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-`2`);
2320	fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-`1`);
2321
2322	}
2323	break;
2324
2325
2326
2327	case lookBehindN:
2328	{
2329	// See comment at doOpenLookBehindNeg.
2330
2331	// Append the URX_LBN_END to the compiled pattern.
2332	int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-`5`);
2333	U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2334	int32_t dataLoc = URX_VAL(startOp);
2335	appendOp(URX_LBN_END, dataLoc);
2336
2337	// Determine the min and max bounds for the length of the
2338	// string that the pattern can match.
2339	// An unbounded upper limit is an error.
2340	int32_t patEnd = fRXPat->fCompiledPat->size() - `1`;
2341	int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2342	int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2343	if (URX_TYPE(maxML) != `0`) {
2344	error(U_REGEX_LOOK_BEHIND_LIMIT);
2345	break;
2346	}
2347	if (maxML == INT32_MAX) {
2348	error(U_REGEX_LOOK_BEHIND_LIMIT);
2349	break;
2350	}
2351	if (minML == INT32_MAX) {
2352	// This condition happens when no match is possible, such as with a
2353	// [set] expression containing no elements.
2354	// In principle, the generated code to evaluate the expression could be deleted,
2355	// but it's probably not worth the complication.
2356	minML = `0`;
2357	}
2358
2359	U_ASSERT(minML <= maxML);
2360
2361	// Insert the min and max match len bounds into the URX_LB_CONT op that
2362	// appears at the top of the look-behind block, at location fMatchOpenParen+1
2363	fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-`3`);
2364	fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-`2`);
2365
2366	// Insert the pattern location to continue at after a successful match
2367	// as the last operand of the URX_LBN_CONT
2368	int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2369	fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-`1`);
2370	}
2371	break;
2372
2373
2374
2375	default:
2376	UPRV_UNREACHABLE;
2377	}
2378
2379	// remember the next location in the compiled pattern.
2380	// The compilation of Quantifiers will look at this to see whether its looping
2381	// over a parenthesized block or a single item
2382	fMatchCloseParen = fRXPat->fCompiledPat->size();
2383	}
2384
2385
2386
2387	//------------------------------------------------------------------------------
2388	//
2389	// compileSet Compile the pattern operations for a reference to a
2390	// UnicodeSet.
2391	//
2392	//------------------------------------------------------------------------------
2393	void RegexCompile::compileSet(UnicodeSet *theSet)
2394	{
2395	if (theSet == NULL) {
2396	return;
2397	}
2398	// Remove any strings from the set.
2399	// There shoudn't be any, but just in case.
2400	// (Case Closure can add them; if we had a simple case closure avaialble that
2401	// ignored strings, that would be better.)
2402	theSet->removeAllStrings();
2403	int32_t setSize = theSet->size();
2404
2405	switch (setSize) {
2406	case `0`:
2407	{
2408	// Set of no elements. Always fails to match.
2409	appendOp(URX_BACKTRACK, `0`);
2410	delete theSet;
2411	}
2412	break;
2413
2414	case `1`:
2415	{
2416	// The set contains only a single code point. Put it into
2417	// the compiled pattern as a single char operation rather
2418	// than a set, and discard the set itself.
2419	literalChar(theSet->charAt(`0`));
2420	delete theSet;
2421	}
2422	break;
2423
2424	default:
2425	{
2426	// The set contains two or more chars. (the normal case)
2427	// Put it into the compiled pattern as a set.
2428	int32_t setNumber = fRXPat->fSets->size();
2429	fRXPat->fSets->addElement(theSet, *fStatus);
2430	appendOp(URX_SETREF, setNumber);
2431	}
2432	}
2433	}
2434
2435
2436	//------------------------------------------------------------------------------
2437	//
2438	// compileInterval Generate the code for a {min, max} style interval quantifier.
2439	// Except for the specific opcodes used, the code is the same
2440	// for all three types (greedy, non-greedy, possessive) of
2441	// intervals. The opcodes are supplied as parameters.
2442	// (There are two sets of opcodes - greedy & possessive use the
2443	// same ones, while non-greedy has it's own.)
2444	//
2445	// The code for interval loops has this form:
2446	// 0 CTR_INIT counter loc (in stack frame)
2447	// 1 5 patt address of CTR_LOOP at bottom of block
2448	// 2 min count
2449	// 3 max count (-1 for unbounded)
2450	// 4 ... block to be iterated over
2451	// 5 CTR_LOOP
2452	//
2453	// In
2454	//------------------------------------------------------------------------------
2455	void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
2456	{
2457	// The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2458	// four slots in the compiled code. Reserve them.
2459	int32_t topOfBlock = blockTopLoc(TRUE);
2460	insertOp(topOfBlock);
2461	insertOp(topOfBlock);
2462	insertOp(topOfBlock);
2463
2464	// The operands for the CTR_INIT opcode include the index in the matcher data
2465	// of the counter. Allocate it now. There are two data items
2466	// counterLoc --> Loop counter
2467	// +1 --> Input index (for breaking non-progressing loops)
2468	// (Only present if unbounded upper limit on loop)
2469	int32_t dataSize = fIntervalUpper < `0` ? `2` : `1`;
2470	int32_t counterLoc = allocateStackData(dataSize);
2471
2472	int32_t op = buildOp(InitOp, counterLoc);
2473	fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2474
2475	// The second operand of CTR_INIT is the location following the end of the loop.
2476	// Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2477	// compilation of something later on causes the code to grow and the target
2478	// position to move.
2479	int32_t loopEnd = fRXPat->fCompiledPat->size();
2480	op = buildOp(URX_RELOC_OPRND, loopEnd);
2481	fRXPat->fCompiledPat->setElementAt(op, topOfBlock+`1`);
2482
2483	// Followed by the min and max counts.
2484	fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+`2`);
2485	fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+`3`);
2486
2487	// Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
2488	// Goes at end of the block being looped over, so just append to the code so far.
2489	appendOp(LoopOp, topOfBlock);
2490
2491	if ((fIntervalLow & `0xff000000`) != `0` \|\|
2492	(fIntervalUpper > `0` && (fIntervalUpper & `0xff000000`) != `0`)) {
2493	error(U_REGEX_NUMBER_TOO_BIG);
2494	}
2495
2496	if (fIntervalLow > fIntervalUpper && fIntervalUpper != -`1`) {
2497	error(U_REGEX_MAX_LT_MIN);
2498	}
2499	}
2500
2501
2502
2503	UBool RegexCompile::compileInlineInterval() {
2504	if (fIntervalUpper > `10` \|\| fIntervalUpper < fIntervalLow) {
2505	// Too big to inline. Fail, which will cause looping code to be generated.
2506	// (Upper < Lower picks up unbounded upper and errors, both.)
2507	return FALSE;
2508	}
2509
2510	int32_t topOfBlock = blockTopLoc(FALSE);
2511	if (fIntervalUpper == `0`) {
2512	// Pathological case. Attempt no matches, as if the block doesn't exist.
2513	// Discard the generated code for the block.
2514	// If the block included parens, discard the info pertaining to them as well.
2515	fRXPat->fCompiledPat->setSize(topOfBlock);
2516	if (fMatchOpenParen >= topOfBlock) {
2517	fMatchOpenParen = -`1`;
2518	}
2519	if (fMatchCloseParen >= topOfBlock) {
2520	fMatchCloseParen = -`1`;
2521	}
2522	return TRUE;
2523	}
2524
2525	if (topOfBlock != fRXPat->fCompiledPat->size()-`1` && fIntervalUpper != `1`) {
2526	// The thing being repeated is not a single op, but some
2527	// more complex block. Do it as a loop, not inlines.
2528	// Note that things "repeated" a max of once are handled as inline, because
2529	// the one copy of the code already generated is just fine.
2530	return FALSE;
2531	}
2532
2533	// Pick up the opcode that is to be repeated
2534	//
2535	int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2536
2537	// Compute the pattern location where the inline sequence
2538	// will end, and set up the state save op that will be needed.
2539	//
2540	int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-`1`
2541	+ fIntervalUpper + (fIntervalUpper-fIntervalLow);
2542	int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2543	if (fIntervalLow == `0`) {
2544	insertOp(topOfBlock);
2545	fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2546	}
2547
2548
2549
2550	// Loop, emitting the op for the thing being repeated each time.
2551	// Loop starts at 1 because one instance of the op already exists in the pattern,
2552	// it was put there when it was originally encountered.
2553	int32_t i;
2554	for (i=`1`; i<fIntervalUpper; i++ ) {
2555	if (i >= fIntervalLow) {
2556	appendOp(saveOp);
2557	}
2558	appendOp(op);
2559	}
2560	return TRUE;
2561	}
2562
2563
2564
2565	//------------------------------------------------------------------------------
2566	//
2567	// caseInsensitiveStart given a single code point from a pattern string, determine the
2568	// set of characters that could potentially begin a case-insensitive
2569	// match of a string beginning with that character, using full Unicode
2570	// case insensitive matching.
2571	//
2572	// This is used in optimizing find().
2573	//
2574	// closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2575	// misses cases like this:
2576	// A string from the pattern begins with 'ss' (although all we know
2577	// in this context is that it begins with 's')
2578	// The pattern could match a string beginning with a German sharp-s
2579	//
2580	// To the ordinary case closure for a character c, we add all other
2581	// characters cx where the case closure of cx incudes a string form that begins
2582	// with the original character c.
2583	//
2584	// This function could be made smarter. The full pattern string is available
2585	// and it would be possible to verify that the extra characters being added
2586	// to the starting set fully match, rather than having just a first-char of the
2587	// folded form match.
2588	//
2589	//------------------------------------------------------------------------------
2590	void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2591
2592	// Machine Generated below.
2593	// It may need updating with new versions of Unicode.
2594	// Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2595	// The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2596
2597	// Machine Generated Data. Do not hand edit.
2598	static const UChar32 RECaseFixCodePoints[] = {
2599	`0x61`, `0x66`, `0x68`, `0x69`, `0x6a`, `0x73`, `0x74`, `0x77`, `0x79`, `0x2bc`,
2600	`0x3ac`, `0x3ae`, `0x3b1`, `0x3b7`, `0x3b9`, `0x3c1`, `0x3c5`, `0x3c9`, `0x3ce`, `0x565`,
2601	`0x574`, `0x57e`, `0x1f00`, `0x1f01`, `0x1f02`, `0x1f03`, `0x1f04`, `0x1f05`, `0x1f06`, `0x1f07`,
2602	`0x1f20`, `0x1f21`, `0x1f22`, `0x1f23`, `0x1f24`, `0x1f25`, `0x1f26`, `0x1f27`, `0x1f60`, `0x1f61`,
2603	`0x1f62`, `0x1f63`, `0x1f64`, `0x1f65`, `0x1f66`, `0x1f67`, `0x1f70`, `0x1f74`, `0x1f7c`, `0x110000`};
2604
2605	static const int16_t RECaseFixStringOffsets[] = {
2606	`0x0`, `0x1`, `0x6`, `0x7`, `0x8`, `0x9`, `0xd`, `0xe`, `0xf`, `0x10`,
2607	`0x11`, `0x12`, `0x13`, `0x17`, `0x1b`, `0x20`, `0x21`, `0x2a`, `0x2e`, `0x2f`,
2608	`0x30`, `0x34`, `0x35`, `0x37`, `0x39`, `0x3b`, `0x3d`, `0x3f`, `0x41`, `0x43`,
2609	`0x45`, `0x47`, `0x49`, `0x4b`, `0x4d`, `0x4f`, `0x51`, `0x53`, `0x55`, `0x57`,
2610	`0x59`, `0x5b`, `0x5d`, `0x5f`, `0x61`, `0x63`, `0x65`, `0x66`, `0x67`, `0`};
2611
2612	static const int16_t RECaseFixCounts[] = {
2613	`0x1`, `0x5`, `0x1`, `0x1`, `0x1`, `0x4`, `0x1`, `0x1`, `0x1`, `0x1`,
2614	`0x1`, `0x1`, `0x4`, `0x4`, `0x5`, `0x1`, `0x9`, `0x4`, `0x1`, `0x1`,
2615	`0x4`, `0x1`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`,
2616	`0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`,
2617	`0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x1`, `0x1`, `0x1`, `0`};
2618
2619	static const UChar RECaseFixData[] = {
2620	`0x1e9a`, `0xfb00`, `0xfb01`, `0xfb02`, `0xfb03`, `0xfb04`, `0x1e96`, `0x130`, `0x1f0`, `0xdf`,
2621	`0x1e9e`, `0xfb05`, `0xfb06`, `0x1e97`, `0x1e98`, `0x1e99`, `0x149`, `0x1fb4`, `0x1fc4`, `0x1fb3`,
2622	`0x1fb6`, `0x1fb7`, `0x1fbc`, `0x1fc3`, `0x1fc6`, `0x1fc7`, `0x1fcc`, `0x390`, `0x1fd2`, `0x1fd3`,
2623	`0x1fd6`, `0x1fd7`, `0x1fe4`, `0x3b0`, `0x1f50`, `0x1f52`, `0x1f54`, `0x1f56`, `0x1fe2`, `0x1fe3`,
2624	`0x1fe6`, `0x1fe7`, `0x1ff3`, `0x1ff6`, `0x1ff7`, `0x1ffc`, `0x1ff4`, `0x587`, `0xfb13`, `0xfb14`,
2625	`0xfb15`, `0xfb17`, `0xfb16`, `0x1f80`, `0x1f88`, `0x1f81`, `0x1f89`, `0x1f82`, `0x1f8a`, `0x1f83`,
2626	`0x1f8b`, `0x1f84`, `0x1f8c`, `0x1f85`, `0x1f8d`, `0x1f86`, `0x1f8e`, `0x1f87`, `0x1f8f`, `0x1f90`,
2627	`0x1f98`, `0x1f91`, `0x1f99`, `0x1f92`, `0x1f9a`, `0x1f93`, `0x1f9b`, `0x1f94`, `0x1f9c`, `0x1f95`,
2628	`0x1f9d`, `0x1f96`, `0x1f9e`, `0x1f97`, `0x1f9f`, `0x1fa0`, `0x1fa8`, `0x1fa1`, `0x1fa9`, `0x1fa2`,
2629	`0x1faa`, `0x1fa3`, `0x1fab`, `0x1fa4`, `0x1fac`, `0x1fa5`, `0x1fad`, `0x1fa6`, `0x1fae`, `0x1fa7`,
2630	`0x1faf`, `0x1fb2`, `0x1fc2`, `0x1ff2`, `0`};
2631
2632	// End of machine generated data.
2633
2634	if (c < UCHAR_MIN_VALUE \|\| c > UCHAR_MAX_VALUE) {
2635	// This function should never be called with an invalid input character.
2636	UPRV_UNREACHABLE;
2637	} else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2638	UChar32 caseFoldedC = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2639	starterChars->set(caseFoldedC, caseFoldedC);
2640
2641	int32_t i;
2642	for (i=`0`; RECaseFixCodePoints[i]<c ; i++) {
2643	// Simple linear search through the sorted list of interesting code points.
2644	}
2645
2646	if (RECaseFixCodePoints[i] == c) {
2647	int32_t dataIndex = RECaseFixStringOffsets[i];
2648	int32_t numCharsToAdd = RECaseFixCounts[i];
2649	UChar32 cpToAdd = `0`;
2650	for (int32_t j=`0`; j<numCharsToAdd; j++) {
2651	U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2652	starterChars->add(cpToAdd);
2653	}
2654	}
2655
2656	starterChars->closeOver(USET_CASE_INSENSITIVE);
2657	starterChars->removeAllStrings();
2658	} else {
2659	// Not a cased character. Just return it alone.
2660	starterChars->set(c, c);
2661	}
2662	}
2663
2664
2665	// Increment with overflow check.
2666	// val and delta will both be positive.
2667
2668	static int32_t safeIncrement(int32_t val, int32_t delta) {
2669	if (INT32_MAX - val > delta) {
2670	return val + delta;
2671	} else {
2672	return INT32_MAX;
2673	}
2674	}
2675
2676
2677	//------------------------------------------------------------------------------
2678	//
2679	// matchStartType Determine how a match can start.
2680	// Used to optimize find() operations.
2681	//
2682	// Operation is very similar to minMatchLength(). Walk the compiled
2683	// pattern, keeping an on-going minimum-match-length. For any
2684	// op where the min match coming in is zero, add that ops possible
2685	// starting matches to the possible starts for the overall pattern.
2686	//
2687	//------------------------------------------------------------------------------
2688	void RegexCompile::matchStartType() {
2689	if (U_FAILURE(*fStatus)) {
2690	return;
2691	}
2692
2693
2694	int32_t loc; // Location in the pattern of the current op being processed.
2695	int32_t op; // The op being processed
2696	int32_t opType; // The opcode type of the op
2697	int32_t currentLen = `0`; // Minimum length of a match to this point (loc) in the pattern
2698	int32_t numInitialStrings = `0`; // Number of strings encountered that could match at start.
2699
2700	UBool atStart = TRUE; // True if no part of the pattern yet encountered
2701	// could have advanced the position in a match.
2702	// (Maximum match length so far == 0)
2703
2704	// forwardedLength is a vector holding minimum-match-length values that
2705	// are propagated forward in the pattern by JMP or STATE_SAVE operations.
2706	// It must be one longer than the pattern being checked because some ops
2707	// will jmp to a end-of-block+1 location from within a block, and we must
2708	// count those when checking the block.
2709	int32_t end = fRXPat->fCompiledPat->size();
2710	UVector32 forwardedLength(end+`1`, *fStatus);
2711	forwardedLength.setSize(end+`1`);
2712	for (loc=`3`; loc<end; loc++) {
2713	forwardedLength.setElementAt(INT32_MAX, loc);
2714	}
2715
2716	for (loc = `3`; loc<end; loc++) {
2717	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2718	opType = URX_TYPE(op);
2719
2720	// The loop is advancing linearly through the pattern.
2721	// If the op we are now at was the destination of a branch in the pattern,
2722	// and that path has a shorter minimum length than the current accumulated value,
2723	// replace the current accumulated value.
2724	if (forwardedLength.elementAti(loc) < currentLen) {
2725	currentLen = forwardedLength.elementAti(loc);
2726	U_ASSERT(currentLen>=`0` && currentLen < INT32_MAX);
2727	}
2728
2729	switch (opType) {
2730	// Ops that don't change the total length matched
2731	case URX_RESERVED_OP:
2732	case URX_END:
2733	case URX_FAIL:
2734	case URX_STRING_LEN:
2735	case URX_NOP:
2736	case URX_START_CAPTURE:
2737	case URX_END_CAPTURE:
2738	case URX_BACKSLASH_B:
2739	case URX_BACKSLASH_BU:
2740	case URX_BACKSLASH_G:
2741	case URX_BACKSLASH_Z:
2742	case URX_DOLLAR:
2743	case URX_DOLLAR_M:
2744	case URX_DOLLAR_D:
2745	case URX_DOLLAR_MD:
2746	case URX_RELOC_OPRND:
2747	case URX_STO_INP_LOC:
2748	case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2749	case URX_BACKREF_I:
2750
2751	case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2752	case URX_LD_SP:
2753	break;
2754
2755	case URX_CARET:
2756	if (atStart) {
2757	fRXPat->fStartType = START_START;
2758	}
2759	break;
2760
2761	case URX_CARET_M:
2762	case URX_CARET_M_UNIX:
2763	if (atStart) {
2764	fRXPat->fStartType = START_LINE;
2765	}
2766	break;
2767
2768	case URX_ONECHAR:
2769	if (currentLen == `0`) {
2770	// This character could appear at the start of a match.
2771	// Add it to the set of possible starting characters.
2772	fRXPat->fInitialChars->add(URX_VAL(op));
2773	numInitialStrings += `2`;
2774	}
2775	currentLen = safeIncrement(currentLen, `1`);
2776	atStart = FALSE;
2777	break;
2778
2779
2780	case URX_SETREF:
2781	if (currentLen == `0`) {
2782	int32_t sn = URX_VAL(op);
2783	U_ASSERT(sn > `0` && sn < fRXPat->fSets->size());
2784	const UnicodeSet s = (UnicodeSet )fRXPat->fSets->elementAt(sn);
2785	fRXPat->fInitialChars->addAll(*s);
2786	numInitialStrings += `2`;
2787	}
2788	currentLen = safeIncrement(currentLen, `1`);
2789	atStart = FALSE;
2790	break;
2791
2792	case URX_LOOP_SR_I:
2793	// [Set], like a SETREF, above, in what it can match,*
2794	// but may not match at all, so currentLen is not incremented.
2795	if (currentLen == `0`) {
2796	int32_t sn = URX_VAL(op);
2797	U_ASSERT(sn > `0` && sn < fRXPat->fSets->size());
2798	const UnicodeSet s = (UnicodeSet )fRXPat->fSets->elementAt(sn);
2799	fRXPat->fInitialChars->addAll(*s);
2800	numInitialStrings += `2`;
2801	}
2802	atStart = FALSE;
2803	break;
2804
2805	case URX_LOOP_DOT_I:
2806	if (currentLen == `0`) {
2807	// . at the start of a pattern.*
2808	// Any character can begin the match.
2809	fRXPat->fInitialChars->clear();
2810	fRXPat->fInitialChars->complement();
2811	numInitialStrings += `2`;
2812	}
2813	atStart = FALSE;
2814	break;
2815
2816
2817	case URX_STATIC_SETREF:
2818	if (currentLen == `0`) {
2819	int32_t sn = URX_VAL(op);
2820	U_ASSERT(sn>`0` && sn<URX_LAST_SET);
2821	const UnicodeSet *s = fRXPat->fStaticSets[sn];
2822	fRXPat->fInitialChars->addAll(*s);
2823	numInitialStrings += `2`;
2824	}
2825	currentLen = safeIncrement(currentLen, `1`);
2826	atStart = FALSE;
2827	break;
2828
2829
2830
2831	case URX_STAT_SETREF_N:
2832	if (currentLen == `0`) {
2833	int32_t sn = URX_VAL(op);
2834	const UnicodeSet *s = fRXPat->fStaticSets[sn];
2835	UnicodeSet sc(*s);
2836	sc.complement();
2837	fRXPat->fInitialChars->addAll(sc);
2838	numInitialStrings += `2`;
2839	}
2840	currentLen = safeIncrement(currentLen, `1`);
2841	atStart = FALSE;
2842	break;
2843
2844
2845
2846	case URX_BACKSLASH_D:
2847	// Digit Char
2848	if (currentLen == `0`) {
2849	UnicodeSet s;
2850	s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2851	if (URX_VAL(op) != `0`) {
2852	s.complement();
2853	}
2854	fRXPat->fInitialChars->addAll(s);
2855	numInitialStrings += `2`;
2856	}
2857	currentLen = safeIncrement(currentLen, `1`);
2858	atStart = FALSE;
2859	break;
2860
2861
2862	case URX_BACKSLASH_H:
2863	// Horiz white space
2864	if (currentLen == `0`) {
2865	UnicodeSet s;
2866	s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2867	s.add((UChar32)`9`); // Tab
2868	if (URX_VAL(op) != `0`) {
2869	s.complement();
2870	}
2871	fRXPat->fInitialChars->addAll(s);
2872	numInitialStrings += `2`;
2873	}
2874	currentLen = safeIncrement(currentLen, `1`);
2875	atStart = FALSE;
2876	break;
2877
2878
2879	case URX_BACKSLASH_R: // Any line ending sequence
2880	case URX_BACKSLASH_V: // Any line ending code point, with optional negation
2881	if (currentLen == `0`) {
2882	UnicodeSet s;
2883	s.add((UChar32)`0x0a`, (UChar32)`0x0d`); // add range
2884	s.add((UChar32)`0x85`);
2885	s.add((UChar32)`0x2028`, (UChar32)`0x2029`);
2886	if (URX_VAL(op) != `0`) {
2887	// Complement option applies to URX_BACKSLASH_V only.
2888	s.complement();
2889	}
2890	fRXPat->fInitialChars->addAll(s);
2891	numInitialStrings += `2`;
2892	}
2893	currentLen = safeIncrement(currentLen, `1`);
2894	atStart = FALSE;
2895	break;
2896
2897
2898
2899	case URX_ONECHAR_I:
2900	// Case Insensitive Single Character.
2901	if (currentLen == `0`) {
2902	UChar32 c = URX_VAL(op);
2903	if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2904	UnicodeSet starters(c, c);
2905	starters.closeOver(USET_CASE_INSENSITIVE);
2906	// findCaseInsensitiveStarters(c, &starters);
2907	// For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2908	// The expanded folding can't match the pattern.
2909	fRXPat->fInitialChars->addAll(starters);
2910	} else {
2911	// Char has no case variants. Just add it as-is to the
2912	// set of possible starting chars.
2913	fRXPat->fInitialChars->add(c);
2914	}
2915	numInitialStrings += `2`;
2916	}
2917	currentLen = safeIncrement(currentLen, `1`);
2918	atStart = FALSE;
2919	break;
2920
2921
2922	case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
2923	case URX_DOTANY_ALL: // . matches one or two.
2924	case URX_DOTANY:
2925	case URX_DOTANY_UNIX:
2926	if (currentLen == `0`) {
2927	// These constructs are all bad news when they appear at the start
2928	// of a match. Any character can begin the match.
2929	fRXPat->fInitialChars->clear();
2930	fRXPat->fInitialChars->complement();
2931	numInitialStrings += `2`;
2932	}
2933	currentLen = safeIncrement(currentLen, `1`);
2934	atStart = FALSE;
2935	break;
2936
2937
2938	case URX_JMPX:
2939	loc++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2940	U_FALLTHROUGH;
2941	case URX_JMP:
2942	{
2943	int32_t jmpDest = URX_VAL(op);
2944	if (jmpDest < loc) {
2945	// Loop of some kind. Can safely ignore, the worst that will happen
2946	// is that we understate the true minimum length
2947	currentLen = forwardedLength.elementAti(loc+`1`);
2948
2949	} else {
2950	// Forward jump. Propagate the current min length to the target loc of the jump.
2951	U_ASSERT(jmpDest <= end+`1`);
2952	if (forwardedLength.elementAti(jmpDest) > currentLen) {
2953	forwardedLength.setElementAt(currentLen, jmpDest);
2954	}
2955	}
2956	}
2957	atStart = FALSE;
2958	break;
2959
2960	case URX_JMP_SAV:
2961	case URX_JMP_SAV_X:
2962	// Combo of state save to the next loc, + jmp backwards.
2963	// Net effect on min. length computation is nothing.
2964	atStart = FALSE;
2965	break;
2966
2967	case URX_BACKTRACK:
2968	// Fails are kind of like a branch, except that the min length was
2969	// propagated already, by the state save.
2970	currentLen = forwardedLength.elementAti(loc+`1`);
2971	atStart = FALSE;
2972	break;
2973
2974
2975	case URX_STATE_SAVE:
2976	{
2977	// State Save, for forward jumps, propagate the current minimum.
2978	// of the state save.
2979	int32_t jmpDest = URX_VAL(op);
2980	if (jmpDest > loc) {
2981	if (currentLen < forwardedLength.elementAti(jmpDest)) {
2982	forwardedLength.setElementAt(currentLen, jmpDest);
2983	}
2984	}
2985	}
2986	atStart = FALSE;
2987	break;
2988
2989
2990
2991
2992	case URX_STRING:
2993	{
2994	loc++;
2995	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2996	int32_t stringLen = URX_VAL(stringLenOp);
2997	U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2998	U_ASSERT(stringLenOp >= `2`);
2999	if (currentLen == `0`) {
3000	// Add the starting character of this string to the set of possible starting
3001	// characters for this pattern.
3002	int32_t stringStartIdx = URX_VAL(op);
3003	UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
3004	fRXPat->fInitialChars->add(c);
3005
3006	// Remember this string. After the entire pattern has been checked,
3007	// if nothing else is identified that can start a match, we'll use it.
3008	numInitialStrings++;
3009	fRXPat->fInitialStringIdx = stringStartIdx;
3010	fRXPat->fInitialStringLen = stringLen;
3011	}
3012
3013	currentLen = safeIncrement(currentLen, stringLen);
3014	atStart = FALSE;
3015	}
3016	break;
3017
3018	case URX_STRING_I:
3019	{
3020	// Case-insensitive string. Unlike exact-match strings, we won't
3021	// attempt a string search for possible match positions. But we
3022	// do update the set of possible starting characters.
3023	loc++;
3024	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3025	int32_t stringLen = URX_VAL(stringLenOp);
3026	U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3027	U_ASSERT(stringLenOp >= `2`);
3028	if (currentLen == `0`) {
3029	// Add the starting character of this string to the set of possible starting
3030	// characters for this pattern.
3031	int32_t stringStartIdx = URX_VAL(op);
3032	UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
3033	UnicodeSet s;
3034	findCaseInsensitiveStarters(c, &s);
3035	fRXPat->fInitialChars->addAll(s);
3036	numInitialStrings += `2`; // Matching on an initial string not possible.
3037	}
3038	currentLen = safeIncrement(currentLen, stringLen);
3039	atStart = FALSE;
3040	}
3041	break;
3042
3043	case URX_CTR_INIT:
3044	case URX_CTR_INIT_NG:
3045	{
3046	// Loop Init Ops. These don't change the min length, but they are 4 word ops
3047	// so location must be updated accordingly.
3048	// Loop Init Ops.
3049	// If the min loop count == 0
3050	// move loc forwards to the end of the loop, skipping over the body.
3051	// If the min count is > 0,
3052	// continue normal processing of the body of the loop.
3053	int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+`1`);
3054	loopEndLoc = URX_VAL(loopEndLoc);
3055	int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+`2`);
3056	if (minLoopCount == `0`) {
3057	// Min Loop Count of 0, treat like a forward branch and
3058	// move the current minimum length up to the target
3059	// (end of loop) location.
3060	U_ASSERT(loopEndLoc <= end+`1`);
3061	if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3062	forwardedLength.setElementAt(currentLen, loopEndLoc);
3063	}
3064	}
3065	loc+=`3`; // Skips over operands of CTR_INIT
3066	}
3067	atStart = FALSE;
3068	break;
3069
3070
3071	case URX_CTR_LOOP:
3072	case URX_CTR_LOOP_NG:
3073	// Loop ops.
3074	// The jump is conditional, backwards only.
3075	atStart = FALSE;
3076	break;
3077
3078	case URX_LOOP_C:
3079	// More loop ops. These state-save to themselves.
3080	// don't change the minimum match
3081	atStart = FALSE;
3082	break;
3083
3084
3085	case URX_LA_START:
3086	case URX_LB_START:
3087	{
3088	// Look-around. Scan forward until the matching look-ahead end,
3089	// without processing the look-around block. This is overly pessimistic.
3090
3091	// Keep track of the nesting depth of look-around blocks. Boilerplate code for
3092	// lookahead contains two LA_END instructions, so count goes up by two
3093	// for each LA_START.
3094	int32_t depth = (opType == URX_LA_START? `2`: `1`);
3095	for (;;) {
3096	loc++;
3097	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3098	if (URX_TYPE(op) == URX_LA_START) {
3099	depth+=`2`;
3100	}
3101	if (URX_TYPE(op) == URX_LB_START) {
3102	depth++;
3103	}
3104	if (URX_TYPE(op) == URX_LA_END \|\| URX_TYPE(op)==URX_LBN_END) {
3105	depth--;
3106	if (depth == `0`) {
3107	break;
3108	}
3109	}
3110	if (URX_TYPE(op) == URX_STATE_SAVE) {
3111	// Need this because neg lookahead blocks will FAIL to outside
3112	// of the block.
3113	int32_t jmpDest = URX_VAL(op);
3114	if (jmpDest > loc) {
3115	if (currentLen < forwardedLength.elementAti(jmpDest)) {
3116	forwardedLength.setElementAt(currentLen, jmpDest);
3117	}
3118	}
3119	}
3120	U_ASSERT(loc <= end);
3121	}
3122	}
3123	break;
3124
3125	case URX_LA_END:
3126	case URX_LB_CONT:
3127	case URX_LB_END:
3128	case URX_LBN_CONT:
3129	case URX_LBN_END:
3130	UPRV_UNREACHABLE; // Shouldn't get here. These ops should be
3131	// consumed by the scan in URX_LA_START and LB_START
3132	default:
3133	UPRV_UNREACHABLE;
3134	}
3135
3136	}
3137
3138
3139	// We have finished walking through the ops. Check whether some forward jump
3140	// propagated a shorter length to location end+1.
3141	if (forwardedLength.elementAti(end+`1`) < currentLen) {
3142	currentLen = forwardedLength.elementAti(end+`1`);
3143	}
3144
3145
3146	fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3147
3148
3149	// Sort out what we should check for when looking for candidate match start positions.
3150	// In order of preference,
3151	// 1. Start of input text buffer.
3152	// 2. A literal string.
3153	// 3. Start of line in multi-line mode.
3154	// 4. A single literal character.
3155	// 5. A character from a set of characters.
3156	//
3157	if (fRXPat->fStartType == START_START) {
3158	// Match only at the start of an input text string.
3159	// start type is already set. We're done.
3160	} else if (numInitialStrings == `1` && fRXPat->fMinMatchLen > `0`) {
3161	// Match beginning only with a literal string.
3162	UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3163	U_ASSERT(fRXPat->fInitialChars->contains(c));
3164	fRXPat->fStartType = START_STRING;
3165	fRXPat->fInitialChar = c;
3166	} else if (fRXPat->fStartType == START_LINE) {
3167	// Match at start of line in Multi-Line mode.
3168	// Nothing to do here; everything is already set.
3169	} else if (fRXPat->fMinMatchLen == `0`) {
3170	// Zero length match possible. We could start anywhere.
3171	fRXPat->fStartType = START_NO_INFO;
3172	} else if (fRXPat->fInitialChars->size() == `1`) {
3173	// All matches begin with the same char.
3174	fRXPat->fStartType = START_CHAR;
3175	fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(`0`);
3176	U_ASSERT(fRXPat->fInitialChar != (UChar32)-`1`);
3177	} else if (fRXPat->fInitialChars->contains((UChar32)`0`, (UChar32)`0x10ffff`) == FALSE &&
3178	fRXPat->fMinMatchLen > `0`) {
3179	// Matches start with a set of character smaller than the set of all chars.
3180	fRXPat->fStartType = START_SET;
3181	} else {
3182	// Matches can start with anything
3183	fRXPat->fStartType = START_NO_INFO;
3184	}
3185
3186	return;
3187	}
3188
3189
3190
3191	//------------------------------------------------------------------------------
3192	//
3193	// minMatchLength Calculate the length of the shortest string that could
3194	// match the specified pattern.
3195	// Length is in 16 bit code units, not code points.
3196	//
3197	// The calculated length may not be exact. The returned
3198	// value may be shorter than the actual minimum; it must
3199	// never be longer.
3200	//
3201	// start and end are the range of p-code operations to be
3202	// examined. The endpoints are included in the range.
3203	//
3204	//------------------------------------------------------------------------------
3205	int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
3206	if (U_FAILURE(*fStatus)) {
3207	return `0`;
3208	}
3209
3210	U_ASSERT(start <= end);
3211	U_ASSERT(end < fRXPat->fCompiledPat->size());
3212
3213
3214	int32_t loc;
3215	int32_t op;
3216	int32_t opType;
3217	int32_t currentLen = `0`;
3218
3219
3220	// forwardedLength is a vector holding minimum-match-length values that
3221	// are propagated forward in the pattern by JMP or STATE_SAVE operations.
3222	// It must be one longer than the pattern being checked because some ops
3223	// will jmp to a end-of-block+1 location from within a block, and we must
3224	// count those when checking the block.
3225	UVector32 forwardedLength(end+`2`, *fStatus);
3226	forwardedLength.setSize(end+`2`);
3227	for (loc=start; loc<=end+`1`; loc++) {
3228	forwardedLength.setElementAt(INT32_MAX, loc);
3229	}
3230
3231	for (loc = start; loc<=end; loc++) {
3232	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3233	opType = URX_TYPE(op);
3234
3235	// The loop is advancing linearly through the pattern.
3236	// If the op we are now at was the destination of a branch in the pattern,
3237	// and that path has a shorter minimum length than the current accumulated value,
3238	// replace the current accumulated value.
3239	// U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
3240	// no-match-possible cases.
3241	if (forwardedLength.elementAti(loc) < currentLen) {
3242	currentLen = forwardedLength.elementAti(loc);
3243	U_ASSERT(currentLen>=`0` && currentLen < INT32_MAX);
3244	}
3245
3246	switch (opType) {
3247	// Ops that don't change the total length matched
3248	case URX_RESERVED_OP:
3249	case URX_END:
3250	case URX_STRING_LEN:
3251	case URX_NOP:
3252	case URX_START_CAPTURE:
3253	case URX_END_CAPTURE:
3254	case URX_BACKSLASH_B:
3255	case URX_BACKSLASH_BU:
3256	case URX_BACKSLASH_G:
3257	case URX_BACKSLASH_Z:
3258	case URX_CARET:
3259	case URX_DOLLAR:
3260	case URX_DOLLAR_M:
3261	case URX_DOLLAR_D:
3262	case URX_DOLLAR_MD:
3263	case URX_RELOC_OPRND:
3264	case URX_STO_INP_LOC:
3265	case URX_CARET_M:
3266	case URX_CARET_M_UNIX:
3267	case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3268	case URX_BACKREF_I:
3269
3270	case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3271	case URX_LD_SP:
3272
3273	case URX_JMP_SAV:
3274	case URX_JMP_SAV_X:
3275	break;
3276
3277
3278	// Ops that match a minimum of one character (one or two 16 bit code units.)
3279	//
3280	case URX_ONECHAR:
3281	case URX_STATIC_SETREF:
3282	case URX_STAT_SETREF_N:
3283	case URX_SETREF:
3284	case URX_BACKSLASH_D:
3285	case URX_BACKSLASH_H:
3286	case URX_BACKSLASH_R:
3287	case URX_BACKSLASH_V:
3288	case URX_ONECHAR_I:
3289	case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3290	case URX_DOTANY_ALL: // . matches one or two.
3291	case URX_DOTANY:
3292	case URX_DOTANY_UNIX:
3293	currentLen = safeIncrement(currentLen, `1`);
3294	break;
3295
3296
3297	case URX_JMPX:
3298	loc++; // URX_JMPX has an extra operand, ignored here,
3299	// otherwise processed identically to URX_JMP.
3300	U_FALLTHROUGH;
3301	case URX_JMP:
3302	{
3303	int32_t jmpDest = URX_VAL(op);
3304	if (jmpDest < loc) {
3305	// Loop of some kind. Can safely ignore, the worst that will happen
3306	// is that we understate the true minimum length
3307	currentLen = forwardedLength.elementAti(loc+`1`);
3308	} else {
3309	// Forward jump. Propagate the current min length to the target loc of the jump.
3310	U_ASSERT(jmpDest <= end+`1`);
3311	if (forwardedLength.elementAti(jmpDest) > currentLen) {
3312	forwardedLength.setElementAt(currentLen, jmpDest);
3313	}
3314	}
3315	}
3316	break;
3317
3318	case URX_BACKTRACK:
3319	{
3320	// Back-tracks are kind of like a branch, except that the min length was
3321	// propagated already, by the state save.
3322	currentLen = forwardedLength.elementAti(loc+`1`);
3323	}
3324	break;
3325
3326
3327	case URX_STATE_SAVE:
3328	{
3329	// State Save, for forward jumps, propagate the current minimum.
3330	// of the state save.
3331	int32_t jmpDest = URX_VAL(op);
3332	if (jmpDest > loc) {
3333	if (currentLen < forwardedLength.elementAti(jmpDest)) {
3334	forwardedLength.setElementAt(currentLen, jmpDest);
3335	}
3336	}
3337	}
3338	break;
3339
3340
3341	case URX_STRING:
3342	{
3343	loc++;
3344	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3345	currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3346	}
3347	break;
3348
3349
3350	case URX_STRING_I:
3351	{
3352	loc++;
3353	// TODO: with full case folding, matching input text may be shorter than
3354	// the string we have here. More smarts could put some bounds on it.
3355	// Assume a min length of one for now. A min length of zero causes
3356	// optimization failures for a pattern like "string"+
3357	// currentLen += URX_VAL(stringLenOp);
3358	currentLen = safeIncrement(currentLen, `1`);
3359	}
3360	break;
3361
3362	case URX_CTR_INIT:
3363	case URX_CTR_INIT_NG:
3364	{
3365	// Loop Init Ops.
3366	// If the min loop count == 0
3367	// move loc forwards to the end of the loop, skipping over the body.
3368	// If the min count is > 0,
3369	// continue normal processing of the body of the loop.
3370	int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+`1`);
3371	loopEndLoc = URX_VAL(loopEndLoc);
3372	int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+`2`);
3373	if (minLoopCount == `0`) {
3374	loc = loopEndLoc;
3375	} else {
3376	loc+=`3`; // Skips over operands of CTR_INIT
3377	}
3378	}
3379	break;
3380
3381
3382	case URX_CTR_LOOP:
3383	case URX_CTR_LOOP_NG:
3384	// Loop ops.
3385	// The jump is conditional, backwards only.
3386	break;
3387
3388	case URX_LOOP_SR_I:
3389	case URX_LOOP_DOT_I:
3390	case URX_LOOP_C:
3391	// More loop ops. These state-save to themselves.
3392	// don't change the minimum match - could match nothing at all.
3393	break;
3394
3395
3396	case URX_LA_START:
3397	case URX_LB_START:
3398	{
3399	// Look-around. Scan forward until the matching look-ahead end,
3400	// without processing the look-around block. This is overly pessimistic for look-ahead,
3401	// it assumes that the look-ahead match might be zero-length.
3402	// TODO: Positive lookahead could recursively do the block, then continue
3403	// with the longer of the block or the value coming in. Ticket 6060
3404	int32_t depth = (opType == URX_LA_START? `2`: `1`);
3405	for (;;) {
3406	loc++;
3407	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3408	if (URX_TYPE(op) == URX_LA_START) {
3409	// The boilerplate for look-ahead includes two LA_END insturctions,
3410	// Depth will be decremented by each one when it is seen.
3411	depth += `2`;
3412	}
3413	if (URX_TYPE(op) == URX_LB_START) {
3414	depth++;
3415	}
3416	if (URX_TYPE(op) == URX_LA_END) {
3417	depth--;
3418	if (depth == `0`) {
3419	break;
3420	}
3421	}
3422	if (URX_TYPE(op)==URX_LBN_END) {
3423	depth--;
3424	if (depth == `0`) {
3425	break;
3426	}
3427	}
3428	if (URX_TYPE(op) == URX_STATE_SAVE) {
3429	// Need this because neg lookahead blocks will FAIL to outside
3430	// of the block.
3431	int32_t jmpDest = URX_VAL(op);
3432	if (jmpDest > loc) {
3433	if (currentLen < forwardedLength.elementAti(jmpDest)) {
3434	forwardedLength.setElementAt(currentLen, jmpDest);
3435	}
3436	}
3437	}
3438	U_ASSERT(loc <= end);
3439	}
3440	}
3441	break;
3442
3443	case URX_LA_END:
3444	case URX_LB_CONT:
3445	case URX_LB_END:
3446	case URX_LBN_CONT:
3447	case URX_LBN_END:
3448	// Only come here if the matching URX_LA_START or URX_LB_START was not in the
3449	// range being sized, which happens when measuring size of look-behind blocks.
3450	break;
3451
3452	default:
3453	UPRV_UNREACHABLE;
3454	}
3455
3456	}
3457
3458	// We have finished walking through the ops. Check whether some forward jump
3459	// propagated a shorter length to location end+1.
3460	if (forwardedLength.elementAti(end+`1`) < currentLen) {
3461	currentLen = forwardedLength.elementAti(end+`1`);
3462	U_ASSERT(currentLen>=`0` && currentLen < INT32_MAX);
3463	}
3464
3465	return currentLen;
3466	}
3467
3468	//------------------------------------------------------------------------------
3469	//
3470	// maxMatchLength Calculate the length of the longest string that could
3471	// match the specified pattern.
3472	// Length is in 16 bit code units, not code points.
3473	//
3474	// The calculated length may not be exact. The returned
3475	// value may be longer than the actual maximum; it must
3476	// never be shorter.
3477	//
3478	//------------------------------------------------------------------------------
3479	int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3480	if (U_FAILURE(*fStatus)) {
3481	return `0`;
3482	}
3483	U_ASSERT(start <= end);
3484	U_ASSERT(end < fRXPat->fCompiledPat->size());
3485
3486	int32_t loc;
3487	int32_t op;
3488	int32_t opType;
3489	int32_t currentLen = `0`;
3490	UVector32 forwardedLength(end+`1`, *fStatus);
3491	forwardedLength.setSize(end+`1`);
3492
3493	for (loc=start; loc<=end; loc++) {
3494	forwardedLength.setElementAt(`0`, loc);
3495	}
3496
3497	for (loc = start; loc<=end; loc++) {
3498	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3499	opType = URX_TYPE(op);
3500
3501	// The loop is advancing linearly through the pattern.
3502	// If the op we are now at was the destination of a branch in the pattern,
3503	// and that path has a longer maximum length than the current accumulated value,
3504	// replace the current accumulated value.
3505	if (forwardedLength.elementAti(loc) > currentLen) {
3506	currentLen = forwardedLength.elementAti(loc);
3507	}
3508
3509	switch (opType) {
3510	// Ops that don't change the total length matched
3511	case URX_RESERVED_OP:
3512	case URX_END:
3513	case URX_STRING_LEN:
3514	case URX_NOP:
3515	case URX_START_CAPTURE:
3516	case URX_END_CAPTURE:
3517	case URX_BACKSLASH_B:
3518	case URX_BACKSLASH_BU:
3519	case URX_BACKSLASH_G:
3520	case URX_BACKSLASH_Z:
3521	case URX_CARET:
3522	case URX_DOLLAR:
3523	case URX_DOLLAR_M:
3524	case URX_DOLLAR_D:
3525	case URX_DOLLAR_MD:
3526	case URX_RELOC_OPRND:
3527	case URX_STO_INP_LOC:
3528	case URX_CARET_M:
3529	case URX_CARET_M_UNIX:
3530
3531	case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3532	case URX_LD_SP:
3533
3534	case URX_LB_END:
3535	case URX_LB_CONT:
3536	case URX_LBN_CONT:
3537	case URX_LBN_END:
3538	break;
3539
3540
3541	// Ops that increase that cause an unbounded increase in the length
3542	// of a matched string, or that increase it a hard to characterize way.
3543	// Call the max length unbounded, and stop further checking.
3544	case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3545	case URX_BACKREF_I:
3546	case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3547	currentLen = INT32_MAX;
3548	break;
3549
3550
3551	// Ops that match a max of one character (possibly two 16 bit code units.)
3552	//
3553	case URX_STATIC_SETREF:
3554	case URX_STAT_SETREF_N:
3555	case URX_SETREF:
3556	case URX_BACKSLASH_D:
3557	case URX_BACKSLASH_H:
3558	case URX_BACKSLASH_R:
3559	case URX_BACKSLASH_V:
3560	case URX_ONECHAR_I:
3561	case URX_DOTANY_ALL:
3562	case URX_DOTANY:
3563	case URX_DOTANY_UNIX:
3564	currentLen = safeIncrement(currentLen, `2`);
3565	break;
3566
3567	// Single literal character. Increase current max length by one or two,
3568	// depending on whether the char is in the supplementary range.
3569	case URX_ONECHAR:
3570	currentLen = safeIncrement(currentLen, `1`);
3571	if (URX_VAL(op) > `0x10000`) {
3572	currentLen = safeIncrement(currentLen, `1`);
3573	}
3574	break;
3575
3576	// Jumps.
3577	//
3578	case URX_JMP:
3579	case URX_JMPX:
3580	case URX_JMP_SAV:
3581	case URX_JMP_SAV_X:
3582	{
3583	int32_t jmpDest = URX_VAL(op);
3584	if (jmpDest < loc) {
3585	// Loop of some kind. Max match length is unbounded.
3586	currentLen = INT32_MAX;
3587	} else {
3588	// Forward jump. Propagate the current min length to the target loc of the jump.
3589	if (forwardedLength.elementAti(jmpDest) < currentLen) {
3590	forwardedLength.setElementAt(currentLen, jmpDest);
3591	}
3592	currentLen = `0`;
3593	}
3594	}
3595	break;
3596
3597	case URX_BACKTRACK:
3598	// back-tracks are kind of like a branch, except that the max length was
3599	// propagated already, by the state save.
3600	currentLen = forwardedLength.elementAti(loc+`1`);
3601	break;
3602
3603
3604	case URX_STATE_SAVE:
3605	{
3606	// State Save, for forward jumps, propagate the current minimum.
3607	// of the state save.
3608	// For backwards jumps, they create a loop, maximum
3609	// match length is unbounded.
3610	int32_t jmpDest = URX_VAL(op);
3611	if (jmpDest > loc) {
3612	if (currentLen > forwardedLength.elementAti(jmpDest)) {
3613	forwardedLength.setElementAt(currentLen, jmpDest);
3614	}
3615	} else {
3616	currentLen = INT32_MAX;
3617	}
3618	}
3619	break;
3620
3621
3622
3623
3624	case URX_STRING:
3625	{
3626	loc++;
3627	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3628	currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3629	break;
3630	}
3631
3632	case URX_STRING_I:
3633	// TODO: This code assumes that any user string that matches will be no longer
3634	// than our compiled string, with case insensitive matching.
3635	// Our compiled string has been case-folded already.
3636	//
3637	// Any matching user string will have no more code points than our
3638	// compiled (folded) string. Folding may add code points, but
3639	// not remove them.
3640	//
3641	// There is a potential problem if a supplemental code point
3642	// case-folds to a BMP code point. In this case our compiled string
3643	// could be shorter (in code units) than a matching user string.
3644	//
3645	// At this time (Unicode 6.1) there are no such characters, and this case
3646	// is not being handled. A test, intltest regex/Bug9283, will fail if
3647	// any problematic characters are added to Unicode.
3648	//
3649	// If this happens, we can make a set of the BMP chars that the
3650	// troublesome supplementals fold to, scan our string, and bump the
3651	// currentLen one extra for each that is found.
3652	//
3653	{
3654	loc++;
3655	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3656	currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3657	}
3658	break;
3659
3660	case URX_CTR_INIT:
3661	case URX_CTR_INIT_NG:
3662	// For Loops, recursively call this function on the pattern for the loop body,
3663	// then multiply the result by the maximum loop count.
3664	{
3665	int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+`1`));
3666	if (loopEndLoc == loc+`4`) {
3667	// Loop has an empty body. No affect on max match length.
3668	// Continue processing with code after the loop end.
3669	loc = loopEndLoc;
3670	break;
3671	}
3672
3673	int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+`3`));
3674	if (maxLoopCount == -`1`) {
3675	// Unbounded Loop. No upper bound on match length.
3676	currentLen = INT32_MAX;
3677	break;
3678	}
3679
3680	U_ASSERT(loopEndLoc >= loc+`4`);
3681	int64_t blockLen = maxMatchLength(loc+`4`, loopEndLoc-`1`); // Recursive call.
3682	int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3683	if (updatedLen >= INT32_MAX) {
3684	currentLen = INT32_MAX;
3685	break;
3686	}
3687	currentLen = (int32_t)updatedLen;
3688	loc = loopEndLoc;
3689	break;
3690	}
3691
3692	case URX_CTR_LOOP:
3693	case URX_CTR_LOOP_NG:
3694	// These opcodes will be skipped over by code for URX_CTR_INIT.
3695	// We shouldn't encounter them here.
3696	UPRV_UNREACHABLE;
3697
3698	case URX_LOOP_SR_I:
3699	case URX_LOOP_DOT_I:
3700	case URX_LOOP_C:
3701	// For anything to do with loops, make the match length unbounded.
3702	currentLen = INT32_MAX;
3703	break;
3704
3705
3706
3707	case URX_LA_START:
3708	case URX_LA_END:
3709	// Look-ahead. Just ignore, treat the look-ahead block as if
3710	// it were normal pattern. Gives a too-long match length,
3711	// but good enough for now.
3712	break;
3713
3714	// End of look-ahead ops should always be consumed by the processing at
3715	// the URX_LA_START op.
3716	// UPRV_UNREACHABLE;
3717
3718	case URX_LB_START:
3719	{
3720	// Look-behind. Scan forward until the matching look-around end,
3721	// without processing the look-behind block.
3722	int32_t dataLoc = URX_VAL(op);
3723	for (loc = loc + `1`; loc < end; ++loc) {
3724	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3725	int32_t opType = URX_TYPE(op);
3726	if ((opType == URX_LA_END \|\| opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
3727	break;
3728	}
3729	}
3730	U_ASSERT(loc < end);
3731	}
3732	break;
3733
3734	default:
3735	UPRV_UNREACHABLE;
3736	}
3737
3738
3739	if (currentLen == INT32_MAX) {
3740	// The maximum length is unbounded.
3741	// Stop further processing of the pattern.
3742	break;
3743	}
3744
3745	}
3746	return currentLen;
3747
3748	}
3749
3750
3751	//------------------------------------------------------------------------------
3752	//
3753	// stripNOPs Remove any NOP operations from the compiled pattern code.
3754	// Extra NOPs are inserted for some constructs during the initial
3755	// code generation to provide locations that may be patched later.
3756	// Many end up unneeded, and are removed by this function.
3757	//
3758	// In order to minimize the number of passes through the pattern,
3759	// back-reference fixup is also performed here (adjusting
3760	// back-reference operands to point to the correct frame offsets).
3761	//
3762	//------------------------------------------------------------------------------
3763	void RegexCompile::stripNOPs() {
3764
3765	if (U_FAILURE(*fStatus)) {
3766	return;
3767	}
3768
3769	int32_t end = fRXPat->fCompiledPat->size();
3770	UVector32 deltas(end, *fStatus);
3771
3772	// Make a first pass over the code, computing the amount that things
3773	// will be offset at each location in the original code.
3774	int32_t loc;
3775	int32_t d = `0`;
3776	for (loc=`0`; loc<end; loc++) {
3777	deltas.addElement(d, *fStatus);
3778	int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3779	if (URX_TYPE(op) == URX_NOP) {
3780	d++;
3781	}
3782	}
3783
3784	UnicodeString caseStringBuffer;
3785
3786	// Make a second pass over the code, removing the NOPs by moving following
3787	// code up, and patching operands that refer to code locations that
3788	// are being moved. The array of offsets from the first step is used
3789	// to compute the new operand values.
3790	int32_t src;
3791	int32_t dst = `0`;
3792	for (src=`0`; src<end; src++) {
3793	int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3794	int32_t opType = URX_TYPE(op);
3795	switch (opType) {
3796	case URX_NOP:
3797	break;
3798
3799	case URX_STATE_SAVE:
3800	case URX_JMP:
3801	case URX_CTR_LOOP:
3802	case URX_CTR_LOOP_NG:
3803	case URX_RELOC_OPRND:
3804	case URX_JMPX:
3805	case URX_JMP_SAV:
3806	case URX_JMP_SAV_X:
3807	// These are instructions with operands that refer to code locations.
3808	{
3809	int32_t operandAddress = URX_VAL(op);
3810	U_ASSERT(operandAddress>=`0` && operandAddress<deltas.size());
3811	int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3812	op = buildOp(opType, fixedOperandAddress);
3813	fRXPat->fCompiledPat->setElementAt(op, dst);
3814	dst++;
3815	break;
3816	}
3817
3818	case URX_BACKREF:
3819	case URX_BACKREF_I:
3820	{
3821	int32_t where = URX_VAL(op);
3822	if (where > fRXPat->fGroupMap->size()) {
3823	error(U_REGEX_INVALID_BACK_REF);
3824	break;
3825	}
3826	where = fRXPat->fGroupMap->elementAti(where-`1`);
3827	op = buildOp(opType, where);
3828	fRXPat->fCompiledPat->setElementAt(op, dst);
3829	dst++;
3830
3831	fRXPat->fNeedsAltInput = TRUE;
3832	break;
3833	}
3834	case URX_RESERVED_OP:
3835	case URX_RESERVED_OP_N:
3836	case URX_BACKTRACK:
3837	case URX_END:
3838	case URX_ONECHAR:
3839	case URX_STRING:
3840	case URX_STRING_LEN:
3841	case URX_START_CAPTURE:
3842	case URX_END_CAPTURE:
3843	case URX_STATIC_SETREF:
3844	case URX_STAT_SETREF_N:
3845	case URX_SETREF:
3846	case URX_DOTANY:
3847	case URX_FAIL:
3848	case URX_BACKSLASH_B:
3849	case URX_BACKSLASH_BU:
3850	case URX_BACKSLASH_G:
3851	case URX_BACKSLASH_X:
3852	case URX_BACKSLASH_Z:
3853	case URX_DOTANY_ALL:
3854	case URX_BACKSLASH_D:
3855	case URX_CARET:
3856	case URX_DOLLAR:
3857	case URX_CTR_INIT:
3858	case URX_CTR_INIT_NG:
3859	case URX_DOTANY_UNIX:
3860	case URX_STO_SP:
3861	case URX_LD_SP:
3862	case URX_STO_INP_LOC:
3863	case URX_LA_START:
3864	case URX_LA_END:
3865	case URX_ONECHAR_I:
3866	case URX_STRING_I:
3867	case URX_DOLLAR_M:
3868	case URX_CARET_M:
3869	case URX_CARET_M_UNIX:
3870	case URX_LB_START:
3871	case URX_LB_CONT:
3872	case URX_LB_END:
3873	case URX_LBN_CONT:
3874	case URX_LBN_END:
3875	case URX_LOOP_SR_I:
3876	case URX_LOOP_DOT_I:
3877	case URX_LOOP_C:
3878	case URX_DOLLAR_D:
3879	case URX_DOLLAR_MD:
3880	case URX_BACKSLASH_H:
3881	case URX_BACKSLASH_R:
3882	case URX_BACKSLASH_V:
3883	// These instructions are unaltered by the relocation.
3884	fRXPat->fCompiledPat->setElementAt(op, dst);
3885	dst++;
3886	break;
3887
3888	default:
3889	// Some op is unaccounted for.
3890	UPRV_UNREACHABLE;
3891	}
3892	}
3893
3894	fRXPat->fCompiledPat->setSize(dst);
3895	}
3896
3897
3898
3899
3900	//------------------------------------------------------------------------------
3901	//
3902	// Error Report a rule parse error.
3903	// Only report it if no previous error has been recorded.
3904	//
3905	//------------------------------------------------------------------------------
3906	void RegexCompile::error(UErrorCode e) {
3907	if (U_SUCCESS(*fStatus) \|\| e == U_MEMORY_ALLOCATION_ERROR) {
3908	*fStatus = e;
3909	// Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3910	// API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3911	// int64_t. If the values of the latter are out of range for the former,
3912	// set them to the appropriate "field not supported" values.
3913	if (fLineNum > `0x7FFFFFFF`) {
3914	fParseErr->line = `0`;
3915	fParseErr->offset = -`1`;
3916	} else if (fCharNum > `0x7FFFFFFF`) {
3917	fParseErr->line = (int32_t)fLineNum;
3918	fParseErr->offset = -`1`;
3919	} else {
3920	fParseErr->line = (int32_t)fLineNum;
3921	fParseErr->offset = (int32_t)fCharNum;
3922	}
3923
3924	UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3925
3926	// Fill in the context.
3927	// Note: extractBetween() pins supplied indicies to the string bounds.
3928	uprv_memset(fParseErr->preContext, `0`, sizeof(fParseErr->preContext));
3929	uprv_memset(fParseErr->postContext, `0`, sizeof(fParseErr->postContext));
3930	utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+`1`, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3931	utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-`1`, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3932	}
3933	}
3934
3935
3936	//
3937	// Assorted Unicode character constants.
3938	// Numeric because there is no portable way to enter them as literals.
3939	// (Think EBCDIC).
3940	//
3941	static const UChar chCR = `0x0d`; // New lines, for terminating comments.
3942	static const UChar chLF = `0x0a`; // Line Feed
3943	static const UChar chPound = `0x23`; // '#', introduces a comment.
3944	static const UChar chDigit0 = `0x30`; // '0'
3945	static const UChar chDigit7 = `0x37`; // '9'
3946	static const UChar chColon = `0x3A`; // ':'
3947	static const UChar chE = `0x45`; // 'E'
3948	static const UChar chQ = `0x51`; // 'Q'
3949	//static const UChar chN = 0x4E; // 'N'
3950	static const UChar chP = `0x50`; // 'P'
3951	static const UChar chBackSlash = `0x5c`; // '\' introduces a char escape
3952	//static const UChar chLBracket = 0x5b; // '['
3953	static const UChar chRBracket = `0x5d`; // ']'
3954	static const UChar chUp = `0x5e`; // '^'
3955	static const UChar chLowerP = `0x70`;
3956	static const UChar chLBrace = `0x7b`; // '{'
3957	static const UChar chRBrace = `0x7d`; // '}'
3958	static const UChar chNEL = `0x85`; // NEL newline variant
3959	static const UChar chLS = `0x2028`; // Unicode Line Separator
3960
3961
3962	//------------------------------------------------------------------------------
3963	//
3964	// nextCharLL Low Level Next Char from the regex pattern.
3965	// Get a char from the string, keep track of input position
3966	// for error reporting.
3967	//
3968	//------------------------------------------------------------------------------
3969	UChar32 RegexCompile::nextCharLL() {
3970	UChar32 ch;
3971
3972	if (fPeekChar != -`1`) {
3973	ch = fPeekChar;
3974	fPeekChar = -`1`;
3975	return ch;
3976	}
3977
3978	// assume we're already in the right place
3979	ch = UTEXT_NEXT32(fRXPat->fPattern);
3980	if (ch == U_SENTINEL) {
3981	return ch;
3982	}
3983
3984	if (ch == chCR \|\|
3985	ch == chNEL \|\|
3986	ch == chLS \|\|
3987	(ch == chLF && fLastChar != chCR)) {
3988	// Character is starting a new line. Bump up the line number, and
3989	// reset the column to 0.
3990	fLineNum++;
3991	fCharNum=`0`;
3992	}
3993	else {
3994	// Character is not starting a new line. Except in the case of a
3995	// LF following a CR, increment the column position.
3996	if (ch != chLF) {
3997	fCharNum++;
3998	}
3999	}
4000	fLastChar = ch;
4001	return ch;
4002	}
4003
4004	//------------------------------------------------------------------------------
4005	//
4006	// peekCharLL Low Level Character Scanning, sneak a peek at the next
4007	// character without actually getting it.
4008	//
4009	//------------------------------------------------------------------------------
4010	UChar32 RegexCompile::peekCharLL() {
4011	if (fPeekChar == -`1`) {
4012	fPeekChar = nextCharLL();
4013	}
4014	return fPeekChar;
4015	}
4016
4017
4018	//------------------------------------------------------------------------------
4019	//
4020	// nextChar for pattern scanning. At this level, we handle stripping
4021	// out comments and processing some backslash character escapes.
4022	// The rest of the pattern grammar is handled at the next level up.
4023	//
4024	//------------------------------------------------------------------------------
4025	void RegexCompile::nextChar(RegexPatternChar &c) {
4026	tailRecursion:
4027	fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4028	c.fChar = nextCharLL();
4029	c.fQuoted = FALSE;
4030
4031	if (fQuoteMode) {
4032	c.fQuoted = TRUE;
4033	if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == `0`)) \|\|
4034	c.fChar == (UChar32)-`1`) {
4035	fQuoteMode = FALSE; // Exit quote mode,
4036	nextCharLL(); // discard the E
4037	// nextChar(c); // recurse to get the real next char
4038	goto tailRecursion; // Note: fuzz testing produced testcases that
4039	// resulted in stack overflow here.
4040	}
4041	}
4042	else if (fInBackslashQuote) {
4043	// The current character immediately follows a '\'
4044	// Don't check for any further escapes, just return it as-is.
4045	// Don't set c.fQuoted, because that would prevent the state machine from
4046	// dispatching on the character.
4047	fInBackslashQuote = FALSE;
4048	}
4049	else
4050	{
4051	// We are not in a \Q quoted region \E of the source.
4052	//
4053	if (fModeFlags & UREGEX_COMMENTS) {
4054	//
4055	// We are in free-spacing and comments mode.
4056	// Scan through any white space and comments, until we
4057	// reach a significant character or the end of inut.
4058	for (;;) {
4059	if (c.fChar == (UChar32)-`1`) {
4060	break; // End of Input
4061	}
4062	if (c.fChar == chPound && fEOLComments == TRUE) {
4063	// Start of a comment. Consume the rest of it, until EOF or a new line
4064	for (;;) {
4065	c.fChar = nextCharLL();
4066	if (c.fChar == (UChar32)-`1` \|\| // EOF
4067	c.fChar == chCR \|\|
4068	c.fChar == chLF \|\|
4069	c.fChar == chNEL \|\|
4070	c.fChar == chLS) {
4071	break;
4072	}
4073	}
4074	}
4075	// TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061.
4076	if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4077	break;
4078	}
4079	c.fChar = nextCharLL();
4080	}
4081	}
4082
4083	//
4084	// check for backslash escaped characters.
4085	//
4086	if (c.fChar == chBackSlash) {
4087	int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4088	if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4089	//
4090	// A '\' sequence that is handled by ICU's standard unescapeAt function.
4091	// Includes \uxxxx, \n, \r, many others.
4092	// Return the single equivalent character.
4093	//
4094	nextCharLL(); // get & discard the peeked char.
4095	c.fQuoted = TRUE;
4096
4097	if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4098	int32_t endIndex = (int32_t)pos;
4099	c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4100
4101	if (endIndex == pos) {
4102	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4103	}
4104	fCharNum += endIndex - pos;
4105	UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4106	} else {
4107	int32_t offset = `0`;
4108	struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4109
4110	UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4111	c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4112
4113	if (offset == `0`) {
4114	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4115	} else if (context.lastOffset == offset) {
4116	UTEXT_PREVIOUS32(fRXPat->fPattern);
4117	} else if (context.lastOffset != offset-`1`) {
4118	utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - `1`);
4119	}
4120	fCharNum += offset;
4121	}
4122	}
4123	else if (peekCharLL() == chDigit0) {
4124	// Octal Escape, using Java Regexp Conventions
4125	// which are \0 followed by 1-3 octal digits.
4126	// Different from ICU Unescape handling of Octal, which does not
4127	// require the leading 0.
4128	// Java also has the convention of only consuming 2 octal digits if
4129	// the three digit number would be > 0xff
4130	//
4131	c.fChar = `0`;
4132	nextCharLL(); // Consume the initial 0.
4133	int index;
4134	for (index=`0`; index<`3`; index++) {
4135	int32_t ch = peekCharLL();
4136	if (ch<chDigit0 \|\| ch>chDigit7) {
4137	if (index==`0`) {
4138	// \0 is not followed by any octal digits.
4139	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4140	}
4141	break;
4142	}
4143	c.fChar <<= `3`;
4144	c.fChar += ch&`7`;
4145	if (c.fChar <= `255`) {
4146	nextCharLL();
4147	} else {
4148	// The last digit made the number too big. Forget we saw it.
4149	c.fChar >>= `3`;
4150	}
4151	}
4152	c.fQuoted = TRUE;
4153	}
4154	else if (peekCharLL() == chQ) {
4155	// "\Q" enter quote mode, which will continue until "\E"
4156	fQuoteMode = TRUE;
4157	nextCharLL(); // discard the 'Q'.
4158	// nextChar(c); // recurse to get the real next char.
4159	goto tailRecursion; // Note: fuzz testing produced test cases that
4160	// resulted in stack overflow here.
4161	}
4162	else
4163	{
4164	// We are in a '\' escape that will be handled by the state table scanner.
4165	// Just return the backslash, but remember that the following char is to
4166	// be taken literally.
4167	fInBackslashQuote = TRUE;
4168	}
4169	}
4170	}
4171
4172	// re-enable # to end-of-line comments, in case they were disabled.
4173	// They are disabled by the parser upon seeing '(?', but this lasts for
4174	// the fetching of the next character only.
4175	fEOLComments = TRUE;
4176
4177	// putc(c.fChar, stdout);
4178	}
4179
4180
4181
4182	//------------------------------------------------------------------------------
4183	//
4184	// scanNamedChar
4185	// Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4186	//
4187	// The scan position will be at the 'N'. On return
4188	// the scan position should be just after the '}'
4189	//
4190	// Return the UChar32
4191	//
4192	//------------------------------------------------------------------------------
4193	UChar32 RegexCompile::scanNamedChar() {
4194	if (U_FAILURE(*fStatus)) {
4195	return `0`;
4196	}
4197
4198	nextChar(fC);
4199	if (fC.fChar != chLBrace) {
4200	error(U_REGEX_PROPERTY_SYNTAX);
4201	return `0`;
4202	}
4203
4204	UnicodeString charName;
4205	for (;;) {
4206	nextChar(fC);
4207	if (fC.fChar == chRBrace) {
4208	break;
4209	}
4210	if (fC.fChar == -`1`) {
4211	error(U_REGEX_PROPERTY_SYNTAX);
4212	return `0`;
4213	}
4214	charName.append(fC.fChar);
4215	}
4216
4217	char name[`100`];
4218	if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) \|\|
4219	(uint32_t)charName.length()>=sizeof(name)) {
4220	// All Unicode character names have only invariant characters.
4221	// The API to get a character, given a name, accepts only char , forcing us to convert,*
4222	// which requires this error check
4223	error(U_REGEX_PROPERTY_SYNTAX);
4224	return `0`;
4225	}
4226	charName.extract(`0`, charName.length(), name, sizeof(name), US_INV);
4227
4228	UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4229	if (U_FAILURE(*fStatus)) {
4230	error(U_REGEX_PROPERTY_SYNTAX);
4231	}
4232
4233	nextChar(fC); // Continue overall regex pattern processing with char after the '}'
4234	return theChar;
4235	}
4236
4237	//------------------------------------------------------------------------------
4238	//
4239	// scanProp Construct a UnicodeSet from the text at the current scan
4240	// position, which will be of the form \p{whaterver}
4241	//
4242	// The scan position will be at the 'p' or 'P'. On return
4243	// the scan position should be just after the '}'
4244	//
4245	// Return a UnicodeSet, constructed from the \P pattern,
4246	// or NULL if the pattern is invalid.
4247	//
4248	//------------------------------------------------------------------------------
4249	UnicodeSet *RegexCompile::scanProp() {
4250	UnicodeSet *uset = NULL;
4251
4252	if (U_FAILURE(*fStatus)) {
4253	return NULL;
4254	}
4255	(void)chLowerP; // Suppress compiler unused variable warning.
4256	U_ASSERT(fC.fChar == chLowerP \|\| fC.fChar == chP);
4257	UBool negated = (fC.fChar == chP);
4258
4259	UnicodeString propertyName;
4260	nextChar(fC);
4261	if (fC.fChar != chLBrace) {
4262	error(U_REGEX_PROPERTY_SYNTAX);
4263	return NULL;
4264	}
4265	for (;;) {
4266	nextChar(fC);
4267	if (fC.fChar == chRBrace) {
4268	break;
4269	}
4270	if (fC.fChar == -`1`) {
4271	// Hit the end of the input string without finding the closing '}'
4272	error(U_REGEX_PROPERTY_SYNTAX);
4273	return NULL;
4274	}
4275	propertyName.append(fC.fChar);
4276	}
4277	uset = createSetForProperty(propertyName, negated);
4278	nextChar(fC); // Move input scan to position following the closing '}'
4279	return uset;
4280	}
4281
4282	//------------------------------------------------------------------------------
4283	//
4284	// scanPosixProp Construct a UnicodeSet from the text at the current scan
4285	// position, which is expected be of the form [:property expression:]
4286	//
4287	// The scan position will be at the opening ':'. On return
4288	// the scan position must be on the closing ']'
4289	//
4290	// Return a UnicodeSet constructed from the pattern,
4291	// or NULL if this is not a valid POSIX-style set expression.
4292	// If not a property expression, restore the initial scan position
4293	// (to the opening ':')
4294	//
4295	// Note: the opening '[:' is not sufficient to guarantee that
4296	// this is a [:property:] expression.
4297	// [:'+=,] is a perfectly good ordinary set expression that
4298	// happens to include ':' as one of its characters.
4299	//
4300	//------------------------------------------------------------------------------
4301	UnicodeSet *RegexCompile::scanPosixProp() {
4302	UnicodeSet *uset = NULL;
4303
4304	if (U_FAILURE(*fStatus)) {
4305	return NULL;
4306	}
4307
4308	U_ASSERT(fC.fChar == chColon);
4309
4310	// Save the scanner state.
4311	// TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
4312	int64_t savedScanIndex = fScanIndex;
4313	int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4314	UBool savedQuoteMode = fQuoteMode;
4315	UBool savedInBackslashQuote = fInBackslashQuote;
4316	UBool savedEOLComments = fEOLComments;
4317	int64_t savedLineNum = fLineNum;
4318	int64_t savedCharNum = fCharNum;
4319	UChar32 savedLastChar = fLastChar;
4320	UChar32 savedPeekChar = fPeekChar;
4321	RegexPatternChar savedfC = fC;
4322
4323	// Scan for a closing ]. A little tricky because there are some perverse
4324	// edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
4325	// ending on the second closing ].
4326
4327	UnicodeString propName;
4328	UBool negated = FALSE;
4329
4330	// Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
4331	nextChar(fC);
4332	if (fC.fChar == chUp) {
4333	negated = TRUE;
4334	nextChar(fC);
4335	}
4336
4337	// Scan for the closing ":]", collecting the property name along the way.
4338	UBool sawPropSetTerminator = FALSE;
4339	for (;;) {
4340	propName.append(fC.fChar);
4341	nextChar(fC);
4342	if (fC.fQuoted \|\| fC.fChar == -`1`) {
4343	// Escaped characters or end of input - either says this isn't a [:Property:]
4344	break;
4345	}
4346	if (fC.fChar == chColon) {
4347	nextChar(fC);
4348	if (fC.fChar == chRBracket) {
4349	sawPropSetTerminator = TRUE;
4350	}
4351	break;
4352	}
4353	}
4354
4355	if (sawPropSetTerminator) {
4356	uset = createSetForProperty(propName, negated);
4357	}
4358	else
4359	{
4360	// No closing ":]".
4361	// Restore the original scan position.
4362	// The main scanner will retry the input as a normal set expression,
4363	// not a [:Property:] expression.
4364	fScanIndex = savedScanIndex;
4365	fQuoteMode = savedQuoteMode;
4366	fInBackslashQuote = savedInBackslashQuote;
4367	fEOLComments = savedEOLComments;
4368	fLineNum = savedLineNum;
4369	fCharNum = savedCharNum;
4370	fLastChar = savedLastChar;
4371	fPeekChar = savedPeekChar;
4372	fC = savedfC;
4373	UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4374	}
4375	return uset;
4376	}
4377
4378	static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4379	set->add(`0`, `8`).add(`0x0e`, `0x1b`).add(`0x7f`, `0x9f`);
4380	addCategory(set, U_GC_CF_MASK, ec);
4381	}
4382
4383	//
4384	// Create a Unicode Set from a Unicode Property expression.
4385	// This is common code underlying both \p{...} ane [:...:] expressions.
4386	// Includes trying the Java "properties" that aren't supported as
4387	// normal ICU UnicodeSet properties
4388	//
4389	UnicodeSet RegexCompile::createSetForProperty(const* UnicodeString &propName, UBool negated) {
4390
4391	if (U_FAILURE(*fStatus)) {
4392	return nullptr;
4393	}
4394	LocalPointer<UnicodeSet> set;
4395	UErrorCode status = U_ZERO_ERROR;
4396
4397	do { // non-loop, exists to allow breaks from the block.
4398	//
4399	// First try the property as we received it
4400	//
4401	UnicodeString setExpr;
4402	uint32_t usetFlags = `0`;
4403	setExpr.append(u"[\\p{", -`1`);
4404	setExpr.append(propName);
4405	setExpr.append(u"}]", -`1`);
4406	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4407	usetFlags \|= USET_CASE_INSENSITIVE;
4408	}
4409	set.adoptInsteadAndCheckErrorCode(new UnicodeSet (setExpr, usetFlags, NULL, status), status);
4410	if (U_SUCCESS(status) \|\| status == U_MEMORY_ALLOCATION_ERROR) {
4411	break;
4412	}
4413
4414	//
4415	// The incoming property wasn't directly recognized by ICU.
4416
4417	// Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4418	// Java accepts 'word' with mixed case.
4419	// Java accepts 'all' only in all lower case.
4420
4421	status = U_ZERO_ERROR;
4422	if (propName.caseCompare(u"word", -`1`, `0`) == `0`) {
4423	set.adoptInsteadAndCheckErrorCode(new UnicodeSet (*(fRXPat->fStaticSets[URX_ISWORD_SET])), status);
4424	break;
4425	}
4426	if (propName.compare(u"all", -`1`) == `0`) {
4427	set.adoptInsteadAndCheckErrorCode(new UnicodeSet (`0`, `0x10ffff`), status);
4428	break;
4429	}
4430
4431
4432	// Do Java InBlock expressions
4433	//
4434	UnicodeString mPropName = propName;
4435	if (mPropName.startsWith(u"In", `2`) && mPropName.length() >= `3`) {
4436	status = U_ZERO_ERROR;
4437	set.adoptInsteadAndCheckErrorCode(new UnicodeSet (), status);
4438	if (U_FAILURE(status)) {
4439	break;
4440	}
4441	UnicodeString blockName(mPropName, `2`); // Property with the leading "In" removed.
4442	set ->applyPropertyAlias(UnicodeString (u"Block"), blockName, status);
4443	break;
4444	}
4445
4446	// Check for the Java form "IsBooleanPropertyValue", which we will recast
4447	// as "BooleanPropertyValue". The property value can be either a
4448	// a General Category or a Script Name.
4449
4450	if (propName.startsWith(u"Is", `2`) && propName.length()>=`3`) {
4451	mPropName.remove(`0`, `2`); // Strip the "Is"
4452	if (mPropName.indexOf(u`'='`) >= `0`) {
4453	// Reject any "Is..." property expression containing an '=', that is,
4454	// any non-binary property expression.
4455	status = U_REGEX_PROPERTY_SYNTAX;
4456	break;
4457	}
4458
4459	if (mPropName.caseCompare(u"assigned", -`1`, `0`) == `0`) {
4460	mPropName.setTo(u"unassigned", -`1`);
4461	negated = !negated;
4462	} else if (mPropName.caseCompare(u"TitleCase", -`1`, `0`) == `0`) {
4463	mPropName.setTo(u"Titlecase_Letter", -`1`);
4464	}
4465
4466	mPropName.insert(`0`, u"[\\p{", -`1`);
4467	mPropName.append(u"}]", -`1`);
4468	set.adoptInsteadAndCheckErrorCode(new UnicodeSet (mPropName, *fStatus), status);
4469
4470	if (U_SUCCESS(status) && !set ->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4471	set ->closeOver(USET_CASE_INSENSITIVE);
4472	}
4473	break;
4474
4475	}
4476
4477	if (propName.startsWith(u"java", -`1`)) {
4478	status = U_ZERO_ERROR;
4479	set.adoptInsteadAndCheckErrorCode(new UnicodeSet (), status);
4480	if (U_FAILURE(status)) {
4481	break;
4482	}
4483	//
4484	// Try the various Java specific properties.
4485	// These all begin with "java"
4486	//
4487	if (propName.compare(u"javaDefined", -`1`) == `0`) {
4488	addCategory(set.getAlias(), U_GC_CN_MASK, status);
4489	set ->complement();
4490	}
4491	else if (propName.compare(u"javaDigit", -`1`) == `0`) {
4492	addCategory(set.getAlias(), U_GC_ND_MASK, status);
4493	}
4494	else if (propName.compare(u"javaIdentifierIgnorable", -`1`) == `0`) {
4495	addIdentifierIgnorable(set.getAlias(), status);
4496	}
4497	else if (propName.compare(u"javaISOControl", -`1`) == `0`) {
4498	set ->add(`0`, `0x1F`).add(`0x7F`, `0x9F`);
4499	}
4500	else if (propName.compare(u"javaJavaIdentifierPart", -`1`) == `0`) {
4501	addCategory(set.getAlias(), U_GC_L_MASK, status);
4502	addCategory(set.getAlias(), U_GC_SC_MASK, status);
4503	addCategory(set.getAlias(), U_GC_PC_MASK, status);
4504	addCategory(set.getAlias(), U_GC_ND_MASK, status);
4505	addCategory(set.getAlias(), U_GC_NL_MASK, status);
4506	addCategory(set.getAlias(), U_GC_MC_MASK, status);
4507	addCategory(set.getAlias(), U_GC_MN_MASK, status);
4508	addIdentifierIgnorable(set.getAlias(), status);
4509	}
4510	else if (propName.compare(u"javaJavaIdentifierStart", -`1`) == `0`) {
4511	addCategory(set.getAlias(), U_GC_L_MASK, status);
4512	addCategory(set.getAlias(), U_GC_NL_MASK, status);
4513	addCategory(set.getAlias(), U_GC_SC_MASK, status);
4514	addCategory(set.getAlias(), U_GC_PC_MASK, status);
4515	}
4516	else if (propName.compare(u"javaLetter", -`1`) == `0`) {
4517	addCategory(set.getAlias(), U_GC_L_MASK, status);
4518	}
4519	else if (propName.compare(u"javaLetterOrDigit", -`1`) == `0`) {
4520	addCategory(set.getAlias(), U_GC_L_MASK, status);
4521	addCategory(set.getAlias(), U_GC_ND_MASK, status);
4522	}
4523	else if (propName.compare(u"javaLowerCase", -`1`) == `0`) {
4524	addCategory(set.getAlias(), U_GC_LL_MASK, status);
4525	}
4526	else if (propName.compare(u"javaMirrored", -`1`) == `0`) {
4527	set ->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, `1`, status);
4528	}
4529	else if (propName.compare(u"javaSpaceChar", -`1`) == `0`) {
4530	addCategory(set.getAlias(), U_GC_Z_MASK, status);
4531	}
4532	else if (propName.compare(u"javaSupplementaryCodePoint", -`1`) == `0`) {
4533	set ->add(`0x10000`, UnicodeSet::MAX_VALUE);
4534	}
4535	else if (propName.compare(u"javaTitleCase", -`1`) == `0`) {
4536	addCategory(set.getAlias(), U_GC_LT_MASK, status);
4537	}
4538	else if (propName.compare(u"javaUnicodeIdentifierStart", -`1`) == `0`) {
4539	addCategory(set.getAlias(), U_GC_L_MASK, status);
4540	addCategory(set.getAlias(), U_GC_NL_MASK, status);
4541	}
4542	else if (propName.compare(u"javaUnicodeIdentifierPart", -`1`) == `0`) {
4543	addCategory(set.getAlias(), U_GC_L_MASK, status);
4544	addCategory(set.getAlias(), U_GC_PC_MASK, status);
4545	addCategory(set.getAlias(), U_GC_ND_MASK, status);
4546	addCategory(set.getAlias(), U_GC_NL_MASK, status);
4547	addCategory(set.getAlias(), U_GC_MC_MASK, status);
4548	addCategory(set.getAlias(), U_GC_MN_MASK, status);
4549	addIdentifierIgnorable(set.getAlias(), status);
4550	}
4551	else if (propName.compare(u"javaUpperCase", -`1`) == `0`) {
4552	addCategory(set.getAlias(), U_GC_LU_MASK, status);
4553	}
4554	else if (propName.compare(u"javaValidCodePoint", -`1`) == `0`) {
4555	set ->add(`0`, UnicodeSet::MAX_VALUE);
4556	}
4557	else if (propName.compare(u"javaWhitespace", -`1`) == `0`) {
4558	addCategory(set.getAlias(), U_GC_Z_MASK, status);
4559	set ->removeAll(UnicodeSet ().add(`0xa0`).add(`0x2007`).add(`0x202f`));
4560	set ->add(`9`, `0x0d`).add(`0x1c`, `0x1f`);
4561	} else {
4562	status = U_REGEX_PROPERTY_SYNTAX;
4563	}
4564
4565	if (U_SUCCESS(status) && !set ->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4566	set ->closeOver(USET_CASE_INSENSITIVE);
4567	}
4568	break;
4569	}
4570
4571	// Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4572	// extensions matched it.
4573	status = U_REGEX_PROPERTY_SYNTAX;
4574	} while (false); // End of do loop block. Code above breaks out of the block on success or hard failure.
4575
4576	if (U_SUCCESS(status)) {
4577	U_ASSERT(set.isValid());
4578	if (negated) {
4579	set ->complement();
4580	}
4581	return set.orphan();
4582	} else {
4583	if (status == U_ILLEGAL_ARGUMENT_ERROR) {
4584	status = U_REGEX_PROPERTY_SYNTAX;
4585	}
4586	error(status);
4587	return nullptr;
4588	}
4589	}
4590
4591
4592	//
4593	// SetEval Part of the evaluation of [set expressions].
4594	// Perform any pending (stacked) operations with precedence
4595	// equal or greater to that of the next operator encountered
4596	// in the expression.
4597	//
4598	void RegexCompile::setEval(int32_t nextOp) {
4599	UnicodeSet *rightOperand = NULL;
4600	UnicodeSet *leftOperand = NULL;
4601	for (;;) {
4602	U_ASSERT(fSetOpStack.empty()==FALSE);
4603	int32_t pendingSetOperation = fSetOpStack.peeki();
4604	if ((pendingSetOperation&`0xffff0000`) < (nextOp&`0xffff0000`)) {
4605	break;
4606	}
4607	fSetOpStack.popi();
4608	U_ASSERT(fSetStack.empty() == FALSE);
4609	rightOperand = (UnicodeSet *)fSetStack.peek();
4610	switch (pendingSetOperation) {
4611	case setNegation:
4612	rightOperand->complement();
4613	break;
4614	case setCaseClose:
4615	// TODO: need a simple close function. Ticket 6065
4616	rightOperand->closeOver(USET_CASE_INSENSITIVE);
4617	rightOperand->removeAllStrings();
4618	break;
4619	case setDifference1:
4620	case setDifference2:
4621	fSetStack.pop();
4622	leftOperand = (UnicodeSet *)fSetStack.peek();
4623	leftOperand->removeAll(*rightOperand);
4624	delete rightOperand;
4625	break;
4626	case setIntersection1:
4627	case setIntersection2:
4628	fSetStack.pop();
4629	leftOperand = (UnicodeSet *)fSetStack.peek();
4630	leftOperand->retainAll(*rightOperand);
4631	delete rightOperand;
4632	break;
4633	case setUnion:
4634	fSetStack.pop();
4635	leftOperand = (UnicodeSet *)fSetStack.peek();
4636	leftOperand->addAll(*rightOperand);
4637	delete rightOperand;
4638	break;
4639	default:
4640	UPRV_UNREACHABLE;
4641	}
4642	}
4643	}
4644
4645	void RegexCompile::setPushOp(int32_t op) {
4646	setEval(op);
4647	fSetOpStack.push(op, *fStatus);
4648	fSetStack.push(new UnicodeSet (), *fStatus);
4649	}
4650
4651	U_NAMESPACE_END
4652	#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
4653
4654

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/regexcmp.cpp