regexcmp.cpp source code [Skia/third_party/externals/icu/source/i18n/regexcmp.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	//
4	// file: regexcmp.cpp
5	//
6	// Copyright (C) 2002-2016 International Business Machines Corporation and others.
7	// All Rights Reserved.
8	//
9	// This file contains the ICU regular expression compiler, which is responsible
10	// for processing a regular expression pattern into the compiled form that
11	// is used by the match finding engine.
12	//
13
14	#include "unicode/utypes.h"
15
16	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
17
18	#include "unicode/ustring.h"
19	#include "unicode/unistr.h"
20	#include "unicode/uniset.h"
21	#include "unicode/uchar.h"
22	#include "unicode/uchriter.h"
23	#include "unicode/parsepos.h"
24	#include "unicode/parseerr.h"
25	#include "unicode/regex.h"
26	#include "unicode/utf.h"
27	#include "unicode/utf16.h"
28	#include "patternprops.h"
29	#include "putilimp.h"
30	#include "cmemory.h"
31	#include "cstr.h"
32	#include "cstring.h"
33	#include "uvectr32.h"
34	#include "uvectr64.h"
35	#include "uassert.h"
36	#include "uinvchar.h"
37
38	#include "regeximp.h"
39	#include "regexcst.h" // Contains state table for the regex pattern parser.
40	// generated by a Perl script.
41	#include "regexcmp.h"
42	#include "regexst.h"
43	#include "regextxt.h"
44
45
46
47	U_NAMESPACE_BEGIN
48
49
50	//------------------------------------------------------------------------------
51	//
52	// Constructor.
53	//
54	//------------------------------------------------------------------------------
55	RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
56	fParenStack (status), fSetStack (status), fSetOpStack (status)
57	{
58	// Lazy init of all shared global sets (needed for init()'s empty text)
59	RegexStaticSets::initGlobals(&status);
60
61	fStatus = &status;
62
63	fRXPat = rxp;
64	fScanIndex = `0`;
65	fLastChar = -`1`;
66	fPeekChar = -`1`;
67	fLineNum = `1`;
68	fCharNum = `0`;
69	fQuoteMode = FALSE;
70	fInBackslashQuote = FALSE;
71	fModeFlags = fRXPat->fFlags \| `0x80000000`;
72	fEOLComments = TRUE;
73
74	fMatchOpenParen = -`1`;
75	fMatchCloseParen = -`1`;
76	fCaptureName = NULL;
77	fLastSetLiteral = U_SENTINEL;
78
79	if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
80	status = rxp->fDeferredStatus;
81	}
82	}
83
84	static const UChar chAmp = `0x26`; // '&'
85	static const UChar chDash = `0x2d`; // '-'
86
87
88	//------------------------------------------------------------------------------
89	//
90	// Destructor
91	//
92	//------------------------------------------------------------------------------
93	RegexCompile::~RegexCompile() {
94	delete fCaptureName; // Normally will be NULL, but can exist if pattern
95	// compilation stops with a syntax error.
96	}
97
98	static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
99	set->addAll(UnicodeSet ().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
100	}
101
102	//------------------------------------------------------------------------------
103	//
104	// Compile regex pattern. The state machine for rexexp pattern parsing is here.
105	// The state tables are hand-written in the file regexcst.txt,
106	// and converted to the form used here by a perl
107	// script regexcst.pl
108	//
109	//------------------------------------------------------------------------------
110	void RegexCompile::compile(
111	const UnicodeString &pat, // Source pat to be compiled.
112	UParseError &pp, // Error position info
113	UErrorCode &e) // Error Code
114	{
115	fRXPat->fPatternString = new UnicodeString (pat);
116	UText patternText = UTEXT_INITIALIZER;
117	utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
118
119	if (U_SUCCESS(e)) {
120	compile(&patternText, pp, e);
121	utext_close(&patternText);
122	}
123	}
124
125	//
126	// compile, UText mode
127	// All the work is actually done here.
128	//
129	void RegexCompile::compile(
130	UText pat, // Source pat to be compiled.*
131	UParseError &pp, // Error position info
132	UErrorCode &e) // Error Code
133	{
134	fStatus = &e;
135	fParseErr = &pp;
136	fStackPtr = `0`;
137	fStack[fStackPtr] = `0`;
138
139	if (U_FAILURE(*fStatus)) {
140	return;
141	}
142
143	// There should be no pattern stuff in the RegexPattern object. They can not be reused.
144	U_ASSERT(fRXPat->fPattern == NULL \|\| utext_nativeLength(fRXPat->fPattern) == `0`);
145
146	// Prepare the RegexPattern object to receive the compiled pattern.
147	fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
148	if (U_FAILURE(*fStatus)) {
149	return;
150	}
151	fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets;
152	fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8;
153
154
155	// Initialize the pattern scanning state machine
156	fPatternLength = utext_nativeLength(pat);
157	uint16_t state = `1`;
158	const RegexTableEl *tableEl;
159
160	// UREGEX_LITERAL force entire pattern to be treated as a literal string.
161	if (fModeFlags & UREGEX_LITERAL) {
162	fQuoteMode = TRUE;
163	}
164
165	nextChar(fC); // Fetch the first char from the pattern string.
166
167	//
168	// Main loop for the regex pattern parsing state machine.
169	// Runs once per state transition.
170	// Each time through optionally performs, depending on the state table,
171	// - an advance to the the next pattern char
172	// - an action to be performed.
173	// - pushing or popping a state to/from the local state return stack.
174	// file regexcst.txt is the source for the state table. The logic behind
175	// recongizing the pattern syntax is there, not here.
176	//
177	for (;;) {
178	// Bail out if anything has gone wrong.
179	// Regex pattern parsing stops on the first error encountered.
180	if (U_FAILURE(*fStatus)) {
181	break;
182	}
183
184	U_ASSERT(state != `0`);
185
186	// Find the state table element that matches the input char from the pattern, or the
187	// class of the input character. Start with the first table row for this
188	// state, then linearly scan forward until we find a row that matches the
189	// character. The last row for each state always matches all characters, so
190	// the search will stop there, if not before.
191	//
192	tableEl = &gRuleParseStateTable[state];
193	REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
194	fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
195
196	for (;;) { // loop through table rows belonging to this state, looking for one
197	// that matches the current input char.
198	REGEX_SCAN_DEBUG_PRINTF(("."));
199	if (tableEl->fCharClass < `127` && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
200	// Table row specified an individual character, not a set, and
201	// the input character is not quoted, and
202	// the input character matched it.
203	break;
204	}
205	if (tableEl->fCharClass == `255`) {
206	// Table row specified default, match anything character class.
207	break;
208	}
209	if (tableEl->fCharClass == `254` && fC.fQuoted) {
210	// Table row specified "quoted" and the char was quoted.
211	break;
212	}
213	if (tableEl->fCharClass == `253` && fC.fChar == (UChar32)-`1`) {
214	// Table row specified eof and we hit eof on the input.
215	break;
216	}
217
218	if (tableEl->fCharClass >= `128` && tableEl->fCharClass < `240` && // Table specs a char class &&
219	fC.fQuoted == FALSE && // char is not escaped &&
220	fC.fChar != (UChar32)-`1`) { // char is not EOF
221	U_ASSERT(tableEl->fCharClass <= `137`);
222	if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-`128`].contains(fC.fChar)) {
223	// Table row specified a character class, or set of characters,
224	// and the current char matches it.
225	break;
226	}
227	}
228
229	// No match on this row, advance to the next row for this state,
230	tableEl++;
231	}
232	REGEX_SCAN_DEBUG_PRINTF(("\n"));
233
234	//
235	// We've found the row of the state table that matches the current input
236	// character from the rules string.
237	// Perform any action specified by this row in the state table.
238	if (doParseActions(tableEl->fAction) == FALSE) {
239	// Break out of the state machine loop if the
240	// the action signalled some kind of error, or
241	// the action was to exit, occurs on normal end-of-rules-input.
242	break;
243	}
244
245	if (tableEl->fPushState != `0`) {
246	fStackPtr++;
247	if (fStackPtr >= kStackSize) {
248	error(U_REGEX_INTERNAL_ERROR);
249	REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
250	fStackPtr--;
251	}
252	fStack[fStackPtr] = tableEl->fPushState;
253	}
254
255	//
256	// NextChar. This is where characters are actually fetched from the pattern.
257	// Happens under control of the 'n' tag in the state table.
258	//
259	if (tableEl->fNextChar) {
260	nextChar(fC);
261	}
262
263	// Get the next state from the table entry, or from the
264	// state stack if the next state was specified as "pop".
265	if (tableEl->fNextState != `255`) {
266	state = tableEl->fNextState;
267	} else {
268	state = fStack[fStackPtr];
269	fStackPtr--;
270	if (fStackPtr < `0`) {
271	// state stack underflow
272	// This will occur if the user pattern has mis-matched parentheses,
273	// with extra close parens.
274	//
275	fStackPtr++;
276	error(U_REGEX_MISMATCHED_PAREN);
277	}
278	}
279
280	}
281
282	if (U_FAILURE(*fStatus)) {
283	// Bail out if the pattern had errors.
284	// Set stack cleanup: a successful compile would have left it empty,
285	// but errors can leave temporary sets hanging around.
286	while (!fSetStack.empty()) {
287	delete (UnicodeSet *)fSetStack.pop();
288	}
289	return;
290	}
291
292	//
293	// The pattern has now been read and processed, and the compiled code generated.
294	//
295
296	//
297	// The pattern's fFrameSize so far has accumulated the requirements for
298	// storage for capture parentheses, counters, etc. that are encountered
299	// in the pattern. Add space for the two variables that are always
300	// present in the saved state: the input string position (int64_t) and
301	// the position in the compiled pattern.
302	//
303	allocateStackData(RESTACKFRAME_HDRCOUNT);
304
305	//
306	// Optimization pass 1: NOPs, back-references, and case-folding
307	//
308	stripNOPs();
309
310	//
311	// Get bounds for the minimum and maximum length of a string that this
312	// pattern can match. Used to avoid looking for matches in strings that
313	// are too short.
314	//
315	fRXPat->fMinMatchLen = minMatchLength(`3`, fRXPat->fCompiledPat->size()-`1`);
316
317	//
318	// Optimization pass 2: match start type
319	//
320	matchStartType();
321
322	//
323	// Set up fast latin-1 range sets
324	//
325	int32_t numSets = fRXPat->fSets->size();
326	fRXPat->fSets8 = new Regex8BitSet[numSets];
327	// Null pointer check.
328	if (fRXPat->fSets8 == NULL) {
329	e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
330	return;
331	}
332	int32_t i;
333	for (i=`0`; i<numSets; i++) {
334	UnicodeSet s = (UnicodeSet )fRXPat->fSets->elementAt(i);
335	fRXPat->fSets8[i].init(s);
336	}
337
338	}
339
340
341
342
343
344	//------------------------------------------------------------------------------
345	//
346	// doParseAction Do some action during regex pattern parsing.
347	// Called by the parse state machine.
348	//
349	// Generation of the match engine PCode happens here, or
350	// in functions called from the parse actions defined here.
351	//
352	//
353	//------------------------------------------------------------------------------
354	UBool RegexCompile::doParseActions(int32_t action)
355	{
356	UBool returnVal = TRUE;
357
358	switch ((Regex_PatternParseAction)action) {
359
360	case doPatStart:
361	// Start of pattern compiles to:
362	//0 SAVE 2 Fall back to position of FAIL
363	//1 jmp 3
364	//2 FAIL Stop if we ever reach here.
365	//3 NOP Dummy, so start of pattern looks the same as
366	// the start of an ( grouping.
367	//4 NOP Resreved, will be replaced by a save if there are
368	// OR \| operators at the top level
369	appendOp(URX_STATE_SAVE, `2`);
370	appendOp(URX_JMP, `3`);
371	appendOp(URX_FAIL, `0`);
372
373	// Standard open nonCapture paren action emits the two NOPs and
374	// sets up the paren stack frame.
375	doParseActions(doOpenNonCaptureParen);
376	break;
377
378	case doPatFinish:
379	// We've scanned to the end of the pattern
380	// The end of pattern compiles to:
381	// URX_END
382	// which will stop the runtime match engine.
383	// Encountering end of pattern also behaves like a close paren,
384	// and forces fixups of the State Save at the beginning of the compiled pattern
385	// and of any OR operations at the top level.
386	//
387	handleCloseParen();
388	if (fParenStack.size() > `0`) {
389	// Missing close paren in pattern.
390	error(U_REGEX_MISMATCHED_PAREN);
391	}
392
393	// add the END operation to the compiled pattern.
394	appendOp(URX_END, `0`);
395
396	// Terminate the pattern compilation state machine.
397	returnVal = FALSE;
398	break;
399
400
401
402	case doOrOperator:
403	// Scanning a '\|', as in (A\|B)
404	{
405	// Generate code for any pending literals preceding the '\|'
406	fixLiterals(FALSE);
407
408	// Insert a SAVE operation at the start of the pattern section preceding
409	// this OR at this level. This SAVE will branch the match forward
410	// to the right hand side of the OR in the event that the left hand
411	// side fails to match and backtracks. Locate the position for the
412	// save from the location on the top of the parentheses stack.
413	int32_t savePosition = fParenStack.popi();
414	int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
415	U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location
416	op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+`1`);
417	fRXPat->fCompiledPat->setElementAt(op, savePosition);
418
419	// Append an JMP operation into the compiled pattern. The operand for
420	// the JMP will eventually be the location following the ')' for the
421	// group. This will be patched in later, when the ')' is encountered.
422	appendOp(URX_JMP, `0`);
423
424	// Push the position of the newly added JMP op onto the parentheses stack.
425	// This registers if for fixup when this block's close paren is encountered.
426	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, *fStatus);
427
428	// Append a NOP to the compiled pattern. This is the slot reserved
429	// for a SAVE in the event that there is yet another '\|' following
430	// this one.
431	appendOp(URX_NOP, `0`);
432	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, *fStatus);
433	}
434	break;
435
436
437	case doBeginNamedCapture:
438	// Scanning (?<letter.
439	// The first letter of the name will come through again under doConinueNamedCapture.
440	fCaptureName = new UnicodeString ();
441	if (fCaptureName == NULL) {
442	error(U_MEMORY_ALLOCATION_ERROR);
443	}
444	break;
445
446	case doContinueNamedCapture:
447	fCaptureName->append(fC.fChar);
448	break;
449
450	case doBadNamedCapture:
451	error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
452	break;
453
454	case doOpenCaptureParen:
455	// Open Capturing Paren, possibly named.
456	// Compile to a
457	// - NOP, which later may be replaced by a save-state if the
458	// parenthesized group gets a quantifier, followed by*
459	// - START_CAPTURE n where n is stack frame offset to the capture group variables.
460	// - NOP, which may later be replaced by a save-state if there
461	// is an '\|' alternation within the parens.
462	//
463	// Each capture group gets three slots in the save stack frame:
464	// 0: Capture Group start position (in input string being matched.)
465	// 1: Capture Group end position.
466	// 2: Start of Match-in-progress.
467	// The first two locations are for a completed capture group, and are
468	// referred to by back references and the like.
469	// The third location stores the capture start position when an START_CAPTURE is
470	// encountered. This will be promoted to a completed capture when (and if) the corresponding
471	// END_CAPTURE is encountered.
472	{
473	fixLiterals();
474	appendOp(URX_NOP, `0`);
475	int32_t varsLoc = allocateStackData(`3`); // Reserve three slots in match stack frame.
476	appendOp(URX_START_CAPTURE, varsLoc);
477	appendOp(URX_NOP, `0`);
478
479	// On the Parentheses stack, start a new frame and add the postions
480	// of the two NOPs. Depending on what follows in the pattern, the
481	// NOPs may be changed to SAVE_STATE or JMP ops, with a target
482	// address of the end of the parenthesized group.
483	fParenStack.push(fModeFlags, fStatus); // Match mode state*
484	fParenStack.push(capturing, fStatus); // Frame type.*
485	fParenStack.push(fRXPat->fCompiledPat->size()-`3`, fStatus); // The first NOP location*
486	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The second NOP loc*
487
488	// Save the mapping from group number to stack frame variable position.
489	fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
490
491	// If this is a named capture group, add the name->group number mapping.
492	if (fCaptureName != NULL) {
493	int32_t groupNumber = fRXPat->fGroupMap->size();
494	int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
495	fCaptureName = NULL; // hash table takes ownership of the name (key) string.
496	if (previousMapping > `0` && U_SUCCESS(*fStatus)) {
497	error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
498	}
499	}
500	}
501	break;
502
503	case doOpenNonCaptureParen:
504	// Open non-caputuring (grouping only) Paren.
505	// Compile to a
506	// - NOP, which later may be replaced by a save-state if the
507	// parenthesized group gets a quantifier, followed by*
508	// - NOP, which may later be replaced by a save-state if there
509	// is an '\|' alternation within the parens.
510	{
511	fixLiterals();
512	appendOp(URX_NOP, `0`);
513	appendOp(URX_NOP, `0`);
514
515	// On the Parentheses stack, start a new frame and add the postions
516	// of the two NOPs.
517	fParenStack.push(fModeFlags, fStatus); // Match mode state*
518	fParenStack.push(plain, fStatus); // Begin a new frame.*
519	fParenStack.push(fRXPat->fCompiledPat->size()-`2`, fStatus); // The first NOP location*
520	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The second NOP loc*
521	}
522	break;
523
524
525	case doOpenAtomicParen:
526	// Open Atomic Paren. (?>
527	// Compile to a
528	// - NOP, which later may be replaced if the parenthesized group
529	// has a quantifier, followed by
530	// - STO_SP save state stack position, so it can be restored at the ")"
531	// - NOP, which may later be replaced by a save-state if there
532	// is an '\|' alternation within the parens.
533	{
534	fixLiterals();
535	appendOp(URX_NOP, `0`);
536	int32_t varLoc = allocateData(`1`); // Reserve a data location for saving the state stack ptr.
537	appendOp(URX_STO_SP, varLoc);
538	appendOp(URX_NOP, `0`);
539
540	// On the Parentheses stack, start a new frame and add the postions
541	// of the two NOPs. Depending on what follows in the pattern, the
542	// NOPs may be changed to SAVE_STATE or JMP ops, with a target
543	// address of the end of the parenthesized group.
544	fParenStack.push(fModeFlags, fStatus); // Match mode state*
545	fParenStack.push(atomic, fStatus); // Frame type.*
546	fParenStack.push(fRXPat->fCompiledPat->size()-`3`, fStatus); // The first NOP*
547	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The second NOP*
548	}
549	break;
550
551
552	case doOpenLookAhead:
553	// Positive Look-ahead (?= stuff )
554	//
555	// Note: Addition of transparent input regions, with the need to
556	// restore the original regions when failing out of a lookahead
557	// block, complicated this sequence. Some conbined opcodes
558	// might make sense - or might not, lookahead aren't that common.
559	//
560	// Caution: min match length optimization knows about this
561	// sequence; don't change without making updates there too.
562	//
563	// Compiles to
564	// 1 LA_START dataLoc Saves SP, Input Pos, Active input region.
565	// 2. STATE_SAVE 4 on failure of lookahead, goto 4
566	// 3 JMP 6 continue ...
567	//
568	// 4. LA_END Look Ahead failed. Restore regions.
569	// 5. BACKTRACK and back track again.
570	//
571	// 6. NOP reserved for use by quantifiers on the block.
572	// Look-ahead can't have quantifiers, but paren stack
573	// compile time conventions require the slot anyhow.
574	// 7. NOP may be replaced if there is are '\|' ops in the block.
575	// 8. code for parenthesized stuff.
576	// 9. LA_END
577	//
578	// Four data slots are reserved, for saving state on entry to the look-around
579	// 0: stack pointer on entry.
580	// 1: input position on entry.
581	// 2: fActiveStart, the active bounds start on entry.
582	// 3: fActiveLimit, the active bounds limit on entry.
583	{
584	fixLiterals();
585	int32_t dataLoc = allocateData(`4`);
586	appendOp(URX_LA_START, dataLoc);
587	appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ `2`);
588	appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ `3`);
589	appendOp(URX_LA_END, dataLoc);
590	appendOp(URX_BACKTRACK, `0`);
591	appendOp(URX_NOP, `0`);
592	appendOp(URX_NOP, `0`);
593
594	// On the Parentheses stack, start a new frame and add the postions
595	// of the NOPs.
596	fParenStack.push(fModeFlags, fStatus); // Match mode state*
597	fParenStack.push(lookAhead, fStatus); // Frame type.*
598	fParenStack.push(fRXPat->fCompiledPat->size()-`2`, fStatus); // The first NOP location*
599	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The second NOP location*
600	}
601	break;
602
603	case doOpenLookAheadNeg:
604	// Negated Lookahead. (?! stuff )
605	// Compiles to
606	// 1. LA_START dataloc
607	// 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
608	// // which continues with the match.
609	// 3. NOP // Std. Open Paren sequence, for possible '\|'
610	// 4. code for parenthesized stuff.
611	// 5. LA_END // Cut back stack, remove saved state from step 2.
612	// 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
613	// 7. END_LA // Restore match region, in case look-ahead was using
614	// an alternate (transparent) region.
615	// Four data slots are reserved, for saving state on entry to the look-around
616	// 0: stack pointer on entry.
617	// 1: input position on entry.
618	// 2: fActiveStart, the active bounds start on entry.
619	// 3: fActiveLimit, the active bounds limit on entry.
620	{
621	fixLiterals();
622	int32_t dataLoc = allocateData(`4`);
623	appendOp(URX_LA_START, dataLoc);
624	appendOp(URX_STATE_SAVE, `0`); // dest address will be patched later.
625	appendOp(URX_NOP, `0`);
626
627	// On the Parentheses stack, start a new frame and add the postions
628	// of the StateSave and NOP.
629	fParenStack.push(fModeFlags, fStatus); // Match mode state*
630	fParenStack.push(negLookAhead, fStatus); // Frame type*
631	fParenStack.push(fRXPat->fCompiledPat->size()-`2`, fStatus); // The STATE_SAVE location*
632	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The second NOP location*
633
634	// Instructions #5 - #7 will be added when the ')' is encountered.
635	}
636	break;
637
638	case doOpenLookBehind:
639	{
640	// Compile a (?<= look-behind open paren.
641	//
642	// Compiles to
643	// 0 URX_LB_START dataLoc
644	// 1 URX_LB_CONT dataLoc
645	// 2 MinMatchLen
646	// 3 MaxMatchLen
647	// 4 URX_NOP Standard '(' boilerplate.
648	// 5 URX_NOP Reserved slot for use with '\|' ops within (block).
649	// 6 <code for LookBehind expression>
650	// 7 URX_LB_END dataLoc # Check match len, restore input len
651	// 8 URX_LA_END dataLoc # Restore stack, input pos
652	//
653	// Allocate a block of matcher data, to contain (when running a match)
654	// 0: Stack ptr on entry
655	// 1: Input Index on entry
656	// 2: fActiveStart, the active bounds start on entry.
657	// 3: fActiveLimit, the active bounds limit on entry.
658	// 4: Start index of match current match attempt.
659	// The first four items must match the layout of data for LA_START / LA_END
660
661	// Generate match code for any pending literals.
662	fixLiterals();
663
664	// Allocate data space
665	int32_t dataLoc = allocateData(`5`);
666
667	// Emit URX_LB_START
668	appendOp(URX_LB_START, dataLoc);
669
670	// Emit URX_LB_CONT
671	appendOp(URX_LB_CONT, dataLoc);
672	appendOp(URX_RESERVED_OP, `0`); // MinMatchLength. To be filled later.
673	appendOp(URX_RESERVED_OP, `0`); // MaxMatchLength. To be filled later.
674
675	// Emit the NOPs
676	appendOp(URX_NOP, `0`);
677	appendOp(URX_NOP, `0`);
678
679	// On the Parentheses stack, start a new frame and add the postions
680	// of the URX_LB_CONT and the NOP.
681	fParenStack.push(fModeFlags, fStatus); // Match mode state*
682	fParenStack.push(lookBehind, fStatus); // Frame type*
683	fParenStack.push(fRXPat->fCompiledPat->size()-`2`, fStatus); // The first NOP location*
684	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The 2nd NOP location*
685
686	// The final two instructions will be added when the ')' is encountered.
687	}
688
689	break;
690
691	case doOpenLookBehindNeg:
692	{
693	// Compile a (?<! negated look-behind open paren.
694	//
695	// Compiles to
696	// 0 URX_LB_START dataLoc # Save entry stack, input len
697	// 1 URX_LBN_CONT dataLoc # Iterate possible match positions
698	// 2 MinMatchLen
699	// 3 MaxMatchLen
700	// 4 continueLoc (9)
701	// 5 URX_NOP Standard '(' boilerplate.
702	// 6 URX_NOP Reserved slot for use with '\|' ops within (block).
703	// 7 <code for LookBehind expression>
704	// 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
705	// 9 ...
706	//
707	// Allocate a block of matcher data, to contain (when running a match)
708	// 0: Stack ptr on entry
709	// 1: Input Index on entry
710	// 2: fActiveStart, the active bounds start on entry.
711	// 3: fActiveLimit, the active bounds limit on entry.
712	// 4: Start index of match current match attempt.
713	// The first four items must match the layout of data for LA_START / LA_END
714
715	// Generate match code for any pending literals.
716	fixLiterals();
717
718	// Allocate data space
719	int32_t dataLoc = allocateData(`5`);
720
721	// Emit URX_LB_START
722	appendOp(URX_LB_START, dataLoc);
723
724	// Emit URX_LBN_CONT
725	appendOp(URX_LBN_CONT, dataLoc);
726	appendOp(URX_RESERVED_OP, `0`); // MinMatchLength. To be filled later.
727	appendOp(URX_RESERVED_OP, `0`); // MaxMatchLength. To be filled later.
728	appendOp(URX_RESERVED_OP, `0`); // Continue Loc. To be filled later.
729
730	// Emit the NOPs
731	appendOp(URX_NOP, `0`);
732	appendOp(URX_NOP, `0`);
733
734	// On the Parentheses stack, start a new frame and add the postions
735	// of the URX_LB_CONT and the NOP.
736	fParenStack.push(fModeFlags, fStatus); // Match mode state*
737	fParenStack.push(lookBehindN, fStatus); // Frame type*
738	fParenStack.push(fRXPat->fCompiledPat->size()-`2`, fStatus); // The first NOP location*
739	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The 2nd NOP location*
740
741	// The final two instructions will be added when the ')' is encountered.
742	}
743	break;
744
745	case doConditionalExpr:
746	// Conditionals such as (?(1)a:b)
747	case doPerlInline:
748	// Perl inline-condtionals. (?{perl code}a\|b) We're not perl, no way to do them.
749	error(U_REGEX_UNIMPLEMENTED);
750	break;
751
752
753	case doCloseParen:
754	handleCloseParen();
755	if (fParenStack.size() <= `0`) {
756	// Extra close paren, or missing open paren.
757	error(U_REGEX_MISMATCHED_PAREN);
758	}
759	break;
760
761	case doNOP:
762	break;
763
764
765	case doBadOpenParenType:
766	case doRuleError:
767	error(U_REGEX_RULE_SYNTAX);
768	break;
769
770
771	case doMismatchedParenErr:
772	error(U_REGEX_MISMATCHED_PAREN);
773	break;
774
775	case doPlus:
776	// Normal '+' compiles to
777	// 1. stuff to be repeated (already built)
778	// 2. jmp-sav 1
779	// 3. ...
780	//
781	// Or, if the item to be repeated can match a zero length string,
782	// 1. STO_INP_LOC data-loc
783	// 2. body of stuff to be repeated
784	// 3. JMP_SAV_X 2
785	// 4. ...
786
787	//
788	// Or, if the item to be repeated is simple
789	// 1. Item to be repeated.
790	// 2. LOOP_SR_I set number (assuming repeated item is a set ref)
791	// 3. LOOP_C stack location
792	{
793	int32_t topLoc = blockTopLoc(FALSE); // location of item #1
794	int32_t frameLoc;
795
796	// Check for simple constructs, which may get special optimized code.
797	if (topLoc == fRXPat->fCompiledPat->size() - `1`) {
798	int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
799
800	if (URX_TYPE(repeatedOp) == URX_SETREF) {
801	// Emit optimized code for [char set]+
802	appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
803	frameLoc = allocateStackData(`1`);
804	appendOp(URX_LOOP_C, frameLoc);
805	break;
806	}
807
808	if (URX_TYPE(repeatedOp) == URX_DOTANY \|\|
809	URX_TYPE(repeatedOp) == URX_DOTANY_ALL \|\|
810	URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
811	// Emit Optimized code for .+ operations.
812	int32_t loopOpI = buildOp(URX_LOOP_DOT_I, `0`);
813	if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
814	// URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
815	loopOpI \|= `1`;
816	}
817	if (fModeFlags & UREGEX_UNIX_LINES) {
818	loopOpI \|= `2`;
819	}
820	appendOp(loopOpI);
821	frameLoc = allocateStackData(`1`);
822	appendOp(URX_LOOP_C, frameLoc);
823	break;
824	}
825
826	}
827
828	// General case.
829
830	// Check for minimum match length of zero, which requires
831	// extra loop-breaking code.
832	if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-`1`) == `0`) {
833	// Zero length match is possible.
834	// Emit the code sequence that can handle it.
835	insertOp(topLoc);
836	frameLoc = allocateStackData(`1`);
837
838	int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
839	fRXPat->fCompiledPat->setElementAt(op, topLoc);
840
841	appendOp(URX_JMP_SAV_X, topLoc+`1`);
842	} else {
843	// Simpler code when the repeated body must match something non-empty
844	appendOp(URX_JMP_SAV, topLoc);
845	}
846	}
847	break;
848
849	case doNGPlus:
850	// Non-greedy '+?' compiles to
851	// 1. stuff to be repeated (already built)
852	// 2. state-save 1
853	// 3. ...
854	{
855	int32_t topLoc = blockTopLoc(FALSE);
856	appendOp(URX_STATE_SAVE, topLoc);
857	}
858	break;
859
860
861	case doOpt:
862	// Normal (greedy) ? quantifier.
863	// Compiles to
864	// 1. state save 3
865	// 2. body of optional block
866	// 3. ...
867	// Insert the state save into the compiled pattern, and we're done.
868	{
869	int32_t saveStateLoc = blockTopLoc(TRUE);
870	int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
871	fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
872	}
873	break;
874
875	case doNGOpt:
876	// Non-greedy ?? quantifier
877	// compiles to
878	// 1. jmp 4
879	// 2. body of optional block
880	// 3 jmp 5
881	// 4. state save 2
882	// 5 ...
883	// This code is less than ideal, with two jmps instead of one, because we can only
884	// insert one instruction at the top of the block being iterated.
885	{
886	int32_t jmp1_loc = blockTopLoc(TRUE);
887	int32_t jmp2_loc = fRXPat->fCompiledPat->size();
888
889	int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+`1`);
890	fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
891
892	appendOp(URX_JMP, jmp2_loc+`2`);
893
894	appendOp(URX_STATE_SAVE, jmp1_loc+`1`);
895	}
896	break;
897
898
899	case doStar:
900	// Normal (greedy) quantifier.*
901	// Compiles to
902	// 1. STATE_SAVE 4
903	// 2. body of stuff being iterated over
904	// 3. JMP_SAV 2
905	// 4. ...
906	//
907	// Or, if the body is a simple [Set],
908	// 1. LOOP_SR_I set number
909	// 2. LOOP_C stack location
910	// ...
911	//
912	// Or if this is a .*
913	// 1. LOOP_DOT_I (. matches all mode flag)
914	// 2. LOOP_C stack location
915	//
916	// Or, if the body can match a zero-length string, to inhibit infinite loops,
917	// 1. STATE_SAVE 5
918	// 2. STO_INP_LOC data-loc
919	// 3. body of stuff
920	// 4. JMP_SAV_X 2
921	// 5. ...
922	{
923	// location of item #1, the STATE_SAVE
924	int32_t topLoc = blockTopLoc(FALSE);
925	int32_t dataLoc = -`1`;
926
927	// Check for simple , where the construct being repeated*
928	// compiled to single opcode, and might be optimizable.
929	if (topLoc == fRXPat->fCompiledPat->size() - `1`) {
930	int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
931
932	if (URX_TYPE(repeatedOp) == URX_SETREF) {
933	// Emit optimized code for a [char set]*
934	int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
935	fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
936	dataLoc = allocateStackData(`1`);
937	appendOp(URX_LOOP_C, dataLoc);
938	break;
939	}
940
941	if (URX_TYPE(repeatedOp) == URX_DOTANY \|\|
942	URX_TYPE(repeatedOp) == URX_DOTANY_ALL \|\|
943	URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
944	// Emit Optimized code for . operations.*
945	int32_t loopOpI = buildOp(URX_LOOP_DOT_I, `0`);
946	if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
947	// URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
948	loopOpI \|= `1`;
949	}
950	if ((fModeFlags & UREGEX_UNIX_LINES) != `0`) {
951	loopOpI \|= `2`;
952	}
953	fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
954	dataLoc = allocateStackData(`1`);
955	appendOp(URX_LOOP_C, dataLoc);
956	break;
957	}
958	}
959
960	// Emit general case code for this *
961	// The optimizations did not apply.
962
963	int32_t saveStateLoc = blockTopLoc(TRUE);
964	int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+`1`);
965
966	// Check for minimum match length of zero, which requires
967	// extra loop-breaking code.
968	if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-`1`) == `0`) {
969	insertOp(saveStateLoc);
970	dataLoc = allocateStackData(`1`);
971
972	int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
973	fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+`1`);
974	jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+`2`);
975	}
976
977	// Locate the position in the compiled pattern where the match will continue
978	// after completing the . (4 or 5 in the comment above)*
979	int32_t continueLoc = fRXPat->fCompiledPat->size()+`1`;
980
981	// Put together the save state op and store it into the compiled code.
982	int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
983	fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
984
985	// Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
986	appendOp(jmpOp);
987	}
988	break;
989
990	case doNGStar:
991	// Non-greedy ? quantifier*
992	// compiles to
993	// 1. JMP 3
994	// 2. body of stuff being iterated over
995	// 3. STATE_SAVE 2
996	// 4 ...
997	{
998	int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
999	int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
1000	int32_t jmpOp = buildOp(URX_JMP, saveLoc);
1001	fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1002	appendOp(URX_STATE_SAVE, jmpLoc+`1`);
1003	}
1004	break;
1005
1006
1007	case doIntervalInit:
1008	// The '{' opening an interval quantifier was just scanned.
1009	// Init the counter varaiables that will accumulate the values as the digits
1010	// are scanned.
1011	fIntervalLow = `0`;
1012	fIntervalUpper = -`1`;
1013	break;
1014
1015	case doIntevalLowerDigit:
1016	// Scanned a digit from the lower value of an {lower,upper} interval
1017	{
1018	int32_t digitValue = u_charDigitValue(fC.fChar);
1019	U_ASSERT(digitValue >= `0`);
1020	int64_t val = (int64_t)fIntervalLow*`10` + digitValue;
1021	if (val > INT32_MAX) {
1022	error(U_REGEX_NUMBER_TOO_BIG);
1023	} else {
1024	fIntervalLow = (int32_t)val;
1025	}
1026	}
1027	break;
1028
1029	case doIntervalUpperDigit:
1030	// Scanned a digit from the upper value of an {lower,upper} interval
1031	{
1032	if (fIntervalUpper < `0`) {
1033	fIntervalUpper = `0`;
1034	}
1035	int32_t digitValue = u_charDigitValue(fC.fChar);
1036	U_ASSERT(digitValue >= `0`);
1037	int64_t val = (int64_t)fIntervalUpper*`10` + digitValue;
1038	if (val > INT32_MAX) {
1039	error(U_REGEX_NUMBER_TOO_BIG);
1040	} else {
1041	fIntervalUpper = (int32_t)val;
1042	}
1043	}
1044	break;
1045
1046	case doIntervalSame:
1047	// Scanned a single value interval like {27}. Upper = Lower.
1048	fIntervalUpper = fIntervalLow;
1049	break;
1050
1051	case doInterval:
1052	// Finished scanning a normal {lower,upper} interval. Generate the code for it.
1053	if (compileInlineInterval() == FALSE) {
1054	compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1055	}
1056	break;
1057
1058	case doPossessiveInterval:
1059	// Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
1060	{
1061	// Remember the loc for the top of the block being looped over.
1062	// (Can not reserve a slot in the compiled pattern at this time, because
1063	// compileInterval needs to reserve also, and blockTopLoc can only reserve
1064	// once per block.)
1065	int32_t topLoc = blockTopLoc(FALSE);
1066
1067	// Produce normal looping code.
1068	compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1069
1070	// Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1071	// just as if the loop was inclosed in atomic parentheses.
1072
1073	// First the STO_SP before the start of the loop
1074	insertOp(topLoc);
1075
1076	int32_t varLoc = allocateData(`1`); // Reserve a data location for saving the
1077	int32_t op = buildOp(URX_STO_SP, varLoc);
1078	fRXPat->fCompiledPat->setElementAt(op, topLoc);
1079
1080	int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1081	U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1082	loopOp++; // point LoopOp after the just-inserted STO_SP
1083	fRXPat->fCompiledPat->push(loopOp, *fStatus);
1084
1085	// Then the LD_SP after the end of the loop
1086	appendOp(URX_LD_SP, varLoc);
1087	}
1088
1089	break;
1090
1091	case doNGInterval:
1092	// Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1093	compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1094	break;
1095
1096	case doIntervalError:
1097	error(U_REGEX_BAD_INTERVAL);
1098	break;
1099
1100	case doLiteralChar:
1101	// We've just scanned a "normal" character from the pattern,
1102	literalChar(fC.fChar);
1103	break;
1104
1105
1106	case doEscapedLiteralChar:
1107	// We've just scanned an backslashed escaped character with no
1108	// special meaning. It represents itself.
1109	if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != `0` &&
1110	((fC.fChar >= `0x41` && fC.fChar<= `0x5A`) \|\| // in [A-Z]
1111	(fC.fChar >= `0x61` && fC.fChar <= `0x7a`))) { // in [a-z]
1112	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1113	}
1114	literalChar(fC.fChar);
1115	break;
1116
1117
1118	case doDotAny:
1119	// scanned a ".", match any single character.
1120	{
1121	fixLiterals(FALSE);
1122	if (fModeFlags & UREGEX_DOTALL) {
1123	appendOp(URX_DOTANY_ALL, `0`);
1124	} else if (fModeFlags & UREGEX_UNIX_LINES) {
1125	appendOp(URX_DOTANY_UNIX, `0`);
1126	} else {
1127	appendOp(URX_DOTANY, `0`);
1128	}
1129	}
1130	break;
1131
1132	case doCaret:
1133	{
1134	fixLiterals(FALSE);
1135	if ( (fModeFlags & UREGEX_MULTILINE) == `0` && (fModeFlags & UREGEX_UNIX_LINES) == `0`) {
1136	appendOp(URX_CARET, `0`);
1137	} else if ((fModeFlags & UREGEX_MULTILINE) != `0` && (fModeFlags & UREGEX_UNIX_LINES) == `0`) {
1138	appendOp(URX_CARET_M, `0`);
1139	} else if ((fModeFlags & UREGEX_MULTILINE) == `0` && (fModeFlags & UREGEX_UNIX_LINES) != `0`) {
1140	appendOp(URX_CARET, `0`); // Only testing true start of input.
1141	} else if ((fModeFlags & UREGEX_MULTILINE) != `0` && (fModeFlags & UREGEX_UNIX_LINES) != `0`) {
1142	appendOp(URX_CARET_M_UNIX, `0`);
1143	}
1144	}
1145	break;
1146
1147	case doDollar:
1148	{
1149	fixLiterals(FALSE);
1150	if ( (fModeFlags & UREGEX_MULTILINE) == `0` && (fModeFlags & UREGEX_UNIX_LINES) == `0`) {
1151	appendOp(URX_DOLLAR, `0`);
1152	} else if ((fModeFlags & UREGEX_MULTILINE) != `0` && (fModeFlags & UREGEX_UNIX_LINES) == `0`) {
1153	appendOp(URX_DOLLAR_M, `0`);
1154	} else if ((fModeFlags & UREGEX_MULTILINE) == `0` && (fModeFlags & UREGEX_UNIX_LINES) != `0`) {
1155	appendOp(URX_DOLLAR_D, `0`);
1156	} else if ((fModeFlags & UREGEX_MULTILINE) != `0` && (fModeFlags & UREGEX_UNIX_LINES) != `0`) {
1157	appendOp(URX_DOLLAR_MD, `0`);
1158	}
1159	}
1160	break;
1161
1162	case doBackslashA:
1163	fixLiterals(FALSE);
1164	appendOp(URX_CARET, `0`);
1165	break;
1166
1167	case doBackslashB:
1168	{
1169	#if UCONFIG_NO_BREAK_ITERATION==1
1170	if (fModeFlags & UREGEX_UWORD) {
1171	error(U_UNSUPPORTED_ERROR);
1172	}
1173	#endif
1174	fixLiterals(FALSE);
1175	int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1176	appendOp(op, `1`);
1177	}
1178	break;
1179
1180	case doBackslashb:
1181	{
1182	#if UCONFIG_NO_BREAK_ITERATION==1
1183	if (fModeFlags & UREGEX_UWORD) {
1184	error(U_UNSUPPORTED_ERROR);
1185	}
1186	#endif
1187	fixLiterals(FALSE);
1188	int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1189	appendOp(op, `0`);
1190	}
1191	break;
1192
1193	case doBackslashD:
1194	fixLiterals(FALSE);
1195	appendOp(URX_BACKSLASH_D, `1`);
1196	break;
1197
1198	case doBackslashd:
1199	fixLiterals(FALSE);
1200	appendOp(URX_BACKSLASH_D, `0`);
1201	break;
1202
1203	case doBackslashG:
1204	fixLiterals(FALSE);
1205	appendOp(URX_BACKSLASH_G, `0`);
1206	break;
1207
1208	case doBackslashH:
1209	fixLiterals(FALSE);
1210	appendOp(URX_BACKSLASH_H, `1`);
1211	break;
1212
1213	case doBackslashh:
1214	fixLiterals(FALSE);
1215	appendOp(URX_BACKSLASH_H, `0`);
1216	break;
1217
1218	case doBackslashR:
1219	fixLiterals(FALSE);
1220	appendOp(URX_BACKSLASH_R, `0`);
1221	break;
1222
1223	case doBackslashS:
1224	fixLiterals(FALSE);
1225	appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1226	break;
1227
1228	case doBackslashs:
1229	fixLiterals(FALSE);
1230	appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1231	break;
1232
1233	case doBackslashV:
1234	fixLiterals(FALSE);
1235	appendOp(URX_BACKSLASH_V, `1`);
1236	break;
1237
1238	case doBackslashv:
1239	fixLiterals(FALSE);
1240	appendOp(URX_BACKSLASH_V, `0`);
1241	break;
1242
1243	case doBackslashW:
1244	fixLiterals(FALSE);
1245	appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1246	break;
1247
1248	case doBackslashw:
1249	fixLiterals(FALSE);
1250	appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1251	break;
1252
1253	case doBackslashX:
1254	fixLiterals(FALSE);
1255	appendOp(URX_BACKSLASH_X, `0`);
1256	break;
1257
1258
1259	case doBackslashZ:
1260	fixLiterals(FALSE);
1261	appendOp(URX_DOLLAR, `0`);
1262	break;
1263
1264	case doBackslashz:
1265	fixLiterals(FALSE);
1266	appendOp(URX_BACKSLASH_Z, `0`);
1267	break;
1268
1269	case doEscapeError:
1270	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1271	break;
1272
1273	case doExit:
1274	fixLiterals(FALSE);
1275	returnVal = FALSE;
1276	break;
1277
1278	case doProperty:
1279	{
1280	fixLiterals(FALSE);
1281	UnicodeSet *theSet = scanProp();
1282	compileSet(theSet);
1283	}
1284	break;
1285
1286	case doNamedChar:
1287	{
1288	UChar32 c = scanNamedChar();
1289	literalChar(c);
1290	}
1291	break;
1292
1293
1294	case doBackRef:
1295	// BackReference. Somewhat unusual in that the front-end can not completely parse
1296	// the regular expression, because the number of digits to be consumed
1297	// depends on the number of capture groups that have been defined. So
1298	// we have to do it here instead.
1299	{
1300	int32_t numCaptureGroups = fRXPat->fGroupMap->size();
1301	int32_t groupNum = `0`;
1302	UChar32 c = fC.fChar;
1303
1304	for (;;) {
1305	// Loop once per digit, for max allowed number of digits in a back reference.
1306	int32_t digit = u_charDigitValue(c);
1307	groupNum = groupNum * `10` + digit;
1308	if (groupNum >= numCaptureGroups) {
1309	break;
1310	}
1311	c = peekCharLL();
1312	if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1313	break;
1314	}
1315	nextCharLL();
1316	}
1317
1318	// Scan of the back reference in the source regexp is complete. Now generate
1319	// the compiled code for it.
1320	// Because capture groups can be forward-referenced by back-references,
1321	// we fill the operand with the capture group number. At the end
1322	// of compilation, it will be changed to the variable's location.
1323	U_ASSERT(groupNum > `0`); // Shouldn't happen. '\0' begins an octal escape sequence,
1324	// and shouldn't enter this code path at all.
1325	fixLiterals(FALSE);
1326	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1327	appendOp(URX_BACKREF_I, groupNum);
1328	} else {
1329	appendOp(URX_BACKREF, groupNum);
1330	}
1331	}
1332	break;
1333
1334	case doBeginNamedBackRef:
1335	U_ASSERT(fCaptureName == NULL);
1336	fCaptureName = new UnicodeString;
1337	if (fCaptureName == NULL) {
1338	error(U_MEMORY_ALLOCATION_ERROR);
1339	}
1340	break;
1341
1342	case doContinueNamedBackRef:
1343	fCaptureName->append(fC.fChar);
1344	break;
1345
1346	case doCompleteNamedBackRef:
1347	{
1348	int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
1349	if (groupNumber == `0`) {
1350	// Group name has not been defined.
1351	// Could be a forward reference. If we choose to support them at some
1352	// future time, extra mechanism will be required at this point.
1353	error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1354	} else {
1355	// Given the number, handle identically to a \n numbered back reference.
1356	// See comments above, under doBackRef
1357	fixLiterals(FALSE);
1358	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1359	appendOp(URX_BACKREF_I, groupNumber);
1360	} else {
1361	appendOp(URX_BACKREF, groupNumber);
1362	}
1363	}
1364	delete fCaptureName;
1365	fCaptureName = NULL;
1366	break;
1367	}
1368
1369	case doPossessivePlus:
1370	// Possessive ++ quantifier.
1371	// Compiles to
1372	// 1. STO_SP
1373	// 2. body of stuff being iterated over
1374	// 3. STATE_SAVE 5
1375	// 4. JMP 2
1376	// 5. LD_SP
1377	// 6. ...
1378	//
1379	// Note: TODO: This is pretty inefficient. A mass of saved state is built up
1380	// then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056
1381	//
1382	{
1383	// Emit the STO_SP
1384	int32_t topLoc = blockTopLoc(TRUE);
1385	int32_t stoLoc = allocateData(`1`); // Reserve the data location for storing save stack ptr.
1386	int32_t op = buildOp(URX_STO_SP, stoLoc);
1387	fRXPat->fCompiledPat->setElementAt(op, topLoc);
1388
1389	// Emit the STATE_SAVE
1390	appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+`2`);
1391
1392	// Emit the JMP
1393	appendOp(URX_JMP, topLoc+`1`);
1394
1395	// Emit the LD_SP
1396	appendOp(URX_LD_SP, stoLoc);
1397	}
1398	break;
1399
1400	case doPossessiveStar:
1401	// Possessive + quantifier.*
1402	// Compiles to
1403	// 1. STO_SP loc
1404	// 2. STATE_SAVE 5
1405	// 3. body of stuff being iterated over
1406	// 4. JMP 2
1407	// 5. LD_SP loc
1408	// 6 ...
1409	// TODO: do something to cut back the state stack each time through the loop.
1410	{
1411	// Reserve two slots at the top of the block.
1412	int32_t topLoc = blockTopLoc(TRUE);
1413	insertOp(topLoc);
1414
1415	// emit STO_SP loc
1416	int32_t stoLoc = allocateData(`1`); // Reserve the data location for storing save stack ptr.
1417	int32_t op = buildOp(URX_STO_SP, stoLoc);
1418	fRXPat->fCompiledPat->setElementAt(op, topLoc);
1419
1420	// Emit the SAVE_STATE 5
1421	int32_t L7 = fRXPat->fCompiledPat->size()+`1`;
1422	op = buildOp(URX_STATE_SAVE, L7);
1423	fRXPat->fCompiledPat->setElementAt(op, topLoc+`1`);
1424
1425	// Append the JMP operation.
1426	appendOp(URX_JMP, topLoc+`1`);
1427
1428	// Emit the LD_SP loc
1429	appendOp(URX_LD_SP, stoLoc);
1430	}
1431	break;
1432
1433	case doPossessiveOpt:
1434	// Possessive ?+ quantifier.
1435	// Compiles to
1436	// 1. STO_SP loc
1437	// 2. SAVE_STATE 5
1438	// 3. body of optional block
1439	// 4. LD_SP loc
1440	// 5. ...
1441	//
1442	{
1443	// Reserve two slots at the top of the block.
1444	int32_t topLoc = blockTopLoc(TRUE);
1445	insertOp(topLoc);
1446
1447	// Emit the STO_SP
1448	int32_t stoLoc = allocateData(`1`); // Reserve the data location for storing save stack ptr.
1449	int32_t op = buildOp(URX_STO_SP, stoLoc);
1450	fRXPat->fCompiledPat->setElementAt(op, topLoc);
1451
1452	// Emit the SAVE_STATE
1453	int32_t continueLoc = fRXPat->fCompiledPat->size()+`1`;
1454	op = buildOp(URX_STATE_SAVE, continueLoc);
1455	fRXPat->fCompiledPat->setElementAt(op, topLoc+`1`);
1456
1457	// Emit the LD_SP
1458	appendOp(URX_LD_SP, stoLoc);
1459	}
1460	break;
1461
1462
1463	case doBeginMatchMode:
1464	fNewModeFlags = fModeFlags;
1465	fSetModeFlag = TRUE;
1466	break;
1467
1468	case doMatchMode: // (?i) and similar
1469	{
1470	int32_t bit = `0`;
1471	switch (fC.fChar) {
1472	case `0x69`: / 'i' / bit = UREGEX_CASE_INSENSITIVE; break;
1473	case `0x64`: / 'd' / bit = UREGEX_UNIX_LINES; break;
1474	case `0x6d`: / 'm' / bit = UREGEX_MULTILINE; break;
1475	case `0x73`: / 's' / bit = UREGEX_DOTALL; break;
1476	case `0x75`: / 'u' / bit = `0`; / Unicode casing / break;
1477	case `0x77`: / 'w' / bit = UREGEX_UWORD; break;
1478	case `0x78`: / 'x' / bit = UREGEX_COMMENTS; break;
1479	case `0x2d`: / '-' / fSetModeFlag = FALSE; break;
1480	default:
1481	UPRV_UNREACHABLE; // Should never happen. Other chars are filtered out
1482	// by the scanner.
1483	}
1484	if (fSetModeFlag) {
1485	fNewModeFlags \|= bit;
1486	} else {
1487	fNewModeFlags &= ~bit;
1488	}
1489	}
1490	break;
1491
1492	case doSetMatchMode:
1493	// Emit code to match any pending literals, using the not-yet changed match mode.
1494	fixLiterals();
1495
1496	// We've got a (?i) or similar. The match mode is being changed, but
1497	// the change is not scoped to a parenthesized block.
1498	U_ASSERT(fNewModeFlags < `0`);
1499	fModeFlags = fNewModeFlags;
1500
1501	break;
1502
1503
1504	case doMatchModeParen:
1505	// We've got a (?i: or similar. Begin a parenthesized block, save old
1506	// mode flags so they can be restored at the close of the block.
1507	//
1508	// Compile to a
1509	// - NOP, which later may be replaced by a save-state if the
1510	// parenthesized group gets a quantifier, followed by*
1511	// - NOP, which may later be replaced by a save-state if there
1512	// is an '\|' alternation within the parens.
1513	{
1514	fixLiterals(FALSE);
1515	appendOp(URX_NOP, `0`);
1516	appendOp(URX_NOP, `0`);
1517
1518	// On the Parentheses stack, start a new frame and add the postions
1519	// of the two NOPs (a normal non-capturing () frame, except for the
1520	// saving of the orignal mode flags.)
1521	fParenStack.push(fModeFlags, *fStatus);
1522	fParenStack.push(flags, fStatus); // Frame Marker*
1523	fParenStack.push(fRXPat->fCompiledPat->size()-`2`, fStatus); // The first NOP*
1524	fParenStack.push(fRXPat->fCompiledPat->size()-`1`, fStatus); // The second NOP*
1525
1526	// Set the current mode flags to the new values.
1527	U_ASSERT(fNewModeFlags < `0`);
1528	fModeFlags = fNewModeFlags;
1529	}
1530	break;
1531
1532	case doBadModeFlag:
1533	error(U_REGEX_INVALID_FLAG);
1534	break;
1535
1536	case doSuppressComments:
1537	// We have just scanned a '(?'. We now need to prevent the character scanner from
1538	// treating a '#' as a to-the-end-of-line comment.
1539	// (This Perl compatibility just gets uglier and uglier to do...)
1540	fEOLComments = FALSE;
1541	break;
1542
1543
1544	case doSetAddAmp:
1545	{
1546	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1547	set->add(chAmp);
1548	}
1549	break;
1550
1551	case doSetAddDash:
1552	{
1553	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1554	set->add(chDash);
1555	}
1556	break;
1557
1558	case doSetBackslash_s:
1559	{
1560	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1561	set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1562	break;
1563	}
1564
1565	case doSetBackslash_S:
1566	{
1567	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1568	UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1569	SSet.complement();
1570	set->addAll(SSet);
1571	break;
1572	}
1573
1574	case doSetBackslash_d:
1575	{
1576	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1577	// TODO - make a static set, ticket 6058.
1578	addCategory(set, U_GC_ND_MASK, *fStatus);
1579	break;
1580	}
1581
1582	case doSetBackslash_D:
1583	{
1584	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1585	UnicodeSet digits;
1586	// TODO - make a static set, ticket 6058.
1587	digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1588	digits.complement();
1589	set->addAll(digits);
1590	break;
1591	}
1592
1593	case doSetBackslash_h:
1594	{
1595	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1596	UnicodeSet h;
1597	h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1598	h.add((UChar32)`9`); // Tab
1599	set->addAll(h);
1600	break;
1601	}
1602
1603	case doSetBackslash_H:
1604	{
1605	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1606	UnicodeSet h;
1607	h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1608	h.add((UChar32)`9`); // Tab
1609	h.complement();
1610	set->addAll(h);
1611	break;
1612	}
1613
1614	case doSetBackslash_v:
1615	{
1616	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1617	set->add((UChar32)`0x0a`, (UChar32)`0x0d`); // add range
1618	set->add((UChar32)`0x85`);
1619	set->add((UChar32)`0x2028`, (UChar32)`0x2029`);
1620	break;
1621	}
1622
1623	case doSetBackslash_V:
1624	{
1625	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1626	UnicodeSet v;
1627	v.add((UChar32)`0x0a`, (UChar32)`0x0d`); // add range
1628	v.add((UChar32)`0x85`);
1629	v.add((UChar32)`0x2028`, (UChar32)`0x2029`);
1630	v.complement();
1631	set->addAll(v);
1632	break;
1633	}
1634
1635	case doSetBackslash_w:
1636	{
1637	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1638	set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1639	break;
1640	}
1641
1642	case doSetBackslash_W:
1643	{
1644	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1645	UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1646	SSet.complement();
1647	set->addAll(SSet);
1648	break;
1649	}
1650
1651	case doSetBegin:
1652	fixLiterals(FALSE);
1653	fSetStack.push(new UnicodeSet (), *fStatus);
1654	fSetOpStack.push(setStart, *fStatus);
1655	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != `0`) {
1656	fSetOpStack.push(setCaseClose, *fStatus);
1657	}
1658	break;
1659
1660	case doSetBeginDifference1:
1661	// We have scanned something like [[abc]-[
1662	// Set up a new UnicodeSet for the set beginning with the just-scanned '['
1663	// Push a Difference operator, which will cause the new set to be subtracted from what
1664	// went before once it is created.
1665	setPushOp(setDifference1);
1666	fSetOpStack.push(setStart, *fStatus);
1667	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != `0`) {
1668	fSetOpStack.push(setCaseClose, *fStatus);
1669	}
1670	break;
1671
1672	case doSetBeginIntersection1:
1673	// We have scanned something like [[abc]&[
1674	// Need both the '&' operator and the open '[' operator.
1675	setPushOp(setIntersection1);
1676	fSetOpStack.push(setStart, *fStatus);
1677	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != `0`) {
1678	fSetOpStack.push(setCaseClose, *fStatus);
1679	}
1680	break;
1681
1682	case doSetBeginUnion:
1683	// We have scanned something like [[abc][
1684	// Need to handle the union operation explicitly [[abc] \| [
1685	setPushOp(setUnion);
1686	fSetOpStack.push(setStart, *fStatus);
1687	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != `0`) {
1688	fSetOpStack.push(setCaseClose, *fStatus);
1689	}
1690	break;
1691
1692	case doSetDifference2:
1693	// We have scanned something like [abc--
1694	// Consider this to unambiguously be a set difference operator.
1695	setPushOp(setDifference2);
1696	break;
1697
1698	case doSetEnd:
1699	// Have encountered the ']' that closes a set.
1700	// Force the evaluation of any pending operations within this set,
1701	// leave the completed set on the top of the set stack.
1702	setEval(setEnd);
1703	U_ASSERT(fSetOpStack.peeki()==setStart);
1704	fSetOpStack.popi();
1705	break;
1706
1707	case doSetFinish:
1708	{
1709	// Finished a complete set expression, including all nested sets.
1710	// The close bracket has already triggered clearing out pending set operators,
1711	// the operator stack should be empty and the operand stack should have just
1712	// one entry, the result set.
1713	U_ASSERT(fSetOpStack.empty());
1714	UnicodeSet theSet = (UnicodeSet )fSetStack.pop();
1715	U_ASSERT(fSetStack.empty());
1716	compileSet(theSet);
1717	break;
1718	}
1719
1720	case doSetIntersection2:
1721	// Have scanned something like [abc&&
1722	setPushOp(setIntersection2);
1723	break;
1724
1725	case doSetLiteral:
1726	// Union the just-scanned literal character into the set being built.
1727	// This operation is the highest precedence set operation, so we can always do
1728	// it immediately, without waiting to see what follows. It is necessary to perform
1729	// any pending '-' or '&' operation first, because these have the same precedence
1730	// as union-ing in a literal'
1731	{
1732	setEval(setUnion);
1733	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1734	s->add(fC.fChar);
1735	fLastSetLiteral = fC.fChar;
1736	break;
1737	}
1738
1739	case doSetLiteralEscaped:
1740	// A back-slash escaped literal character was encountered.
1741	// Processing is the same as with setLiteral, above, with the addition of
1742	// the optional check for errors on escaped ASCII letters.
1743	{
1744	if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != `0` &&
1745	((fC.fChar >= `0x41` && fC.fChar<= `0x5A`) \|\| // in [A-Z]
1746	(fC.fChar >= `0x61` && fC.fChar <= `0x7a`))) { // in [a-z]
1747	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1748	}
1749	setEval(setUnion);
1750	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1751	s->add(fC.fChar);
1752	fLastSetLiteral = fC.fChar;
1753	break;
1754	}
1755
1756	case doSetNamedChar:
1757	// Scanning a \N{UNICODE CHARACTER NAME}
1758	// Aside from the source of the character, the processing is identical to doSetLiteral,
1759	// above.
1760	{
1761	UChar32 c = scanNamedChar();
1762	setEval(setUnion);
1763	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1764	s->add(c);
1765	fLastSetLiteral = c;
1766	break;
1767	}
1768
1769	case doSetNamedRange:
1770	// We have scanned literal-\N{CHAR NAME}. Add the range to the set.
1771	// The left character is already in the set, and is saved in fLastSetLiteral.
1772	// The right side needs to be picked up, the scan is at the 'N'.
1773	// Lower Limit > Upper limit being an error matches both Java
1774	// and ICU UnicodeSet behavior.
1775	{
1776	UChar32 c = scanNamedChar();
1777	if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL \|\| fLastSetLiteral > c)) {
1778	error(U_REGEX_INVALID_RANGE);
1779	}
1780	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1781	s->add(fLastSetLiteral, c);
1782	fLastSetLiteral = c;
1783	break;
1784	}
1785
1786
1787	case doSetNegate:
1788	// Scanned a '^' at the start of a set.
1789	// Push the negation operator onto the set op stack.
1790	// A twist for case-insensitive matching:
1791	// the case closure operation must happen _before_ negation.
1792	// But the case closure operation will already be on the stack if it's required.
1793	// This requires checking for case closure, and swapping the stack order
1794	// if it is present.
1795	{
1796	int32_t tosOp = fSetOpStack.peeki();
1797	if (tosOp == setCaseClose) {
1798	fSetOpStack.popi();
1799	fSetOpStack.push(setNegation, *fStatus);
1800	fSetOpStack.push(setCaseClose, *fStatus);
1801	} else {
1802	fSetOpStack.push(setNegation, *fStatus);
1803	}
1804	}
1805	break;
1806
1807	case doSetNoCloseError:
1808	error(U_REGEX_MISSING_CLOSE_BRACKET);
1809	break;
1810
1811	case doSetOpError:
1812	error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal.
1813	break;
1814
1815	case doSetPosixProp:
1816	{
1817	UnicodeSet *s = scanPosixProp();
1818	if (s != NULL) {
1819	UnicodeSet tos = (UnicodeSet )fSetStack.peek();
1820	tos->addAll(*s);
1821	delete s;
1822	} // else error. scanProp() reported the error status already.
1823	}
1824	break;
1825
1826	case doSetProp:
1827	// Scanned a \p \P within [brackets].
1828	{
1829	UnicodeSet *s = scanProp();
1830	if (s != NULL) {
1831	UnicodeSet tos = (UnicodeSet )fSetStack.peek();
1832	tos->addAll(*s);
1833	delete s;
1834	} // else error. scanProp() reported the error status already.
1835	}
1836	break;
1837
1838
1839	case doSetRange:
1840	// We have scanned literal-literal. Add the range to the set.
1841	// The left character is already in the set, and is saved in fLastSetLiteral.
1842	// The right side is the current character.
1843	// Lower Limit > Upper limit being an error matches both Java
1844	// and ICU UnicodeSet behavior.
1845	{
1846
1847	if (fLastSetLiteral == U_SENTINEL \|\| fLastSetLiteral > fC.fChar) {
1848	error(U_REGEX_INVALID_RANGE);
1849	}
1850	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1851	s->add(fLastSetLiteral, fC.fChar);
1852	break;
1853	}
1854
1855	default:
1856	UPRV_UNREACHABLE;
1857	}
1858
1859	if (U_FAILURE(*fStatus)) {
1860	returnVal = FALSE;
1861	}
1862
1863	return returnVal;
1864	}
1865
1866
1867
1868	//------------------------------------------------------------------------------
1869	//
1870	// literalChar We've encountered a literal character from the pattern,
1871	// or an escape sequence that reduces to a character.
1872	// Add it to the string containing all literal chars/strings from
1873	// the pattern.
1874	//
1875	//------------------------------------------------------------------------------
1876	void RegexCompile::literalChar(UChar32 c) {
1877	fLiteralChars.append(c);
1878	}
1879
1880
1881	//------------------------------------------------------------------------------
1882	//
1883	// fixLiterals When compiling something that can follow a literal
1884	// string in a pattern, emit the code to match the
1885	// accumulated literal string.
1886	//
1887	// Optionally, split the last char of the string off into
1888	// a single "ONE_CHAR" operation, so that quantifiers can
1889	// apply to that char alone. Example: abc*
1890	// The must apply to the 'c' only.*
1891	//
1892	//------------------------------------------------------------------------------
1893	void RegexCompile::fixLiterals(UBool split) {
1894
1895	// If no literal characters have been scanned but not yet had code generated
1896	// for them, nothing needs to be done.
1897	if (fLiteralChars.length() == `0`) {
1898	return;
1899	}
1900
1901	int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -`1`);
1902	UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1903
1904	// Split: We need to ensure that the last item in the compiled pattern
1905	// refers only to the last literal scanned in the pattern, so that
1906	// quantifiers (, +, etc.) affect only it, and not a longer string.*
1907	// Split before case folding for case insensitive matches.
1908
1909	if (split) {
1910	fLiteralChars.truncate(indexOfLastCodePoint);
1911	fixLiterals(FALSE); // Recursive call, emit code to match the first part of the string.
1912	// Note that the truncated literal string may be empty, in which case
1913	// nothing will be emitted.
1914
1915	literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal.
1916	fixLiterals(FALSE); // Second recursive call, code for the final code point.
1917	return;
1918	}
1919
1920	// If we are doing case-insensitive matching, case fold the string. This may expand
1921	// the string, e.g. the German sharp-s turns into "ss"
1922	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1923	fLiteralChars.foldCase();
1924	indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -`1`);
1925	lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1926	}
1927
1928	if (indexOfLastCodePoint == `0`) {
1929	// Single character, emit a URX_ONECHAR op to match it.
1930	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1931	u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1932	appendOp(URX_ONECHAR_I, lastCodePoint);
1933	} else {
1934	appendOp(URX_ONECHAR, lastCodePoint);
1935	}
1936	} else {
1937	// Two or more chars, emit a URX_STRING to match them.
1938	if (fLiteralChars.length() > `0x00ffffff` \|\| fRXPat->fLiteralText.length() > `0x00ffffff`) {
1939	error(U_REGEX_PATTERN_TOO_BIG);
1940	}
1941	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1942	appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1943	} else {
1944	// TODO here: add optimization to split case sensitive strings of length two
1945	// into two single char ops, for efficiency.
1946	appendOp(URX_STRING, fRXPat->fLiteralText.length());
1947	}
1948	appendOp(URX_STRING_LEN, fLiteralChars.length());
1949
1950	// Add this string into the accumulated strings of the compiled pattern.
1951	fRXPat->fLiteralText.append(fLiteralChars);
1952	}
1953
1954	fLiteralChars.remove();
1955	}
1956
1957
1958	int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1959	if (U_FAILURE(*fStatus)) {
1960	return `0`;
1961	}
1962	if (type < `0` \|\| type > `255`) {
1963	UPRV_UNREACHABLE;
1964	}
1965	if (val > `0x00ffffff`) {
1966	UPRV_UNREACHABLE;
1967	}
1968	if (val < `0`) {
1969	if (!(type == URX_RESERVED_OP_N \|\| type == URX_RESERVED_OP)) {
1970	UPRV_UNREACHABLE;
1971	}
1972	if (URX_TYPE(val) != `0xff`) {
1973	UPRV_UNREACHABLE;
1974	}
1975	type = URX_RESERVED_OP_N;
1976	}
1977	return (type << `24`) \| val;
1978	}
1979
1980
1981	//------------------------------------------------------------------------------
1982	//
1983	// appendOp() Append a new instruction onto the compiled pattern
1984	// Includes error checking, limiting the size of the
1985	// pattern to lengths that can be represented in the
1986	// 24 bit operand field of an instruction.
1987	//
1988	//------------------------------------------------------------------------------
1989	void RegexCompile::appendOp(int32_t op) {
1990	if (U_FAILURE(*fStatus)) {
1991	return;
1992	}
1993	fRXPat->fCompiledPat->addElement(op, *fStatus);
1994	if ((fRXPat->fCompiledPat->size() > `0x00fffff0`) && U_SUCCESS(*fStatus)) {
1995	error(U_REGEX_PATTERN_TOO_BIG);
1996	}
1997	}
1998
1999	void RegexCompile::appendOp(int32_t type, int32_t val) {
2000	appendOp(buildOp(type, val));
2001	}
2002
2003
2004	//------------------------------------------------------------------------------
2005	//
2006	// insertOp() Insert a slot for a new opcode into the already
2007	// compiled pattern code.
2008	//
2009	// Fill the slot with a NOP. Our caller will replace it
2010	// with what they really wanted.
2011	//
2012	//------------------------------------------------------------------------------
2013	void RegexCompile::insertOp(int32_t where) {
2014	UVector64 *code = fRXPat->fCompiledPat;
2015	U_ASSERT(where>`0` && where < code->size());
2016
2017	int32_t nop = buildOp(URX_NOP, `0`);
2018	code->insertElementAt(nop, where, *fStatus);
2019
2020	// Walk through the pattern, looking for any ops with targets that
2021	// were moved down by the insert. Fix them.
2022	int32_t loc;
2023	for (loc=`0`; loc<code->size(); loc++) {
2024	int32_t op = (int32_t)code->elementAti(loc);
2025	int32_t opType = URX_TYPE(op);
2026	int32_t opValue = URX_VAL(op);
2027	if ((opType == URX_JMP \|\|
2028	opType == URX_JMPX \|\|
2029	opType == URX_STATE_SAVE \|\|
2030	opType == URX_CTR_LOOP \|\|
2031	opType == URX_CTR_LOOP_NG \|\|
2032	opType == URX_JMP_SAV \|\|
2033	opType == URX_JMP_SAV_X \|\|
2034	opType == URX_RELOC_OPRND) && opValue > where) {
2035	// Target location for this opcode is after the insertion point and
2036	// needs to be incremented to adjust for the insertion.
2037	opValue++;
2038	op = buildOp(opType, opValue);
2039	code->setElementAt(op, loc);
2040	}
2041	}
2042
2043	// Now fix up the parentheses stack. All positive values in it are locations in
2044	// the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
2045	for (loc=`0`; loc<fParenStack.size(); loc++) {
2046	int32_t x = fParenStack.elementAti(loc);
2047	U_ASSERT(x < code->size());
2048	if (x>where) {
2049	x++;
2050	fParenStack.setElementAt(x, loc);
2051	}
2052	}
2053
2054	if (fMatchCloseParen > where) {
2055	fMatchCloseParen++;
2056	}
2057	if (fMatchOpenParen > where) {
2058	fMatchOpenParen++;
2059	}
2060	}
2061
2062
2063	//------------------------------------------------------------------------------
2064	//
2065	// allocateData() Allocate storage in the matcher's static data area.
2066	// Return the index for the newly allocated data.
2067	// The storage won't actually exist until we are running a match
2068	// operation, but the storage indexes are inserted into various
2069	// opcodes while compiling the pattern.
2070	//
2071	//------------------------------------------------------------------------------
2072	int32_t RegexCompile::allocateData(int32_t size) {
2073	if (U_FAILURE(*fStatus)) {
2074	return `0`;
2075	}
2076	if (size <= `0` \|\| size > `0x100` \|\| fRXPat->fDataSize < `0`) {
2077	error(U_REGEX_INTERNAL_ERROR);
2078	return `0`;
2079	}
2080	int32_t dataIndex = fRXPat->fDataSize;
2081	fRXPat->fDataSize += size;
2082	if (fRXPat->fDataSize >= `0x00fffff0`) {
2083	error(U_REGEX_INTERNAL_ERROR);
2084	}
2085	return dataIndex;
2086	}
2087
2088
2089	//------------------------------------------------------------------------------
2090	//
2091	// allocateStackData() Allocate space in the back-tracking stack frame.
2092	// Return the index for the newly allocated data.
2093	// The frame indexes are inserted into various
2094	// opcodes while compiling the pattern, meaning that frame
2095	// size must be restricted to the size that will fit
2096	// as an operand (24 bits).
2097	//
2098	//------------------------------------------------------------------------------
2099	int32_t RegexCompile::allocateStackData(int32_t size) {
2100	if (U_FAILURE(*fStatus)) {
2101	return `0`;
2102	}
2103	if (size <= `0` \|\| size > `0x100` \|\| fRXPat->fFrameSize < `0`) {
2104	error(U_REGEX_INTERNAL_ERROR);
2105	return `0`;
2106	}
2107	int32_t dataIndex = fRXPat->fFrameSize;
2108	fRXPat->fFrameSize += size;
2109	if (fRXPat->fFrameSize >= `0x00fffff0`) {
2110	error(U_REGEX_PATTERN_TOO_BIG);
2111	}
2112	return dataIndex;
2113	}
2114
2115
2116	//------------------------------------------------------------------------------
2117	//
2118	// blockTopLoc() Find or create a location in the compiled pattern
2119	// at the start of the operation or block that has
2120	// just been compiled. Needed when a quantifier ( or*
2121	// whatever) appears, and we need to add an operation
2122	// at the start of the thing being quantified.
2123	//
2124	// (Parenthesized Blocks) have a slot with a NOP that
2125	// is reserved for this purpose. . or similar don't*
2126	// and a slot needs to be added.
2127	//
2128	// parameter reserveLoc : TRUE - ensure that there is space to add an opcode
2129	// at the returned location.
2130	// FALSE - just return the address,
2131	// do not reserve a location there.
2132	//
2133	//------------------------------------------------------------------------------
2134	int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
2135	int32_t theLoc;
2136	fixLiterals(TRUE); // Emit code for any pending literals.
2137	// If last item was a string, emit separate op for the its last char.
2138	if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2139	{
2140	// The item just processed is a parenthesized block.
2141	theLoc = fMatchOpenParen; // A slot is already reserved for us.
2142	U_ASSERT(theLoc > `0`);
2143	U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2144	}
2145	else {
2146	// Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2147	// No slot for STATE_SAVE was pre-reserved in the compiled code.
2148	// We need to make space now.
2149	theLoc = fRXPat->fCompiledPat->size()-`1`;
2150	int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2151	if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2152	// Strings take two opcode, we want the position of the first one.
2153	// We can have a string at this point if a single character case-folded to two.
2154	theLoc--;
2155	}
2156	if (reserveLoc) {
2157	int32_t nop = buildOp(URX_NOP, `0`);
2158	fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2159	}
2160	}
2161	return theLoc;
2162	}
2163
2164
2165
2166	//------------------------------------------------------------------------------
2167	//
2168	// handleCloseParen When compiling a close paren, we need to go back
2169	// and fix up any JMP or SAVE operations within the
2170	// parenthesized block that need to target the end
2171	// of the block. The locations of these are kept on
2172	// the paretheses stack.
2173	//
2174	// This function is called both when encountering a
2175	// real ) and at the end of the pattern.
2176	//
2177	//------------------------------------------------------------------------------
2178	void RegexCompile::handleCloseParen() {
2179	int32_t patIdx;
2180	int32_t patOp;
2181	if (fParenStack.size() <= `0`) {
2182	error(U_REGEX_MISMATCHED_PAREN);
2183	return;
2184	}
2185
2186	// Emit code for any pending literals.
2187	fixLiterals(FALSE);
2188
2189	// Fixup any operations within the just-closed parenthesized group
2190	// that need to reference the end of the (block).
2191	// (The first one popped from the stack is an unused slot for
2192	// alternation (OR) state save, but applying the fixup to it does no harm.)
2193	for (;;) {
2194	patIdx = fParenStack.popi();
2195	if (patIdx < `0`) {
2196	// value < 0 flags the start of the frame on the paren stack.
2197	break;
2198	}
2199	U_ASSERT(patIdx>`0` && patIdx <= fRXPat->fCompiledPat->size());
2200	patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2201	U_ASSERT(URX_VAL(patOp) == `0`); // Branch target for JMP should not be set.
2202	patOp \|= fRXPat->fCompiledPat->size(); // Set it now.
2203	fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2204	fMatchOpenParen = patIdx;
2205	}
2206
2207	// At the close of any parenthesized block, restore the match mode flags to
2208	// the value they had at the open paren. Saved value is
2209	// at the top of the paren stack.
2210	fModeFlags = fParenStack.popi();
2211	U_ASSERT(fModeFlags < `0`);
2212
2213	// DO any additional fixups, depending on the specific kind of
2214	// parentesized grouping this is
2215
2216	switch (patIdx) {
2217	case plain:
2218	case flags:
2219	// No additional fixups required.
2220	// (Grouping-only parentheses)
2221	break;
2222	case capturing:
2223	// Capturing Parentheses.
2224	// Insert a End Capture op into the pattern.
2225	// The frame offset of the variables for this cg is obtained from the
2226	// start capture op and put it into the end-capture op.
2227	{
2228	int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+`1`);
2229	U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2230
2231	int32_t frameVarLocation = URX_VAL(captureOp);
2232	appendOp(URX_END_CAPTURE, frameVarLocation);
2233	}
2234	break;
2235	case atomic:
2236	// Atomic Parenthesis.
2237	// Insert a LD_SP operation to restore the state stack to the position
2238	// it was when the atomic parens were entered.
2239	{
2240	int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+`1`);
2241	U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2242	int32_t stoLoc = URX_VAL(stoOp);
2243	appendOp(URX_LD_SP, stoLoc);
2244	}
2245	break;
2246
2247	case lookAhead:
2248	{
2249	int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-`5`);
2250	U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2251	int32_t dataLoc = URX_VAL(startOp);
2252	appendOp(URX_LA_END, dataLoc);
2253	}
2254	break;
2255
2256	case negLookAhead:
2257	{
2258	// See comment at doOpenLookAheadNeg
2259	int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-`1`);
2260	U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2261	int32_t dataLoc = URX_VAL(startOp);
2262	appendOp(URX_LA_END, dataLoc);
2263	appendOp(URX_BACKTRACK, `0`);
2264	appendOp(URX_LA_END, dataLoc);
2265
2266	// Patch the URX_SAVE near the top of the block.
2267	// The destination of the SAVE is the final LA_END that was just added.
2268	int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2269	U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2270	int32_t dest = fRXPat->fCompiledPat->size()-`1`;
2271	saveOp = buildOp(URX_STATE_SAVE, dest);
2272	fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2273	}
2274	break;
2275
2276	case lookBehind:
2277	{
2278	// See comment at doOpenLookBehind.
2279
2280	// Append the URX_LB_END and URX_LA_END to the compiled pattern.
2281	int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-`4`);
2282	U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2283	int32_t dataLoc = URX_VAL(startOp);
2284	appendOp(URX_LB_END, dataLoc);
2285	appendOp(URX_LA_END, dataLoc);
2286
2287	// Determine the min and max bounds for the length of the
2288	// string that the pattern can match.
2289	// An unbounded upper limit is an error.
2290	int32_t patEnd = fRXPat->fCompiledPat->size() - `1`;
2291	int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2292	int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2293	if (URX_TYPE(maxML) != `0`) {
2294	error(U_REGEX_LOOK_BEHIND_LIMIT);
2295	break;
2296	}
2297	if (maxML == INT32_MAX) {
2298	error(U_REGEX_LOOK_BEHIND_LIMIT);
2299	break;
2300	}
2301	if (minML == INT32_MAX) {
2302	// This condition happens when no match is possible, such as with a
2303	// [set] expression containing no elements.
2304	// In principle, the generated code to evaluate the expression could be deleted,
2305	// but it's probably not worth the complication.
2306	minML = `0`;
2307	}
2308	U_ASSERT(minML <= maxML);
2309
2310	// Insert the min and max match len bounds into the URX_LB_CONT op that
2311	// appears at the top of the look-behind block, at location fMatchOpenParen+1
2312	fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-`2`);
2313	fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-`1`);
2314
2315	}
2316	break;
2317
2318
2319
2320	case lookBehindN:
2321	{
2322	// See comment at doOpenLookBehindNeg.
2323
2324	// Append the URX_LBN_END to the compiled pattern.
2325	int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-`5`);
2326	U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2327	int32_t dataLoc = URX_VAL(startOp);
2328	appendOp(URX_LBN_END, dataLoc);
2329
2330	// Determine the min and max bounds for the length of the
2331	// string that the pattern can match.
2332	// An unbounded upper limit is an error.
2333	int32_t patEnd = fRXPat->fCompiledPat->size() - `1`;
2334	int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2335	int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2336	if (URX_TYPE(maxML) != `0`) {
2337	error(U_REGEX_LOOK_BEHIND_LIMIT);
2338	break;
2339	}
2340	if (maxML == INT32_MAX) {
2341	error(U_REGEX_LOOK_BEHIND_LIMIT);
2342	break;
2343	}
2344	if (minML == INT32_MAX) {
2345	// This condition happens when no match is possible, such as with a
2346	// [set] expression containing no elements.
2347	// In principle, the generated code to evaluate the expression could be deleted,
2348	// but it's probably not worth the complication.
2349	minML = `0`;
2350	}
2351
2352	U_ASSERT(minML <= maxML);
2353
2354	// Insert the min and max match len bounds into the URX_LB_CONT op that
2355	// appears at the top of the look-behind block, at location fMatchOpenParen+1
2356	fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-`3`);
2357	fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-`2`);
2358
2359	// Insert the pattern location to continue at after a successful match
2360	// as the last operand of the URX_LBN_CONT
2361	int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2362	fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-`1`);
2363	}
2364	break;
2365
2366
2367
2368	default:
2369	UPRV_UNREACHABLE;
2370	}
2371
2372	// remember the next location in the compiled pattern.
2373	// The compilation of Quantifiers will look at this to see whether its looping
2374	// over a parenthesized block or a single item
2375	fMatchCloseParen = fRXPat->fCompiledPat->size();
2376	}
2377
2378
2379
2380	//------------------------------------------------------------------------------
2381	//
2382	// compileSet Compile the pattern operations for a reference to a
2383	// UnicodeSet.
2384	//
2385	//------------------------------------------------------------------------------
2386	void RegexCompile::compileSet(UnicodeSet *theSet)
2387	{
2388	if (theSet == NULL) {
2389	return;
2390	}
2391	// Remove any strings from the set.
2392	// There shoudn't be any, but just in case.
2393	// (Case Closure can add them; if we had a simple case closure avaialble that
2394	// ignored strings, that would be better.)
2395	theSet->removeAllStrings();
2396	int32_t setSize = theSet->size();
2397
2398	switch (setSize) {
2399	case `0`:
2400	{
2401	// Set of no elements. Always fails to match.
2402	appendOp(URX_BACKTRACK, `0`);
2403	delete theSet;
2404	}
2405	break;
2406
2407	case `1`:
2408	{
2409	// The set contains only a single code point. Put it into
2410	// the compiled pattern as a single char operation rather
2411	// than a set, and discard the set itself.
2412	literalChar(theSet->charAt(`0`));
2413	delete theSet;
2414	}
2415	break;
2416
2417	default:
2418	{
2419	// The set contains two or more chars. (the normal case)
2420	// Put it into the compiled pattern as a set.
2421	int32_t setNumber = fRXPat->fSets->size();
2422	fRXPat->fSets->addElement(theSet, *fStatus);
2423	appendOp(URX_SETREF, setNumber);
2424	}
2425	}
2426	}
2427
2428
2429	//------------------------------------------------------------------------------
2430	//
2431	// compileInterval Generate the code for a {min, max} style interval quantifier.
2432	// Except for the specific opcodes used, the code is the same
2433	// for all three types (greedy, non-greedy, possessive) of
2434	// intervals. The opcodes are supplied as parameters.
2435	// (There are two sets of opcodes - greedy & possessive use the
2436	// same ones, while non-greedy has it's own.)
2437	//
2438	// The code for interval loops has this form:
2439	// 0 CTR_INIT counter loc (in stack frame)
2440	// 1 5 patt address of CTR_LOOP at bottom of block
2441	// 2 min count
2442	// 3 max count (-1 for unbounded)
2443	// 4 ... block to be iterated over
2444	// 5 CTR_LOOP
2445	//
2446	// In
2447	//------------------------------------------------------------------------------
2448	void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
2449	{
2450	// The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2451	// four slots in the compiled code. Reserve them.
2452	int32_t topOfBlock = blockTopLoc(TRUE);
2453	insertOp(topOfBlock);
2454	insertOp(topOfBlock);
2455	insertOp(topOfBlock);
2456
2457	// The operands for the CTR_INIT opcode include the index in the matcher data
2458	// of the counter. Allocate it now. There are two data items
2459	// counterLoc --> Loop counter
2460	// +1 --> Input index (for breaking non-progressing loops)
2461	// (Only present if unbounded upper limit on loop)
2462	int32_t dataSize = fIntervalUpper < `0` ? `2` : `1`;
2463	int32_t counterLoc = allocateStackData(dataSize);
2464
2465	int32_t op = buildOp(InitOp, counterLoc);
2466	fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2467
2468	// The second operand of CTR_INIT is the location following the end of the loop.
2469	// Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2470	// compilation of something later on causes the code to grow and the target
2471	// position to move.
2472	int32_t loopEnd = fRXPat->fCompiledPat->size();
2473	op = buildOp(URX_RELOC_OPRND, loopEnd);
2474	fRXPat->fCompiledPat->setElementAt(op, topOfBlock+`1`);
2475
2476	// Followed by the min and max counts.
2477	fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+`2`);
2478	fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+`3`);
2479
2480	// Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
2481	// Goes at end of the block being looped over, so just append to the code so far.
2482	appendOp(LoopOp, topOfBlock);
2483
2484	if ((fIntervalLow & `0xff000000`) != `0` \|\|
2485	(fIntervalUpper > `0` && (fIntervalUpper & `0xff000000`) != `0`)) {
2486	error(U_REGEX_NUMBER_TOO_BIG);
2487	}
2488
2489	if (fIntervalLow > fIntervalUpper && fIntervalUpper != -`1`) {
2490	error(U_REGEX_MAX_LT_MIN);
2491	}
2492	}
2493
2494
2495
2496	UBool RegexCompile::compileInlineInterval() {
2497	if (fIntervalUpper > `10` \|\| fIntervalUpper < fIntervalLow) {
2498	// Too big to inline. Fail, which will cause looping code to be generated.
2499	// (Upper < Lower picks up unbounded upper and errors, both.)
2500	return FALSE;
2501	}
2502
2503	int32_t topOfBlock = blockTopLoc(FALSE);
2504	if (fIntervalUpper == `0`) {
2505	// Pathological case. Attempt no matches, as if the block doesn't exist.
2506	// Discard the generated code for the block.
2507	// If the block included parens, discard the info pertaining to them as well.
2508	fRXPat->fCompiledPat->setSize(topOfBlock);
2509	if (fMatchOpenParen >= topOfBlock) {
2510	fMatchOpenParen = -`1`;
2511	}
2512	if (fMatchCloseParen >= topOfBlock) {
2513	fMatchCloseParen = -`1`;
2514	}
2515	return TRUE;
2516	}
2517
2518	if (topOfBlock != fRXPat->fCompiledPat->size()-`1` && fIntervalUpper != `1`) {
2519	// The thing being repeated is not a single op, but some
2520	// more complex block. Do it as a loop, not inlines.
2521	// Note that things "repeated" a max of once are handled as inline, because
2522	// the one copy of the code already generated is just fine.
2523	return FALSE;
2524	}
2525
2526	// Pick up the opcode that is to be repeated
2527	//
2528	int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2529
2530	// Compute the pattern location where the inline sequence
2531	// will end, and set up the state save op that will be needed.
2532	//
2533	int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-`1`
2534	+ fIntervalUpper + (fIntervalUpper-fIntervalLow);
2535	int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2536	if (fIntervalLow == `0`) {
2537	insertOp(topOfBlock);
2538	fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2539	}
2540
2541
2542
2543	// Loop, emitting the op for the thing being repeated each time.
2544	// Loop starts at 1 because one instance of the op already exists in the pattern,
2545	// it was put there when it was originally encountered.
2546	int32_t i;
2547	for (i=`1`; i<fIntervalUpper; i++ ) {
2548	if (i >= fIntervalLow) {
2549	appendOp(saveOp);
2550	}
2551	appendOp(op);
2552	}
2553	return TRUE;
2554	}
2555
2556
2557
2558	//------------------------------------------------------------------------------
2559	//
2560	// caseInsensitiveStart given a single code point from a pattern string, determine the
2561	// set of characters that could potentially begin a case-insensitive
2562	// match of a string beginning with that character, using full Unicode
2563	// case insensitive matching.
2564	//
2565	// This is used in optimizing find().
2566	//
2567	// closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2568	// misses cases like this:
2569	// A string from the pattern begins with 'ss' (although all we know
2570	// in this context is that it begins with 's')
2571	// The pattern could match a string beginning with a German sharp-s
2572	//
2573	// To the ordinary case closure for a character c, we add all other
2574	// characters cx where the case closure of cx incudes a string form that begins
2575	// with the original character c.
2576	//
2577	// This function could be made smarter. The full pattern string is available
2578	// and it would be possible to verify that the extra characters being added
2579	// to the starting set fully match, rather than having just a first-char of the
2580	// folded form match.
2581	//
2582	//------------------------------------------------------------------------------
2583	void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2584
2585	// Machine Generated below.
2586	// It may need updating with new versions of Unicode.
2587	// Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2588	// The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2589
2590	// Machine Generated Data. Do not hand edit.
2591	static const UChar32 RECaseFixCodePoints[] = {
2592	`0x61`, `0x66`, `0x68`, `0x69`, `0x6a`, `0x73`, `0x74`, `0x77`, `0x79`, `0x2bc`,
2593	`0x3ac`, `0x3ae`, `0x3b1`, `0x3b7`, `0x3b9`, `0x3c1`, `0x3c5`, `0x3c9`, `0x3ce`, `0x565`,
2594	`0x574`, `0x57e`, `0x1f00`, `0x1f01`, `0x1f02`, `0x1f03`, `0x1f04`, `0x1f05`, `0x1f06`, `0x1f07`,
2595	`0x1f20`, `0x1f21`, `0x1f22`, `0x1f23`, `0x1f24`, `0x1f25`, `0x1f26`, `0x1f27`, `0x1f60`, `0x1f61`,
2596	`0x1f62`, `0x1f63`, `0x1f64`, `0x1f65`, `0x1f66`, `0x1f67`, `0x1f70`, `0x1f74`, `0x1f7c`, `0x110000`};
2597
2598	static const int16_t RECaseFixStringOffsets[] = {
2599	`0x0`, `0x1`, `0x6`, `0x7`, `0x8`, `0x9`, `0xd`, `0xe`, `0xf`, `0x10`,
2600	`0x11`, `0x12`, `0x13`, `0x17`, `0x1b`, `0x20`, `0x21`, `0x2a`, `0x2e`, `0x2f`,
2601	`0x30`, `0x34`, `0x35`, `0x37`, `0x39`, `0x3b`, `0x3d`, `0x3f`, `0x41`, `0x43`,
2602	`0x45`, `0x47`, `0x49`, `0x4b`, `0x4d`, `0x4f`, `0x51`, `0x53`, `0x55`, `0x57`,
2603	`0x59`, `0x5b`, `0x5d`, `0x5f`, `0x61`, `0x63`, `0x65`, `0x66`, `0x67`, `0`};
2604
2605	static const int16_t RECaseFixCounts[] = {
2606	`0x1`, `0x5`, `0x1`, `0x1`, `0x1`, `0x4`, `0x1`, `0x1`, `0x1`, `0x1`,
2607	`0x1`, `0x1`, `0x4`, `0x4`, `0x5`, `0x1`, `0x9`, `0x4`, `0x1`, `0x1`,
2608	`0x4`, `0x1`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`,
2609	`0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`,
2610	`0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x2`, `0x1`, `0x1`, `0x1`, `0`};
2611
2612	static const UChar RECaseFixData[] = {
2613	`0x1e9a`, `0xfb00`, `0xfb01`, `0xfb02`, `0xfb03`, `0xfb04`, `0x1e96`, `0x130`, `0x1f0`, `0xdf`,
2614	`0x1e9e`, `0xfb05`, `0xfb06`, `0x1e97`, `0x1e98`, `0x1e99`, `0x149`, `0x1fb4`, `0x1fc4`, `0x1fb3`,
2615	`0x1fb6`, `0x1fb7`, `0x1fbc`, `0x1fc3`, `0x1fc6`, `0x1fc7`, `0x1fcc`, `0x390`, `0x1fd2`, `0x1fd3`,
2616	`0x1fd6`, `0x1fd7`, `0x1fe4`, `0x3b0`, `0x1f50`, `0x1f52`, `0x1f54`, `0x1f56`, `0x1fe2`, `0x1fe3`,
2617	`0x1fe6`, `0x1fe7`, `0x1ff3`, `0x1ff6`, `0x1ff7`, `0x1ffc`, `0x1ff4`, `0x587`, `0xfb13`, `0xfb14`,
2618	`0xfb15`, `0xfb17`, `0xfb16`, `0x1f80`, `0x1f88`, `0x1f81`, `0x1f89`, `0x1f82`, `0x1f8a`, `0x1f83`,
2619	`0x1f8b`, `0x1f84`, `0x1f8c`, `0x1f85`, `0x1f8d`, `0x1f86`, `0x1f8e`, `0x1f87`, `0x1f8f`, `0x1f90`,
2620	`0x1f98`, `0x1f91`, `0x1f99`, `0x1f92`, `0x1f9a`, `0x1f93`, `0x1f9b`, `0x1f94`, `0x1f9c`, `0x1f95`,
2621	`0x1f9d`, `0x1f96`, `0x1f9e`, `0x1f97`, `0x1f9f`, `0x1fa0`, `0x1fa8`, `0x1fa1`, `0x1fa9`, `0x1fa2`,
2622	`0x1faa`, `0x1fa3`, `0x1fab`, `0x1fa4`, `0x1fac`, `0x1fa5`, `0x1fad`, `0x1fa6`, `0x1fae`, `0x1fa7`,
2623	`0x1faf`, `0x1fb2`, `0x1fc2`, `0x1ff2`, `0`};
2624
2625	// End of machine generated data.
2626
2627	if (c < UCHAR_MIN_VALUE \|\| c > UCHAR_MAX_VALUE) {
2628	// This function should never be called with an invalid input character.
2629	UPRV_UNREACHABLE;
2630	} else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2631	UChar32 caseFoldedC = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2632	starterChars->set(caseFoldedC, caseFoldedC);
2633
2634	int32_t i;
2635	for (i=`0`; RECaseFixCodePoints[i]<c ; i++) {
2636	// Simple linear search through the sorted list of interesting code points.
2637	}
2638
2639	if (RECaseFixCodePoints[i] == c) {
2640	int32_t dataIndex = RECaseFixStringOffsets[i];
2641	int32_t numCharsToAdd = RECaseFixCounts[i];
2642	UChar32 cpToAdd = `0`;
2643	for (int32_t j=`0`; j<numCharsToAdd; j++) {
2644	U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2645	starterChars->add(cpToAdd);
2646	}
2647	}
2648
2649	starterChars->closeOver(USET_CASE_INSENSITIVE);
2650	starterChars->removeAllStrings();
2651	} else {
2652	// Not a cased character. Just return it alone.
2653	starterChars->set(c, c);
2654	}
2655	}
2656
2657
2658	// Increment with overflow check.
2659	// val and delta will both be positive.
2660
2661	static int32_t safeIncrement(int32_t val, int32_t delta) {
2662	if (INT32_MAX - val > delta) {
2663	return val + delta;
2664	} else {
2665	return INT32_MAX;
2666	}
2667	}
2668
2669
2670	//------------------------------------------------------------------------------
2671	//
2672	// matchStartType Determine how a match can start.
2673	// Used to optimize find() operations.
2674	//
2675	// Operation is very similar to minMatchLength(). Walk the compiled
2676	// pattern, keeping an on-going minimum-match-length. For any
2677	// op where the min match coming in is zero, add that ops possible
2678	// starting matches to the possible starts for the overall pattern.
2679	//
2680	//------------------------------------------------------------------------------
2681	void RegexCompile::matchStartType() {
2682	if (U_FAILURE(*fStatus)) {
2683	return;
2684	}
2685
2686
2687	int32_t loc; // Location in the pattern of the current op being processed.
2688	int32_t op; // The op being processed
2689	int32_t opType; // The opcode type of the op
2690	int32_t currentLen = `0`; // Minimum length of a match to this point (loc) in the pattern
2691	int32_t numInitialStrings = `0`; // Number of strings encountered that could match at start.
2692
2693	UBool atStart = TRUE; // True if no part of the pattern yet encountered
2694	// could have advanced the position in a match.
2695	// (Maximum match length so far == 0)
2696
2697	// forwardedLength is a vector holding minimum-match-length values that
2698	// are propagated forward in the pattern by JMP or STATE_SAVE operations.
2699	// It must be one longer than the pattern being checked because some ops
2700	// will jmp to a end-of-block+1 location from within a block, and we must
2701	// count those when checking the block.
2702	int32_t end = fRXPat->fCompiledPat->size();
2703	UVector32 forwardedLength(end+`1`, *fStatus);
2704	forwardedLength.setSize(end+`1`);
2705	for (loc=`3`; loc<end; loc++) {
2706	forwardedLength.setElementAt(INT32_MAX, loc);
2707	}
2708
2709	for (loc = `3`; loc<end; loc++) {
2710	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2711	opType = URX_TYPE(op);
2712
2713	// The loop is advancing linearly through the pattern.
2714	// If the op we are now at was the destination of a branch in the pattern,
2715	// and that path has a shorter minimum length than the current accumulated value,
2716	// replace the current accumulated value.
2717	if (forwardedLength.elementAti(loc) < currentLen) {
2718	currentLen = forwardedLength.elementAti(loc);
2719	U_ASSERT(currentLen>=`0` && currentLen < INT32_MAX);
2720	}
2721
2722	switch (opType) {
2723	// Ops that don't change the total length matched
2724	case URX_RESERVED_OP:
2725	case URX_END:
2726	case URX_FAIL:
2727	case URX_STRING_LEN:
2728	case URX_NOP:
2729	case URX_START_CAPTURE:
2730	case URX_END_CAPTURE:
2731	case URX_BACKSLASH_B:
2732	case URX_BACKSLASH_BU:
2733	case URX_BACKSLASH_G:
2734	case URX_BACKSLASH_Z:
2735	case URX_DOLLAR:
2736	case URX_DOLLAR_M:
2737	case URX_DOLLAR_D:
2738	case URX_DOLLAR_MD:
2739	case URX_RELOC_OPRND:
2740	case URX_STO_INP_LOC:
2741	case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2742	case URX_BACKREF_I:
2743
2744	case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2745	case URX_LD_SP:
2746	break;
2747
2748	case URX_CARET:
2749	if (atStart) {
2750	fRXPat->fStartType = START_START;
2751	}
2752	break;
2753
2754	case URX_CARET_M:
2755	case URX_CARET_M_UNIX:
2756	if (atStart) {
2757	fRXPat->fStartType = START_LINE;
2758	}
2759	break;
2760
2761	case URX_ONECHAR:
2762	if (currentLen == `0`) {
2763	// This character could appear at the start of a match.
2764	// Add it to the set of possible starting characters.
2765	fRXPat->fInitialChars->add(URX_VAL(op));
2766	numInitialStrings += `2`;
2767	}
2768	currentLen = safeIncrement(currentLen, `1`);
2769	atStart = FALSE;
2770	break;
2771
2772
2773	case URX_SETREF:
2774	if (currentLen == `0`) {
2775	int32_t sn = URX_VAL(op);
2776	U_ASSERT(sn > `0` && sn < fRXPat->fSets->size());
2777	const UnicodeSet s = (UnicodeSet )fRXPat->fSets->elementAt(sn);
2778	fRXPat->fInitialChars->addAll(*s);
2779	numInitialStrings += `2`;
2780	}
2781	currentLen = safeIncrement(currentLen, `1`);
2782	atStart = FALSE;
2783	break;
2784
2785	case URX_LOOP_SR_I:
2786	// [Set], like a SETREF, above, in what it can match,*
2787	// but may not match at all, so currentLen is not incremented.
2788	if (currentLen == `0`) {
2789	int32_t sn = URX_VAL(op);
2790	U_ASSERT(sn > `0` && sn < fRXPat->fSets->size());
2791	const UnicodeSet s = (UnicodeSet )fRXPat->fSets->elementAt(sn);
2792	fRXPat->fInitialChars->addAll(*s);
2793	numInitialStrings += `2`;
2794	}
2795	atStart = FALSE;
2796	break;
2797
2798	case URX_LOOP_DOT_I:
2799	if (currentLen == `0`) {
2800	// . at the start of a pattern.*
2801	// Any character can begin the match.
2802	fRXPat->fInitialChars->clear();
2803	fRXPat->fInitialChars->complement();
2804	numInitialStrings += `2`;
2805	}
2806	atStart = FALSE;
2807	break;
2808
2809
2810	case URX_STATIC_SETREF:
2811	if (currentLen == `0`) {
2812	int32_t sn = URX_VAL(op);
2813	U_ASSERT(sn>`0` && sn<URX_LAST_SET);
2814	const UnicodeSet *s = fRXPat->fStaticSets[sn];
2815	fRXPat->fInitialChars->addAll(*s);
2816	numInitialStrings += `2`;
2817	}
2818	currentLen = safeIncrement(currentLen, `1`);
2819	atStart = FALSE;
2820	break;
2821
2822
2823
2824	case URX_STAT_SETREF_N:
2825	if (currentLen == `0`) {
2826	int32_t sn = URX_VAL(op);
2827	const UnicodeSet *s = fRXPat->fStaticSets[sn];
2828	UnicodeSet sc(*s);
2829	sc.complement();
2830	fRXPat->fInitialChars->addAll(sc);
2831	numInitialStrings += `2`;
2832	}
2833	currentLen = safeIncrement(currentLen, `1`);
2834	atStart = FALSE;
2835	break;
2836
2837
2838
2839	case URX_BACKSLASH_D:
2840	// Digit Char
2841	if (currentLen == `0`) {
2842	UnicodeSet s;
2843	s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2844	if (URX_VAL(op) != `0`) {
2845	s.complement();
2846	}
2847	fRXPat->fInitialChars->addAll(s);
2848	numInitialStrings += `2`;
2849	}
2850	currentLen = safeIncrement(currentLen, `1`);
2851	atStart = FALSE;
2852	break;
2853
2854
2855	case URX_BACKSLASH_H:
2856	// Horiz white space
2857	if (currentLen == `0`) {
2858	UnicodeSet s;
2859	s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2860	s.add((UChar32)`9`); // Tab
2861	if (URX_VAL(op) != `0`) {
2862	s.complement();
2863	}
2864	fRXPat->fInitialChars->addAll(s);
2865	numInitialStrings += `2`;
2866	}
2867	currentLen = safeIncrement(currentLen, `1`);
2868	atStart = FALSE;
2869	break;
2870
2871
2872	case URX_BACKSLASH_R: // Any line ending sequence
2873	case URX_BACKSLASH_V: // Any line ending code point, with optional negation
2874	if (currentLen == `0`) {
2875	UnicodeSet s;
2876	s.add((UChar32)`0x0a`, (UChar32)`0x0d`); // add range
2877	s.add((UChar32)`0x85`);
2878	s.add((UChar32)`0x2028`, (UChar32)`0x2029`);
2879	if (URX_VAL(op) != `0`) {
2880	// Complement option applies to URX_BACKSLASH_V only.
2881	s.complement();
2882	}
2883	fRXPat->fInitialChars->addAll(s);
2884	numInitialStrings += `2`;
2885	}
2886	currentLen = safeIncrement(currentLen, `1`);
2887	atStart = FALSE;
2888	break;
2889
2890
2891
2892	case URX_ONECHAR_I:
2893	// Case Insensitive Single Character.
2894	if (currentLen == `0`) {
2895	UChar32 c = URX_VAL(op);
2896	if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2897	UnicodeSet starters(c, c);
2898	starters.closeOver(USET_CASE_INSENSITIVE);
2899	// findCaseInsensitiveStarters(c, &starters);
2900	// For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2901	// The expanded folding can't match the pattern.
2902	fRXPat->fInitialChars->addAll(starters);
2903	} else {
2904	// Char has no case variants. Just add it as-is to the
2905	// set of possible starting chars.
2906	fRXPat->fInitialChars->add(c);
2907	}
2908	numInitialStrings += `2`;
2909	}
2910	currentLen = safeIncrement(currentLen, `1`);
2911	atStart = FALSE;
2912	break;
2913
2914
2915	case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
2916	case URX_DOTANY_ALL: // . matches one or two.
2917	case URX_DOTANY:
2918	case URX_DOTANY_UNIX:
2919	if (currentLen == `0`) {
2920	// These constructs are all bad news when they appear at the start
2921	// of a match. Any character can begin the match.
2922	fRXPat->fInitialChars->clear();
2923	fRXPat->fInitialChars->complement();
2924	numInitialStrings += `2`;
2925	}
2926	currentLen = safeIncrement(currentLen, `1`);
2927	atStart = FALSE;
2928	break;
2929
2930
2931	case URX_JMPX:
2932	loc++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2933	U_FALLTHROUGH;
2934	case URX_JMP:
2935	{
2936	int32_t jmpDest = URX_VAL(op);
2937	if (jmpDest < loc) {
2938	// Loop of some kind. Can safely ignore, the worst that will happen
2939	// is that we understate the true minimum length
2940	currentLen = forwardedLength.elementAti(loc+`1`);
2941
2942	} else {
2943	// Forward jump. Propagate the current min length to the target loc of the jump.
2944	U_ASSERT(jmpDest <= end+`1`);
2945	if (forwardedLength.elementAti(jmpDest) > currentLen) {
2946	forwardedLength.setElementAt(currentLen, jmpDest);
2947	}
2948	}
2949	}
2950	atStart = FALSE;
2951	break;
2952
2953	case URX_JMP_SAV:
2954	case URX_JMP_SAV_X:
2955	// Combo of state save to the next loc, + jmp backwards.
2956	// Net effect on min. length computation is nothing.
2957	atStart = FALSE;
2958	break;
2959
2960	case URX_BACKTRACK:
2961	// Fails are kind of like a branch, except that the min length was
2962	// propagated already, by the state save.
2963	currentLen = forwardedLength.elementAti(loc+`1`);
2964	atStart = FALSE;
2965	break;
2966
2967
2968	case URX_STATE_SAVE:
2969	{
2970	// State Save, for forward jumps, propagate the current minimum.
2971	// of the state save.
2972	int32_t jmpDest = URX_VAL(op);
2973	if (jmpDest > loc) {
2974	if (currentLen < forwardedLength.elementAti(jmpDest)) {
2975	forwardedLength.setElementAt(currentLen, jmpDest);
2976	}
2977	}
2978	}
2979	atStart = FALSE;
2980	break;
2981
2982
2983
2984
2985	case URX_STRING:
2986	{
2987	loc++;
2988	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2989	int32_t stringLen = URX_VAL(stringLenOp);
2990	U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2991	U_ASSERT(stringLenOp >= `2`);
2992	if (currentLen == `0`) {
2993	// Add the starting character of this string to the set of possible starting
2994	// characters for this pattern.
2995	int32_t stringStartIdx = URX_VAL(op);
2996	UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
2997	fRXPat->fInitialChars->add(c);
2998
2999	// Remember this string. After the entire pattern has been checked,
3000	// if nothing else is identified that can start a match, we'll use it.
3001	numInitialStrings++;
3002	fRXPat->fInitialStringIdx = stringStartIdx;
3003	fRXPat->fInitialStringLen = stringLen;
3004	}
3005
3006	currentLen = safeIncrement(currentLen, stringLen);
3007	atStart = FALSE;
3008	}
3009	break;
3010
3011	case URX_STRING_I:
3012	{
3013	// Case-insensitive string. Unlike exact-match strings, we won't
3014	// attempt a string search for possible match positions. But we
3015	// do update the set of possible starting characters.
3016	loc++;
3017	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3018	int32_t stringLen = URX_VAL(stringLenOp);
3019	U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3020	U_ASSERT(stringLenOp >= `2`);
3021	if (currentLen == `0`) {
3022	// Add the starting character of this string to the set of possible starting
3023	// characters for this pattern.
3024	int32_t stringStartIdx = URX_VAL(op);
3025	UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
3026	UnicodeSet s;
3027	findCaseInsensitiveStarters(c, &s);
3028	fRXPat->fInitialChars->addAll(s);
3029	numInitialStrings += `2`; // Matching on an initial string not possible.
3030	}
3031	currentLen = safeIncrement(currentLen, stringLen);
3032	atStart = FALSE;
3033	}
3034	break;
3035
3036	case URX_CTR_INIT:
3037	case URX_CTR_INIT_NG:
3038	{
3039	// Loop Init Ops. These don't change the min length, but they are 4 word ops
3040	// so location must be updated accordingly.
3041	// Loop Init Ops.
3042	// If the min loop count == 0
3043	// move loc forwards to the end of the loop, skipping over the body.
3044	// If the min count is > 0,
3045	// continue normal processing of the body of the loop.
3046	int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+`1`);
3047	loopEndLoc = URX_VAL(loopEndLoc);
3048	int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+`2`);
3049	if (minLoopCount == `0`) {
3050	// Min Loop Count of 0, treat like a forward branch and
3051	// move the current minimum length up to the target
3052	// (end of loop) location.
3053	U_ASSERT(loopEndLoc <= end+`1`);
3054	if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3055	forwardedLength.setElementAt(currentLen, loopEndLoc);
3056	}
3057	}
3058	loc+=`3`; // Skips over operands of CTR_INIT
3059	}
3060	atStart = FALSE;
3061	break;
3062
3063
3064	case URX_CTR_LOOP:
3065	case URX_CTR_LOOP_NG:
3066	// Loop ops.
3067	// The jump is conditional, backwards only.
3068	atStart = FALSE;
3069	break;
3070
3071	case URX_LOOP_C:
3072	// More loop ops. These state-save to themselves.
3073	// don't change the minimum match
3074	atStart = FALSE;
3075	break;
3076
3077
3078	case URX_LA_START:
3079	case URX_LB_START:
3080	{
3081	// Look-around. Scan forward until the matching look-ahead end,
3082	// without processing the look-around block. This is overly pessimistic.
3083
3084	// Keep track of the nesting depth of look-around blocks. Boilerplate code for
3085	// lookahead contains two LA_END instructions, so count goes up by two
3086	// for each LA_START.
3087	int32_t depth = (opType == URX_LA_START? `2`: `1`);
3088	for (;;) {
3089	loc++;
3090	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3091	if (URX_TYPE(op) == URX_LA_START) {
3092	depth+=`2`;
3093	}
3094	if (URX_TYPE(op) == URX_LB_START) {
3095	depth++;
3096	}
3097	if (URX_TYPE(op) == URX_LA_END \|\| URX_TYPE(op)==URX_LBN_END) {
3098	depth--;
3099	if (depth == `0`) {
3100	break;
3101	}
3102	}
3103	if (URX_TYPE(op) == URX_STATE_SAVE) {
3104	// Need this because neg lookahead blocks will FAIL to outside
3105	// of the block.
3106	int32_t jmpDest = URX_VAL(op);
3107	if (jmpDest > loc) {
3108	if (currentLen < forwardedLength.elementAti(jmpDest)) {
3109	forwardedLength.setElementAt(currentLen, jmpDest);
3110	}
3111	}
3112	}
3113	U_ASSERT(loc <= end);
3114	}
3115	}
3116	break;
3117
3118	case URX_LA_END:
3119	case URX_LB_CONT:
3120	case URX_LB_END:
3121	case URX_LBN_CONT:
3122	case URX_LBN_END:
3123	UPRV_UNREACHABLE; // Shouldn't get here. These ops should be
3124	// consumed by the scan in URX_LA_START and LB_START
3125	default:
3126	UPRV_UNREACHABLE;
3127	}
3128
3129	}
3130
3131
3132	// We have finished walking through the ops. Check whether some forward jump
3133	// propagated a shorter length to location end+1.
3134	if (forwardedLength.elementAti(end+`1`) < currentLen) {
3135	currentLen = forwardedLength.elementAti(end+`1`);
3136	}
3137
3138
3139	fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3140
3141
3142	// Sort out what we should check for when looking for candidate match start positions.
3143	// In order of preference,
3144	// 1. Start of input text buffer.
3145	// 2. A literal string.
3146	// 3. Start of line in multi-line mode.
3147	// 4. A single literal character.
3148	// 5. A character from a set of characters.
3149	//
3150	if (fRXPat->fStartType == START_START) {
3151	// Match only at the start of an input text string.
3152	// start type is already set. We're done.
3153	} else if (numInitialStrings == `1` && fRXPat->fMinMatchLen > `0`) {
3154	// Match beginning only with a literal string.
3155	UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3156	U_ASSERT(fRXPat->fInitialChars->contains(c));
3157	fRXPat->fStartType = START_STRING;
3158	fRXPat->fInitialChar = c;
3159	} else if (fRXPat->fStartType == START_LINE) {
3160	// Match at start of line in Multi-Line mode.
3161	// Nothing to do here; everything is already set.
3162	} else if (fRXPat->fMinMatchLen == `0`) {
3163	// Zero length match possible. We could start anywhere.
3164	fRXPat->fStartType = START_NO_INFO;
3165	} else if (fRXPat->fInitialChars->size() == `1`) {
3166	// All matches begin with the same char.
3167	fRXPat->fStartType = START_CHAR;
3168	fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(`0`);
3169	U_ASSERT(fRXPat->fInitialChar != (UChar32)-`1`);
3170	} else if (fRXPat->fInitialChars->contains((UChar32)`0`, (UChar32)`0x10ffff`) == FALSE &&
3171	fRXPat->fMinMatchLen > `0`) {
3172	// Matches start with a set of character smaller than the set of all chars.
3173	fRXPat->fStartType = START_SET;
3174	} else {
3175	// Matches can start with anything
3176	fRXPat->fStartType = START_NO_INFO;
3177	}
3178
3179	return;
3180	}
3181
3182
3183
3184	//------------------------------------------------------------------------------
3185	//
3186	// minMatchLength Calculate the length of the shortest string that could
3187	// match the specified pattern.
3188	// Length is in 16 bit code units, not code points.
3189	//
3190	// The calculated length may not be exact. The returned
3191	// value may be shorter than the actual minimum; it must
3192	// never be longer.
3193	//
3194	// start and end are the range of p-code operations to be
3195	// examined. The endpoints are included in the range.
3196	//
3197	//------------------------------------------------------------------------------
3198	int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
3199	if (U_FAILURE(*fStatus)) {
3200	return `0`;
3201	}
3202
3203	U_ASSERT(start <= end);
3204	U_ASSERT(end < fRXPat->fCompiledPat->size());
3205
3206
3207	int32_t loc;
3208	int32_t op;
3209	int32_t opType;
3210	int32_t currentLen = `0`;
3211
3212
3213	// forwardedLength is a vector holding minimum-match-length values that
3214	// are propagated forward in the pattern by JMP or STATE_SAVE operations.
3215	// It must be one longer than the pattern being checked because some ops
3216	// will jmp to a end-of-block+1 location from within a block, and we must
3217	// count those when checking the block.
3218	UVector32 forwardedLength(end+`2`, *fStatus);
3219	forwardedLength.setSize(end+`2`);
3220	for (loc=start; loc<=end+`1`; loc++) {
3221	forwardedLength.setElementAt(INT32_MAX, loc);
3222	}
3223
3224	for (loc = start; loc<=end; loc++) {
3225	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3226	opType = URX_TYPE(op);
3227
3228	// The loop is advancing linearly through the pattern.
3229	// If the op we are now at was the destination of a branch in the pattern,
3230	// and that path has a shorter minimum length than the current accumulated value,
3231	// replace the current accumulated value.
3232	// U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
3233	// no-match-possible cases.
3234	if (forwardedLength.elementAti(loc) < currentLen) {
3235	currentLen = forwardedLength.elementAti(loc);
3236	U_ASSERT(currentLen>=`0` && currentLen < INT32_MAX);
3237	}
3238
3239	switch (opType) {
3240	// Ops that don't change the total length matched
3241	case URX_RESERVED_OP:
3242	case URX_END:
3243	case URX_STRING_LEN:
3244	case URX_NOP:
3245	case URX_START_CAPTURE:
3246	case URX_END_CAPTURE:
3247	case URX_BACKSLASH_B:
3248	case URX_BACKSLASH_BU:
3249	case URX_BACKSLASH_G:
3250	case URX_BACKSLASH_Z:
3251	case URX_CARET:
3252	case URX_DOLLAR:
3253	case URX_DOLLAR_M:
3254	case URX_DOLLAR_D:
3255	case URX_DOLLAR_MD:
3256	case URX_RELOC_OPRND:
3257	case URX_STO_INP_LOC:
3258	case URX_CARET_M:
3259	case URX_CARET_M_UNIX:
3260	case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3261	case URX_BACKREF_I:
3262
3263	case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3264	case URX_LD_SP:
3265
3266	case URX_JMP_SAV:
3267	case URX_JMP_SAV_X:
3268	break;
3269
3270
3271	// Ops that match a minimum of one character (one or two 16 bit code units.)
3272	//
3273	case URX_ONECHAR:
3274	case URX_STATIC_SETREF:
3275	case URX_STAT_SETREF_N:
3276	case URX_SETREF:
3277	case URX_BACKSLASH_D:
3278	case URX_BACKSLASH_H:
3279	case URX_BACKSLASH_R:
3280	case URX_BACKSLASH_V:
3281	case URX_ONECHAR_I:
3282	case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3283	case URX_DOTANY_ALL: // . matches one or two.
3284	case URX_DOTANY:
3285	case URX_DOTANY_UNIX:
3286	currentLen = safeIncrement(currentLen, `1`);
3287	break;
3288
3289
3290	case URX_JMPX:
3291	loc++; // URX_JMPX has an extra operand, ignored here,
3292	// otherwise processed identically to URX_JMP.
3293	U_FALLTHROUGH;
3294	case URX_JMP:
3295	{
3296	int32_t jmpDest = URX_VAL(op);
3297	if (jmpDest < loc) {
3298	// Loop of some kind. Can safely ignore, the worst that will happen
3299	// is that we understate the true minimum length
3300	currentLen = forwardedLength.elementAti(loc+`1`);
3301	} else {
3302	// Forward jump. Propagate the current min length to the target loc of the jump.
3303	U_ASSERT(jmpDest <= end+`1`);
3304	if (forwardedLength.elementAti(jmpDest) > currentLen) {
3305	forwardedLength.setElementAt(currentLen, jmpDest);
3306	}
3307	}
3308	}
3309	break;
3310
3311	case URX_BACKTRACK:
3312	{
3313	// Back-tracks are kind of like a branch, except that the min length was
3314	// propagated already, by the state save.
3315	currentLen = forwardedLength.elementAti(loc+`1`);
3316	}
3317	break;
3318
3319
3320	case URX_STATE_SAVE:
3321	{
3322	// State Save, for forward jumps, propagate the current minimum.
3323	// of the state save.
3324	int32_t jmpDest = URX_VAL(op);
3325	if (jmpDest > loc) {
3326	if (currentLen < forwardedLength.elementAti(jmpDest)) {
3327	forwardedLength.setElementAt(currentLen, jmpDest);
3328	}
3329	}
3330	}
3331	break;
3332
3333
3334	case URX_STRING:
3335	{
3336	loc++;
3337	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3338	currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3339	}
3340	break;
3341
3342
3343	case URX_STRING_I:
3344	{
3345	loc++;
3346	// TODO: with full case folding, matching input text may be shorter than
3347	// the string we have here. More smarts could put some bounds on it.
3348	// Assume a min length of one for now. A min length of zero causes
3349	// optimization failures for a pattern like "string"+
3350	// currentLen += URX_VAL(stringLenOp);
3351	currentLen = safeIncrement(currentLen, `1`);
3352	}
3353	break;
3354
3355	case URX_CTR_INIT:
3356	case URX_CTR_INIT_NG:
3357	{
3358	// Loop Init Ops.
3359	// If the min loop count == 0
3360	// move loc forwards to the end of the loop, skipping over the body.
3361	// If the min count is > 0,
3362	// continue normal processing of the body of the loop.
3363	int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+`1`);
3364	loopEndLoc = URX_VAL(loopEndLoc);
3365	int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+`2`);
3366	if (minLoopCount == `0`) {
3367	loc = loopEndLoc;
3368	} else {
3369	loc+=`3`; // Skips over operands of CTR_INIT
3370	}
3371	}
3372	break;
3373
3374
3375	case URX_CTR_LOOP:
3376	case URX_CTR_LOOP_NG:
3377	// Loop ops.
3378	// The jump is conditional, backwards only.
3379	break;
3380
3381	case URX_LOOP_SR_I:
3382	case URX_LOOP_DOT_I:
3383	case URX_LOOP_C:
3384	// More loop ops. These state-save to themselves.
3385	// don't change the minimum match - could match nothing at all.
3386	break;
3387
3388
3389	case URX_LA_START:
3390	case URX_LB_START:
3391	{
3392	// Look-around. Scan forward until the matching look-ahead end,
3393	// without processing the look-around block. This is overly pessimistic for look-ahead,
3394	// it assumes that the look-ahead match might be zero-length.
3395	// TODO: Positive lookahead could recursively do the block, then continue
3396	// with the longer of the block or the value coming in. Ticket 6060
3397	int32_t depth = (opType == URX_LA_START? `2`: `1`);
3398	for (;;) {
3399	loc++;
3400	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3401	if (URX_TYPE(op) == URX_LA_START) {
3402	// The boilerplate for look-ahead includes two LA_END insturctions,
3403	// Depth will be decremented by each one when it is seen.
3404	depth += `2`;
3405	}
3406	if (URX_TYPE(op) == URX_LB_START) {
3407	depth++;
3408	}
3409	if (URX_TYPE(op) == URX_LA_END) {
3410	depth--;
3411	if (depth == `0`) {
3412	break;
3413	}
3414	}
3415	if (URX_TYPE(op)==URX_LBN_END) {
3416	depth--;
3417	if (depth == `0`) {
3418	break;
3419	}
3420	}
3421	if (URX_TYPE(op) == URX_STATE_SAVE) {
3422	// Need this because neg lookahead blocks will FAIL to outside
3423	// of the block.
3424	int32_t jmpDest = URX_VAL(op);
3425	if (jmpDest > loc) {
3426	if (currentLen < forwardedLength.elementAti(jmpDest)) {
3427	forwardedLength.setElementAt(currentLen, jmpDest);
3428	}
3429	}
3430	}
3431	U_ASSERT(loc <= end);
3432	}
3433	}
3434	break;
3435
3436	case URX_LA_END:
3437	case URX_LB_CONT:
3438	case URX_LB_END:
3439	case URX_LBN_CONT:
3440	case URX_LBN_END:
3441	// Only come here if the matching URX_LA_START or URX_LB_START was not in the
3442	// range being sized, which happens when measuring size of look-behind blocks.
3443	break;
3444
3445	default:
3446	UPRV_UNREACHABLE;
3447	}
3448
3449	}
3450
3451	// We have finished walking through the ops. Check whether some forward jump
3452	// propagated a shorter length to location end+1.
3453	if (forwardedLength.elementAti(end+`1`) < currentLen) {
3454	currentLen = forwardedLength.elementAti(end+`1`);
3455	U_ASSERT(currentLen>=`0` && currentLen < INT32_MAX);
3456	}
3457
3458	return currentLen;
3459	}
3460
3461	//------------------------------------------------------------------------------
3462	//
3463	// maxMatchLength Calculate the length of the longest string that could
3464	// match the specified pattern.
3465	// Length is in 16 bit code units, not code points.
3466	//
3467	// The calculated length may not be exact. The returned
3468	// value may be longer than the actual maximum; it must
3469	// never be shorter.
3470	//
3471	//------------------------------------------------------------------------------
3472	int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3473	if (U_FAILURE(*fStatus)) {
3474	return `0`;
3475	}
3476	U_ASSERT(start <= end);
3477	U_ASSERT(end < fRXPat->fCompiledPat->size());
3478
3479	int32_t loc;
3480	int32_t op;
3481	int32_t opType;
3482	int32_t currentLen = `0`;
3483	UVector32 forwardedLength(end+`1`, *fStatus);
3484	forwardedLength.setSize(end+`1`);
3485
3486	for (loc=start; loc<=end; loc++) {
3487	forwardedLength.setElementAt(`0`, loc);
3488	}
3489
3490	for (loc = start; loc<=end; loc++) {
3491	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3492	opType = URX_TYPE(op);
3493
3494	// The loop is advancing linearly through the pattern.
3495	// If the op we are now at was the destination of a branch in the pattern,
3496	// and that path has a longer maximum length than the current accumulated value,
3497	// replace the current accumulated value.
3498	if (forwardedLength.elementAti(loc) > currentLen) {
3499	currentLen = forwardedLength.elementAti(loc);
3500	}
3501
3502	switch (opType) {
3503	// Ops that don't change the total length matched
3504	case URX_RESERVED_OP:
3505	case URX_END:
3506	case URX_STRING_LEN:
3507	case URX_NOP:
3508	case URX_START_CAPTURE:
3509	case URX_END_CAPTURE:
3510	case URX_BACKSLASH_B:
3511	case URX_BACKSLASH_BU:
3512	case URX_BACKSLASH_G:
3513	case URX_BACKSLASH_Z:
3514	case URX_CARET:
3515	case URX_DOLLAR:
3516	case URX_DOLLAR_M:
3517	case URX_DOLLAR_D:
3518	case URX_DOLLAR_MD:
3519	case URX_RELOC_OPRND:
3520	case URX_STO_INP_LOC:
3521	case URX_CARET_M:
3522	case URX_CARET_M_UNIX:
3523
3524	case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3525	case URX_LD_SP:
3526
3527	case URX_LB_END:
3528	case URX_LB_CONT:
3529	case URX_LBN_CONT:
3530	case URX_LBN_END:
3531	break;
3532
3533
3534	// Ops that increase that cause an unbounded increase in the length
3535	// of a matched string, or that increase it a hard to characterize way.
3536	// Call the max length unbounded, and stop further checking.
3537	case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3538	case URX_BACKREF_I:
3539	case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3540	currentLen = INT32_MAX;
3541	break;
3542
3543
3544	// Ops that match a max of one character (possibly two 16 bit code units.)
3545	//
3546	case URX_STATIC_SETREF:
3547	case URX_STAT_SETREF_N:
3548	case URX_SETREF:
3549	case URX_BACKSLASH_D:
3550	case URX_BACKSLASH_H:
3551	case URX_BACKSLASH_R:
3552	case URX_BACKSLASH_V:
3553	case URX_ONECHAR_I:
3554	case URX_DOTANY_ALL:
3555	case URX_DOTANY:
3556	case URX_DOTANY_UNIX:
3557	currentLen = safeIncrement(currentLen, `2`);
3558	break;
3559
3560	// Single literal character. Increase current max length by one or two,
3561	// depending on whether the char is in the supplementary range.
3562	case URX_ONECHAR:
3563	currentLen = safeIncrement(currentLen, `1`);
3564	if (URX_VAL(op) > `0x10000`) {
3565	currentLen = safeIncrement(currentLen, `1`);
3566	}
3567	break;
3568
3569	// Jumps.
3570	//
3571	case URX_JMP:
3572	case URX_JMPX:
3573	case URX_JMP_SAV:
3574	case URX_JMP_SAV_X:
3575	{
3576	int32_t jmpDest = URX_VAL(op);
3577	if (jmpDest < loc) {
3578	// Loop of some kind. Max match length is unbounded.
3579	currentLen = INT32_MAX;
3580	} else {
3581	// Forward jump. Propagate the current min length to the target loc of the jump.
3582	if (forwardedLength.elementAti(jmpDest) < currentLen) {
3583	forwardedLength.setElementAt(currentLen, jmpDest);
3584	}
3585	currentLen = `0`;
3586	}
3587	}
3588	break;
3589
3590	case URX_BACKTRACK:
3591	// back-tracks are kind of like a branch, except that the max length was
3592	// propagated already, by the state save.
3593	currentLen = forwardedLength.elementAti(loc+`1`);
3594	break;
3595
3596
3597	case URX_STATE_SAVE:
3598	{
3599	// State Save, for forward jumps, propagate the current minimum.
3600	// of the state save.
3601	// For backwards jumps, they create a loop, maximum
3602	// match length is unbounded.
3603	int32_t jmpDest = URX_VAL(op);
3604	if (jmpDest > loc) {
3605	if (currentLen > forwardedLength.elementAti(jmpDest)) {
3606	forwardedLength.setElementAt(currentLen, jmpDest);
3607	}
3608	} else {
3609	currentLen = INT32_MAX;
3610	}
3611	}
3612	break;
3613
3614
3615
3616
3617	case URX_STRING:
3618	{
3619	loc++;
3620	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3621	currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3622	break;
3623	}
3624
3625	case URX_STRING_I:
3626	// TODO: This code assumes that any user string that matches will be no longer
3627	// than our compiled string, with case insensitive matching.
3628	// Our compiled string has been case-folded already.
3629	//
3630	// Any matching user string will have no more code points than our
3631	// compiled (folded) string. Folding may add code points, but
3632	// not remove them.
3633	//
3634	// There is a potential problem if a supplemental code point
3635	// case-folds to a BMP code point. In this case our compiled string
3636	// could be shorter (in code units) than a matching user string.
3637	//
3638	// At this time (Unicode 6.1) there are no such characters, and this case
3639	// is not being handled. A test, intltest regex/Bug9283, will fail if
3640	// any problematic characters are added to Unicode.
3641	//
3642	// If this happens, we can make a set of the BMP chars that the
3643	// troublesome supplementals fold to, scan our string, and bump the
3644	// currentLen one extra for each that is found.
3645	//
3646	{
3647	loc++;
3648	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3649	currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3650	}
3651	break;
3652
3653	case URX_CTR_INIT:
3654	case URX_CTR_INIT_NG:
3655	// For Loops, recursively call this function on the pattern for the loop body,
3656	// then multiply the result by the maximum loop count.
3657	{
3658	int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+`1`));
3659	if (loopEndLoc == loc+`4`) {
3660	// Loop has an empty body. No affect on max match length.
3661	// Continue processing with code after the loop end.
3662	loc = loopEndLoc;
3663	break;
3664	}
3665
3666	int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+`3`));
3667	if (maxLoopCount == -`1`) {
3668	// Unbounded Loop. No upper bound on match length.
3669	currentLen = INT32_MAX;
3670	break;
3671	}
3672
3673	U_ASSERT(loopEndLoc >= loc+`4`);
3674	int64_t blockLen = maxMatchLength(loc+`4`, loopEndLoc-`1`); // Recursive call.
3675	int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3676	if (updatedLen >= INT32_MAX) {
3677	currentLen = INT32_MAX;
3678	break;
3679	}
3680	currentLen = (int32_t)updatedLen;
3681	loc = loopEndLoc;
3682	break;
3683	}
3684
3685	case URX_CTR_LOOP:
3686	case URX_CTR_LOOP_NG:
3687	// These opcodes will be skipped over by code for URX_CTR_INIT.
3688	// We shouldn't encounter them here.
3689	UPRV_UNREACHABLE;
3690
3691	case URX_LOOP_SR_I:
3692	case URX_LOOP_DOT_I:
3693	case URX_LOOP_C:
3694	// For anything to do with loops, make the match length unbounded.
3695	currentLen = INT32_MAX;
3696	break;
3697
3698
3699
3700	case URX_LA_START:
3701	case URX_LA_END:
3702	// Look-ahead. Just ignore, treat the look-ahead block as if
3703	// it were normal pattern. Gives a too-long match length,
3704	// but good enough for now.
3705	break;
3706
3707	// End of look-ahead ops should always be consumed by the processing at
3708	// the URX_LA_START op.
3709	// UPRV_UNREACHABLE;
3710
3711	case URX_LB_START:
3712	{
3713	// Look-behind. Scan forward until the matching look-around end,
3714	// without processing the look-behind block.
3715	int32_t dataLoc = URX_VAL(op);
3716	for (loc = loc + `1`; loc < end; ++loc) {
3717	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3718	int32_t opType = URX_TYPE(op);
3719	if ((opType == URX_LA_END \|\| opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
3720	break;
3721	}
3722	}
3723	U_ASSERT(loc < end);
3724	}
3725	break;
3726
3727	default:
3728	UPRV_UNREACHABLE;
3729	}
3730
3731
3732	if (currentLen == INT32_MAX) {
3733	// The maximum length is unbounded.
3734	// Stop further processing of the pattern.
3735	break;
3736	}
3737
3738	}
3739	return currentLen;
3740
3741	}
3742
3743
3744	//------------------------------------------------------------------------------
3745	//
3746	// stripNOPs Remove any NOP operations from the compiled pattern code.
3747	// Extra NOPs are inserted for some constructs during the initial
3748	// code generation to provide locations that may be patched later.
3749	// Many end up unneeded, and are removed by this function.
3750	//
3751	// In order to minimize the number of passes through the pattern,
3752	// back-reference fixup is also performed here (adjusting
3753	// back-reference operands to point to the correct frame offsets).
3754	//
3755	//------------------------------------------------------------------------------
3756	void RegexCompile::stripNOPs() {
3757
3758	if (U_FAILURE(*fStatus)) {
3759	return;
3760	}
3761
3762	int32_t end = fRXPat->fCompiledPat->size();
3763	UVector32 deltas(end, *fStatus);
3764
3765	// Make a first pass over the code, computing the amount that things
3766	// will be offset at each location in the original code.
3767	int32_t loc;
3768	int32_t d = `0`;
3769	for (loc=`0`; loc<end; loc++) {
3770	deltas.addElement(d, *fStatus);
3771	int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3772	if (URX_TYPE(op) == URX_NOP) {
3773	d++;
3774	}
3775	}
3776
3777	UnicodeString caseStringBuffer;
3778
3779	// Make a second pass over the code, removing the NOPs by moving following
3780	// code up, and patching operands that refer to code locations that
3781	// are being moved. The array of offsets from the first step is used
3782	// to compute the new operand values.
3783	int32_t src;
3784	int32_t dst = `0`;
3785	for (src=`0`; src<end; src++) {
3786	int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3787	int32_t opType = URX_TYPE(op);
3788	switch (opType) {
3789	case URX_NOP:
3790	break;
3791
3792	case URX_STATE_SAVE:
3793	case URX_JMP:
3794	case URX_CTR_LOOP:
3795	case URX_CTR_LOOP_NG:
3796	case URX_RELOC_OPRND:
3797	case URX_JMPX:
3798	case URX_JMP_SAV:
3799	case URX_JMP_SAV_X:
3800	// These are instructions with operands that refer to code locations.
3801	{
3802	int32_t operandAddress = URX_VAL(op);
3803	U_ASSERT(operandAddress>=`0` && operandAddress<deltas.size());
3804	int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3805	op = buildOp(opType, fixedOperandAddress);
3806	fRXPat->fCompiledPat->setElementAt(op, dst);
3807	dst++;
3808	break;
3809	}
3810
3811	case URX_BACKREF:
3812	case URX_BACKREF_I:
3813	{
3814	int32_t where = URX_VAL(op);
3815	if (where > fRXPat->fGroupMap->size()) {
3816	error(U_REGEX_INVALID_BACK_REF);
3817	break;
3818	}
3819	where = fRXPat->fGroupMap->elementAti(where-`1`);
3820	op = buildOp(opType, where);
3821	fRXPat->fCompiledPat->setElementAt(op, dst);
3822	dst++;
3823
3824	fRXPat->fNeedsAltInput = TRUE;
3825	break;
3826	}
3827	case URX_RESERVED_OP:
3828	case URX_RESERVED_OP_N:
3829	case URX_BACKTRACK:
3830	case URX_END:
3831	case URX_ONECHAR:
3832	case URX_STRING:
3833	case URX_STRING_LEN:
3834	case URX_START_CAPTURE:
3835	case URX_END_CAPTURE:
3836	case URX_STATIC_SETREF:
3837	case URX_STAT_SETREF_N:
3838	case URX_SETREF:
3839	case URX_DOTANY:
3840	case URX_FAIL:
3841	case URX_BACKSLASH_B:
3842	case URX_BACKSLASH_BU:
3843	case URX_BACKSLASH_G:
3844	case URX_BACKSLASH_X:
3845	case URX_BACKSLASH_Z:
3846	case URX_DOTANY_ALL:
3847	case URX_BACKSLASH_D:
3848	case URX_CARET:
3849	case URX_DOLLAR:
3850	case URX_CTR_INIT:
3851	case URX_CTR_INIT_NG:
3852	case URX_DOTANY_UNIX:
3853	case URX_STO_SP:
3854	case URX_LD_SP:
3855	case URX_STO_INP_LOC:
3856	case URX_LA_START:
3857	case URX_LA_END:
3858	case URX_ONECHAR_I:
3859	case URX_STRING_I:
3860	case URX_DOLLAR_M:
3861	case URX_CARET_M:
3862	case URX_CARET_M_UNIX:
3863	case URX_LB_START:
3864	case URX_LB_CONT:
3865	case URX_LB_END:
3866	case URX_LBN_CONT:
3867	case URX_LBN_END:
3868	case URX_LOOP_SR_I:
3869	case URX_LOOP_DOT_I:
3870	case URX_LOOP_C:
3871	case URX_DOLLAR_D:
3872	case URX_DOLLAR_MD:
3873	case URX_BACKSLASH_H:
3874	case URX_BACKSLASH_R:
3875	case URX_BACKSLASH_V:
3876	// These instructions are unaltered by the relocation.
3877	fRXPat->fCompiledPat->setElementAt(op, dst);
3878	dst++;
3879	break;
3880
3881	default:
3882	// Some op is unaccounted for.
3883	UPRV_UNREACHABLE;
3884	}
3885	}
3886
3887	fRXPat->fCompiledPat->setSize(dst);
3888	}
3889
3890
3891
3892
3893	//------------------------------------------------------------------------------
3894	//
3895	// Error Report a rule parse error.
3896	// Only report it if no previous error has been recorded.
3897	//
3898	//------------------------------------------------------------------------------
3899	void RegexCompile::error(UErrorCode e) {
3900	if (U_SUCCESS(*fStatus) \|\| e == U_MEMORY_ALLOCATION_ERROR) {
3901	*fStatus = e;
3902	// Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3903	// API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3904	// int64_t. If the values of the latter are out of range for the former,
3905	// set them to the appropriate "field not supported" values.
3906	if (fLineNum > `0x7FFFFFFF`) {
3907	fParseErr->line = `0`;
3908	fParseErr->offset = -`1`;
3909	} else if (fCharNum > `0x7FFFFFFF`) {
3910	fParseErr->line = (int32_t)fLineNum;
3911	fParseErr->offset = -`1`;
3912	} else {
3913	fParseErr->line = (int32_t)fLineNum;
3914	fParseErr->offset = (int32_t)fCharNum;
3915	}
3916
3917	UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3918
3919	// Fill in the context.
3920	// Note: extractBetween() pins supplied indicies to the string bounds.
3921	uprv_memset(fParseErr->preContext, `0`, sizeof(fParseErr->preContext));
3922	uprv_memset(fParseErr->postContext, `0`, sizeof(fParseErr->postContext));
3923	utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+`1`, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3924	utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-`1`, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3925	}
3926	}
3927
3928
3929	//
3930	// Assorted Unicode character constants.
3931	// Numeric because there is no portable way to enter them as literals.
3932	// (Think EBCDIC).
3933	//
3934	static const UChar chCR = `0x0d`; // New lines, for terminating comments.
3935	static const UChar chLF = `0x0a`; // Line Feed
3936	static const UChar chPound = `0x23`; // '#', introduces a comment.
3937	static const UChar chDigit0 = `0x30`; // '0'
3938	static const UChar chDigit7 = `0x37`; // '9'
3939	static const UChar chColon = `0x3A`; // ':'
3940	static const UChar chE = `0x45`; // 'E'
3941	static const UChar chQ = `0x51`; // 'Q'
3942	//static const UChar chN = 0x4E; // 'N'
3943	static const UChar chP = `0x50`; // 'P'
3944	static const UChar chBackSlash = `0x5c`; // '\' introduces a char escape
3945	//static const UChar chLBracket = 0x5b; // '['
3946	static const UChar chRBracket = `0x5d`; // ']'
3947	static const UChar chUp = `0x5e`; // '^'
3948	static const UChar chLowerP = `0x70`;
3949	static const UChar chLBrace = `0x7b`; // '{'
3950	static const UChar chRBrace = `0x7d`; // '}'
3951	static const UChar chNEL = `0x85`; // NEL newline variant
3952	static const UChar chLS = `0x2028`; // Unicode Line Separator
3953
3954
3955	//------------------------------------------------------------------------------
3956	//
3957	// nextCharLL Low Level Next Char from the regex pattern.
3958	// Get a char from the string, keep track of input position
3959	// for error reporting.
3960	//
3961	//------------------------------------------------------------------------------
3962	UChar32 RegexCompile::nextCharLL() {
3963	UChar32 ch;
3964
3965	if (fPeekChar != -`1`) {
3966	ch = fPeekChar;
3967	fPeekChar = -`1`;
3968	return ch;
3969	}
3970
3971	// assume we're already in the right place
3972	ch = UTEXT_NEXT32(fRXPat->fPattern);
3973	if (ch == U_SENTINEL) {
3974	return ch;
3975	}
3976
3977	if (ch == chCR \|\|
3978	ch == chNEL \|\|
3979	ch == chLS \|\|
3980	(ch == chLF && fLastChar != chCR)) {
3981	// Character is starting a new line. Bump up the line number, and
3982	// reset the column to 0.
3983	fLineNum++;
3984	fCharNum=`0`;
3985	}
3986	else {
3987	// Character is not starting a new line. Except in the case of a
3988	// LF following a CR, increment the column position.
3989	if (ch != chLF) {
3990	fCharNum++;
3991	}
3992	}
3993	fLastChar = ch;
3994	return ch;
3995	}
3996
3997	//------------------------------------------------------------------------------
3998	//
3999	// peekCharLL Low Level Character Scanning, sneak a peek at the next
4000	// character without actually getting it.
4001	//
4002	//------------------------------------------------------------------------------
4003	UChar32 RegexCompile::peekCharLL() {
4004	if (fPeekChar == -`1`) {
4005	fPeekChar = nextCharLL();
4006	}
4007	return fPeekChar;
4008	}
4009
4010
4011	//------------------------------------------------------------------------------
4012	//
4013	// nextChar for pattern scanning. At this level, we handle stripping
4014	// out comments and processing some backslash character escapes.
4015	// The rest of the pattern grammar is handled at the next level up.
4016	//
4017	//------------------------------------------------------------------------------
4018	void RegexCompile::nextChar(RegexPatternChar &c) {
4019	tailRecursion:
4020	fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4021	c.fChar = nextCharLL();
4022	c.fQuoted = FALSE;
4023
4024	if (fQuoteMode) {
4025	c.fQuoted = TRUE;
4026	if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == `0`)) \|\|
4027	c.fChar == (UChar32)-`1`) {
4028	fQuoteMode = FALSE; // Exit quote mode,
4029	nextCharLL(); // discard the E
4030	// nextChar(c); // recurse to get the real next char
4031	goto tailRecursion; // Note: fuzz testing produced testcases that
4032	// resulted in stack overflow here.
4033	}
4034	}
4035	else if (fInBackslashQuote) {
4036	// The current character immediately follows a '\'
4037	// Don't check for any further escapes, just return it as-is.
4038	// Don't set c.fQuoted, because that would prevent the state machine from
4039	// dispatching on the character.
4040	fInBackslashQuote = FALSE;
4041	}
4042	else
4043	{
4044	// We are not in a \Q quoted region \E of the source.
4045	//
4046	if (fModeFlags & UREGEX_COMMENTS) {
4047	//
4048	// We are in free-spacing and comments mode.
4049	// Scan through any white space and comments, until we
4050	// reach a significant character or the end of inut.
4051	for (;;) {
4052	if (c.fChar == (UChar32)-`1`) {
4053	break; // End of Input
4054	}
4055	if (c.fChar == chPound && fEOLComments == TRUE) {
4056	// Start of a comment. Consume the rest of it, until EOF or a new line
4057	for (;;) {
4058	c.fChar = nextCharLL();
4059	if (c.fChar == (UChar32)-`1` \|\| // EOF
4060	c.fChar == chCR \|\|
4061	c.fChar == chLF \|\|
4062	c.fChar == chNEL \|\|
4063	c.fChar == chLS) {
4064	break;
4065	}
4066	}
4067	}
4068	// TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061.
4069	if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4070	break;
4071	}
4072	c.fChar = nextCharLL();
4073	}
4074	}
4075
4076	//
4077	// check for backslash escaped characters.
4078	//
4079	if (c.fChar == chBackSlash) {
4080	int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4081	if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4082	//
4083	// A '\' sequence that is handled by ICU's standard unescapeAt function.
4084	// Includes \uxxxx, \n, \r, many others.
4085	// Return the single equivalent character.
4086	//
4087	nextCharLL(); // get & discard the peeked char.
4088	c.fQuoted = TRUE;
4089
4090	if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4091	int32_t endIndex = (int32_t)pos;
4092	c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4093
4094	if (endIndex == pos) {
4095	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4096	}
4097	fCharNum += endIndex - pos;
4098	UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4099	} else {
4100	int32_t offset = `0`;
4101	struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4102
4103	UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4104	c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4105
4106	if (offset == `0`) {
4107	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4108	} else if (context.lastOffset == offset) {
4109	UTEXT_PREVIOUS32(fRXPat->fPattern);
4110	} else if (context.lastOffset != offset-`1`) {
4111	utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - `1`);
4112	}
4113	fCharNum += offset;
4114	}
4115	}
4116	else if (peekCharLL() == chDigit0) {
4117	// Octal Escape, using Java Regexp Conventions
4118	// which are \0 followed by 1-3 octal digits.
4119	// Different from ICU Unescape handling of Octal, which does not
4120	// require the leading 0.
4121	// Java also has the convention of only consuming 2 octal digits if
4122	// the three digit number would be > 0xff
4123	//
4124	c.fChar = `0`;
4125	nextCharLL(); // Consume the initial 0.
4126	int index;
4127	for (index=`0`; index<`3`; index++) {
4128	int32_t ch = peekCharLL();
4129	if (ch<chDigit0 \|\| ch>chDigit7) {
4130	if (index==`0`) {
4131	// \0 is not followed by any octal digits.
4132	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4133	}
4134	break;
4135	}
4136	c.fChar <<= `3`;
4137	c.fChar += ch&`7`;
4138	if (c.fChar <= `255`) {
4139	nextCharLL();
4140	} else {
4141	// The last digit made the number too big. Forget we saw it.
4142	c.fChar >>= `3`;
4143	}
4144	}
4145	c.fQuoted = TRUE;
4146	}
4147	else if (peekCharLL() == chQ) {
4148	// "\Q" enter quote mode, which will continue until "\E"
4149	fQuoteMode = TRUE;
4150	nextCharLL(); // discard the 'Q'.
4151	// nextChar(c); // recurse to get the real next char.
4152	goto tailRecursion; // Note: fuzz testing produced test cases that
4153	// resulted in stack overflow here.
4154	}
4155	else
4156	{
4157	// We are in a '\' escape that will be handled by the state table scanner.
4158	// Just return the backslash, but remember that the following char is to
4159	// be taken literally.
4160	fInBackslashQuote = TRUE;
4161	}
4162	}
4163	}
4164
4165	// re-enable # to end-of-line comments, in case they were disabled.
4166	// They are disabled by the parser upon seeing '(?', but this lasts for
4167	// the fetching of the next character only.
4168	fEOLComments = TRUE;
4169
4170	// putc(c.fChar, stdout);
4171	}
4172
4173
4174
4175	//------------------------------------------------------------------------------
4176	//
4177	// scanNamedChar
4178	// Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4179	//
4180	// The scan position will be at the 'N'. On return
4181	// the scan position should be just after the '}'
4182	//
4183	// Return the UChar32
4184	//
4185	//------------------------------------------------------------------------------
4186	UChar32 RegexCompile::scanNamedChar() {
4187	if (U_FAILURE(*fStatus)) {
4188	return `0`;
4189	}
4190
4191	nextChar(fC);
4192	if (fC.fChar != chLBrace) {
4193	error(U_REGEX_PROPERTY_SYNTAX);
4194	return `0`;
4195	}
4196
4197	UnicodeString charName;
4198	for (;;) {
4199	nextChar(fC);
4200	if (fC.fChar == chRBrace) {
4201	break;
4202	}
4203	if (fC.fChar == -`1`) {
4204	error(U_REGEX_PROPERTY_SYNTAX);
4205	return `0`;
4206	}
4207	charName.append(fC.fChar);
4208	}
4209
4210	char name[`100`];
4211	if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) \|\|
4212	(uint32_t)charName.length()>=sizeof(name)) {
4213	// All Unicode character names have only invariant characters.
4214	// The API to get a character, given a name, accepts only char , forcing us to convert,*
4215	// which requires this error check
4216	error(U_REGEX_PROPERTY_SYNTAX);
4217	return `0`;
4218	}
4219	charName.extract(`0`, charName.length(), name, sizeof(name), US_INV);
4220
4221	UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4222	if (U_FAILURE(*fStatus)) {
4223	error(U_REGEX_PROPERTY_SYNTAX);
4224	}
4225
4226	nextChar(fC); // Continue overall regex pattern processing with char after the '}'
4227	return theChar;
4228	}
4229
4230	//------------------------------------------------------------------------------
4231	//
4232	// scanProp Construct a UnicodeSet from the text at the current scan
4233	// position, which will be of the form \p{whaterver}
4234	//
4235	// The scan position will be at the 'p' or 'P'. On return
4236	// the scan position should be just after the '}'
4237	//
4238	// Return a UnicodeSet, constructed from the \P pattern,
4239	// or NULL if the pattern is invalid.
4240	//
4241	//------------------------------------------------------------------------------
4242	UnicodeSet *RegexCompile::scanProp() {
4243	UnicodeSet *uset = NULL;
4244
4245	if (U_FAILURE(*fStatus)) {
4246	return NULL;
4247	}
4248	(void)chLowerP; // Suppress compiler unused variable warning.
4249	U_ASSERT(fC.fChar == chLowerP \|\| fC.fChar == chP);
4250	UBool negated = (fC.fChar == chP);
4251
4252	UnicodeString propertyName;
4253	nextChar(fC);
4254	if (fC.fChar != chLBrace) {
4255	error(U_REGEX_PROPERTY_SYNTAX);
4256	return NULL;
4257	}
4258	for (;;) {
4259	nextChar(fC);
4260	if (fC.fChar == chRBrace) {
4261	break;
4262	}
4263	if (fC.fChar == -`1`) {
4264	// Hit the end of the input string without finding the closing '}'
4265	error(U_REGEX_PROPERTY_SYNTAX);
4266	return NULL;
4267	}
4268	propertyName.append(fC.fChar);
4269	}
4270	uset = createSetForProperty(propertyName, negated);
4271	nextChar(fC); // Move input scan to position following the closing '}'
4272	return uset;
4273	}
4274
4275	//------------------------------------------------------------------------------
4276	//
4277	// scanPosixProp Construct a UnicodeSet from the text at the current scan
4278	// position, which is expected be of the form [:property expression:]
4279	//
4280	// The scan position will be at the opening ':'. On return
4281	// the scan position must be on the closing ']'
4282	//
4283	// Return a UnicodeSet constructed from the pattern,
4284	// or NULL if this is not a valid POSIX-style set expression.
4285	// If not a property expression, restore the initial scan position
4286	// (to the opening ':')
4287	//
4288	// Note: the opening '[:' is not sufficient to guarantee that
4289	// this is a [:property:] expression.
4290	// [:'+=,] is a perfectly good ordinary set expression that
4291	// happens to include ':' as one of its characters.
4292	//
4293	//------------------------------------------------------------------------------
4294	UnicodeSet *RegexCompile::scanPosixProp() {
4295	UnicodeSet *uset = NULL;
4296
4297	if (U_FAILURE(*fStatus)) {
4298	return NULL;
4299	}
4300
4301	U_ASSERT(fC.fChar == chColon);
4302
4303	// Save the scanner state.
4304	// TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
4305	int64_t savedScanIndex = fScanIndex;
4306	int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4307	UBool savedQuoteMode = fQuoteMode;
4308	UBool savedInBackslashQuote = fInBackslashQuote;
4309	UBool savedEOLComments = fEOLComments;
4310	int64_t savedLineNum = fLineNum;
4311	int64_t savedCharNum = fCharNum;
4312	UChar32 savedLastChar = fLastChar;
4313	UChar32 savedPeekChar = fPeekChar;
4314	RegexPatternChar savedfC = fC;
4315
4316	// Scan for a closing ]. A little tricky because there are some perverse
4317	// edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
4318	// ending on the second closing ].
4319
4320	UnicodeString propName;
4321	UBool negated = FALSE;
4322
4323	// Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
4324	nextChar(fC);
4325	if (fC.fChar == chUp) {
4326	negated = TRUE;
4327	nextChar(fC);
4328	}
4329
4330	// Scan for the closing ":]", collecting the property name along the way.
4331	UBool sawPropSetTerminator = FALSE;
4332	for (;;) {
4333	propName.append(fC.fChar);
4334	nextChar(fC);
4335	if (fC.fQuoted \|\| fC.fChar == -`1`) {
4336	// Escaped characters or end of input - either says this isn't a [:Property:]
4337	break;
4338	}
4339	if (fC.fChar == chColon) {
4340	nextChar(fC);
4341	if (fC.fChar == chRBracket) {
4342	sawPropSetTerminator = TRUE;
4343	}
4344	break;
4345	}
4346	}
4347
4348	if (sawPropSetTerminator) {
4349	uset = createSetForProperty(propName, negated);
4350	}
4351	else
4352	{
4353	// No closing ":]".
4354	// Restore the original scan position.
4355	// The main scanner will retry the input as a normal set expression,
4356	// not a [:Property:] expression.
4357	fScanIndex = savedScanIndex;
4358	fQuoteMode = savedQuoteMode;
4359	fInBackslashQuote = savedInBackslashQuote;
4360	fEOLComments = savedEOLComments;
4361	fLineNum = savedLineNum;
4362	fCharNum = savedCharNum;
4363	fLastChar = savedLastChar;
4364	fPeekChar = savedPeekChar;
4365	fC = savedfC;
4366	UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4367	}
4368	return uset;
4369	}
4370
4371	static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4372	set->add(`0`, `8`).add(`0x0e`, `0x1b`).add(`0x7f`, `0x9f`);
4373	addCategory(set, U_GC_CF_MASK, ec);
4374	}
4375
4376	//
4377	// Create a Unicode Set from a Unicode Property expression.
4378	// This is common code underlying both \p{...} ane [:...:] expressions.
4379	// Includes trying the Java "properties" that aren't supported as
4380	// normal ICU UnicodeSet properties
4381	//
4382	UnicodeSet RegexCompile::createSetForProperty(const* UnicodeString &propName, UBool negated) {
4383
4384	if (U_FAILURE(*fStatus)) {
4385	return nullptr;
4386	}
4387	LocalPointer<UnicodeSet> set;
4388	UErrorCode status = U_ZERO_ERROR;
4389
4390	do { // non-loop, exists to allow breaks from the block.
4391	//
4392	// First try the property as we received it
4393	//
4394	UnicodeString setExpr;
4395	uint32_t usetFlags = `0`;
4396	setExpr.append(u"[\\p{", -`1`);
4397	setExpr.append(propName);
4398	setExpr.append(u"}]", -`1`);
4399	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4400	usetFlags \|= USET_CASE_INSENSITIVE;
4401	}
4402	set.adoptInsteadAndCheckErrorCode(new UnicodeSet (setExpr, usetFlags, NULL, status), status);
4403	if (U_SUCCESS(status) \|\| status == U_MEMORY_ALLOCATION_ERROR) {
4404	break;
4405	}
4406
4407	//
4408	// The incoming property wasn't directly recognized by ICU.
4409
4410	// Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4411	// Java accepts 'word' with mixed case.
4412	// Java accepts 'all' only in all lower case.
4413
4414	status = U_ZERO_ERROR;
4415	if (propName.caseCompare(u"word", -`1`, `0`) == `0`) {
4416	set.adoptInsteadAndCheckErrorCode(new UnicodeSet (*(fRXPat->fStaticSets[URX_ISWORD_SET])), status);
4417	break;
4418	}
4419	if (propName.compare(u"all", -`1`) == `0`) {
4420	set.adoptInsteadAndCheckErrorCode(new UnicodeSet (`0`, `0x10ffff`), status);
4421	break;
4422	}
4423
4424
4425	// Do Java InBlock expressions
4426	//
4427	UnicodeString mPropName = propName;
4428	if (mPropName.startsWith(u"In", `2`) && mPropName.length() >= `3`) {
4429	status = U_ZERO_ERROR;
4430	set.adoptInsteadAndCheckErrorCode(new UnicodeSet (), status);
4431	if (U_FAILURE(status)) {
4432	break;
4433	}
4434	UnicodeString blockName(mPropName, `2`); // Property with the leading "In" removed.
4435	set ->applyPropertyAlias(UnicodeString (u"Block"), blockName, status);
4436	break;
4437	}
4438
4439	// Check for the Java form "IsBooleanPropertyValue", which we will recast
4440	// as "BooleanPropertyValue". The property value can be either a
4441	// a General Category or a Script Name.
4442
4443	if (propName.startsWith(u"Is", `2`) && propName.length()>=`3`) {
4444	mPropName.remove(`0`, `2`); // Strip the "Is"
4445	if (mPropName.indexOf(u`'='`) >= `0`) {
4446	// Reject any "Is..." property expression containing an '=', that is,
4447	// any non-binary property expression.
4448	status = U_REGEX_PROPERTY_SYNTAX;
4449	break;
4450	}
4451
4452	if (mPropName.caseCompare(u"assigned", -`1`, `0`) == `0`) {
4453	mPropName.setTo(u"unassigned", -`1`);
4454	negated = !negated;
4455	} else if (mPropName.caseCompare(u"TitleCase", -`1`, `0`) == `0`) {
4456	mPropName.setTo(u"Titlecase_Letter", -`1`);
4457	}
4458
4459	mPropName.insert(`0`, u"[\\p{", -`1`);
4460	mPropName.append(u"}]", -`1`);
4461	set.adoptInsteadAndCheckErrorCode(new UnicodeSet (mPropName, *fStatus), status);
4462
4463	if (U_SUCCESS(status) && !set ->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4464	set ->closeOver(USET_CASE_INSENSITIVE);
4465	}
4466	break;
4467
4468	}
4469
4470	if (propName.startsWith(u"java", -`1`)) {
4471	status = U_ZERO_ERROR;
4472	set.adoptInsteadAndCheckErrorCode(new UnicodeSet (), status);
4473	if (U_FAILURE(status)) {
4474	break;
4475	}
4476	//
4477	// Try the various Java specific properties.
4478	// These all begin with "java"
4479	//
4480	if (propName.compare(u"javaDefined", -`1`) == `0`) {
4481	addCategory(set.getAlias(), U_GC_CN_MASK, status);
4482	set ->complement();
4483	}
4484	else if (propName.compare(u"javaDigit", -`1`) == `0`) {
4485	addCategory(set.getAlias(), U_GC_ND_MASK, status);
4486	}
4487	else if (propName.compare(u"javaIdentifierIgnorable", -`1`) == `0`) {
4488	addIdentifierIgnorable(set.getAlias(), status);
4489	}
4490	else if (propName.compare(u"javaISOControl", -`1`) == `0`) {
4491	set ->add(`0`, `0x1F`).add(`0x7F`, `0x9F`);
4492	}
4493	else if (propName.compare(u"javaJavaIdentifierPart", -`1`) == `0`) {
4494	addCategory(set.getAlias(), U_GC_L_MASK, status);
4495	addCategory(set.getAlias(), U_GC_SC_MASK, status);
4496	addCategory(set.getAlias(), U_GC_PC_MASK, status);
4497	addCategory(set.getAlias(), U_GC_ND_MASK, status);
4498	addCategory(set.getAlias(), U_GC_NL_MASK, status);
4499	addCategory(set.getAlias(), U_GC_MC_MASK, status);
4500	addCategory(set.getAlias(), U_GC_MN_MASK, status);
4501	addIdentifierIgnorable(set.getAlias(), status);
4502	}
4503	else if (propName.compare(u"javaJavaIdentifierStart", -`1`) == `0`) {
4504	addCategory(set.getAlias(), U_GC_L_MASK, status);
4505	addCategory(set.getAlias(), U_GC_NL_MASK, status);
4506	addCategory(set.getAlias(), U_GC_SC_MASK, status);
4507	addCategory(set.getAlias(), U_GC_PC_MASK, status);
4508	}
4509	else if (propName.compare(u"javaLetter", -`1`) == `0`) {
4510	addCategory(set.getAlias(), U_GC_L_MASK, status);
4511	}
4512	else if (propName.compare(u"javaLetterOrDigit", -`1`) == `0`) {
4513	addCategory(set.getAlias(), U_GC_L_MASK, status);
4514	addCategory(set.getAlias(), U_GC_ND_MASK, status);
4515	}
4516	else if (propName.compare(u"javaLowerCase", -`1`) == `0`) {
4517	addCategory(set.getAlias(), U_GC_LL_MASK, status);
4518	}
4519	else if (propName.compare(u"javaMirrored", -`1`) == `0`) {
4520	set ->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, `1`, status);
4521	}
4522	else if (propName.compare(u"javaSpaceChar", -`1`) == `0`) {
4523	addCategory(set.getAlias(), U_GC_Z_MASK, status);
4524	}
4525	else if (propName.compare(u"javaSupplementaryCodePoint", -`1`) == `0`) {
4526	set ->add(`0x10000`, UnicodeSet::MAX_VALUE);
4527	}
4528	else if (propName.compare(u"javaTitleCase", -`1`) == `0`) {
4529	addCategory(set.getAlias(), U_GC_LT_MASK, status);
4530	}
4531	else if (propName.compare(u"javaUnicodeIdentifierStart", -`1`) == `0`) {
4532	addCategory(set.getAlias(), U_GC_L_MASK, status);
4533	addCategory(set.getAlias(), U_GC_NL_MASK, status);
4534	}
4535	else if (propName.compare(u"javaUnicodeIdentifierPart", -`1`) == `0`) {
4536	addCategory(set.getAlias(), U_GC_L_MASK, status);
4537	addCategory(set.getAlias(), U_GC_PC_MASK, status);
4538	addCategory(set.getAlias(), U_GC_ND_MASK, status);
4539	addCategory(set.getAlias(), U_GC_NL_MASK, status);
4540	addCategory(set.getAlias(), U_GC_MC_MASK, status);
4541	addCategory(set.getAlias(), U_GC_MN_MASK, status);
4542	addIdentifierIgnorable(set.getAlias(), status);
4543	}
4544	else if (propName.compare(u"javaUpperCase", -`1`) == `0`) {
4545	addCategory(set.getAlias(), U_GC_LU_MASK, status);
4546	}
4547	else if (propName.compare(u"javaValidCodePoint", -`1`) == `0`) {
4548	set ->add(`0`, UnicodeSet::MAX_VALUE);
4549	}
4550	else if (propName.compare(u"javaWhitespace", -`1`) == `0`) {
4551	addCategory(set.getAlias(), U_GC_Z_MASK, status);
4552	set ->removeAll(UnicodeSet ().add(`0xa0`).add(`0x2007`).add(`0x202f`));
4553	set ->add(`9`, `0x0d`).add(`0x1c`, `0x1f`);
4554	} else {
4555	status = U_REGEX_PROPERTY_SYNTAX;
4556	}
4557
4558	if (U_SUCCESS(status) && !set ->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4559	set ->closeOver(USET_CASE_INSENSITIVE);
4560	}
4561	break;
4562	}
4563
4564	// Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4565	// extensions matched it.
4566	status = U_REGEX_PROPERTY_SYNTAX;
4567	} while (false); // End of do loop block. Code above breaks out of the block on success or hard failure.
4568
4569	if (U_SUCCESS(status)) {
4570	U_ASSERT(set.isValid());
4571	if (negated) {
4572	set ->complement();
4573	}
4574	return set.orphan();
4575	} else {
4576	if (status == U_ILLEGAL_ARGUMENT_ERROR) {
4577	status = U_REGEX_PROPERTY_SYNTAX;
4578	}
4579	error(status);
4580	return nullptr;
4581	}
4582	}
4583
4584
4585	//
4586	// SetEval Part of the evaluation of [set expressions].
4587	// Perform any pending (stacked) operations with precedence
4588	// equal or greater to that of the next operator encountered
4589	// in the expression.
4590	//
4591	void RegexCompile::setEval(int32_t nextOp) {
4592	UnicodeSet *rightOperand = NULL;
4593	UnicodeSet *leftOperand = NULL;
4594	for (;;) {
4595	U_ASSERT(fSetOpStack.empty()==FALSE);
4596	int32_t pendingSetOperation = fSetOpStack.peeki();
4597	if ((pendingSetOperation&`0xffff0000`) < (nextOp&`0xffff0000`)) {
4598	break;
4599	}
4600	fSetOpStack.popi();
4601	U_ASSERT(fSetStack.empty() == FALSE);
4602	rightOperand = (UnicodeSet *)fSetStack.peek();
4603	switch (pendingSetOperation) {
4604	case setNegation:
4605	rightOperand->complement();
4606	break;
4607	case setCaseClose:
4608	// TODO: need a simple close function. Ticket 6065
4609	rightOperand->closeOver(USET_CASE_INSENSITIVE);
4610	rightOperand->removeAllStrings();
4611	break;
4612	case setDifference1:
4613	case setDifference2:
4614	fSetStack.pop();
4615	leftOperand = (UnicodeSet *)fSetStack.peek();
4616	leftOperand->removeAll(*rightOperand);
4617	delete rightOperand;
4618	break;
4619	case setIntersection1:
4620	case setIntersection2:
4621	fSetStack.pop();
4622	leftOperand = (UnicodeSet *)fSetStack.peek();
4623	leftOperand->retainAll(*rightOperand);
4624	delete rightOperand;
4625	break;
4626	case setUnion:
4627	fSetStack.pop();
4628	leftOperand = (UnicodeSet *)fSetStack.peek();
4629	leftOperand->addAll(*rightOperand);
4630	delete rightOperand;
4631	break;
4632	default:
4633	UPRV_UNREACHABLE;
4634	}
4635	}
4636	}
4637
4638	void RegexCompile::setPushOp(int32_t op) {
4639	setEval(op);
4640	fSetOpStack.push(op, *fStatus);
4641	fSetStack.push(new UnicodeSet (), *fStatus);
4642	}
4643
4644	U_NAMESPACE_END
4645	#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
4646
4647

Browse the source code of Skia/third_party/externals/icu/source/i18n/regexcmp.cpp