parse.cc source code [RE2/re2/parse.cc]

1	// Copyright 2006 The RE2 Authors. All Rights Reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	// Regular expression parser.
6
7	// The parser is a simple precedence-based parser with a
8	// manual stack. The parsing work is done by the methods
9	// of the ParseState class. The Regexp::Parse function is
10	// essentially just a lexer that calls the ParseState method
11	// for each token.
12
13	// The parser recognizes POSIX extended regular expressions
14	// excluding backreferences, collating elements, and collating
15	// classes. It also allows the empty string as a regular expression
16	// and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W.
17	// See regexp.h for rationale.
18
19	#include <ctype.h>
20	#include <stddef.h>
21	#include <stdint.h>
22	#include <string.h>
23	#include <algorithm>
24	#include <map>
25	#include <string>
26	#include <vector>
27
28	#include "util/util.h"
29	#include "util/logging.h"
30	#include "util/strutil.h"
31	#include "util/utf.h"
32	#include "re2/pod_array.h"
33	#include "re2/regexp.h"
34	#include "re2/stringpiece.h"
35	#include "re2/unicode_casefold.h"
36	#include "re2/unicode_groups.h"
37	#include "re2/walker-inl.h"
38
39	#if defined(RE2_USE_ICU)
40	#include "unicode/uniset.h"
41	#include "unicode/unistr.h"
42	#include "unicode/utypes.h"
43	#endif
44
45	namespace re2 {
46
47	// Reduce the maximum repeat count by an order of magnitude when fuzzing.
48	#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
49	static const int kMaxRepeat = `100`;
50	#else
51	static const int kMaxRepeat = `1000`;
52	#endif
53
54	// Regular expression parse state.
55	// The list of parsed regexps so far is maintained as a vector of
56	// Regexp pointers called the stack. Left parenthesis and vertical
57	// bar markers are also placed on the stack, as Regexps with
58	// non-standard opcodes.
59	// Scanning a left parenthesis causes the parser to push a left parenthesis
60	// marker on the stack.
61	// Scanning a vertical bar causes the parser to pop the stack until it finds a
62	// vertical bar or left parenthesis marker (not popping the marker),
63	// concatenate all the popped results, and push them back on
64	// the stack (DoConcatenation).
65	// Scanning a right parenthesis causes the parser to act as though it
66	// has seen a vertical bar, which then leaves the top of the stack in the
67	// form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar.
68	// The parser pops all this off the stack and creates an alternation of the
69	// regexps (DoAlternation).
70
71	class Regexp::ParseState {
72	public:
73	ParseState(ParseFlags flags, const StringPiece& whole_regexp,
74	RegexpStatus* status);
75	~ParseState();
76
77	ParseFlags flags() { return flags_; }
78	int rune_max() { return rune_max_; }
79
80	// Parse methods. All public methods return a bool saying
81	// whether parsing should continue. If a method returns
82	// false, it has set fields in status_, and the parser*
83	// should return NULL.
84
85	// Pushes the given regular expression onto the stack.
86	// Could check for too much memory used here.
87	bool PushRegexp(Regexp* re);
88
89	// Pushes the literal rune r onto the stack.
90	bool PushLiteral(Rune r);
91
92	// Pushes a regexp with the given op (and no args) onto the stack.
93	bool PushSimpleOp(RegexpOp op);
94
95	// Pushes a ^ onto the stack.
96	bool PushCarat();
97
98	// Pushes a \b (word == true) or \B (word == false) onto the stack.
99	bool PushWordBoundary(bool word);
100
101	// Pushes a $ onto the stack.
102	bool PushDollar();
103
104	// Pushes a . onto the stack
105	bool PushDot();
106
107	// Pushes a repeat operator regexp onto the stack.
108	// A valid argument for the operator must already be on the stack.
109	// s is the name of the operator, for use in error messages.
110	bool PushRepeatOp(RegexpOp op, const StringPiece& s, bool nongreedy);
111
112	// Pushes a repetition regexp onto the stack.
113	// A valid argument for the operator must already be on the stack.
114	bool PushRepetition(int min, int max, const StringPiece& s, bool nongreedy);
115
116	// Checks whether a particular regexp op is a marker.
117	bool IsMarker(RegexpOp op);
118
119	// Processes a left parenthesis in the input.
120	// Pushes a marker onto the stack.
121	bool DoLeftParen(const StringPiece& name);
122	bool DoLeftParenNoCapture();
123
124	// Processes a vertical bar in the input.
125	bool DoVerticalBar();
126
127	// Processes a right parenthesis in the input.
128	bool DoRightParen();
129
130	// Processes the end of input, returning the final regexp.
131	Regexp* DoFinish();
132
133	// Finishes the regexp if necessary, preparing it for use
134	// in a more complicated expression.
135	// If it is a CharClassBuilder, converts into a CharClass.
136	Regexp* FinishRegexp(Regexp*);
137
138	// These routines don't manipulate the parse stack
139	// directly, but they do need to look at flags_.
140	// ParseCharClass also manipulates the internals of Regexp
141	// while creating out_re.*
142
143	// Parse a character class into out_re.*
144	// Removes parsed text from s.
145	bool ParseCharClass(StringPiece* s, Regexp** out_re,
146	RegexpStatus* status);
147
148	// Parse a character class character into rp.*
149	// Removes parsed text from s.
150	bool ParseCCCharacter(StringPiece* s, Rune *rp,
151	const StringPiece& whole_class,
152	RegexpStatus* status);
153
154	// Parse a character class range into rr.
155	// Removes parsed text from s.
156	bool ParseCCRange(StringPiece* s, RuneRange* rr,
157	const StringPiece& whole_class,
158	RegexpStatus* status);
159
160	// Parse a Perl flag set or non-capturing group from s.
161	bool ParsePerlFlags(StringPiece* s);
162
163
164	// Finishes the current concatenation,
165	// collapsing it into a single regexp on the stack.
166	void DoConcatenation();
167
168	// Finishes the current alternation,
169	// collapsing it to a single regexp on the stack.
170	void DoAlternation();
171
172	// Generalized DoAlternation/DoConcatenation.
173	void DoCollapse(RegexpOp op);
174
175	// Maybe concatenate Literals into LiteralString.
176	bool MaybeConcatString(int r, ParseFlags flags);
177
178	private:
179	ParseFlags flags_;
180	StringPiece whole_regexp_;
181	RegexpStatus* status_;
182	Regexp* stacktop_;
183	int ncap_; // number of capturing parens seen
184	int rune_max_; // maximum char value for this encoding
185
186	ParseState(const ParseState&) = delete;
187	ParseState& operator=(const ParseState&) = delete;
188	};
189
190	// Pseudo-operators - only on parse stack.
191	const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+`1`);
192	const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+`2`);
193
194	Regexp::ParseState::ParseState(ParseFlags flags,
195	const StringPiece& whole_regexp,
196	RegexpStatus* status)
197	: flags_(flags), whole_regexp_(whole_regexp),
198	status_(status), stacktop_(NULL), ncap_(`0`) {
199	if (flags_ & Latin1)
200	rune_max_ = `0xFF`;
201	else
202	rune_max_ = Runemax;
203	}
204
205	// Cleans up by freeing all the regexps on the stack.
206	Regexp::ParseState::~ParseState() {
207	Regexp* next;
208	for (Regexp* re = stacktop_; re != NULL; re = next) {
209	next = re->down_;
210	re->down_ = NULL;
211	if (re->op() == kLeftParen)
212	delete re->name_;
213	re->Decref();
214	}
215	}
216
217	// Finishes the regexp if necessary, preparing it for use in
218	// a more complex expression.
219	// If it is a CharClassBuilder, converts into a CharClass.
220	Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) {
221	if (re == NULL)
222	return NULL;
223	re->down_ = NULL;
224
225	if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
226	CharClassBuilder* ccb = re->ccb_;
227	re->ccb_ = NULL;
228	re->cc_ = ccb->GetCharClass();
229	delete ccb;
230	}
231
232	return re;
233	}
234
235	// Pushes the given regular expression onto the stack.
236	// Could check for too much memory used here.
237	bool Regexp::ParseState::PushRegexp(Regexp* re) {
238	MaybeConcatString(-`1`, NoParseFlags);
239
240	// Special case: a character class of one character is just
241	// a literal. This is a common idiom for escaping
242	// single characters (e.g., [.] instead of \.), and some
243	// analysis does better with fewer character classes.
244	// Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
245	if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
246	re->ccb_->RemoveAbove(rune_max_);
247	if (re->ccb_->size() == `1`) {
248	Rune r = re->ccb_->begin()->lo;
249	re->Decref();
250	re = new Regexp (kRegexpLiteral, flags_);
251	re->rune_ = r;
252	} else if (re->ccb_->size() == `2`) {
253	Rune r = re->ccb_->begin()->lo;
254	if (`'A'` <= r && r <= `'Z'` && re->ccb_->Contains(r + `'a'` - `'A'`)) {
255	re->Decref();
256	re = new Regexp (kRegexpLiteral, flags_ \| FoldCase);
257	re->rune_ = r + `'a'` - `'A'`;
258	}
259	}
260	}
261
262	if (!IsMarker(re->op()))
263	re->simple_ = re->ComputeSimple();
264	re->down_ = stacktop_;
265	stacktop_ = re;
266	return true;
267	}
268
269	// Searches the case folding tables and returns the CaseFold that contains r.*
270	// If there isn't one, returns the CaseFold with smallest f->lo bigger than r.*
271	// If there isn't one, returns NULL.
272	const CaseFold* LookupCaseFold(const CaseFold f, int* n, Rune r) {
273	const CaseFold* ef = f + n;
274
275	// Binary search for entry containing r.
276	while (n > `0`) {
277	int m = n/`2`;
278	if (f[m].lo <= r && r <= f[m].hi)
279	return &f[m];
280	if (r < f[m].lo) {
281	n = m;
282	} else {
283	f += m+`1`;
284	n -= m+`1`;
285	}
286	}
287
288	// There is no entry that contains r, but f points
289	// where it would have been. Unless f points at
290	// the end of the array, it points at the next entry
291	// after r.
292	if (f < ef)
293	return f;
294
295	// No entry contains r; no entry contains runes > r.
296	return NULL;
297	}
298
299	// Returns the result of applying the fold f to the rune r.
300	Rune ApplyFold(const CaseFold *f, Rune r) {
301	switch (f->delta) {
302	default:
303	return r + f->delta;
304
305	case EvenOddSkip: // even <-> odd but only applies to every other
306	if ((r - f->lo) % `2`)
307	return r;
308	FALLTHROUGH_INTENDED;
309	case EvenOdd: // even <-> odd
310	if (r%`2` == `0`)
311	return r + `1`;
312	return r - `1`;
313
314	case OddEvenSkip: // odd <-> even but only applies to every other
315	if ((r - f->lo) % `2`)
316	return r;
317	FALLTHROUGH_INTENDED;
318	case OddEven: // odd <-> even
319	if (r%`2` == `1`)
320	return r + `1`;
321	return r - `1`;
322	}
323	}
324
325	// Returns the next Rune in r's folding cycle (see unicode_casefold.h).
326	// Examples:
327	// CycleFoldRune('A') = 'a'
328	// CycleFoldRune('a') = 'A'
329	//
330	// CycleFoldRune('K') = 'k'
331	// CycleFoldRune('k') = 0x212A (Kelvin)
332	// CycleFoldRune(0x212A) = 'K'
333	//
334	// CycleFoldRune('?') = '?'
335	Rune CycleFoldRune(Rune r) {
336	const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r);
337	if (f == NULL \|\| r < f->lo)
338	return r;
339	return ApplyFold(f, r);
340	}
341
342	// Add lo-hi to the class, along with their fold-equivalent characters.
343	// If lo-hi is already in the class, assume that the fold-equivalent
344	// chars are there too, so there's no work to do.
345	static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
346	// AddFoldedRange calls itself recursively for each rune in the fold cycle.
347	// Most folding cycles are small: there aren't any bigger than four in the
348	// current Unicode tables. make_unicode_casefold.py checks that
349	// the cycles are not too long, and we double-check here using depth.
350	if (depth > `10`) {
351	LOG(DFATAL) << "AddFoldedRange recurses too much.";
352	return;
353	}
354
355	if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done
356	return;
357
358	while (lo <= hi) {
359	const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo);
360	if (f == NULL) // lo has no fold, nor does anything above lo
361	break;
362	if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo
363	lo = f->lo;
364	continue;
365	}
366
367	// Add in the result of folding the range lo - f->hi
368	// and that range's fold, recursively.
369	Rune lo1 = lo;
370	Rune hi1 = std::min<Rune>(hi, f->hi);
371	switch (f->delta) {
372	default:
373	lo1 += f->delta;
374	hi1 += f->delta;
375	break;
376	case EvenOdd:
377	if (lo1%`2` == `1`)
378	lo1--;
379	if (hi1%`2` == `0`)
380	hi1++;
381	break;
382	case OddEven:
383	if (lo1%`2` == `0`)
384	lo1--;
385	if (hi1%`2` == `1`)
386	hi1++;
387	break;
388	}
389	AddFoldedRange(cc, lo1, hi1, depth+`1`);
390
391	// Pick up where this fold left off.
392	lo = f->hi + `1`;
393	}
394	}
395
396	// Pushes the literal rune r onto the stack.
397	bool Regexp::ParseState::PushLiteral(Rune r) {
398	// Do case folding if needed.
399	if ((flags_ & FoldCase) && CycleFoldRune(r) != r) {
400	Regexp* re = new Regexp (kRegexpCharClass, flags_ & ~FoldCase);
401	re->ccb_ = new CharClassBuilder;
402	Rune r1 = r;
403	do {
404	if (!(flags_ & NeverNL) \|\| r != `'\n'`) {
405	re->ccb_->AddRange(r, r);
406	}
407	r = CycleFoldRune(r);
408	} while (r != r1);
409	return PushRegexp(re);
410	}
411
412	// Exclude newline if applicable.
413	if ((flags_ & NeverNL) && r == `'\n'`)
414	return PushRegexp(new Regexp (kRegexpNoMatch, flags_));
415
416	// No fancy stuff worked. Ordinary literal.
417	if (MaybeConcatString(r, flags_))
418	return true;
419
420	Regexp* re = new Regexp (kRegexpLiteral, flags_);
421	re->rune_ = r;
422	return PushRegexp(re);
423	}
424
425	// Pushes a ^ onto the stack.
426	bool Regexp::ParseState::PushCarat() {
427	if (flags_ & OneLine) {
428	return PushSimpleOp(kRegexpBeginText);
429	}
430	return PushSimpleOp(kRegexpBeginLine);
431	}
432
433	// Pushes a \b or \B onto the stack.
434	bool Regexp::ParseState::PushWordBoundary(bool word) {
435	if (word)
436	return PushSimpleOp(kRegexpWordBoundary);
437	return PushSimpleOp(kRegexpNoWordBoundary);
438	}
439
440	// Pushes a $ onto the stack.
441	bool Regexp::ParseState::PushDollar() {
442	if (flags_ & OneLine) {
443	// Clumsy marker so that MimicsPCRE() can tell whether
444	// this kRegexpEndText was a $ and not a \z.
445	Regexp::ParseFlags oflags = flags_;
446	flags_ = flags_ \| WasDollar;
447	bool ret = PushSimpleOp(kRegexpEndText);
448	flags_ = oflags;
449	return ret;
450	}
451	return PushSimpleOp(kRegexpEndLine);
452	}
453
454	// Pushes a . onto the stack.
455	bool Regexp::ParseState::PushDot() {
456	if ((flags_ & DotNL) && !(flags_ & NeverNL))
457	return PushSimpleOp(kRegexpAnyChar);
458	// Rewrite . into [^\n]
459	Regexp* re = new Regexp (kRegexpCharClass, flags_ & ~FoldCase);
460	re->ccb_ = new CharClassBuilder;
461	re->ccb_->AddRange(`0`, `'\n'` - `1`);
462	re->ccb_->AddRange(`'\n'` + `1`, rune_max_);
463	return PushRegexp(re);
464	}
465
466	// Pushes a regexp with the given op (and no args) onto the stack.
467	bool Regexp::ParseState::PushSimpleOp(RegexpOp op) {
468	Regexp* re = new Regexp (op, flags_);
469	return PushRegexp(re);
470	}
471
472	// Pushes a repeat operator regexp onto the stack.
473	// A valid argument for the operator must already be on the stack.
474	// The char c is the name of the operator, for use in error messages.
475	bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s,
476	bool nongreedy) {
477	if (stacktop_ == NULL \|\| IsMarker(stacktop_->op())) {
478	status_->set_code(kRegexpRepeatArgument);
479	status_->set_error_arg(s);
480	return false;
481	}
482	Regexp::ParseFlags fl = flags_;
483	if (nongreedy)
484	fl = fl ^ NonGreedy;
485
486	// Squash , ++ and ??. Regexp::Star() et al. handle this too, but
487	// they're mostly for use during simplification, not during parsing.
488	if (op == stacktop_->op() && fl == stacktop_->parse_flags())
489	return true;
490
491	// Squash +, ?, +, +?, ?* and ?+. They all squash to , so because
492	// op is a repeat, we just have to check that stacktop_->op() is too,
493	// then adjust stacktop_.
494	if ((stacktop_->op() == kRegexpStar \|\|
495	stacktop_->op() == kRegexpPlus \|\|
496	stacktop_->op() == kRegexpQuest) &&
497	fl == stacktop_->parse_flags()) {
498	stacktop_->op_ = kRegexpStar;
499	return true;
500	}
501
502	Regexp* re = new Regexp (op, fl);
503	re->AllocSub(`1`);
504	re->down_ = stacktop_->down_;
505	re->sub()[`0`] = FinishRegexp(stacktop_);
506	re->simple_ = re->ComputeSimple();
507	stacktop_ = re;
508	return true;
509	}
510
511	// RepetitionWalker reports whether the repetition regexp is valid.
512	// Valid means that the combination of the top-level repetition
513	// and any inner repetitions does not exceed n copies of the
514	// innermost thing.
515	// This rewalks the regexp tree and is called for every repetition,
516	// so we have to worry about inducing quadratic behavior in the parser.
517	// We avoid this by only using RepetitionWalker when min or max >= 2.
518	// In that case the depth of any >= 2 nesting can only get to 9 without
519	// triggering a parse error, so each subtree can only be rewalked 9 times.
520	class RepetitionWalker : public Regexp::Walker<int> {
521	public:
522	RepetitionWalker() {}
523	virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
524	virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
525	int* child_args, int nchild_args);
526	virtual int ShortVisit(Regexp* re, int parent_arg);
527
528	private:
529	RepetitionWalker(const RepetitionWalker&) = delete;
530	RepetitionWalker& operator=(const RepetitionWalker&) = delete;
531	};
532
533	int RepetitionWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
534	int arg = parent_arg;
535	if (re->op() == kRegexpRepeat) {
536	int m = re->max();
537	if (m < `0`) {
538	m = re->min();
539	}
540	if (m > `0`) {
541	arg /= m;
542	}
543	}
544	return arg;
545	}
546
547	int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
548	int* child_args, int nchild_args) {
549	int arg = pre_arg;
550	for (int i = `0`; i < nchild_args; i++) {
551	if (child_args[i] < arg) {
552	arg = child_args[i];
553	}
554	}
555	return arg;
556	}
557
558	int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) {
559	// This should never be called, since we use Walk and not
560	// WalkExponential.
561	LOG(DFATAL) << "RepetitionWalker::ShortVisit called";
562	return `0`;
563	}
564
565	// Pushes a repetition regexp onto the stack.
566	// A valid argument for the operator must already be on the stack.
567	bool Regexp::ParseState::PushRepetition(int min, int max,
568	const StringPiece& s,
569	bool nongreedy) {
570	if ((max != -`1` && max < min) \|\| min > kMaxRepeat \|\| max > kMaxRepeat) {
571	status_->set_code(kRegexpRepeatSize);
572	status_->set_error_arg(s);
573	return false;
574	}
575	if (stacktop_ == NULL \|\| IsMarker(stacktop_->op())) {
576	status_->set_code(kRegexpRepeatArgument);
577	status_->set_error_arg(s);
578	return false;
579	}
580	Regexp::ParseFlags fl = flags_;
581	if (nongreedy)
582	fl = fl ^ NonGreedy;
583	Regexp* re = new Regexp (kRegexpRepeat, fl);
584	re->min_ = min;
585	re->max_ = max;
586	re->AllocSub(`1`);
587	re->down_ = stacktop_->down_;
588	re->sub()[`0`] = FinishRegexp(stacktop_);
589	re->simple_ = re->ComputeSimple();
590	stacktop_ = re;
591	if (min >= `2` \|\| max >= `2`) {
592	RepetitionWalker w;
593	if (w.Walk(stacktop_, kMaxRepeat) == `0`) {
594	status_->set_code(kRegexpRepeatSize);
595	status_->set_error_arg(s);
596	return false;
597	}
598	}
599	return true;
600	}
601
602	// Checks whether a particular regexp op is a marker.
603	bool Regexp::ParseState::IsMarker(RegexpOp op) {
604	return op >= kLeftParen;
605	}
606
607	// Processes a left parenthesis in the input.
608	// Pushes a marker onto the stack.
609	bool Regexp::ParseState::DoLeftParen(const StringPiece& name) {
610	Regexp* re = new Regexp (kLeftParen, flags_);
611	re->cap_ = ++ncap_;
612	if (name.data() != NULL)
613	re->name_ = new std::string (name);
614	return PushRegexp(re);
615	}
616
617	// Pushes a non-capturing marker onto the stack.
618	bool Regexp::ParseState::DoLeftParenNoCapture() {
619	Regexp* re = new Regexp (kLeftParen, flags_);
620	re->cap_ = -`1`;
621	return PushRegexp(re);
622	}
623
624	// Processes a vertical bar in the input.
625	bool Regexp::ParseState::DoVerticalBar() {
626	MaybeConcatString(-`1`, NoParseFlags);
627	DoConcatenation();
628
629	// Below the vertical bar is a list to alternate.
630	// Above the vertical bar is a list to concatenate.
631	// We just did the concatenation, so either swap
632	// the result below the vertical bar or push a new
633	// vertical bar on the stack.
634	Regexp* r1;
635	Regexp* r2;
636	if ((r1 = stacktop_) != NULL &&
637	(r2 = r1->down_) != NULL &&
638	r2->op() == kVerticalBar) {
639	Regexp* r3;
640	if ((r3 = r2->down_) != NULL &&
641	(r1->op() == kRegexpAnyChar \|\| r3->op() == kRegexpAnyChar)) {
642	// AnyChar is above or below the vertical bar. Let it subsume
643	// the other when the other is Literal, CharClass or AnyChar.
644	if (r3->op() == kRegexpAnyChar &&
645	(r1->op() == kRegexpLiteral \|\|
646	r1->op() == kRegexpCharClass \|\|
647	r1->op() == kRegexpAnyChar)) {
648	// Discard r1.
649	stacktop_ = r2;
650	r1->Decref();
651	return true;
652	}
653	if (r1->op() == kRegexpAnyChar &&
654	(r3->op() == kRegexpLiteral \|\|
655	r3->op() == kRegexpCharClass \|\|
656	r3->op() == kRegexpAnyChar)) {
657	// Rearrange the stack and discard r3.
658	r1->down_ = r3->down_;
659	r2->down_ = r1;
660	stacktop_ = r2;
661	r3->Decref();
662	return true;
663	}
664	}
665	// Swap r1 below vertical bar (r2).
666	r1->down_ = r2->down_;
667	r2->down_ = r1;
668	stacktop_ = r2;
669	return true;
670	}
671	return PushSimpleOp(kVerticalBar);
672	}
673
674	// Processes a right parenthesis in the input.
675	bool Regexp::ParseState::DoRightParen() {
676	// Finish the current concatenation and alternation.
677	DoAlternation();
678
679	// The stack should be: LeftParen regexp
680	// Remove the LeftParen, leaving the regexp,
681	// parenthesized.
682	Regexp* r1;
683	Regexp* r2;
684	if ((r1 = stacktop_) == NULL \|\|
685	(r2 = r1->down_) == NULL \|\|
686	r2->op() != kLeftParen) {
687	status_->set_code(kRegexpMissingParen);
688	status_->set_error_arg(whole_regexp_);
689	return false;
690	}
691
692	// Pop off r1, r2. Will Decref or reuse below.
693	stacktop_ = r2->down_;
694
695	// Restore flags from when paren opened.
696	Regexp* re = r2;
697	flags_ = re->parse_flags();
698
699	// Rewrite LeftParen as capture if needed.
700	if (re->cap_ > `0`) {
701	re->op_ = kRegexpCapture;
702	// re->cap_ is already set
703	re->AllocSub(`1`);
704	re->sub()[`0`] = FinishRegexp(r1);
705	re->simple_ = re->ComputeSimple();
706	} else {
707	re->Decref();
708	re = r1;
709	}
710	return PushRegexp(re);
711	}
712
713	// Processes the end of input, returning the final regexp.
714	Regexp* Regexp::ParseState::DoFinish() {
715	DoAlternation();
716	Regexp* re = stacktop_;
717	if (re != NULL && re->down_ != NULL) {
718	status_->set_code(kRegexpMissingParen);
719	status_->set_error_arg(whole_regexp_);
720	return NULL;
721	}
722	stacktop_ = NULL;
723	return FinishRegexp(re);
724	}
725
726	// Returns the leading regexp that re starts with.
727	// The returned Regexp points into a piece of re,*
728	// so it must not be used after the caller calls re->Decref().
729	Regexp* Regexp::LeadingRegexp(Regexp* re) {
730	if (re->op() == kRegexpEmptyMatch)
731	return NULL;
732	if (re->op() == kRegexpConcat && re->nsub() >= `2`) {
733	Regexp** sub = re->sub();
734	if (sub[`0`]->op() == kRegexpEmptyMatch)
735	return NULL;
736	return sub[`0`];
737	}
738	return re;
739	}
740
741	// Removes LeadingRegexp(re) from re and returns what's left.
742	// Consumes the reference to re and may edit it in place.
743	// If caller wants to hold on to LeadingRegexp(re),
744	// must have already Incref'ed it.
745	Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) {
746	if (re->op() == kRegexpEmptyMatch)
747	return re;
748	if (re->op() == kRegexpConcat && re->nsub() >= `2`) {
749	Regexp** sub = re->sub();
750	if (sub[`0`]->op() == kRegexpEmptyMatch)
751	return re;
752	sub[`0`]->Decref();
753	sub[`0`] = NULL;
754	if (re->nsub() == `2`) {
755	// Collapse concatenation to single regexp.
756	Regexp* nre = sub[`1`];
757	sub[`1`] = NULL;
758	re->Decref();
759	return nre;
760	}
761	// 3 or more -> 2 or more.
762	re->nsub_--;
763	memmove(sub, sub + `1`, re->nsub_ * sizeof sub[`0`]);
764	return re;
765	}
766	Regexp::ParseFlags pf = re->parse_flags();
767	re->Decref();
768	return new Regexp (kRegexpEmptyMatch, pf);
769	}
770
771	// Returns the leading string that re starts with.
772	// The returned Rune points into a piece of re,*
773	// so it must not be used after the caller calls re->Decref().
774	Rune* Regexp::LeadingString(Regexp* re, int *nrune,
775	Regexp::ParseFlags *flags) {
776	while (re->op() == kRegexpConcat && re->nsub() > `0`)
777	re = re->sub()[`0`];
778
779	flags = static_cast*<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase);
780
781	if (re->op() == kRegexpLiteral) {
782	*nrune = `1`;
783	return &re->rune_;
784	}
785
786	if (re->op() == kRegexpLiteralString) {
787	*nrune = re->nrunes_;
788	return re->runes_;
789	}
790
791	*nrune = `0`;
792	return NULL;
793	}
794
795	// Removes the first n leading runes from the beginning of re.
796	// Edits re in place.
797	void Regexp::RemoveLeadingString(Regexp* re, int n) {
798	// Chase down concats to find first string.
799	// For regexps generated by parser, nested concats are
800	// flattened except when doing so would overflow the 16-bit
801	// limit on the size of a concatenation, so we should never
802	// see more than two here.
803	Regexp* stk[`4`];
804	size_t d = `0`;
805	while (re->op() == kRegexpConcat) {
806	if (d < arraysize(stk))
807	stk[d++] = re;
808	re = re->sub()[`0`];
809	}
810
811	// Remove leading string from re.
812	if (re->op() == kRegexpLiteral) {
813	re->rune_ = `0`;
814	re->op_ = kRegexpEmptyMatch;
815	} else if (re->op() == kRegexpLiteralString) {
816	if (n >= re->nrunes_) {
817	delete[] re->runes_;
818	re->runes_ = NULL;
819	re->nrunes_ = `0`;
820	re->op_ = kRegexpEmptyMatch;
821	} else if (n == re->nrunes_ - `1`) {
822	Rune rune = re->runes_[re->nrunes_ - `1`];
823	delete[] re->runes_;
824	re->runes_ = NULL;
825	re->nrunes_ = `0`;
826	re->rune_ = rune;
827	re->op_ = kRegexpLiteral;
828	} else {
829	re->nrunes_ -= n;
830	memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[`0`]);
831	}
832	}
833
834	// If re is now empty, concatenations might simplify too.
835	while (d > `0`) {
836	re = stk[--d];
837	Regexp** sub = re->sub();
838	if (sub[`0`]->op() == kRegexpEmptyMatch) {
839	sub[`0`]->Decref();
840	sub[`0`] = NULL;
841	// Delete first element of concat.
842	switch (re->nsub()) {
843	case `0`:
844	case `1`:
845	// Impossible.
846	LOG(DFATAL) << "Concat of " << re->nsub();
847	re->submany_ = NULL;
848	re->op_ = kRegexpEmptyMatch;
849	break;
850
851	case `2`: {
852	// Replace re with sub[1].
853	Regexp* old = sub[`1`];
854	sub[`1`] = NULL;
855	re->Swap(old);
856	old->Decref();
857	break;
858	}
859
860	default:
861	// Slide down.
862	re->nsub_--;
863	memmove(sub, sub + `1`, re->nsub_ * sizeof sub[`0`]);
864	break;
865	}
866	}
867	}
868	}
869
870	// In the context of factoring alternations, a Splice is: a factored prefix or
871	// merged character class computed by one iteration of one round of factoring;
872	// the span of subexpressions of the alternation to be "spliced" (i.e. removed
873	// and replaced); and, for a factored prefix, the number of suffixes after any
874	// factoring that might have subsequently been performed on them. For a merged
875	// character class, there are no suffixes, of course, so the field is ignored.
876	struct Splice {
877	Splice(Regexp* prefix, Regexp** sub, int nsub)
878	: prefix(prefix),
879	sub(sub),
880	nsub(nsub),
881	nsuffix(-`1`) {}
882
883	Regexp* prefix;
884	Regexp** sub;
885	int nsub;
886	int nsuffix;
887	};
888
889	// Named so because it is used to implement an explicit stack, a Frame is: the
890	// span of subexpressions of the alternation to be factored; the current round
891	// of factoring; any Splices computed; and, for a factored prefix, an iterator
892	// to the next Splice to be factored (i.e. in another Frame) because suffixes.
893	struct Frame {
894	Frame(Regexp** sub, int nsub)
895	: sub(sub),
896	nsub(nsub),
897	round(`0`) {}
898
899	Regexp** sub;
900	int nsub;
901	int round;
902	std::vector<Splice> splices;
903	int spliceidx;
904	};
905
906	// Bundled into a class for friend access to Regexp without needing to declare
907	// (or define) Splice in regexp.h.
908	class FactorAlternationImpl {
909	public:
910	static void Round1(Regexp** sub, int nsub,
911	Regexp::ParseFlags flags,
912	std::vector<Splice>* splices);
913	static void Round2(Regexp** sub, int nsub,
914	Regexp::ParseFlags flags,
915	std::vector<Splice>* splices);
916	static void Round3(Regexp** sub, int nsub,
917	Regexp::ParseFlags flags,
918	std::vector<Splice>* splices);
919	};
920
921	// Factors common prefixes from alternation.
922	// For example,
923	// ABC\|ABD\|AEF\|BCX\|BCY
924	// simplifies to
925	// A(B(C\|D)\|EF)\|BC(X\|Y)
926	// and thence to
927	// A(B[CD]\|EF)\|BC[XY]
928	//
929	// Rewrites sub to contain simplified list to alternate and returns
930	// the new length of sub. Adjusts reference counts accordingly
931	// (incoming sub[i] decremented, outgoing sub[i] incremented).
932	int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) {
933	std::vector<Frame> stk;
934	stk.emplace_back(sub, nsub);
935
936	for (;;) {
937	auto& sub = stk.back().sub;
938	auto& nsub = stk.back().nsub;
939	auto& round = stk.back().round;
940	auto& splices = stk.back().splices;
941	auto& spliceidx = stk.back().spliceidx;
942
943	if (splices.empty()) {
944	// Advance to the next round of factoring. Note that this covers
945	// the initialised state: when splices is empty and round is 0.
946	round++;
947	} else if (spliceidx < static_cast<int>(splices.size())) {
948	// We have at least one more Splice to factor. Recurse logically.
949	stk.emplace_back(splices [spliceidx].sub, splices [spliceidx].nsub);
950	continue;
951	} else {
952	// We have no more Splices to factor. Apply them.
953	auto iter = splices.begin();
954	int out = `0`;
955	for (int i = `0`; i < nsub; ) {
956	// Copy until we reach where the next Splice begins.
957	while (sub + i < iter ->sub)
958	sub[out++] = sub[i++];
959	switch (round) {
960	case `1`:
961	case `2`: {
962	// Assemble the Splice prefix and the suffixes.
963	Regexp* re[`2`];
964	re[`0`] = iter ->prefix;
965	re[`1`] = Regexp::AlternateNoFactor(iter ->sub, iter ->nsuffix, flags);
966	sub[out++] = Regexp::Concat(re, `2`, flags);
967	i += iter ->nsub;
968	break;
969	}
970	case `3`:
971	// Just use the Splice prefix.
972	sub[out++] = iter ->prefix;
973	i += iter ->nsub;
974	break;
975	default:
976	LOG(DFATAL) << "unknown round: " << round;
977	break;
978	}
979	// If we are done, copy until the end of sub.
980	if (++iter == splices.end()) {
981	while (i < nsub)
982	sub[out++] = sub[i++];
983	}
984	}
985	splices.clear();
986	nsub = out;
987	// Advance to the next round of factoring.
988	round++;
989	}
990
991	switch (round) {
992	case `1`:
993	FactorAlternationImpl::Round1(sub, nsub, flags, &splices);
994	break;
995	case `2`:
996	FactorAlternationImpl::Round2(sub, nsub, flags, &splices);
997	break;
998	case `3`:
999	FactorAlternationImpl::Round3(sub, nsub, flags, &splices);
1000	break;
1001	case `4`:
1002	if (stk.size() == `1`) {
1003	// We are at the top of the stack. Just return.
1004	return nsub;
1005	} else {
1006	// Pop the stack and set the number of suffixes.
1007	// (Note that references will be invalidated!)
1008	int nsuffix = nsub;
1009	stk.pop_back();
1010	stk.back().splices [stk.back().spliceidx].nsuffix = nsuffix;
1011	++stk.back().spliceidx;
1012	continue;
1013	}
1014	default:
1015	LOG(DFATAL) << "unknown round: " << round;
1016	break;
1017	}
1018
1019	// Set spliceidx depending on whether we have Splices to factor.
1020	if (splices.empty() \|\| round == `3`) {
1021	spliceidx = static_cast<int>(splices.size());
1022	} else {
1023	spliceidx = `0`;
1024	}
1025	}
1026	}
1027
1028	void FactorAlternationImpl::Round1(Regexp** sub, int nsub,
1029	Regexp::ParseFlags flags,
1030	std::vector<Splice>* splices) {
1031	// Round 1: Factor out common literal prefixes.
1032	int start = `0`;
1033	Rune* rune = NULL;
1034	int nrune = `0`;
1035	Regexp::ParseFlags runeflags = Regexp::NoParseFlags;
1036	for (int i = `0`; i <= nsub; i++) {
1037	// Invariant: sub[start:i] consists of regexps that all
1038	// begin with rune[0:nrune].
1039	Rune* rune_i = NULL;
1040	int nrune_i = `0`;
1041	Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags;
1042	if (i < nsub) {
1043	rune_i = Regexp::LeadingString(sub[i], &nrune_i, &runeflags_i);
1044	if (runeflags_i == runeflags) {
1045	int same = `0`;
1046	while (same < nrune && same < nrune_i && rune[same] == rune_i[same])
1047	same++;
1048	if (same > `0`) {
1049	// Matches at least one rune in current range. Keep going around.
1050	nrune = same;
1051	continue;
1052	}
1053	}
1054	}
1055
1056	// Found end of a run with common leading literal string:
1057	// sub[start:i] all begin with rune[0:nrune],
1058	// but sub[i] does not even begin with rune[0].
1059	if (i == start) {
1060	// Nothing to do - first iteration.
1061	} else if (i == start+`1`) {
1062	// Just one: don't bother factoring.
1063	} else {
1064	Regexp* prefix = Regexp::LiteralString(rune, nrune, runeflags);
1065	for (int j = start; j < i; j++)
1066	Regexp::RemoveLeadingString(sub[j], nrune);
1067	splices->emplace_back(prefix, sub + start, i - start);
1068	}
1069
1070	// Prepare for next iteration (if there is one).
1071	if (i < nsub) {
1072	start = i;
1073	rune = rune_i;
1074	nrune = nrune_i;
1075	runeflags = runeflags_i;
1076	}
1077	}
1078	}
1079
1080	void FactorAlternationImpl::Round2(Regexp** sub, int nsub,
1081	Regexp::ParseFlags flags,
1082	std::vector<Splice>* splices) {
1083	// Round 2: Factor out common simple prefixes,
1084	// just the first piece of each concatenation.
1085	// This will be good enough a lot of the time.
1086	//
1087	// Complex subexpressions (e.g. involving quantifiers)
1088	// are not safe to factor because that collapses their
1089	// distinct paths through the automaton, which affects
1090	// correctness in some cases.
1091	int start = `0`;
1092	Regexp* first = NULL;
1093	for (int i = `0`; i <= nsub; i++) {
1094	// Invariant: sub[start:i] consists of regexps that all
1095	// begin with first.
1096	Regexp* first_i = NULL;
1097	if (i < nsub) {
1098	first_i = Regexp::LeadingRegexp(sub[i]);
1099	if (first != NULL &&
1100	// first must be an empty-width op
1101	// OR a char class, any char or any byte
1102	// OR a fixed repeat of a literal, char class, any char or any byte.
1103	(first->op() == kRegexpBeginLine \|\|
1104	first->op() == kRegexpEndLine \|\|
1105	first->op() == kRegexpWordBoundary \|\|
1106	first->op() == kRegexpNoWordBoundary \|\|
1107	first->op() == kRegexpBeginText \|\|
1108	first->op() == kRegexpEndText \|\|
1109	first->op() == kRegexpCharClass \|\|
1110	first->op() == kRegexpAnyChar \|\|
1111	first->op() == kRegexpAnyByte \|\|
1112	(first->op() == kRegexpRepeat &&
1113	first->min() == first->max() &&
1114	(first->sub()[`0`]->op() == kRegexpLiteral \|\|
1115	first->sub()[`0`]->op() == kRegexpCharClass \|\|
1116	first->sub()[`0`]->op() == kRegexpAnyChar \|\|
1117	first->sub()[`0`]->op() == kRegexpAnyByte))) &&
1118	Regexp::Equal(first, first_i))
1119	continue;
1120	}
1121
1122	// Found end of a run with common leading regexp:
1123	// sub[start:i] all begin with first,
1124	// but sub[i] does not.
1125	if (i == start) {
1126	// Nothing to do - first iteration.
1127	} else if (i == start+`1`) {
1128	// Just one: don't bother factoring.
1129	} else {
1130	Regexp* prefix = first->Incref();
1131	for (int j = start; j < i; j++)
1132	sub[j] = Regexp::RemoveLeadingRegexp(sub[j]);
1133	splices->emplace_back(prefix, sub + start, i - start);
1134	}
1135
1136	// Prepare for next iteration (if there is one).
1137	if (i < nsub) {
1138	start = i;
1139	first = first_i;
1140	}
1141	}
1142	}
1143
1144	void FactorAlternationImpl::Round3(Regexp** sub, int nsub,
1145	Regexp::ParseFlags flags,
1146	std::vector<Splice>* splices) {
1147	// Round 3: Merge runs of literals and/or character classes.
1148	int start = `0`;
1149	Regexp* first = NULL;
1150	for (int i = `0`; i <= nsub; i++) {
1151	// Invariant: sub[start:i] consists of regexps that all
1152	// are either literals (i.e. runes) or character classes.
1153	Regexp* first_i = NULL;
1154	if (i < nsub) {
1155	first_i = sub[i];
1156	if (first != NULL &&
1157	(first->op() == kRegexpLiteral \|\|
1158	first->op() == kRegexpCharClass) &&
1159	(first_i->op() == kRegexpLiteral \|\|
1160	first_i->op() == kRegexpCharClass))
1161	continue;
1162	}
1163
1164	// Found end of a run of Literal/CharClass:
1165	// sub[start:i] all are either one or the other,
1166	// but sub[i] is not.
1167	if (i == start) {
1168	// Nothing to do - first iteration.
1169	} else if (i == start+`1`) {
1170	// Just one: don't bother factoring.
1171	} else {
1172	CharClassBuilder ccb;
1173	for (int j = start; j < i; j++) {
1174	Regexp* re = sub[j];
1175	if (re->op() == kRegexpCharClass) {
1176	CharClass* cc = re->cc();
1177	for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
1178	ccb.AddRange(it->lo, it->hi);
1179	} else if (re->op() == kRegexpLiteral) {
1180	ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags());
1181	} else {
1182	LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " "
1183	<< re->ToString();
1184	}
1185	re->Decref();
1186	}
1187	Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags);
1188	splices->emplace_back(re, sub + start, i - start);
1189	}
1190
1191	// Prepare for next iteration (if there is one).
1192	if (i < nsub) {
1193	start = i;
1194	first = first_i;
1195	}
1196	}
1197	}
1198
1199	// Collapse the regexps on top of the stack, down to the
1200	// first marker, into a new op node (op == kRegexpAlternate
1201	// or op == kRegexpConcat).
1202	void Regexp::ParseState::DoCollapse(RegexpOp op) {
1203	// Scan backward to marker, counting children of composite.
1204	int n = `0`;
1205	Regexp* next = NULL;
1206	Regexp* sub;
1207	for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
1208	next = sub->down_;
1209	if (sub->op_ == op)
1210	n += sub->nsub_;
1211	else
1212	n++;
1213	}
1214
1215	// If there's just one child, leave it alone.
1216	// (Concat of one thing is that one thing; alternate of one thing is same.)
1217	if (stacktop_ != NULL && stacktop_->down_ == next)
1218	return;
1219
1220	// Construct op (alternation or concatenation), flattening op of op.
1221	PODArray<Regexp*> subs(n);
1222	next = NULL;
1223	int i = n;
1224	for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
1225	next = sub->down_;
1226	if (sub->op_ == op) {
1227	Regexp** sub_subs = sub->sub();
1228	for (int k = sub->nsub_ - `1`; k >= `0`; k--)
1229	subs [--i] = sub_subs[k]->Incref();
1230	sub->Decref();
1231	} else {
1232	subs [--i] = FinishRegexp(sub);
1233	}
1234	}
1235
1236	Regexp* re = ConcatOrAlternate(op, subs.data(), n, flags_, true);
1237	re->simple_ = re->ComputeSimple();
1238	re->down_ = next;
1239	stacktop_ = re;
1240	}
1241
1242	// Finishes the current concatenation,
1243	// collapsing it into a single regexp on the stack.
1244	void Regexp::ParseState::DoConcatenation() {
1245	Regexp* r1 = stacktop_;
1246	if (r1 == NULL \|\| IsMarker(r1->op())) {
1247	// empty concatenation is special case
1248	Regexp* re = new Regexp (kRegexpEmptyMatch, flags_);
1249	PushRegexp(re);
1250	}
1251	DoCollapse(kRegexpConcat);
1252	}
1253
1254	// Finishes the current alternation,
1255	// collapsing it to a single regexp on the stack.
1256	void Regexp::ParseState::DoAlternation() {
1257	DoVerticalBar();
1258	// Now stack top is kVerticalBar.
1259	Regexp* r1 = stacktop_;
1260	stacktop_ = r1->down_;
1261	r1->Decref();
1262	DoCollapse(kRegexpAlternate);
1263	}
1264
1265	// Incremental conversion of concatenated literals into strings.
1266	// If top two elements on stack are both literal or string,
1267	// collapse into single string.
1268	// Don't walk down the stack -- the parser calls this frequently
1269	// enough that below the bottom two is known to be collapsed.
1270	// Only called when another regexp is about to be pushed
1271	// on the stack, so that the topmost literal is not being considered.
1272	// (Otherwise ab would turn into (ab).)
1273	// If r >= 0, consider pushing a literal r on the stack.
1274	// Return whether that happened.
1275	bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) {
1276	Regexp* re1;
1277	Regexp* re2;
1278	if ((re1 = stacktop_) == NULL \|\| (re2 = re1->down_) == NULL)
1279	return false;
1280
1281	if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString)
1282	return false;
1283	if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString)
1284	return false;
1285	if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase))
1286	return false;
1287
1288	if (re2->op_ == kRegexpLiteral) {
1289	// convert into string
1290	Rune rune = re2->rune_;
1291	re2->op_ = kRegexpLiteralString;
1292	re2->nrunes_ = `0`;
1293	re2->runes_ = NULL;
1294	re2->AddRuneToString(rune);
1295	}
1296
1297	// push re1 into re2.
1298	if (re1->op_ == kRegexpLiteral) {
1299	re2->AddRuneToString(re1->rune_);
1300	} else {
1301	for (int i = `0`; i < re1->nrunes_; i++)
1302	re2->AddRuneToString(re1->runes_[i]);
1303	re1->nrunes_ = `0`;
1304	delete[] re1->runes_;
1305	re1->runes_ = NULL;
1306	}
1307
1308	// reuse re1 if possible
1309	if (r >= `0`) {
1310	re1->op_ = kRegexpLiteral;
1311	re1->rune_ = r;
1312	re1->parse_flags_ = static_cast<uint16_t>(flags);
1313	return true;
1314	}
1315
1316	stacktop_ = re2;
1317	re1->Decref();
1318	return false;
1319	}
1320
1321	// Lexing routines.
1322
1323	// Parses a decimal integer, storing it in np.*
1324	// Sets s to span the remainder of the string.*
1325	static bool ParseInteger(StringPiece* s, int* np) {
1326	if (s->empty() \|\| !isdigit((*s)[`0`] & `0xFF`))
1327	return false;
1328	// Disallow leading zeros.
1329	if (s->size() >= `2` && (s)[`0`] == `'0'` && isdigit((s)[`1`] & `0xFF`))
1330	return false;
1331	int n = `0`;
1332	int c;
1333	while (!s->empty() && isdigit(c = (*s)[`0`] & `0xFF`)) {
1334	// Avoid overflow.
1335	if (n >= `100000000`)
1336	return false;
1337	n = n*`10` + c - `'0'`;
1338	s->remove_prefix(`1`); // digit
1339	}
1340	*np = n;
1341	return true;
1342	}
1343
1344	// Parses a repetition suffix like {1,2} or {2} or {2,}.
1345	// Sets s to span the remainder of the string on success.*
1346	// Sets lo and hi to the given range.
1347	// In the case of {2,}, the high number is unbounded;
1348	// sets hi to -1 to signify this.*
1349	// {,2} is NOT a valid suffix.
1350	// The Maybe in the name signifies that the regexp parse
1351	// doesn't fail even if ParseRepetition does, so the StringPiece
1352	// s must NOT be edited unless MaybeParseRepetition returns true.
1353	static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) {
1354	StringPiece s = *sp;
1355	if (s.empty() \|\| s [`0`] != `'{'`)
1356	return false;
1357	s.remove_prefix(`1`); // '{'
1358	if (!ParseInteger(&s, lo))
1359	return false;
1360	if (s.empty())
1361	return false;
1362	if (s [`0`] == `','`) {
1363	s.remove_prefix(`1`); // ','
1364	if (s.empty())
1365	return false;
1366	if (s [`0`] == `'}'`) {
1367	// {2,} means at least 2
1368	*hi = -`1`;
1369	} else {
1370	// {2,4} means 2, 3, or 4.
1371	if (!ParseInteger(&s, hi))
1372	return false;
1373	}
1374	} else {
1375	// {2} means exactly two
1376	hi = lo;
1377	}
1378	if (s.empty() \|\| s [`0`] != `'}'`)
1379	return false;
1380	s.remove_prefix(`1`); // '}'
1381	*sp = s;
1382	return true;
1383	}
1384
1385	// Removes the next Rune from the StringPiece and stores it in r.*
1386	// Returns number of bytes removed from sp.
1387	// Behaves as though there is a terminating NUL at the end of sp.
1388	// Argument order is backwards from usual Google style
1389	// but consistent with chartorune.
1390	static int StringPieceToRune(Rune r, StringPiece sp, RegexpStatus* status) {
1391	// fullrune() takes int, not size_t. However, it just looks
1392	// at the leading byte and treats any length >= 4 the same.
1393	if (fullrune(sp->data(), static_cast<int>(std::min(size_t{`4`}, sp->size())))) {
1394	int n = chartorune(r, sp->data());
1395	// Some copies of chartorune have a bug that accepts
1396	// encodings of values in (10FFFF, 1FFFFF] as valid.
1397	// Those values break the character class algorithm,
1398	// which assumes Runemax is the largest rune.
1399	if (*r > Runemax) {
1400	n = `1`;
1401	*r = Runeerror;
1402	}
1403	if (!(n == `1` && r == Runeerror)) { // no decoding error*
1404	sp->remove_prefix(n);
1405	return n;
1406	}
1407	}
1408
1409	status->set_code(kRegexpBadUTF8);
1410	status->set_error_arg(StringPiece ());
1411	return -`1`;
1412	}
1413
1414	// Return whether name is valid UTF-8.
1415	// If not, set status to kRegexpBadUTF8.
1416	static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) {
1417	StringPiece t = s;
1418	Rune r;
1419	while (!t.empty()) {
1420	if (StringPieceToRune(&r, &t, status) < `0`)
1421	return false;
1422	}
1423	return true;
1424	}
1425
1426	// Is c a hex digit?
1427	static int IsHex(int c) {
1428	return (`'0'` <= c && c <= `'9'`) \|\|
1429	(`'A'` <= c && c <= `'F'`) \|\|
1430	(`'a'` <= c && c <= `'f'`);
1431	}
1432
1433	// Convert hex digit to value.
1434	static int UnHex(int c) {
1435	if (`'0'` <= c && c <= `'9'`)
1436	return c - `'0'`;
1437	if (`'A'` <= c && c <= `'F'`)
1438	return c - `'A'` + `10`;
1439	if (`'a'` <= c && c <= `'f'`)
1440	return c - `'a'` + `10`;
1441	LOG(DFATAL) << "Bad hex digit " << c;
1442	return `0`;
1443	}
1444
1445	// Parse an escape sequence (e.g., \n, \{).
1446	// Sets s to span the remainder of the string.*
1447	// Sets rp to the named character.*
1448	static bool ParseEscape(StringPiece* s, Rune* rp,
1449	RegexpStatus* status, int rune_max) {
1450	const char* begin = s->data();
1451	if (s->empty() \|\| (*s)[`0`] != `'\\'`) {
1452	// Should not happen - caller always checks.
1453	status->set_code(kRegexpInternalError);
1454	status->set_error_arg(StringPiece ());
1455	return false;
1456	}
1457	if (s->size() == `1`) {
1458	status->set_code(kRegexpTrailingBackslash);
1459	status->set_error_arg(StringPiece ());
1460	return false;
1461	}
1462	Rune c, c1;
1463	s->remove_prefix(`1`); // backslash
1464	if (StringPieceToRune(&c, s, status) < `0`)
1465	return false;
1466	int code;
1467	switch (c) {
1468	default:
1469	if (c < Runeself && !isalpha(c) && !isdigit(c)) {
1470	// Escaped non-word characters are always themselves.
1471	// PCRE is not quite so rigorous: it accepts things like
1472	// \q, but we don't. We once rejected \_, but too many
1473	// programs and people insist on using it, so allow \_.
1474	*rp = c;
1475	return true;
1476	}
1477	goto BadEscape;
1478
1479	// Octal escapes.
1480	case `'1'`:
1481	case `'2'`:
1482	case `'3'`:
1483	case `'4'`:
1484	case `'5'`:
1485	case `'6'`:
1486	case `'7'`:
1487	// Single non-zero octal digit is a backreference; not supported.
1488	if (s->empty() \|\| (s)[`0`] < `'0'` \|\| (s)[`0`] > `'7'`)
1489	goto BadEscape;
1490	FALLTHROUGH_INTENDED;
1491	case `'0'`:
1492	// consume up to three octal digits; already have one.
1493	code = c - `'0'`;
1494	if (!s->empty() && `'0'` <= (c = (*s)[`0`]) && c <= `'7'`) {
1495	code = code * `8` + c - `'0'`;
1496	s->remove_prefix(`1`); // digit
1497	if (!s->empty()) {
1498	c = (*s)[`0`];
1499	if (`'0'` <= c && c <= `'7'`) {
1500	code = code * `8` + c - `'0'`;
1501	s->remove_prefix(`1`); // digit
1502	}
1503	}
1504	}
1505	if (code > rune_max)
1506	goto BadEscape;
1507	*rp = code;
1508	return true;
1509
1510	// Hexadecimal escapes
1511	case `'x'`:
1512	if (s->empty())
1513	goto BadEscape;
1514	if (StringPieceToRune(&c, s, status) < `0`)
1515	return false;
1516	if (c == `'{'`) {
1517	// Any number of digits in braces.
1518	// Update n as we consume the string, so that
1519	// the whole thing gets shown in the error message.
1520	// Perl accepts any text at all; it ignores all text
1521	// after the first non-hex digit. We require only hex digits,
1522	// and at least one.
1523	if (StringPieceToRune(&c, s, status) < `0`)
1524	return false;
1525	int nhex = `0`;
1526	code = `0`;
1527	while (IsHex(c)) {
1528	nhex++;
1529	code = code * `16` + UnHex(c);
1530	if (code > rune_max)
1531	goto BadEscape;
1532	if (s->empty())
1533	goto BadEscape;
1534	if (StringPieceToRune(&c, s, status) < `0`)
1535	return false;
1536	}
1537	if (c != `'}'` \|\| nhex == `0`)
1538	goto BadEscape;
1539	*rp = code;
1540	return true;
1541	}
1542	// Easy case: two hex digits.
1543	if (s->empty())
1544	goto BadEscape;
1545	if (StringPieceToRune(&c1, s, status) < `0`)
1546	return false;
1547	if (!IsHex(c) \|\| !IsHex(c1))
1548	goto BadEscape;
1549	rp = UnHex(c) `16` + UnHex(c1);
1550	return true;
1551
1552	// C escapes.
1553	case `'n'`:
1554	*rp = `'\n'`;
1555	return true;
1556	case `'r'`:
1557	*rp = `'\r'`;
1558	return true;
1559	case `'t'`:
1560	*rp = `'\t'`;
1561	return true;
1562
1563	// Less common C escapes.
1564	case `'a'`:
1565	*rp = `'\a'`;
1566	return true;
1567	case `'f'`:
1568	*rp = `'\f'`;
1569	return true;
1570	case `'v'`:
1571	*rp = `'\v'`;
1572	return true;
1573
1574	// This code is disabled to avoid misparsing
1575	// the Perl word-boundary \b as a backspace
1576	// when in POSIX regexp mode. Surprisingly,
1577	// in Perl, \b means word-boundary but [\b]
1578	// means backspace. We don't support that:
1579	// if you want a backspace embed a literal
1580	// backspace character or use \x08.
1581	//
1582	// case 'b':
1583	// rp = '\b';*
1584	// return true;
1585	}
1586
1587	LOG(DFATAL) << "Not reached in ParseEscape.";
1588
1589	BadEscape:
1590	// Unrecognized escape sequence.
1591	status->set_code(kRegexpBadEscape);
1592	status->set_error_arg(
1593	StringPiece (begin, static_cast<size_t>(s->data() - begin)));
1594	return false;
1595	}
1596
1597	// Add a range to the character class, but exclude newline if asked.
1598	// Also handle case folding.
1599	void CharClassBuilder::AddRangeFlags(
1600	Rune lo, Rune hi, Regexp::ParseFlags parse_flags) {
1601
1602	// Take out \n if the flags say so.
1603	bool cutnl = !(parse_flags & Regexp::ClassNL) \|\|
1604	(parse_flags & Regexp::NeverNL);
1605	if (cutnl && lo <= `'\n'` && `'\n'` <= hi) {
1606	if (lo < `'\n'`)
1607	AddRangeFlags(lo, `'\n'` - `1`, parse_flags);
1608	if (hi > `'\n'`)
1609	AddRangeFlags(`'\n'` + `1`, hi, parse_flags);
1610	return;
1611	}
1612
1613	// If folding case, add fold-equivalent characters too.
1614	if (parse_flags & Regexp::FoldCase)
1615	AddFoldedRange(this, lo, hi, `0`);
1616	else
1617	AddRange(lo, hi);
1618	}
1619
1620	// Look for a group with the given name.
1621	static const UGroup* LookupGroup(const StringPiece& name,
1622	const UGroup groups, int* ngroups) {
1623	// Simple name lookup.
1624	for (int i = `0`; i < ngroups; i++)
1625	if (StringPiece (groups[i].name) == name)
1626	return &groups[i];
1627	return NULL;
1628	}
1629
1630	// Look for a POSIX group with the given name (e.g., "[:^alpha:]")
1631	static const UGroup* LookupPosixGroup(const StringPiece& name) {
1632	return LookupGroup(name, posix_groups, num_posix_groups);
1633	}
1634
1635	static const UGroup* LookupPerlGroup(const StringPiece& name) {
1636	return LookupGroup(name, perl_groups, num_perl_groups);
1637	}
1638
1639	#if !defined(RE2_USE_ICU)
1640	// Fake UGroup containing all Runes
1641	static URange16 any16[] = { { `0`, `65535` } };
1642	static URange32 any32[] = { { `65536`, Runemax } };
1643	static UGroup anygroup = { "Any", +`1`, any16, `1`, any32, `1` };
1644
1645	// Look for a Unicode group with the given name (e.g., "Han")
1646	static const UGroup* LookupUnicodeGroup(const StringPiece& name) {
1647	// Special case: "Any" means any.
1648	if (name == StringPiece ("Any"))
1649	return &anygroup;
1650	return LookupGroup(name, unicode_groups, num_unicode_groups);
1651	}
1652	#endif
1653
1654	// Add a UGroup or its negation to the character class.
1655	static void AddUGroup(CharClassBuilder cc, const* UGroup g, int* sign,
1656	Regexp::ParseFlags parse_flags) {
1657	if (sign == +`1`) {
1658	for (int i = `0`; i < g->nr16; i++) {
1659	cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags);
1660	}
1661	for (int i = `0`; i < g->nr32; i++) {
1662	cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags);
1663	}
1664	} else {
1665	if (parse_flags & Regexp::FoldCase) {
1666	// Normally adding a case-folded group means
1667	// adding all the extra fold-equivalent runes too.
1668	// But if we're adding the negation of the group,
1669	// we have to exclude all the runes that are fold-equivalent
1670	// to what's already missing. Too hard, so do in two steps.
1671	CharClassBuilder ccb1;
1672	AddUGroup(&ccb1, g, +`1`, parse_flags);
1673	// If the flags say to take out \n, put it in, so that negating will take it out.
1674	// Normally AddRangeFlags does this, but we're bypassing AddRangeFlags.
1675	bool cutnl = !(parse_flags & Regexp::ClassNL) \|\|
1676	(parse_flags & Regexp::NeverNL);
1677	if (cutnl) {
1678	ccb1.AddRange(`'\n'`, `'\n'`);
1679	}
1680	ccb1.Negate();
1681	cc->AddCharClass(&ccb1);
1682	return;
1683	}
1684	int next = `0`;
1685	for (int i = `0`; i < g->nr16; i++) {
1686	if (next < g->r16[i].lo)
1687	cc->AddRangeFlags(next, g->r16[i].lo - `1`, parse_flags);
1688	next = g->r16[i].hi + `1`;
1689	}
1690	for (int i = `0`; i < g->nr32; i++) {
1691	if (next < g->r32[i].lo)
1692	cc->AddRangeFlags(next, g->r32[i].lo - `1`, parse_flags);
1693	next = g->r32[i].hi + `1`;
1694	}
1695	if (next <= Runemax)
1696	cc->AddRangeFlags(next, Runemax, parse_flags);
1697	}
1698	}
1699
1700	// Maybe parse a Perl character class escape sequence.
1701	// Only recognizes the Perl character classes (\d \s \w \D \S \W),
1702	// not the Perl empty-string classes (\b \B \A \Z \z).
1703	// On success, sets s to span the remainder of the string*
1704	// and returns the corresponding UGroup.
1705	// The StringPiece must NOT* be edited unless the call succeeds.*
1706	const UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) {
1707	if (!(parse_flags & Regexp::PerlClasses))
1708	return NULL;
1709	if (s->size() < `2` \|\| (*s)[`0`] != `'\\'`)
1710	return NULL;
1711	// Could use StringPieceToRune, but there aren't
1712	// any non-ASCII Perl group names.
1713	StringPiece name(s->data(), `2`);
1714	const UGroup *g = LookupPerlGroup(name);
1715	if (g == NULL)
1716	return NULL;
1717	s->remove_prefix(name.size());
1718	return g;
1719	}
1720
1721	enum ParseStatus {
1722	kParseOk, // Did some parsing.
1723	kParseError, // Found an error.
1724	kParseNothing, // Decided not to parse.
1725	};
1726
1727	// Maybe parses a Unicode character group like \p{Han} or \P{Han}
1728	// (the latter is a negated group).
1729	ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags,
1730	CharClassBuilder *cc,
1731	RegexpStatus* status) {
1732	// Decide whether to parse.
1733	if (!(parse_flags & Regexp::UnicodeGroups))
1734	return kParseNothing;
1735	if (s->size() < `2` \|\| (*s)[`0`] != `'\\'`)
1736	return kParseNothing;
1737	Rune c = (*s)[`1`];
1738	if (c != `'p'` && c != `'P'`)
1739	return kParseNothing;
1740
1741	// Committed to parse. Results:
1742	int sign = +`1`; // -1 = negated char class
1743	if (c == `'P'`)
1744	sign = -sign;
1745	StringPiece seq = s; // \p{Han} or \pL*
1746	StringPiece name; // Han or L
1747	s->remove_prefix(`2`); // '\\', 'p'
1748
1749	if (!StringPieceToRune(&c, s, status))
1750	return kParseError;
1751	if (c != `'{'`) {
1752	// Name is the bit of string we just skipped over for c.
1753	const char* p = seq.data() + `2`;
1754	name = StringPiece (p, static_cast<size_t>(s->data() - p));
1755	} else {
1756	// Name is in braces. Look for closing }
1757	size_t end = s->find(`'}'`, `0`);
1758	if (end == StringPiece::npos) {
1759	if (!IsValidUTF8(seq, status))
1760	return kParseError;
1761	status->set_code(kRegexpBadCharRange);
1762	status->set_error_arg(seq);
1763	return kParseError;
1764	}
1765	name = StringPiece (s->data(), end); // without '}'
1766	s->remove_prefix(end + `1`); // with '}'
1767	if (!IsValidUTF8(name, status))
1768	return kParseError;
1769	}
1770
1771	// Chop seq where s now begins.
1772	seq = StringPiece (seq.data(), static_cast<size_t>(s->data() - seq.data()));
1773
1774	if (!name.empty() && name [`0`] == `'^'`) {
1775	sign = -sign;
1776	name.remove_prefix(`1`); // '^'
1777	}
1778
1779	#if !defined(RE2_USE_ICU)
1780	// Look up the group in the RE2 Unicode data.
1781	const UGroup *g = LookupUnicodeGroup(name);
1782	if (g == NULL) {
1783	status->set_code(kRegexpBadCharRange);
1784	status->set_error_arg(seq);
1785	return kParseError;
1786	}
1787
1788	AddUGroup(cc, g, sign, parse_flags);
1789	#else
1790	// Look up the group in the ICU Unicode data. Because ICU provides full
1791	// Unicode properties support, this could be more than a lookup by name.
1792	::icu::UnicodeString ustr = ::icu::UnicodeString::fromUTF8(
1793	std::string("\\p{") + std::string(name) + std::string("}"));
1794	UErrorCode uerr = U_ZERO_ERROR;
1795	::icu::UnicodeSet uset(ustr, uerr);
1796	if (U_FAILURE(uerr)) {
1797	status->set_code(kRegexpBadCharRange);
1798	status->set_error_arg(seq);
1799	return kParseError;
1800	}
1801
1802	// Convert the UnicodeSet to a URange32 and UGroup that we can add.
1803	int nr = uset.getRangeCount();
1804	URange32* r = new URange32[nr];
1805	for (int i = `0`; i < nr; i++) {
1806	r[i].lo = uset.getRangeStart(i);
1807	r[i].hi = uset.getRangeEnd(i);
1808	}
1809	UGroup g = {"", +`1`, `0`, `0`, r, nr};
1810	AddUGroup(cc, &g, sign, parse_flags);
1811	delete[] r;
1812	#endif
1813
1814	return kParseOk;
1815	}
1816
1817	// Parses a character class name like [:alnum:].
1818	// Sets s to span the remainder of the string.*
1819	// Adds the ranges corresponding to the class to ranges.
1820	static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags,
1821	CharClassBuilder *cc,
1822	RegexpStatus* status) {
1823	// Check begins with [:
1824	const char* p = s->data();
1825	const char* ep = s->data() + s->size();
1826	if (ep - p < `2` \|\| p[`0`] != `'['` \|\| p[`1`] != `':'`)
1827	return kParseNothing;
1828
1829	// Look for closing :].
1830	const char* q;
1831	for (q = p+`2`; q <= ep-`2` && (q != `':'` \|\| (q+`1`) != `']'`); q++)
1832	;
1833
1834	// If no closing :], then ignore.
1835	if (q > ep-`2`)
1836	return kParseNothing;
1837
1838	// Got it. Check that it's valid.
1839	q += `2`;
1840	StringPiece name(p, static_cast<size_t>(q - p));
1841
1842	const UGroup *g = LookupPosixGroup(name);
1843	if (g == NULL) {
1844	status->set_code(kRegexpBadCharRange);
1845	status->set_error_arg(name);
1846	return kParseError;
1847	}
1848
1849	s->remove_prefix(name.size());
1850	AddUGroup(cc, g, g->sign, parse_flags);
1851	return kParseOk;
1852	}
1853
1854	// Parses a character inside a character class.
1855	// There are fewer special characters here than in the rest of the regexp.
1856	// Sets s to span the remainder of the string.*
1857	// Sets rp to the character.*
1858	bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp,
1859	const StringPiece& whole_class,
1860	RegexpStatus* status) {
1861	if (s->empty()) {
1862	status->set_code(kRegexpMissingBracket);
1863	status->set_error_arg(whole_class);
1864	return false;
1865	}
1866
1867	// Allow regular escape sequences even though
1868	// many need not be escaped in this context.
1869	if ((*s)[`0`] == `'\\'`)
1870	return ParseEscape(s, rp, status, rune_max_);
1871
1872	// Otherwise take the next rune.
1873	return StringPieceToRune(rp, s, status) >= `0`;
1874	}
1875
1876	// Parses a character class character, or, if the character
1877	// is followed by a hyphen, parses a character class range.
1878	// For single characters, rr->lo == rr->hi.
1879	// Sets s to span the remainder of the string.*
1880	// Sets rp to the character.*
1881	bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr,
1882	const StringPiece& whole_class,
1883	RegexpStatus* status) {
1884	StringPiece os = *s;
1885	if (!ParseCCCharacter(s, &rr->lo, whole_class, status))
1886	return false;
1887	// [a-] means (a\|-), so check for final ].
1888	if (s->size() >= `2` && (s)[`0`] == `'-'` && (s)[`1`] != `']'`) {
1889	s->remove_prefix(`1`); // '-'
1890	if (!ParseCCCharacter(s, &rr->hi, whole_class, status))
1891	return false;
1892	if (rr->hi < rr->lo) {
1893	status->set_code(kRegexpBadCharRange);
1894	status->set_error_arg(
1895	StringPiece (os.data(), static_cast<size_t>(s->data() - os.data())));
1896	return false;
1897	}
1898	} else {
1899	rr->hi = rr->lo;
1900	}
1901	return true;
1902	}
1903
1904	// Parses a possibly-negated character class expression like [^abx-z[:digit:]].
1905	// Sets s to span the remainder of the string.*
1906	// Sets out_re to the regexp for the class.*
1907	bool Regexp::ParseState::ParseCharClass(StringPiece* s,
1908	Regexp** out_re,
1909	RegexpStatus* status) {
1910	StringPiece whole_class = *s;
1911	if (s->empty() \|\| (*s)[`0`] != `'['`) {
1912	// Caller checked this.
1913	status->set_code(kRegexpInternalError);
1914	status->set_error_arg(StringPiece ());
1915	return false;
1916	}
1917	bool negated = false;
1918	Regexp* re = new Regexp (kRegexpCharClass, flags_ & ~FoldCase);
1919	re->ccb_ = new CharClassBuilder;
1920	s->remove_prefix(`1`); // '['
1921	if (!s->empty() && (*s)[`0`] == `'^'`) {
1922	s->remove_prefix(`1`); // '^'
1923	negated = true;
1924	if (!(flags_ & ClassNL) \|\| (flags_ & NeverNL)) {
1925	// If NL can't match implicitly, then pretend
1926	// negated classes include a leading \n.
1927	re->ccb_->AddRange(`'\n'`, `'\n'`);
1928	}
1929	}
1930	bool first = true; // ] is okay as first char in class
1931	while (!s->empty() && ((*s)[`0`] != `']'` \|\| first)) {
1932	// - is only okay unescaped as first or last in class.
1933	// Except that Perl allows - anywhere.
1934	if ((*s)[`0`] == `'-'` && !first && !(flags_&PerlX) &&
1935	(s->size() == `1` \|\| (*s)[`1`] != `']'`)) {
1936	StringPiece t = *s;
1937	t.remove_prefix(`1`); // '-'
1938	Rune r;
1939	int n = StringPieceToRune(&r, &t, status);
1940	if (n < `0`) {
1941	re->Decref();
1942	return false;
1943	}
1944	status->set_code(kRegexpBadCharRange);
1945	status->set_error_arg(StringPiece (s->data(), `1`+n));
1946	re->Decref();
1947	return false;
1948	}
1949	first = false;
1950
1951	// Look for [:alnum:] etc.
1952	if (s->size() > `2` && (s)[`0`] == `'['` && (s)[`1`] == `':'`) {
1953	switch (ParseCCName(s, flags_, re->ccb_, status)) {
1954	case kParseOk:
1955	continue;
1956	case kParseError:
1957	re->Decref();
1958	return false;
1959	case kParseNothing:
1960	break;
1961	}
1962	}
1963
1964	// Look for Unicode character group like \p{Han}
1965	if (s->size() > `2` &&
1966	(*s)[`0`] == `'\\'` &&
1967	((s)[`1`] == `'p'` \|\| (s)[`1`] == `'P'`)) {
1968	switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) {
1969	case kParseOk:
1970	continue;
1971	case kParseError:
1972	re->Decref();
1973	return false;
1974	case kParseNothing:
1975	break;
1976	}
1977	}
1978
1979	// Look for Perl character class symbols (extension).
1980	const UGroup *g = MaybeParsePerlCCEscape(s, flags_);
1981	if (g != NULL) {
1982	AddUGroup(re->ccb_, g, g->sign, flags_);
1983	continue;
1984	}
1985
1986	// Otherwise assume single character or simple range.
1987	RuneRange rr;
1988	if (!ParseCCRange(s, &rr, whole_class, status)) {
1989	re->Decref();
1990	return false;
1991	}
1992	// AddRangeFlags is usually called in response to a class like
1993	// \p{Foo} or [[:foo:]]; for those, it filters \n out unless
1994	// Regexp::ClassNL is set. In an explicit range or singleton
1995	// like we just parsed, we do not filter \n out, so set ClassNL
1996	// in the flags.
1997	re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ \| Regexp::ClassNL);
1998	}
1999	if (s->empty()) {
2000	status->set_code(kRegexpMissingBracket);
2001	status->set_error_arg(whole_class);
2002	re->Decref();
2003	return false;
2004	}
2005	s->remove_prefix(`1`); // ']'
2006
2007	if (negated)
2008	re->ccb_->Negate();
2009
2010	*out_re = re;
2011	return true;
2012	}
2013
2014	// Is this a valid capture name? [A-Za-z0-9_]+
2015	// PCRE limits names to 32 bytes.
2016	// Python rejects names starting with digits.
2017	// We don't enforce either of those.
2018	static bool IsValidCaptureName(const StringPiece& name) {
2019	if (name.empty())
2020	return false;
2021	for (size_t i = `0`; i < name.size(); i++) {
2022	int c = name [i];
2023	if ((`'0'` <= c && c <= `'9'`) \|\|
2024	(`'a'` <= c && c <= `'z'`) \|\|
2025	(`'A'` <= c && c <= `'Z'`) \|\|
2026	c == `'_'`)
2027	continue;
2028	return false;
2029	}
2030	return true;
2031	}
2032
2033	// Parses a Perl flag setting or non-capturing group or both,
2034	// like (?i) or (?: or (?i:. Removes from s, updates parse state.
2035	// The caller must check that s begins with "(?".
2036	// Returns true on success. If the Perl flag is not
2037	// well-formed or not supported, sets status_ and returns false.
2038	bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) {
2039	StringPiece t = *s;
2040
2041	// Caller is supposed to check this.
2042	if (!(flags_ & PerlX) \|\| t.size() < `2` \|\| t [`0`] != `'('` \|\| t [`1`] != `'?'`) {
2043	LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags";
2044	status_->set_code(kRegexpInternalError);
2045	return false;
2046	}
2047
2048	t.remove_prefix(`2`); // "(?"
2049
2050	// Check for named captures, first introduced in Python's regexp library.
2051	// As usual, there are three slightly different syntaxes:
2052	//
2053	// (?P<name>expr) the original, introduced by Python
2054	// (?<name>expr) the .NET alteration, adopted by Perl 5.10
2055	// (?'name'expr) another .NET alteration, adopted by Perl 5.10
2056	//
2057	// Perl 5.10 gave in and implemented the Python version too,
2058	// but they claim that the last two are the preferred forms.
2059	// PCRE and languages based on it (specifically, PHP and Ruby)
2060	// support all three as well. EcmaScript 4 uses only the Python form.
2061	//
2062	// In both the open source world (via Code Search) and the
2063	// Google source tree, (?P<expr>name) is the dominant form,
2064	// so that's the one we implement. One is enough.
2065	if (t.size() > `2` && t [`0`] == `'P'` && t [`1`] == `'<'`) {
2066	// Pull out name.
2067	size_t end = t.find(`'>'`, `2`);
2068	if (end == StringPiece::npos) {
2069	if (!IsValidUTF8(*s, status_))
2070	return false;
2071	status_->set_code(kRegexpBadNamedCapture);
2072	status_->set_error_arg(*s);
2073	return false;
2074	}
2075
2076	// t is "P<name>...", t[end] == '>'
2077	StringPiece capture(t.data()-`2`, end+`3`); // "(?P<name>"
2078	StringPiece name(t.data()+`2`, end-`2`); // "name"
2079	if (!IsValidUTF8(name, status_))
2080	return false;
2081	if (!IsValidCaptureName(name)) {
2082	status_->set_code(kRegexpBadNamedCapture);
2083	status_->set_error_arg(capture);
2084	return false;
2085	}
2086
2087	if (!DoLeftParen(name)) {
2088	// DoLeftParen's failure set status_.
2089	return false;
2090	}
2091
2092	s->remove_prefix(
2093	static_cast<size_t>(capture.data() + capture.size() - s->data()));
2094	return true;
2095	}
2096
2097	bool negated = false;
2098	bool sawflags = false;
2099	int nflags = flags_;
2100	Rune c;
2101	for (bool done = false; !done; ) {
2102	if (t.empty())
2103	goto BadPerlOp;
2104	if (StringPieceToRune(&c, &t, status_) < `0`)
2105	return false;
2106	switch (c) {
2107	default:
2108	goto BadPerlOp;
2109
2110	// Parse flags.
2111	case `'i'`:
2112	sawflags = true;
2113	if (negated)
2114	nflags &= ~FoldCase;
2115	else
2116	nflags \|= FoldCase;
2117	break;
2118
2119	case `'m'`: // opposite of our OneLine
2120	sawflags = true;
2121	if (negated)
2122	nflags \|= OneLine;
2123	else
2124	nflags &= ~OneLine;
2125	break;
2126
2127	case `'s'`:
2128	sawflags = true;
2129	if (negated)
2130	nflags &= ~DotNL;
2131	else
2132	nflags \|= DotNL;
2133	break;
2134
2135	case `'U'`:
2136	sawflags = true;
2137	if (negated)
2138	nflags &= ~NonGreedy;
2139	else
2140	nflags \|= NonGreedy;
2141	break;
2142
2143	// Negation
2144	case `'-'`:
2145	if (negated)
2146	goto BadPerlOp;
2147	negated = true;
2148	sawflags = false;
2149	break;
2150
2151	// Open new group.
2152	case `':'`:
2153	if (!DoLeftParenNoCapture()) {
2154	// DoLeftParenNoCapture's failure set status_.
2155	return false;
2156	}
2157	done = true;
2158	break;
2159
2160	// Finish flags.
2161	case `')'`:
2162	done = true;
2163	break;
2164	}
2165	}
2166
2167	if (negated && !sawflags)
2168	goto BadPerlOp;
2169
2170	flags_ = static_cast<Regexp::ParseFlags>(nflags);
2171	*s = t;
2172	return true;
2173
2174	BadPerlOp:
2175	status_->set_code(kRegexpBadPerlOp);
2176	status_->set_error_arg(
2177	StringPiece (s->data(), static_cast<size_t>(t.data() - s->data())));
2178	return false;
2179	}
2180
2181	// Converts latin1 (assumed to be encoded as Latin1 bytes)
2182	// into UTF8 encoding in string.
2183	// Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is
2184	// deprecated and because it rejects code points 0x80-0x9F.
2185	void ConvertLatin1ToUTF8(const StringPiece& latin1, std::string* utf) {
2186	char buf[UTFmax];
2187
2188	utf->clear();
2189	for (size_t i = `0`; i < latin1.size(); i++) {
2190	Rune r = latin1 [i] & `0xFF`;
2191	int n = runetochar(buf, &r);
2192	utf->append(buf, n);
2193	}
2194	}
2195
2196	// Parses the regular expression given by s,
2197	// returning the corresponding Regexp tree.
2198	// The caller must Decref the return value when done with it.
2199	// Returns NULL on error.
2200	Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
2201	RegexpStatus* status) {
2202	// Make status non-NULL (easier on everyone else).
2203	RegexpStatus xstatus;
2204	if (status == NULL)
2205	status = &xstatus;
2206
2207	ParseState ps(global_flags, s, status);
2208	StringPiece t = s;
2209
2210	// Convert regexp to UTF-8 (easier on the rest of the parser).
2211	if (global_flags & Latin1) {
2212	std::string* tmp = new std::string;
2213	ConvertLatin1ToUTF8(t, tmp);
2214	status->set_tmp(tmp);
2215	t = *tmp;
2216	}
2217
2218	if (global_flags & Literal) {
2219	// Special parse loop for literal string.
2220	while (!t.empty()) {
2221	Rune r;
2222	if (StringPieceToRune(&r, &t, status) < `0`)
2223	return NULL;
2224	if (!ps.PushLiteral(r))
2225	return NULL;
2226	}
2227	return ps.DoFinish();
2228	}
2229
2230	StringPiece lastunary = StringPiece ();
2231	while (!t.empty()) {
2232	StringPiece isunary = StringPiece ();
2233	switch (t [`0`]) {
2234	default: {
2235	Rune r;
2236	if (StringPieceToRune(&r, &t, status) < `0`)
2237	return NULL;
2238	if (!ps.PushLiteral(r))
2239	return NULL;
2240	break;
2241	}
2242
2243	case `'('`:
2244	// "(?" introduces Perl escape.
2245	if ((ps.flags() & PerlX) && (t.size() >= `2` && t [`1`] == `'?'`)) {
2246	// Flag changes and non-capturing groups.
2247	if (!ps.ParsePerlFlags(&t))
2248	return NULL;
2249	break;
2250	}
2251	if (ps.flags() & NeverCapture) {
2252	if (!ps.DoLeftParenNoCapture())
2253	return NULL;
2254	} else {
2255	if (!ps.DoLeftParen(StringPiece ()))
2256	return NULL;
2257	}
2258	t.remove_prefix(`1`); // '('
2259	break;
2260
2261	case `'\|'`:
2262	if (!ps.DoVerticalBar())
2263	return NULL;
2264	t.remove_prefix(`1`); // '\|'
2265	break;
2266
2267	case `')'`:
2268	if (!ps.DoRightParen())
2269	return NULL;
2270	t.remove_prefix(`1`); // ')'
2271	break;
2272
2273	case `'^'`: // Beginning of line.
2274	if (!ps.PushCarat())
2275	return NULL;
2276	t.remove_prefix(`1`); // '^'
2277	break;
2278
2279	case `'$'`: // End of line.
2280	if (!ps.PushDollar())
2281	return NULL;
2282	t.remove_prefix(`1`); // '$'
2283	break;
2284
2285	case `'.'`: // Any character (possibly except newline).
2286	if (!ps.PushDot())
2287	return NULL;
2288	t.remove_prefix(`1`); // '.'
2289	break;
2290
2291	case `'['`: { // Character class.
2292	Regexp* re;
2293	if (!ps.ParseCharClass(&t, &re, status))
2294	return NULL;
2295	if (!ps.PushRegexp(re))
2296	return NULL;
2297	break;
2298	}
2299
2300	case `''`: { // Zero or more.*
2301	RegexpOp op;
2302	op = kRegexpStar;
2303	goto Rep;
2304	case `'+'`: // One or more.
2305	op = kRegexpPlus;
2306	goto Rep;
2307	case `'?'`: // Zero or one.
2308	op = kRegexpQuest;
2309	goto Rep;
2310	Rep:
2311	StringPiece opstr = t;
2312	bool nongreedy = false;
2313	t.remove_prefix(`1`); // '' or '+' or '?'*
2314	if (ps.flags() & PerlX) {
2315	if (!t.empty() && t [`0`] == `'?'`) {
2316	nongreedy = true;
2317	t.remove_prefix(`1`); // '?'
2318	}
2319	if (!lastunary.empty()) {
2320	// In Perl it is not allowed to stack repetition operators:
2321	// a* is a syntax error, not a double-star.*
2322	// (and a++ means something else entirely, which we don't support!)
2323	status->set_code(kRegexpRepeatOp);
2324	status->set_error_arg(StringPiece (
2325	lastunary.data(),
2326	static_cast<size_t>(t.data() - lastunary.data())));
2327	return NULL;
2328	}
2329	}
2330	opstr = StringPiece (opstr.data(),
2331	static_cast<size_t>(t.data() - opstr.data()));
2332	if (!ps.PushRepeatOp(op, opstr, nongreedy))
2333	return NULL;
2334	isunary = opstr;
2335	break;
2336	}
2337
2338	case `'{'`: { // Counted repetition.
2339	int lo, hi;
2340	StringPiece opstr = t;
2341	if (!MaybeParseRepetition(&t, &lo, &hi)) {
2342	// Treat like a literal.
2343	if (!ps.PushLiteral(`'{'`))
2344	return NULL;
2345	t.remove_prefix(`1`); // '{'
2346	break;
2347	}
2348	bool nongreedy = false;
2349	if (ps.flags() & PerlX) {
2350	if (!t.empty() && t [`0`] == `'?'`) {
2351	nongreedy = true;
2352	t.remove_prefix(`1`); // '?'
2353	}
2354	if (!lastunary.empty()) {
2355	// Not allowed to stack repetition operators.
2356	status->set_code(kRegexpRepeatOp);
2357	status->set_error_arg(StringPiece (
2358	lastunary.data(),
2359	static_cast<size_t>(t.data() - lastunary.data())));
2360	return NULL;
2361	}
2362	}
2363	opstr = StringPiece (opstr.data(),
2364	static_cast<size_t>(t.data() - opstr.data()));
2365	if (!ps.PushRepetition(lo, hi, opstr, nongreedy))
2366	return NULL;
2367	isunary = opstr;
2368	break;
2369	}
2370
2371	case `'\\'`: { // Escaped character or Perl sequence.
2372	// \b and \B: word boundary or not
2373	if ((ps.flags() & Regexp::PerlB) &&
2374	t.size() >= `2` && (t [`1`] == `'b'` \|\| t [`1`] == `'B'`)) {
2375	if (!ps.PushWordBoundary(t [`1`] == `'b'`))
2376	return NULL;
2377	t.remove_prefix(`2`); // '\\', 'b'
2378	break;
2379	}
2380
2381	if ((ps.flags() & Regexp::PerlX) && t.size() >= `2`) {
2382	if (t [`1`] == `'A'`) {
2383	if (!ps.PushSimpleOp(kRegexpBeginText))
2384	return NULL;
2385	t.remove_prefix(`2`); // '\\', 'A'
2386	break;
2387	}
2388	if (t [`1`] == `'z'`) {
2389	if (!ps.PushSimpleOp(kRegexpEndText))
2390	return NULL;
2391	t.remove_prefix(`2`); // '\\', 'z'
2392	break;
2393	}
2394	// Do not recognize \Z, because this library can't
2395	// implement the exact Perl/PCRE semantics.
2396	// (This library treats "(?-m)$" as \z, even though
2397	// in Perl and PCRE it is equivalent to \Z.)
2398
2399	if (t [`1`] == `'C'`) { // \C: any byte [sic]
2400	if (!ps.PushSimpleOp(kRegexpAnyByte))
2401	return NULL;
2402	t.remove_prefix(`2`); // '\\', 'C'
2403	break;
2404	}
2405
2406	if (t [`1`] == `'Q'`) { // \Q ... \E: the ... is always literals
2407	t.remove_prefix(`2`); // '\\', 'Q'
2408	while (!t.empty()) {
2409	if (t.size() >= `2` && t [`0`] == `'\\'` && t [`1`] == `'E'`) {
2410	t.remove_prefix(`2`); // '\\', 'E'
2411	break;
2412	}
2413	Rune r;
2414	if (StringPieceToRune(&r, &t, status) < `0`)
2415	return NULL;
2416	if (!ps.PushLiteral(r))
2417	return NULL;
2418	}
2419	break;
2420	}
2421	}
2422
2423	if (t.size() >= `2` && (t [`1`] == `'p'` \|\| t [`1`] == `'P'`)) {
2424	Regexp* re = new Regexp (kRegexpCharClass, ps.flags() & ~FoldCase);
2425	re->ccb_ = new CharClassBuilder;
2426	switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) {
2427	case kParseOk:
2428	if (!ps.PushRegexp(re))
2429	return NULL;
2430	goto Break2;
2431	case kParseError:
2432	re->Decref();
2433	return NULL;
2434	case kParseNothing:
2435	re->Decref();
2436	break;
2437	}
2438	}
2439
2440	const UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags());
2441	if (g != NULL) {
2442	Regexp* re = new Regexp (kRegexpCharClass, ps.flags() & ~FoldCase);
2443	re->ccb_ = new CharClassBuilder;
2444	AddUGroup(re->ccb_, g, g->sign, ps.flags());
2445	if (!ps.PushRegexp(re))
2446	return NULL;
2447	break;
2448	}
2449
2450	Rune r;
2451	if (!ParseEscape(&t, &r, status, ps.rune_max()))
2452	return NULL;
2453	if (!ps.PushLiteral(r))
2454	return NULL;
2455	break;
2456	}
2457	}
2458	Break2:
2459	lastunary = isunary;
2460	}
2461	return ps.DoFinish();
2462	}
2463
2464	} // namespace re2
2465

Browse the source code of RE2/re2/parse.cc