simplify.cc source code [RE2/re2/simplify.cc]

1	// Copyright 2006 The RE2 Authors. All Rights Reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	// Rewrite POSIX and other features in re
6	// to use simple extended regular expression features.
7	// Also sort and simplify character classes.
8
9	#include <string>
10
11	#include "util/util.h"
12	#include "util/logging.h"
13	#include "util/utf.h"
14	#include "re2/pod_array.h"
15	#include "re2/regexp.h"
16	#include "re2/walker-inl.h"
17
18	namespace re2 {
19
20	// Parses the regexp src and then simplifies it and sets dst to the*
21	// string representation of the simplified form. Returns true on success.
22	// Returns false and sets error (if error != NULL) on error.*
23	bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
24	std::string* dst, RegexpStatus* status) {
25	Regexp* re = Parse(src, flags, status);
26	if (re == NULL)
27	return false;
28	Regexp* sre = re->Simplify();
29	re->Decref();
30	if (sre == NULL) {
31	// Should not happen, since Simplify never fails.
32	LOG(ERROR) << "Simplify failed on " << src;
33	if (status) {
34	status->set_code(kRegexpInternalError);
35	status->set_error_arg(src);
36	}
37	return false;
38	}
39	*dst = sre->ToString();
40	sre->Decref();
41	return true;
42	}
43
44	// Assuming the simple_ flags on the children are accurate,
45	// is this Regexp simple?*
46	bool Regexp::ComputeSimple() {
47	Regexp** subs;
48	switch (op_) {
49	case kRegexpNoMatch:
50	case kRegexpEmptyMatch:
51	case kRegexpLiteral:
52	case kRegexpLiteralString:
53	case kRegexpBeginLine:
54	case kRegexpEndLine:
55	case kRegexpBeginText:
56	case kRegexpWordBoundary:
57	case kRegexpNoWordBoundary:
58	case kRegexpEndText:
59	case kRegexpAnyChar:
60	case kRegexpAnyByte:
61	case kRegexpHaveMatch:
62	return true;
63	case kRegexpConcat:
64	case kRegexpAlternate:
65	// These are simple as long as the subpieces are simple.
66	subs = sub();
67	for (int i = `0`; i < nsub_; i++)
68	if (!subs[i]->simple())
69	return false;
70	return true;
71	case kRegexpCharClass:
72	// Simple as long as the char class is not empty, not full.
73	if (ccb_ != NULL)
74	return !ccb_->empty() && !ccb_->full();
75	return !cc_->empty() && !cc_->full();
76	case kRegexpCapture:
77	subs = sub();
78	return subs[`0`]->simple();
79	case kRegexpStar:
80	case kRegexpPlus:
81	case kRegexpQuest:
82	subs = sub();
83	if (!subs[`0`]->simple())
84	return false;
85	switch (subs[`0`]->op_) {
86	case kRegexpStar:
87	case kRegexpPlus:
88	case kRegexpQuest:
89	case kRegexpEmptyMatch:
90	case kRegexpNoMatch:
91	return false;
92	default:
93	break;
94	}
95	return true;
96	case kRegexpRepeat:
97	return false;
98	}
99	LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
100	return false;
101	}
102
103	// Walker subclass used by Simplify.
104	// Coalesces runs of star/plus/quest/repeat of the same literal along with any
105	// occurrences of that literal into repeats of that literal. It also works for
106	// char classes, any char and any byte.
107	// PostVisit creates the coalesced result, which should then be simplified.
108	class CoalesceWalker : public Regexp::Walker<Regexp*> {
109	public:
110	CoalesceWalker() {}
111	virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
112	Regexp** child_args, int nchild_args);
113	virtual Regexp* Copy(Regexp* re);
114	virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
115
116	private:
117	// These functions are declared inside CoalesceWalker so that
118	// they can edit the private fields of the Regexps they construct.
119
120	// Returns true if r1 and r2 can be coalesced. In particular, ensures that
121	// the parse flags are consistent. (They will not be checked again later.)
122	static bool CanCoalesce(Regexp* r1, Regexp* r2);
123
124	// Coalesces r1ptr and r2ptr. In most cases, the array elements afterwards
125	// will be empty match and the coalesced op. In other cases, where part of a
126	// literal string was removed to be coalesced, the array elements afterwards
127	// will be the coalesced op and the remainder of the literal string.
128	static void DoCoalesce(Regexp r1ptr, Regexp r2ptr);
129
130	CoalesceWalker(const CoalesceWalker&) = delete;
131	CoalesceWalker& operator=(const CoalesceWalker&) = delete;
132	};
133
134	// Walker subclass used by Simplify.
135	// The simplify walk is purely post-recursive: given the simplified children,
136	// PostVisit creates the simplified result.
137	// The child_args are simplified Regexps.*
138	class SimplifyWalker : public Regexp::Walker<Regexp*> {
139	public:
140	SimplifyWalker() {}
141	virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
142	virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
143	Regexp** child_args, int nchild_args);
144	virtual Regexp* Copy(Regexp* re);
145	virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
146
147	private:
148	// These functions are declared inside SimplifyWalker so that
149	// they can edit the private fields of the Regexps they construct.
150
151	// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
152	// Caller must Decref return value when done with it.
153	static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
154
155	// Simplifies the expression re{min,max} in terms of , +, and ?.*
156	// Returns a new regexp. Does not edit re. Does not consume reference to re.
157	// Caller must Decref return value when done with it.
158	static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
159	Regexp::ParseFlags parse_flags);
160
161	// Simplifies a character class by expanding any named classes
162	// into rune ranges. Does not edit re. Does not consume ref to re.
163	// Caller must Decref return value when done with it.
164	static Regexp* SimplifyCharClass(Regexp* re);
165
166	SimplifyWalker(const SimplifyWalker&) = delete;
167	SimplifyWalker& operator=(const SimplifyWalker&) = delete;
168	};
169
170	// Simplifies a regular expression, returning a new regexp.
171	// The new regexp uses traditional Unix egrep features only,
172	// plus the Perl (?:) non-capturing parentheses.
173	// Otherwise, no POSIX or Perl additions. The new regexp
174	// captures exactly the same subexpressions (with the same indices)
175	// as the original.
176	// Does not edit current object.
177	// Caller must Decref() return value when done with it.
178
179	Regexp* Regexp::Simplify() {
180	CoalesceWalker cw;
181	Regexp* cre = cw.Walk(this, NULL);
182	if (cre == NULL)
183	return cre;
184	SimplifyWalker sw;
185	Regexp* sre = sw.Walk(cre, NULL);
186	cre->Decref();
187	return sre;
188	}
189
190	#define Simplify DontCallSimplify // Avoid accidental recursion
191
192	// Utility function for PostVisit implementations that compares re->sub() with
193	// child_args to determine whether any child_args changed. In the common case,
194	// where nothing changed, calls Decref() for all child_args and returns false,
195	// so PostVisit must return re->Incref(). Otherwise, returns true.
196	static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
197	for (int i = `0`; i < re->nsub(); i++) {
198	Regexp* sub = re->sub()[i];
199	Regexp* newsub = child_args[i];
200	if (newsub != sub)
201	return true;
202	}
203	for (int i = `0`; i < re->nsub(); i++) {
204	Regexp* newsub = child_args[i];
205	newsub->Decref();
206	}
207	return false;
208	}
209
210	Regexp* CoalesceWalker::Copy(Regexp* re) {
211	return re->Incref();
212	}
213
214	Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
215	// This should never be called, since we use Walk and not
216	// WalkExponential.
217	LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
218	return re->Incref();
219	}
220
221	Regexp* CoalesceWalker::PostVisit(Regexp* re,
222	Regexp* parent_arg,
223	Regexp* pre_arg,
224	Regexp** child_args,
225	int nchild_args) {
226	if (re->nsub() == `0`)
227	return re->Incref();
228
229	if (re->op() != kRegexpConcat) {
230	if (!ChildArgsChanged(re, child_args))
231	return re->Incref();
232
233	// Something changed. Build a new op.
234	Regexp* nre = new Regexp (re->op(), re->parse_flags());
235	nre->AllocSub(re->nsub());
236	Regexp** nre_subs = nre->sub();
237	for (int i = `0`; i < re->nsub(); i++)
238	nre_subs[i] = child_args[i];
239	// Repeats and Captures have additional data that must be copied.
240	if (re->op() == kRegexpRepeat) {
241	nre->min_ = re->min();
242	nre->max_ = re->max();
243	} else if (re->op() == kRegexpCapture) {
244	nre->cap_ = re->cap();
245	}
246	return nre;
247	}
248
249	bool can_coalesce = false;
250	for (int i = `0`; i < re->nsub(); i++) {
251	if (i+`1` < re->nsub() &&
252	CanCoalesce(child_args[i], child_args[i+`1`])) {
253	can_coalesce = true;
254	break;
255	}
256	}
257	if (!can_coalesce) {
258	if (!ChildArgsChanged(re, child_args))
259	return re->Incref();
260
261	// Something changed. Build a new op.
262	Regexp* nre = new Regexp (re->op(), re->parse_flags());
263	nre->AllocSub(re->nsub());
264	Regexp** nre_subs = nre->sub();
265	for (int i = `0`; i < re->nsub(); i++)
266	nre_subs[i] = child_args[i];
267	return nre;
268	}
269
270	for (int i = `0`; i < re->nsub(); i++) {
271	if (i+`1` < re->nsub() &&
272	CanCoalesce(child_args[i], child_args[i+`1`]))
273	DoCoalesce(&child_args[i], &child_args[i+`1`]);
274	}
275	// Determine how many empty matches were left by DoCoalesce.
276	int n = `0`;
277	for (int i = n; i < re->nsub(); i++) {
278	if (child_args[i]->op() == kRegexpEmptyMatch)
279	n++;
280	}
281	// Build a new op.
282	Regexp* nre = new Regexp (re->op(), re->parse_flags());
283	nre->AllocSub(re->nsub() - n);
284	Regexp** nre_subs = nre->sub();
285	for (int i = `0`, j = `0`; i < re->nsub(); i++) {
286	if (child_args[i]->op() == kRegexpEmptyMatch) {
287	child_args[i]->Decref();
288	continue;
289	}
290	nre_subs[j] = child_args[i];
291	j++;
292	}
293	return nre;
294	}
295
296	bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
297	// r1 must be a star/plus/quest/repeat of a literal, char class, any char or
298	// any byte.
299	if ((r1->op() == kRegexpStar \|\|
300	r1->op() == kRegexpPlus \|\|
301	r1->op() == kRegexpQuest \|\|
302	r1->op() == kRegexpRepeat) &&
303	(r1->sub()[`0`]->op() == kRegexpLiteral \|\|
304	r1->sub()[`0`]->op() == kRegexpCharClass \|\|
305	r1->sub()[`0`]->op() == kRegexpAnyChar \|\|
306	r1->sub()[`0`]->op() == kRegexpAnyByte)) {
307	// r2 must be a star/plus/quest/repeat of the same literal, char class,
308	// any char or any byte.
309	if ((r2->op() == kRegexpStar \|\|
310	r2->op() == kRegexpPlus \|\|
311	r2->op() == kRegexpQuest \|\|
312	r2->op() == kRegexpRepeat) &&
313	Regexp::Equal(r1->sub()[`0`], r2->sub()[`0`]) &&
314	// The parse flags must be consistent.
315	((r1->parse_flags() & Regexp::NonGreedy) ==
316	(r2->parse_flags() & Regexp::NonGreedy))) {
317	return true;
318	}
319	// ... OR an occurrence of that literal, char class, any char or any byte
320	if (Regexp::Equal(r1->sub()[`0`], r2)) {
321	return true;
322	}
323	// ... OR a literal string that begins with that literal.
324	if (r1->sub()[`0`]->op() == kRegexpLiteral &&
325	r2->op() == kRegexpLiteralString &&
326	r2->runes()[`0`] == r1->sub()[`0`]->rune() &&
327	// The parse flags must be consistent.
328	((r1->sub()[`0`]->parse_flags() & Regexp::FoldCase) ==
329	(r2->parse_flags() & Regexp::FoldCase))) {
330	return true;
331	}
332	}
333	return false;
334	}
335
336	void CoalesceWalker::DoCoalesce(Regexp r1ptr, Regexp r2ptr) {
337	Regexp* r1 = *r1ptr;
338	Regexp* r2 = *r2ptr;
339
340	Regexp* nre = Regexp::Repeat(
341	r1->sub()[`0`]->Incref(), r1->parse_flags(), `0`, `0`);
342
343	switch (r1->op()) {
344	case kRegexpStar:
345	nre->min_ = `0`;
346	nre->max_ = -`1`;
347	break;
348
349	case kRegexpPlus:
350	nre->min_ = `1`;
351	nre->max_ = -`1`;
352	break;
353
354	case kRegexpQuest:
355	nre->min_ = `0`;
356	nre->max_ = `1`;
357	break;
358
359	case kRegexpRepeat:
360	nre->min_ = r1->min();
361	nre->max_ = r1->max();
362	break;
363
364	default:
365	LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
366	nre->Decref();
367	return;
368	}
369
370	switch (r2->op()) {
371	case kRegexpStar:
372	nre->max_ = -`1`;
373	goto LeaveEmpty;
374
375	case kRegexpPlus:
376	nre->min_++;
377	nre->max_ = -`1`;
378	goto LeaveEmpty;
379
380	case kRegexpQuest:
381	if (nre->max() != -`1`)
382	nre->max_++;
383	goto LeaveEmpty;
384
385	case kRegexpRepeat:
386	nre->min_ += r2->min();
387	if (r2->max() == -`1`)
388	nre->max_ = -`1`;
389	else if (nre->max() != -`1`)
390	nre->max_ += r2->max();
391	goto LeaveEmpty;
392
393	case kRegexpLiteral:
394	case kRegexpCharClass:
395	case kRegexpAnyChar:
396	case kRegexpAnyByte:
397	nre->min_++;
398	if (nre->max() != -`1`)
399	nre->max_++;
400	goto LeaveEmpty;
401
402	LeaveEmpty:
403	r1ptr = new* Regexp (kRegexpEmptyMatch, Regexp::NoParseFlags);
404	*r2ptr = nre;
405	break;
406
407	case kRegexpLiteralString: {
408	Rune r = r1->sub()[`0`]->rune();
409	// Determine how much of the literal string is removed.
410	// We know that we have at least one rune. :)
411	int n = `1`;
412	while (n < r2->nrunes() && r2->runes()[n] == r)
413	n++;
414	nre->min_ += n;
415	if (nre->max() != -`1`)
416	nre->max_ += n;
417	if (n == r2->nrunes())
418	goto LeaveEmpty;
419	*r1ptr = nre;
420	*r2ptr = Regexp::LiteralString(
421	&r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
422	break;
423	}
424
425	default:
426	LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
427	nre->Decref();
428	return;
429	}
430
431	r1->Decref();
432	r2->Decref();
433	}
434
435	Regexp* SimplifyWalker::Copy(Regexp* re) {
436	return re->Incref();
437	}
438
439	Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
440	// This should never be called, since we use Walk and not
441	// WalkExponential.
442	LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
443	return re->Incref();
444	}
445
446	Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
447	if (re->simple()) {
448	stop = true*;
449	return re->Incref();
450	}
451	return NULL;
452	}
453
454	Regexp* SimplifyWalker::PostVisit(Regexp* re,
455	Regexp* parent_arg,
456	Regexp* pre_arg,
457	Regexp** child_args,
458	int nchild_args) {
459	switch (re->op()) {
460	case kRegexpNoMatch:
461	case kRegexpEmptyMatch:
462	case kRegexpLiteral:
463	case kRegexpLiteralString:
464	case kRegexpBeginLine:
465	case kRegexpEndLine:
466	case kRegexpBeginText:
467	case kRegexpWordBoundary:
468	case kRegexpNoWordBoundary:
469	case kRegexpEndText:
470	case kRegexpAnyChar:
471	case kRegexpAnyByte:
472	case kRegexpHaveMatch:
473	// All these are always simple.
474	re->simple_ = true;
475	return re->Incref();
476
477	case kRegexpConcat:
478	case kRegexpAlternate: {
479	// These are simple as long as the subpieces are simple.
480	if (!ChildArgsChanged(re, child_args)) {
481	re->simple_ = true;
482	return re->Incref();
483	}
484	Regexp* nre = new Regexp (re->op(), re->parse_flags());
485	nre->AllocSub(re->nsub());
486	Regexp** nre_subs = nre->sub();
487	for (int i = `0`; i < re->nsub(); i++)
488	nre_subs[i] = child_args[i];
489	nre->simple_ = true;
490	return nre;
491	}
492
493	case kRegexpCapture: {
494	Regexp* newsub = child_args[`0`];
495	if (newsub == re->sub()[`0`]) {
496	newsub->Decref();
497	re->simple_ = true;
498	return re->Incref();
499	}
500	Regexp* nre = new Regexp (kRegexpCapture, re->parse_flags());
501	nre->AllocSub(`1`);
502	nre->sub()[`0`] = newsub;
503	nre->cap_ = re->cap();
504	nre->simple_ = true;
505	return nre;
506	}
507
508	case kRegexpStar:
509	case kRegexpPlus:
510	case kRegexpQuest: {
511	Regexp* newsub = child_args[`0`];
512	// Special case: repeat the empty string as much as
513	// you want, but it's still the empty string.
514	if (newsub->op() == kRegexpEmptyMatch)
515	return newsub;
516
517	// These are simple as long as the subpiece is simple.
518	if (newsub == re->sub()[`0`]) {
519	newsub->Decref();
520	re->simple_ = true;
521	return re->Incref();
522	}
523
524	// These are also idempotent if flags are constant.
525	if (re->op() == newsub->op() &&
526	re->parse_flags() == newsub->parse_flags())
527	return newsub;
528
529	Regexp* nre = new Regexp (re->op(), re->parse_flags());
530	nre->AllocSub(`1`);
531	nre->sub()[`0`] = newsub;
532	nre->simple_ = true;
533	return nre;
534	}
535
536	case kRegexpRepeat: {
537	Regexp* newsub = child_args[`0`];
538	// Special case: repeat the empty string as much as
539	// you want, but it's still the empty string.
540	if (newsub->op() == kRegexpEmptyMatch)
541	return newsub;
542
543	Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
544	re->parse_flags());
545	newsub->Decref();
546	nre->simple_ = true;
547	return nre;
548	}
549
550	case kRegexpCharClass: {
551	Regexp* nre = SimplifyCharClass(re);
552	nre->simple_ = true;
553	return nre;
554	}
555	}
556
557	LOG(ERROR) << "Simplify case not handled: " << re->op();
558	return re->Incref();
559	}
560
561	// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
562	// Returns a new Regexp, handing the ref to the caller.
563	Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
564	Regexp::ParseFlags parse_flags) {
565	Regexp* re = new Regexp (kRegexpConcat, parse_flags);
566	re->AllocSub(`2`);
567	Regexp** subs = re->sub();
568	subs[`0`] = re1;
569	subs[`1`] = re2;
570	return re;
571	}
572
573	// Simplifies the expression re{min,max} in terms of , +, and ?.*
574	// Returns a new regexp. Does not edit re. Does not consume reference to re.
575	// Caller must Decref return value when done with it.
576	// The result will not* necessarily have the right capturing parens*
577	// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
578	// but in the Regexp representation, both (x) are marked as $1.*
579	Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
580	Regexp::ParseFlags f) {
581	// x{n,} means at least n matches of x.
582	if (max == -`1`) {
583	// Special case: x{0,} is x*
584	if (min == `0`)
585	return Regexp::Star(re->Incref(), f);
586
587	// Special case: x{1,} is x+
588	if (min == `1`)
589	return Regexp::Plus(re->Incref(), f);
590
591	// General case: x{4,} is xxxx+
592	PODArray<Regexp*> nre_subs(min);
593	for (int i = `0`; i < min-`1`; i++)
594	nre_subs [i] = re->Incref();
595	nre_subs [min-`1`] = Regexp::Plus(re->Incref(), f);
596	return Regexp::Concat(nre_subs.data(), min, f);
597	}
598
599	// Special case: (x){0} matches only empty string.
600	if (min == `0` && max == `0`)
601	return new Regexp (kRegexpEmptyMatch, f);
602
603	// Special case: x{1} is just x.
604	if (min == `1` && max == `1`)
605	return re->Incref();
606
607	// General case: x{n,m} means n copies of x and m copies of x?.
608	// The machine will do less work if we nest the final m copies,
609	// so that x{2,5} = xx(x(x(x)?)?)?
610
611	// Build leading prefix: xx. Capturing only on the last one.
612	Regexp* nre = NULL;
613	if (min > `0`) {
614	PODArray<Regexp*> nre_subs(min);
615	for (int i = `0`; i < min; i++)
616	nre_subs [i] = re->Incref();
617	nre = Regexp::Concat(nre_subs.data(), min, f);
618	}
619
620	// Build and attach suffix: (x(x(x)?)?)?
621	if (max > min) {
622	Regexp* suf = Regexp::Quest(re->Incref(), f);
623	for (int i = min+`1`; i < max; i++)
624	suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
625	if (nre == NULL)
626	nre = suf;
627	else
628	nre = Concat2(nre, suf, f);
629	}
630
631	if (nre == NULL) {
632	// Some degenerate case, like min > max, or min < max < 0.
633	// This shouldn't happen, because the parser rejects such regexps.
634	LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
635	return new Regexp (kRegexpNoMatch, f);
636	}
637
638	return nre;
639	}
640
641	// Simplifies a character class.
642	// Caller must Decref return value when done with it.
643	Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
644	CharClass* cc = re->cc();
645
646	// Special cases
647	if (cc->empty())
648	return new Regexp (kRegexpNoMatch, re->parse_flags());
649	if (cc->full())
650	return new Regexp (kRegexpAnyChar, re->parse_flags());
651
652	return re->Incref();
653	}
654
655	} // namespace re2
656

Browse the source code of RE2/re2/simplify.cc