1// Copyright 2008 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Determine whether this library should match PCRE exactly
6// for a particular Regexp. (If so, the testing framework can
7// check that it does.)
8//
9// This library matches PCRE except in these cases:
10// * the regexp contains a repetition of an empty string,
11// like (a*)* or (a*)+. In this case, PCRE will treat
12// the repetition sequence as ending with an empty string,
13// while this library does not.
14// * Perl and PCRE differ on whether \v matches \n.
15// For historical reasons, this library implements the Perl behavior.
16// * Perl and PCRE allow $ in one-line mode to match either the very
17// end of the text or just before a \n at the end of the text.
18// This library requires it to match only the end of the text.
19// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
20// match the end of the text if the last character is a \n.
21// This library does allow it.
22//
23// Regexp::MimicsPCRE checks for any of these conditions.
24
25#include "util/util.h"
26#include "util/logging.h"
27#include "re2/regexp.h"
28#include "re2/walker-inl.h"
29
30namespace re2 {
31
32// Returns whether re might match an empty string.
33static bool CanBeEmptyString(Regexp *re);
34
35// Walker class to compute whether library handles a regexp
36// exactly as PCRE would. See comment at top for conditions.
37
38class PCREWalker : public Regexp::Walker<bool> {
39 public:
40 PCREWalker() {}
41 bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
42 int nchild_args);
43
44 bool ShortVisit(Regexp* re, bool a) {
45 // Should never be called: we use Walk not WalkExponential.
46 LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
47 return a;
48 }
49};
50
51// Called after visiting each of re's children and accumulating
52// the return values in child_args. So child_args contains whether
53// this library mimics PCRE for those subexpressions.
54bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
55 bool* child_args, int nchild_args) {
56 // If children failed, so do we.
57 for (int i = 0; i < nchild_args; i++)
58 if (!child_args[i])
59 return false;
60
61 // Otherwise look for other reasons to fail.
62 switch (re->op()) {
63 // Look for repeated empty string.
64 case kRegexpStar:
65 case kRegexpPlus:
66 case kRegexpQuest:
67 if (CanBeEmptyString(re->sub()[0]))
68 return false;
69 break;
70 case kRegexpRepeat:
71 if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
72 return false;
73 break;
74
75 // Look for \v
76 case kRegexpLiteral:
77 if (re->rune() == '\v')
78 return false;
79 break;
80
81 // Look for $ in single-line mode.
82 case kRegexpEndText:
83 case kRegexpEmptyMatch:
84 if (re->parse_flags() & Regexp::WasDollar)
85 return false;
86 break;
87
88 // Look for ^ in multi-line mode.
89 case kRegexpBeginLine:
90 // No condition: in single-line mode ^ becomes kRegexpBeginText.
91 return false;
92
93 default:
94 break;
95 }
96
97 // Not proven guilty.
98 return true;
99}
100
101// Returns whether this regexp's behavior will mimic PCRE's exactly.
102bool Regexp::MimicsPCRE() {
103 PCREWalker w;
104 return w.Walk(this, true);
105}
106
107
108// Walker class to compute whether a Regexp can match an empty string.
109// It is okay to overestimate. For example, \b\B cannot match an empty
110// string, because \b and \B are mutually exclusive, but this isn't
111// that smart and will say it can. Spurious empty strings
112// will reduce the number of regexps we sanity check against PCRE,
113// but they won't break anything.
114
115class EmptyStringWalker : public Regexp::Walker<bool> {
116 public:
117 EmptyStringWalker() { }
118 bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
119 bool* child_args, int nchild_args);
120
121 bool ShortVisit(Regexp* re, bool a) {
122 // Should never be called: we use Walk not WalkExponential.
123 LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
124 return a;
125 }
126
127 private:
128 EmptyStringWalker(const EmptyStringWalker&) = delete;
129 EmptyStringWalker& operator=(const EmptyStringWalker&) = delete;
130};
131
132// Called after visiting re's children. child_args contains the return
133// value from each of the children's PostVisits (i.e., whether each child
134// can match an empty string). Returns whether this clause can match an
135// empty string.
136bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
137 bool* child_args, int nchild_args) {
138 switch (re->op()) {
139 case kRegexpNoMatch: // never empty
140 case kRegexpLiteral:
141 case kRegexpAnyChar:
142 case kRegexpAnyByte:
143 case kRegexpCharClass:
144 case kRegexpLiteralString:
145 return false;
146
147 case kRegexpEmptyMatch: // always empty
148 case kRegexpBeginLine: // always empty, when they match
149 case kRegexpEndLine:
150 case kRegexpNoWordBoundary:
151 case kRegexpWordBoundary:
152 case kRegexpBeginText:
153 case kRegexpEndText:
154 case kRegexpStar: // can always be empty
155 case kRegexpQuest:
156 case kRegexpHaveMatch:
157 return true;
158
159 case kRegexpConcat: // can be empty if all children can
160 for (int i = 0; i < nchild_args; i++)
161 if (!child_args[i])
162 return false;
163 return true;
164
165 case kRegexpAlternate: // can be empty if any child can
166 for (int i = 0; i < nchild_args; i++)
167 if (child_args[i])
168 return true;
169 return false;
170
171 case kRegexpPlus: // can be empty if the child can
172 case kRegexpCapture:
173 return child_args[0];
174
175 case kRegexpRepeat: // can be empty if child can or is x{0}
176 return child_args[0] || re->min() == 0;
177 }
178 return false;
179}
180
181// Returns whether re can match an empty string.
182static bool CanBeEmptyString(Regexp* re) {
183 EmptyStringWalker w;
184 return w.Walk(re, true);
185}
186
187} // namespace re2
188