1 | // Copyright 2008 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // Determine whether this library should match PCRE exactly |
6 | // for a particular Regexp. (If so, the testing framework can |
7 | // check that it does.) |
8 | // |
9 | // This library matches PCRE except in these cases: |
10 | // * the regexp contains a repetition of an empty string, |
11 | // like (a*)* or (a*)+. In this case, PCRE will treat |
12 | // the repetition sequence as ending with an empty string, |
13 | // while this library does not. |
14 | // * Perl and PCRE differ on whether \v matches \n. |
15 | // For historical reasons, this library implements the Perl behavior. |
16 | // * Perl and PCRE allow $ in one-line mode to match either the very |
17 | // end of the text or just before a \n at the end of the text. |
18 | // This library requires it to match only the end of the text. |
19 | // * Similarly, Perl and PCRE do not allow ^ in multi-line mode to |
20 | // match the end of the text if the last character is a \n. |
21 | // This library does allow it. |
22 | // |
23 | // Regexp::MimicsPCRE checks for any of these conditions. |
24 | |
25 | #include "util/util.h" |
26 | #include "util/logging.h" |
27 | #include "re2/regexp.h" |
28 | #include "re2/walker-inl.h" |
29 | |
30 | namespace re2 { |
31 | |
32 | // Returns whether re might match an empty string. |
33 | static bool CanBeEmptyString(Regexp *re); |
34 | |
35 | // Walker class to compute whether library handles a regexp |
36 | // exactly as PCRE would. See comment at top for conditions. |
37 | |
38 | class PCREWalker : public Regexp::Walker<bool> { |
39 | public: |
40 | PCREWalker() {} |
41 | bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args, |
42 | int nchild_args); |
43 | |
44 | bool ShortVisit(Regexp* re, bool a) { |
45 | // Should never be called: we use Walk not WalkExponential. |
46 | LOG(DFATAL) << "EmptyStringWalker::ShortVisit called" ; |
47 | return a; |
48 | } |
49 | }; |
50 | |
51 | // Called after visiting each of re's children and accumulating |
52 | // the return values in child_args. So child_args contains whether |
53 | // this library mimics PCRE for those subexpressions. |
54 | bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, |
55 | bool* child_args, int nchild_args) { |
56 | // If children failed, so do we. |
57 | for (int i = 0; i < nchild_args; i++) |
58 | if (!child_args[i]) |
59 | return false; |
60 | |
61 | // Otherwise look for other reasons to fail. |
62 | switch (re->op()) { |
63 | // Look for repeated empty string. |
64 | case kRegexpStar: |
65 | case kRegexpPlus: |
66 | case kRegexpQuest: |
67 | if (CanBeEmptyString(re->sub()[0])) |
68 | return false; |
69 | break; |
70 | case kRegexpRepeat: |
71 | if (re->max() == -1 && CanBeEmptyString(re->sub()[0])) |
72 | return false; |
73 | break; |
74 | |
75 | // Look for \v |
76 | case kRegexpLiteral: |
77 | if (re->rune() == '\v') |
78 | return false; |
79 | break; |
80 | |
81 | // Look for $ in single-line mode. |
82 | case kRegexpEndText: |
83 | case kRegexpEmptyMatch: |
84 | if (re->parse_flags() & Regexp::WasDollar) |
85 | return false; |
86 | break; |
87 | |
88 | // Look for ^ in multi-line mode. |
89 | case kRegexpBeginLine: |
90 | // No condition: in single-line mode ^ becomes kRegexpBeginText. |
91 | return false; |
92 | |
93 | default: |
94 | break; |
95 | } |
96 | |
97 | // Not proven guilty. |
98 | return true; |
99 | } |
100 | |
101 | // Returns whether this regexp's behavior will mimic PCRE's exactly. |
102 | bool Regexp::MimicsPCRE() { |
103 | PCREWalker w; |
104 | return w.Walk(this, true); |
105 | } |
106 | |
107 | |
108 | // Walker class to compute whether a Regexp can match an empty string. |
109 | // It is okay to overestimate. For example, \b\B cannot match an empty |
110 | // string, because \b and \B are mutually exclusive, but this isn't |
111 | // that smart and will say it can. Spurious empty strings |
112 | // will reduce the number of regexps we sanity check against PCRE, |
113 | // but they won't break anything. |
114 | |
115 | class EmptyStringWalker : public Regexp::Walker<bool> { |
116 | public: |
117 | EmptyStringWalker() { } |
118 | bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, |
119 | bool* child_args, int nchild_args); |
120 | |
121 | bool ShortVisit(Regexp* re, bool a) { |
122 | // Should never be called: we use Walk not WalkExponential. |
123 | LOG(DFATAL) << "EmptyStringWalker::ShortVisit called" ; |
124 | return a; |
125 | } |
126 | |
127 | private: |
128 | EmptyStringWalker(const EmptyStringWalker&) = delete; |
129 | EmptyStringWalker& operator=(const EmptyStringWalker&) = delete; |
130 | }; |
131 | |
132 | // Called after visiting re's children. child_args contains the return |
133 | // value from each of the children's PostVisits (i.e., whether each child |
134 | // can match an empty string). Returns whether this clause can match an |
135 | // empty string. |
136 | bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, |
137 | bool* child_args, int nchild_args) { |
138 | switch (re->op()) { |
139 | case kRegexpNoMatch: // never empty |
140 | case kRegexpLiteral: |
141 | case kRegexpAnyChar: |
142 | case kRegexpAnyByte: |
143 | case kRegexpCharClass: |
144 | case kRegexpLiteralString: |
145 | return false; |
146 | |
147 | case kRegexpEmptyMatch: // always empty |
148 | case kRegexpBeginLine: // always empty, when they match |
149 | case kRegexpEndLine: |
150 | case kRegexpNoWordBoundary: |
151 | case kRegexpWordBoundary: |
152 | case kRegexpBeginText: |
153 | case kRegexpEndText: |
154 | case kRegexpStar: // can always be empty |
155 | case kRegexpQuest: |
156 | case kRegexpHaveMatch: |
157 | return true; |
158 | |
159 | case kRegexpConcat: // can be empty if all children can |
160 | for (int i = 0; i < nchild_args; i++) |
161 | if (!child_args[i]) |
162 | return false; |
163 | return true; |
164 | |
165 | case kRegexpAlternate: // can be empty if any child can |
166 | for (int i = 0; i < nchild_args; i++) |
167 | if (child_args[i]) |
168 | return true; |
169 | return false; |
170 | |
171 | case kRegexpPlus: // can be empty if the child can |
172 | case kRegexpCapture: |
173 | return child_args[0]; |
174 | |
175 | case kRegexpRepeat: // can be empty if child can or is x{0} |
176 | return child_args[0] || re->min() == 0; |
177 | } |
178 | return false; |
179 | } |
180 | |
181 | // Returns whether re can match an empty string. |
182 | static bool CanBeEmptyString(Regexp* re) { |
183 | EmptyStringWalker w; |
184 | return w.Walk(re, true); |
185 | } |
186 | |
187 | } // namespace re2 |
188 | |