1 | // Copyright 2006 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // Format a regular expression structure as a string. |
6 | // Tested by parse_test.cc |
7 | |
8 | #include <string.h> |
9 | #include <string> |
10 | |
11 | #include "util/util.h" |
12 | #include "util/logging.h" |
13 | #include "util/strutil.h" |
14 | #include "util/utf.h" |
15 | #include "re2/regexp.h" |
16 | #include "re2/walker-inl.h" |
17 | |
18 | namespace re2 { |
19 | |
20 | enum { |
21 | PrecAtom, |
22 | PrecUnary, |
23 | PrecConcat, |
24 | PrecAlternate, |
25 | PrecEmpty, |
26 | PrecParen, |
27 | PrecToplevel, |
28 | }; |
29 | |
30 | // Helper function. See description below. |
31 | static void AppendCCRange(std::string* t, Rune lo, Rune hi); |
32 | |
33 | // Walker to generate string in s_. |
34 | // The arg pointers are actually integers giving the |
35 | // context precedence. |
36 | // The child_args are always NULL. |
37 | class ToStringWalker : public Regexp::Walker<int> { |
38 | public: |
39 | explicit ToStringWalker(std::string* t) : t_(t) {} |
40 | |
41 | virtual int PreVisit(Regexp* re, int parent_arg, bool* stop); |
42 | virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg, |
43 | int* child_args, int nchild_args); |
44 | virtual int ShortVisit(Regexp* re, int parent_arg) { |
45 | return 0; |
46 | } |
47 | |
48 | private: |
49 | std::string* t_; // The string the walker appends to. |
50 | |
51 | ToStringWalker(const ToStringWalker&) = delete; |
52 | ToStringWalker& operator=(const ToStringWalker&) = delete; |
53 | }; |
54 | |
55 | std::string Regexp::ToString() { |
56 | std::string t; |
57 | ToStringWalker w(&t); |
58 | w.WalkExponential(this, PrecToplevel, 100000); |
59 | if (w.stopped_early()) |
60 | t += " [truncated]" ; |
61 | return t; |
62 | } |
63 | |
64 | #define ToString DontCallToString // Avoid accidental recursion. |
65 | |
66 | // Visits re before children are processed. |
67 | // Appends ( if needed and passes new precedence to children. |
68 | int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { |
69 | int prec = parent_arg; |
70 | int nprec = PrecAtom; |
71 | |
72 | switch (re->op()) { |
73 | case kRegexpNoMatch: |
74 | case kRegexpEmptyMatch: |
75 | case kRegexpLiteral: |
76 | case kRegexpAnyChar: |
77 | case kRegexpAnyByte: |
78 | case kRegexpBeginLine: |
79 | case kRegexpEndLine: |
80 | case kRegexpBeginText: |
81 | case kRegexpEndText: |
82 | case kRegexpWordBoundary: |
83 | case kRegexpNoWordBoundary: |
84 | case kRegexpCharClass: |
85 | case kRegexpHaveMatch: |
86 | nprec = PrecAtom; |
87 | break; |
88 | |
89 | case kRegexpConcat: |
90 | case kRegexpLiteralString: |
91 | if (prec < PrecConcat) |
92 | t_->append("(?:" ); |
93 | nprec = PrecConcat; |
94 | break; |
95 | |
96 | case kRegexpAlternate: |
97 | if (prec < PrecAlternate) |
98 | t_->append("(?:" ); |
99 | nprec = PrecAlternate; |
100 | break; |
101 | |
102 | case kRegexpCapture: |
103 | t_->append("(" ); |
104 | if (re->cap() == 0) |
105 | LOG(DFATAL) << "kRegexpCapture cap() == 0" ; |
106 | if (re->name()) { |
107 | t_->append("?P<" ); |
108 | t_->append(*re->name()); |
109 | t_->append(">" ); |
110 | } |
111 | nprec = PrecParen; |
112 | break; |
113 | |
114 | case kRegexpStar: |
115 | case kRegexpPlus: |
116 | case kRegexpQuest: |
117 | case kRegexpRepeat: |
118 | if (prec < PrecUnary) |
119 | t_->append("(?:" ); |
120 | // The subprecedence here is PrecAtom instead of PrecUnary |
121 | // because PCRE treats two unary ops in a row as a parse error. |
122 | nprec = PrecAtom; |
123 | break; |
124 | } |
125 | |
126 | return nprec; |
127 | } |
128 | |
129 | static void AppendLiteral(std::string *t, Rune r, bool foldcase) { |
130 | if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\" , r)) { |
131 | t->append(1, '\\'); |
132 | t->append(1, static_cast<char>(r)); |
133 | } else if (foldcase && 'a' <= r && r <= 'z') { |
134 | r -= 'a' - 'A'; |
135 | t->append(1, '['); |
136 | t->append(1, static_cast<char>(r)); |
137 | t->append(1, static_cast<char>(r) + 'a' - 'A'); |
138 | t->append(1, ']'); |
139 | } else { |
140 | AppendCCRange(t, r, r); |
141 | } |
142 | } |
143 | |
144 | // Visits re after children are processed. |
145 | // For childless regexps, all the work is done here. |
146 | // For regexps with children, append any unary suffixes or ). |
147 | int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, |
148 | int* child_args, int nchild_args) { |
149 | int prec = parent_arg; |
150 | switch (re->op()) { |
151 | case kRegexpNoMatch: |
152 | // There's no simple symbol for "no match", but |
153 | // [^0-Runemax] excludes everything. |
154 | t_->append("[^\\x00-\\x{10ffff}]" ); |
155 | break; |
156 | |
157 | case kRegexpEmptyMatch: |
158 | // Append (?:) to make empty string visible, |
159 | // unless this is already being parenthesized. |
160 | if (prec < PrecEmpty) |
161 | t_->append("(?:)" ); |
162 | break; |
163 | |
164 | case kRegexpLiteral: |
165 | AppendLiteral(t_, re->rune(), |
166 | (re->parse_flags() & Regexp::FoldCase) != 0); |
167 | break; |
168 | |
169 | case kRegexpLiteralString: |
170 | for (int i = 0; i < re->nrunes(); i++) |
171 | AppendLiteral(t_, re->runes()[i], |
172 | (re->parse_flags() & Regexp::FoldCase) != 0); |
173 | if (prec < PrecConcat) |
174 | t_->append(")" ); |
175 | break; |
176 | |
177 | case kRegexpConcat: |
178 | if (prec < PrecConcat) |
179 | t_->append(")" ); |
180 | break; |
181 | |
182 | case kRegexpAlternate: |
183 | // Clumsy but workable: the children all appended | |
184 | // at the end of their strings, so just remove the last one. |
185 | if ((*t_)[t_->size()-1] == '|') |
186 | t_->erase(t_->size()-1); |
187 | else |
188 | LOG(DFATAL) << "Bad final char: " << t_; |
189 | if (prec < PrecAlternate) |
190 | t_->append(")" ); |
191 | break; |
192 | |
193 | case kRegexpStar: |
194 | t_->append("*" ); |
195 | if (re->parse_flags() & Regexp::NonGreedy) |
196 | t_->append("?" ); |
197 | if (prec < PrecUnary) |
198 | t_->append(")" ); |
199 | break; |
200 | |
201 | case kRegexpPlus: |
202 | t_->append("+" ); |
203 | if (re->parse_flags() & Regexp::NonGreedy) |
204 | t_->append("?" ); |
205 | if (prec < PrecUnary) |
206 | t_->append(")" ); |
207 | break; |
208 | |
209 | case kRegexpQuest: |
210 | t_->append("?" ); |
211 | if (re->parse_flags() & Regexp::NonGreedy) |
212 | t_->append("?" ); |
213 | if (prec < PrecUnary) |
214 | t_->append(")" ); |
215 | break; |
216 | |
217 | case kRegexpRepeat: |
218 | if (re->max() == -1) |
219 | t_->append(StringPrintf("{%d,}" , re->min())); |
220 | else if (re->min() == re->max()) |
221 | t_->append(StringPrintf("{%d}" , re->min())); |
222 | else |
223 | t_->append(StringPrintf("{%d,%d}" , re->min(), re->max())); |
224 | if (re->parse_flags() & Regexp::NonGreedy) |
225 | t_->append("?" ); |
226 | if (prec < PrecUnary) |
227 | t_->append(")" ); |
228 | break; |
229 | |
230 | case kRegexpAnyChar: |
231 | t_->append("." ); |
232 | break; |
233 | |
234 | case kRegexpAnyByte: |
235 | t_->append("\\C" ); |
236 | break; |
237 | |
238 | case kRegexpBeginLine: |
239 | t_->append("^" ); |
240 | break; |
241 | |
242 | case kRegexpEndLine: |
243 | t_->append("$" ); |
244 | break; |
245 | |
246 | case kRegexpBeginText: |
247 | t_->append("(?-m:^)" ); |
248 | break; |
249 | |
250 | case kRegexpEndText: |
251 | if (re->parse_flags() & Regexp::WasDollar) |
252 | t_->append("(?-m:$)" ); |
253 | else |
254 | t_->append("\\z" ); |
255 | break; |
256 | |
257 | case kRegexpWordBoundary: |
258 | t_->append("\\b" ); |
259 | break; |
260 | |
261 | case kRegexpNoWordBoundary: |
262 | t_->append("\\B" ); |
263 | break; |
264 | |
265 | case kRegexpCharClass: { |
266 | if (re->cc()->size() == 0) { |
267 | t_->append("[^\\x00-\\x{10ffff}]" ); |
268 | break; |
269 | } |
270 | t_->append("[" ); |
271 | // Heuristic: show class as negated if it contains the |
272 | // non-character 0xFFFE and yet somehow isn't full. |
273 | CharClass* cc = re->cc(); |
274 | if (cc->Contains(0xFFFE) && !cc->full()) { |
275 | cc = cc->Negate(); |
276 | t_->append("^" ); |
277 | } |
278 | for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) |
279 | AppendCCRange(t_, i->lo, i->hi); |
280 | if (cc != re->cc()) |
281 | cc->Delete(); |
282 | t_->append("]" ); |
283 | break; |
284 | } |
285 | |
286 | case kRegexpCapture: |
287 | t_->append(")" ); |
288 | break; |
289 | |
290 | case kRegexpHaveMatch: |
291 | // There's no syntax accepted by the parser to generate |
292 | // this node (it is generated by RE2::Set) so make something |
293 | // up that is readable but won't compile. |
294 | t_->append("(?HaveMatch:%d)" , re->match_id()); |
295 | break; |
296 | } |
297 | |
298 | // If the parent is an alternation, append the | for it. |
299 | if (prec == PrecAlternate) |
300 | t_->append("|" ); |
301 | |
302 | return 0; |
303 | } |
304 | |
305 | // Appends a rune for use in a character class to the string t. |
306 | static void AppendCCChar(std::string* t, Rune r) { |
307 | if (0x20 <= r && r <= 0x7E) { |
308 | if (strchr("[]^-\\" , r)) |
309 | t->append("\\" ); |
310 | t->append(1, static_cast<char>(r)); |
311 | return; |
312 | } |
313 | switch (r) { |
314 | default: |
315 | break; |
316 | |
317 | case '\r': |
318 | t->append("\\r" ); |
319 | return; |
320 | |
321 | case '\t': |
322 | t->append("\\t" ); |
323 | return; |
324 | |
325 | case '\n': |
326 | t->append("\\n" ); |
327 | return; |
328 | |
329 | case '\f': |
330 | t->append("\\f" ); |
331 | return; |
332 | } |
333 | |
334 | if (r < 0x100) { |
335 | *t += StringPrintf("\\x%02x" , static_cast<int>(r)); |
336 | return; |
337 | } |
338 | *t += StringPrintf("\\x{%x}" , static_cast<int>(r)); |
339 | } |
340 | |
341 | static void AppendCCRange(std::string* t, Rune lo, Rune hi) { |
342 | if (lo > hi) |
343 | return; |
344 | AppendCCChar(t, lo); |
345 | if (lo < hi) { |
346 | t->append("-" ); |
347 | AppendCCChar(t, hi); |
348 | } |
349 | } |
350 | |
351 | } // namespace re2 |
352 | |