1 | // Copyright 2006 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // Dump the regexp into a string showing structure. |
6 | // Tested by parse_unittest.cc |
7 | |
8 | // This function traverses the regexp recursively, |
9 | // meaning that on inputs like Regexp::Simplify of |
10 | // a{100}{100}{100}{100}{100}{100}{100}{100}{100}{100}, |
11 | // it takes time and space exponential in the size of the |
12 | // original regular expression. It can also use stack space |
13 | // linear in the size of the regular expression for inputs |
14 | // like ((((((((((((((((a*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*. |
15 | // IT IS NOT SAFE TO CALL FROM PRODUCTION CODE. |
16 | // As a result, Dump is provided only in the testing |
17 | // library (see BUILD). |
18 | |
19 | #include <string> |
20 | |
21 | #include "util/test.h" |
22 | #include "util/logging.h" |
23 | #include "util/strutil.h" |
24 | #include "util/utf.h" |
25 | #include "re2/stringpiece.h" |
26 | #include "re2/regexp.h" |
27 | |
28 | namespace re2 { |
29 | |
30 | static const char* kOpcodeNames[] = { |
31 | "bad" , |
32 | "no" , |
33 | "emp" , |
34 | "lit" , |
35 | "str" , |
36 | "cat" , |
37 | "alt" , |
38 | "star" , |
39 | "plus" , |
40 | "que" , |
41 | "rep" , |
42 | "cap" , |
43 | "dot" , |
44 | "byte" , |
45 | "bol" , |
46 | "eol" , |
47 | "wb" , // kRegexpWordBoundary |
48 | "nwb" , // kRegexpNoWordBoundary |
49 | "bot" , |
50 | "eot" , |
51 | "cc" , |
52 | "match" , |
53 | }; |
54 | |
55 | // Create string representation of regexp with explicit structure. |
56 | // Nothing pretty, just for testing. |
57 | static void DumpRegexpAppending(Regexp* re, std::string* s) { |
58 | if (re->op() < 0 || re->op() >= arraysize(kOpcodeNames)) { |
59 | *s += StringPrintf("op%d" , re->op()); |
60 | } else { |
61 | switch (re->op()) { |
62 | default: |
63 | break; |
64 | case kRegexpStar: |
65 | case kRegexpPlus: |
66 | case kRegexpQuest: |
67 | case kRegexpRepeat: |
68 | if (re->parse_flags() & Regexp::NonGreedy) |
69 | s->append("n" ); |
70 | break; |
71 | } |
72 | s->append(kOpcodeNames[re->op()]); |
73 | if (re->op() == kRegexpLiteral && (re->parse_flags() & Regexp::FoldCase)) { |
74 | Rune r = re->rune(); |
75 | if ('a' <= r && r <= 'z') |
76 | s->append("fold" ); |
77 | } |
78 | if (re->op() == kRegexpLiteralString && (re->parse_flags() & Regexp::FoldCase)) { |
79 | for (int i = 0; i < re->nrunes(); i++) { |
80 | Rune r = re->runes()[i]; |
81 | if ('a' <= r && r <= 'z') { |
82 | s->append("fold" ); |
83 | break; |
84 | } |
85 | } |
86 | } |
87 | } |
88 | s->append("{" ); |
89 | switch (re->op()) { |
90 | default: |
91 | break; |
92 | case kRegexpEndText: |
93 | if (!(re->parse_flags() & Regexp::WasDollar)) { |
94 | s->append("\\z" ); |
95 | } |
96 | break; |
97 | case kRegexpLiteral: { |
98 | Rune r = re->rune(); |
99 | char buf[UTFmax+1]; |
100 | buf[runetochar(buf, &r)] = 0; |
101 | s->append(buf); |
102 | break; |
103 | } |
104 | case kRegexpLiteralString: |
105 | for (int i = 0; i < re->nrunes(); i++) { |
106 | Rune r = re->runes()[i]; |
107 | char buf[UTFmax+1]; |
108 | buf[runetochar(buf, &r)] = 0; |
109 | s->append(buf); |
110 | } |
111 | break; |
112 | case kRegexpConcat: |
113 | case kRegexpAlternate: |
114 | for (int i = 0; i < re->nsub(); i++) |
115 | DumpRegexpAppending(re->sub()[i], s); |
116 | break; |
117 | case kRegexpStar: |
118 | case kRegexpPlus: |
119 | case kRegexpQuest: |
120 | DumpRegexpAppending(re->sub()[0], s); |
121 | break; |
122 | case kRegexpCapture: |
123 | if (re->cap() == 0) |
124 | LOG(DFATAL) << "kRegexpCapture cap() == 0" ; |
125 | if (re->name()) { |
126 | s->append(*re->name()); |
127 | s->append(":" ); |
128 | } |
129 | DumpRegexpAppending(re->sub()[0], s); |
130 | break; |
131 | case kRegexpRepeat: |
132 | s->append(StringPrintf("%d,%d " , re->min(), re->max())); |
133 | DumpRegexpAppending(re->sub()[0], s); |
134 | break; |
135 | case kRegexpCharClass: { |
136 | std::string sep; |
137 | for (CharClass::iterator it = re->cc()->begin(); |
138 | it != re->cc()->end(); ++it) { |
139 | RuneRange rr = *it; |
140 | s->append(sep); |
141 | if (rr.lo == rr.hi) |
142 | s->append(StringPrintf("%#x" , rr.lo)); |
143 | else |
144 | s->append(StringPrintf("%#x-%#x" , rr.lo, rr.hi)); |
145 | sep = " " ; |
146 | } |
147 | break; |
148 | } |
149 | } |
150 | s->append("}" ); |
151 | } |
152 | |
153 | std::string Regexp::Dump() { |
154 | // Make sure that we are being called from a unit test. |
155 | // Should cause a link error if used outside of testing. |
156 | CHECK(!::testing::TempDir().empty()); |
157 | |
158 | std::string s; |
159 | DumpRegexpAppending(this, &s); |
160 | return s; |
161 | } |
162 | |
163 | } // namespace re2 |
164 | |