1 | // Copyright 2008 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // Regular expression generator: generates all possible |
6 | // regular expressions within parameters (see regexp_generator.h for details). |
7 | |
8 | // The regexp generator first generates a sequence of commands in a simple |
9 | // postfix language. Each command in the language is a string, |
10 | // like "a" or "%s*" or "%s|%s". |
11 | // |
12 | // To evaluate a command, enough arguments are popped from the value stack to |
13 | // plug into the %s slots. Then the result is pushed onto the stack. |
14 | // For example, the command sequence |
15 | // a b %s%s c |
16 | // results in the stack |
17 | // ab c |
18 | // |
19 | // GeneratePostfix generates all possible command sequences. |
20 | // Then RunPostfix turns each sequence into a regular expression |
21 | // and passes the regexp to HandleRegexp. |
22 | |
23 | #include <stddef.h> |
24 | #include <stdint.h> |
25 | #include <stdio.h> |
26 | #include <string.h> |
27 | #include <memory> |
28 | #include <stack> |
29 | #include <string> |
30 | #include <vector> |
31 | |
32 | #include "util/test.h" |
33 | #include "util/logging.h" |
34 | #include "util/strutil.h" |
35 | #include "util/utf.h" |
36 | #include "re2/testing/regexp_generator.h" |
37 | |
38 | namespace re2 { |
39 | |
40 | // Returns a vector of the egrep regexp operators. |
41 | const std::vector<std::string>& RegexpGenerator::EgrepOps() { |
42 | static const char *ops[] = { |
43 | "%s%s" , |
44 | "%s|%s" , |
45 | "%s*" , |
46 | "%s+" , |
47 | "%s?" , |
48 | "%s\\C*" , |
49 | }; |
50 | static std::vector<std::string> v(ops, ops + arraysize(ops)); |
51 | return v; |
52 | } |
53 | |
54 | RegexpGenerator::RegexpGenerator(int maxatoms, int maxops, |
55 | const std::vector<std::string>& atoms, |
56 | const std::vector<std::string>& ops) |
57 | : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops) { |
58 | // Degenerate case. |
59 | if (atoms_.empty()) |
60 | maxatoms_ = 0; |
61 | if (ops_.empty()) |
62 | maxops_ = 0; |
63 | } |
64 | |
65 | // Generates all possible regular expressions (within the parameters), |
66 | // calling HandleRegexp for each one. |
67 | void RegexpGenerator::Generate() { |
68 | std::vector<std::string> postfix; |
69 | GeneratePostfix(&postfix, 0, 0, 0); |
70 | } |
71 | |
72 | // Generates random regular expressions, calling HandleRegexp for each one. |
73 | void RegexpGenerator::GenerateRandom(int32_t seed, int n) { |
74 | rng_.seed(seed); |
75 | |
76 | for (int i = 0; i < n; i++) { |
77 | std::vector<std::string> postfix; |
78 | GenerateRandomPostfix(&postfix, 0, 0, 0); |
79 | } |
80 | } |
81 | |
82 | // Counts and returns the number of occurrences of "%s" in s. |
83 | static int CountArgs(const std::string& s) { |
84 | const char *p = s.c_str(); |
85 | int n = 0; |
86 | while ((p = strstr(p, "%s" )) != NULL) { |
87 | p += 2; |
88 | n++; |
89 | } |
90 | return n; |
91 | } |
92 | |
93 | // Generates all possible postfix command sequences. |
94 | // Each sequence is handed off to RunPostfix to generate a regular expression. |
95 | // The arguments are: |
96 | // post: the current postfix sequence |
97 | // nstk: the number of elements that would be on the stack after executing |
98 | // the sequence |
99 | // ops: the number of operators used in the sequence |
100 | // atoms: the number of atoms used in the sequence |
101 | // For example, if post were ["a", "b", "%s%s", "c"], |
102 | // then nstk = 2, ops = 1, atoms = 3. |
103 | // |
104 | // The initial call should be GeneratePostfix([empty vector], 0, 0, 0). |
105 | // |
106 | void RegexpGenerator::GeneratePostfix(std::vector<std::string>* post, |
107 | int nstk, int ops, int atoms) { |
108 | if (nstk == 1) |
109 | RunPostfix(*post); |
110 | |
111 | // Early out: if used too many operators or can't |
112 | // get back down to a single expression on the stack |
113 | // using binary operators, give up. |
114 | if (ops + nstk - 1 > maxops_) |
115 | return; |
116 | |
117 | // Add atoms if there is room. |
118 | if (atoms < maxatoms_) { |
119 | for (size_t i = 0; i < atoms_.size(); i++) { |
120 | post->push_back(atoms_[i]); |
121 | GeneratePostfix(post, nstk + 1, ops, atoms + 1); |
122 | post->pop_back(); |
123 | } |
124 | } |
125 | |
126 | // Add operators if there are enough arguments. |
127 | if (ops < maxops_) { |
128 | for (size_t i = 0; i < ops_.size(); i++) { |
129 | const std::string& fmt = ops_[i]; |
130 | int nargs = CountArgs(fmt); |
131 | if (nargs <= nstk) { |
132 | post->push_back(fmt); |
133 | GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms); |
134 | post->pop_back(); |
135 | } |
136 | } |
137 | } |
138 | } |
139 | |
140 | // Generates a random postfix command sequence. |
141 | // Stops and returns true once a single sequence has been generated. |
142 | bool RegexpGenerator::GenerateRandomPostfix(std::vector<std::string>* post, |
143 | int nstk, int ops, int atoms) { |
144 | std::uniform_int_distribution<int> random_stop(0, maxatoms_ - atoms); |
145 | std::uniform_int_distribution<int> random_bit(0, 1); |
146 | std::uniform_int_distribution<int> random_ops_index( |
147 | 0, static_cast<int>(ops_.size()) - 1); |
148 | std::uniform_int_distribution<int> random_atoms_index( |
149 | 0, static_cast<int>(atoms_.size()) - 1); |
150 | |
151 | for (;;) { |
152 | // Stop if we get to a single element, but only sometimes. |
153 | if (nstk == 1 && random_stop(rng_) == 0) { |
154 | RunPostfix(*post); |
155 | return true; |
156 | } |
157 | |
158 | // Early out: if used too many operators or can't |
159 | // get back down to a single expression on the stack |
160 | // using binary operators, give up. |
161 | if (ops + nstk - 1 > maxops_) |
162 | return false; |
163 | |
164 | // Add operators if there are enough arguments. |
165 | if (ops < maxops_ && random_bit(rng_) == 0) { |
166 | const std::string& fmt = ops_[random_ops_index(rng_)]; |
167 | int nargs = CountArgs(fmt); |
168 | if (nargs <= nstk) { |
169 | post->push_back(fmt); |
170 | bool ret = GenerateRandomPostfix(post, nstk - nargs + 1, |
171 | ops + 1, atoms); |
172 | post->pop_back(); |
173 | if (ret) |
174 | return true; |
175 | } |
176 | } |
177 | |
178 | // Add atoms if there is room. |
179 | if (atoms < maxatoms_ && random_bit(rng_) == 0) { |
180 | post->push_back(atoms_[random_atoms_index(rng_)]); |
181 | bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1); |
182 | post->pop_back(); |
183 | if (ret) |
184 | return true; |
185 | } |
186 | } |
187 | } |
188 | |
189 | // Interprets the postfix command sequence to create a regular expression |
190 | // passed to HandleRegexp. The results of operators like %s|%s are wrapped |
191 | // in (?: ) to avoid needing to maintain a precedence table. |
192 | void RegexpGenerator::RunPostfix(const std::vector<std::string>& post) { |
193 | std::stack<std::string> regexps; |
194 | for (size_t i = 0; i < post.size(); i++) { |
195 | switch (CountArgs(post[i])) { |
196 | default: |
197 | LOG(FATAL) << "Bad operator: " << post[i]; |
198 | case 0: |
199 | regexps.push(post[i]); |
200 | break; |
201 | case 1: { |
202 | std::string a = regexps.top(); |
203 | regexps.pop(); |
204 | regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")" ); |
205 | break; |
206 | } |
207 | case 2: { |
208 | std::string b = regexps.top(); |
209 | regexps.pop(); |
210 | std::string a = regexps.top(); |
211 | regexps.pop(); |
212 | regexps.push("(?:" + |
213 | StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) + |
214 | ")" ); |
215 | break; |
216 | } |
217 | } |
218 | } |
219 | |
220 | if (regexps.size() != 1) { |
221 | // Internal error - should never happen. |
222 | printf("Bad regexp program:\n" ); |
223 | for (size_t i = 0; i < post.size(); i++) { |
224 | printf(" %s\n" , CEscape(post[i]).c_str()); |
225 | } |
226 | printf("Stack after running program:\n" ); |
227 | while (!regexps.empty()) { |
228 | printf(" %s\n" , CEscape(regexps.top()).c_str()); |
229 | regexps.pop(); |
230 | } |
231 | LOG(FATAL) << "Bad regexp program." ; |
232 | } |
233 | |
234 | HandleRegexp(regexps.top()); |
235 | HandleRegexp("^(?:" + regexps.top() + ")$" ); |
236 | HandleRegexp("^(?:" + regexps.top() + ")" ); |
237 | HandleRegexp("(?:" + regexps.top() + ")$" ); |
238 | } |
239 | |
240 | // Split s into an vector of strings, one for each UTF-8 character. |
241 | std::vector<std::string> Explode(const StringPiece& s) { |
242 | std::vector<std::string> v; |
243 | |
244 | for (const char *q = s.data(); q < s.data() + s.size(); ) { |
245 | const char* p = q; |
246 | Rune r; |
247 | q += chartorune(&r, q); |
248 | v.push_back(std::string(p, q - p)); |
249 | } |
250 | |
251 | return v; |
252 | } |
253 | |
254 | // Split string everywhere a substring is found, returning |
255 | // vector of pieces. |
256 | std::vector<std::string> Split(const StringPiece& sep, const StringPiece& s) { |
257 | std::vector<std::string> v; |
258 | |
259 | if (sep.empty()) |
260 | return Explode(s); |
261 | |
262 | const char *p = s.data(); |
263 | for (const char *q = s.data(); q + sep.size() <= s.data() + s.size(); q++) { |
264 | if (StringPiece(q, sep.size()) == sep) { |
265 | v.push_back(std::string(p, q - p)); |
266 | p = q + sep.size(); |
267 | q = p - 1; // -1 for ++ in loop |
268 | continue; |
269 | } |
270 | } |
271 | if (p < s.data() + s.size()) |
272 | v.push_back(std::string(p, s.data() + s.size() - p)); |
273 | return v; |
274 | } |
275 | |
276 | } // namespace re2 |
277 | |