1// Copyright 2008 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Regular expression generator: generates all possible
6// regular expressions within parameters (see regexp_generator.h for details).
7
8// The regexp generator first generates a sequence of commands in a simple
9// postfix language. Each command in the language is a string,
10// like "a" or "%s*" or "%s|%s".
11//
12// To evaluate a command, enough arguments are popped from the value stack to
13// plug into the %s slots. Then the result is pushed onto the stack.
14// For example, the command sequence
15// a b %s%s c
16// results in the stack
17// ab c
18//
19// GeneratePostfix generates all possible command sequences.
20// Then RunPostfix turns each sequence into a regular expression
21// and passes the regexp to HandleRegexp.
22
23#include <stddef.h>
24#include <stdint.h>
25#include <stdio.h>
26#include <string.h>
27#include <memory>
28#include <stack>
29#include <string>
30#include <vector>
31
32#include "util/test.h"
33#include "util/logging.h"
34#include "util/strutil.h"
35#include "util/utf.h"
36#include "re2/testing/regexp_generator.h"
37
38namespace re2 {
39
40// Returns a vector of the egrep regexp operators.
41const std::vector<std::string>& RegexpGenerator::EgrepOps() {
42 static const char *ops[] = {
43 "%s%s",
44 "%s|%s",
45 "%s*",
46 "%s+",
47 "%s?",
48 "%s\\C*",
49 };
50 static std::vector<std::string> v(ops, ops + arraysize(ops));
51 return v;
52}
53
54RegexpGenerator::RegexpGenerator(int maxatoms, int maxops,
55 const std::vector<std::string>& atoms,
56 const std::vector<std::string>& ops)
57 : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops) {
58 // Degenerate case.
59 if (atoms_.empty())
60 maxatoms_ = 0;
61 if (ops_.empty())
62 maxops_ = 0;
63}
64
65// Generates all possible regular expressions (within the parameters),
66// calling HandleRegexp for each one.
67void RegexpGenerator::Generate() {
68 std::vector<std::string> postfix;
69 GeneratePostfix(&postfix, 0, 0, 0);
70}
71
72// Generates random regular expressions, calling HandleRegexp for each one.
73void RegexpGenerator::GenerateRandom(int32_t seed, int n) {
74 rng_.seed(seed);
75
76 for (int i = 0; i < n; i++) {
77 std::vector<std::string> postfix;
78 GenerateRandomPostfix(&postfix, 0, 0, 0);
79 }
80}
81
82// Counts and returns the number of occurrences of "%s" in s.
83static int CountArgs(const std::string& s) {
84 const char *p = s.c_str();
85 int n = 0;
86 while ((p = strstr(p, "%s")) != NULL) {
87 p += 2;
88 n++;
89 }
90 return n;
91}
92
93// Generates all possible postfix command sequences.
94// Each sequence is handed off to RunPostfix to generate a regular expression.
95// The arguments are:
96// post: the current postfix sequence
97// nstk: the number of elements that would be on the stack after executing
98// the sequence
99// ops: the number of operators used in the sequence
100// atoms: the number of atoms used in the sequence
101// For example, if post were ["a", "b", "%s%s", "c"],
102// then nstk = 2, ops = 1, atoms = 3.
103//
104// The initial call should be GeneratePostfix([empty vector], 0, 0, 0).
105//
106void RegexpGenerator::GeneratePostfix(std::vector<std::string>* post,
107 int nstk, int ops, int atoms) {
108 if (nstk == 1)
109 RunPostfix(*post);
110
111 // Early out: if used too many operators or can't
112 // get back down to a single expression on the stack
113 // using binary operators, give up.
114 if (ops + nstk - 1 > maxops_)
115 return;
116
117 // Add atoms if there is room.
118 if (atoms < maxatoms_) {
119 for (size_t i = 0; i < atoms_.size(); i++) {
120 post->push_back(atoms_[i]);
121 GeneratePostfix(post, nstk + 1, ops, atoms + 1);
122 post->pop_back();
123 }
124 }
125
126 // Add operators if there are enough arguments.
127 if (ops < maxops_) {
128 for (size_t i = 0; i < ops_.size(); i++) {
129 const std::string& fmt = ops_[i];
130 int nargs = CountArgs(fmt);
131 if (nargs <= nstk) {
132 post->push_back(fmt);
133 GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms);
134 post->pop_back();
135 }
136 }
137 }
138}
139
140// Generates a random postfix command sequence.
141// Stops and returns true once a single sequence has been generated.
142bool RegexpGenerator::GenerateRandomPostfix(std::vector<std::string>* post,
143 int nstk, int ops, int atoms) {
144 std::uniform_int_distribution<int> random_stop(0, maxatoms_ - atoms);
145 std::uniform_int_distribution<int> random_bit(0, 1);
146 std::uniform_int_distribution<int> random_ops_index(
147 0, static_cast<int>(ops_.size()) - 1);
148 std::uniform_int_distribution<int> random_atoms_index(
149 0, static_cast<int>(atoms_.size()) - 1);
150
151 for (;;) {
152 // Stop if we get to a single element, but only sometimes.
153 if (nstk == 1 && random_stop(rng_) == 0) {
154 RunPostfix(*post);
155 return true;
156 }
157
158 // Early out: if used too many operators or can't
159 // get back down to a single expression on the stack
160 // using binary operators, give up.
161 if (ops + nstk - 1 > maxops_)
162 return false;
163
164 // Add operators if there are enough arguments.
165 if (ops < maxops_ && random_bit(rng_) == 0) {
166 const std::string& fmt = ops_[random_ops_index(rng_)];
167 int nargs = CountArgs(fmt);
168 if (nargs <= nstk) {
169 post->push_back(fmt);
170 bool ret = GenerateRandomPostfix(post, nstk - nargs + 1,
171 ops + 1, atoms);
172 post->pop_back();
173 if (ret)
174 return true;
175 }
176 }
177
178 // Add atoms if there is room.
179 if (atoms < maxatoms_ && random_bit(rng_) == 0) {
180 post->push_back(atoms_[random_atoms_index(rng_)]);
181 bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1);
182 post->pop_back();
183 if (ret)
184 return true;
185 }
186 }
187}
188
189// Interprets the postfix command sequence to create a regular expression
190// passed to HandleRegexp. The results of operators like %s|%s are wrapped
191// in (?: ) to avoid needing to maintain a precedence table.
192void RegexpGenerator::RunPostfix(const std::vector<std::string>& post) {
193 std::stack<std::string> regexps;
194 for (size_t i = 0; i < post.size(); i++) {
195 switch (CountArgs(post[i])) {
196 default:
197 LOG(FATAL) << "Bad operator: " << post[i];
198 case 0:
199 regexps.push(post[i]);
200 break;
201 case 1: {
202 std::string a = regexps.top();
203 regexps.pop();
204 regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")");
205 break;
206 }
207 case 2: {
208 std::string b = regexps.top();
209 regexps.pop();
210 std::string a = regexps.top();
211 regexps.pop();
212 regexps.push("(?:" +
213 StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) +
214 ")");
215 break;
216 }
217 }
218 }
219
220 if (regexps.size() != 1) {
221 // Internal error - should never happen.
222 printf("Bad regexp program:\n");
223 for (size_t i = 0; i < post.size(); i++) {
224 printf(" %s\n", CEscape(post[i]).c_str());
225 }
226 printf("Stack after running program:\n");
227 while (!regexps.empty()) {
228 printf(" %s\n", CEscape(regexps.top()).c_str());
229 regexps.pop();
230 }
231 LOG(FATAL) << "Bad regexp program.";
232 }
233
234 HandleRegexp(regexps.top());
235 HandleRegexp("^(?:" + regexps.top() + ")$");
236 HandleRegexp("^(?:" + regexps.top() + ")");
237 HandleRegexp("(?:" + regexps.top() + ")$");
238}
239
240// Split s into an vector of strings, one for each UTF-8 character.
241std::vector<std::string> Explode(const StringPiece& s) {
242 std::vector<std::string> v;
243
244 for (const char *q = s.data(); q < s.data() + s.size(); ) {
245 const char* p = q;
246 Rune r;
247 q += chartorune(&r, q);
248 v.push_back(std::string(p, q - p));
249 }
250
251 return v;
252}
253
254// Split string everywhere a substring is found, returning
255// vector of pieces.
256std::vector<std::string> Split(const StringPiece& sep, const StringPiece& s) {
257 std::vector<std::string> v;
258
259 if (sep.empty())
260 return Explode(s);
261
262 const char *p = s.data();
263 for (const char *q = s.data(); q + sep.size() <= s.data() + s.size(); q++) {
264 if (StringPiece(q, sep.size()) == sep) {
265 v.push_back(std::string(p, q - p));
266 p = q + sep.size();
267 q = p - 1; // -1 for ++ in loop
268 continue;
269 }
270 }
271 if (p < s.data() + s.size())
272 v.push_back(std::string(p, s.data() + s.size() - p));
273 return v;
274}
275
276} // namespace re2
277