1// Copyright 2007 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Test prog.cc, compile.cc
6
7#include <string>
8
9#include "util/test.h"
10#include "util/logging.h"
11#include "re2/regexp.h"
12#include "re2/prog.h"
13
14namespace re2 {
15
16// Simple input/output tests checking that
17// the regexp compiles to the expected code.
18// These are just to sanity check the basic implementation.
19// The real confidence tests happen by testing the NFA/DFA
20// that run the compiled code.
21
22struct Test {
23 const char* regexp;
24 const char* code;
25};
26
27static Test tests[] = {
28 { "a",
29 "3. byte [61-61] 0 -> 4\n"
30 "4. match! 0\n" },
31 { "ab",
32 "3. byte [61-61] 0 -> 4\n"
33 "4. byte [62-62] 0 -> 5\n"
34 "5. match! 0\n" },
35 { "a|c",
36 "3+ byte [61-61] 0 -> 5\n"
37 "4. byte [63-63] 0 -> 5\n"
38 "5. match! 0\n" },
39 { "a|b",
40 "3. byte [61-62] 0 -> 4\n"
41 "4. match! 0\n" },
42 { "[ab]",
43 "3. byte [61-62] 0 -> 4\n"
44 "4. match! 0\n" },
45 { "a+",
46 "3. byte [61-61] 0 -> 4\n"
47 "4+ nop -> 3\n"
48 "5. match! 0\n" },
49 { "a+?",
50 "3. byte [61-61] 0 -> 4\n"
51 "4+ match! 0\n"
52 "5. nop -> 3\n" },
53 { "a*",
54 "3+ byte [61-61] 1 -> 3\n"
55 "4. match! 0\n" },
56 { "a*?",
57 "3+ match! 0\n"
58 "4. byte [61-61] 0 -> 3\n" },
59 { "a?",
60 "3+ byte [61-61] 1 -> 5\n"
61 "4. nop -> 5\n"
62 "5. match! 0\n" },
63 { "a??",
64 "3+ nop -> 5\n"
65 "4. byte [61-61] 0 -> 5\n"
66 "5. match! 0\n" },
67 { "a{4}",
68 "3. byte [61-61] 0 -> 4\n"
69 "4. byte [61-61] 0 -> 5\n"
70 "5. byte [61-61] 0 -> 6\n"
71 "6. byte [61-61] 0 -> 7\n"
72 "7. match! 0\n" },
73 { "(a)",
74 "3. capture 2 -> 4\n"
75 "4. byte [61-61] 0 -> 5\n"
76 "5. capture 3 -> 6\n"
77 "6. match! 0\n" },
78 { "(?:a)",
79 "3. byte [61-61] 0 -> 4\n"
80 "4. match! 0\n" },
81 { "",
82 "3. match! 0\n" },
83 { ".",
84 "3+ byte [00-09] 0 -> 5\n"
85 "4. byte [0b-ff] 0 -> 5\n"
86 "5. match! 0\n" },
87 { "[^ab]",
88 "3+ byte [00-09] 0 -> 6\n"
89 "4+ byte [0b-60] 0 -> 6\n"
90 "5. byte [63-ff] 0 -> 6\n"
91 "6. match! 0\n" },
92 { "[Aa]",
93 "3. byte/i [61-61] 0 -> 4\n"
94 "4. match! 0\n" },
95 { "\\C+",
96 "3. byte [00-ff] 0 -> 4\n"
97 "4+ altmatch -> 5 | 6\n"
98 "5+ nop -> 3\n"
99 "6. match! 0\n" },
100 { "\\C*",
101 "3+ altmatch -> 4 | 5\n"
102 "4+ byte [00-ff] 1 -> 3\n"
103 "5. match! 0\n" },
104 { "\\C?",
105 "3+ byte [00-ff] 1 -> 5\n"
106 "4. nop -> 5\n"
107 "5. match! 0\n" },
108 // Issue 20992936
109 { "[[-`]",
110 "3. byte [5b-60] 0 -> 4\n"
111 "4. match! 0\n" },
112};
113
114TEST(TestRegexpCompileToProg, Simple) {
115 int failed = 0;
116 for (size_t i = 0; i < arraysize(tests); i++) {
117 const re2::Test& t = tests[i];
118 Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
119 if (re == NULL) {
120 LOG(ERROR) << "Cannot parse: " << t.regexp;
121 failed++;
122 continue;
123 }
124 Prog* prog = re->CompileToProg(0);
125 if (prog == NULL) {
126 LOG(ERROR) << "Cannot compile: " << t.regexp;
127 re->Decref();
128 failed++;
129 continue;
130 }
131 ASSERT_TRUE(re->CompileToProg(1) == NULL);
132 std::string s = prog->Dump();
133 if (s != t.code) {
134 LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
135 LOG(ERROR) << "Want:\n" << t.code;
136 LOG(ERROR) << "Got:\n" << s;
137 failed++;
138 }
139 delete prog;
140 re->Decref();
141 }
142 EXPECT_EQ(failed, 0);
143}
144
145static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags,
146 std::string* bytemap) {
147 Regexp* re = Regexp::Parse(pattern, flags, NULL);
148 EXPECT_TRUE(re != NULL);
149
150 Prog* prog = re->CompileToProg(0);
151 EXPECT_TRUE(prog != NULL);
152 *bytemap = prog->DumpByteMap();
153 delete prog;
154
155 re->Decref();
156}
157
158TEST(TestCompile, Latin1Ranges) {
159 // The distinct byte ranges involved in the Latin-1 dot ([^\n]).
160
161 std::string bytemap;
162
163 DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap);
164 EXPECT_EQ("[00-09] -> 0\n"
165 "[0a-0a] -> 1\n"
166 "[0b-ff] -> 0\n",
167 bytemap);
168}
169
170TEST(TestCompile, OtherByteMapTests) {
171 std::string bytemap;
172
173 // Test that "absent" ranges are mapped to the same byte class.
174 DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap);
175 EXPECT_EQ("[00-2f] -> 0\n"
176 "[30-39] -> 1\n"
177 "[3a-40] -> 0\n"
178 "[41-46] -> 1\n"
179 "[47-60] -> 0\n"
180 "[61-66] -> 1\n"
181 "[67-ff] -> 0\n",
182 bytemap);
183
184 // Test the byte classes for \b.
185 DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap);
186 EXPECT_EQ("[00-2f] -> 0\n"
187 "[30-39] -> 1\n"
188 "[3a-40] -> 0\n"
189 "[41-5a] -> 1\n"
190 "[5b-5e] -> 0\n"
191 "[5f-5f] -> 1\n"
192 "[60-60] -> 0\n"
193 "[61-7a] -> 1\n"
194 "[7b-ff] -> 0\n",
195 bytemap);
196
197 // Bug in the ASCII case-folding optimization created too many byte classes.
198 DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap);
199 EXPECT_EQ("[00-5e] -> 0\n"
200 "[5f-5f] -> 1\n"
201 "[60-ff] -> 0\n",
202 bytemap);
203}
204
205TEST(TestCompile, UTF8Ranges) {
206 // The distinct byte ranges involved in the UTF-8 dot ([^\n]).
207 // Once, erroneously split between 0x3f and 0x40 because it is
208 // a 6-bit boundary.
209
210 std::string bytemap;
211
212 DumpByteMap(".", Regexp::PerlX, &bytemap);
213 EXPECT_EQ("[00-09] -> 0\n"
214 "[0a-0a] -> 1\n"
215 "[0b-7f] -> 0\n"
216 "[80-8f] -> 2\n"
217 "[90-9f] -> 3\n"
218 "[a0-bf] -> 4\n"
219 "[c0-c1] -> 1\n"
220 "[c2-df] -> 5\n"
221 "[e0-e0] -> 6\n"
222 "[e1-ef] -> 7\n"
223 "[f0-f0] -> 8\n"
224 "[f1-f3] -> 9\n"
225 "[f4-f4] -> 10\n"
226 "[f5-ff] -> 1\n",
227 bytemap);
228}
229
230TEST(TestCompile, InsufficientMemory) {
231 Regexp* re = Regexp::Parse(
232 "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$",
233 Regexp::LikePerl, NULL);
234 EXPECT_TRUE(re != NULL);
235 Prog* prog = re->CompileToProg(920);
236 // If the memory budget has been exhausted, compilation should fail
237 // and return NULL instead of trying to do anything with NoMatch().
238 EXPECT_TRUE(prog == NULL);
239 re->Decref();
240}
241
242static void Dump(StringPiece pattern, Regexp::ParseFlags flags,
243 std::string* forward, std::string* reverse) {
244 Regexp* re = Regexp::Parse(pattern, flags, NULL);
245 EXPECT_TRUE(re != NULL);
246
247 if (forward != NULL) {
248 Prog* prog = re->CompileToProg(0);
249 EXPECT_TRUE(prog != NULL);
250 *forward = prog->Dump();
251 delete prog;
252 }
253
254 if (reverse != NULL) {
255 Prog* prog = re->CompileToReverseProg(0);
256 EXPECT_TRUE(prog != NULL);
257 *reverse = prog->Dump();
258 delete prog;
259 }
260
261 re->Decref();
262}
263
264TEST(TestCompile, Bug26705922) {
265 // Bug in the compiler caused inefficient bytecode to be generated for Unicode
266 // groups: common suffixes were cached, but common prefixes were not factored.
267
268 std::string forward, reverse;
269
270 Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse);
271 EXPECT_EQ("3. byte [f0-f0] 0 -> 4\n"
272 "4. byte [90-90] 0 -> 5\n"
273 "5. byte [80-80] 0 -> 6\n"
274 "6+ byte [80-80] 0 -> 8\n"
275 "7. byte [90-90] 0 -> 8\n"
276 "8. match! 0\n",
277 forward);
278 EXPECT_EQ("3+ byte [80-80] 0 -> 5\n"
279 "4. byte [90-90] 0 -> 5\n"
280 "5. byte [80-80] 0 -> 6\n"
281 "6. byte [90-90] 0 -> 7\n"
282 "7. byte [f0-f0] 0 -> 8\n"
283 "8. match! 0\n",
284 reverse);
285
286 Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse);
287 EXPECT_EQ("3+ byte [e8-ef] 0 -> 5\n"
288 "4. byte [f0-f0] 0 -> 8\n"
289 "5. byte [80-bf] 0 -> 6\n"
290 "6. byte [80-bf] 0 -> 7\n"
291 "7. match! 0\n"
292 "8. byte [90-90] 0 -> 5\n",
293 forward);
294 EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
295 "4. byte [80-bf] 0 -> 5\n"
296 "5+ byte [e8-ef] 0 -> 7\n"
297 "6. byte [90-90] 0 -> 8\n"
298 "7. match! 0\n"
299 "8. byte [f0-f0] 0 -> 7\n",
300 reverse);
301
302 Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, NULL, &reverse);
303 EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
304 "4+ byte [c2-df] 0 -> 7\n"
305 "5+ byte [a0-bf] 1 -> 8\n"
306 "6. byte [80-bf] 0 -> 9\n"
307 "7. match! 0\n"
308 "8. byte [e0-e0] 0 -> 7\n"
309 "9+ byte [e1-ef] 0 -> 7\n"
310 "10+ byte [90-bf] 1 -> 13\n"
311 "11+ byte [80-bf] 1 -> 14\n"
312 "12. byte [80-8f] 0 -> 15\n"
313 "13. byte [f0-f0] 0 -> 7\n"
314 "14. byte [f1-f3] 0 -> 7\n"
315 "15. byte [f4-f4] 0 -> 7\n",
316 reverse);
317}
318
319TEST(TestCompile, Bug35237384) {
320 // Bug in the compiler caused inefficient bytecode to be generated for
321 // nested nullable subexpressions.
322
323 std::string forward;
324
325 Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
326 EXPECT_EQ("3+ byte [61-61] 1 -> 3\n"
327 "4. nop -> 5\n"
328 "5+ byte [61-61] 1 -> 5\n"
329 "6. nop -> 7\n"
330 "7+ byte [61-61] 1 -> 7\n"
331 "8. match! 0\n",
332 forward);
333
334 Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
335 EXPECT_EQ("3+ nop -> 6\n"
336 "4+ nop -> 8\n"
337 "5. nop -> 21\n"
338 "6+ byte [61-61] 1 -> 6\n"
339 "7. nop -> 3\n"
340 "8+ byte [62-62] 1 -> 8\n"
341 "9. nop -> 3\n"
342 "10+ byte [61-61] 1 -> 10\n"
343 "11. nop -> 21\n"
344 "12+ byte [62-62] 1 -> 12\n"
345 "13. nop -> 21\n"
346 "14+ byte [61-61] 1 -> 14\n"
347 "15. nop -> 18\n"
348 "16+ byte [62-62] 1 -> 16\n"
349 "17. nop -> 18\n"
350 "18+ nop -> 14\n"
351 "19+ nop -> 16\n"
352 "20. match! 0\n"
353 "21+ nop -> 10\n"
354 "22+ nop -> 12\n"
355 "23. nop -> 18\n",
356 forward);
357
358 Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
359 EXPECT_EQ("3+ nop -> 36\n"
360 "4+ nop -> 31\n"
361 "5. nop -> 33\n"
362 "6+ byte [00-09] 0 -> 8\n"
363 "7. byte [0b-ff] 0 -> 8\n"
364 "8+ nop -> 6\n"
365 "9+ nop -> 29\n"
366 "10. nop -> 28\n"
367 "11+ byte [00-09] 0 -> 13\n"
368 "12. byte [0b-ff] 0 -> 13\n"
369 "13+ nop -> 11\n"
370 "14+ nop -> 26\n"
371 "15. nop -> 28\n"
372 "16+ byte [00-09] 0 -> 18\n"
373 "17. byte [0b-ff] 0 -> 18\n"
374 "18+ nop -> 16\n"
375 "19+ nop -> 36\n"
376 "20. nop -> 33\n"
377 "21+ byte [00-09] 0 -> 23\n"
378 "22. byte [0b-ff] 0 -> 23\n"
379 "23+ nop -> 21\n"
380 "24+ nop -> 31\n"
381 "25. nop -> 33\n"
382 "26+ nop -> 28\n"
383 "27. byte [53-53] 0 -> 11\n"
384 "28. match! 0\n"
385 "29+ nop -> 28\n"
386 "30. byte [53-53] 0 -> 6\n"
387 "31+ nop -> 33\n"
388 "32. byte [53-53] 0 -> 21\n"
389 "33+ nop -> 29\n"
390 "34+ nop -> 26\n"
391 "35. nop -> 28\n"
392 "36+ nop -> 33\n"
393 "37. byte [53-53] 0 -> 16\n",
394 forward);
395}
396
397} // namespace re2
398