1 | // Copyright 2007 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // Test prog.cc, compile.cc |
6 | |
7 | #include <string> |
8 | |
9 | #include "util/test.h" |
10 | #include "util/logging.h" |
11 | #include "re2/regexp.h" |
12 | #include "re2/prog.h" |
13 | |
14 | namespace re2 { |
15 | |
16 | // Simple input/output tests checking that |
17 | // the regexp compiles to the expected code. |
18 | // These are just to sanity check the basic implementation. |
19 | // The real confidence tests happen by testing the NFA/DFA |
20 | // that run the compiled code. |
21 | |
22 | struct Test { |
23 | const char* regexp; |
24 | const char* code; |
25 | }; |
26 | |
27 | static Test tests[] = { |
28 | { "a" , |
29 | "3. byte [61-61] 0 -> 4\n" |
30 | "4. match! 0\n" }, |
31 | { "ab" , |
32 | "3. byte [61-61] 0 -> 4\n" |
33 | "4. byte [62-62] 0 -> 5\n" |
34 | "5. match! 0\n" }, |
35 | { "a|c" , |
36 | "3+ byte [61-61] 0 -> 5\n" |
37 | "4. byte [63-63] 0 -> 5\n" |
38 | "5. match! 0\n" }, |
39 | { "a|b" , |
40 | "3. byte [61-62] 0 -> 4\n" |
41 | "4. match! 0\n" }, |
42 | { "[ab]" , |
43 | "3. byte [61-62] 0 -> 4\n" |
44 | "4. match! 0\n" }, |
45 | { "a+" , |
46 | "3. byte [61-61] 0 -> 4\n" |
47 | "4+ nop -> 3\n" |
48 | "5. match! 0\n" }, |
49 | { "a+?" , |
50 | "3. byte [61-61] 0 -> 4\n" |
51 | "4+ match! 0\n" |
52 | "5. nop -> 3\n" }, |
53 | { "a*" , |
54 | "3+ byte [61-61] 1 -> 3\n" |
55 | "4. match! 0\n" }, |
56 | { "a*?" , |
57 | "3+ match! 0\n" |
58 | "4. byte [61-61] 0 -> 3\n" }, |
59 | { "a?" , |
60 | "3+ byte [61-61] 1 -> 5\n" |
61 | "4. nop -> 5\n" |
62 | "5. match! 0\n" }, |
63 | { "a??" , |
64 | "3+ nop -> 5\n" |
65 | "4. byte [61-61] 0 -> 5\n" |
66 | "5. match! 0\n" }, |
67 | { "a{4}" , |
68 | "3. byte [61-61] 0 -> 4\n" |
69 | "4. byte [61-61] 0 -> 5\n" |
70 | "5. byte [61-61] 0 -> 6\n" |
71 | "6. byte [61-61] 0 -> 7\n" |
72 | "7. match! 0\n" }, |
73 | { "(a)" , |
74 | "3. capture 2 -> 4\n" |
75 | "4. byte [61-61] 0 -> 5\n" |
76 | "5. capture 3 -> 6\n" |
77 | "6. match! 0\n" }, |
78 | { "(?:a)" , |
79 | "3. byte [61-61] 0 -> 4\n" |
80 | "4. match! 0\n" }, |
81 | { "" , |
82 | "3. match! 0\n" }, |
83 | { "." , |
84 | "3+ byte [00-09] 0 -> 5\n" |
85 | "4. byte [0b-ff] 0 -> 5\n" |
86 | "5. match! 0\n" }, |
87 | { "[^ab]" , |
88 | "3+ byte [00-09] 0 -> 6\n" |
89 | "4+ byte [0b-60] 0 -> 6\n" |
90 | "5. byte [63-ff] 0 -> 6\n" |
91 | "6. match! 0\n" }, |
92 | { "[Aa]" , |
93 | "3. byte/i [61-61] 0 -> 4\n" |
94 | "4. match! 0\n" }, |
95 | { "\\C+" , |
96 | "3. byte [00-ff] 0 -> 4\n" |
97 | "4+ altmatch -> 5 | 6\n" |
98 | "5+ nop -> 3\n" |
99 | "6. match! 0\n" }, |
100 | { "\\C*" , |
101 | "3+ altmatch -> 4 | 5\n" |
102 | "4+ byte [00-ff] 1 -> 3\n" |
103 | "5. match! 0\n" }, |
104 | { "\\C?" , |
105 | "3+ byte [00-ff] 1 -> 5\n" |
106 | "4. nop -> 5\n" |
107 | "5. match! 0\n" }, |
108 | // Issue 20992936 |
109 | { "[[-`]" , |
110 | "3. byte [5b-60] 0 -> 4\n" |
111 | "4. match! 0\n" }, |
112 | }; |
113 | |
114 | TEST(TestRegexpCompileToProg, Simple) { |
115 | int failed = 0; |
116 | for (size_t i = 0; i < arraysize(tests); i++) { |
117 | const re2::Test& t = tests[i]; |
118 | Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL); |
119 | if (re == NULL) { |
120 | LOG(ERROR) << "Cannot parse: " << t.regexp; |
121 | failed++; |
122 | continue; |
123 | } |
124 | Prog* prog = re->CompileToProg(0); |
125 | if (prog == NULL) { |
126 | LOG(ERROR) << "Cannot compile: " << t.regexp; |
127 | re->Decref(); |
128 | failed++; |
129 | continue; |
130 | } |
131 | ASSERT_TRUE(re->CompileToProg(1) == NULL); |
132 | std::string s = prog->Dump(); |
133 | if (s != t.code) { |
134 | LOG(ERROR) << "Incorrect compiled code for: " << t.regexp; |
135 | LOG(ERROR) << "Want:\n" << t.code; |
136 | LOG(ERROR) << "Got:\n" << s; |
137 | failed++; |
138 | } |
139 | delete prog; |
140 | re->Decref(); |
141 | } |
142 | EXPECT_EQ(failed, 0); |
143 | } |
144 | |
145 | static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags, |
146 | std::string* bytemap) { |
147 | Regexp* re = Regexp::Parse(pattern, flags, NULL); |
148 | EXPECT_TRUE(re != NULL); |
149 | |
150 | Prog* prog = re->CompileToProg(0); |
151 | EXPECT_TRUE(prog != NULL); |
152 | *bytemap = prog->DumpByteMap(); |
153 | delete prog; |
154 | |
155 | re->Decref(); |
156 | } |
157 | |
158 | TEST(TestCompile, Latin1Ranges) { |
159 | // The distinct byte ranges involved in the Latin-1 dot ([^\n]). |
160 | |
161 | std::string bytemap; |
162 | |
163 | DumpByteMap("." , Regexp::PerlX|Regexp::Latin1, &bytemap); |
164 | EXPECT_EQ("[00-09] -> 0\n" |
165 | "[0a-0a] -> 1\n" |
166 | "[0b-ff] -> 0\n" , |
167 | bytemap); |
168 | } |
169 | |
170 | TEST(TestCompile, OtherByteMapTests) { |
171 | std::string bytemap; |
172 | |
173 | // Test that "absent" ranges are mapped to the same byte class. |
174 | DumpByteMap("[0-9A-Fa-f]+" , Regexp::PerlX|Regexp::Latin1, &bytemap); |
175 | EXPECT_EQ("[00-2f] -> 0\n" |
176 | "[30-39] -> 1\n" |
177 | "[3a-40] -> 0\n" |
178 | "[41-46] -> 1\n" |
179 | "[47-60] -> 0\n" |
180 | "[61-66] -> 1\n" |
181 | "[67-ff] -> 0\n" , |
182 | bytemap); |
183 | |
184 | // Test the byte classes for \b. |
185 | DumpByteMap("\\b" , Regexp::LikePerl|Regexp::Latin1, &bytemap); |
186 | EXPECT_EQ("[00-2f] -> 0\n" |
187 | "[30-39] -> 1\n" |
188 | "[3a-40] -> 0\n" |
189 | "[41-5a] -> 1\n" |
190 | "[5b-5e] -> 0\n" |
191 | "[5f-5f] -> 1\n" |
192 | "[60-60] -> 0\n" |
193 | "[61-7a] -> 1\n" |
194 | "[7b-ff] -> 0\n" , |
195 | bytemap); |
196 | |
197 | // Bug in the ASCII case-folding optimization created too many byte classes. |
198 | DumpByteMap("[^_]" , Regexp::LikePerl|Regexp::Latin1, &bytemap); |
199 | EXPECT_EQ("[00-5e] -> 0\n" |
200 | "[5f-5f] -> 1\n" |
201 | "[60-ff] -> 0\n" , |
202 | bytemap); |
203 | } |
204 | |
205 | TEST(TestCompile, UTF8Ranges) { |
206 | // The distinct byte ranges involved in the UTF-8 dot ([^\n]). |
207 | // Once, erroneously split between 0x3f and 0x40 because it is |
208 | // a 6-bit boundary. |
209 | |
210 | std::string bytemap; |
211 | |
212 | DumpByteMap("." , Regexp::PerlX, &bytemap); |
213 | EXPECT_EQ("[00-09] -> 0\n" |
214 | "[0a-0a] -> 1\n" |
215 | "[0b-7f] -> 0\n" |
216 | "[80-8f] -> 2\n" |
217 | "[90-9f] -> 3\n" |
218 | "[a0-bf] -> 4\n" |
219 | "[c0-c1] -> 1\n" |
220 | "[c2-df] -> 5\n" |
221 | "[e0-e0] -> 6\n" |
222 | "[e1-ef] -> 7\n" |
223 | "[f0-f0] -> 8\n" |
224 | "[f1-f3] -> 9\n" |
225 | "[f4-f4] -> 10\n" |
226 | "[f5-ff] -> 1\n" , |
227 | bytemap); |
228 | } |
229 | |
230 | TEST(TestCompile, InsufficientMemory) { |
231 | Regexp* re = Regexp::Parse( |
232 | "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$" , |
233 | Regexp::LikePerl, NULL); |
234 | EXPECT_TRUE(re != NULL); |
235 | Prog* prog = re->CompileToProg(920); |
236 | // If the memory budget has been exhausted, compilation should fail |
237 | // and return NULL instead of trying to do anything with NoMatch(). |
238 | EXPECT_TRUE(prog == NULL); |
239 | re->Decref(); |
240 | } |
241 | |
242 | static void Dump(StringPiece pattern, Regexp::ParseFlags flags, |
243 | std::string* forward, std::string* reverse) { |
244 | Regexp* re = Regexp::Parse(pattern, flags, NULL); |
245 | EXPECT_TRUE(re != NULL); |
246 | |
247 | if (forward != NULL) { |
248 | Prog* prog = re->CompileToProg(0); |
249 | EXPECT_TRUE(prog != NULL); |
250 | *forward = prog->Dump(); |
251 | delete prog; |
252 | } |
253 | |
254 | if (reverse != NULL) { |
255 | Prog* prog = re->CompileToReverseProg(0); |
256 | EXPECT_TRUE(prog != NULL); |
257 | *reverse = prog->Dump(); |
258 | delete prog; |
259 | } |
260 | |
261 | re->Decref(); |
262 | } |
263 | |
264 | TEST(TestCompile, Bug26705922) { |
265 | // Bug in the compiler caused inefficient bytecode to be generated for Unicode |
266 | // groups: common suffixes were cached, but common prefixes were not factored. |
267 | |
268 | std::string forward, reverse; |
269 | |
270 | Dump("[\\x{10000}\\x{10010}]" , Regexp::LikePerl, &forward, &reverse); |
271 | EXPECT_EQ("3. byte [f0-f0] 0 -> 4\n" |
272 | "4. byte [90-90] 0 -> 5\n" |
273 | "5. byte [80-80] 0 -> 6\n" |
274 | "6+ byte [80-80] 0 -> 8\n" |
275 | "7. byte [90-90] 0 -> 8\n" |
276 | "8. match! 0\n" , |
277 | forward); |
278 | EXPECT_EQ("3+ byte [80-80] 0 -> 5\n" |
279 | "4. byte [90-90] 0 -> 5\n" |
280 | "5. byte [80-80] 0 -> 6\n" |
281 | "6. byte [90-90] 0 -> 7\n" |
282 | "7. byte [f0-f0] 0 -> 8\n" |
283 | "8. match! 0\n" , |
284 | reverse); |
285 | |
286 | Dump("[\\x{8000}-\\x{10FFF}]" , Regexp::LikePerl, &forward, &reverse); |
287 | EXPECT_EQ("3+ byte [e8-ef] 0 -> 5\n" |
288 | "4. byte [f0-f0] 0 -> 8\n" |
289 | "5. byte [80-bf] 0 -> 6\n" |
290 | "6. byte [80-bf] 0 -> 7\n" |
291 | "7. match! 0\n" |
292 | "8. byte [90-90] 0 -> 5\n" , |
293 | forward); |
294 | EXPECT_EQ("3. byte [80-bf] 0 -> 4\n" |
295 | "4. byte [80-bf] 0 -> 5\n" |
296 | "5+ byte [e8-ef] 0 -> 7\n" |
297 | "6. byte [90-90] 0 -> 8\n" |
298 | "7. match! 0\n" |
299 | "8. byte [f0-f0] 0 -> 7\n" , |
300 | reverse); |
301 | |
302 | Dump("[\\x{80}-\\x{10FFFF}]" , Regexp::LikePerl, NULL, &reverse); |
303 | EXPECT_EQ("3. byte [80-bf] 0 -> 4\n" |
304 | "4+ byte [c2-df] 0 -> 7\n" |
305 | "5+ byte [a0-bf] 1 -> 8\n" |
306 | "6. byte [80-bf] 0 -> 9\n" |
307 | "7. match! 0\n" |
308 | "8. byte [e0-e0] 0 -> 7\n" |
309 | "9+ byte [e1-ef] 0 -> 7\n" |
310 | "10+ byte [90-bf] 1 -> 13\n" |
311 | "11+ byte [80-bf] 1 -> 14\n" |
312 | "12. byte [80-8f] 0 -> 15\n" |
313 | "13. byte [f0-f0] 0 -> 7\n" |
314 | "14. byte [f1-f3] 0 -> 7\n" |
315 | "15. byte [f4-f4] 0 -> 7\n" , |
316 | reverse); |
317 | } |
318 | |
319 | TEST(TestCompile, Bug35237384) { |
320 | // Bug in the compiler caused inefficient bytecode to be generated for |
321 | // nested nullable subexpressions. |
322 | |
323 | std::string forward; |
324 | |
325 | Dump("a**{3,}" , Regexp::Latin1|Regexp::NeverCapture, &forward, NULL); |
326 | EXPECT_EQ("3+ byte [61-61] 1 -> 3\n" |
327 | "4. nop -> 5\n" |
328 | "5+ byte [61-61] 1 -> 5\n" |
329 | "6. nop -> 7\n" |
330 | "7+ byte [61-61] 1 -> 7\n" |
331 | "8. match! 0\n" , |
332 | forward); |
333 | |
334 | Dump("(a*|b*)*{3,}" , Regexp::Latin1|Regexp::NeverCapture, &forward, NULL); |
335 | EXPECT_EQ("3+ nop -> 6\n" |
336 | "4+ nop -> 8\n" |
337 | "5. nop -> 21\n" |
338 | "6+ byte [61-61] 1 -> 6\n" |
339 | "7. nop -> 3\n" |
340 | "8+ byte [62-62] 1 -> 8\n" |
341 | "9. nop -> 3\n" |
342 | "10+ byte [61-61] 1 -> 10\n" |
343 | "11. nop -> 21\n" |
344 | "12+ byte [62-62] 1 -> 12\n" |
345 | "13. nop -> 21\n" |
346 | "14+ byte [61-61] 1 -> 14\n" |
347 | "15. nop -> 18\n" |
348 | "16+ byte [62-62] 1 -> 16\n" |
349 | "17. nop -> 18\n" |
350 | "18+ nop -> 14\n" |
351 | "19+ nop -> 16\n" |
352 | "20. match! 0\n" |
353 | "21+ nop -> 10\n" |
354 | "22+ nop -> 12\n" |
355 | "23. nop -> 18\n" , |
356 | forward); |
357 | |
358 | Dump("((|S.+)+|(|S.+)+|){2}" , Regexp::Latin1|Regexp::NeverCapture, &forward, NULL); |
359 | EXPECT_EQ("3+ nop -> 36\n" |
360 | "4+ nop -> 31\n" |
361 | "5. nop -> 33\n" |
362 | "6+ byte [00-09] 0 -> 8\n" |
363 | "7. byte [0b-ff] 0 -> 8\n" |
364 | "8+ nop -> 6\n" |
365 | "9+ nop -> 29\n" |
366 | "10. nop -> 28\n" |
367 | "11+ byte [00-09] 0 -> 13\n" |
368 | "12. byte [0b-ff] 0 -> 13\n" |
369 | "13+ nop -> 11\n" |
370 | "14+ nop -> 26\n" |
371 | "15. nop -> 28\n" |
372 | "16+ byte [00-09] 0 -> 18\n" |
373 | "17. byte [0b-ff] 0 -> 18\n" |
374 | "18+ nop -> 16\n" |
375 | "19+ nop -> 36\n" |
376 | "20. nop -> 33\n" |
377 | "21+ byte [00-09] 0 -> 23\n" |
378 | "22. byte [0b-ff] 0 -> 23\n" |
379 | "23+ nop -> 21\n" |
380 | "24+ nop -> 31\n" |
381 | "25. nop -> 33\n" |
382 | "26+ nop -> 28\n" |
383 | "27. byte [53-53] 0 -> 11\n" |
384 | "28. match! 0\n" |
385 | "29+ nop -> 28\n" |
386 | "30. byte [53-53] 0 -> 6\n" |
387 | "31+ nop -> 33\n" |
388 | "32. byte [53-53] 0 -> 21\n" |
389 | "33+ nop -> 29\n" |
390 | "34+ nop -> 26\n" |
391 | "35. nop -> 28\n" |
392 | "36+ nop -> 33\n" |
393 | "37. byte [53-53] 0 -> 16\n" , |
394 | forward); |
395 | } |
396 | |
397 | } // namespace re2 |
398 | |