1 | // Copyright 2006 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // Test parse.cc, dump.cc, and tostring.cc. |
6 | |
#include <string>
#include <vector>

#include "util/test.h"
#include "util/logging.h"
#include "re2/regexp.h"
12 | |
13 | namespace re2 { |
14 | |
// Marker bit that lets a table entry ask for "no parse flags at all".
// A zero flags field in a Test means "use the caller's default flags",
// so such entries set this bit instead and the test code masks it off
// again before calling Regexp::Parse().
//
// In the past, we used 1<<30 here and zeroed the bit later, but that
// has undefined behaviour, so now we use an internal-only flag because
// otherwise we would have to introduce a new flag value just for this.
static const Regexp::ParseFlags TestZeroFlags = Regexp::WasDollar;
19 | |
// One parser test case. The tables below brace-initialize this struct
// positionally, so the field order must not change.
struct Test {
  // Regular expression source text handed to Regexp::Parse().
  const char* regexp;
  // Expected result of calling Dump() on the parsed Regexp.
  const char* parse;
  // Per-entry parse flags. When left zero (i.e. omitted from the
  // initializer) the flags passed to the test driver are used instead;
  // when nonzero, TestZeroFlags is masked off before parsing.
  Regexp::ParseFlags flags;
};
25 | |
26 | static Regexp::ParseFlags kTestFlags = Regexp::MatchNL | |
27 | Regexp::PerlX | |
28 | Regexp::PerlClasses | |
29 | Regexp::UnicodeGroups; |
30 | |
31 | static Test tests[] = { |
32 | // Base cases |
33 | { "a" , "lit{a}" }, |
34 | { "a." , "cat{lit{a}dot{}}" }, |
35 | { "a.b" , "cat{lit{a}dot{}lit{b}}" }, |
36 | { "ab" , "str{ab}" }, |
37 | { "a.b.c" , "cat{lit{a}dot{}lit{b}dot{}lit{c}}" }, |
38 | { "abc" , "str{abc}" }, |
39 | { "a|^" , "alt{lit{a}bol{}}" }, |
40 | { "a|b" , "cc{0x61-0x62}" }, |
41 | { "(a)" , "cap{lit{a}}" }, |
42 | { "(a)|b" , "alt{cap{lit{a}}lit{b}}" }, |
43 | { "a*" , "star{lit{a}}" }, |
44 | { "a+" , "plus{lit{a}}" }, |
45 | { "a?" , "que{lit{a}}" }, |
46 | { "a{2}" , "rep{2,2 lit{a}}" }, |
47 | { "a{2,3}" , "rep{2,3 lit{a}}" }, |
48 | { "a{2,}" , "rep{2,-1 lit{a}}" }, |
49 | { "a*?" , "nstar{lit{a}}" }, |
50 | { "a+?" , "nplus{lit{a}}" }, |
51 | { "a??" , "nque{lit{a}}" }, |
52 | { "a{2}?" , "nrep{2,2 lit{a}}" }, |
53 | { "a{2,3}?" , "nrep{2,3 lit{a}}" }, |
54 | { "a{2,}?" , "nrep{2,-1 lit{a}}" }, |
55 | { "" , "emp{}" }, |
56 | { "|" , "alt{emp{}emp{}}" }, |
57 | { "|x|" , "alt{emp{}lit{x}emp{}}" }, |
58 | { "." , "dot{}" }, |
59 | { "^" , "bol{}" }, |
60 | { "$" , "eol{}" }, |
61 | { "\\|" , "lit{|}" }, |
62 | { "\\(" , "lit{(}" }, |
63 | { "\\)" , "lit{)}" }, |
64 | { "\\*" , "lit{*}" }, |
65 | { "\\+" , "lit{+}" }, |
66 | { "\\?" , "lit{?}" }, |
67 | { "{" , "lit{{}" }, |
68 | { "}" , "lit{}}" }, |
69 | { "\\." , "lit{.}" }, |
70 | { "\\^" , "lit{^}" }, |
71 | { "\\$" , "lit{$}" }, |
72 | { "\\\\" , "lit{\\}" }, |
73 | { "[ace]" , "cc{0x61 0x63 0x65}" }, |
74 | { "[abc]" , "cc{0x61-0x63}" }, |
75 | { "[a-z]" , "cc{0x61-0x7a}" }, |
76 | { "[a]" , "lit{a}" }, |
77 | { "\\-" , "lit{-}" }, |
78 | { "-" , "lit{-}" }, |
79 | { "\\_" , "lit{_}" }, |
80 | |
81 | // Posix and Perl extensions |
82 | { "[[:lower:]]" , "cc{0x61-0x7a}" }, |
83 | { "[a-z]" , "cc{0x61-0x7a}" }, |
84 | { "[^[:lower:]]" , "cc{0-0x60 0x7b-0x10ffff}" }, |
85 | { "[[:^lower:]]" , "cc{0-0x60 0x7b-0x10ffff}" }, |
86 | { "(?i)[[:lower:]]" , "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, |
87 | { "(?i)[a-z]" , "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, |
88 | { "(?i)[^[:lower:]]" , "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, |
89 | { "(?i)[[:^lower:]]" , "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, |
90 | { "\\d" , "cc{0x30-0x39}" }, |
91 | { "\\D" , "cc{0-0x2f 0x3a-0x10ffff}" }, |
92 | { "\\s" , "cc{0x9-0xa 0xc-0xd 0x20}" }, |
93 | { "\\S" , "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" }, |
94 | { "\\w" , "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" }, |
95 | { "\\W" , "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" }, |
96 | { "(?i)\\w" , "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" }, |
97 | { "(?i)\\W" , "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, |
98 | { "[^\\\\]" , "cc{0-0x5b 0x5d-0x10ffff}" }, |
99 | { "\\C" , "byte{}" }, |
100 | |
101 | // Unicode, negatives, and a double negative. |
102 | { "\\p{Braille}" , "cc{0x2800-0x28ff}" }, |
103 | { "\\P{Braille}" , "cc{0-0x27ff 0x2900-0x10ffff}" }, |
104 | { "\\p{^Braille}" , "cc{0-0x27ff 0x2900-0x10ffff}" }, |
105 | { "\\P{^Braille}" , "cc{0x2800-0x28ff}" }, |
106 | |
107 | // More interesting regular expressions. |
108 | { "a{,2}" , "str{a{,2}}" }, |
109 | { "\\.\\^\\$\\\\" , "str{.^$\\}" }, |
110 | { "[a-zABC]" , "cc{0x41-0x43 0x61-0x7a}" }, |
111 | { "[^a]" , "cc{0-0x60 0x62-0x10ffff}" }, |
112 | { "[\xce\xb1-\xce\xb5\xe2\x98\xba]" , "cc{0x3b1-0x3b5 0x263a}" }, // utf-8 |
113 | { "a*{" , "cat{star{lit{a}}lit{{}}" }, |
114 | |
115 | // Test precedences |
116 | { "(?:ab)*" , "star{str{ab}}" }, |
117 | { "(ab)*" , "star{cap{str{ab}}}" }, |
118 | { "ab|cd" , "alt{str{ab}str{cd}}" }, |
119 | { "a(b|c)d" , "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" }, |
120 | |
121 | // Test squashing of **, ++, ?? et cetera. |
122 | { "(?:(?:a)*)*" , "star{lit{a}}" }, |
123 | { "(?:(?:a)+)+" , "plus{lit{a}}" }, |
124 | { "(?:(?:a)?)?" , "que{lit{a}}" }, |
125 | { "(?:(?:a)*)+" , "star{lit{a}}" }, |
126 | { "(?:(?:a)*)?" , "star{lit{a}}" }, |
127 | { "(?:(?:a)+)*" , "star{lit{a}}" }, |
128 | { "(?:(?:a)+)?" , "star{lit{a}}" }, |
129 | { "(?:(?:a)?)*" , "star{lit{a}}" }, |
130 | { "(?:(?:a)?)+" , "star{lit{a}}" }, |
131 | |
132 | // Test flattening. |
133 | { "(?:a)" , "lit{a}" }, |
134 | { "(?:ab)(?:cd)" , "str{abcd}" }, |
135 | { "(?:a|b)|(?:c|d)" , "cc{0x61-0x64}" }, |
136 | { "a|c" , "cc{0x61 0x63}" }, |
137 | { "a|[cd]" , "cc{0x61 0x63-0x64}" }, |
138 | { "a|." , "dot{}" }, |
139 | { "[ab]|c" , "cc{0x61-0x63}" }, |
140 | { "[ab]|[cd]" , "cc{0x61-0x64}" }, |
141 | { "[ab]|." , "dot{}" }, |
142 | { ".|c" , "dot{}" }, |
143 | { ".|[cd]" , "dot{}" }, |
144 | { ".|." , "dot{}" }, |
145 | |
146 | // Test Perl quoted literals |
147 | { "\\Q+|*?{[\\E" , "str{+|*?{[}" }, |
148 | { "\\Q+\\E+" , "plus{lit{+}}" }, |
149 | { "\\Q\\\\E" , "lit{\\}" }, |
150 | { "\\Q\\\\\\E" , "str{\\\\}" }, |
151 | { "\\Qa\\E*" , "star{lit{a}}" }, |
152 | { "\\Qab\\E*" , "cat{lit{a}star{lit{b}}}" }, |
153 | { "\\Qabc\\E*" , "cat{str{ab}star{lit{c}}}" }, |
154 | |
155 | // Test Perl \A and \z |
156 | { "(?m)^" , "bol{}" }, |
157 | { "(?m)$" , "eol{}" }, |
158 | { "(?-m)^" , "bot{}" }, |
159 | { "(?-m)$" , "eot{}" }, |
160 | { "(?m)\\A" , "bot{}" }, |
161 | { "(?m)\\z" , "eot{\\z}" }, |
162 | { "(?-m)\\A" , "bot{}" }, |
163 | { "(?-m)\\z" , "eot{\\z}" }, |
164 | |
165 | // Test named captures |
166 | { "(?P<name>a)" , "cap{name:lit{a}}" }, |
167 | |
168 | // Case-folded literals |
169 | { "[Aa]" , "litfold{a}" }, |
170 | |
171 | // Strings |
172 | { "abcde" , "str{abcde}" }, |
173 | { "[Aa][Bb]cd" , "cat{strfold{ab}str{cd}}" }, |
174 | |
175 | // Reported bug involving \n leaking in despite use of NeverNL. |
176 | { "[^ ]" , "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}" , TestZeroFlags }, |
177 | { "[^ ]" , "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}" , Regexp::FoldCase }, |
178 | { "[^ ]" , "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}" , Regexp::NeverNL }, |
179 | { "[^ ]" , "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}" , Regexp::NeverNL | Regexp::FoldCase }, |
180 | { "[^ \f]" , "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}" , TestZeroFlags }, |
181 | { "[^ \f]" , "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}" , Regexp::FoldCase }, |
182 | { "[^ \f]" , "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}" , Regexp::NeverNL }, |
183 | { "[^ \f]" , "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}" , Regexp::NeverNL | Regexp::FoldCase }, |
184 | { "[^ \r]" , "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}" , TestZeroFlags }, |
185 | { "[^ \r]" , "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}" , Regexp::FoldCase }, |
186 | { "[^ \r]" , "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}" , Regexp::NeverNL }, |
187 | { "[^ \r]" , "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}" , Regexp::NeverNL | Regexp::FoldCase }, |
188 | { "[^ \v]" , "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}" , TestZeroFlags }, |
189 | { "[^ \v]" , "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}" , Regexp::FoldCase }, |
190 | { "[^ \v]" , "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}" , Regexp::NeverNL }, |
191 | { "[^ \v]" , "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}" , Regexp::NeverNL | Regexp::FoldCase }, |
192 | { "[^ \t]" , "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}" , TestZeroFlags }, |
193 | { "[^ \t]" , "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}" , Regexp::FoldCase }, |
194 | { "[^ \t]" , "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}" , Regexp::NeverNL }, |
195 | { "[^ \t]" , "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}" , Regexp::NeverNL | Regexp::FoldCase }, |
196 | { "[^ \r\f\v]" , "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}" , Regexp::NeverNL }, |
197 | { "[^ \r\f\v]" , "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}" , Regexp::NeverNL | Regexp::FoldCase }, |
198 | { "[^ \r\f\t\v]" , "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}" , Regexp::NeverNL }, |
199 | { "[^ \r\f\t\v]" , "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}" , Regexp::NeverNL | Regexp::FoldCase }, |
200 | { "[^ \r\n\f\t\v]" , "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}" , Regexp::NeverNL }, |
201 | { "[^ \r\n\f\t\v]" , "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}" , Regexp::NeverNL | Regexp::FoldCase }, |
202 | { "[^ \r\n\f\t]" , "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" , Regexp::NeverNL }, |
203 | { "[^ \r\n\f\t]" , "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" , Regexp::NeverNL | Regexp::FoldCase }, |
204 | { "[^\t-\n\f-\r ]" , "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" , |
205 | Regexp::PerlClasses }, |
206 | { "[^\t-\n\f-\r ]" , "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" , |
207 | Regexp::PerlClasses | Regexp::FoldCase }, |
208 | { "[^\t-\n\f-\r ]" , "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" , |
209 | Regexp::PerlClasses | Regexp::NeverNL }, |
210 | { "[^\t-\n\f-\r ]" , "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" , |
211 | Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase }, |
212 | { "\\S" , "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" , |
213 | Regexp::PerlClasses }, |
214 | { "\\S" , "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" , |
215 | Regexp::PerlClasses | Regexp::FoldCase }, |
216 | { "\\S" , "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" , |
217 | Regexp::PerlClasses | Regexp::NeverNL }, |
218 | { "\\S" , "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" , |
219 | Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase }, |
220 | |
221 | // Bug in Regexp::ToString() that emitted [^], which |
222 | // would (obviously) fail to parse when fed back in. |
223 | { "[\\s\\S]" , "cc{0-0x10ffff}" }, |
224 | }; |
225 | |
226 | bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) { |
227 | return Regexp::Equal(a, b); |
228 | } |
229 | |
230 | void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags, |
231 | const std::string& title) { |
232 | Regexp** re = new Regexp*[ntests]; |
233 | for (int i = 0; i < ntests; i++) { |
234 | RegexpStatus status; |
235 | Regexp::ParseFlags f = flags; |
236 | if (tests[i].flags != 0) { |
237 | f = tests[i].flags & ~TestZeroFlags; |
238 | } |
239 | re[i] = Regexp::Parse(tests[i].regexp, f, &status); |
240 | ASSERT_TRUE(re[i] != NULL) |
241 | << " " << tests[i].regexp << " " << status.Text(); |
242 | std::string s = re[i]->Dump(); |
243 | EXPECT_EQ(std::string(tests[i].parse), s) |
244 | << "Regexp: " << tests[i].regexp |
245 | << "\nparse: " << std::string(tests[i].parse) |
246 | << " s: " << s << " flag=" << f; |
247 | } |
248 | |
249 | for (int i = 0; i < ntests; i++) { |
250 | for (int j = 0; j < ntests; j++) { |
251 | EXPECT_EQ(std::string(tests[i].parse) == std::string(tests[j].parse), |
252 | RegexpEqualTestingOnly(re[i], re[j])) |
253 | << "Regexp: " << tests[i].regexp << " " << tests[j].regexp; |
254 | } |
255 | } |
256 | |
257 | for (int i = 0; i < ntests; i++) |
258 | re[i]->Decref(); |
259 | delete[] re; |
260 | } |
261 | |
262 | // Test that regexps parse to expected structures. |
263 | TEST(TestParse, SimpleRegexps) { |
264 | TestParse(tests, arraysize(tests), kTestFlags, "simple" ); |
265 | } |
266 | |
267 | Test foldcase_tests[] = { |
268 | { "AbCdE" , "strfold{abcde}" }, |
269 | { "[Aa]" , "litfold{a}" }, |
270 | { "a" , "litfold{a}" }, |
271 | |
272 | // 0x17F is an old English long s (looks like an f) and folds to s. |
273 | // 0x212A is the Kelvin symbol and folds to k. |
274 | { "A[F-g]" , "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" }, // [Aa][A-z...] |
275 | { "[[:upper:]]" , "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, |
276 | { "[[:lower:]]" , "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, |
277 | }; |
278 | |
279 | // Test that parsing with FoldCase works. |
280 | TEST(TestParse, FoldCase) { |
281 | TestParse(foldcase_tests, arraysize(foldcase_tests), Regexp::FoldCase, "foldcase" ); |
282 | } |
283 | |
284 | Test literal_tests[] = { |
285 | { "(|)^$.[*+?]{5,10},\\" , "str{(|)^$.[*+?]{5,10},\\}" }, |
286 | }; |
287 | |
288 | // Test that parsing with Literal works. |
289 | TEST(TestParse, Literal) { |
290 | TestParse(literal_tests, arraysize(literal_tests), Regexp::Literal, "literal" ); |
291 | } |
292 | |
293 | Test matchnl_tests[] = { |
294 | { "." , "dot{}" }, |
295 | { "\n" , "lit{\n}" }, |
296 | { "[^a]" , "cc{0-0x60 0x62-0x10ffff}" }, |
297 | { "[a\\n]" , "cc{0xa 0x61}" }, |
298 | }; |
299 | |
300 | // Test that parsing with MatchNL works. |
301 | // (Also tested above during simple cases.) |
302 | TEST(TestParse, MatchNL) { |
303 | TestParse(matchnl_tests, arraysize(matchnl_tests), Regexp::MatchNL, "with MatchNL" ); |
304 | } |
305 | |
306 | Test nomatchnl_tests[] = { |
307 | { "." , "cc{0-0x9 0xb-0x10ffff}" }, |
308 | { "\n" , "lit{\n}" }, |
309 | { "[^a]" , "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" }, |
310 | { "[a\\n]" , "cc{0xa 0x61}" }, |
311 | }; |
312 | |
313 | // Test that parsing without MatchNL works. |
314 | TEST(TestParse, NoMatchNL) { |
315 | TestParse(nomatchnl_tests, arraysize(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL" ); |
316 | } |
317 | |
318 | Test prefix_tests[] = { |
319 | { "abc|abd" , "cat{str{ab}cc{0x63-0x64}}" }, |
320 | { "a(?:b)c|abd" , "cat{str{ab}cc{0x63-0x64}}" }, |
321 | { "abc|abd|aef|bcx|bcy" , |
322 | "alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}" |
323 | "cat{str{bc}cc{0x78-0x79}}}" }, |
324 | { "abc|x|abd" , "alt{str{abc}lit{x}str{abd}}" }, |
325 | { "(?i)abc|ABD" , "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" }, |
326 | { "[ab]c|[ab]d" , "cat{cc{0x61-0x62}cc{0x63-0x64}}" }, |
327 | { ".c|.d" , "cat{cc{0-0x9 0xb-0x10ffff}cc{0x63-0x64}}" }, |
328 | { "\\Cc|\\Cd" , "cat{byte{}cc{0x63-0x64}}" }, |
329 | { "x{2}|x{2}[0-9]" , |
330 | "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" }, |
331 | { "x{2}y|x{2}[0-9]y" , |
332 | "cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" }, |
333 | { "n|r|rs" , |
334 | "alt{lit{n}cat{lit{r}alt{emp{}lit{s}}}}" }, |
335 | { "n|rs|r" , |
336 | "alt{lit{n}cat{lit{r}alt{lit{s}emp{}}}}" }, |
337 | { "r|rs|n" , |
338 | "alt{cat{lit{r}alt{emp{}lit{s}}}lit{n}}" }, |
339 | { "rs|r|n" , |
340 | "alt{cat{lit{r}alt{lit{s}emp{}}}lit{n}}" }, |
341 | { "a\\C*?c|a\\C*?b" , |
342 | "cat{lit{a}alt{cat{nstar{byte{}}lit{c}}cat{nstar{byte{}}lit{b}}}}" }, |
343 | { "^/a/bc|^/a/de" , |
344 | "cat{bol{}cat{str{/a/}alt{str{bc}str{de}}}}" }, |
345 | // In the past, factoring was limited to kFactorAlternationMaxDepth (8). |
346 | { "a|aa|aaa|aaaa|aaaaa|aaaaaa|aaaaaaa|aaaaaaaa|aaaaaaaaa|aaaaaaaaaa" , |
347 | "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" |
348 | "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" |
349 | "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" |
350 | "lit{a}}}}}}}}}}}}}}}}}}}" }, |
351 | { "a|aardvark|aardvarks|abaci|aback|abacus|abacuses|abaft|abalone|abalones" , |
352 | "cat{lit{a}alt{emp{}cat{str{ardvark}alt{emp{}lit{s}}}" |
353 | "cat{str{ba}alt{cat{lit{c}alt{cc{0x69 0x6b}cat{str{us}alt{emp{}str{es}}}}}" |
354 | "str{ft}cat{str{lone}alt{emp{}lit{s}}}}}}}" }, |
355 | }; |
356 | |
357 | // Test that prefix factoring works. |
358 | TEST(TestParse, Prefix) { |
359 | TestParse(prefix_tests, arraysize(prefix_tests), Regexp::PerlX, "prefix" ); |
360 | } |
361 | |
362 | Test nested_tests[] = { |
363 | { "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))" , |
364 | "cap{cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}}}}}}}}" }, |
365 | { "((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})" , |
366 | "cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{1,1 lit{x}}}}}}}}}}}}}}}}}}}}}" }, |
367 | { "((((((((((x{0}){2}){2}){2}){2}){2}){2}){2}){2}){2})" , |
368 | "cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{0,0 lit{x}}}}}}}}}}}}}}}}}}}}}" }, |
369 | { "((((((x{2}){2}){2}){5}){5}){5})" , |
370 | "cap{rep{5,5 cap{rep{5,5 cap{rep{5,5 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}" }, |
371 | }; |
372 | |
373 | // Test that nested repetition works. |
374 | TEST(TestParse, Nested) { |
375 | TestParse(nested_tests, arraysize(nested_tests), Regexp::PerlX, "nested" ); |
376 | } |
377 | |
// Invalid regular expressions: the InvalidRegexps test expects each of
// these to be rejected both with Regexp::PerlX and with no flags.
// Neither the array nor the strings are ever modified, so both levels
// are const and the table is private to this file.
static const char* const badtests[] = {
  "(",
  ")",
  "(a",
  "(a|b|",
  "(a|b",
  "[a-z",
  "([a-z)",
  "x{1001}",
  "\xff",  // Invalid UTF-8
  "[\xff]",
  "[\\\xff]",
  "\\\xff",
  "(?P<name>a",
  "(?P<name>",
  "(?P<name",
  "(?P<x y>a)",
  "(?P<>a)",
  "[a-Z]",
  "(?i)[a-Z]",
  "a{100000}",
  "a{100000,}",
  "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
  "(((x{7}){11}){13})",
  "\\Q\\E*",
};
405 | |
// Valid in Perl, bad in POSIX: the InvalidRegexps test expects these to
// parse with Regexp::PerlX and to be rejected with no flags.
// Read-only and private to this file, so const at both levels.
static const char* const only_perl[] = {
  "[a-b-c]",
  "\\Qabc\\E",
  "\\Q*+?{[\\E",
  "\\Q\\\\E",
  "\\Q\\\\\\E",
  "\\Q\\\\\\\\E",
  "\\Q\\\\\\\\\\E",
  "(?:a)",
  "(?P<name>a)",
};
418 | |
// Valid in POSIX, bad in Perl: the InvalidRegexps test expects these to
// parse with no flags and to be rejected with Regexp::PerlX.
// Read-only and private to this file, so const at both levels.
static const char* const only_posix[] = {
  "a++",
  "a**",
  "a?*",
  "a+*",
  "a{1}*",
};
427 | |
428 | // Test that parser rejects bad regexps. |
429 | TEST(TestParse, InvalidRegexps) { |
430 | for (size_t i = 0; i < arraysize(badtests); i++) { |
431 | ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL) |
432 | << " " << badtests[i]; |
433 | ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL) |
434 | << " " << badtests[i]; |
435 | } |
436 | for (size_t i = 0; i < arraysize(only_posix); i++) { |
437 | ASSERT_TRUE(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL) |
438 | << " " << only_posix[i]; |
439 | Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL); |
440 | ASSERT_TRUE(re != NULL) << " " << only_posix[i]; |
441 | re->Decref(); |
442 | } |
443 | for (size_t i = 0; i < arraysize(only_perl); i++) { |
444 | ASSERT_TRUE(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL) |
445 | << " " << only_perl[i]; |
446 | Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL); |
447 | ASSERT_TRUE(re != NULL) << " " << only_perl[i]; |
448 | re->Decref(); |
449 | } |
450 | } |
451 | |
452 | // Test that ToString produces original regexp or equivalent one. |
453 | TEST(TestToString, EquivalentParse) { |
454 | for (size_t i = 0; i < arraysize(tests); i++) { |
455 | RegexpStatus status; |
456 | Regexp::ParseFlags f = kTestFlags; |
457 | if (tests[i].flags != 0) { |
458 | f = tests[i].flags & ~TestZeroFlags; |
459 | } |
460 | Regexp* re = Regexp::Parse(tests[i].regexp, f, &status); |
461 | ASSERT_TRUE(re != NULL) << " " << tests[i].regexp << " " << status.Text(); |
462 | std::string s = re->Dump(); |
463 | EXPECT_EQ(std::string(tests[i].parse), s) |
464 | << "Regexp: " << tests[i].regexp |
465 | << "\nparse: " << std::string(tests[i].parse) |
466 | << " s: " << s << " flag=" << f; |
467 | std::string t = re->ToString(); |
468 | if (t != tests[i].regexp) { |
469 | // If ToString didn't return the original regexp, |
470 | // it must have found one with fewer parens. |
471 | // Unfortunately we can't check the length here, because |
472 | // ToString produces "\\{" for a literal brace, |
473 | // but "{" is a shorter equivalent. |
474 | // ASSERT_LT(t.size(), strlen(tests[i].regexp)) |
475 | // << " t=" << t << " regexp=" << tests[i].regexp; |
476 | |
477 | // Test that if we parse the new regexp we get the same structure. |
478 | Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status); |
479 | ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text(); |
480 | std::string ss = nre->Dump(); |
481 | std::string tt = nre->ToString(); |
482 | if (s != ss || t != tt) |
483 | LOG(INFO) << "ToString(" << tests[i].regexp << ") = " << t; |
484 | EXPECT_EQ(s, ss); |
485 | EXPECT_EQ(t, tt); |
486 | nre->Decref(); |
487 | } |
488 | re->Decref(); |
489 | } |
490 | } |
491 | |
492 | // Test that capture error args are correct. |
493 | TEST(NamedCaptures, ErrorArgs) { |
494 | RegexpStatus status; |
495 | Regexp* re; |
496 | |
497 | re = Regexp::Parse("test(?P<name" , Regexp::LikePerl, &status); |
498 | EXPECT_TRUE(re == NULL); |
499 | EXPECT_EQ(status.code(), kRegexpBadNamedCapture); |
500 | EXPECT_EQ(status.error_arg(), "(?P<name" ); |
501 | |
502 | re = Regexp::Parse("test(?P<space bar>z)" , Regexp::LikePerl, &status); |
503 | EXPECT_TRUE(re == NULL); |
504 | EXPECT_EQ(status.code(), kRegexpBadNamedCapture); |
505 | EXPECT_EQ(status.error_arg(), "(?P<space bar>" ); |
506 | } |
507 | |
508 | } // namespace re2 |
509 | |