parse_test.cc source code [RE2/re2/testing/parse_test.cc]

1	// Copyright 2006 The RE2 Authors. All Rights Reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	// Test parse.cc, dump.cc, and tostring.cc.
6
7	#include <string>
8
9	#include "util/test.h"
10	#include "util/logging.h"
11	#include "re2/regexp.h"
12
13	namespace re2 {
14
15	// In the past, we used 1<<30 here and zeroed the bit later, but that
16	// has undefined behaviour, so now we use an internal-only flag because
17	// otherwise we would have to introduce a new flag value just for this.
18	static const Regexp::ParseFlags TestZeroFlags = Regexp::WasDollar;
19
20	struct Test {
21	const char* regexp;
22	const char* parse;
23	Regexp::ParseFlags flags;
24	};
25
26	static Regexp::ParseFlags kTestFlags = Regexp::MatchNL \|
27	Regexp::PerlX \|
28	Regexp::PerlClasses \|
29	Regexp::UnicodeGroups;
30
31	static Test tests[] = {
32	// Base cases
33	{ "a", "lit{a}" },
34	{ "a.", "cat{lit{a}dot{}}" },
35	{ "a.b", "cat{lit{a}dot{}lit{b}}" },
36	{ "ab", "str{ab}" },
37	{ "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" },
38	{ "abc", "str{abc}" },
39	{ "a\|^", "alt{lit{a}bol{}}" },
40	{ "a\|b", "cc{0x61-0x62}" },
41	{ "(a)", "cap{lit{a}}" },
42	{ "(a)\|b", "alt{cap{lit{a}}lit{b}}" },
43	{ "a*", "star{lit{a}}" },
44	{ "a+", "plus{lit{a}}" },
45	{ "a?", "que{lit{a}}" },
46	{ "a{2}", "rep{2,2 lit{a}}" },
47	{ "a{2,3}", "rep{2,3 lit{a}}" },
48	{ "a{2,}", "rep{2,-1 lit{a}}" },
49	{ "a*?", "nstar{lit{a}}" },
50	{ "a+?", "nplus{lit{a}}" },
51	{ "a??", "nque{lit{a}}" },
52	{ "a{2}?", "nrep{2,2 lit{a}}" },
53	{ "a{2,3}?", "nrep{2,3 lit{a}}" },
54	{ "a{2,}?", "nrep{2,-1 lit{a}}" },
55	{ "", "emp{}" },
56	{ "\|", "alt{emp{}emp{}}" },
57	{ "\|x\|", "alt{emp{}lit{x}emp{}}" },
58	{ ".", "dot{}" },
59	{ "^", "bol{}" },
60	{ "$", "eol{}" },
61	{ "\\\|", "lit{\|}" },
62	{ "\\(", "lit{(}" },
63	{ "\\)", "lit{)}" },
64	{ "\\", "lit{}" },
65	{ "\\+", "lit{+}" },
66	{ "\\?", "lit{?}" },
67	{ "{", "lit{{}" },
68	{ "}", "lit{}}" },
69	{ "\\.", "lit{.}" },
70	{ "\\^", "lit{^}" },
71	{ "\\$", "lit{$}" },
72	{ "\\\\", "lit{\\}" },
73	{ "[ace]", "cc{0x61 0x63 0x65}" },
74	{ "[abc]", "cc{0x61-0x63}" },
75	{ "[a-z]", "cc{0x61-0x7a}" },
76	{ "[a]", "lit{a}" },
77	{ "\\-", "lit{-}" },
78	{ "-", "lit{-}" },
79	{ "\\_", "lit{_}" },
80
81	// Posix and Perl extensions
82	{ "[[:lower:]]", "cc{0x61-0x7a}" },
83	{ "[a-z]", "cc{0x61-0x7a}" },
84	{ "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
85	{ "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
86	{ "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
87	{ "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
88	{ "(?i)[^[:lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
89	{ "(?i)[[:^lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
90	{ "\\d", "cc{0x30-0x39}" },
91	{ "\\D", "cc{0-0x2f 0x3a-0x10ffff}" },
92	{ "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
93	{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
94	{ "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
95	{ "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
96	{ "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
97	{ "(?i)\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
98	{ "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" },
99	{ "\\C", "byte{}" },
100
101	// Unicode, negatives, and a double negative.
102	{ "\\p{Braille}", "cc{0x2800-0x28ff}" },
103	{ "\\P{Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
104	{ "\\p{^Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
105	{ "\\P{^Braille}", "cc{0x2800-0x28ff}" },
106
107	// More interesting regular expressions.
108	{ "a{,2}", "str{a{,2}}" },
109	{ "\\.\\^\\$\\\\", "str{.^$\\}" },
110	{ "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" },
111	{ "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
112	{ "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" }, // utf-8
113	{ "a*{", "cat{star{lit{a}}lit{{}}" },
114
115	// Test precedences
116	{ "(?:ab)*", "star{str{ab}}" },
117	{ "(ab)*", "star{cap{str{ab}}}" },
118	{ "ab\|cd", "alt{str{ab}str{cd}}" },
119	{ "a(b\|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
120
121	// Test squashing of , ++, ?? et cetera.
122	{ "(?:(?:a))", "star{lit{a}}" },
123	{ "(?:(?:a)+)+", "plus{lit{a}}" },
124	{ "(?:(?:a)?)?", "que{lit{a}}" },
125	{ "(?:(?:a)*)+", "star{lit{a}}" },
126	{ "(?:(?:a)*)?", "star{lit{a}}" },
127	{ "(?:(?:a)+)*", "star{lit{a}}" },
128	{ "(?:(?:a)+)?", "star{lit{a}}" },
129	{ "(?:(?:a)?)*", "star{lit{a}}" },
130	{ "(?:(?:a)?)+", "star{lit{a}}" },
131
132	// Test flattening.
133	{ "(?:a)", "lit{a}" },
134	{ "(?:ab)(?:cd)", "str{abcd}" },
135	{ "(?:a\|b)\|(?:c\|d)", "cc{0x61-0x64}" },
136	{ "a\|c", "cc{0x61 0x63}" },
137	{ "a\|[cd]", "cc{0x61 0x63-0x64}" },
138	{ "a\|.", "dot{}" },
139	{ "[ab]\|c", "cc{0x61-0x63}" },
140	{ "[ab]\|[cd]", "cc{0x61-0x64}" },
141	{ "[ab]\|.", "dot{}" },
142	{ ".\|c", "dot{}" },
143	{ ".\|[cd]", "dot{}" },
144	{ ".\|.", "dot{}" },
145
146	// Test Perl quoted literals
147	{ "\\Q+\|?{[\\E", "str{+\|?{[}" },
148	{ "\\Q+\\E+", "plus{lit{+}}" },
149	{ "\\Q\\\\E", "lit{\\}" },
150	{ "\\Q\\\\\\E", "str{\\\\}" },
151	{ "\\Qa\\E*", "star{lit{a}}" },
152	{ "\\Qab\\E*", "cat{lit{a}star{lit{b}}}" },
153	{ "\\Qabc\\E*", "cat{str{ab}star{lit{c}}}" },
154
155	// Test Perl \A and \z
156	{ "(?m)^", "bol{}" },
157	{ "(?m)$", "eol{}" },
158	{ "(?-m)^", "bot{}" },
159	{ "(?-m)$", "eot{}" },
160	{ "(?m)\\A", "bot{}" },
161	{ "(?m)\\z", "eot{\\z}" },
162	{ "(?-m)\\A", "bot{}" },
163	{ "(?-m)\\z", "eot{\\z}" },
164
165	// Test named captures
166	{ "(?P<name>a)", "cap{name:lit{a}}" },
167
168	// Case-folded literals
169	{ "[Aa]", "litfold{a}" },
170
171	// Strings
172	{ "abcde", "str{abcde}" },
173	{ "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },
174
175	// Reported bug involving \n leaking in despite use of NeverNL.
176	{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
177	{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
178	{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
179	{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL \| Regexp::FoldCase },
180	{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", TestZeroFlags },
181	{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
182	{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
183	{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL \| Regexp::FoldCase },
184	{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", TestZeroFlags },
185	{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
186	{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
187	{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL \| Regexp::FoldCase },
188	{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", TestZeroFlags },
189	{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
190	{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
191	{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL \| Regexp::FoldCase },
192	{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
193	{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
194	{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
195	{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL \| Regexp::FoldCase },
196	{ "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
197	{ "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL \| Regexp::FoldCase },
198	{ "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
199	{ "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL \| Regexp::FoldCase },
200	{ "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
201	{ "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL \| Regexp::FoldCase },
202	{ "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
203	{ "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL \| Regexp::FoldCase },
204	{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
205	Regexp::PerlClasses },
206	{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
207	Regexp::PerlClasses \| Regexp::FoldCase },
208	{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
209	Regexp::PerlClasses \| Regexp::NeverNL },
210	{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
211	Regexp::PerlClasses \| Regexp::NeverNL \| Regexp::FoldCase },
212	{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
213	Regexp::PerlClasses },
214	{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
215	Regexp::PerlClasses \| Regexp::FoldCase },
216	{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
217	Regexp::PerlClasses \| Regexp::NeverNL },
218	{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
219	Regexp::PerlClasses \| Regexp::NeverNL \| Regexp::FoldCase },
220
221	// Bug in Regexp::ToString() that emitted [^], which
222	// would (obviously) fail to parse when fed back in.
223	{ "[\\s\\S]", "cc{0-0x10ffff}" },
224	};
225
226	bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
227	return Regexp::Equal(a, b);
228	}
229
230	void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags,
231	const std::string& title) {
232	Regexp re = new** Regexp*[ntests];
233	for (int i = `0`; i < ntests; i++) {
234	RegexpStatus status;
235	Regexp::ParseFlags f = flags;
236	if (tests[i].flags != `0`) {
237	f = tests[i].flags & ~TestZeroFlags;
238	}
239	re[i] = Regexp::Parse(tests[i].regexp, f, &status);
240	ASSERT_TRUE(re[i] != NULL)
241	<< " " << tests[i].regexp << " " << status.Text();
242	std::string s = re[i]->Dump();
243	EXPECT_EQ(std::string (tests[i].parse), s)
244	<< "Regexp: " << tests[i].regexp
245	<< "\nparse: " << std::string (tests[i].parse)
246	<< " s: " << s << " flag=" << f;
247	}
248
249	for (int i = `0`; i < ntests; i++) {
250	for (int j = `0`; j < ntests; j++) {
251	EXPECT_EQ(std::string (tests[i].parse) == std::string (tests[j].parse),
252	RegexpEqualTestingOnly(re[i], re[j]))
253	<< "Regexp: " << tests[i].regexp << " " << tests[j].regexp;
254	}
255	}
256
257	for (int i = `0`; i < ntests; i++)
258	re[i]->Decref();
259	delete[] re;
260	}
261
262	// Test that regexps parse to expected structures.
263	TEST(TestParse, SimpleRegexps) {
264	TestParse(tests, arraysize(tests), kTestFlags, "simple");
265	}
266
267	Test foldcase_tests[] = {
268	{ "AbCdE", "strfold{abcde}" },
269	{ "[Aa]", "litfold{a}" },
270	{ "a", "litfold{a}" },
271
272	// 0x17F is an old English long s (looks like an f) and folds to s.
273	// 0x212A is the Kelvin symbol and folds to k.
274	{ "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" }, // [Aa][A-z...]
275	{ "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
276	{ "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
277	};
278
279	// Test that parsing with FoldCase works.
280	TEST(TestParse, FoldCase) {
281	TestParse(foldcase_tests, arraysize(foldcase_tests), Regexp::FoldCase, "foldcase");
282	}
283
284	Test literal_tests[] = {
285	{ "(\|)^$.[+?]{5,10},\\", "str{(\|)^$.[+?]{5,10},\\}" },
286	};
287
288	// Test that parsing with Literal works.
289	TEST(TestParse, Literal) {
290	TestParse(literal_tests, arraysize(literal_tests), Regexp::Literal, "literal");
291	}
292
293	Test matchnl_tests[] = {
294	{ ".", "dot{}" },
295	{ "\n", "lit{\n}" },
296	{ "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
297	{ "[a\\n]", "cc{0xa 0x61}" },
298	};
299
300	// Test that parsing with MatchNL works.
301	// (Also tested above during simple cases.)
302	TEST(TestParse, MatchNL) {
303	TestParse(matchnl_tests, arraysize(matchnl_tests), Regexp::MatchNL, "with MatchNL");
304	}
305
306	Test nomatchnl_tests[] = {
307	{ ".", "cc{0-0x9 0xb-0x10ffff}" },
308	{ "\n", "lit{\n}" },
309	{ "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" },
310	{ "[a\\n]", "cc{0xa 0x61}" },
311	};
312
313	// Test that parsing without MatchNL works.
314	TEST(TestParse, NoMatchNL) {
315	TestParse(nomatchnl_tests, arraysize(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL");
316	}
317
318	Test prefix_tests[] = {
319	{ "abc\|abd", "cat{str{ab}cc{0x63-0x64}}" },
320	{ "a(?:b)c\|abd", "cat{str{ab}cc{0x63-0x64}}" },
321	{ "abc\|abd\|aef\|bcx\|bcy",
322	"alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}"
323	"cat{str{bc}cc{0x78-0x79}}}" },
324	{ "abc\|x\|abd", "alt{str{abc}lit{x}str{abd}}" },
325	{ "(?i)abc\|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" },
326	{ "[ab]c\|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" },
327	{ ".c\|.d", "cat{cc{0-0x9 0xb-0x10ffff}cc{0x63-0x64}}" },
328	{ "\\Cc\|\\Cd", "cat{byte{}cc{0x63-0x64}}" },
329	{ "x{2}\|x{2}[0-9]",
330	"cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" },
331	{ "x{2}y\|x{2}[0-9]y",
332	"cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" },
333	{ "n\|r\|rs",
334	"alt{lit{n}cat{lit{r}alt{emp{}lit{s}}}}" },
335	{ "n\|rs\|r",
336	"alt{lit{n}cat{lit{r}alt{lit{s}emp{}}}}" },
337	{ "r\|rs\|n",
338	"alt{cat{lit{r}alt{emp{}lit{s}}}lit{n}}" },
339	{ "rs\|r\|n",
340	"alt{cat{lit{r}alt{lit{s}emp{}}}lit{n}}" },
341	{ "a\\C?c\|a\\C?b",
342	"cat{lit{a}alt{cat{nstar{byte{}}lit{c}}cat{nstar{byte{}}lit{b}}}}" },
343	{ "^/a/bc\|^/a/de",
344	"cat{bol{}cat{str{/a/}alt{str{bc}str{de}}}}" },
345	// In the past, factoring was limited to kFactorAlternationMaxDepth (8).
346	{ "a\|aa\|aaa\|aaaa\|aaaaa\|aaaaaa\|aaaaaaa\|aaaaaaaa\|aaaaaaaaa\|aaaaaaaaaa",
347	"cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
348	"cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
349	"cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
350	"lit{a}}}}}}}}}}}}}}}}}}}" },
351	{ "a\|aardvark\|aardvarks\|abaci\|aback\|abacus\|abacuses\|abaft\|abalone\|abalones",
352	"cat{lit{a}alt{emp{}cat{str{ardvark}alt{emp{}lit{s}}}"
353	"cat{str{ba}alt{cat{lit{c}alt{cc{0x69 0x6b}cat{str{us}alt{emp{}str{es}}}}}"
354	"str{ft}cat{str{lone}alt{emp{}lit{s}}}}}}}" },
355	};
356
357	// Test that prefix factoring works.
358	TEST(TestParse, Prefix) {
359	TestParse(prefix_tests, arraysize(prefix_tests), Regexp::PerlX, "prefix");
360	}
361
362	Test nested_tests[] = {
363	{ "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))",
364	"cap{cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}}}}}}}}" },
365	{ "((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
366	"cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{1,1 lit{x}}}}}}}}}}}}}}}}}}}}}" },
367	{ "((((((((((x{0}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
368	"cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{0,0 lit{x}}}}}}}}}}}}}}}}}}}}}" },
369	{ "((((((x{2}){2}){2}){5}){5}){5})",
370	"cap{rep{5,5 cap{rep{5,5 cap{rep{5,5 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}" },
371	};
372
373	// Test that nested repetition works.
374	TEST(TestParse, Nested) {
375	TestParse(nested_tests, arraysize(nested_tests), Regexp::PerlX, "nested");
376	}
377
// Invalid regular expressions: each must be rejected both with
// Regexp::PerlX and with Regexp::NoParseFlags.
const char* badtests[] = {
  "(",
  ")",
  "(a",
  "(a|b|",
  "(a|b",
  "[a-z",
  "([a-z)",
  "x{1001}",
  "\xff",  // Invalid UTF-8
  "[\xff]",
  "[\\\xff]",
  "\\\xff",
  "(?P<name>a",
  "(?P<name>",
  "(?P<name",
  "(?P<x y>a)",
  "(?P<>a)",
  "[a-Z]",
  "(?i)[a-Z]",
  "a{100000}",
  "a{100000,}",
  "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
  "(((x{7}){11}){13})",
  "\\Q\\E*",
};
405
// Accepted with Regexp::PerlX, rejected with Regexp::NoParseFlags
// (valid in Perl, bad in POSIX).
const char* only_perl[] = {
  "[a-b-c]",
  "\\Qabc\\E",
  "\\Q*+?{[\\E",
  "\\Q\\\\E",
  "\\Q\\\\\\E",
  "\\Q\\\\\\\\E",
  "\\Q\\\\\\\\\\E",
  "(?:a)",
  "(?P<name>a)",
};
418
// Accepted with Regexp::NoParseFlags, rejected with Regexp::PerlX
// (valid in POSIX, bad in Perl).
const char* only_posix[] = {
  "a++",
  "a**",
  "a?*",
  "a+*",
  "a{1}*",
};
427
428	// Test that parser rejects bad regexps.
429	TEST(TestParse, InvalidRegexps) {
430	for (size_t i = `0`; i < arraysize(badtests); i++) {
431	ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL)
432	<< " " << badtests[i];
433	ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL)
434	<< " " << badtests[i];
435	}
436	for (size_t i = `0`; i < arraysize(only_posix); i++) {
437	ASSERT_TRUE(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL)
438	<< " " << only_posix[i];
439	Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL);
440	ASSERT_TRUE(re != NULL) << " " << only_posix[i];
441	re->Decref();
442	}
443	for (size_t i = `0`; i < arraysize(only_perl); i++) {
444	ASSERT_TRUE(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL)
445	<< " " << only_perl[i];
446	Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL);
447	ASSERT_TRUE(re != NULL) << " " << only_perl[i];
448	re->Decref();
449	}
450	}
451
452	// Test that ToString produces original regexp or equivalent one.
453	TEST(TestToString, EquivalentParse) {
454	for (size_t i = `0`; i < arraysize(tests); i++) {
455	RegexpStatus status;
456	Regexp::ParseFlags f = kTestFlags;
457	if (tests[i].flags != `0`) {
458	f = tests[i].flags & ~TestZeroFlags;
459	}
460	Regexp* re = Regexp::Parse(tests[i].regexp, f, &status);
461	ASSERT_TRUE(re != NULL) << " " << tests[i].regexp << " " << status.Text();
462	std::string s = re->Dump();
463	EXPECT_EQ(std::string (tests[i].parse), s)
464	<< "Regexp: " << tests[i].regexp
465	<< "\nparse: " << std::string (tests[i].parse)
466	<< " s: " << s << " flag=" << f;
467	std::string t = re->ToString();
468	if (t != tests[i].regexp) {
469	// If ToString didn't return the original regexp,
470	// it must have found one with fewer parens.
471	// Unfortunately we can't check the length here, because
472	// ToString produces "\\{" for a literal brace,
473	// but "{" is a shorter equivalent.
474	// ASSERT_LT(t.size(), strlen(tests[i].regexp))
475	// << " t=" << t << " regexp=" << tests[i].regexp;
476
477	// Test that if we parse the new regexp we get the same structure.
478	Regexp* nre = Regexp::Parse(t, Regexp::MatchNL \| Regexp::PerlX, &status);
479	ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text();
480	std::string ss = nre->Dump();
481	std::string tt = nre->ToString();
482	if (s != ss \|\| t != tt)
483	LOG(INFO) << "ToString(" << tests[i].regexp << ") = " << t;
484	EXPECT_EQ(s, ss);
485	EXPECT_EQ(t, tt);
486	nre->Decref();
487	}
488	re->Decref();
489	}
490	}
491
492	// Test that capture error args are correct.
493	TEST(NamedCaptures, ErrorArgs) {
494	RegexpStatus status;
495	Regexp* re;
496
497	re = Regexp::Parse("test(?P<name", Regexp::LikePerl, &status);
498	EXPECT_TRUE(re == NULL);
499	EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
500	EXPECT_EQ(status.error_arg(), "(?P<name");
501
502	re = Regexp::Parse("test(?P<space bar>z)", Regexp::LikePerl, &status);
503	EXPECT_TRUE(re == NULL);
504	EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
505	EXPECT_EQ(status.error_arg(), "(?P<space bar>");
506	}
507
508	} // namespace re2
509

Browse the source code of RE2/re2/testing/parse_test.cc