simplify_test.cc source code [RE2/re2/testing/simplify_test.cc]

1	// Copyright 2006 The RE2 Authors. All Rights Reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	// Test simplify.cc.
6
7	#include <string.h>
8	#include <string>
9
10	#include "util/test.h"
11	#include "util/logging.h"
12	#include "re2/regexp.h"
13
14	namespace re2 {
15
16	struct Test {
17	const char* regexp;
18	const char* simplified;
19	};
20
21	static Test tests[] = {
22	// Already-simple constructs
23	{ "a", "a" },
24	{ "ab", "ab" },
25	{ "a\|b", "[a-b]" },
26	{ "ab\|cd", "ab\|cd" },
27	{ "(ab)", "(ab)" },
28	{ "(ab)+", "(ab)+" },
29	{ "(ab)?", "(ab)?" },
30	{ ".", "." },
31	{ "^", "^" },
32	{ "$", "$" },
33	{ "[ac]", "[ac]" },
34	{ "[^ac]", "[^ac]" },
35
36	// Posix character classes
37	{ "[[:alnum:]]", "[0-9A-Za-z]" },
38	{ "[[:alpha:]]", "[A-Za-z]" },
39	{ "[[:blank:]]", "[\\t ]" },
40	{ "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" },
41	{ "[[:digit:]]", "[0-9]" },
42	{ "[[:graph:]]", "[!-~]" },
43	{ "[[:lower:]]", "[a-z]" },
44	{ "[[:print:]]", "[ -~]" },
45	{ "[[:punct:]]", "[!-/:-@\\[-`{-~]" },
46	{ "[[:space:]]" , "[\\t-\\r ]" },
47	{ "[[:upper:]]", "[A-Z]" },
48	{ "[[:xdigit:]]", "[0-9A-Fa-f]" },
49
50	// Perl character classes
51	{ "\\d", "[0-9]" },
52	{ "\\s", "[\\t-\\n\\f-\\r ]" },
53	{ "\\w", "[0-9A-Z_a-z]" },
54	{ "\\D", "[^0-9]" },
55	{ "\\S", "[^\\t-\\n\\f-\\r ]" },
56	{ "\\W", "[^0-9A-Z_a-z]" },
57	{ "[\\d]", "[0-9]" },
58	{ "[\\s]", "[\\t-\\n\\f-\\r ]" },
59	{ "[\\w]", "[0-9A-Z_a-z]" },
60	{ "[\\D]", "[^0-9]" },
61	{ "[\\S]", "[^\\t-\\n\\f-\\r ]" },
62	{ "[\\W]", "[^0-9A-Z_a-z]" },
63
64	// Posix repetitions
65	{ "a{1}", "a" },
66	{ "a{2}", "aa" },
67	{ "a{5}", "aaaaa" },
68	{ "a{0,1}", "a?" },
69	// The next three are illegible because Simplify inserts (?:)
70	// parens instead of () parens to avoid creating extra
71	// captured subexpressions. The comments show a version fewer parens.
72	{ "(a){0,2}", "(?:(a)(a)?)?" }, // (aa?)?
73	{ "(a){0,4}", "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // (a(a(aa?)?)?)?
74	{ "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // aa(a(a(aa?)?)?)?
75	{ "a{0,2}", "(?:aa?)?" }, // (aa?)?
76	{ "a{0,4}", "(?:a(?:a(?:aa?)?)?)?" }, // (a(a(aa?)?)?)?
77	{ "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" }, // aa(a(a(aa?)?)?)?
78	{ "a{0,}", "a*" },
79	{ "a{1,}", "a+" },
80	{ "a{2,}", "aa+" },
81	{ "a{5,}", "aaaaa+" },
82
83	// Test that operators simplify their arguments.
84	// (Simplify used to not simplify arguments to a {} repeat.)
85	{ "(?:a{1,}){1,}", "a+" },
86	{ "(a{1,}b{1,})", "(a+b+)" },
87	{ "a{1,}\|b{1,}", "a+\|b+" },
88	{ "(?:a{1,})", "(?:a+)" },
89	{ "(?:a{1,})+", "a+" },
90	{ "(?:a{1,})?", "(?:a+)?" },
91	{ "a{0}", "" },
92
93	// Character class simplification
94	{ "[ab]", "[a-b]" },
95	{ "[a-za-za-z]", "[a-z]" },
96	{ "[A-Za-zA-Za-z]", "[A-Za-z]" },
97	{ "[ABCDEFGH]", "[A-H]" },
98	{ "[AB-CD-EF-GH]", "[A-H]" },
99	{ "[W-ZP-XE-R]", "[E-Z]" },
100	{ "[a-ee-gg-m]", "[a-m]" },
101	{ "[a-ea-ha-m]", "[a-m]" },
102	{ "[a-ma-ha-e]", "[a-m]" },
103	{ "[a-zA-Z0-9 -~]", "[ -~]" },
104
105	// Empty character classes
106	{ "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" },
107
108	// Full character classes
109	{ "[[:cntrl:][:^cntrl:]]", "." },
110
111	// Unicode case folding.
112	{ "(?i)A", "[Aa]" },
113	{ "(?i)a", "[Aa]" },
114	{ "(?i)K", "[Kk\\x{212a}]" },
115	{ "(?i)k", "[Kk\\x{212a}]" },
116	{ "(?i)\\x{212a}", "[Kk\\x{212a}]" },
117	{ "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" },
118	{ "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" },
119	{ "(?i)[\\x00-\\x{10ffff}]", "." },
120
121	// Empty string as a regular expression.
122	// Empty string must be preserved inside parens in order
123	// to make submatches work right, so these are less
124	// interesting than they used to be. ToString inserts
125	// explicit (?:) in place of non-parenthesized empty strings,
126	// to make them easier to spot for other parsers.
127	{ "(a\|b\|)", "([a-b]\|(?:))" },
128	{ "(\|)", "((?:)\|(?:))" },
129	{ "a()", "a()" },
130	{ "(()\|())", "(()\|())" },
131	{ "(a\|)", "(a\|(?:))" },
132	{ "ab()cd()", "ab()cd()" },
133	{ "()", "()" },
134	{ "()", "()" },
135	{ "()+", "()+" },
136	{ "()?" , "()?" },
137	{ "(){0}", "" },
138	{ "(){1}", "()" },
139	{ "(){1,}", "()+" },
140	{ "(){0,2}", "(?:()()?)?" },
141
142	// Test that coalescing occurs and that the resulting repeats are simplified.
143	// Two-op combinations of , +, ?, {n}, {n,} and {n,m} with a literal:*
144	{ "aa", "a*" },
145	{ "a*a+", "a+" },
146	{ "aa?", "a" },
147	{ "a*a{2}", "aa+" },
148	{ "a*a{2,}", "aa+" },
149	{ "a*a{2,3}", "aa+" },
150	{ "a+a*", "a+" },
151	{ "a+a+", "aa+" },
152	{ "a+a?", "a+" },
153	{ "a+a{2}", "aaa+" },
154	{ "a+a{2,}", "aaa+" },
155	{ "a+a{2,3}", "aaa+" },
156	{ "a?a", "a" },
157	{ "a?a+", "a+" },
158	{ "a?a?", "(?:aa?)?" },
159	{ "a?a{2}", "aaa?" },
160	{ "a?a{2,}", "aa+" },
161	{ "a?a{2,3}", "aa(?:aa?)?" },
162	{ "a{2}a*", "aa+" },
163	{ "a{2}a+", "aaa+" },
164	{ "a{2}a?", "aaa?" },
165	{ "a{2}a{2}", "aaaa" },
166	{ "a{2}a{2,}", "aaaa+" },
167	{ "a{2}a{2,3}", "aaaaa?" },
168	{ "a{2,}a*", "aa+" },
169	{ "a{2,}a+", "aaa+" },
170	{ "a{2,}a?", "aa+" },
171	{ "a{2,}a{2}", "aaaa+" },
172	{ "a{2,}a{2,}", "aaaa+" },
173	{ "a{2,}a{2,3}", "aaaa+" },
174	{ "a{2,3}a*", "aa+" },
175	{ "a{2,3}a+", "aaa+" },
176	{ "a{2,3}a?", "aa(?:aa?)?" },
177	{ "a{2,3}a{2}", "aaaaa?" },
178	{ "a{2,3}a{2,}", "aaaa+" },
179	{ "a{2,3}a{2,3}", "aaaa(?:aa?)?" },
180	// With a char class, any char and any byte:
181	{ "\\d\\d", "[0-9]*" },
182	{ "..", ".*" },
183	{ "\\C\\C", "\\C*" },
184	// FoldCase works, but must be consistent:
185	{ "(?i)Aa", "[Aa]*" },
186	{ "(?i)a+A+", "[Aa][Aa]+" },
187	{ "(?i)A(?-i)a", "[Aa]a" },
188	{ "(?i)a+(?-i)A+", "[Aa]+A+" },
189	// NonGreedy works, but must be consistent:
190	{ "a?a?", "a*?" },
191	{ "a+?a+?", "aa+?" },
192	{ "a?a", "a?a" },
193	{ "a+a+?", "a+a+?" },
194	// The second element is the literal, char class, any char or any byte:
195	{ "a*a", "a+" },
196	{ "\\d*\\d", "[0-9]+" },
197	{ ".*.", ".+" },
198	{ "\\C*\\C", "\\C+" },
199	// FoldCase works, but must be consistent:
200	{ "(?i)A*a", "[Aa]+" },
201	{ "(?i)a+A", "[Aa][Aa]+" },
202	{ "(?i)A(?-i)a", "[Aa]a" },
203	{ "(?i)a+(?-i)A", "[Aa]+A" },
204	// The second element is a literal string that begins with the literal:
205	{ "a*aa", "aa+" },
206	{ "a*aab", "aa+b" },
207	// FoldCase works, but must be consistent:
208	{ "(?i)a*aa", "[Aa][Aa]+" },
209	{ "(?i)a*aab", "[Aa][Aa]+[Bb]" },
210	{ "(?i)a(?-i)aa", "[Aa]aa" },
211	{ "(?i)a(?-i)aab", "[Aa]aab" },
212	// Negative tests with mismatching ops:
213	{ "ab", "ab" },
214	{ "\\d\\D", "[0-9][^0-9]" },
215	{ "a+b", "a+b" },
216	{ "\\d+\\D", "[0-9]+[^0-9]" },
217	{ "a?bb", "a?bb" },
218	// Negative tests with capturing groups:
219	{ "(a)a", "(a)a" },
220	{ "a+(a)", "a+(a)" },
221	{ "(a?)(aa)", "(a?)(aa)" },
222	// Just for fun:
223	{ "aa*aa+aa?aa{2}aaa{2,}aaa{2,3}a", "aaaaaaaaaaaaaaaa+" },
224
225	// During coalescing, the child of the repeat changes, so we build a new
226	// repeat. The new repeat must have the min and max of the old repeat.
227	// Failure to copy them results in min=0 and max=0 -> empty match.
228	{ "(?:a*aab){2}", "aa+baa+b" },
229
230	// During coalescing, the child of the capture changes, so we build a new
231	// capture. The new capture must have the cap of the old capture.
232	// Failure to copy it results in cap=0 -> ToString() logs a fatal error.
233	{ "(a*aab)", "(aa+b)" },
234
235	// Test squashing of , ++, ?? et cetera.
236	{ "(?:(?:a){0,}){0,}", "a*" },
237	{ "(?:(?:a){1,}){1,}", "a+" },
238	{ "(?:(?:a){0,1}){0,1}", "a?" },
239	{ "(?:(?:a){0,}){1,}", "a*" },
240	{ "(?:(?:a){0,}){0,1}", "a*" },
241	{ "(?:(?:a){1,}){0,}", "a*" },
242	{ "(?:(?:a){1,}){0,1}", "a*" },
243	{ "(?:(?:a){0,1}){0,}", "a*" },
244	{ "(?:(?:a){0,1}){1,}", "a*" },
245	};
246
247	TEST(TestSimplify, SimpleRegexps) {
248	for (size_t i = `0`; i < arraysize(tests); i++) {
249	RegexpStatus status;
250	VLOG(`1`) << "Testing " << tests[i].regexp;
251	Regexp* re = Regexp::Parse(tests[i].regexp,
252	Regexp::MatchNL \| (Regexp::LikePerl &
253	~Regexp::OneLine),
254	&status);
255	ASSERT_TRUE(re != NULL) << " " << tests[i].regexp << " " << status.Text();
256	Regexp* sre = re->Simplify();
257	ASSERT_TRUE(sre != NULL);
258
259	// Check that already-simple regexps don't allocate new ones.
260	if (strcmp(tests[i].regexp, tests[i].simplified) == `0`) {
261	ASSERT_TRUE(re == sre) << " " << tests[i].regexp
262	<< " " << re->ToString() << " " << sre->ToString();
263	}
264
265	EXPECT_EQ(tests[i].simplified, sre->ToString())
266	<< " " << tests[i].regexp << " " << sre->Dump();
267
268	re->Decref();
269	sre->Decref();
270	}
271	}
272
273	} // namespace re2
274

Browse the source code of RE2/re2/testing/simplify_test.cc