1 | // Copyright 2006-2007 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | #include "util/test.h" |
6 | #include "re2/prog.h" |
7 | #include "re2/regexp.h" |
8 | #include "re2/testing/tester.h" |
9 | #include "re2/testing/exhaustive_tester.h" |
10 | |
// LOGGING selects the extra per-test logging path in the test below.
// It defaults to 0 (off) unless the build has already defined it;
// the original note ties this to the `log' target in the Makefile.
#if !defined(LOGGING)
#define LOGGING 0
#endif
15 | |
16 | namespace re2 { |
17 | |
// One search test case: a regular expression paired with the text
// it is run against.  Both members are plain C strings.
struct RegexpTest {
  // Regular expression source (C-string escaped, so "\\b" means \b).
  const char* regexp;

  // Text handed to the tester together with `regexp`.
  const char* text;
};
22 | |
23 | RegexpTest simple_tests[] = { |
24 | { "a" , "a" }, |
25 | { "a" , "zyzzyva" }, |
26 | { "a+" , "aa" }, |
27 | { "(a+|b)+" , "ab" }, |
28 | { "ab|cd" , "xabcdx" }, |
29 | { "h.*od?" , "hello\ngoodbye\n" }, |
30 | { "h.*o" , "hello\ngoodbye\n" }, |
31 | { "h.*o" , "goodbye\nhello\n" }, |
32 | { "h.*o" , "hello world" }, |
33 | { "h.*o" , "othello, world" }, |
34 | { "[^\\s\\S]" , "aaaaaaa" }, |
35 | { "a" , "aaaaaaa" }, |
36 | { "a*" , "aaaaaaa" }, |
37 | { "a*" , "" }, |
38 | { "ab|cd" , "xabcdx" }, |
39 | { "a" , "cab" }, |
40 | { "a*b" , "cab" }, |
41 | { "((((((((((((((((((((x))))))))))))))))))))" , "x" }, |
42 | { "[abcd]" , "xxxabcdxxx" }, |
43 | { "[^x]" , "xxxabcdxxx" }, |
44 | { "[abcd]+" , "xxxabcdxxx" }, |
45 | { "[^x]+" , "xxxabcdxxx" }, |
46 | { "(fo|foo)" , "fo" }, |
47 | { "(foo|fo)" , "foo" }, |
48 | |
49 | { "aa" , "aA" }, |
50 | { "a" , "Aa" }, |
51 | { "a" , "A" }, |
52 | { "ABC" , "abc" }, |
53 | { "abc" , "XABCY" }, |
54 | { "ABC" , "xabcy" }, |
55 | |
56 | // Make sure ^ and $ work. |
57 | // The pathological cases didn't work |
58 | // in the original grep code. |
59 | { "foo|bar|[A-Z]" , "foo" }, |
60 | { "^(foo|bar|[A-Z])" , "foo" }, |
61 | { "(foo|bar|[A-Z])$" , "foo\n" }, |
62 | { "(foo|bar|[A-Z])$" , "foo" }, |
63 | { "^(foo|bar|[A-Z])$" , "foo\n" }, |
64 | { "^(foo|bar|[A-Z])$" , "foo" }, |
65 | { "^(foo|bar|[A-Z])$" , "bar" }, |
66 | { "^(foo|bar|[A-Z])$" , "X" }, |
67 | { "^(foo|bar|[A-Z])$" , "XY" }, |
68 | { "^(fo|foo)$" , "fo" }, |
69 | { "^(fo|foo)$" , "foo" }, |
70 | { "^^(fo|foo)$" , "fo" }, |
71 | { "^^(fo|foo)$" , "foo" }, |
72 | { "^$" , "" }, |
73 | { "^$" , "x" }, |
74 | { "^^$" , "" }, |
75 | { "^$$" , "" }, |
76 | { "^^$" , "x" }, |
77 | { "^$$" , "x" }, |
78 | { "^^$$" , "" }, |
79 | { "^^$$" , "x" }, |
80 | { "^^^^^^^^$$$$$$$$" , "" }, |
81 | { "^" , "x" }, |
82 | { "$" , "x" }, |
83 | |
84 | // Word boundaries. |
85 | { "\\bfoo\\b" , "nofoo foo that" }, |
86 | { "a\\b" , "faoa x" }, |
87 | { "\\bbar" , "bar x" }, |
88 | { "\\bbar" , "foo\nbar x" }, |
89 | { "bar\\b" , "foobar" }, |
90 | { "bar\\b" , "foobar\nxxx" }, |
91 | { "(foo|bar|[A-Z])\\b" , "foo" }, |
92 | { "(foo|bar|[A-Z])\\b" , "foo\n" }, |
93 | { "\\b" , "" }, |
94 | { "\\b" , "x" }, |
95 | { "\\b(foo|bar|[A-Z])" , "foo" }, |
96 | { "\\b(foo|bar|[A-Z])\\b" , "X" }, |
97 | { "\\b(foo|bar|[A-Z])\\b" , "XY" }, |
98 | { "\\b(foo|bar|[A-Z])\\b" , "bar" }, |
99 | { "\\b(foo|bar|[A-Z])\\b" , "foo" }, |
100 | { "\\b(foo|bar|[A-Z])\\b" , "foo\n" }, |
101 | { "\\b(foo|bar|[A-Z])\\b" , "ffoo bbar N x" }, |
102 | { "\\b(fo|foo)\\b" , "fo" }, |
103 | { "\\b(fo|foo)\\b" , "foo" }, |
104 | { "\\b\\b" , "" }, |
105 | { "\\b\\b" , "x" }, |
106 | { "\\b$" , "" }, |
107 | { "\\b$" , "x" }, |
108 | { "\\b$" , "y x" }, |
109 | { "\\b.$" , "x" }, |
110 | { "^\\b(fo|foo)\\b" , "fo" }, |
111 | { "^\\b(fo|foo)\\b" , "foo" }, |
112 | { "^\\b" , "" }, |
113 | { "^\\b" , "x" }, |
114 | { "^\\b\\b" , "" }, |
115 | { "^\\b\\b" , "x" }, |
116 | { "^\\b$" , "" }, |
117 | { "^\\b$" , "x" }, |
118 | { "^\\b.$" , "x" }, |
119 | { "^\\b.\\b$" , "x" }, |
120 | { "^^^^^^^^\\b$$$$$$$" , "" }, |
121 | { "^^^^^^^^\\b.$$$$$$" , "x" }, |
122 | { "^^^^^^^^\\b$$$$$$$" , "x" }, |
123 | |
124 | // Non-word boundaries. |
125 | { "\\Bfoo\\B" , "n foo xfoox that" }, |
126 | { "a\\B" , "faoa x" }, |
127 | { "\\Bbar" , "bar x" }, |
128 | { "\\Bbar" , "foo\nbar x" }, |
129 | { "bar\\B" , "foobar" }, |
130 | { "bar\\B" , "foobar\nxxx" }, |
131 | { "(foo|bar|[A-Z])\\B" , "foox" }, |
132 | { "(foo|bar|[A-Z])\\B" , "foo\n" }, |
133 | { "\\B" , "" }, |
134 | { "\\B" , "x" }, |
135 | { "\\B(foo|bar|[A-Z])" , "foo" }, |
136 | { "\\B(foo|bar|[A-Z])\\B" , "xXy" }, |
137 | { "\\B(foo|bar|[A-Z])\\B" , "XY" }, |
138 | { "\\B(foo|bar|[A-Z])\\B" , "XYZ" }, |
139 | { "\\B(foo|bar|[A-Z])\\B" , "abara" }, |
140 | { "\\B(foo|bar|[A-Z])\\B" , "xfoo_" }, |
141 | { "\\B(foo|bar|[A-Z])\\B" , "xfoo\n" }, |
142 | { "\\B(foo|bar|[A-Z])\\B" , "foo bar vNx" }, |
143 | { "\\B(fo|foo)\\B" , "xfoo" }, |
144 | { "\\B(foo|fo)\\B" , "xfooo" }, |
145 | { "\\B\\B" , "" }, |
146 | { "\\B\\B" , "x" }, |
147 | { "\\B$" , "" }, |
148 | { "\\B$" , "x" }, |
149 | { "\\B$" , "y x" }, |
150 | { "\\B.$" , "x" }, |
151 | { "^\\B(fo|foo)\\B" , "fo" }, |
152 | { "^\\B(fo|foo)\\B" , "foo" }, |
153 | { "^\\B" , "" }, |
154 | { "^\\B" , "x" }, |
155 | { "^\\B\\B" , "" }, |
156 | { "^\\B\\B" , "x" }, |
157 | { "^\\B$" , "" }, |
158 | { "^\\B$" , "x" }, |
159 | { "^\\B.$" , "x" }, |
160 | { "^\\B.\\B$" , "x" }, |
161 | { "^^^^^^^^\\B$$$$$$$" , "" }, |
162 | { "^^^^^^^^\\B.$$$$$$" , "x" }, |
163 | { "^^^^^^^^\\B$$$$$$$" , "x" }, |
164 | |
165 | // PCRE uses only ASCII for \b computation. |
166 | // All non-ASCII are *not* word characters. |
167 | { "\\bx\\b" , "x" }, |
168 | { "\\bx\\b" , "x>" }, |
169 | { "\\bx\\b" , "<x" }, |
170 | { "\\bx\\b" , "<x>" }, |
171 | { "\\bx\\b" , "ax" }, |
172 | { "\\bx\\b" , "xb" }, |
173 | { "\\bx\\b" , "axb" }, |
174 | { "\\bx\\b" , "«x" }, |
175 | { "\\bx\\b" , "x»" }, |
176 | { "\\bx\\b" , "«x»" }, |
177 | { "\\bx\\b" , "axb" }, |
178 | { "\\bx\\b" , "áxβ" }, |
179 | { "\\Bx\\B" , "axb" }, |
180 | { "\\Bx\\B" , "áxβ" }, |
181 | |
182 | // Weird boundary cases. |
183 | { "^$^$" , "" }, |
184 | { "^$^" , "" }, |
185 | { "$^$" , "" }, |
186 | |
187 | { "^$^$" , "x" }, |
188 | { "^$^" , "x" }, |
189 | { "$^$" , "x" }, |
190 | |
191 | { "^$^$" , "x\ny" }, |
192 | { "^$^" , "x\ny" }, |
193 | { "$^$" , "x\ny" }, |
194 | |
195 | { "^$^$" , "x\n\ny" }, |
196 | { "^$^" , "x\n\ny" }, |
197 | { "$^$" , "x\n\ny" }, |
198 | |
199 | { "^(foo\\$)$" , "foo$bar" }, |
200 | { "(foo\\$)" , "foo$bar" }, |
201 | { "^...$" , "abc" }, |
202 | |
203 | // UTF-8 |
204 | { "^\xe6\x9c\xac$" , "\xe6\x9c\xac" }, |
205 | { "^...$" , "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, |
206 | { "^...$" , ".\xe6\x9c\xac." }, |
207 | |
208 | { "^\\C\\C\\C$" , "\xe6\x9c\xac" }, |
209 | { "^\\C$" , "\xe6\x9c\xac" }, |
210 | { "^\\C\\C\\C$" , "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, |
211 | |
212 | // Latin1 |
213 | { "^...$" , "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, |
214 | { "^.........$" , "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, |
215 | { "^...$" , ".\xe6\x9c\xac." }, |
216 | { "^.....$" , ".\xe6\x9c\xac." }, |
217 | |
218 | // Perl v Posix |
219 | { "\\B(fo|foo)\\B" , "xfooo" }, |
220 | { "(fo|foo)" , "foo" }, |
221 | |
222 | // Octal escapes. |
223 | { "\\141" , "a" }, |
224 | { "\\060" , "0" }, |
225 | { "\\0600" , "00" }, |
226 | { "\\608" , "08" }, |
227 | { "\\01" , "\01" }, |
228 | { "\\018" , "\01" "8" }, |
229 | |
230 | // Hexadecimal escapes |
231 | { "\\x{61}" , "a" }, |
232 | { "\\x61" , "a" }, |
233 | { "\\x{00000061}" , "a" }, |
234 | |
235 | // Unicode scripts. |
236 | { "\\p{Greek}+" , "aαβb" }, |
237 | { "\\P{Greek}+" , "aαβb" }, |
238 | { "\\p{^Greek}+" , "aαβb" }, |
239 | { "\\P{^Greek}+" , "aαβb" }, |
240 | |
241 | // Unicode properties. Nd is decimal number. N is any number. |
242 | { "[^0-9]+" , "abc123" }, |
243 | { "\\p{Nd}+" , "abc123²³¼½¾₀₉" }, |
244 | { "\\p{^Nd}+" , "abc123²³¼½¾₀₉" }, |
245 | { "\\P{Nd}+" , "abc123²³¼½¾₀₉" }, |
246 | { "\\P{^Nd}+" , "abc123²³¼½¾₀₉" }, |
247 | { "\\pN+" , "abc123²³¼½¾₀₉" }, |
248 | { "\\p{N}+" , "abc123²³¼½¾₀₉" }, |
249 | { "\\p{^N}+" , "abc123²³¼½¾₀₉" }, |
250 | |
251 | { "\\p{Any}+" , "abc123" }, |
252 | |
253 | // Character classes & case folding. |
254 | { "(?i)[@-A]+" , "@AaB" }, // matches @Aa but not B |
255 | { "(?i)[A-Z]+" , "aAzZ" }, |
256 | { "(?i)[^\\\\]+" , "Aa\\" }, // \\ is between A-Z and a-z - |
257 | // splits the ranges in an interesting way. |
258 | |
259 | // would like to use, but PCRE mishandles in full-match, non-greedy mode |
260 | // { "(?i)[\\\\]+", "Aa" }, |
261 | |
262 | { "(?i)[acegikmoqsuwy]+" , "acegikmoqsuwyACEGIKMOQSUWY" }, |
263 | |
264 | // Character classes & case folding. |
265 | { "[@-A]+" , "@AaB" }, |
266 | { "[A-Z]+" , "aAzZ" }, |
267 | { "[^\\\\]+" , "Aa\\" }, |
268 | { "[acegikmoqsuwy]+" , "acegikmoqsuwyACEGIKMOQSUWY" }, |
269 | |
270 | // Anchoring. (^abc in aabcdef was a former bug) |
271 | // The tester checks for a match in the text and |
272 | // subpieces of the text with a byte removed on either side. |
273 | { "^abc" , "abcdef" }, |
274 | { "^abc" , "aabcdef" }, |
275 | { "^[ay]*[bx]+c" , "abcdef" }, |
276 | { "^[ay]*[bx]+c" , "aabcdef" }, |
277 | { "def$" , "abcdef" }, |
278 | { "def$" , "abcdeff" }, |
279 | { "d[ex][fy]$" , "abcdef" }, |
280 | { "d[ex][fy]$" , "abcdeff" }, |
281 | { "[dz][ex][fy]$" , "abcdef" }, |
282 | { "[dz][ex][fy]$" , "abcdeff" }, |
283 | { "(?m)^abc" , "abcdef" }, |
284 | { "(?m)^abc" , "aabcdef" }, |
285 | { "(?m)^[ay]*[bx]+c" , "abcdef" }, |
286 | { "(?m)^[ay]*[bx]+c" , "aabcdef" }, |
287 | { "(?m)def$" , "abcdef" }, |
288 | { "(?m)def$" , "abcdeff" }, |
289 | { "(?m)d[ex][fy]$" , "abcdef" }, |
290 | { "(?m)d[ex][fy]$" , "abcdeff" }, |
291 | { "(?m)[dz][ex][fy]$" , "abcdef" }, |
292 | { "(?m)[dz][ex][fy]$" , "abcdeff" }, |
293 | { "^" , "a" }, |
294 | { "^^" , "a" }, |
295 | |
296 | // Context. |
297 | // The tester checks for a match in the text and |
298 | // subpieces of the text with a byte removed on either side. |
299 | { "a" , "a" }, |
300 | { "ab*" , "a" }, |
301 | { "a\\C*" , "a" }, |
302 | { "a\\C+" , "a" }, |
303 | { "a\\C?" , "a" }, |
304 | { "a\\C*?" , "a" }, |
305 | { "a\\C+?" , "a" }, |
306 | { "a\\C??" , "a" }, |
307 | |
308 | // Former bugs. |
309 | { "a\\C*|ba\\C" , "baba" }, |
310 | { "\\w*I\\w*" , "Inc." }, |
311 | }; |
312 | |
313 | TEST(Regexp, SearchTests) { |
314 | int failures = 0; |
315 | for (size_t i = 0; i < arraysize(simple_tests); i++) { |
316 | const RegexpTest& t = simple_tests[i]; |
317 | if (!TestRegexpOnText(t.regexp, t.text)) |
318 | failures++; |
319 | |
320 | if (LOGGING) { |
321 | // Build a dummy ExhaustiveTest call that will trigger just |
322 | // this one test, so that we log the test case. |
323 | std::vector<std::string> atom, alpha, ops; |
324 | atom.push_back(t.regexp); |
325 | alpha.push_back(t.text); |
326 | ExhaustiveTest(1, 0, atom, ops, 1, alpha, "" , "" ); |
327 | } |
328 | } |
329 | EXPECT_EQ(failures, 0); |
330 | } |
331 | |
332 | } // namespace re2 |
333 | |