1 | // -*- coding: utf-8 -*- |
2 | // Copyright 2002-2009 The RE2 Authors. All Rights Reserved. |
3 | // Use of this source code is governed by a BSD-style |
4 | // license that can be found in the LICENSE file. |
5 | |
6 | // TODO: Test extractions for PartialMatch/Consume |
7 | |
8 | #include <errno.h> |
9 | #include <stddef.h> |
10 | #include <stdint.h> |
11 | #include <string.h> |
12 | #include <map> |
13 | #include <string> |
14 | #include <utility> |
15 | #if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__) |
16 | #include <sys/mman.h> |
17 | #include <unistd.h> /* for sysconf */ |
18 | #endif |
19 | |
20 | #include "util/test.h" |
21 | #include "util/logging.h" |
22 | #include "util/strutil.h" |
23 | #include "re2/re2.h" |
24 | #include "re2/regexp.h" |
25 | |
26 | namespace re2 { |
27 | |
28 | TEST(RE2, HexTests) { |
29 | #define ASSERT_HEX(type, value) \ |
30 | do { \ |
31 | type v; \ |
32 | ASSERT_TRUE( \ |
33 | RE2::FullMatch(#value, "([0-9a-fA-F]+)[uUlL]*", RE2::Hex(&v))); \ |
34 | ASSERT_EQ(v, 0x##value); \ |
35 | ASSERT_TRUE(RE2::FullMatch("0x" #value, "([0-9a-fA-FxX]+)[uUlL]*", \ |
36 | RE2::CRadix(&v))); \ |
37 | ASSERT_EQ(v, 0x##value); \ |
38 | } while (0) |
39 | |
40 | ASSERT_HEX(short, 2bad); |
41 | ASSERT_HEX(unsigned short, 2badU); |
42 | ASSERT_HEX(int, dead); |
43 | ASSERT_HEX(unsigned int, deadU); |
44 | ASSERT_HEX(long, 7eadbeefL); |
45 | ASSERT_HEX(unsigned long, deadbeefUL); |
46 | ASSERT_HEX(long long, 12345678deadbeefLL); |
47 | ASSERT_HEX(unsigned long long, cafebabedeadbeefULL); |
48 | |
49 | #undef ASSERT_HEX |
50 | } |
51 | |
52 | TEST(RE2, OctalTests) { |
53 | #define ASSERT_OCTAL(type, value) \ |
54 | do { \ |
55 | type v; \ |
56 | ASSERT_TRUE(RE2::FullMatch(#value, "([0-7]+)[uUlL]*", RE2::Octal(&v))); \ |
57 | ASSERT_EQ(v, 0##value); \ |
58 | ASSERT_TRUE(RE2::FullMatch("0" #value, "([0-9a-fA-FxX]+)[uUlL]*", \ |
59 | RE2::CRadix(&v))); \ |
60 | ASSERT_EQ(v, 0##value); \ |
61 | } while (0) |
62 | |
63 | ASSERT_OCTAL(short, 77777); |
64 | ASSERT_OCTAL(unsigned short, 177777U); |
65 | ASSERT_OCTAL(int, 17777777777); |
66 | ASSERT_OCTAL(unsigned int, 37777777777U); |
67 | ASSERT_OCTAL(long, 17777777777L); |
68 | ASSERT_OCTAL(unsigned long, 37777777777UL); |
69 | ASSERT_OCTAL(long long, 777777777777777777777LL); |
70 | ASSERT_OCTAL(unsigned long long, 1777777777777777777777ULL); |
71 | |
72 | #undef ASSERT_OCTAL |
73 | } |
74 | |
75 | TEST(RE2, DecimalTests) { |
76 | #define ASSERT_DECIMAL(type, value) \ |
77 | do { \ |
78 | type v; \ |
79 | ASSERT_TRUE(RE2::FullMatch(#value, "(-?[0-9]+)[uUlL]*", &v)); \ |
80 | ASSERT_EQ(v, value); \ |
81 | ASSERT_TRUE( \ |
82 | RE2::FullMatch(#value, "(-?[0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \ |
83 | ASSERT_EQ(v, value); \ |
84 | } while (0) |
85 | |
86 | ASSERT_DECIMAL(short, -1); |
87 | ASSERT_DECIMAL(unsigned short, 9999); |
88 | ASSERT_DECIMAL(int, -1000); |
89 | ASSERT_DECIMAL(unsigned int, 12345U); |
90 | ASSERT_DECIMAL(long, -10000000L); |
91 | ASSERT_DECIMAL(unsigned long, 3083324652U); |
92 | ASSERT_DECIMAL(long long, -100000000000000LL); |
93 | ASSERT_DECIMAL(unsigned long long, 1234567890987654321ULL); |
94 | |
95 | #undef ASSERT_DECIMAL |
96 | } |
97 | |
98 | TEST(RE2, Replace) { |
99 | struct ReplaceTest { |
100 | const char *regexp; |
101 | const char *rewrite; |
102 | const char *original; |
103 | const char *single; |
104 | const char *global; |
105 | int greplace_count; |
106 | }; |
107 | static const ReplaceTest tests[] = { |
108 | { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)" , |
109 | "\\2\\1ay" , |
110 | "the quick brown fox jumps over the lazy dogs." , |
111 | "ethay quick brown fox jumps over the lazy dogs." , |
112 | "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday." , |
113 | 9 }, |
114 | { "\\w+" , |
115 | "\\0-NOSPAM" , |
116 | "abcd.efghi@google.com" , |
117 | "abcd-NOSPAM.efghi@google.com" , |
118 | "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM" , |
119 | 4 }, |
120 | { "^" , |
121 | "(START)" , |
122 | "foo" , |
123 | "(START)foo" , |
124 | "(START)foo" , |
125 | 1 }, |
126 | { "^" , |
127 | "(START)" , |
128 | "" , |
129 | "(START)" , |
130 | "(START)" , |
131 | 1 }, |
132 | { "$" , |
133 | "(END)" , |
134 | "" , |
135 | "(END)" , |
136 | "(END)" , |
137 | 1 }, |
138 | { "b" , |
139 | "bb" , |
140 | "ababababab" , |
141 | "abbabababab" , |
142 | "abbabbabbabbabb" , |
143 | 5 }, |
144 | { "b" , |
145 | "bb" , |
146 | "bbbbbb" , |
147 | "bbbbbbb" , |
148 | "bbbbbbbbbbbb" , |
149 | 6 }, |
150 | { "b+" , |
151 | "bb" , |
152 | "bbbbbb" , |
153 | "bb" , |
154 | "bb" , |
155 | 1 }, |
156 | { "b*" , |
157 | "bb" , |
158 | "bbbbbb" , |
159 | "bb" , |
160 | "bb" , |
161 | 1 }, |
162 | { "b*" , |
163 | "bb" , |
164 | "aaaaa" , |
165 | "bbaaaaa" , |
166 | "bbabbabbabbabbabb" , |
167 | 6 }, |
168 | // Check newline handling |
169 | { "a.*a" , |
170 | "(\\0)" , |
171 | "aba\naba" , |
172 | "(aba)\naba" , |
173 | "(aba)\n(aba)" , |
174 | 2 }, |
175 | { "" , NULL, NULL, NULL, NULL, 0 } |
176 | }; |
177 | |
178 | for (const ReplaceTest* t = tests; t->original != NULL; t++) { |
179 | std::string one(t->original); |
180 | ASSERT_TRUE(RE2::Replace(&one, t->regexp, t->rewrite)); |
181 | ASSERT_EQ(one, t->single); |
182 | std::string all(t->original); |
183 | ASSERT_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count) |
184 | << "Got: " << all; |
185 | ASSERT_EQ(all, t->global); |
186 | } |
187 | } |
188 | |
189 | static void TestCheckRewriteString(const char* regexp, const char* rewrite, |
190 | bool expect_ok) { |
191 | std::string error; |
192 | RE2 exp(regexp); |
193 | bool actual_ok = exp.CheckRewriteString(rewrite, &error); |
194 | EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error; |
195 | } |
196 | |
197 | TEST(CheckRewriteString, all) { |
198 | TestCheckRewriteString("abc" , "foo" , true); |
199 | TestCheckRewriteString("abc" , "foo\\" , false); |
200 | TestCheckRewriteString("abc" , "foo\\0bar" , true); |
201 | |
202 | TestCheckRewriteString("a(b)c" , "foo" , true); |
203 | TestCheckRewriteString("a(b)c" , "foo\\0bar" , true); |
204 | TestCheckRewriteString("a(b)c" , "foo\\1bar" , true); |
205 | TestCheckRewriteString("a(b)c" , "foo\\2bar" , false); |
206 | TestCheckRewriteString("a(b)c" , "f\\\\2o\\1o" , true); |
207 | |
208 | TestCheckRewriteString("a(b)(c)" , "foo\\12" , true); |
209 | TestCheckRewriteString("a(b)(c)" , "f\\2o\\1o" , true); |
210 | TestCheckRewriteString("a(b)(c)" , "f\\oo\\1" , false); |
211 | } |
212 | |
213 | TEST(RE2, Extract) { |
214 | std::string s; |
215 | |
216 | ASSERT_TRUE(RE2::Extract("boris@kremvax.ru" , "(.*)@([^.]*)" , "\\2!\\1" , &s)); |
217 | ASSERT_EQ(s, "kremvax!boris" ); |
218 | |
219 | ASSERT_TRUE(RE2::Extract("foo" , ".*" , "'\\0'" , &s)); |
220 | ASSERT_EQ(s, "'foo'" ); |
221 | // check that false match doesn't overwrite |
222 | ASSERT_FALSE(RE2::Extract("baz" , "bar" , "'\\0'" , &s)); |
223 | ASSERT_EQ(s, "'foo'" ); |
224 | } |
225 | |
226 | TEST(RE2, Consume) { |
227 | RE2 r("\\s*(\\w+)" ); // matches a word, possibly proceeded by whitespace |
228 | std::string word; |
229 | |
230 | std::string s(" aaa b!@#$@#$cccc" ); |
231 | StringPiece input(s); |
232 | |
233 | ASSERT_TRUE(RE2::Consume(&input, r, &word)); |
234 | ASSERT_EQ(word, "aaa" ) << " input: " << input; |
235 | ASSERT_TRUE(RE2::Consume(&input, r, &word)); |
236 | ASSERT_EQ(word, "b" ) << " input: " << input; |
237 | ASSERT_FALSE(RE2::Consume(&input, r, &word)) << " input: " << input; |
238 | } |
239 | |
240 | TEST(RE2, ConsumeN) { |
241 | const std::string s(" one two three 4" ); |
242 | StringPiece input(s); |
243 | |
244 | RE2::Arg argv[2]; |
245 | const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; |
246 | |
247 | // 0 arg |
248 | EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)" , args, 0)); // Skips "one". |
249 | |
250 | // 1 arg |
251 | std::string word; |
252 | argv[0] = &word; |
253 | EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)" , args, 1)); |
254 | EXPECT_EQ("two" , word); |
255 | |
256 | // Multi-args |
257 | int n; |
258 | argv[1] = &n; |
259 | EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)\\s*(\\d+)" , args, 2)); |
260 | EXPECT_EQ("three" , word); |
261 | EXPECT_EQ(4, n); |
262 | } |
263 | |
264 | TEST(RE2, FindAndConsume) { |
265 | RE2 r("(\\w+)" ); // matches a word |
266 | std::string word; |
267 | |
268 | std::string s(" aaa b!@#$@#$cccc" ); |
269 | StringPiece input(s); |
270 | |
271 | ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word)); |
272 | ASSERT_EQ(word, "aaa" ); |
273 | ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word)); |
274 | ASSERT_EQ(word, "b" ); |
275 | ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word)); |
276 | ASSERT_EQ(word, "cccc" ); |
277 | ASSERT_FALSE(RE2::FindAndConsume(&input, r, &word)); |
278 | |
279 | // Check that FindAndConsume works without any submatches. |
280 | // Earlier version used uninitialized data for |
281 | // length to consume. |
282 | input = "aaa" ; |
283 | ASSERT_TRUE(RE2::FindAndConsume(&input, "aaa" )); |
284 | ASSERT_EQ(input, "" ); |
285 | } |
286 | |
287 | TEST(RE2, FindAndConsumeN) { |
288 | const std::string s(" one two three 4" ); |
289 | StringPiece input(s); |
290 | |
291 | RE2::Arg argv[2]; |
292 | const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; |
293 | |
294 | // 0 arg |
295 | EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)" , args, 0)); // Skips "one". |
296 | |
297 | // 1 arg |
298 | std::string word; |
299 | argv[0] = &word; |
300 | EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)" , args, 1)); |
301 | EXPECT_EQ("two" , word); |
302 | |
303 | // Multi-args |
304 | int n; |
305 | argv[1] = &n; |
306 | EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)\\s*(\\d+)" , args, 2)); |
307 | EXPECT_EQ("three" , word); |
308 | EXPECT_EQ(4, n); |
309 | } |
310 | |
311 | TEST(RE2, MatchNumberPeculiarity) { |
312 | RE2 r("(foo)|(bar)|(baz)" ); |
313 | std::string word1; |
314 | std::string word2; |
315 | std::string word3; |
316 | |
317 | ASSERT_TRUE(RE2::PartialMatch("foo" , r, &word1, &word2, &word3)); |
318 | ASSERT_EQ(word1, "foo" ); |
319 | ASSERT_EQ(word2, "" ); |
320 | ASSERT_EQ(word3, "" ); |
321 | ASSERT_TRUE(RE2::PartialMatch("bar" , r, &word1, &word2, &word3)); |
322 | ASSERT_EQ(word1, "" ); |
323 | ASSERT_EQ(word2, "bar" ); |
324 | ASSERT_EQ(word3, "" ); |
325 | ASSERT_TRUE(RE2::PartialMatch("baz" , r, &word1, &word2, &word3)); |
326 | ASSERT_EQ(word1, "" ); |
327 | ASSERT_EQ(word2, "" ); |
328 | ASSERT_EQ(word3, "baz" ); |
329 | ASSERT_FALSE(RE2::PartialMatch("f" , r, &word1, &word2, &word3)); |
330 | |
331 | std::string a; |
332 | ASSERT_TRUE(RE2::FullMatch("hello" , "(foo)|hello" , &a)); |
333 | ASSERT_EQ(a, "" ); |
334 | } |
335 | |
336 | TEST(RE2, Match) { |
337 | RE2 re("((\\w+):([0-9]+))" ); // extracts host and port |
338 | StringPiece group[4]; |
339 | |
340 | // No match. |
341 | StringPiece s = "zyzzyva" ; |
342 | ASSERT_FALSE( |
343 | re.Match(s, 0, s.size(), RE2::UNANCHORED, group, arraysize(group))); |
344 | |
345 | // Matches and extracts. |
346 | s = "a chrisr:9000 here" ; |
347 | ASSERT_TRUE( |
348 | re.Match(s, 0, s.size(), RE2::UNANCHORED, group, arraysize(group))); |
349 | ASSERT_EQ(group[0], "chrisr:9000" ); |
350 | ASSERT_EQ(group[1], "chrisr:9000" ); |
351 | ASSERT_EQ(group[2], "chrisr" ); |
352 | ASSERT_EQ(group[3], "9000" ); |
353 | |
354 | std::string all, host; |
355 | int port; |
356 | ASSERT_TRUE(RE2::PartialMatch("a chrisr:9000 here" , re, &all, &host, &port)); |
357 | ASSERT_EQ(all, "chrisr:9000" ); |
358 | ASSERT_EQ(host, "chrisr" ); |
359 | ASSERT_EQ(port, 9000); |
360 | } |
361 | |
362 | static void TestRecursion(int size, const char* pattern) { |
363 | // Fill up a string repeating the pattern given |
364 | std::string domain; |
365 | domain.resize(size); |
366 | size_t patlen = strlen(pattern); |
367 | for (int i = 0; i < size; i++) { |
368 | domain[i] = pattern[i % patlen]; |
369 | } |
370 | // Just make sure it doesn't crash due to too much recursion. |
371 | RE2 re("([a-zA-Z0-9]|-)+(\\.([a-zA-Z0-9]|-)+)*(\\.)?" , RE2::Quiet); |
372 | RE2::FullMatch(domain, re); |
373 | } |
374 | |
375 | // A meta-quoted string, interpreted as a pattern, should always match |
376 | // the original unquoted string. |
377 | static void TestQuoteMeta(const std::string& unquoted, |
378 | const RE2::Options& options = RE2::DefaultOptions) { |
379 | std::string quoted = RE2::QuoteMeta(unquoted); |
380 | RE2 re(quoted, options); |
381 | EXPECT_TRUE(RE2::FullMatch(unquoted, re)) |
382 | << "Unquoted='" << unquoted << "', quoted='" << quoted << "'." ; |
383 | } |
384 | |
385 | // A meta-quoted string, interpreted as a pattern, should always match |
386 | // the original unquoted string. |
387 | static void NegativeTestQuoteMeta( |
388 | const std::string& unquoted, const std::string& should_not_match, |
389 | const RE2::Options& options = RE2::DefaultOptions) { |
390 | std::string quoted = RE2::QuoteMeta(unquoted); |
391 | RE2 re(quoted, options); |
392 | EXPECT_FALSE(RE2::FullMatch(should_not_match, re)) |
393 | << "Unquoted='" << unquoted << "', quoted='" << quoted << "'." ; |
394 | } |
395 | |
396 | // Tests that quoted meta characters match their original strings, |
397 | // and that a few things that shouldn't match indeed do not. |
398 | TEST(QuoteMeta, Simple) { |
399 | TestQuoteMeta("foo" ); |
400 | TestQuoteMeta("foo.bar" ); |
401 | TestQuoteMeta("foo\\.bar" ); |
402 | TestQuoteMeta("[1-9]" ); |
403 | TestQuoteMeta("1.5-2.0?" ); |
404 | TestQuoteMeta("\\d" ); |
405 | TestQuoteMeta("Who doesn't like ice cream?" ); |
406 | TestQuoteMeta("((a|b)c?d*e+[f-h]i)" ); |
407 | TestQuoteMeta("((?!)xxx).*yyy" ); |
408 | TestQuoteMeta("([" ); |
409 | } |
410 | TEST(QuoteMeta, SimpleNegative) { |
411 | NegativeTestQuoteMeta("foo" , "bar" ); |
412 | NegativeTestQuoteMeta("..." , "bar" ); |
413 | NegativeTestQuoteMeta("\\." , "." ); |
414 | NegativeTestQuoteMeta("\\." , ".." ); |
415 | NegativeTestQuoteMeta("(a)" , "a" ); |
416 | NegativeTestQuoteMeta("(a|b)" , "a" ); |
417 | NegativeTestQuoteMeta("(a|b)" , "(a)" ); |
418 | NegativeTestQuoteMeta("(a|b)" , "a|b" ); |
419 | NegativeTestQuoteMeta("[0-9]" , "0" ); |
420 | NegativeTestQuoteMeta("[0-9]" , "0-9" ); |
421 | NegativeTestQuoteMeta("[0-9]" , "[9]" ); |
422 | NegativeTestQuoteMeta("((?!)xxx)" , "xxx" ); |
423 | } |
424 | |
425 | TEST(QuoteMeta, Latin1) { |
426 | TestQuoteMeta("3\xb2 = 9" , RE2::Latin1); |
427 | } |
428 | |
429 | TEST(QuoteMeta, UTF8) { |
430 | TestQuoteMeta("Plácido Domingo" ); |
431 | TestQuoteMeta("xyz" ); // No fancy utf8. |
432 | TestQuoteMeta("\xc2\xb0" ); // 2-byte utf8 -- a degree symbol. |
433 | TestQuoteMeta("27\xc2\xb0 degrees" ); // As a middle character. |
434 | TestQuoteMeta("\xe2\x80\xb3" ); // 3-byte utf8 -- a double prime. |
435 | TestQuoteMeta("\xf0\x9d\x85\x9f" ); // 4-byte utf8 -- a music note. |
436 | TestQuoteMeta("27\xc2\xb0" ); // Interpreted as Latin-1, this should |
437 | // still work. |
438 | NegativeTestQuoteMeta("27\xc2\xb0" , |
439 | "27\\\xc2\\\xb0" ); // 2-byte utf8 -- a degree symbol. |
440 | } |
441 | |
442 | TEST(QuoteMeta, HasNull) { |
443 | std::string has_null; |
444 | |
445 | // string with one null character |
446 | has_null += '\0'; |
447 | TestQuoteMeta(has_null); |
448 | NegativeTestQuoteMeta(has_null, "" ); |
449 | |
450 | // Don't want null-followed-by-'1' to be interpreted as '\01'. |
451 | has_null += '1'; |
452 | TestQuoteMeta(has_null); |
453 | NegativeTestQuoteMeta(has_null, "\1" ); |
454 | } |
455 | |
456 | TEST(ProgramSize, BigProgram) { |
457 | RE2 re_simple("simple regexp" ); |
458 | RE2 re_medium("medium.*regexp" ); |
459 | RE2 re_complex("complex.{1,128}regexp" ); |
460 | |
461 | ASSERT_GT(re_simple.ProgramSize(), 0); |
462 | ASSERT_GT(re_medium.ProgramSize(), re_simple.ProgramSize()); |
463 | ASSERT_GT(re_complex.ProgramSize(), re_medium.ProgramSize()); |
464 | |
465 | ASSERT_GT(re_simple.ReverseProgramSize(), 0); |
466 | ASSERT_GT(re_medium.ReverseProgramSize(), re_simple.ReverseProgramSize()); |
467 | ASSERT_GT(re_complex.ReverseProgramSize(), re_medium.ReverseProgramSize()); |
468 | } |
469 | |
470 | TEST(ProgramFanout, BigProgram) { |
471 | RE2 re1("(?:(?:(?:(?:(?:.)?){1})*)+)" ); |
472 | RE2 re10("(?:(?:(?:(?:(?:.)?){10})*)+)" ); |
473 | RE2 re100("(?:(?:(?:(?:(?:.)?){100})*)+)" ); |
474 | RE2 re1000("(?:(?:(?:(?:(?:.)?){1000})*)+)" ); |
475 | |
476 | std::map<int, int> histogram; |
477 | |
478 | // 3 is the largest non-empty bucket and has 1 element. |
479 | ASSERT_EQ(3, re1.ProgramFanout(&histogram)); |
480 | ASSERT_EQ(1, histogram[3]); |
481 | |
482 | // 7 is the largest non-empty bucket and has 10 elements. |
483 | ASSERT_EQ(7, re10.ProgramFanout(&histogram)); |
484 | ASSERT_EQ(10, histogram[7]); |
485 | |
486 | // 10 is the largest non-empty bucket and has 100 elements. |
487 | ASSERT_EQ(10, re100.ProgramFanout(&histogram)); |
488 | ASSERT_EQ(100, histogram[10]); |
489 | |
490 | // 13 is the largest non-empty bucket and has 1000 elements. |
491 | ASSERT_EQ(13, re1000.ProgramFanout(&histogram)); |
492 | ASSERT_EQ(1000, histogram[13]); |
493 | |
494 | // 2 is the largest non-empty bucket and has 3 elements. |
495 | // This differs from the others due to how reverse `.' works. |
496 | ASSERT_EQ(2, re1.ReverseProgramFanout(&histogram)); |
497 | ASSERT_EQ(3, histogram[2]); |
498 | |
499 | // 5 is the largest non-empty bucket and has 10 elements. |
500 | ASSERT_EQ(5, re10.ReverseProgramFanout(&histogram)); |
501 | ASSERT_EQ(10, histogram[5]); |
502 | |
503 | // 9 is the largest non-empty bucket and has 100 elements. |
504 | ASSERT_EQ(9, re100.ReverseProgramFanout(&histogram)); |
505 | ASSERT_EQ(100, histogram[9]); |
506 | |
507 | // 12 is the largest non-empty bucket and has 1000 elements. |
508 | ASSERT_EQ(12, re1000.ReverseProgramFanout(&histogram)); |
509 | ASSERT_EQ(1000, histogram[12]); |
510 | } |
511 | |
512 | // Issue 956519: handling empty character sets was |
513 | // causing NULL dereference. This tests a few empty character sets. |
514 | // (The way to get an empty character set is to negate a full one.) |
515 | TEST(EmptyCharset, Fuzz) { |
516 | static const char *empties[] = { |
517 | "[^\\S\\s]" , |
518 | "[^\\S[:space:]]" , |
519 | "[^\\D\\d]" , |
520 | "[^\\D[:digit:]]" |
521 | }; |
522 | for (size_t i = 0; i < arraysize(empties); i++) |
523 | ASSERT_FALSE(RE2(empties[i]).Match("abc" , 0, 3, RE2::UNANCHORED, NULL, 0)); |
524 | } |
525 | |
526 | // Bitstate assumes that kInstFail instructions in |
527 | // alternations or capture groups have been "compiled away". |
528 | TEST(EmptyCharset, BitstateAssumptions) { |
529 | // Captures trigger use of Bitstate. |
530 | static const char *nop_empties[] = { |
531 | "((((()))))" "[^\\S\\s]?" , |
532 | "((((()))))" "([^\\S\\s])?" , |
533 | "((((()))))" "([^\\S\\s]|[^\\S\\s])?" , |
534 | "((((()))))" "(([^\\S\\s]|[^\\S\\s])|)" |
535 | }; |
536 | StringPiece group[6]; |
537 | for (size_t i = 0; i < arraysize(nop_empties); i++) |
538 | ASSERT_TRUE(RE2(nop_empties[i]).Match("" , 0, 0, RE2::UNANCHORED, group, 6)); |
539 | } |
540 | |
541 | // Test that named groups work correctly. |
542 | TEST(Capture, NamedGroups) { |
543 | { |
544 | RE2 re("(hello world)" ); |
545 | ASSERT_EQ(re.NumberOfCapturingGroups(), 1); |
546 | const std::map<std::string, int>& m = re.NamedCapturingGroups(); |
547 | ASSERT_EQ(m.size(), 0); |
548 | } |
549 | |
550 | { |
551 | RE2 re("(?P<A>expr(?P<B>expr)(?P<C>expr))((expr)(?P<D>expr))" ); |
552 | ASSERT_EQ(re.NumberOfCapturingGroups(), 6); |
553 | const std::map<std::string, int>& m = re.NamedCapturingGroups(); |
554 | ASSERT_EQ(m.size(), 4); |
555 | ASSERT_EQ(m.find("A" )->second, 1); |
556 | ASSERT_EQ(m.find("B" )->second, 2); |
557 | ASSERT_EQ(m.find("C" )->second, 3); |
558 | ASSERT_EQ(m.find("D" )->second, 6); // $4 and $5 are anonymous |
559 | } |
560 | } |
561 | |
562 | TEST(RE2, CapturedGroupTest) { |
563 | RE2 re("directions from (?P<S>.*) to (?P<D>.*)" ); |
564 | int num_groups = re.NumberOfCapturingGroups(); |
565 | EXPECT_EQ(2, num_groups); |
566 | std::string args[4]; |
567 | RE2::Arg arg0(&args[0]); |
568 | RE2::Arg arg1(&args[1]); |
569 | RE2::Arg arg2(&args[2]); |
570 | RE2::Arg arg3(&args[3]); |
571 | |
572 | const RE2::Arg* const matches[4] = {&arg0, &arg1, &arg2, &arg3}; |
573 | EXPECT_TRUE(RE2::FullMatchN("directions from mountain view to san jose" , |
574 | re, matches, num_groups)); |
575 | const std::map<std::string, int>& named_groups = re.NamedCapturingGroups(); |
576 | EXPECT_TRUE(named_groups.find("S" ) != named_groups.end()); |
577 | EXPECT_TRUE(named_groups.find("D" ) != named_groups.end()); |
578 | |
579 | // The named group index is 1-based. |
580 | int source_group_index = named_groups.find("S" )->second; |
581 | int destination_group_index = named_groups.find("D" )->second; |
582 | EXPECT_EQ(1, source_group_index); |
583 | EXPECT_EQ(2, destination_group_index); |
584 | |
585 | // The args is zero-based. |
586 | EXPECT_EQ("mountain view" , args[source_group_index - 1]); |
587 | EXPECT_EQ("san jose" , args[destination_group_index - 1]); |
588 | } |
589 | |
590 | TEST(RE2, FullMatchWithNoArgs) { |
591 | ASSERT_TRUE(RE2::FullMatch("h" , "h" )); |
592 | ASSERT_TRUE(RE2::FullMatch("hello" , "hello" )); |
593 | ASSERT_TRUE(RE2::FullMatch("hello" , "h.*o" )); |
594 | ASSERT_FALSE(RE2::FullMatch("othello" , "h.*o" )); // Must be anchored at front |
595 | ASSERT_FALSE(RE2::FullMatch("hello!" , "h.*o" )); // Must be anchored at end |
596 | } |
597 | |
598 | TEST(RE2, PartialMatch) { |
599 | ASSERT_TRUE(RE2::PartialMatch("x" , "x" )); |
600 | ASSERT_TRUE(RE2::PartialMatch("hello" , "h.*o" )); |
601 | ASSERT_TRUE(RE2::PartialMatch("othello" , "h.*o" )); |
602 | ASSERT_TRUE(RE2::PartialMatch("hello!" , "h.*o" )); |
603 | ASSERT_TRUE(RE2::PartialMatch("x" , "((((((((((((((((((((x))))))))))))))))))))" )); |
604 | } |
605 | |
606 | TEST(RE2, PartialMatchN) { |
607 | RE2::Arg argv[2]; |
608 | const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; |
609 | |
610 | // 0 arg |
611 | EXPECT_TRUE(RE2::PartialMatchN("hello" , "e.*o" , args, 0)); |
612 | EXPECT_FALSE(RE2::PartialMatchN("othello" , "a.*o" , args, 0)); |
613 | |
614 | // 1 arg |
615 | int i; |
616 | argv[0] = &i; |
617 | EXPECT_TRUE(RE2::PartialMatchN("1001 nights" , "(\\d+)" , args, 1)); |
618 | EXPECT_EQ(1001, i); |
619 | EXPECT_FALSE(RE2::PartialMatchN("three" , "(\\d+)" , args, 1)); |
620 | |
621 | // Multi-arg |
622 | std::string s; |
623 | argv[1] = &s; |
624 | EXPECT_TRUE(RE2::PartialMatchN("answer: 42:life" , "(\\d+):(\\w+)" , args, 2)); |
625 | EXPECT_EQ(42, i); |
626 | EXPECT_EQ("life" , s); |
627 | EXPECT_FALSE(RE2::PartialMatchN("hi1" , "(\\w+)(1)" , args, 2)); |
628 | } |
629 | |
630 | TEST(RE2, FullMatchZeroArg) { |
631 | // Zero-arg |
632 | ASSERT_TRUE(RE2::FullMatch("1001" , "\\d+" )); |
633 | } |
634 | |
635 | TEST(RE2, FullMatchOneArg) { |
636 | int i; |
637 | |
638 | // Single-arg |
639 | ASSERT_TRUE(RE2::FullMatch("1001" , "(\\d+)" , &i)); |
640 | ASSERT_EQ(i, 1001); |
641 | ASSERT_TRUE(RE2::FullMatch("-123" , "(-?\\d+)" , &i)); |
642 | ASSERT_EQ(i, -123); |
643 | ASSERT_FALSE(RE2::FullMatch("10" , "()\\d+" , &i)); |
644 | ASSERT_FALSE( |
645 | RE2::FullMatch("1234567890123456789012345678901234567890" , "(\\d+)" , &i)); |
646 | } |
647 | |
648 | TEST(RE2, FullMatchIntegerArg) { |
649 | int i; |
650 | |
651 | // Digits surrounding integer-arg |
652 | ASSERT_TRUE(RE2::FullMatch("1234" , "1(\\d*)4" , &i)); |
653 | ASSERT_EQ(i, 23); |
654 | ASSERT_TRUE(RE2::FullMatch("1234" , "(\\d)\\d+" , &i)); |
655 | ASSERT_EQ(i, 1); |
656 | ASSERT_TRUE(RE2::FullMatch("-1234" , "(-\\d)\\d+" , &i)); |
657 | ASSERT_EQ(i, -1); |
658 | ASSERT_TRUE(RE2::PartialMatch("1234" , "(\\d)" , &i)); |
659 | ASSERT_EQ(i, 1); |
660 | ASSERT_TRUE(RE2::PartialMatch("-1234" , "(-\\d)" , &i)); |
661 | ASSERT_EQ(i, -1); |
662 | } |
663 | |
664 | TEST(RE2, FullMatchStringArg) { |
665 | std::string s; |
666 | // String-arg |
667 | ASSERT_TRUE(RE2::FullMatch("hello" , "h(.*)o" , &s)); |
668 | ASSERT_EQ(s, std::string("ell" )); |
669 | } |
670 | |
671 | TEST(RE2, FullMatchStringPieceArg) { |
672 | int i; |
673 | // StringPiece-arg |
674 | StringPiece sp; |
675 | ASSERT_TRUE(RE2::FullMatch("ruby:1234" , "(\\w+):(\\d+)" , &sp, &i)); |
676 | ASSERT_EQ(sp.size(), 4); |
677 | ASSERT_TRUE(memcmp(sp.data(), "ruby" , 4) == 0); |
678 | ASSERT_EQ(i, 1234); |
679 | } |
680 | |
681 | TEST(RE2, FullMatchMultiArg) { |
682 | int i; |
683 | std::string s; |
684 | // Multi-arg |
685 | ASSERT_TRUE(RE2::FullMatch("ruby:1234" , "(\\w+):(\\d+)" , &s, &i)); |
686 | ASSERT_EQ(s, std::string("ruby" )); |
687 | ASSERT_EQ(i, 1234); |
688 | } |
689 | |
690 | TEST(RE2, FullMatchN) { |
691 | RE2::Arg argv[2]; |
692 | const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; |
693 | |
694 | // 0 arg |
695 | EXPECT_TRUE(RE2::FullMatchN("hello" , "h.*o" , args, 0)); |
696 | EXPECT_FALSE(RE2::FullMatchN("othello" , "h.*o" , args, 0)); |
697 | |
698 | // 1 arg |
699 | int i; |
700 | argv[0] = &i; |
701 | EXPECT_TRUE(RE2::FullMatchN("1001" , "(\\d+)" , args, 1)); |
702 | EXPECT_EQ(1001, i); |
703 | EXPECT_FALSE(RE2::FullMatchN("three" , "(\\d+)" , args, 1)); |
704 | |
705 | // Multi-arg |
706 | std::string s; |
707 | argv[1] = &s; |
708 | EXPECT_TRUE(RE2::FullMatchN("42:life" , "(\\d+):(\\w+)" , args, 2)); |
709 | EXPECT_EQ(42, i); |
710 | EXPECT_EQ("life" , s); |
711 | EXPECT_FALSE(RE2::FullMatchN("hi1" , "(\\w+)(1)" , args, 2)); |
712 | } |
713 | |
714 | TEST(RE2, FullMatchIgnoredArg) { |
715 | int i; |
716 | std::string s; |
717 | |
718 | // Old-school NULL should be ignored. |
719 | ASSERT_TRUE( |
720 | RE2::FullMatch("ruby:1234" , "(\\w+)(:)(\\d+)" , &s, (void*)NULL, &i)); |
721 | ASSERT_EQ(s, std::string("ruby" )); |
722 | ASSERT_EQ(i, 1234); |
723 | |
724 | // C++11 nullptr should also be ignored. |
725 | ASSERT_TRUE(RE2::FullMatch("rubz:1235" , "(\\w+)(:)(\\d+)" , &s, nullptr, &i)); |
726 | ASSERT_EQ(s, std::string("rubz" )); |
727 | ASSERT_EQ(i, 1235); |
728 | } |
729 | |
730 | TEST(RE2, FullMatchTypedNullArg) { |
731 | std::string s; |
732 | |
733 | // Ignore non-void* NULL arg |
734 | ASSERT_TRUE(RE2::FullMatch("hello" , "he(.*)lo" , (char*)NULL)); |
735 | ASSERT_TRUE(RE2::FullMatch("hello" , "h(.*)o" , (std::string*)NULL)); |
736 | ASSERT_TRUE(RE2::FullMatch("hello" , "h(.*)o" , (StringPiece*)NULL)); |
737 | ASSERT_TRUE(RE2::FullMatch("1234" , "(.*)" , (int*)NULL)); |
738 | ASSERT_TRUE(RE2::FullMatch("1234567890123456" , "(.*)" , (long long*)NULL)); |
739 | ASSERT_TRUE(RE2::FullMatch("123.4567890123456" , "(.*)" , (double*)NULL)); |
740 | ASSERT_TRUE(RE2::FullMatch("123.4567890123456" , "(.*)" , (float*)NULL)); |
741 | |
742 | // Fail on non-void* NULL arg if the match doesn't parse for the given type. |
743 | ASSERT_FALSE(RE2::FullMatch("hello" , "h(.*)lo" , &s, (char*)NULL)); |
744 | ASSERT_FALSE(RE2::FullMatch("hello" , "(.*)" , (int*)NULL)); |
745 | ASSERT_FALSE(RE2::FullMatch("1234567890123456" , "(.*)" , (int*)NULL)); |
746 | ASSERT_FALSE(RE2::FullMatch("hello" , "(.*)" , (double*)NULL)); |
747 | ASSERT_FALSE(RE2::FullMatch("hello" , "(.*)" , (float*)NULL)); |
748 | } |
749 | |
750 | // Check that numeric parsing code does not read past the end of |
751 | // the number being parsed. |
752 | // This implementation requires mmap(2) et al. and thus cannot |
753 | // be used unless they are available. |
754 | TEST(RE2, NULTerminated) { |
755 | #if defined(_POSIX_MAPPED_FILES) && _POSIX_MAPPED_FILES > 0 |
756 | char *v; |
757 | int x; |
758 | long pagesize = sysconf(_SC_PAGE_SIZE); |
759 | |
760 | #ifndef MAP_ANONYMOUS |
761 | #define MAP_ANONYMOUS MAP_ANON |
762 | #endif |
763 | v = static_cast<char*>(mmap(NULL, 2*pagesize, PROT_READ|PROT_WRITE, |
764 | MAP_ANONYMOUS|MAP_PRIVATE, -1, 0)); |
765 | ASSERT_TRUE(v != reinterpret_cast<char*>(-1)); |
766 | LOG(INFO) << "Memory at " << (void*)v; |
767 | ASSERT_EQ(munmap(v + pagesize, pagesize), 0) << " error " << errno; |
768 | v[pagesize - 1] = '1'; |
769 | |
770 | x = 0; |
771 | ASSERT_TRUE(RE2::FullMatch(StringPiece(v + pagesize - 1, 1), "(.*)" , &x)); |
772 | ASSERT_EQ(x, 1); |
773 | #endif |
774 | } |
775 | |
776 | TEST(RE2, FullMatchTypeTests) { |
777 | // Type tests |
778 | std::string zeros(1000, '0'); |
779 | { |
780 | char c; |
781 | ASSERT_TRUE(RE2::FullMatch("Hello" , "(H)ello" , &c)); |
782 | ASSERT_EQ(c, 'H'); |
783 | } |
784 | { |
785 | unsigned char c; |
786 | ASSERT_TRUE(RE2::FullMatch("Hello" , "(H)ello" , &c)); |
787 | ASSERT_EQ(c, static_cast<unsigned char>('H')); |
788 | } |
789 | { |
790 | int16_t v; |
791 | ASSERT_TRUE(RE2::FullMatch("100" , "(-?\\d+)" , &v)); ASSERT_EQ(v, 100); |
792 | ASSERT_TRUE(RE2::FullMatch("-100" , "(-?\\d+)" , &v)); ASSERT_EQ(v, -100); |
793 | ASSERT_TRUE(RE2::FullMatch("32767" , "(-?\\d+)" , &v)); ASSERT_EQ(v, 32767); |
794 | ASSERT_TRUE(RE2::FullMatch("-32768" , "(-?\\d+)" , &v)); ASSERT_EQ(v, -32768); |
795 | ASSERT_FALSE(RE2::FullMatch("-32769" , "(-?\\d+)" , &v)); |
796 | ASSERT_FALSE(RE2::FullMatch("32768" , "(-?\\d+)" , &v)); |
797 | } |
798 | { |
799 | uint16_t v; |
800 | ASSERT_TRUE(RE2::FullMatch("100" , "(\\d+)" , &v)); ASSERT_EQ(v, 100); |
801 | ASSERT_TRUE(RE2::FullMatch("32767" , "(\\d+)" , &v)); ASSERT_EQ(v, 32767); |
802 | ASSERT_TRUE(RE2::FullMatch("65535" , "(\\d+)" , &v)); ASSERT_EQ(v, 65535); |
803 | ASSERT_FALSE(RE2::FullMatch("65536" , "(\\d+)" , &v)); |
804 | } |
805 | { |
806 | int32_t v; |
807 | static const int32_t max = INT32_C(0x7fffffff); |
808 | static const int32_t min = -max - 1; |
809 | ASSERT_TRUE(RE2::FullMatch("100" , "(-?\\d+)" , &v)); ASSERT_EQ(v, 100); |
810 | ASSERT_TRUE(RE2::FullMatch("-100" , "(-?\\d+)" , &v)); ASSERT_EQ(v, -100); |
811 | ASSERT_TRUE(RE2::FullMatch("2147483647" , "(-?\\d+)" , &v)); ASSERT_EQ(v, max); |
812 | ASSERT_TRUE(RE2::FullMatch("-2147483648" , "(-?\\d+)" , &v)); ASSERT_EQ(v, min); |
813 | ASSERT_FALSE(RE2::FullMatch("-2147483649" , "(-?\\d+)" , &v)); |
814 | ASSERT_FALSE(RE2::FullMatch("2147483648" , "(-?\\d+)" , &v)); |
815 | |
816 | ASSERT_TRUE(RE2::FullMatch(zeros + "2147483647" , "(-?\\d+)" , &v)); |
817 | ASSERT_EQ(v, max); |
818 | ASSERT_TRUE(RE2::FullMatch("-" + zeros + "2147483648" , "(-?\\d+)" , &v)); |
819 | ASSERT_EQ(v, min); |
820 | |
821 | ASSERT_FALSE(RE2::FullMatch("-" + zeros + "2147483649" , "(-?\\d+)" , &v)); |
822 | ASSERT_TRUE(RE2::FullMatch("0x7fffffff" , "(.*)" , RE2::CRadix(&v))); |
823 | ASSERT_EQ(v, max); |
824 | ASSERT_FALSE(RE2::FullMatch("000x7fffffff" , "(.*)" , RE2::CRadix(&v))); |
825 | } |
826 | { |
827 | uint32_t v; |
828 | static const uint32_t max = UINT32_C(0xffffffff); |
829 | ASSERT_TRUE(RE2::FullMatch("100" , "(\\d+)" , &v)); ASSERT_EQ(v, 100); |
830 | ASSERT_TRUE(RE2::FullMatch("4294967295" , "(\\d+)" , &v)); ASSERT_EQ(v, max); |
831 | ASSERT_FALSE(RE2::FullMatch("4294967296" , "(\\d+)" , &v)); |
832 | ASSERT_FALSE(RE2::FullMatch("-1" , "(\\d+)" , &v)); |
833 | |
834 | ASSERT_TRUE(RE2::FullMatch(zeros + "4294967295" , "(\\d+)" , &v)); ASSERT_EQ(v, max); |
835 | } |
836 | { |
837 | int64_t v; |
838 | static const int64_t max = INT64_C(0x7fffffffffffffff); |
839 | static const int64_t min = -max - 1; |
840 | std::string str; |
841 | |
842 | ASSERT_TRUE(RE2::FullMatch("100" , "(-?\\d+)" , &v)); ASSERT_EQ(v, 100); |
843 | ASSERT_TRUE(RE2::FullMatch("-100" , "(-?\\d+)" , &v)); ASSERT_EQ(v, -100); |
844 | |
845 | str = std::to_string(max); |
846 | ASSERT_TRUE(RE2::FullMatch(str, "(-?\\d+)" , &v)); ASSERT_EQ(v, max); |
847 | |
848 | str = std::to_string(min); |
849 | ASSERT_TRUE(RE2::FullMatch(str, "(-?\\d+)" , &v)); ASSERT_EQ(v, min); |
850 | |
851 | str = std::to_string(max); |
852 | ASSERT_NE(str.back(), '9'); |
853 | str.back()++; |
854 | ASSERT_FALSE(RE2::FullMatch(str, "(-?\\d+)" , &v)); |
855 | |
856 | str = std::to_string(min); |
857 | ASSERT_NE(str.back(), '9'); |
858 | str.back()++; |
859 | ASSERT_FALSE(RE2::FullMatch(str, "(-?\\d+)" , &v)); |
860 | } |
861 | { |
862 | uint64_t v; |
863 | int64_t v2; |
864 | static const uint64_t max = UINT64_C(0xffffffffffffffff); |
865 | std::string str; |
866 | |
867 | ASSERT_TRUE(RE2::FullMatch("100" , "(-?\\d+)" , &v)); ASSERT_EQ(v, 100); |
868 | ASSERT_TRUE(RE2::FullMatch("-100" , "(-?\\d+)" , &v2)); ASSERT_EQ(v2, -100); |
869 | |
870 | str = std::to_string(max); |
871 | ASSERT_TRUE(RE2::FullMatch(str, "(-?\\d+)" , &v)); ASSERT_EQ(v, max); |
872 | |
873 | ASSERT_NE(str.back(), '9'); |
874 | str.back()++; |
875 | ASSERT_FALSE(RE2::FullMatch(str, "(-?\\d+)" , &v)); |
876 | } |
877 | } |
878 | |
879 | TEST(RE2, FloatingPointFullMatchTypes) { |
880 | std::string zeros(1000, '0'); |
881 | { |
882 | float v; |
883 | ASSERT_TRUE(RE2::FullMatch("100" , "(.*)" , &v)); ASSERT_EQ(v, 100); |
884 | ASSERT_TRUE(RE2::FullMatch("-100." , "(.*)" , &v)); ASSERT_EQ(v, -100); |
885 | ASSERT_TRUE(RE2::FullMatch("1e23" , "(.*)" , &v)); ASSERT_EQ(v, float(1e23)); |
886 | ASSERT_TRUE(RE2::FullMatch(" 100" , "(.*)" , &v)); ASSERT_EQ(v, 100); |
887 | |
888 | ASSERT_TRUE(RE2::FullMatch(zeros + "1e23" , "(.*)" , &v)); |
889 | ASSERT_EQ(v, float(1e23)); |
890 | |
891 | // 6700000000081920.1 is an edge case. |
892 | // 6700000000081920 is exactly halfway between |
893 | // two float32s, so the .1 should make it round up. |
894 | // However, the .1 is outside the precision possible with |
895 | // a float64: the nearest float64 is 6700000000081920. |
896 | // So if the code uses strtod and then converts to float32, |
897 | // round-to-even will make it round down instead of up. |
898 | // To pass the test, the parser must call strtof directly. |
899 | // This test case is carefully chosen to use only a 17-digit |
900 | // number, since C does not guarantee to get the correctly |
901 | // rounded answer for strtod and strtof unless the input is |
902 | // short. |
903 | // |
904 | // This is known to fail on Cygwin and MinGW due to a broken |
905 | // implementation of strtof(3). And apparently MSVC too. Sigh. |
906 | #if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__) |
907 | ASSERT_TRUE(RE2::FullMatch("0.1" , "(.*)" , &v)); |
908 | ASSERT_EQ(v, 0.1f) << StringPrintf("%.8g != %.8g" , v, 0.1f); |
909 | ASSERT_TRUE(RE2::FullMatch("6700000000081920.1" , "(.*)" , &v)); |
910 | ASSERT_EQ(v, 6700000000081920.1f) |
911 | << StringPrintf("%.8g != %.8g" , v, 6700000000081920.1f); |
912 | #endif |
913 | } |
914 | { |
915 | double v; |
916 | ASSERT_TRUE(RE2::FullMatch("100" , "(.*)" , &v)); ASSERT_EQ(v, 100); |
917 | ASSERT_TRUE(RE2::FullMatch("-100." , "(.*)" , &v)); ASSERT_EQ(v, -100); |
918 | ASSERT_TRUE(RE2::FullMatch("1e23" , "(.*)" , &v)); ASSERT_EQ(v, 1e23); |
919 | ASSERT_TRUE(RE2::FullMatch(zeros + "1e23" , "(.*)" , &v)); |
920 | ASSERT_EQ(v, double(1e23)); |
921 | |
922 | ASSERT_TRUE(RE2::FullMatch("0.1" , "(.*)" , &v)); |
923 | ASSERT_EQ(v, 0.1) << StringPrintf("%.17g != %.17g" , v, 0.1); |
924 | ASSERT_TRUE(RE2::FullMatch("1.00000005960464485" , "(.*)" , &v)); |
925 | ASSERT_EQ(v, 1.0000000596046448) |
926 | << StringPrintf("%.17g != %.17g" , v, 1.0000000596046448); |
927 | } |
928 | } |
929 | |
930 | TEST(RE2, FullMatchAnchored) { |
931 | int i; |
932 | // Check that matching is fully anchored |
933 | ASSERT_FALSE(RE2::FullMatch("x1001" , "(\\d+)" , &i)); |
934 | ASSERT_FALSE(RE2::FullMatch("1001x" , "(\\d+)" , &i)); |
935 | ASSERT_TRUE(RE2::FullMatch("x1001" , "x(\\d+)" , &i)); ASSERT_EQ(i, 1001); |
936 | ASSERT_TRUE(RE2::FullMatch("1001x" , "(\\d+)x" , &i)); ASSERT_EQ(i, 1001); |
937 | } |
938 | |
939 | TEST(RE2, FullMatchBraces) { |
940 | // Braces |
941 | ASSERT_TRUE(RE2::FullMatch("0abcd" , "[0-9a-f+.-]{5,}" )); |
942 | ASSERT_TRUE(RE2::FullMatch("0abcde" , "[0-9a-f+.-]{5,}" )); |
943 | ASSERT_FALSE(RE2::FullMatch("0abc" , "[0-9a-f+.-]{5,}" )); |
944 | } |
945 | |
946 | TEST(RE2, Complicated) { |
947 | // Complicated RE2 |
948 | ASSERT_TRUE(RE2::FullMatch("foo" , "foo|bar|[A-Z]" )); |
949 | ASSERT_TRUE(RE2::FullMatch("bar" , "foo|bar|[A-Z]" )); |
950 | ASSERT_TRUE(RE2::FullMatch("X" , "foo|bar|[A-Z]" )); |
951 | ASSERT_FALSE(RE2::FullMatch("XY" , "foo|bar|[A-Z]" )); |
952 | } |
953 | |
954 | TEST(RE2, FullMatchEnd) { |
955 | // Check full-match handling (needs '$' tacked on internally) |
956 | ASSERT_TRUE(RE2::FullMatch("fo" , "fo|foo" )); |
957 | ASSERT_TRUE(RE2::FullMatch("foo" , "fo|foo" )); |
958 | ASSERT_TRUE(RE2::FullMatch("fo" , "fo|foo$" )); |
959 | ASSERT_TRUE(RE2::FullMatch("foo" , "fo|foo$" )); |
960 | ASSERT_TRUE(RE2::FullMatch("foo" , "foo$" )); |
961 | ASSERT_FALSE(RE2::FullMatch("foo$bar" , "foo\\$" )); |
962 | ASSERT_FALSE(RE2::FullMatch("fox" , "fo|bar" )); |
963 | |
964 | // Uncomment the following if we change the handling of '$' to |
965 | // prevent it from matching a trailing newline |
966 | if (false) { |
967 | // Check that we don't get bitten by pcre's special handling of a |
968 | // '\n' at the end of the string matching '$' |
969 | ASSERT_FALSE(RE2::PartialMatch("foo\n" , "foo$" )); |
970 | } |
971 | } |
972 | |
973 | TEST(RE2, FullMatchArgCount) { |
974 | // Number of args |
975 | int a[16]; |
976 | ASSERT_TRUE(RE2::FullMatch("" , "" )); |
977 | |
978 | memset(a, 0, sizeof(0)); |
979 | ASSERT_TRUE(RE2::FullMatch("1" , "(\\d){1}" , &a[0])); |
980 | ASSERT_EQ(a[0], 1); |
981 | |
982 | memset(a, 0, sizeof(0)); |
983 | ASSERT_TRUE(RE2::FullMatch("12" , "(\\d)(\\d)" , &a[0], &a[1])); |
984 | ASSERT_EQ(a[0], 1); |
985 | ASSERT_EQ(a[1], 2); |
986 | |
987 | memset(a, 0, sizeof(0)); |
988 | ASSERT_TRUE(RE2::FullMatch("123" , "(\\d)(\\d)(\\d)" , &a[0], &a[1], &a[2])); |
989 | ASSERT_EQ(a[0], 1); |
990 | ASSERT_EQ(a[1], 2); |
991 | ASSERT_EQ(a[2], 3); |
992 | |
993 | memset(a, 0, sizeof(0)); |
994 | ASSERT_TRUE(RE2::FullMatch("1234" , "(\\d)(\\d)(\\d)(\\d)" , &a[0], &a[1], |
995 | &a[2], &a[3])); |
996 | ASSERT_EQ(a[0], 1); |
997 | ASSERT_EQ(a[1], 2); |
998 | ASSERT_EQ(a[2], 3); |
999 | ASSERT_EQ(a[3], 4); |
1000 | |
1001 | memset(a, 0, sizeof(0)); |
1002 | ASSERT_TRUE(RE2::FullMatch("12345" , "(\\d)(\\d)(\\d)(\\d)(\\d)" , &a[0], &a[1], |
1003 | &a[2], &a[3], &a[4])); |
1004 | ASSERT_EQ(a[0], 1); |
1005 | ASSERT_EQ(a[1], 2); |
1006 | ASSERT_EQ(a[2], 3); |
1007 | ASSERT_EQ(a[3], 4); |
1008 | ASSERT_EQ(a[4], 5); |
1009 | |
1010 | memset(a, 0, sizeof(0)); |
1011 | ASSERT_TRUE(RE2::FullMatch("123456" , "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)" , &a[0], |
1012 | &a[1], &a[2], &a[3], &a[4], &a[5])); |
1013 | ASSERT_EQ(a[0], 1); |
1014 | ASSERT_EQ(a[1], 2); |
1015 | ASSERT_EQ(a[2], 3); |
1016 | ASSERT_EQ(a[3], 4); |
1017 | ASSERT_EQ(a[4], 5); |
1018 | ASSERT_EQ(a[5], 6); |
1019 | |
1020 | memset(a, 0, sizeof(0)); |
1021 | ASSERT_TRUE(RE2::FullMatch("1234567" , "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)" , |
1022 | &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6])); |
1023 | ASSERT_EQ(a[0], 1); |
1024 | ASSERT_EQ(a[1], 2); |
1025 | ASSERT_EQ(a[2], 3); |
1026 | ASSERT_EQ(a[3], 4); |
1027 | ASSERT_EQ(a[4], 5); |
1028 | ASSERT_EQ(a[5], 6); |
1029 | ASSERT_EQ(a[6], 7); |
1030 | |
1031 | memset(a, 0, sizeof(0)); |
1032 | ASSERT_TRUE(RE2::FullMatch("1234567890123456" , |
1033 | "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)" |
1034 | "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)" , |
1035 | &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], |
1036 | &a[7], &a[8], &a[9], &a[10], &a[11], &a[12], |
1037 | &a[13], &a[14], &a[15])); |
1038 | ASSERT_EQ(a[0], 1); |
1039 | ASSERT_EQ(a[1], 2); |
1040 | ASSERT_EQ(a[2], 3); |
1041 | ASSERT_EQ(a[3], 4); |
1042 | ASSERT_EQ(a[4], 5); |
1043 | ASSERT_EQ(a[5], 6); |
1044 | ASSERT_EQ(a[6], 7); |
1045 | ASSERT_EQ(a[7], 8); |
1046 | ASSERT_EQ(a[8], 9); |
1047 | ASSERT_EQ(a[9], 0); |
1048 | ASSERT_EQ(a[10], 1); |
1049 | ASSERT_EQ(a[11], 2); |
1050 | ASSERT_EQ(a[12], 3); |
1051 | ASSERT_EQ(a[13], 4); |
1052 | ASSERT_EQ(a[14], 5); |
1053 | ASSERT_EQ(a[15], 6); |
1054 | } |
1055 | |
1056 | TEST(RE2, Accessors) { |
1057 | // Check the pattern() accessor |
1058 | { |
1059 | const std::string kPattern = "http://([^/]+)/.*" ; |
1060 | const RE2 re(kPattern); |
1061 | ASSERT_EQ(kPattern, re.pattern()); |
1062 | } |
1063 | |
1064 | // Check RE2 error field. |
1065 | { |
1066 | RE2 re("foo" ); |
1067 | ASSERT_TRUE(re.error().empty()); // Must have no error |
1068 | ASSERT_TRUE(re.ok()); |
1069 | ASSERT_EQ(re.error_code(), RE2::NoError); |
1070 | } |
1071 | } |
1072 | |
1073 | TEST(RE2, UTF8) { |
1074 | // Check UTF-8 handling |
1075 | // Three Japanese characters (nihongo) |
1076 | const char utf8_string[] = { |
1077 | (char)0xe6, (char)0x97, (char)0xa5, // 65e5 |
1078 | (char)0xe6, (char)0x9c, (char)0xac, // 627c |
1079 | (char)0xe8, (char)0xaa, (char)0x9e, // 8a9e |
1080 | 0 |
1081 | }; |
1082 | const char utf8_pattern[] = { |
1083 | '.', |
1084 | (char)0xe6, (char)0x9c, (char)0xac, // 627c |
1085 | '.', |
1086 | 0 |
1087 | }; |
1088 | |
1089 | // Both should match in either mode, bytes or UTF-8 |
1090 | RE2 re_test1("........." , RE2::Latin1); |
1091 | ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test1)); |
1092 | RE2 re_test2("..." ); |
1093 | ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test2)); |
1094 | |
1095 | // Check that '.' matches one byte or UTF-8 character |
1096 | // according to the mode. |
1097 | std::string s; |
1098 | RE2 re_test3("(.)" , RE2::Latin1); |
1099 | ASSERT_TRUE(RE2::PartialMatch(utf8_string, re_test3, &s)); |
1100 | ASSERT_EQ(s, std::string("\xe6" )); |
1101 | RE2 re_test4("(.)" ); |
1102 | ASSERT_TRUE(RE2::PartialMatch(utf8_string, re_test4, &s)); |
1103 | ASSERT_EQ(s, std::string("\xe6\x97\xa5" )); |
1104 | |
1105 | // Check that string matches itself in either mode |
1106 | RE2 re_test5(utf8_string, RE2::Latin1); |
1107 | ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test5)); |
1108 | RE2 re_test6(utf8_string); |
1109 | ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test6)); |
1110 | |
1111 | // Check that pattern matches string only in UTF8 mode |
1112 | RE2 re_test7(utf8_pattern, RE2::Latin1); |
1113 | ASSERT_FALSE(RE2::FullMatch(utf8_string, re_test7)); |
1114 | RE2 re_test8(utf8_pattern); |
1115 | ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test8)); |
1116 | } |
1117 | |
1118 | TEST(RE2, UngreedyUTF8) { |
1119 | // Check that ungreedy, UTF8 regular expressions don't match when they |
1120 | // oughtn't -- see bug 82246. |
1121 | { |
1122 | // This code always worked. |
1123 | const char* pattern = "\\w+X" ; |
1124 | const std::string target = "a aX" ; |
1125 | RE2 match_sentence(pattern, RE2::Latin1); |
1126 | RE2 match_sentence_re(pattern); |
1127 | |
1128 | ASSERT_FALSE(RE2::FullMatch(target, match_sentence)); |
1129 | ASSERT_FALSE(RE2::FullMatch(target, match_sentence_re)); |
1130 | } |
1131 | { |
1132 | const char* pattern = "(?U)\\w+X" ; |
1133 | const std::string target = "a aX" ; |
1134 | RE2 match_sentence(pattern, RE2::Latin1); |
1135 | ASSERT_EQ(match_sentence.error(), "" ); |
1136 | RE2 match_sentence_re(pattern); |
1137 | |
1138 | ASSERT_FALSE(RE2::FullMatch(target, match_sentence)); |
1139 | ASSERT_FALSE(RE2::FullMatch(target, match_sentence_re)); |
1140 | } |
1141 | } |
1142 | |
1143 | TEST(RE2, Rejects) { |
1144 | { |
1145 | RE2 re("a\\1" , RE2::Quiet); |
1146 | ASSERT_FALSE(re.ok()); } |
1147 | { |
1148 | RE2 re("a[x" , RE2::Quiet); |
1149 | ASSERT_FALSE(re.ok()); |
1150 | } |
1151 | { |
1152 | RE2 re("a[z-a]" , RE2::Quiet); |
1153 | ASSERT_FALSE(re.ok()); |
1154 | } |
1155 | { |
1156 | RE2 re("a[[:foobar:]]" , RE2::Quiet); |
1157 | ASSERT_FALSE(re.ok()); |
1158 | } |
1159 | { |
1160 | RE2 re("a(b" , RE2::Quiet); |
1161 | ASSERT_FALSE(re.ok()); |
1162 | } |
1163 | { |
1164 | RE2 re("a\\" , RE2::Quiet); |
1165 | ASSERT_FALSE(re.ok()); |
1166 | } |
1167 | } |
1168 | |
1169 | TEST(RE2, NoCrash) { |
1170 | // Test that using a bad regexp doesn't crash. |
1171 | { |
1172 | RE2 re("a\\" , RE2::Quiet); |
1173 | ASSERT_FALSE(re.ok()); |
1174 | ASSERT_FALSE(RE2::PartialMatch("a\\b" , re)); |
1175 | } |
1176 | |
1177 | // Test that using an enormous regexp doesn't crash |
1178 | { |
1179 | RE2 re("(((.{100}){100}){100}){100}" , RE2::Quiet); |
1180 | ASSERT_FALSE(re.ok()); |
1181 | ASSERT_FALSE(RE2::PartialMatch("aaa" , re)); |
1182 | } |
1183 | |
1184 | // Test that a crazy regexp still compiles and runs. |
1185 | { |
1186 | RE2 re(".{512}x" , RE2::Quiet); |
1187 | ASSERT_TRUE(re.ok()); |
1188 | std::string s; |
1189 | s.append(515, 'c'); |
1190 | s.append("x" ); |
1191 | ASSERT_TRUE(RE2::PartialMatch(s, re)); |
1192 | } |
1193 | } |
1194 | |
1195 | TEST(RE2, Recursion) { |
1196 | // Test that recursion is stopped. |
1197 | // This test is PCRE-legacy -- there's no recursion in RE2. |
1198 | int bytes = 15 * 1024; // enough to crash PCRE |
1199 | TestRecursion(bytes, "." ); |
1200 | TestRecursion(bytes, "a" ); |
1201 | TestRecursion(bytes, "a." ); |
1202 | TestRecursion(bytes, "ab." ); |
1203 | TestRecursion(bytes, "abc." ); |
1204 | } |
1205 | |
1206 | TEST(RE2, BigCountedRepetition) { |
1207 | // Test that counted repetition works, given tons of memory. |
1208 | RE2::Options opt; |
1209 | opt.set_max_mem(256<<20); |
1210 | |
1211 | RE2 re(".{512}x" , opt); |
1212 | ASSERT_TRUE(re.ok()); |
1213 | std::string s; |
1214 | s.append(515, 'c'); |
1215 | s.append("x" ); |
1216 | ASSERT_TRUE(RE2::PartialMatch(s, re)); |
1217 | } |
1218 | |
1219 | TEST(RE2, DeepRecursion) { |
1220 | // Test for deep stack recursion. This would fail with a |
1221 | // segmentation violation due to stack overflow before pcre was |
1222 | // patched. |
1223 | // Again, a PCRE legacy test. RE2 doesn't recurse. |
1224 | std::string ("x*" ); |
1225 | std::string a(131072, 'a'); |
1226 | comment += a; |
1227 | comment += "*x" ; |
1228 | RE2 re("((?:\\s|xx.*\n|x[*](?:\n|.)*?[*]x)*)" ); |
1229 | ASSERT_TRUE(RE2::FullMatch(comment, re)); |
1230 | } |
1231 | |
1232 | // Suggested by Josh Hyman. Failed when SearchOnePass was |
1233 | // not implementing case-folding. |
1234 | TEST(CaseInsensitive, MatchAndConsume) { |
1235 | std::string result; |
1236 | std::string text = "A fish named *Wanda*" ; |
1237 | StringPiece sp(text); |
1238 | |
1239 | EXPECT_TRUE(RE2::PartialMatch(sp, "(?i)([wand]{5})" , &result)); |
1240 | EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})" , &result)); |
1241 | } |
1242 | |
1243 | // RE2 should permit implicit conversions from string, StringPiece, const char*, |
1244 | // and C string literals. |
1245 | TEST(RE2, ImplicitConversions) { |
1246 | std::string re_string("." ); |
1247 | StringPiece re_stringpiece("." ); |
1248 | const char* re_cstring = "." ; |
1249 | EXPECT_TRUE(RE2::PartialMatch("e" , re_string)); |
1250 | EXPECT_TRUE(RE2::PartialMatch("e" , re_stringpiece)); |
1251 | EXPECT_TRUE(RE2::PartialMatch("e" , re_cstring)); |
1252 | EXPECT_TRUE(RE2::PartialMatch("e" , "." )); |
1253 | } |
1254 | |
1255 | // Bugs introduced by 8622304 |
1256 | TEST(RE2, CL8622304) { |
1257 | // reported by ingow |
1258 | std::string dir; |
1259 | EXPECT_TRUE(RE2::FullMatch("D" , "([^\\\\])" )); // ok |
1260 | EXPECT_TRUE(RE2::FullMatch("D" , "([^\\\\])" , &dir)); // fails |
1261 | |
1262 | // reported by jacobsa |
1263 | std::string key, val; |
1264 | EXPECT_TRUE(RE2::PartialMatch("bar:1,0x2F,030,4,5;baz:true;fooby:false,true" , |
1265 | "(\\w+)(?::((?:[^;\\\\]|\\\\.)*))?;?" , |
1266 | &key, |
1267 | &val)); |
1268 | EXPECT_EQ(key, "bar" ); |
1269 | EXPECT_EQ(val, "1,0x2F,030,4,5" ); |
1270 | } |
1271 | |
1272 | |
1273 | // Check that RE2 returns correct regexp pieces on error. |
1274 | // In particular, make sure it returns whole runes |
1275 | // and that it always reports invalid UTF-8. |
1276 | // Also check that Perl error flag piece is big enough. |
1277 | static struct ErrorTest { |
1278 | const char *regexp; |
1279 | const char *error; |
1280 | } error_tests[] = { |
1281 | { "ab\\αcd" , "\\α" }, |
1282 | { "ef\\x☺01" , "\\x☺0" }, |
1283 | { "gh\\x1☺01" , "\\x1☺" }, |
1284 | { "ij\\x1" , "\\x1" }, |
1285 | { "kl\\x" , "\\x" }, |
1286 | { "uv\\x{0000☺}" , "\\x{0000☺" }, |
1287 | { "wx\\p{ABC" , "\\p{ABC" }, |
1288 | { "yz(?smiUX:abc)" , "(?smiUX" }, // used to return (?s but the error is X |
1289 | { "aa(?sm☺i" , "(?sm☺" }, |
1290 | { "bb[abc" , "[abc" }, |
1291 | |
1292 | { "mn\\x1\377" , "" }, // no argument string returned for invalid UTF-8 |
1293 | { "op\377qr" , "" }, |
1294 | { "st\\x{00000\377" , "" }, |
1295 | { "zz\\p{\377}" , "" }, |
1296 | { "zz\\x{00\377}" , "" }, |
1297 | { "zz(?P<name\377>abc)" , "" }, |
1298 | }; |
1299 | TEST(RE2, ErrorArgs) { |
1300 | for (size_t i = 0; i < arraysize(error_tests); i++) { |
1301 | RE2 re(error_tests[i].regexp, RE2::Quiet); |
1302 | EXPECT_FALSE(re.ok()); |
1303 | EXPECT_EQ(re.error_arg(), error_tests[i].error) << re.error(); |
1304 | } |
1305 | } |
1306 | |
1307 | // Check that "never match \n" mode never matches \n. |
1308 | static struct NeverTest { |
1309 | const char* regexp; |
1310 | const char* text; |
1311 | const char* match; |
1312 | } never_tests[] = { |
1313 | { "(.*)" , "abc\ndef\nghi\n" , "abc" }, |
1314 | { "(?s)(abc.*def)" , "abc\ndef\n" , NULL }, |
1315 | { "(abc(.|\n)*def)" , "abc\ndef\n" , NULL }, |
1316 | { "(abc[^x]*def)" , "abc\ndef\n" , NULL }, |
1317 | { "(abc[^x]*def)" , "abczzzdef\ndef\n" , "abczzzdef" }, |
1318 | }; |
1319 | TEST(RE2, NeverNewline) { |
1320 | RE2::Options opt; |
1321 | opt.set_never_nl(true); |
1322 | for (size_t i = 0; i < arraysize(never_tests); i++) { |
1323 | const NeverTest& t = never_tests[i]; |
1324 | RE2 re(t.regexp, opt); |
1325 | if (t.match == NULL) { |
1326 | EXPECT_FALSE(re.PartialMatch(t.text, re)); |
1327 | } else { |
1328 | StringPiece m; |
1329 | EXPECT_TRUE(re.PartialMatch(t.text, re, &m)); |
1330 | EXPECT_EQ(m, t.match); |
1331 | } |
1332 | } |
1333 | } |
1334 | |
1335 | // Check that dot_nl option works. |
1336 | TEST(RE2, DotNL) { |
1337 | RE2::Options opt; |
1338 | opt.set_dot_nl(true); |
1339 | EXPECT_TRUE(RE2::PartialMatch("\n" , RE2("." , opt))); |
1340 | EXPECT_FALSE(RE2::PartialMatch("\n" , RE2("(?-s)." , opt))); |
1341 | opt.set_never_nl(true); |
1342 | EXPECT_FALSE(RE2::PartialMatch("\n" , RE2("." , opt))); |
1343 | } |
1344 | |
1345 | // Check that there are no capturing groups in "never capture" mode. |
1346 | TEST(RE2, NeverCapture) { |
1347 | RE2::Options opt; |
1348 | opt.set_never_capture(true); |
1349 | RE2 re("(r)(e)" , opt); |
1350 | EXPECT_EQ(0, re.NumberOfCapturingGroups()); |
1351 | } |
1352 | |
1353 | // Bitstate bug was looking at submatch[0] even if nsubmatch == 0. |
1354 | // Triggered by a failed DFA search falling back to Bitstate when |
1355 | // using Match with a NULL submatch set. Bitstate tried to read |
1356 | // the submatch[0] entry even if nsubmatch was 0. |
1357 | TEST(RE2, BitstateCaptureBug) { |
1358 | RE2::Options opt; |
1359 | opt.set_max_mem(20000); |
1360 | RE2 re("(_________$)" , opt); |
1361 | StringPiece s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x" ; |
1362 | EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0)); |
1363 | } |
1364 | |
1365 | // C++ version of bug 609710. |
1366 | TEST(RE2, UnicodeClasses) { |
1367 | const std::string str = "ABCDEFGHI譚永鋒" ; |
1368 | std::string a, b, c; |
1369 | |
1370 | EXPECT_TRUE(RE2::FullMatch("A" , "\\p{L}" )); |
1371 | EXPECT_TRUE(RE2::FullMatch("A" , "\\p{Lu}" )); |
1372 | EXPECT_FALSE(RE2::FullMatch("A" , "\\p{Ll}" )); |
1373 | EXPECT_FALSE(RE2::FullMatch("A" , "\\P{L}" )); |
1374 | EXPECT_FALSE(RE2::FullMatch("A" , "\\P{Lu}" )); |
1375 | EXPECT_TRUE(RE2::FullMatch("A" , "\\P{Ll}" )); |
1376 | |
1377 | EXPECT_TRUE(RE2::FullMatch("譚" , "\\p{L}" )); |
1378 | EXPECT_FALSE(RE2::FullMatch("譚" , "\\p{Lu}" )); |
1379 | EXPECT_FALSE(RE2::FullMatch("譚" , "\\p{Ll}" )); |
1380 | EXPECT_FALSE(RE2::FullMatch("譚" , "\\P{L}" )); |
1381 | EXPECT_TRUE(RE2::FullMatch("譚" , "\\P{Lu}" )); |
1382 | EXPECT_TRUE(RE2::FullMatch("譚" , "\\P{Ll}" )); |
1383 | |
1384 | EXPECT_TRUE(RE2::FullMatch("永" , "\\p{L}" )); |
1385 | EXPECT_FALSE(RE2::FullMatch("永" , "\\p{Lu}" )); |
1386 | EXPECT_FALSE(RE2::FullMatch("永" , "\\p{Ll}" )); |
1387 | EXPECT_FALSE(RE2::FullMatch("永" , "\\P{L}" )); |
1388 | EXPECT_TRUE(RE2::FullMatch("永" , "\\P{Lu}" )); |
1389 | EXPECT_TRUE(RE2::FullMatch("永" , "\\P{Ll}" )); |
1390 | |
1391 | EXPECT_TRUE(RE2::FullMatch("鋒" , "\\p{L}" )); |
1392 | EXPECT_FALSE(RE2::FullMatch("鋒" , "\\p{Lu}" )); |
1393 | EXPECT_FALSE(RE2::FullMatch("鋒" , "\\p{Ll}" )); |
1394 | EXPECT_FALSE(RE2::FullMatch("鋒" , "\\P{L}" )); |
1395 | EXPECT_TRUE(RE2::FullMatch("鋒" , "\\P{Lu}" )); |
1396 | EXPECT_TRUE(RE2::FullMatch("鋒" , "\\P{Ll}" )); |
1397 | |
1398 | EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?(.).*?(.)" , &a, &b, &c)); |
1399 | EXPECT_EQ("A" , a); |
1400 | EXPECT_EQ("B" , b); |
1401 | EXPECT_EQ("C" , c); |
1402 | |
1403 | EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{L}]).*?(.)" , &a, &b, &c)); |
1404 | EXPECT_EQ("A" , a); |
1405 | EXPECT_EQ("B" , b); |
1406 | EXPECT_EQ("C" , c); |
1407 | |
1408 | EXPECT_FALSE(RE2::PartialMatch(str, "\\P{L}" )); |
1409 | |
1410 | EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{Lu}]).*?(.)" , &a, &b, &c)); |
1411 | EXPECT_EQ("A" , a); |
1412 | EXPECT_EQ("B" , b); |
1413 | EXPECT_EQ("C" , c); |
1414 | |
1415 | EXPECT_FALSE(RE2::PartialMatch(str, "[^\\p{Lu}\\p{Lo}]" )); |
1416 | |
1417 | EXPECT_TRUE(RE2::PartialMatch(str, ".*(.).*?([\\p{Lu}\\p{Lo}]).*?(.)" , &a, &b, &c)); |
1418 | EXPECT_EQ("譚" , a); |
1419 | EXPECT_EQ("永" , b); |
1420 | EXPECT_EQ("鋒" , c); |
1421 | } |
1422 | |
1423 | TEST(RE2, LazyRE2) { |
1424 | // Test with and without options. |
1425 | static LazyRE2 a = {"a" }; |
1426 | static LazyRE2 b = {"b" , RE2::Latin1}; |
1427 | |
1428 | EXPECT_EQ("a" , a->pattern()); |
1429 | EXPECT_EQ(RE2::Options::EncodingUTF8, a->options().encoding()); |
1430 | |
1431 | EXPECT_EQ("b" , b->pattern()); |
1432 | EXPECT_EQ(RE2::Options::EncodingLatin1, b->options().encoding()); |
1433 | } |
1434 | |
1435 | // Bug reported by saito. 2009/02/17 |
1436 | TEST(RE2, NullVsEmptyString) { |
1437 | RE2 re(".*" ); |
1438 | EXPECT_TRUE(re.ok()); |
1439 | |
1440 | StringPiece null; |
1441 | EXPECT_TRUE(RE2::FullMatch(null, re)); |
1442 | |
1443 | StringPiece empty("" ); |
1444 | EXPECT_TRUE(RE2::FullMatch(empty, re)); |
1445 | } |
1446 | |
1447 | // Similar to the previous test, check that the null string and the empty |
1448 | // string both match, but also that the null string can only provide null |
1449 | // submatches whereas the empty string can also provide empty submatches. |
1450 | TEST(RE2, NullVsEmptyStringSubmatches) { |
1451 | RE2 re("()|(foo)" ); |
1452 | EXPECT_TRUE(re.ok()); |
1453 | |
1454 | // matches[0] is overall match, [1] is (), [2] is (foo), [3] is nonexistent. |
1455 | StringPiece matches[4]; |
1456 | |
1457 | for (size_t i = 0; i < arraysize(matches); i++) |
1458 | matches[i] = "bar" ; |
1459 | |
1460 | StringPiece null; |
1461 | EXPECT_TRUE(re.Match(null, 0, null.size(), RE2::UNANCHORED, |
1462 | matches, arraysize(matches))); |
1463 | for (size_t i = 0; i < arraysize(matches); i++) { |
1464 | EXPECT_TRUE(matches[i].data() == NULL); // always null |
1465 | EXPECT_TRUE(matches[i].empty()); |
1466 | } |
1467 | |
1468 | for (size_t i = 0; i < arraysize(matches); i++) |
1469 | matches[i] = "bar" ; |
1470 | |
1471 | StringPiece empty("" ); |
1472 | EXPECT_TRUE(re.Match(empty, 0, empty.size(), RE2::UNANCHORED, |
1473 | matches, arraysize(matches))); |
1474 | EXPECT_TRUE(matches[0].data() != NULL); // empty, not null |
1475 | EXPECT_TRUE(matches[0].empty()); |
1476 | EXPECT_TRUE(matches[1].data() != NULL); // empty, not null |
1477 | EXPECT_TRUE(matches[1].empty()); |
1478 | EXPECT_TRUE(matches[2].data() == NULL); |
1479 | EXPECT_TRUE(matches[2].empty()); |
1480 | EXPECT_TRUE(matches[3].data() == NULL); |
1481 | EXPECT_TRUE(matches[3].empty()); |
1482 | } |
1483 | |
1484 | // Issue 1816809 |
1485 | TEST(RE2, Bug1816809) { |
1486 | RE2 re("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))" ); |
1487 | StringPiece piece("llx-3;llx4" ); |
1488 | std::string x; |
1489 | EXPECT_TRUE(RE2::Consume(&piece, re, &x)); |
1490 | } |
1491 | |
1492 | // Issue 3061120 |
1493 | TEST(RE2, Bug3061120) { |
1494 | RE2 re("(?i)\\W" ); |
1495 | EXPECT_FALSE(RE2::PartialMatch("x" , re)); // always worked |
1496 | EXPECT_FALSE(RE2::PartialMatch("k" , re)); // broke because of kelvin |
1497 | EXPECT_FALSE(RE2::PartialMatch("s" , re)); // broke because of latin long s |
1498 | } |
1499 | |
1500 | TEST(RE2, CapturingGroupNames) { |
1501 | // Opening parentheses annotated with group IDs: |
1502 | // 12 3 45 6 7 |
1503 | RE2 re("((abc)(?P<G2>)|((e+)(?P<G2>.*)(?P<G1>u+)))" ); |
1504 | EXPECT_TRUE(re.ok()); |
1505 | const std::map<int, std::string>& have = re.CapturingGroupNames(); |
1506 | std::map<int, std::string> want; |
1507 | want[3] = "G2" ; |
1508 | want[6] = "G2" ; |
1509 | want[7] = "G1" ; |
1510 | EXPECT_EQ(want, have); |
1511 | } |
1512 | |
1513 | TEST(RE2, RegexpToStringLossOfAnchor) { |
1514 | EXPECT_EQ(RE2("^[a-c]at" , RE2::POSIX).Regexp()->ToString(), "^[a-c]at" ); |
1515 | EXPECT_EQ(RE2("^[a-c]at" ).Regexp()->ToString(), "(?-m:^)[a-c]at" ); |
1516 | EXPECT_EQ(RE2("ca[t-z]$" , RE2::POSIX).Regexp()->ToString(), "ca[t-z]$" ); |
1517 | EXPECT_EQ(RE2("ca[t-z]$" ).Regexp()->ToString(), "ca[t-z](?-m:$)" ); |
1518 | } |
1519 | |
1520 | // Issue 10131674 |
1521 | TEST(RE2, Bug10131674) { |
1522 | // Some of these escapes describe values that do not fit in a byte. |
1523 | RE2 re("\\140\\440\\174\\271\\150\\656\\106\\201\\004\\332" , RE2::Latin1); |
1524 | EXPECT_FALSE(re.ok()); |
1525 | EXPECT_FALSE(RE2::FullMatch("hello world" , re)); |
1526 | } |
1527 | |
1528 | TEST(RE2, Bug18391750) { |
1529 | // Stray write past end of match_ in nfa.cc, caught by fuzzing + address sanitizer. |
1530 | const char t[] = { |
1531 | (char)0x28, (char)0x28, (char)0xfc, (char)0xfc, (char)0x08, (char)0x08, |
1532 | (char)0x26, (char)0x26, (char)0x28, (char)0xc2, (char)0x9b, (char)0xc5, |
1533 | (char)0xc5, (char)0xd4, (char)0x8f, (char)0x8f, (char)0x69, (char)0x69, |
1534 | (char)0xe7, (char)0x29, (char)0x7b, (char)0x37, (char)0x31, (char)0x31, |
1535 | (char)0x7d, (char)0xae, (char)0x7c, (char)0x7c, (char)0xf3, (char)0x29, |
1536 | (char)0xae, (char)0xae, (char)0x2e, (char)0x2a, (char)0x29, (char)0x00, |
1537 | }; |
1538 | RE2::Options opt; |
1539 | opt.set_encoding(RE2::Options::EncodingLatin1); |
1540 | opt.set_longest_match(true); |
1541 | opt.set_dot_nl(true); |
1542 | opt.set_case_sensitive(false); |
1543 | RE2 re(t, opt); |
1544 | ASSERT_TRUE(re.ok()); |
1545 | RE2::PartialMatch(t, re); |
1546 | } |
1547 | |
1548 | TEST(RE2, Bug18458852) { |
1549 | // Bug in parser accepting invalid (too large) rune, |
1550 | // causing compiler to fail in DCHECK in UTF-8 |
1551 | // character class code. |
1552 | const char b[] = { |
1553 | (char)0x28, (char)0x05, (char)0x05, (char)0x41, (char)0x41, (char)0x28, |
1554 | (char)0x24, (char)0x5b, (char)0x5e, (char)0xf5, (char)0x87, (char)0x87, |
1555 | (char)0x90, (char)0x29, (char)0x5d, (char)0x29, (char)0x29, (char)0x00, |
1556 | }; |
1557 | RE2 re(b); |
1558 | ASSERT_FALSE(re.ok()); |
1559 | } |
1560 | |
1561 | TEST(RE2, Bug18523943) { |
1562 | // Bug in BitState: case kFailInst failed the match entirely. |
1563 | |
1564 | RE2::Options opt; |
1565 | const char a[] = { |
1566 | (char)0x29, (char)0x29, (char)0x24, (char)0x00, |
1567 | }; |
1568 | const char b[] = { |
1569 | (char)0x28, (char)0x0a, (char)0x2a, (char)0x2a, (char)0x29, (char)0x00, |
1570 | }; |
1571 | opt.set_log_errors(false); |
1572 | opt.set_encoding(RE2::Options::EncodingLatin1); |
1573 | opt.set_posix_syntax(true); |
1574 | opt.set_longest_match(true); |
1575 | opt.set_literal(false); |
1576 | opt.set_never_nl(true); |
1577 | |
1578 | RE2 re((const char*)b, opt); |
1579 | ASSERT_TRUE(re.ok()); |
1580 | std::string s1; |
1581 | ASSERT_TRUE(RE2::PartialMatch((const char*)a, re, &s1)); |
1582 | } |
1583 | |
1584 | TEST(RE2, Bug21371806) { |
1585 | // Bug in parser accepting Unicode groups in Latin-1 mode, |
1586 | // causing compiler to fail in DCHECK in prog.cc. |
1587 | |
1588 | RE2::Options opt; |
1589 | opt.set_encoding(RE2::Options::EncodingLatin1); |
1590 | |
1591 | RE2 re("g\\p{Zl}]" , opt); |
1592 | ASSERT_TRUE(re.ok()); |
1593 | } |
1594 | |
1595 | TEST(RE2, Bug26356109) { |
1596 | // Bug in parser caused by factoring of common prefixes in alternations. |
1597 | |
1598 | // In the past, this was factored to "a\\C*?[bc]". Thus, the automaton would |
1599 | // consume "ab" and then stop (when unanchored) whereas it should consume all |
1600 | // of "abc" as per first-match semantics. |
1601 | RE2 re("a\\C*?c|a\\C*?b" ); |
1602 | ASSERT_TRUE(re.ok()); |
1603 | |
1604 | std::string s = "abc" ; |
1605 | StringPiece m; |
1606 | |
1607 | ASSERT_TRUE(re.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1)); |
1608 | ASSERT_EQ(m, s) << " (UNANCHORED) got m='" << m << "', want '" << s << "'" ; |
1609 | |
1610 | ASSERT_TRUE(re.Match(s, 0, s.size(), RE2::ANCHOR_BOTH, &m, 1)); |
1611 | ASSERT_EQ(m, s) << " (ANCHOR_BOTH) got m='" << m << "', want '" << s << "'" ; |
1612 | } |
1613 | |
1614 | TEST(RE2, Issue104) { |
1615 | // RE2::GlobalReplace always advanced by one byte when the empty string was |
1616 | // matched, which would clobber any rune that is longer than one byte. |
1617 | |
1618 | std::string s = "bc" ; |
1619 | ASSERT_EQ(3, RE2::GlobalReplace(&s, "a*" , "d" )); |
1620 | ASSERT_EQ("dbdcd" , s); |
1621 | |
1622 | s = "ąć" ; |
1623 | ASSERT_EQ(3, RE2::GlobalReplace(&s, "Ć*" , "Ĉ" )); |
1624 | ASSERT_EQ("ĈąĈćĈ" , s); |
1625 | |
1626 | s = "人类" ; |
1627 | ASSERT_EQ(3, RE2::GlobalReplace(&s, "大*" , "小" )); |
1628 | ASSERT_EQ("小人小类小" , s); |
1629 | } |
1630 | |
1631 | } // namespace re2 |
1632 | |