1 | // Copyright 2008 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // Exhaustive testing of regular expression matching. |
6 | |
7 | #include <stddef.h> |
8 | #include <memory> |
9 | #include <string> |
10 | #include <vector> |
11 | |
12 | #include "util/test.h" |
13 | #include "re2/testing/exhaustive_tester.h" |
14 | |
15 | namespace re2 { |
16 | |
17 | // Test empty string matches (aka "(?:)") |
18 | TEST(EmptyString, Exhaustive) { |
19 | ExhaustiveTest(2, 2, Split(" " , "(?:) a" ), |
20 | RegexpGenerator::EgrepOps(), |
21 | 5, Split("" , "ab" ), "" , "" ); |
22 | } |
23 | |
24 | // Test escaped versions of regexp syntax. |
25 | TEST(Punctuation, Literals) { |
26 | std::vector<std::string> alphabet = Explode("()*+?{}[]\\^$." ); |
27 | std::vector<std::string> escaped = alphabet; |
28 | for (size_t i = 0; i < escaped.size(); i++) |
29 | escaped[i] = "\\" + escaped[i]; |
30 | ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(), |
31 | 2, alphabet, "" , "" ); |
32 | } |
33 | |
34 | // Test ^ $ . \A \z in presence of line endings. |
35 | // Have to wrap the empty-width ones in (?:) so that |
36 | // they can be repeated -- PCRE rejects ^* but allows (?:^)* |
37 | TEST(LineEnds, Exhaustive) { |
38 | ExhaustiveTest(2, 2, Split(" " , "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)" ), |
39 | RegexpGenerator::EgrepOps(), |
40 | 4, Explode("ab\n" ), "" , "" ); |
41 | } |
42 | |
43 | // Test what does and does not match \n. |
44 | // This would be a good test, except that PCRE seems to have a bug: |
45 | // in single-byte character set mode (the default), |
46 | // [^a] matches \n, but in UTF-8 mode it does not. |
47 | // So when we run the test, the tester complains that |
48 | // we don't agree with PCRE, but it's PCRE that is at fault. |
49 | // For what it's worth, Perl gets this right (matches |
50 | // regardless of whether UTF-8 input is selected): |
51 | // |
52 | // #!/usr/bin/perl |
53 | // use POSIX qw(locale_h); |
54 | // print "matches in latin1\n" if "\n" =~ /[^a]/; |
55 | // setlocale("en_US.utf8"); |
56 | // print "matches in utf8\n" if "\n" =~ /[^a]/; |
57 | // |
58 | // The rule chosen for RE2 is that by default, like Perl, |
59 | // dot does not match \n but negated character classes [^a] do. |
60 | // (?s) will allow dot to match \n; there is no way in RE2 |
61 | // to stop [^a] from matching \n, though the underlying library |
62 | // provides a mechanism, and RE2 could add new syntax if needed. |
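//
// The test below is left disabled for the reason given above.
// NOTE(review): its ExhaustiveTest call passes seven arguments, whereas the
// enabled tests in this file pass eight, and empty_vector is never used;
// bring it in line with the other calls before re-enabling it.
//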
64 | // TEST(Newlines, Exhaustive) { |
65 | // std::vector<std::string> empty_vector; |
66 | // ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"), |
67 | // RegexpGenerator::EgrepOps(), |
68 | // 4, Explode("a\n"), ""); |
69 | // } |
70 | |
71 | } // namespace re2 |
72 | |
73 | |