1 | // Copyright 2008 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // Exhaustive testing of regular expression matching. |
6 | |
7 | #include <stddef.h> |
8 | #include <memory> |
9 | #include <string> |
10 | #include <vector> |
11 | |
12 | #include "util/test.h" |
13 | #include "util/utf.h" |
14 | #include "re2/testing/exhaustive_tester.h" |
15 | |
16 | namespace re2 { |
17 | |
18 | // Test simple character classes by themselves. |
19 | TEST(CharacterClasses, Exhaustive) { |
20 | std::vector<std::string> atoms = Split(" " , |
21 | "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b ." ); |
22 | ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(), |
23 | 5, Explode("ab" ), "" , "" ); |
24 | } |
25 | |
26 | // Test simple character classes inside a___b (for example, a[a]b). |
27 | TEST(CharacterClasses, ExhaustiveAB) { |
28 | std::vector<std::string> atoms = Split(" " , |
29 | "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b ." ); |
30 | ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(), |
31 | 5, Explode("ab" ), "a%sb" , "" ); |
32 | } |
33 | |
34 | // Returns UTF8 for Rune r |
35 | static std::string UTF8(Rune r) { |
36 | char buf[UTFmax+1]; |
37 | buf[runetochar(buf, &r)] = 0; |
38 | return std::string(buf); |
39 | } |
40 | |
41 | // Returns a vector of "interesting" UTF8 characters. |
42 | // Unicode is now too big to just return all of them, |
43 | // so UTF8Characters return a set likely to be good test cases. |
44 | static const std::vector<std::string>& InterestingUTF8() { |
45 | static bool init; |
46 | static std::vector<std::string> v; |
47 | |
48 | if (init) |
49 | return v; |
50 | |
51 | init = true; |
52 | // All the Latin1 equivalents are interesting. |
53 | for (int i = 1; i < 256; i++) |
54 | v.push_back(UTF8(i)); |
55 | |
56 | // After that, the codes near bit boundaries are |
57 | // interesting, because they span byte sequence lengths. |
58 | for (int j = 0; j < 8; j++) |
59 | v.push_back(UTF8(256 + j)); |
60 | for (int i = 512; i < Runemax; i <<= 1) |
61 | for (int j = -8; j < 8; j++) |
62 | v.push_back(UTF8(i + j)); |
63 | |
64 | // The codes near Runemax, including Runemax itself, are interesting. |
65 | for (int j = -8; j <= 0; j++) |
66 | v.push_back(UTF8(Runemax + j)); |
67 | |
68 | return v; |
69 | } |
70 | |
71 | // Test interesting UTF-8 characters against character classes. |
72 | TEST(InterestingUTF8, SingleOps) { |
73 | std::vector<std::string> atoms = Split(" " , |
74 | ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B " |
75 | "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] " |
76 | "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] " |
77 | "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]" ); |
78 | std::vector<std::string> ops; // no ops |
79 | ExhaustiveTest(1, 0, atoms, ops, |
80 | 1, InterestingUTF8(), "" , "" ); |
81 | } |
82 | |
83 | // Test interesting UTF-8 characters against character classes, |
84 | // but wrap everything inside AB. |
85 | TEST(InterestingUTF8, AB) { |
86 | std::vector<std::string> atoms = Split(" " , |
87 | ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B " |
88 | "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] " |
89 | "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] " |
90 | "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]" ); |
91 | std::vector<std::string> ops; // no ops |
92 | std::vector<std::string> alpha = InterestingUTF8(); |
93 | for (size_t i = 0; i < alpha.size(); i++) |
94 | alpha[i] = "a" + alpha[i] + "b" ; |
95 | ExhaustiveTest(1, 0, atoms, ops, |
96 | 1, alpha, "a%sb" , "" ); |
97 | } |
98 | |
99 | } // namespace re2 |
100 | |
101 | |