1 | // Copyright 2006-2008 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | #include <string.h> |
6 | #include <string> |
7 | #include <vector> |
8 | |
9 | #include "util/test.h" |
10 | #include "util/logging.h" |
11 | #include "util/strutil.h" |
12 | #include "re2/prog.h" |
13 | #include "re2/re2.h" |
14 | #include "re2/regexp.h" |
15 | #include "re2/testing/exhaustive_tester.h" |
16 | #include "re2/testing/regexp_generator.h" |
17 | #include "re2/testing/string_generator.h" |
18 | |
19 | namespace re2 { |
20 | |
21 | // Test that C++ strings are compared as uint8s, not int8s. |
22 | // PossibleMatchRange doesn't depend on this, but callers probably will. |
23 | TEST(CplusplusStrings, EightBit) { |
24 | std::string s = "\x70" ; |
25 | std::string t = "\xA0" ; |
26 | EXPECT_LT(s, t); |
27 | } |
28 | |
29 | struct PrefixTest { |
30 | const char* regexp; |
31 | int maxlen; |
32 | const char* min; |
33 | const char* max; |
34 | }; |
35 | |
36 | static PrefixTest tests[] = { |
37 | { "" , 10, "" , "" , }, |
38 | { "Abcdef" , 10, "Abcdef" , "Abcdef" }, |
39 | { "abc(def|ghi)" , 10, "abcdef" , "abcghi" }, |
40 | { "a+hello" , 10, "aa" , "ahello" }, |
41 | { "a*hello" , 10, "a" , "hello" }, |
42 | { "def|abc" , 10, "abc" , "def" }, |
43 | { "a(b)(c)[d]" , 10, "abcd" , "abcd" }, |
44 | { "ab(cab|cat)" , 10, "abcab" , "abcat" }, |
45 | { "ab(cab|ca)x" , 10, "abcabx" , "abcax" }, |
46 | { "(ab|x)(c|de)" , 10, "abc" , "xde" }, |
47 | { "(ab|x)?(c|z)?" , 10, "" , "z" }, |
48 | { "[^\\s\\S]" , 10, "" , "" }, |
49 | { "(abc)+" , 5, "abc" , "abcac" }, |
50 | { "(abc)+" , 2, "ab" , "ac" }, |
51 | { "(abc)+" , 1, "a" , "b" }, |
52 | { "[a\xC3\xA1]" , 4, "a" , "\xC3\xA1" }, |
53 | { "a*" , 10, "" , "ab" }, |
54 | |
55 | { "(?i)Abcdef" , 10, "ABCDEF" , "abcdef" }, |
56 | { "(?i)abc(def|ghi)" , 10, "ABCDEF" , "abcghi" }, |
57 | { "(?i)a+hello" , 10, "AA" , "ahello" }, |
58 | { "(?i)a*hello" , 10, "A" , "hello" }, |
59 | { "(?i)def|abc" , 10, "ABC" , "def" }, |
60 | { "(?i)a(b)(c)[d]" , 10, "ABCD" , "abcd" }, |
61 | { "(?i)ab(cab|cat)" , 10, "ABCAB" , "abcat" }, |
62 | { "(?i)ab(cab|ca)x" , 10, "ABCABX" , "abcax" }, |
63 | { "(?i)(ab|x)(c|de)" , 10, "ABC" , "xde" }, |
64 | { "(?i)(ab|x)?(c|z)?" , 10, "" , "z" }, |
65 | { "(?i)[^\\s\\S]" , 10, "" , "" }, |
66 | { "(?i)(abc)+" , 5, "ABC" , "abcac" }, |
67 | { "(?i)(abc)+" , 2, "AB" , "ac" }, |
68 | { "(?i)(abc)+" , 1, "A" , "b" }, |
69 | { "(?i)[a\xC3\xA1]" , 4, "A" , "\xC3\xA1" }, |
70 | { "(?i)a*" , 10, "" , "ab" }, |
71 | { "(?i)A*" , 10, "" , "ab" }, |
72 | |
73 | { "\\AAbcdef" , 10, "Abcdef" , "Abcdef" }, |
74 | { "\\Aabc(def|ghi)" , 10, "abcdef" , "abcghi" }, |
75 | { "\\Aa+hello" , 10, "aa" , "ahello" }, |
76 | { "\\Aa*hello" , 10, "a" , "hello" }, |
77 | { "\\Adef|abc" , 10, "abc" , "def" }, |
78 | { "\\Aa(b)(c)[d]" , 10, "abcd" , "abcd" }, |
79 | { "\\Aab(cab|cat)" , 10, "abcab" , "abcat" }, |
80 | { "\\Aab(cab|ca)x" , 10, "abcabx" , "abcax" }, |
81 | { "\\A(ab|x)(c|de)" , 10, "abc" , "xde" }, |
82 | { "\\A(ab|x)?(c|z)?" , 10, "" , "z" }, |
83 | { "\\A[^\\s\\S]" , 10, "" , "" }, |
84 | { "\\A(abc)+" , 5, "abc" , "abcac" }, |
85 | { "\\A(abc)+" , 2, "ab" , "ac" }, |
86 | { "\\A(abc)+" , 1, "a" , "b" }, |
87 | { "\\A[a\xC3\xA1]" , 4, "a" , "\xC3\xA1" }, |
88 | { "\\Aa*" , 10, "" , "ab" }, |
89 | |
90 | { "(?i)\\AAbcdef" , 10, "ABCDEF" , "abcdef" }, |
91 | { "(?i)\\Aabc(def|ghi)" , 10, "ABCDEF" , "abcghi" }, |
92 | { "(?i)\\Aa+hello" , 10, "AA" , "ahello" }, |
93 | { "(?i)\\Aa*hello" , 10, "A" , "hello" }, |
94 | { "(?i)\\Adef|abc" , 10, "ABC" , "def" }, |
95 | { "(?i)\\Aa(b)(c)[d]" , 10, "ABCD" , "abcd" }, |
96 | { "(?i)\\Aab(cab|cat)" , 10, "ABCAB" , "abcat" }, |
97 | { "(?i)\\Aab(cab|ca)x" , 10, "ABCABX" , "abcax" }, |
98 | { "(?i)\\A(ab|x)(c|de)" , 10, "ABC" , "xde" }, |
99 | { "(?i)\\A(ab|x)?(c|z)?" , 10, "" , "z" }, |
100 | { "(?i)\\A[^\\s\\S]" , 10, "" , "" }, |
101 | { "(?i)\\A(abc)+" , 5, "ABC" , "abcac" }, |
102 | { "(?i)\\A(abc)+" , 2, "AB" , "ac" }, |
103 | { "(?i)\\A(abc)+" , 1, "A" , "b" }, |
104 | { "(?i)\\A[a\xC3\xA1]" , 4, "A" , "\xC3\xA1" }, |
105 | { "(?i)\\Aa*" , 10, "" , "ab" }, |
106 | { "(?i)\\AA*" , 10, "" , "ab" }, |
107 | }; |
108 | |
109 | TEST(PossibleMatchRange, HandWritten) { |
110 | for (size_t i = 0; i < arraysize(tests); i++) { |
111 | for (size_t j = 0; j < 2; j++) { |
112 | const PrefixTest& t = tests[i]; |
113 | std::string min, max; |
114 | if (j == 0) { |
115 | LOG(INFO) << "Checking regexp=" << CEscape(t.regexp); |
116 | Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL); |
117 | ASSERT_TRUE(re != NULL); |
118 | Prog* prog = re->CompileToProg(0); |
119 | ASSERT_TRUE(prog != NULL); |
120 | ASSERT_TRUE(prog->PossibleMatchRange(&min, &max, t.maxlen)) |
121 | << " " << t.regexp; |
122 | delete prog; |
123 | re->Decref(); |
124 | } else { |
125 | ASSERT_TRUE(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen)); |
126 | } |
127 | EXPECT_EQ(t.min, min) << t.regexp; |
128 | EXPECT_EQ(t.max, max) << t.regexp; |
129 | } |
130 | } |
131 | } |
132 | |
133 | // Test cases where PossibleMatchRange should return false. |
134 | TEST(PossibleMatchRange, Failures) { |
135 | std::string min, max; |
136 | |
137 | // Fails because no room to write max. |
138 | EXPECT_FALSE(RE2("abc" ).PossibleMatchRange(&min, &max, 0)); |
139 | |
140 | // Fails because there is no max -- any non-empty string matches |
141 | // or begins a match. Have to use Latin-1 input, because there |
142 | // are no valid UTF-8 strings beginning with byte 0xFF. |
143 | EXPECT_FALSE(RE2("[\\s\\S]+" , RE2::Latin1). |
144 | PossibleMatchRange(&min, &max, 10)) |
145 | << "min=" << CEscape(min) << ", max=" << CEscape(max); |
146 | EXPECT_FALSE(RE2("[\\0-\xFF]+" , RE2::Latin1). |
147 | PossibleMatchRange(&min, &max, 10)) |
148 | << "min=" << CEscape(min) << ", max=" << CEscape(max); |
149 | EXPECT_FALSE(RE2(".+hello" , RE2::Latin1). |
150 | PossibleMatchRange(&min, &max, 10)) |
151 | << "min=" << CEscape(min) << ", max=" << CEscape(max); |
152 | EXPECT_FALSE(RE2(".*hello" , RE2::Latin1). |
153 | PossibleMatchRange(&min, &max, 10)) |
154 | << "min=" << CEscape(min) << ", max=" << CEscape(max); |
155 | EXPECT_FALSE(RE2(".*" , RE2::Latin1). |
156 | PossibleMatchRange(&min, &max, 10)) |
157 | << "min=" << CEscape(min) << ", max=" << CEscape(max); |
158 | EXPECT_FALSE(RE2("\\C*" ). |
159 | PossibleMatchRange(&min, &max, 10)) |
160 | << "min=" << CEscape(min) << ", max=" << CEscape(max); |
161 | |
162 | // Fails because it's a malformed regexp. |
163 | EXPECT_FALSE(RE2("*hello" ).PossibleMatchRange(&min, &max, 10)) |
164 | << "min=" << CEscape(min) << ", max=" << CEscape(max); |
165 | } |
166 | |
167 | // Exhaustive test: generate all regexps within parameters, |
168 | // then generate all strings of a given length over a given alphabet, |
169 | // then check that the prefix information agrees with whether |
170 | // the regexp matches each of the strings. |
171 | class PossibleMatchTester : public RegexpGenerator { |
172 | public: |
173 | PossibleMatchTester(int maxatoms, |
174 | int maxops, |
175 | const std::vector<std::string>& alphabet, |
176 | const std::vector<std::string>& ops, |
177 | int maxstrlen, |
178 | const std::vector<std::string>& stralphabet) |
179 | : RegexpGenerator(maxatoms, maxops, alphabet, ops), |
180 | strgen_(maxstrlen, stralphabet), |
181 | regexps_(0), tests_(0) { } |
182 | |
183 | int regexps() { return regexps_; } |
184 | int tests() { return tests_; } |
185 | |
186 | // Needed for RegexpGenerator interface. |
187 | void HandleRegexp(const std::string& regexp); |
188 | |
189 | private: |
190 | StringGenerator strgen_; |
191 | |
192 | int regexps_; // Number of HandleRegexp calls |
193 | int tests_; // Number of regexp tests. |
194 | |
195 | PossibleMatchTester(const PossibleMatchTester&) = delete; |
196 | PossibleMatchTester& operator=(const PossibleMatchTester&) = delete; |
197 | }; |
198 | |
199 | // Processes a single generated regexp. |
200 | // Checks that all accepted strings agree with the prefix range. |
201 | void PossibleMatchTester::HandleRegexp(const std::string& regexp) { |
202 | regexps_++; |
203 | |
204 | VLOG(3) << CEscape(regexp); |
205 | |
206 | RE2 re(regexp, RE2::Latin1); |
207 | ASSERT_EQ(re.error(), "" ); |
208 | |
209 | std::string min, max; |
210 | if(!re.PossibleMatchRange(&min, &max, 10)) { |
211 | // There's no good max for "\\C*". Can't use strcmp |
212 | // because sometimes it gets embedded in more |
213 | // complicated expressions. |
214 | if(strstr(regexp.c_str(), "\\C*" )) |
215 | return; |
216 | LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp); |
217 | } |
218 | |
219 | strgen_.Reset(); |
220 | while (strgen_.HasNext()) { |
221 | const StringPiece& s = strgen_.Next(); |
222 | tests_++; |
223 | if (!RE2::FullMatch(s, re)) |
224 | continue; |
225 | ASSERT_GE(s, min) << " regexp: " << regexp << " max: " << max; |
226 | ASSERT_LE(s, max) << " regexp: " << regexp << " min: " << min; |
227 | } |
228 | } |
229 | |
230 | TEST(PossibleMatchRange, Exhaustive) { |
231 | int natom = 3; |
232 | int noperator = 3; |
233 | int stringlen = 5; |
234 | if (RE2_DEBUG_MODE) { |
235 | natom = 2; |
236 | noperator = 3; |
237 | stringlen = 3; |
238 | } |
239 | PossibleMatchTester t(natom, noperator, Split(" " , "a b [0-9]" ), |
240 | RegexpGenerator::EgrepOps(), |
241 | stringlen, Explode("ab4" )); |
242 | t.Generate(); |
243 | LOG(INFO) << t.regexps() << " regexps, " |
244 | << t.tests() << " tests" ; |
245 | } |
246 | |
247 | } // namespace re2 |
248 | |