1 | // Copyright 2009 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | #include <stddef.h> |
6 | #include <algorithm> |
7 | #include <memory> |
8 | #include <string> |
9 | #include <vector> |
10 | |
11 | #include "util/test.h" |
12 | #include "util/logging.h" |
13 | #include "re2/filtered_re2.h" |
14 | #include "re2/re2.h" |
15 | |
16 | namespace re2 { |
17 | |
18 | struct FilterTestVars { |
19 | FilterTestVars() {} |
20 | explicit FilterTestVars(int min_atom_len) : f(min_atom_len) {} |
21 | |
22 | std::vector<std::string> atoms; |
23 | std::vector<int> atom_indices; |
24 | std::vector<int> matches; |
25 | RE2::Options opts; |
26 | FilteredRE2 f; |
27 | }; |
28 | |
29 | TEST(FilteredRE2Test, EmptyTest) { |
30 | FilterTestVars v; |
31 | |
32 | v.f.Compile(&v.atoms); |
33 | EXPECT_EQ(0, v.atoms.size()); |
34 | |
35 | // Compile has no effect at all when called before Add: it will not |
36 | // record that it has been called and it will not clear the vector. |
37 | // The second point does not matter here, but the first point means |
38 | // that an error will be logged during the call to AllMatches. |
39 | v.f.AllMatches("foo" , v.atom_indices, &v.matches); |
40 | EXPECT_EQ(0, v.matches.size()); |
41 | } |
42 | |
43 | TEST(FilteredRE2Test, SmallOrTest) { |
44 | FilterTestVars v(4); // override the minimum atom length |
45 | int id; |
46 | v.f.Add("(foo|bar)" , v.opts, &id); |
47 | |
48 | v.f.Compile(&v.atoms); |
49 | EXPECT_EQ(0, v.atoms.size()); |
50 | |
51 | v.f.AllMatches("lemurs bar" , v.atom_indices, &v.matches); |
52 | EXPECT_EQ(1, v.matches.size()); |
53 | EXPECT_EQ(id, v.matches[0]); |
54 | } |
55 | |
56 | TEST(FilteredRE2Test, SmallLatinTest) { |
57 | FilterTestVars v; |
58 | int id; |
59 | |
60 | v.opts.set_encoding(RE2::Options::EncodingLatin1); |
61 | v.f.Add("\xde\xadQ\xbe\xef" , v.opts, &id); |
62 | v.f.Compile(&v.atoms); |
63 | EXPECT_EQ(1, v.atoms.size()); |
64 | EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef" ); |
65 | |
66 | v.atom_indices.push_back(0); |
67 | v.f.AllMatches("foo\xde\xadQ\xbe\xeflemur" , v.atom_indices, &v.matches); |
68 | EXPECT_EQ(1, v.matches.size()); |
69 | EXPECT_EQ(id, v.matches[0]); |
70 | } |
71 | |
72 | struct AtomTest { |
73 | const char* testname; |
74 | // If any test needs more than this many regexps or atoms, increase |
75 | // the size of the corresponding array. |
76 | const char* regexps[20]; |
77 | const char* atoms[20]; |
78 | }; |
79 | |
80 | AtomTest atom_tests[] = { |
81 | { |
82 | // This test checks to make sure empty patterns are allowed. |
83 | "CheckEmptyPattern" , |
84 | {"" }, |
85 | {} |
86 | }, { |
87 | // This test checks that all atoms of length greater than min length |
88 | // are found, and no atoms that are of smaller length are found. |
89 | "AllAtomsGtMinLengthFound" , { |
90 | "(abc123|def456|ghi789).*mnop[x-z]+" , |
91 | "abc..yyy..zz" , |
92 | "mnmnpp[a-z]+PPP" |
93 | }, { |
94 | "abc123" , |
95 | "def456" , |
96 | "ghi789" , |
97 | "mnop" , |
98 | "abc" , |
99 | "yyy" , |
100 | "mnmnpp" , |
101 | "ppp" |
102 | } |
103 | }, { |
104 | // Test to make sure that any atoms that have another atom as a |
105 | // substring in an OR are removed; that is, only the shortest |
106 | // substring is kept. |
107 | "SubstrAtomRemovesSuperStrInOr" , { |
108 | "(abc123|abc|ghi789|abc1234).*[x-z]+" , |
109 | "abcd..yyy..yyyzzz" , |
110 | "mnmnpp[a-z]+PPP" |
111 | }, { |
112 | "abc" , |
113 | "ghi789" , |
114 | "abcd" , |
115 | "yyy" , |
116 | "yyyzzz" , |
117 | "mnmnpp" , |
118 | "ppp" |
119 | } |
120 | }, { |
121 | // Test character class expansion. |
122 | "CharClassExpansion" , { |
123 | "m[a-c][d-f]n.*[x-z]+" , |
124 | "[x-y]bcde[ab]" |
125 | }, { |
126 | "madn" , "maen" , "mafn" , |
127 | "mbdn" , "mben" , "mbfn" , |
128 | "mcdn" , "mcen" , "mcfn" , |
129 | "xbcdea" , "xbcdeb" , |
130 | "ybcdea" , "ybcdeb" |
131 | } |
132 | }, { |
133 | // Test upper/lower of non-ASCII. |
134 | "UnicodeLower" , { |
135 | "(?i)ΔδΠϖπΣςσ" , |
136 | "ΛΜΝΟΠ" , |
137 | "ψρστυ" , |
138 | }, { |
139 | "δδπππσσσ" , |
140 | "λμνοπ" , |
141 | "ψρστυ" , |
142 | }, |
143 | }, |
144 | }; |
145 | |
146 | void AddRegexpsAndCompile(const char* regexps[], |
147 | size_t n, |
148 | struct FilterTestVars* v) { |
149 | for (size_t i = 0; i < n; i++) { |
150 | int id; |
151 | v->f.Add(regexps[i], v->opts, &id); |
152 | } |
153 | v->f.Compile(&v->atoms); |
154 | } |
155 | |
156 | bool CheckExpectedAtoms(const char* atoms[], |
157 | size_t n, |
158 | const char* testname, |
159 | struct FilterTestVars* v) { |
160 | std::vector<std::string> expected; |
161 | for (size_t i = 0; i < n; i++) |
162 | expected.push_back(atoms[i]); |
163 | |
164 | bool pass = expected.size() == v->atoms.size(); |
165 | |
166 | std::sort(v->atoms.begin(), v->atoms.end()); |
167 | std::sort(expected.begin(), expected.end()); |
168 | for (size_t i = 0; pass && i < n; i++) |
169 | pass = pass && expected[i] == v->atoms[i]; |
170 | |
171 | if (!pass) { |
172 | LOG(ERROR) << "Failed " << testname; |
173 | LOG(ERROR) << "Expected #atoms = " << expected.size(); |
174 | for (size_t i = 0; i < expected.size(); i++) |
175 | LOG(ERROR) << expected[i]; |
176 | LOG(ERROR) << "Found #atoms = " << v->atoms.size(); |
177 | for (size_t i = 0; i < v->atoms.size(); i++) |
178 | LOG(ERROR) << v->atoms[i]; |
179 | } |
180 | |
181 | return pass; |
182 | } |
183 | |
184 | TEST(FilteredRE2Test, AtomTests) { |
185 | int nfail = 0; |
186 | for (size_t i = 0; i < arraysize(atom_tests); i++) { |
187 | FilterTestVars v; |
188 | AtomTest* t = &atom_tests[i]; |
189 | size_t nregexp, natom; |
190 | for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) |
191 | if (t->regexps[nregexp] == NULL) |
192 | break; |
193 | for (natom = 0; natom < arraysize(t->atoms); natom++) |
194 | if (t->atoms[natom] == NULL) |
195 | break; |
196 | AddRegexpsAndCompile(t->regexps, nregexp, &v); |
197 | if (!CheckExpectedAtoms(t->atoms, natom, t->testname, &v)) |
198 | nfail++; |
199 | } |
200 | EXPECT_EQ(0, nfail); |
201 | } |
202 | |
203 | void FindAtomIndices(const std::vector<std::string>& atoms, |
204 | const std::vector<std::string>& matched_atoms, |
205 | std::vector<int>* atom_indices) { |
206 | atom_indices->clear(); |
207 | for (size_t i = 0; i < matched_atoms.size(); i++) { |
208 | for (size_t j = 0; j < atoms.size(); j++) { |
209 | if (matched_atoms[i] == atoms[j]) { |
210 | atom_indices->push_back(static_cast<int>(j)); |
211 | break; |
212 | } |
213 | } |
214 | } |
215 | } |
216 | |
217 | TEST(FilteredRE2Test, MatchEmptyPattern) { |
218 | FilterTestVars v; |
219 | AtomTest* t = &atom_tests[0]; |
220 | // We are using the regexps used in one of the atom tests |
221 | // for this test. Adding the EXPECT here to make sure |
222 | // the index we use for the test is for the correct test. |
223 | EXPECT_EQ("CheckEmptyPattern" , std::string(t->testname)); |
224 | size_t nregexp; |
225 | for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) |
226 | if (t->regexps[nregexp] == NULL) |
227 | break; |
228 | AddRegexpsAndCompile(t->regexps, nregexp, &v); |
229 | std::string text = "0123" ; |
230 | std::vector<int> atom_ids; |
231 | std::vector<int> matching_regexps; |
232 | EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids)); |
233 | } |
234 | |
235 | TEST(FilteredRE2Test, MatchTests) { |
236 | FilterTestVars v; |
237 | AtomTest* t = &atom_tests[2]; |
238 | // We are using the regexps used in one of the atom tests |
239 | // for this test. |
240 | EXPECT_EQ("SubstrAtomRemovesSuperStrInOr" , std::string(t->testname)); |
241 | size_t nregexp; |
242 | for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) |
243 | if (t->regexps[nregexp] == NULL) |
244 | break; |
245 | AddRegexpsAndCompile(t->regexps, nregexp, &v); |
246 | |
247 | std::string text = "abc121212xyz" ; |
248 | // atoms = abc |
249 | std::vector<int> atom_ids; |
250 | std::vector<std::string> atoms; |
251 | atoms.push_back("abc" ); |
252 | FindAtomIndices(v.atoms, atoms, &atom_ids); |
253 | std::vector<int> matching_regexps; |
254 | v.f.AllMatches(text, atom_ids, &matching_regexps); |
255 | EXPECT_EQ(1, matching_regexps.size()); |
256 | |
257 | text = "abc12312yyyzzz" ; |
258 | atoms.clear(); |
259 | atoms.push_back("abc" ); |
260 | atoms.push_back("yyy" ); |
261 | atoms.push_back("yyyzzz" ); |
262 | FindAtomIndices(v.atoms, atoms, &atom_ids); |
263 | v.f.AllMatches(text, atom_ids, &matching_regexps); |
264 | EXPECT_EQ(1, matching_regexps.size()); |
265 | |
266 | text = "abcd12yyy32yyyzzz" ; |
267 | atoms.clear(); |
268 | atoms.push_back("abc" ); |
269 | atoms.push_back("abcd" ); |
270 | atoms.push_back("yyy" ); |
271 | atoms.push_back("yyyzzz" ); |
272 | FindAtomIndices(v.atoms, atoms, &atom_ids); |
273 | LOG(INFO) << "S: " << atom_ids.size(); |
274 | for (size_t i = 0; i < atom_ids.size(); i++) |
275 | LOG(INFO) << "i: " << i << " : " << atom_ids[i]; |
276 | v.f.AllMatches(text, atom_ids, &matching_regexps); |
277 | EXPECT_EQ(2, matching_regexps.size()); |
278 | } |
279 | |
280 | TEST(FilteredRE2Test, EmptyStringInStringSetBug) { |
281 | // Bug due to find() finding "" at the start of everything in a string |
282 | // set and thus SimplifyStringSet() would end up erasing everything. |
283 | // In order to test this, we have to keep PrefilterTree from discarding |
284 | // the OR entirely, so we have to make the minimum atom length zero. |
285 | |
286 | FilterTestVars v(0); // override the minimum atom length |
287 | const char* regexps[] = {"-R.+(|ADD=;AA){12}}" }; |
288 | const char* atoms[] = {"" , "-r" , "add=;aa" , "}" }; |
289 | AddRegexpsAndCompile(regexps, arraysize(regexps), &v); |
290 | EXPECT_TRUE(CheckExpectedAtoms(atoms, arraysize(atoms), |
291 | "EmptyStringInStringSetBug" , &v)); |
292 | } |
293 | |
294 | } // namespace re2 |
295 | |