1// Copyright 2009 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include <stddef.h>
6#include <algorithm>
7#include <memory>
8#include <string>
9#include <vector>
10
11#include "util/test.h"
12#include "util/logging.h"
13#include "re2/filtered_re2.h"
14#include "re2/re2.h"
15
16namespace re2 {
17
18struct FilterTestVars {
19 FilterTestVars() {}
20 explicit FilterTestVars(int min_atom_len) : f(min_atom_len) {}
21
22 std::vector<std::string> atoms;
23 std::vector<int> atom_indices;
24 std::vector<int> matches;
25 RE2::Options opts;
26 FilteredRE2 f;
27};
28
29TEST(FilteredRE2Test, EmptyTest) {
30 FilterTestVars v;
31
32 v.f.Compile(&v.atoms);
33 EXPECT_EQ(0, v.atoms.size());
34
35 // Compile has no effect at all when called before Add: it will not
36 // record that it has been called and it will not clear the vector.
37 // The second point does not matter here, but the first point means
38 // that an error will be logged during the call to AllMatches.
39 v.f.AllMatches("foo", v.atom_indices, &v.matches);
40 EXPECT_EQ(0, v.matches.size());
41}
42
43TEST(FilteredRE2Test, SmallOrTest) {
44 FilterTestVars v(4); // override the minimum atom length
45 int id;
46 v.f.Add("(foo|bar)", v.opts, &id);
47
48 v.f.Compile(&v.atoms);
49 EXPECT_EQ(0, v.atoms.size());
50
51 v.f.AllMatches("lemurs bar", v.atom_indices, &v.matches);
52 EXPECT_EQ(1, v.matches.size());
53 EXPECT_EQ(id, v.matches[0]);
54}
55
56TEST(FilteredRE2Test, SmallLatinTest) {
57 FilterTestVars v;
58 int id;
59
60 v.opts.set_encoding(RE2::Options::EncodingLatin1);
61 v.f.Add("\xde\xadQ\xbe\xef", v.opts, &id);
62 v.f.Compile(&v.atoms);
63 EXPECT_EQ(1, v.atoms.size());
64 EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef");
65
66 v.atom_indices.push_back(0);
67 v.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", v.atom_indices, &v.matches);
68 EXPECT_EQ(1, v.matches.size());
69 EXPECT_EQ(id, v.matches[0]);
70}
71
72struct AtomTest {
73 const char* testname;
74 // If any test needs more than this many regexps or atoms, increase
75 // the size of the corresponding array.
76 const char* regexps[20];
77 const char* atoms[20];
78};
79
80AtomTest atom_tests[] = {
81 {
82 // This test checks to make sure empty patterns are allowed.
83 "CheckEmptyPattern",
84 {""},
85 {}
86 }, {
87 // This test checks that all atoms of length greater than min length
88 // are found, and no atoms that are of smaller length are found.
89 "AllAtomsGtMinLengthFound", {
90 "(abc123|def456|ghi789).*mnop[x-z]+",
91 "abc..yyy..zz",
92 "mnmnpp[a-z]+PPP"
93 }, {
94 "abc123",
95 "def456",
96 "ghi789",
97 "mnop",
98 "abc",
99 "yyy",
100 "mnmnpp",
101 "ppp"
102 }
103 }, {
104 // Test to make sure that any atoms that have another atom as a
105 // substring in an OR are removed; that is, only the shortest
106 // substring is kept.
107 "SubstrAtomRemovesSuperStrInOr", {
108 "(abc123|abc|ghi789|abc1234).*[x-z]+",
109 "abcd..yyy..yyyzzz",
110 "mnmnpp[a-z]+PPP"
111 }, {
112 "abc",
113 "ghi789",
114 "abcd",
115 "yyy",
116 "yyyzzz",
117 "mnmnpp",
118 "ppp"
119 }
120 }, {
121 // Test character class expansion.
122 "CharClassExpansion", {
123 "m[a-c][d-f]n.*[x-z]+",
124 "[x-y]bcde[ab]"
125 }, {
126 "madn", "maen", "mafn",
127 "mbdn", "mben", "mbfn",
128 "mcdn", "mcen", "mcfn",
129 "xbcdea", "xbcdeb",
130 "ybcdea", "ybcdeb"
131 }
132 }, {
133 // Test upper/lower of non-ASCII.
134 "UnicodeLower", {
135 "(?i)ΔδΠϖπΣςσ",
136 "ΛΜΝΟΠ",
137 "ψρστυ",
138 }, {
139 "δδπππσσσ",
140 "λμνοπ",
141 "ψρστυ",
142 },
143 },
144};
145
146void AddRegexpsAndCompile(const char* regexps[],
147 size_t n,
148 struct FilterTestVars* v) {
149 for (size_t i = 0; i < n; i++) {
150 int id;
151 v->f.Add(regexps[i], v->opts, &id);
152 }
153 v->f.Compile(&v->atoms);
154}
155
156bool CheckExpectedAtoms(const char* atoms[],
157 size_t n,
158 const char* testname,
159 struct FilterTestVars* v) {
160 std::vector<std::string> expected;
161 for (size_t i = 0; i < n; i++)
162 expected.push_back(atoms[i]);
163
164 bool pass = expected.size() == v->atoms.size();
165
166 std::sort(v->atoms.begin(), v->atoms.end());
167 std::sort(expected.begin(), expected.end());
168 for (size_t i = 0; pass && i < n; i++)
169 pass = pass && expected[i] == v->atoms[i];
170
171 if (!pass) {
172 LOG(ERROR) << "Failed " << testname;
173 LOG(ERROR) << "Expected #atoms = " << expected.size();
174 for (size_t i = 0; i < expected.size(); i++)
175 LOG(ERROR) << expected[i];
176 LOG(ERROR) << "Found #atoms = " << v->atoms.size();
177 for (size_t i = 0; i < v->atoms.size(); i++)
178 LOG(ERROR) << v->atoms[i];
179 }
180
181 return pass;
182}
183
184TEST(FilteredRE2Test, AtomTests) {
185 int nfail = 0;
186 for (size_t i = 0; i < arraysize(atom_tests); i++) {
187 FilterTestVars v;
188 AtomTest* t = &atom_tests[i];
189 size_t nregexp, natom;
190 for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
191 if (t->regexps[nregexp] == NULL)
192 break;
193 for (natom = 0; natom < arraysize(t->atoms); natom++)
194 if (t->atoms[natom] == NULL)
195 break;
196 AddRegexpsAndCompile(t->regexps, nregexp, &v);
197 if (!CheckExpectedAtoms(t->atoms, natom, t->testname, &v))
198 nfail++;
199 }
200 EXPECT_EQ(0, nfail);
201}
202
203void FindAtomIndices(const std::vector<std::string>& atoms,
204 const std::vector<std::string>& matched_atoms,
205 std::vector<int>* atom_indices) {
206 atom_indices->clear();
207 for (size_t i = 0; i < matched_atoms.size(); i++) {
208 for (size_t j = 0; j < atoms.size(); j++) {
209 if (matched_atoms[i] == atoms[j]) {
210 atom_indices->push_back(static_cast<int>(j));
211 break;
212 }
213 }
214 }
215}
216
217TEST(FilteredRE2Test, MatchEmptyPattern) {
218 FilterTestVars v;
219 AtomTest* t = &atom_tests[0];
220 // We are using the regexps used in one of the atom tests
221 // for this test. Adding the EXPECT here to make sure
222 // the index we use for the test is for the correct test.
223 EXPECT_EQ("CheckEmptyPattern", std::string(t->testname));
224 size_t nregexp;
225 for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
226 if (t->regexps[nregexp] == NULL)
227 break;
228 AddRegexpsAndCompile(t->regexps, nregexp, &v);
229 std::string text = "0123";
230 std::vector<int> atom_ids;
231 std::vector<int> matching_regexps;
232 EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids));
233}
234
235TEST(FilteredRE2Test, MatchTests) {
236 FilterTestVars v;
237 AtomTest* t = &atom_tests[2];
238 // We are using the regexps used in one of the atom tests
239 // for this test.
240 EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", std::string(t->testname));
241 size_t nregexp;
242 for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
243 if (t->regexps[nregexp] == NULL)
244 break;
245 AddRegexpsAndCompile(t->regexps, nregexp, &v);
246
247 std::string text = "abc121212xyz";
248 // atoms = abc
249 std::vector<int> atom_ids;
250 std::vector<std::string> atoms;
251 atoms.push_back("abc");
252 FindAtomIndices(v.atoms, atoms, &atom_ids);
253 std::vector<int> matching_regexps;
254 v.f.AllMatches(text, atom_ids, &matching_regexps);
255 EXPECT_EQ(1, matching_regexps.size());
256
257 text = "abc12312yyyzzz";
258 atoms.clear();
259 atoms.push_back("abc");
260 atoms.push_back("yyy");
261 atoms.push_back("yyyzzz");
262 FindAtomIndices(v.atoms, atoms, &atom_ids);
263 v.f.AllMatches(text, atom_ids, &matching_regexps);
264 EXPECT_EQ(1, matching_regexps.size());
265
266 text = "abcd12yyy32yyyzzz";
267 atoms.clear();
268 atoms.push_back("abc");
269 atoms.push_back("abcd");
270 atoms.push_back("yyy");
271 atoms.push_back("yyyzzz");
272 FindAtomIndices(v.atoms, atoms, &atom_ids);
273 LOG(INFO) << "S: " << atom_ids.size();
274 for (size_t i = 0; i < atom_ids.size(); i++)
275 LOG(INFO) << "i: " << i << " : " << atom_ids[i];
276 v.f.AllMatches(text, atom_ids, &matching_regexps);
277 EXPECT_EQ(2, matching_regexps.size());
278}
279
280TEST(FilteredRE2Test, EmptyStringInStringSetBug) {
281 // Bug due to find() finding "" at the start of everything in a string
282 // set and thus SimplifyStringSet() would end up erasing everything.
283 // In order to test this, we have to keep PrefilterTree from discarding
284 // the OR entirely, so we have to make the minimum atom length zero.
285
286 FilterTestVars v(0); // override the minimum atom length
287 const char* regexps[] = {"-R.+(|ADD=;AA){12}}"};
288 const char* atoms[] = {"", "-r", "add=;aa", "}"};
289 AddRegexpsAndCompile(regexps, arraysize(regexps), &v);
290 EXPECT_TRUE(CheckExpectedAtoms(atoms, arraysize(atoms),
291 "EmptyStringInStringSetBug", &v));
292}
293
294} // namespace re2
295