1// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include <stdint.h>
6#include <string>
7#include <thread>
8#include <vector>
9
10#include "util/test.h"
11#include "util/flags.h"
12#include "util/logging.h"
13#include "util/malloc_counter.h"
14#include "util/strutil.h"
15#include "re2/prog.h"
16#include "re2/re2.h"
17#include "re2/regexp.h"
18#include "re2/testing/regexp_generator.h"
19#include "re2/testing/string_generator.h"
20
21static const bool UsingMallocCounter = false;
22
23DEFINE_FLAG(int, size, 8, "log2(number of DFA nodes)");
24DEFINE_FLAG(int, repeat, 2, "Repetition count.");
25DEFINE_FLAG(int, threads, 4, "number of threads");
26
27namespace re2 {
28
29// Check that multithreaded access to DFA class works.
30
31// Helper function: builds entire DFA for prog.
32static void DoBuild(Prog* prog) {
33 ASSERT_TRUE(prog->BuildEntireDFA(Prog::kFirstMatch, nullptr));
34}
35
36TEST(Multithreaded, BuildEntireDFA) {
37 // Create regexp with 2^FLAGS_size states in DFA.
38 std::string s = "a";
39 for (int i = 0; i < GetFlag(FLAGS_size); i++)
40 s += "[ab]";
41 s += "b";
42 Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL);
43 ASSERT_TRUE(re != NULL);
44
45 // Check that single-threaded code works.
46 {
47 Prog* prog = re->CompileToProg(0);
48 ASSERT_TRUE(prog != NULL);
49
50 std::thread t(DoBuild, prog);
51 t.join();
52
53 delete prog;
54 }
55
56 // Build the DFA simultaneously in a bunch of threads.
57 for (int i = 0; i < GetFlag(FLAGS_repeat); i++) {
58 Prog* prog = re->CompileToProg(0);
59 ASSERT_TRUE(prog != NULL);
60
61 std::vector<std::thread> threads;
62 for (int j = 0; j < GetFlag(FLAGS_threads); j++)
63 threads.emplace_back(DoBuild, prog);
64 for (int j = 0; j < GetFlag(FLAGS_threads); j++)
65 threads[j].join();
66
67 // One more compile, to make sure everything is okay.
68 prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
69 delete prog;
70 }
71
72 re->Decref();
73}
74
75// Check that DFA size requirements are followed.
76// BuildEntireDFA will, like SearchDFA, stop building out
77// the DFA once the memory limits are reached.
78TEST(SingleThreaded, BuildEntireDFA) {
79 // Create regexp with 2^30 states in DFA.
80 Regexp* re = Regexp::Parse("a[ab]{30}b", Regexp::LikePerl, NULL);
81 ASSERT_TRUE(re != NULL);
82
83 for (int i = 17; i < 24; i++) {
84 int64_t limit = int64_t{1}<<i;
85 int64_t usage;
86 //int64_t progusage, dfamem;
87 {
88 testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
89 Prog* prog = re->CompileToProg(limit);
90 ASSERT_TRUE(prog != NULL);
91 //progusage = m.HeapGrowth();
92 //dfamem = prog->dfa_mem();
93 prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
94 prog->BuildEntireDFA(Prog::kLongestMatch, nullptr);
95 usage = m.HeapGrowth();
96 delete prog;
97 }
98 if (UsingMallocCounter) {
99 //LOG(INFO) << "limit " << limit << ", "
100 // << "prog usage " << progusage << ", "
101 // << "DFA budget " << dfamem << ", "
102 // << "total " << usage;
103 // Tolerate +/- 10%.
104 ASSERT_GT(usage, limit*9/10);
105 ASSERT_LT(usage, limit*11/10);
106 }
107 }
108 re->Decref();
109}
110
111// Generates and returns a string over binary alphabet {0,1} that contains
112// all possible binary sequences of length n as subsequences. The obvious
113// brute force method would generate a string of length n * 2^n, but this
114// generates a string of length n + 2^n - 1 called a De Bruijn cycle.
115// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
116// Such a string is useful for testing a DFA. If you have a DFA
117// where distinct last n bytes implies distinct states, then running on a
118// DeBruijn string causes the DFA to need to create a new state at every
119// position in the input, never reusing any states until it gets to the
120// end of the string. This is the worst possible case for DFA execution.
121static std::string DeBruijnString(int n) {
122 CHECK_LT(n, static_cast<int>(8*sizeof(int)));
123 CHECK_GT(n, 0);
124
125 std::vector<bool> did(size_t{1}<<n);
126 for (int i = 0; i < 1<<n; i++)
127 did[i] = false;
128
129 std::string s;
130 for (int i = 0; i < n-1; i++)
131 s.append("0");
132 int bits = 0;
133 int mask = (1<<n) - 1;
134 for (int i = 0; i < (1<<n); i++) {
135 bits <<= 1;
136 bits &= mask;
137 if (!did[bits|1]) {
138 bits |= 1;
139 s.append("1");
140 } else {
141 s.append("0");
142 }
143 CHECK(!did[bits]);
144 did[bits] = true;
145 }
146 return s;
147}
148
149// Test that the DFA gets the right result even if it runs
150// out of memory during a search. The regular expression
151// 0[01]{n}$ matches a binary string of 0s and 1s only if
152// the (n+1)th-to-last character is a 0. Matching this in
153// a single forward pass (as done by the DFA) requires
154// keeping one bit for each of the last n+1 characters
155// (whether each was a 0), or 2^(n+1) possible states.
156// If we run this regexp to search in a string that contains
157// every possible n-character binary string as a substring,
158// then it will have to run through at least 2^n states.
159// States are big data structures -- certainly more than 1 byte --
160// so if the DFA can search correctly while staying within a
161// 2^n byte limit, it must be handling out-of-memory conditions
162// gracefully.
163TEST(SingleThreaded, SearchDFA) {
164 // The De Bruijn string is the worst case input for this regexp.
165 // By default, the DFA will notice that it is flushing its cache
166 // too frequently and will bail out early, so that RE2 can use the
167 // NFA implementation instead. (The DFA loses its speed advantage
168 // if it can't get a good cache hit rate.)
169 // Tell the DFA to trudge along instead.
170 Prog::TEST_dfa_should_bail_when_slow(false);
171
172 // Choice of n is mostly arbitrary, except that:
173 // * making n too big makes the test run for too long.
174 // * making n too small makes the DFA refuse to run,
175 // because it has so little memory compared to the program size.
176 // Empirically, n = 18 is a good compromise between the two.
177 const int n = 18;
178
179 Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
180 Regexp::LikePerl, NULL);
181 ASSERT_TRUE(re != NULL);
182
183 // The De Bruijn string for n ends with a 1 followed by n 0s in a row,
184 // which is not a match for 0[01]{n}$. Adding one more 0 is a match.
185 std::string no_match = DeBruijnString(n);
186 std::string match = no_match + "0";
187
188 int64_t usage;
189 int64_t peak_usage;
190 {
191 testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
192 Prog* prog = re->CompileToProg(1<<n);
193 ASSERT_TRUE(prog != NULL);
194 for (int i = 0; i < 10; i++) {
195 bool matched = false;
196 bool failed = false;
197 matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored,
198 Prog::kFirstMatch, NULL, &failed, NULL);
199 ASSERT_FALSE(failed);
200 ASSERT_TRUE(matched);
201 matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored,
202 Prog::kFirstMatch, NULL, &failed, NULL);
203 ASSERT_FALSE(failed);
204 ASSERT_FALSE(matched);
205 }
206 usage = m.HeapGrowth();
207 peak_usage = m.PeakHeapGrowth();
208 delete prog;
209 }
210 if (UsingMallocCounter) {
211 //LOG(INFO) << "usage " << usage << ", "
212 // << "peak usage " << peak_usage;
213 ASSERT_LT(usage, 1<<n);
214 ASSERT_LT(peak_usage, 1<<n);
215 }
216 re->Decref();
217
218 // Reset to original behaviour.
219 Prog::TEST_dfa_should_bail_when_slow(true);
220}
221
222// Helper function: searches for match, which should match,
223// and no_match, which should not.
224static void DoSearch(Prog* prog, const StringPiece& match,
225 const StringPiece& no_match) {
226 for (int i = 0; i < 2; i++) {
227 bool matched = false;
228 bool failed = false;
229 matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored,
230 Prog::kFirstMatch, NULL, &failed, NULL);
231 ASSERT_FALSE(failed);
232 ASSERT_TRUE(matched);
233 matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored,
234 Prog::kFirstMatch, NULL, &failed, NULL);
235 ASSERT_FALSE(failed);
236 ASSERT_FALSE(matched);
237 }
238}
239
240TEST(Multithreaded, SearchDFA) {
241 Prog::TEST_dfa_should_bail_when_slow(false);
242
243 // Same as single-threaded test above.
244 const int n = 18;
245 Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
246 Regexp::LikePerl, NULL);
247 ASSERT_TRUE(re != NULL);
248 std::string no_match = DeBruijnString(n);
249 std::string match = no_match + "0";
250
251 // Check that single-threaded code works.
252 {
253 Prog* prog = re->CompileToProg(1<<n);
254 ASSERT_TRUE(prog != NULL);
255
256 std::thread t(DoSearch, prog, match, no_match);
257 t.join();
258
259 delete prog;
260 }
261
262 // Run the search simultaneously in a bunch of threads.
263 // Reuse same flags for Multithreaded.BuildDFA above.
264 for (int i = 0; i < GetFlag(FLAGS_repeat); i++) {
265 Prog* prog = re->CompileToProg(1<<n);
266 ASSERT_TRUE(prog != NULL);
267
268 std::vector<std::thread> threads;
269 for (int j = 0; j < GetFlag(FLAGS_threads); j++)
270 threads.emplace_back(DoSearch, prog, match, no_match);
271 for (int j = 0; j < GetFlag(FLAGS_threads); j++)
272 threads[j].join();
273
274 delete prog;
275 }
276
277 re->Decref();
278
279 // Reset to original behaviour.
280 Prog::TEST_dfa_should_bail_when_slow(true);
281}
282
283struct ReverseTest {
284 const char* regexp;
285 const char* text;
286 bool match;
287};
288
289// Test that reverse DFA handles anchored/unanchored correctly.
290// It's in the DFA interface but not used by RE2.
291ReverseTest reverse_tests[] = {
292 { "\\A(a|b)", "abc", true },
293 { "(a|b)\\z", "cba", true },
294 { "\\A(a|b)", "cba", false },
295 { "(a|b)\\z", "abc", false },
296};
297
298TEST(DFA, ReverseMatch) {
299 int nfail = 0;
300 for (size_t i = 0; i < arraysize(reverse_tests); i++) {
301 const ReverseTest& t = reverse_tests[i];
302 Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
303 ASSERT_TRUE(re != NULL);
304 Prog* prog = re->CompileToReverseProg(0);
305 ASSERT_TRUE(prog != NULL);
306 bool failed = false;
307 bool matched = prog->SearchDFA(t.text, StringPiece(), Prog::kUnanchored,
308 Prog::kFirstMatch, NULL, &failed, NULL);
309 if (matched != t.match) {
310 LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
311 nfail++;
312 }
313 delete prog;
314 re->Decref();
315 }
316 EXPECT_EQ(nfail, 0);
317}
318
319struct CallbackTest {
320 const char* regexp;
321 const char* dump;
322};
323
324// Test that DFA::BuildAllStates() builds the expected DFA states
325// and issues the expected callbacks. These test cases reflect the
326// very compact encoding of the callbacks, but that also makes them
327// very difficult to understand, so let's work through "\\Aa\\z".
328// There are three slots per DFA state because the bytemap has two
329// equivalence classes and there is a third slot for kByteEndText:
330// 0: all bytes that are not 'a'
331// 1: the byte 'a'
332// 2: kByteEndText
333// -1 means that there is no transition from that DFA state to any
334// other DFA state for that slot. The valid transitions are thus:
335// state 0 --slot 1--> state 1
336// state 1 --slot 2--> state 2
337// The double brackets indicate that state 2 is a matching state.
338// Putting it together, this means that the DFA must consume the
339// byte 'a' and then hit end of text. Q.E.D.
340CallbackTest callback_tests[] = {
341 { "\\Aa\\z", "[-1,1,-1] [-1,-1,2] [[-1,-1,-1]]" },
342 { "\\Aab\\z", "[-1,1,-1,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
343 { "\\Aa*b\\z", "[-1,0,1,-1] [-1,-1,-1,2] [[-1,-1,-1,-1]]" },
344 { "\\Aa+b\\z", "[-1,1,-1,-1] [-1,1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
345 { "\\Aa?b\\z", "[-1,1,2,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
346 { "\\Aa\\C*\\z", "[-1,1,-1] [1,1,2] [[-1,-1,-1]]" },
347 { "\\Aa\\C*", "[-1,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
348 { "a\\C*", "[0,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
349 { "\\C*", "[1,2] [[1,1]] [[-1,-1]]" },
350 { "a", "[0,1,-1] [2,2,2] [[-1,-1,-1]]"} ,
351};
352
353TEST(DFA, Callback) {
354 int nfail = 0;
355 for (size_t i = 0; i < arraysize(callback_tests); i++) {
356 const CallbackTest& t = callback_tests[i];
357 Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
358 ASSERT_TRUE(re != NULL);
359 Prog* prog = re->CompileToProg(0);
360 ASSERT_TRUE(prog != NULL);
361 std::string dump;
362 prog->BuildEntireDFA(Prog::kLongestMatch, [&](const int* next, bool match) {
363 ASSERT_TRUE(next != NULL);
364 if (!dump.empty())
365 dump += " ";
366 dump += match ? "[[" : "[";
367 for (int b = 0; b < prog->bytemap_range() + 1; b++)
368 dump += StringPrintf("%d,", next[b]);
369 dump.pop_back();
370 dump += match ? "]]" : "]";
371 });
372 if (dump != t.dump) {
373 LOG(ERROR) << t.regexp << " bytemap:\n" << prog->DumpByteMap();
374 LOG(ERROR) << t.regexp << " dump:\ngot " << dump << "\nwant " << t.dump;
375 nfail++;
376 }
377 delete prog;
378 re->Decref();
379 }
380 EXPECT_EQ(nfail, 0);
381}
382
383} // namespace re2
384