dfa_test.cc source code [RE2/re2/testing/dfa_test.cc]

1	// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	#include <stdint.h>
6	#include <string>
7	#include <thread>
8	#include <vector>
9
10	#include "util/test.h"
11	#include "util/flags.h"
12	#include "util/logging.h"
13	#include "util/malloc_counter.h"
14	#include "util/strutil.h"
15	#include "re2/prog.h"
16	#include "re2/re2.h"
17	#include "re2/regexp.h"
18	#include "re2/testing/regexp_generator.h"
19	#include "re2/testing/string_generator.h"
20
21	static const bool UsingMallocCounter = false;
22
23	DEFINE_FLAG(int, size, `8`, "log2(number of DFA nodes)");
24	DEFINE_FLAG(int, repeat, `2`, "Repetition count.");
25	DEFINE_FLAG(int, threads, `4`, "number of threads");
26
27	namespace re2 {
28
29	// Check that multithreaded access to DFA class works.
30
31	// Helper function: builds entire DFA for prog.
32	static void DoBuild(Prog* prog) {
33	ASSERT_TRUE(prog->BuildEntireDFA(Prog::kFirstMatch, nullptr));
34	}
35
36	TEST(Multithreaded, BuildEntireDFA) {
37	// Create regexp with 2^FLAGS_size states in DFA.
38	std::string s = "a";
39	for (int i = `0`; i < GetFlag(FLAGS_size); i++)
40	s += "[ab]";
41	s += "b";
42	Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL);
43	ASSERT_TRUE(re != NULL);
44
45	// Check that single-threaded code works.
46	{
47	Prog* prog = re->CompileToProg(`0`);
48	ASSERT_TRUE(prog != NULL);
49
50	std::thread t(DoBuild, prog);
51	t.join();
52
53	delete prog;
54	}
55
56	// Build the DFA simultaneously in a bunch of threads.
57	for (int i = `0`; i < GetFlag(FLAGS_repeat); i++) {
58	Prog* prog = re->CompileToProg(`0`);
59	ASSERT_TRUE(prog != NULL);
60
61	std::vector<std::thread> threads;
62	for (int j = `0`; j < GetFlag(FLAGS_threads); j++)
63	threads.emplace_back(DoBuild, prog);
64	for (int j = `0`; j < GetFlag(FLAGS_threads); j++)
65	threads [j].join();
66
67	// One more compile, to make sure everything is okay.
68	prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
69	delete prog;
70	}
71
72	re->Decref();
73	}
74
75	// Check that DFA size requirements are followed.
76	// BuildEntireDFA will, like SearchDFA, stop building out
77	// the DFA once the memory limits are reached.
78	TEST(SingleThreaded, BuildEntireDFA) {
79	// Create regexp with 2^30 states in DFA.
80	Regexp* re = Regexp::Parse("a[ab]{30}b", Regexp::LikePerl, NULL);
81	ASSERT_TRUE(re != NULL);
82
83	for (int i = `17`; i < `24`; i++) {
84	int64_t limit = int64_t{`1`}<<i;
85	int64_t usage;
86	//int64_t progusage, dfamem;
87	{
88	testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
89	Prog* prog = re->CompileToProg(limit);
90	ASSERT_TRUE(prog != NULL);
91	//progusage = m.HeapGrowth();
92	//dfamem = prog->dfa_mem();
93	prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
94	prog->BuildEntireDFA(Prog::kLongestMatch, nullptr);
95	usage = m.HeapGrowth();
96	delete prog;
97	}
98	if (UsingMallocCounter) {
99	//LOG(INFO) << "limit " << limit << ", "
100	// << "prog usage " << progusage << ", "
101	// << "DFA budget " << dfamem << ", "
102	// << "total " << usage;
103	// Tolerate +/- 10%.
104	ASSERT_GT(usage, limit*`9`/`10`);
105	ASSERT_LT(usage, limit*`11`/`10`);
106	}
107	}
108	re->Decref();
109	}
110
111	// Generates and returns a string over binary alphabet {0,1} that contains
112	// all possible binary sequences of length n as subsequences. The obvious
113	// brute force method would generate a string of length n 2^n, but this*
114	// generates a string of length n + 2^n - 1 called a De Bruijn cycle.
115	// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
116	// Such a string is useful for testing a DFA. If you have a DFA
117	// where distinct last n bytes implies distinct states, then running on a
118	// DeBruijn string causes the DFA to need to create a new state at every
119	// position in the input, never reusing any states until it gets to the
120	// end of the string. This is the worst possible case for DFA execution.
121	static std::string DeBruijnString(int n) {
122	CHECK_LT(n, static_cast<int>(`8`*sizeof(int)));
123	CHECK_GT(n, `0`);
124
125	std::vector<bool> did(size_t{`1`}<<n);
126	for (int i = `0`; i < `1`<<n; i++)
127	did [i] = false;
128
129	std::string s;
130	for (int i = `0`; i < n-`1`; i++)
131	s.append("0");
132	int bits = `0`;
133	int mask = (`1`<<n) - `1`;
134	for (int i = `0`; i < (`1`<<n); i++) {
135	bits <<= `1`;
136	bits &= mask;
137	if (!did [bits\|`1`]) {
138	bits \|= `1`;
139	s.append("1");
140	} else {
141	s.append("0");
142	}
143	CHECK(!did[bits]);
144	did [bits] = true;
145	}
146	return s;
147	}
148
149	// Test that the DFA gets the right result even if it runs
150	// out of memory during a search. The regular expression
151	// 0[01]{n}$ matches a binary string of 0s and 1s only if
152	// the (n+1)th-to-last character is a 0. Matching this in
153	// a single forward pass (as done by the DFA) requires
154	// keeping one bit for each of the last n+1 characters
155	// (whether each was a 0), or 2^(n+1) possible states.
156	// If we run this regexp to search in a string that contains
157	// every possible n-character binary string as a substring,
158	// then it will have to run through at least 2^n states.
159	// States are big data structures -- certainly more than 1 byte --
160	// so if the DFA can search correctly while staying within a
161	// 2^n byte limit, it must be handling out-of-memory conditions
162	// gracefully.
163	TEST(SingleThreaded, SearchDFA) {
164	// The De Bruijn string is the worst case input for this regexp.
165	// By default, the DFA will notice that it is flushing its cache
166	// too frequently and will bail out early, so that RE2 can use the
167	// NFA implementation instead. (The DFA loses its speed advantage
168	// if it can't get a good cache hit rate.)
169	// Tell the DFA to trudge along instead.
170	Prog::TEST_dfa_should_bail_when_slow(false);
171
172	// Choice of n is mostly arbitrary, except that:
173	// making n too big makes the test run for too long.*
174	// making n too small makes the DFA refuse to run,*
175	// because it has so little memory compared to the program size.
176	// Empirically, n = 18 is a good compromise between the two.
177	const int n = `18`;
178
179	Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
180	Regexp::LikePerl, NULL);
181	ASSERT_TRUE(re != NULL);
182
183	// The De Bruijn string for n ends with a 1 followed by n 0s in a row,
184	// which is not a match for 0[01]{n}$. Adding one more 0 is a match.
185	std::string no_match = DeBruijnString(n);
186	std::string match = no_match + "0";
187
188	int64_t usage;
189	int64_t peak_usage;
190	{
191	testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
192	Prog* prog = re->CompileToProg(`1`<<n);
193	ASSERT_TRUE(prog != NULL);
194	for (int i = `0`; i < `10`; i++) {
195	bool matched = false;
196	bool failed = false;
197	matched = prog->SearchDFA(match, StringPiece (), Prog::kUnanchored,
198	Prog::kFirstMatch, NULL, &failed, NULL);
199	ASSERT_FALSE(failed);
200	ASSERT_TRUE(matched);
201	matched = prog->SearchDFA(no_match, StringPiece (), Prog::kUnanchored,
202	Prog::kFirstMatch, NULL, &failed, NULL);
203	ASSERT_FALSE(failed);
204	ASSERT_FALSE(matched);
205	}
206	usage = m.HeapGrowth();
207	peak_usage = m.PeakHeapGrowth();
208	delete prog;
209	}
210	if (UsingMallocCounter) {
211	//LOG(INFO) << "usage " << usage << ", "
212	// << "peak usage " << peak_usage;
213	ASSERT_LT(usage, `1`<<n);
214	ASSERT_LT(peak_usage, `1`<<n);
215	}
216	re->Decref();
217
218	// Reset to original behaviour.
219	Prog::TEST_dfa_should_bail_when_slow(true);
220	}
221
222	// Helper function: searches for match, which should match,
223	// and no_match, which should not.
224	static void DoSearch(Prog* prog, const StringPiece& match,
225	const StringPiece& no_match) {
226	for (int i = `0`; i < `2`; i++) {
227	bool matched = false;
228	bool failed = false;
229	matched = prog->SearchDFA(match, StringPiece (), Prog::kUnanchored,
230	Prog::kFirstMatch, NULL, &failed, NULL);
231	ASSERT_FALSE(failed);
232	ASSERT_TRUE(matched);
233	matched = prog->SearchDFA(no_match, StringPiece (), Prog::kUnanchored,
234	Prog::kFirstMatch, NULL, &failed, NULL);
235	ASSERT_FALSE(failed);
236	ASSERT_FALSE(matched);
237	}
238	}
239
240	TEST(Multithreaded, SearchDFA) {
241	Prog::TEST_dfa_should_bail_when_slow(false);
242
243	// Same as single-threaded test above.
244	const int n = `18`;
245	Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
246	Regexp::LikePerl, NULL);
247	ASSERT_TRUE(re != NULL);
248	std::string no_match = DeBruijnString(n);
249	std::string match = no_match + "0";
250
251	// Check that single-threaded code works.
252	{
253	Prog* prog = re->CompileToProg(`1`<<n);
254	ASSERT_TRUE(prog != NULL);
255
256	std::thread t(DoSearch, prog, match, no_match);
257	t.join();
258
259	delete prog;
260	}
261
262	// Run the search simultaneously in a bunch of threads.
263	// Reuse same flags for Multithreaded.BuildDFA above.
264	for (int i = `0`; i < GetFlag(FLAGS_repeat); i++) {
265	Prog* prog = re->CompileToProg(`1`<<n);
266	ASSERT_TRUE(prog != NULL);
267
268	std::vector<std::thread> threads;
269	for (int j = `0`; j < GetFlag(FLAGS_threads); j++)
270	threads.emplace_back(DoSearch, prog, match, no_match);
271	for (int j = `0`; j < GetFlag(FLAGS_threads); j++)
272	threads [j].join();
273
274	delete prog;
275	}
276
277	re->Decref();
278
279	// Reset to original behaviour.
280	Prog::TEST_dfa_should_bail_when_slow(true);
281	}
282
283	struct ReverseTest {
284	const char* regexp;
285	const char* text;
286	bool match;
287	};
288
289	// Test that reverse DFA handles anchored/unanchored correctly.
290	// It's in the DFA interface but not used by RE2.
291	ReverseTest reverse_tests[] = {
292	{ "\\A(a\|b)", "abc", true },
293	{ "(a\|b)\\z", "cba", true },
294	{ "\\A(a\|b)", "cba", false },
295	{ "(a\|b)\\z", "abc", false },
296	};
297
298	TEST(DFA, ReverseMatch) {
299	int nfail = `0`;
300	for (size_t i = `0`; i < arraysize(reverse_tests); i++) {
301	const ReverseTest& t = reverse_tests[i];
302	Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
303	ASSERT_TRUE(re != NULL);
304	Prog* prog = re->CompileToReverseProg(`0`);
305	ASSERT_TRUE(prog != NULL);
306	bool failed = false;
307	bool matched = prog->SearchDFA(t.text, StringPiece (), Prog::kUnanchored,
308	Prog::kFirstMatch, NULL, &failed, NULL);
309	if (matched != t.match) {
310	LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
311	nfail++;
312	}
313	delete prog;
314	re->Decref();
315	}
316	EXPECT_EQ(nfail, `0`);
317	}
318
319	struct CallbackTest {
320	const char* regexp;
321	const char* dump;
322	};
323
324	// Test that DFA::BuildAllStates() builds the expected DFA states
325	// and issues the expected callbacks. These test cases reflect the
326	// very compact encoding of the callbacks, but that also makes them
327	// very difficult to understand, so let's work through "\\Aa\\z".
328	// There are three slots per DFA state because the bytemap has two
329	// equivalence classes and there is a third slot for kByteEndText:
330	// 0: all bytes that are not 'a'
331	// 1: the byte 'a'
332	// 2: kByteEndText
333	// -1 means that there is no transition from that DFA state to any
334	// other DFA state for that slot. The valid transitions are thus:
335	// state 0 --slot 1--> state 1
336	// state 1 --slot 2--> state 2
337	// The double brackets indicate that state 2 is a matching state.
338	// Putting it together, this means that the DFA must consume the
339	// byte 'a' and then hit end of text. Q.E.D.
340	CallbackTest callback_tests[] = {
341	{ "\\Aa\\z", "[-1,1,-1] [-1,-1,2] [[-1,-1,-1]]" },
342	{ "\\Aab\\z", "[-1,1,-1,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
343	{ "\\Aa*b\\z", "[-1,0,1,-1] [-1,-1,-1,2] [[-1,-1,-1,-1]]" },
344	{ "\\Aa+b\\z", "[-1,1,-1,-1] [-1,1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
345	{ "\\Aa?b\\z", "[-1,1,2,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
346	{ "\\Aa\\C*\\z", "[-1,1,-1] [1,1,2] [[-1,-1,-1]]" },
347	{ "\\Aa\\C*", "[-1,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
348	{ "a\\C*", "[0,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
349	{ "\\C*", "[1,2] [[1,1]] [[-1,-1]]" },
350	{ "a", "[0,1,-1] [2,2,2] [[-1,-1,-1]]"} ,
351	};
352
353	TEST(DFA, Callback) {
354	int nfail = `0`;
355	for (size_t i = `0`; i < arraysize(callback_tests); i++) {
356	const CallbackTest& t = callback_tests[i];
357	Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
358	ASSERT_TRUE(re != NULL);
359	Prog* prog = re->CompileToProg(`0`);
360	ASSERT_TRUE(prog != NULL);
361	std::string dump;
362	prog->BuildEntireDFA(Prog::kLongestMatch, [&](const int* next, bool match) {
363	ASSERT_TRUE(next != NULL);
364	if (!dump.empty())
365	dump += " ";
366	dump += match ? "[[" : "[";
367	for (int b = `0`; b < prog->bytemap_range() + `1`; b++)
368	dump += StringPrintf("%d,", next[b]);
369	dump.pop_back();
370	dump += match ? "]]" : "]";
371	});
372	if (dump != t.dump) {
373	LOG(ERROR) << t.regexp << " bytemap:\n" << prog->DumpByteMap();
374	LOG(ERROR) << t.regexp << " dump:\ngot " << dump << "\nwant " << t.dump;
375	nfail++;
376	}
377	delete prog;
378	re->Decref();
379	}
380	EXPECT_EQ(nfail, `0`);
381	}
382
383	} // namespace re2
384

Browse the source code of RE2/re2/testing/dfa_test.cc