1 | // Copyright 2006-2008 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | #include <stdint.h> |
6 | #include <string> |
7 | #include <thread> |
8 | #include <vector> |
9 | |
10 | #include "util/test.h" |
11 | #include "util/flags.h" |
12 | #include "util/logging.h" |
13 | #include "util/malloc_counter.h" |
14 | #include "util/strutil.h" |
15 | #include "re2/prog.h" |
16 | #include "re2/re2.h" |
17 | #include "re2/regexp.h" |
18 | #include "re2/testing/regexp_generator.h" |
19 | #include "re2/testing/string_generator.h" |
20 | |
21 | static const bool UsingMallocCounter = false; |
22 | |
23 | DEFINE_FLAG(int, size, 8, "log2(number of DFA nodes)" ); |
24 | DEFINE_FLAG(int, repeat, 2, "Repetition count." ); |
25 | DEFINE_FLAG(int, threads, 4, "number of threads" ); |
26 | |
27 | namespace re2 { |
28 | |
29 | // Check that multithreaded access to DFA class works. |
30 | |
31 | // Helper function: builds entire DFA for prog. |
32 | static void DoBuild(Prog* prog) { |
33 | ASSERT_TRUE(prog->BuildEntireDFA(Prog::kFirstMatch, nullptr)); |
34 | } |
35 | |
36 | TEST(Multithreaded, BuildEntireDFA) { |
37 | // Create regexp with 2^FLAGS_size states in DFA. |
38 | std::string s = "a" ; |
39 | for (int i = 0; i < GetFlag(FLAGS_size); i++) |
40 | s += "[ab]" ; |
41 | s += "b" ; |
42 | Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL); |
43 | ASSERT_TRUE(re != NULL); |
44 | |
45 | // Check that single-threaded code works. |
46 | { |
47 | Prog* prog = re->CompileToProg(0); |
48 | ASSERT_TRUE(prog != NULL); |
49 | |
50 | std::thread t(DoBuild, prog); |
51 | t.join(); |
52 | |
53 | delete prog; |
54 | } |
55 | |
56 | // Build the DFA simultaneously in a bunch of threads. |
57 | for (int i = 0; i < GetFlag(FLAGS_repeat); i++) { |
58 | Prog* prog = re->CompileToProg(0); |
59 | ASSERT_TRUE(prog != NULL); |
60 | |
61 | std::vector<std::thread> threads; |
62 | for (int j = 0; j < GetFlag(FLAGS_threads); j++) |
63 | threads.emplace_back(DoBuild, prog); |
64 | for (int j = 0; j < GetFlag(FLAGS_threads); j++) |
65 | threads[j].join(); |
66 | |
67 | // One more compile, to make sure everything is okay. |
68 | prog->BuildEntireDFA(Prog::kFirstMatch, nullptr); |
69 | delete prog; |
70 | } |
71 | |
72 | re->Decref(); |
73 | } |
74 | |
75 | // Check that DFA size requirements are followed. |
76 | // BuildEntireDFA will, like SearchDFA, stop building out |
77 | // the DFA once the memory limits are reached. |
78 | TEST(SingleThreaded, BuildEntireDFA) { |
79 | // Create regexp with 2^30 states in DFA. |
80 | Regexp* re = Regexp::Parse("a[ab]{30}b" , Regexp::LikePerl, NULL); |
81 | ASSERT_TRUE(re != NULL); |
82 | |
83 | for (int i = 17; i < 24; i++) { |
84 | int64_t limit = int64_t{1}<<i; |
85 | int64_t usage; |
86 | //int64_t progusage, dfamem; |
87 | { |
88 | testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY); |
89 | Prog* prog = re->CompileToProg(limit); |
90 | ASSERT_TRUE(prog != NULL); |
91 | //progusage = m.HeapGrowth(); |
92 | //dfamem = prog->dfa_mem(); |
93 | prog->BuildEntireDFA(Prog::kFirstMatch, nullptr); |
94 | prog->BuildEntireDFA(Prog::kLongestMatch, nullptr); |
95 | usage = m.HeapGrowth(); |
96 | delete prog; |
97 | } |
98 | if (UsingMallocCounter) { |
99 | //LOG(INFO) << "limit " << limit << ", " |
100 | // << "prog usage " << progusage << ", " |
101 | // << "DFA budget " << dfamem << ", " |
102 | // << "total " << usage; |
103 | // Tolerate +/- 10%. |
104 | ASSERT_GT(usage, limit*9/10); |
105 | ASSERT_LT(usage, limit*11/10); |
106 | } |
107 | } |
108 | re->Decref(); |
109 | } |
110 | |
111 | // Generates and returns a string over binary alphabet {0,1} that contains |
112 | // all possible binary sequences of length n as subsequences. The obvious |
113 | // brute force method would generate a string of length n * 2^n, but this |
114 | // generates a string of length n + 2^n - 1 called a De Bruijn cycle. |
115 | // See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17. |
116 | // Such a string is useful for testing a DFA. If you have a DFA |
117 | // where distinct last n bytes implies distinct states, then running on a |
118 | // DeBruijn string causes the DFA to need to create a new state at every |
119 | // position in the input, never reusing any states until it gets to the |
120 | // end of the string. This is the worst possible case for DFA execution. |
121 | static std::string DeBruijnString(int n) { |
122 | CHECK_LT(n, static_cast<int>(8*sizeof(int))); |
123 | CHECK_GT(n, 0); |
124 | |
125 | std::vector<bool> did(size_t{1}<<n); |
126 | for (int i = 0; i < 1<<n; i++) |
127 | did[i] = false; |
128 | |
129 | std::string s; |
130 | for (int i = 0; i < n-1; i++) |
131 | s.append("0" ); |
132 | int bits = 0; |
133 | int mask = (1<<n) - 1; |
134 | for (int i = 0; i < (1<<n); i++) { |
135 | bits <<= 1; |
136 | bits &= mask; |
137 | if (!did[bits|1]) { |
138 | bits |= 1; |
139 | s.append("1" ); |
140 | } else { |
141 | s.append("0" ); |
142 | } |
143 | CHECK(!did[bits]); |
144 | did[bits] = true; |
145 | } |
146 | return s; |
147 | } |
148 | |
149 | // Test that the DFA gets the right result even if it runs |
150 | // out of memory during a search. The regular expression |
151 | // 0[01]{n}$ matches a binary string of 0s and 1s only if |
152 | // the (n+1)th-to-last character is a 0. Matching this in |
153 | // a single forward pass (as done by the DFA) requires |
154 | // keeping one bit for each of the last n+1 characters |
155 | // (whether each was a 0), or 2^(n+1) possible states. |
156 | // If we run this regexp to search in a string that contains |
157 | // every possible n-character binary string as a substring, |
158 | // then it will have to run through at least 2^n states. |
159 | // States are big data structures -- certainly more than 1 byte -- |
160 | // so if the DFA can search correctly while staying within a |
161 | // 2^n byte limit, it must be handling out-of-memory conditions |
162 | // gracefully. |
163 | TEST(SingleThreaded, SearchDFA) { |
164 | // The De Bruijn string is the worst case input for this regexp. |
165 | // By default, the DFA will notice that it is flushing its cache |
166 | // too frequently and will bail out early, so that RE2 can use the |
167 | // NFA implementation instead. (The DFA loses its speed advantage |
168 | // if it can't get a good cache hit rate.) |
169 | // Tell the DFA to trudge along instead. |
170 | Prog::TEST_dfa_should_bail_when_slow(false); |
171 | |
172 | // Choice of n is mostly arbitrary, except that: |
173 | // * making n too big makes the test run for too long. |
174 | // * making n too small makes the DFA refuse to run, |
175 | // because it has so little memory compared to the program size. |
176 | // Empirically, n = 18 is a good compromise between the two. |
177 | const int n = 18; |
178 | |
179 | Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$" , n), |
180 | Regexp::LikePerl, NULL); |
181 | ASSERT_TRUE(re != NULL); |
182 | |
183 | // The De Bruijn string for n ends with a 1 followed by n 0s in a row, |
184 | // which is not a match for 0[01]{n}$. Adding one more 0 is a match. |
185 | std::string no_match = DeBruijnString(n); |
186 | std::string match = no_match + "0" ; |
187 | |
188 | int64_t usage; |
189 | int64_t peak_usage; |
190 | { |
191 | testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY); |
192 | Prog* prog = re->CompileToProg(1<<n); |
193 | ASSERT_TRUE(prog != NULL); |
194 | for (int i = 0; i < 10; i++) { |
195 | bool matched = false; |
196 | bool failed = false; |
197 | matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored, |
198 | Prog::kFirstMatch, NULL, &failed, NULL); |
199 | ASSERT_FALSE(failed); |
200 | ASSERT_TRUE(matched); |
201 | matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored, |
202 | Prog::kFirstMatch, NULL, &failed, NULL); |
203 | ASSERT_FALSE(failed); |
204 | ASSERT_FALSE(matched); |
205 | } |
206 | usage = m.HeapGrowth(); |
207 | peak_usage = m.PeakHeapGrowth(); |
208 | delete prog; |
209 | } |
210 | if (UsingMallocCounter) { |
211 | //LOG(INFO) << "usage " << usage << ", " |
212 | // << "peak usage " << peak_usage; |
213 | ASSERT_LT(usage, 1<<n); |
214 | ASSERT_LT(peak_usage, 1<<n); |
215 | } |
216 | re->Decref(); |
217 | |
218 | // Reset to original behaviour. |
219 | Prog::TEST_dfa_should_bail_when_slow(true); |
220 | } |
221 | |
222 | // Helper function: searches for match, which should match, |
223 | // and no_match, which should not. |
224 | static void DoSearch(Prog* prog, const StringPiece& match, |
225 | const StringPiece& no_match) { |
226 | for (int i = 0; i < 2; i++) { |
227 | bool matched = false; |
228 | bool failed = false; |
229 | matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored, |
230 | Prog::kFirstMatch, NULL, &failed, NULL); |
231 | ASSERT_FALSE(failed); |
232 | ASSERT_TRUE(matched); |
233 | matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored, |
234 | Prog::kFirstMatch, NULL, &failed, NULL); |
235 | ASSERT_FALSE(failed); |
236 | ASSERT_FALSE(matched); |
237 | } |
238 | } |
239 | |
240 | TEST(Multithreaded, SearchDFA) { |
241 | Prog::TEST_dfa_should_bail_when_slow(false); |
242 | |
243 | // Same as single-threaded test above. |
244 | const int n = 18; |
245 | Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$" , n), |
246 | Regexp::LikePerl, NULL); |
247 | ASSERT_TRUE(re != NULL); |
248 | std::string no_match = DeBruijnString(n); |
249 | std::string match = no_match + "0" ; |
250 | |
251 | // Check that single-threaded code works. |
252 | { |
253 | Prog* prog = re->CompileToProg(1<<n); |
254 | ASSERT_TRUE(prog != NULL); |
255 | |
256 | std::thread t(DoSearch, prog, match, no_match); |
257 | t.join(); |
258 | |
259 | delete prog; |
260 | } |
261 | |
262 | // Run the search simultaneously in a bunch of threads. |
263 | // Reuse same flags for Multithreaded.BuildDFA above. |
264 | for (int i = 0; i < GetFlag(FLAGS_repeat); i++) { |
265 | Prog* prog = re->CompileToProg(1<<n); |
266 | ASSERT_TRUE(prog != NULL); |
267 | |
268 | std::vector<std::thread> threads; |
269 | for (int j = 0; j < GetFlag(FLAGS_threads); j++) |
270 | threads.emplace_back(DoSearch, prog, match, no_match); |
271 | for (int j = 0; j < GetFlag(FLAGS_threads); j++) |
272 | threads[j].join(); |
273 | |
274 | delete prog; |
275 | } |
276 | |
277 | re->Decref(); |
278 | |
279 | // Reset to original behaviour. |
280 | Prog::TEST_dfa_should_bail_when_slow(true); |
281 | } |
282 | |
283 | struct ReverseTest { |
284 | const char* regexp; |
285 | const char* text; |
286 | bool match; |
287 | }; |
288 | |
289 | // Test that reverse DFA handles anchored/unanchored correctly. |
290 | // It's in the DFA interface but not used by RE2. |
291 | ReverseTest reverse_tests[] = { |
292 | { "\\A(a|b)" , "abc" , true }, |
293 | { "(a|b)\\z" , "cba" , true }, |
294 | { "\\A(a|b)" , "cba" , false }, |
295 | { "(a|b)\\z" , "abc" , false }, |
296 | }; |
297 | |
298 | TEST(DFA, ReverseMatch) { |
299 | int nfail = 0; |
300 | for (size_t i = 0; i < arraysize(reverse_tests); i++) { |
301 | const ReverseTest& t = reverse_tests[i]; |
302 | Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL); |
303 | ASSERT_TRUE(re != NULL); |
304 | Prog* prog = re->CompileToReverseProg(0); |
305 | ASSERT_TRUE(prog != NULL); |
306 | bool failed = false; |
307 | bool matched = prog->SearchDFA(t.text, StringPiece(), Prog::kUnanchored, |
308 | Prog::kFirstMatch, NULL, &failed, NULL); |
309 | if (matched != t.match) { |
310 | LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match; |
311 | nfail++; |
312 | } |
313 | delete prog; |
314 | re->Decref(); |
315 | } |
316 | EXPECT_EQ(nfail, 0); |
317 | } |
318 | |
319 | struct CallbackTest { |
320 | const char* regexp; |
321 | const char* dump; |
322 | }; |
323 | |
324 | // Test that DFA::BuildAllStates() builds the expected DFA states |
325 | // and issues the expected callbacks. These test cases reflect the |
326 | // very compact encoding of the callbacks, but that also makes them |
327 | // very difficult to understand, so let's work through "\\Aa\\z". |
328 | // There are three slots per DFA state because the bytemap has two |
329 | // equivalence classes and there is a third slot for kByteEndText: |
330 | // 0: all bytes that are not 'a' |
331 | // 1: the byte 'a' |
332 | // 2: kByteEndText |
333 | // -1 means that there is no transition from that DFA state to any |
334 | // other DFA state for that slot. The valid transitions are thus: |
335 | // state 0 --slot 1--> state 1 |
336 | // state 1 --slot 2--> state 2 |
337 | // The double brackets indicate that state 2 is a matching state. |
338 | // Putting it together, this means that the DFA must consume the |
339 | // byte 'a' and then hit end of text. Q.E.D. |
340 | CallbackTest callback_tests[] = { |
341 | { "\\Aa\\z" , "[-1,1,-1] [-1,-1,2] [[-1,-1,-1]]" }, |
342 | { "\\Aab\\z" , "[-1,1,-1,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" }, |
343 | { "\\Aa*b\\z" , "[-1,0,1,-1] [-1,-1,-1,2] [[-1,-1,-1,-1]]" }, |
344 | { "\\Aa+b\\z" , "[-1,1,-1,-1] [-1,1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" }, |
345 | { "\\Aa?b\\z" , "[-1,1,2,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" }, |
346 | { "\\Aa\\C*\\z" , "[-1,1,-1] [1,1,2] [[-1,-1,-1]]" }, |
347 | { "\\Aa\\C*" , "[-1,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" }, |
348 | { "a\\C*" , "[0,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" }, |
349 | { "\\C*" , "[1,2] [[1,1]] [[-1,-1]]" }, |
350 | { "a" , "[0,1,-1] [2,2,2] [[-1,-1,-1]]" } , |
351 | }; |
352 | |
353 | TEST(DFA, Callback) { |
354 | int nfail = 0; |
355 | for (size_t i = 0; i < arraysize(callback_tests); i++) { |
356 | const CallbackTest& t = callback_tests[i]; |
357 | Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL); |
358 | ASSERT_TRUE(re != NULL); |
359 | Prog* prog = re->CompileToProg(0); |
360 | ASSERT_TRUE(prog != NULL); |
361 | std::string dump; |
362 | prog->BuildEntireDFA(Prog::kLongestMatch, [&](const int* next, bool match) { |
363 | ASSERT_TRUE(next != NULL); |
364 | if (!dump.empty()) |
365 | dump += " " ; |
366 | dump += match ? "[[" : "[" ; |
367 | for (int b = 0; b < prog->bytemap_range() + 1; b++) |
368 | dump += StringPrintf("%d," , next[b]); |
369 | dump.pop_back(); |
370 | dump += match ? "]]" : "]" ; |
371 | }); |
372 | if (dump != t.dump) { |
373 | LOG(ERROR) << t.regexp << " bytemap:\n" << prog->DumpByteMap(); |
374 | LOG(ERROR) << t.regexp << " dump:\ngot " << dump << "\nwant " << t.dump; |
375 | nfail++; |
376 | } |
377 | delete prog; |
378 | re->Decref(); |
379 | } |
380 | EXPECT_EQ(nfail, 0); |
381 | } |
382 | |
383 | } // namespace re2 |
384 | |