1// Copyright 2008 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Regular expression engine tester -- test all the implementations against each other.
6
7#include <stddef.h>
8#include <stdint.h>
9#include <string.h>
10#include <string>
11
12#include "util/util.h"
13#include "util/flags.h"
14#include "util/logging.h"
15#include "util/strutil.h"
16#include "re2/testing/tester.h"
17#include "re2/prog.h"
18#include "re2/re2.h"
19#include "re2/regexp.h"
20
21DEFINE_FLAG(bool, dump_prog, false, "dump regexp program");
22DEFINE_FLAG(bool, log_okay, false, "log successful runs");
23DEFINE_FLAG(bool, dump_rprog, false, "dump reversed regexp program");
24
25DEFINE_FLAG(int, max_regexp_failures, 100,
26 "maximum number of regexp test failures (-1 = unlimited)");
27
28DEFINE_FLAG(std::string, regexp_engines, "",
29 "pattern to select regexp engines to test");
30
31namespace re2 {
32
33enum {
34 kMaxSubmatch = 1+16, // $0...$16
35};
36
37const char* engine_names[kEngineMax] = {
38 "Backtrack",
39 "NFA",
40 "DFA",
41 "DFA1",
42 "OnePass",
43 "BitState",
44 "RE2",
45 "RE2a",
46 "RE2b",
47 "PCRE",
48};
49
50// Returns the name of the engine.
51static const char* EngineName(Engine e) {
52 CHECK_GE(e, 0);
53 CHECK_LT(e, arraysize(engine_names));
54 CHECK(engine_names[e] != NULL);
55 return engine_names[e];
56}
57
58// Returns bit mask of engines to use.
59static uint32_t Engines() {
60 static bool did_parse = false;
61 static uint32_t cached_engines = 0;
62
63 if (did_parse)
64 return cached_engines;
65
66 if (GetFlag(FLAGS_regexp_engines).empty()) {
67 cached_engines = ~0;
68 } else {
69 for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
70 if (GetFlag(FLAGS_regexp_engines).find(EngineName(i)) != std::string::npos)
71 cached_engines |= 1<<i;
72 }
73
74 if (cached_engines == 0)
75 LOG(INFO) << "Warning: no engines enabled.";
76 if (!UsingPCRE)
77 cached_engines &= ~(1<<kEnginePCRE);
78 for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
79 if (cached_engines & (1<<i))
80 LOG(INFO) << EngineName(i) << " enabled";
81 }
82
83 did_parse = true;
84 return cached_engines;
85}
86
87// The result of running a match.
88struct TestInstance::Result {
89 bool skipped; // test skipped: wasn't applicable
90 bool matched; // found a match
91 bool untrusted; // don't really trust the answer
92 bool have_submatch; // computed all submatch info
93 bool have_submatch0; // computed just submatch[0]
94 StringPiece submatch[kMaxSubmatch];
95};
96
97typedef TestInstance::Result Result;
98
99// Formats a single capture range s in text in the form (a,b)
100// where a and b are the starting and ending offsets of s in text.
101static std::string FormatCapture(const StringPiece& text,
102 const StringPiece& s) {
103 if (s.data() == NULL)
104 return "(?,?)";
105 return StringPrintf("(%td,%td)",
106 s.begin() - text.begin(),
107 s.end() - text.begin());
108}
109
110// Returns whether text contains non-ASCII (>= 0x80) bytes.
111static bool NonASCII(const StringPiece& text) {
112 for (size_t i = 0; i < text.size(); i++)
113 if ((uint8_t)text[i] >= 0x80)
114 return true;
115 return false;
116}
117
118// Returns string representation of match kind.
119static std::string FormatKind(Prog::MatchKind kind) {
120 switch (kind) {
121 case Prog::kFullMatch:
122 return "full match";
123 case Prog::kLongestMatch:
124 return "longest match";
125 case Prog::kFirstMatch:
126 return "first match";
127 case Prog::kManyMatch:
128 return "many match";
129 }
130 return "???";
131}
132
133// Returns string representation of anchor kind.
134static std::string FormatAnchor(Prog::Anchor anchor) {
135 switch (anchor) {
136 case Prog::kAnchored:
137 return "anchored";
138 case Prog::kUnanchored:
139 return "unanchored";
140 }
141 return "???";
142}
143
144struct ParseMode {
145 Regexp::ParseFlags parse_flags;
146 std::string desc;
147};
148
149static const Regexp::ParseFlags single_line =
150 Regexp::LikePerl;
151static const Regexp::ParseFlags multi_line =
152 static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
153
154static ParseMode parse_modes[] = {
155 { single_line, "single-line" },
156 { single_line|Regexp::Latin1, "single-line, latin1" },
157 { multi_line, "multiline" },
158 { multi_line|Regexp::NonGreedy, "multiline, nongreedy" },
159 { multi_line|Regexp::Latin1, "multiline, latin1" },
160};
161
162static std::string FormatMode(Regexp::ParseFlags flags) {
163 for (size_t i = 0; i < arraysize(parse_modes); i++)
164 if (parse_modes[i].parse_flags == flags)
165 return parse_modes[i].desc;
166 return StringPrintf("%#x", static_cast<uint32_t>(flags));
167}
168
169// Constructs and saves all the matching engines that
170// will be required for the given tests.
171TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
172 Regexp::ParseFlags flags)
173 : regexp_str_(regexp_str),
174 kind_(kind),
175 flags_(flags),
176 error_(false),
177 regexp_(NULL),
178 num_captures_(0),
179 prog_(NULL),
180 rprog_(NULL),
181 re_(NULL),
182 re2_(NULL) {
183
184 VLOG(1) << CEscape(regexp_str);
185
186 // Compile regexp to prog.
187 // Always required - needed for backtracking (reference implementation).
188 RegexpStatus status;
189 regexp_ = Regexp::Parse(regexp_str, flags, &status);
190 if (regexp_ == NULL) {
191 LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
192 << " mode: " << FormatMode(flags);
193 error_ = true;
194 return;
195 }
196 num_captures_ = regexp_->NumCaptures();
197 prog_ = regexp_->CompileToProg(0);
198 if (prog_ == NULL) {
199 LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
200 error_ = true;
201 return;
202 }
203 if (GetFlag(FLAGS_dump_prog)) {
204 LOG(INFO) << "Prog for "
205 << " regexp "
206 << CEscape(regexp_str_)
207 << " (" << FormatKind(kind_)
208 << ", " << FormatMode(flags_)
209 << ")\n"
210 << prog_->Dump();
211 }
212
213 // Compile regexp to reversed prog. Only needed for DFA engines.
214 if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
215 rprog_ = regexp_->CompileToReverseProg(0);
216 if (rprog_ == NULL) {
217 LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
218 error_ = true;
219 return;
220 }
221 if (GetFlag(FLAGS_dump_rprog))
222 LOG(INFO) << rprog_->Dump();
223 }
224
225 // Create re string that will be used for RE and RE2.
226 std::string re = std::string(regexp_str);
227 // Accomodate flags.
228 // Regexp::Latin1 will be accomodated below.
229 if (!(flags & Regexp::OneLine))
230 re = "(?m)" + re;
231 if (flags & Regexp::NonGreedy)
232 re = "(?U)" + re;
233 if (flags & Regexp::DotNL)
234 re = "(?s)" + re;
235
236 // Compile regexp to RE2.
237 if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
238 RE2::Options options;
239 if (flags & Regexp::Latin1)
240 options.set_encoding(RE2::Options::EncodingLatin1);
241 if (kind_ == Prog::kLongestMatch)
242 options.set_longest_match(true);
243 re2_ = new RE2(re, options);
244 if (!re2_->error().empty()) {
245 LOG(INFO) << "Cannot RE2: " << CEscape(re);
246 error_ = true;
247 return;
248 }
249 }
250
251 // Compile regexp to RE.
252 // PCRE as exposed by the RE interface isn't always usable.
253 // 1. It disagrees about handling of empty-string reptitions
254 // like matching (a*)* against "b". PCRE treats the (a*) as
255 // occurring once, while we treat it as occurring not at all.
256 // 2. It treats $ as this weird thing meaning end of string
257 // or before the \n at the end of the string.
258 // 3. It doesn't implement POSIX leftmost-longest matching.
259 // 4. It lets \s match vertical tab.
260 // MimicsPCRE() detects 1 and 2.
261 if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
262 kind_ != Prog::kLongestMatch) {
263 PCRE_Options o;
264 o.set_option(PCRE::UTF8);
265 if (flags & Regexp::Latin1)
266 o.set_option(PCRE::None);
267 // PCRE has interface bug keeping us from finding $0, so
268 // add one more layer of parens.
269 re_ = new PCRE("("+re+")", o);
270 if (!re_->error().empty()) {
271 LOG(INFO) << "Cannot PCRE: " << CEscape(re);
272 error_ = true;
273 return;
274 }
275 }
276}
277
278TestInstance::~TestInstance() {
279 if (regexp_)
280 regexp_->Decref();
281 delete prog_;
282 delete rprog_;
283 delete re_;
284 delete re2_;
285}
286
287// Runs a single search using the named engine type.
288// This interface hides all the irregularities of the various
289// engine interfaces from the rest of this file.
290void TestInstance::RunSearch(Engine type,
291 const StringPiece& orig_text,
292 const StringPiece& orig_context,
293 Prog::Anchor anchor,
294 Result* result) {
295 // Result is not trivial, so we cannot freely clear it with memset(3),
296 // but zeroing objects like so is safe and expedient for our purposes.
297 memset(reinterpret_cast<void*>(result), 0, sizeof *result);
298 if (regexp_ == NULL) {
299 result->skipped = true;
300 return;
301 }
302 int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0
303 if (nsubmatch > kMaxSubmatch)
304 nsubmatch = kMaxSubmatch;
305
306 StringPiece text = orig_text;
307 StringPiece context = orig_context;
308
309 switch (type) {
310 default:
311 LOG(FATAL) << "Bad RunSearch type: " << (int)type;
312
313 case kEngineBacktrack:
314 if (prog_ == NULL) {
315 result->skipped = true;
316 break;
317 }
318 result->matched =
319 prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
320 result->submatch, nsubmatch);
321 result->have_submatch = true;
322 break;
323
324 case kEngineNFA:
325 if (prog_ == NULL) {
326 result->skipped = true;
327 break;
328 }
329 result->matched =
330 prog_->SearchNFA(text, context, anchor, kind_,
331 result->submatch, nsubmatch);
332 result->have_submatch = true;
333 break;
334
335 case kEngineDFA:
336 if (prog_ == NULL) {
337 result->skipped = true;
338 break;
339 }
340 result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
341 &result->skipped, NULL);
342 break;
343
344 case kEngineDFA1:
345 if (prog_ == NULL || rprog_ == NULL) {
346 result->skipped = true;
347 break;
348 }
349 result->matched =
350 prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
351 &result->skipped, NULL);
352 // If anchored, no need for second run,
353 // but do it anyway to find more bugs.
354 if (result->matched) {
355 if (!rprog_->SearchDFA(result->submatch[0], context,
356 Prog::kAnchored, Prog::kLongestMatch,
357 result->submatch,
358 &result->skipped, NULL)) {
359 LOG(ERROR) << "Reverse DFA inconsistency: "
360 << CEscape(regexp_str_)
361 << " on " << CEscape(text);
362 result->matched = false;
363 }
364 }
365 result->have_submatch0 = true;
366 break;
367
368 case kEngineOnePass:
369 if (prog_ == NULL ||
370 !prog_->IsOnePass() ||
371 anchor == Prog::kUnanchored ||
372 nsubmatch > Prog::kMaxOnePassCapture) {
373 result->skipped = true;
374 break;
375 }
376 result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
377 result->submatch, nsubmatch);
378 result->have_submatch = true;
379 break;
380
381 case kEngineBitState:
382 if (prog_ == NULL ||
383 !prog_->CanBitState()) {
384 result->skipped = true;
385 break;
386 }
387 result->matched = prog_->SearchBitState(text, context, anchor, kind_,
388 result->submatch, nsubmatch);
389 result->have_submatch = true;
390 break;
391
392 case kEngineRE2:
393 case kEngineRE2a:
394 case kEngineRE2b: {
395 if (!re2_ || text.end() != context.end()) {
396 result->skipped = true;
397 break;
398 }
399
400 RE2::Anchor re_anchor;
401 if (anchor == Prog::kAnchored)
402 re_anchor = RE2::ANCHOR_START;
403 else
404 re_anchor = RE2::UNANCHORED;
405 if (kind_ == Prog::kFullMatch)
406 re_anchor = RE2::ANCHOR_BOTH;
407
408 result->matched = re2_->Match(
409 context,
410 static_cast<size_t>(text.begin() - context.begin()),
411 static_cast<size_t>(text.end() - context.begin()),
412 re_anchor,
413 result->submatch,
414 nsubmatch);
415 result->have_submatch = nsubmatch > 0;
416 break;
417 }
418
419 case kEnginePCRE: {
420 if (!re_ || text.begin() != context.begin() ||
421 text.end() != context.end()) {
422 result->skipped = true;
423 break;
424 }
425
426 // In Perl/PCRE, \v matches any character considered vertical
427 // whitespace, not just vertical tab. Regexp::MimicsPCRE() is
428 // unable to handle all cases of this, unfortunately, so just
429 // catch them here. :(
430 if (regexp_str_.find("\\v") != StringPiece::npos &&
431 (text.find('\n') != StringPiece::npos ||
432 text.find('\f') != StringPiece::npos ||
433 text.find('\r') != StringPiece::npos)) {
434 result->skipped = true;
435 break;
436 }
437
438 // PCRE 8.34 or so started allowing vertical tab to match \s,
439 // following a change made in Perl 5.18. RE2 does not.
440 if ((regexp_str_.find("\\s") != StringPiece::npos ||
441 regexp_str_.find("\\S") != StringPiece::npos) &&
442 text.find('\v') != StringPiece::npos) {
443 result->skipped = true;
444 break;
445 }
446
447 const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
448 PCRE::Arg *a = new PCRE::Arg[nsubmatch];
449 for (int i = 0; i < nsubmatch; i++) {
450 a[i] = PCRE::Arg(&result->submatch[i]);
451 argptr[i] = &a[i];
452 }
453 size_t consumed;
454 PCRE::Anchor pcre_anchor;
455 if (anchor == Prog::kAnchored)
456 pcre_anchor = PCRE::ANCHOR_START;
457 else
458 pcre_anchor = PCRE::UNANCHORED;
459 if (kind_ == Prog::kFullMatch)
460 pcre_anchor = PCRE::ANCHOR_BOTH;
461 re_->ClearHitLimit();
462 result->matched =
463 re_->DoMatch(text,
464 pcre_anchor,
465 &consumed,
466 argptr, nsubmatch);
467 if (re_->HitLimit()) {
468 result->untrusted = true;
469 delete[] argptr;
470 delete[] a;
471 break;
472 }
473 result->have_submatch = true;
474 delete[] argptr;
475 delete[] a;
476 break;
477 }
478 }
479
480 if (!result->matched)
481 memset(result->submatch, 0, sizeof result->submatch);
482}
483
484// Checks whether r is okay given that correct is the right answer.
485// Specifically, r's answers have to match (but it doesn't have to
486// claim to have all the answers).
487static bool ResultOkay(const Result& r, const Result& correct) {
488 if (r.skipped)
489 return true;
490 if (r.matched != correct.matched)
491 return false;
492 if (r.have_submatch || r.have_submatch0) {
493 for (int i = 0; i < kMaxSubmatch; i++) {
494 if (correct.submatch[i].data() != r.submatch[i].data() ||
495 correct.submatch[i].size() != r.submatch[i].size())
496 return false;
497 if (!r.have_submatch)
498 break;
499 }
500 }
501 return true;
502}
503
504// Runs a single test.
505bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
506 Prog::Anchor anchor) {
507 // Backtracking is the gold standard.
508 Result correct;
509 RunSearch(kEngineBacktrack, text, context, anchor, &correct);
510 if (correct.skipped) {
511 if (regexp_ == NULL)
512 return true;
513 LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
514 << " " << FormatMode(flags_);
515 return false;
516 }
517 VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
518 << " text " << CEscape(text)
519 << " (" << FormatKind(kind_)
520 << ", " << FormatAnchor(anchor)
521 << ", " << FormatMode(flags_)
522 << ")";
523
524 // Compare the others.
525 bool all_okay = true;
526 for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
527 if (!(Engines() & (1<<i)))
528 continue;
529
530 Result r;
531 RunSearch(i, text, context, anchor, &r);
532 if (ResultOkay(r, correct)) {
533 if (GetFlag(FLAGS_log_okay))
534 LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
535 continue;
536 }
537
538 // We disagree with PCRE on the meaning of some Unicode matches.
539 // In particular, we treat non-ASCII UTF-8 as non-word characters.
540 // We also treat "empty" character sets like [^\w\W] as being
541 // impossible to match, while PCRE apparently excludes some code
542 // points (e.g., 0x0080) from both \w and \W.
543 if (i == kEnginePCRE && NonASCII(text))
544 continue;
545
546 if (!r.untrusted)
547 all_okay = false;
548
549 LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
550 context, anchor);
551 if (r.matched != correct.matched) {
552 if (r.matched) {
553 LOG(INFO) << " Should not match (but does).";
554 } else {
555 LOG(INFO) << " Should match (but does not).";
556 continue;
557 }
558 }
559 for (int i = 0; i < 1+num_captures_; i++) {
560 if (r.submatch[i].data() != correct.submatch[i].data() ||
561 r.submatch[i].size() != correct.submatch[i].size()) {
562 LOG(INFO) <<
563 StringPrintf(" $%d: should be %s is %s",
564 i,
565 FormatCapture(text, correct.submatch[i]).c_str(),
566 FormatCapture(text, r.submatch[i]).c_str());
567 } else {
568 LOG(INFO) <<
569 StringPrintf(" $%d: %s ok", i,
570 FormatCapture(text, r.submatch[i]).c_str());
571 }
572 }
573 }
574
575 if (!all_okay) {
576 // This will be initialised once (after flags have been initialised)
577 // and that is desirable because we want to enforce a global limit.
578 static int max_regexp_failures = GetFlag(FLAGS_max_regexp_failures);
579 if (max_regexp_failures > 0 && --max_regexp_failures == 0)
580 LOG(QFATAL) << "Too many regexp failures.";
581 }
582
583 return all_okay;
584}
585
586void TestInstance::LogMatch(const char* prefix, Engine e,
587 const StringPiece& text, const StringPiece& context,
588 Prog::Anchor anchor) {
589 LOG(INFO) << prefix
590 << EngineName(e)
591 << " regexp "
592 << CEscape(regexp_str_)
593 << " "
594 << CEscape(regexp_->ToString())
595 << " text "
596 << CEscape(text)
597 << " ("
598 << text.begin() - context.begin()
599 << ","
600 << text.end() - context.begin()
601 << ") of context "
602 << CEscape(context)
603 << " (" << FormatKind(kind_)
604 << ", " << FormatAnchor(anchor)
605 << ", " << FormatMode(flags_)
606 << ")";
607}
608
609static Prog::MatchKind kinds[] = {
610 Prog::kFirstMatch,
611 Prog::kLongestMatch,
612 Prog::kFullMatch,
613};
614
615// Test all possible match kinds and parse modes.
616Tester::Tester(const StringPiece& regexp) {
617 error_ = false;
618 for (size_t i = 0; i < arraysize(kinds); i++) {
619 for (size_t j = 0; j < arraysize(parse_modes); j++) {
620 TestInstance* t = new TestInstance(regexp, kinds[i],
621 parse_modes[j].parse_flags);
622 error_ |= t->error();
623 v_.push_back(t);
624 }
625 }
626}
627
628Tester::~Tester() {
629 for (size_t i = 0; i < v_.size(); i++)
630 delete v_[i];
631}
632
633bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
634 Prog::Anchor anchor) {
635 bool okay = true;
636 for (size_t i = 0; i < v_.size(); i++)
637 okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
638 return okay;
639}
640
641static Prog::Anchor anchors[] = {
642 Prog::kAnchored,
643 Prog::kUnanchored
644};
645
646bool Tester::TestInput(const StringPiece& text) {
647 bool okay = TestInputInContext(text, text);
648 if (!text.empty()) {
649 StringPiece sp;
650 sp = text;
651 sp.remove_prefix(1);
652 okay &= TestInputInContext(sp, text);
653 sp = text;
654 sp.remove_suffix(1);
655 okay &= TestInputInContext(sp, text);
656 }
657 return okay;
658}
659
660bool Tester::TestInputInContext(const StringPiece& text,
661 const StringPiece& context) {
662 bool okay = true;
663 for (size_t i = 0; i < arraysize(anchors); i++)
664 okay &= TestCase(text, context, anchors[i]);
665 return okay;
666}
667
668bool TestRegexpOnText(const StringPiece& regexp,
669 const StringPiece& text) {
670 Tester t(regexp);
671 return t.TestInput(text);
672}
673
674} // namespace re2
675