tester.cc source code [RE2/re2/testing/tester.cc]

1	// Copyright 2008 The RE2 Authors. All Rights Reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	// Regular expression engine tester -- test all the implementations against each other.
6
7	#include <stddef.h>
8	#include <stdint.h>
9	#include <string.h>
10	#include <string>
11
12	#include "util/util.h"
13	#include "util/flags.h"
14	#include "util/logging.h"
15	#include "util/strutil.h"
16	#include "re2/testing/tester.h"
17	#include "re2/prog.h"
18	#include "re2/re2.h"
19	#include "re2/regexp.h"
20
21	DEFINE_FLAG(bool, dump_prog, false, "dump regexp program");
22	DEFINE_FLAG(bool, log_okay, false, "log successful runs");
23	DEFINE_FLAG(bool, dump_rprog, false, "dump reversed regexp program");
24
25	DEFINE_FLAG(int, max_regexp_failures, `100`,
26	"maximum number of regexp test failures (-1 = unlimited)");
27
28	DEFINE_FLAG(std::string, regexp_engines, "",
29	"pattern to select regexp engines to test");
30
31	namespace re2 {
32
// Capacity of Result::submatch: the overall match ($0) followed by
// capture groups $1 through $16.
enum {
  kMaxSubmatch = 16 + 1,
};
36
37	const char* engine_names[kEngineMax] = {
38	"Backtrack",
39	"NFA",
40	"DFA",
41	"DFA1",
42	"OnePass",
43	"BitState",
44	"RE2",
45	"RE2a",
46	"RE2b",
47	"PCRE",
48	};
49
50	// Returns the name of the engine.
51	static const char* EngineName(Engine e) {
52	CHECK_GE(e, `0`);
53	CHECK_LT(e, arraysize(engine_names));
54	CHECK(engine_names[e] != NULL);
55	return engine_names[e];
56	}
57
58	// Returns bit mask of engines to use.
59	static uint32_t Engines() {
60	static bool did_parse = false;
61	static uint32_t cached_engines = `0`;
62
63	if (did_parse)
64	return cached_engines;
65
66	if (GetFlag(FLAGS_regexp_engines).empty()) {
67	cached_engines = ~`0`;
68	} else {
69	for (Engine i = static_cast<Engine>(`0`); i < kEngineMax; i ++)
70	if (GetFlag(FLAGS_regexp_engines).find(EngineName(i)) != std::string::npos)
71	cached_engines \|= `1`<<i;
72	}
73
74	if (cached_engines == `0`)
75	LOG(INFO) << "Warning: no engines enabled.";
76	if (!UsingPCRE)
77	cached_engines &= ~(`1`<<kEnginePCRE);
78	for (Engine i = static_cast<Engine>(`0`); i < kEngineMax; i ++) {
79	if (cached_engines & (`1`<<i))
80	LOG(INFO) << EngineName(i) << " enabled";
81	}
82
83	did_parse = true;
84	return cached_engines;
85	}
86
87	// The result of running a match.
88	struct TestInstance::Result {
89	bool skipped; // test skipped: wasn't applicable
90	bool matched; // found a match
91	bool untrusted; // don't really trust the answer
92	bool have_submatch; // computed all submatch info
93	bool have_submatch0; // computed just submatch[0]
94	StringPiece submatch[kMaxSubmatch];
95	};
96
97	typedef TestInstance::Result Result;
98
99	// Formats a single capture range s in text in the form (a,b)
100	// where a and b are the starting and ending offsets of s in text.
101	static std::string FormatCapture(const StringPiece& text,
102	const StringPiece& s) {
103	if (s.data() == NULL)
104	return "(?,?)";
105	return StringPrintf("(%td,%td)",
106	s.begin() - text.begin(),
107	s.end() - text.begin());
108	}
109
110	// Returns whether text contains non-ASCII (>= 0x80) bytes.
111	static bool NonASCII(const StringPiece& text) {
112	for (size_t i = `0`; i < text.size(); i++)
113	if ((uint8_t)text [i] >= `0x80`)
114	return true;
115	return false;
116	}
117
118	// Returns string representation of match kind.
119	static std::string FormatKind(Prog::MatchKind kind) {
120	switch (kind) {
121	case Prog::kFullMatch:
122	return "full match";
123	case Prog::kLongestMatch:
124	return "longest match";
125	case Prog::kFirstMatch:
126	return "first match";
127	case Prog::kManyMatch:
128	return "many match";
129	}
130	return "???";
131	}
132
133	// Returns string representation of anchor kind.
134	static std::string FormatAnchor(Prog::Anchor anchor) {
135	switch (anchor) {
136	case Prog::kAnchored:
137	return "anchored";
138	case Prog::kUnanchored:
139	return "unanchored";
140	}
141	return "???";
142	}
143
144	struct ParseMode {
145	Regexp::ParseFlags parse_flags;
146	std::string desc;
147	};
148
149	static const Regexp::ParseFlags single_line =
150	Regexp::LikePerl;
151	static const Regexp::ParseFlags multi_line =
152	static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
153
154	static ParseMode parse_modes[] = {
155	{ single_line, "single-line" },
156	{ single_line \|Regexp::Latin1, "single-line, latin1" },
157	{ multi_line, "multiline" },
158	{ multi_line \|Regexp::NonGreedy, "multiline, nongreedy" },
159	{ multi_line \|Regexp::Latin1, "multiline, latin1" },
160	};
161
162	static std::string FormatMode(Regexp::ParseFlags flags) {
163	for (size_t i = `0`; i < arraysize(parse_modes); i++)
164	if (parse_modes[i].parse_flags == flags)
165	return parse_modes[i].desc;
166	return StringPrintf("%#x", static_cast<uint32_t>(flags));
167	}
168
169	// Constructs and saves all the matching engines that
170	// will be required for the given tests.
171	TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
172	Regexp::ParseFlags flags)
173	: regexp_str_(regexp_str),
174	kind_(kind),
175	flags_(flags),
176	error_(false),
177	regexp_(NULL),
178	num_captures_(`0`),
179	prog_(NULL),
180	rprog_(NULL),
181	re_(NULL),
182	re2_(NULL) {
183
184	VLOG(`1`) << CEscape(regexp_str);
185
186	// Compile regexp to prog.
187	// Always required - needed for backtracking (reference implementation).
188	RegexpStatus status;
189	regexp_ = Regexp::Parse(regexp_str, flags, &status);
190	if (regexp_ == NULL) {
191	LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
192	<< " mode: " << FormatMode(flags);
193	error_ = true;
194	return;
195	}
196	num_captures_ = regexp_->NumCaptures();
197	prog_ = regexp_->CompileToProg(`0`);
198	if (prog_ == NULL) {
199	LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
200	error_ = true;
201	return;
202	}
203	if (GetFlag(FLAGS_dump_prog)) {
204	LOG(INFO) << "Prog for "
205	<< " regexp "
206	<< CEscape(regexp_str_)
207	<< " (" << FormatKind(kind_)
208	<< ", " << FormatMode(flags_)
209	<< ")\n"
210	<< prog_->Dump();
211	}
212
213	// Compile regexp to reversed prog. Only needed for DFA engines.
214	if (Engines() & ((`1`<<kEngineDFA)\|(`1`<<kEngineDFA1))) {
215	rprog_ = regexp_->CompileToReverseProg(`0`);
216	if (rprog_ == NULL) {
217	LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
218	error_ = true;
219	return;
220	}
221	if (GetFlag(FLAGS_dump_rprog))
222	LOG(INFO) << rprog_->Dump();
223	}
224
225	// Create re string that will be used for RE and RE2.
226	std::string re = std::string (regexp_str);
227	// Accomodate flags.
228	// Regexp::Latin1 will be accomodated below.
229	if (!(flags & Regexp::OneLine))
230	re = "(?m)" + re;
231	if (flags & Regexp::NonGreedy)
232	re = "(?U)" + re;
233	if (flags & Regexp::DotNL)
234	re = "(?s)" + re;
235
236	// Compile regexp to RE2.
237	if (Engines() & ((`1`<<kEngineRE2)\|(`1`<<kEngineRE2a)\|(`1`<<kEngineRE2b))) {
238	RE2::Options options;
239	if (flags & Regexp::Latin1)
240	options.set_encoding(RE2::Options::EncodingLatin1);
241	if (kind_ == Prog::kLongestMatch)
242	options.set_longest_match(true);
243	re2_ = new RE2 (re, options);
244	if (!re2_->error().empty()) {
245	LOG(INFO) << "Cannot RE2: " << CEscape(re);
246	error_ = true;
247	return;
248	}
249	}
250
251	// Compile regexp to RE.
252	// PCRE as exposed by the RE interface isn't always usable.
253	// 1. It disagrees about handling of empty-string reptitions
254	// like matching (a)* against "b". PCRE treats the (a) as
255	// occurring once, while we treat it as occurring not at all.
256	// 2. It treats $ as this weird thing meaning end of string
257	// or before the \n at the end of the string.
258	// 3. It doesn't implement POSIX leftmost-longest matching.
259	// 4. It lets \s match vertical tab.
260	// MimicsPCRE() detects 1 and 2.
261	if ((Engines() & (`1`<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
262	kind_ != Prog::kLongestMatch) {
263	PCRE_Options o;
264	o.set_option(PCRE::UTF8);
265	if (flags & Regexp::Latin1)
266	o.set_option(PCRE::None);
267	// PCRE has interface bug keeping us from finding $0, so
268	// add one more layer of parens.
269	re_ = new PCRE ("("+re +")", o);
270	if (!re_->error().empty()) {
271	LOG(INFO) << "Cannot PCRE: " << CEscape(re);
272	error_ = true;
273	return;
274	}
275	}
276	}
277
278	TestInstance::~TestInstance() {
279	if (regexp_)
280	regexp_->Decref();
281	delete prog_;
282	delete rprog_;
283	delete re_;
284	delete re2_;
285	}
286
287	// Runs a single search using the named engine type.
288	// This interface hides all the irregularities of the various
289	// engine interfaces from the rest of this file.
290	void TestInstance::RunSearch(Engine type,
291	const StringPiece& orig_text,
292	const StringPiece& orig_context,
293	Prog::Anchor anchor,
294	Result* result) {
295	// Result is not trivial, so we cannot freely clear it with memset(3),
296	// but zeroing objects like so is safe and expedient for our purposes.
297	memset(reinterpret_cast<void>(result), `0`, sizeof* *result);
298	if (regexp_ == NULL) {
299	result->skipped = true;
300	return;
301	}
302	int nsubmatch = `1` + num_captures_; // NumCaptures doesn't count $0
303	if (nsubmatch > kMaxSubmatch)
304	nsubmatch = kMaxSubmatch;
305
306	StringPiece text = orig_text;
307	StringPiece context = orig_context;
308
309	switch (type) {
310	default:
311	LOG(FATAL) << "Bad RunSearch type: " << (int)type;
312
313	case kEngineBacktrack:
314	if (prog_ == NULL) {
315	result->skipped = true;
316	break;
317	}
318	result->matched =
319	prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
320	result->submatch, nsubmatch);
321	result->have_submatch = true;
322	break;
323
324	case kEngineNFA:
325	if (prog_ == NULL) {
326	result->skipped = true;
327	break;
328	}
329	result->matched =
330	prog_->SearchNFA(text, context, anchor, kind_,
331	result->submatch, nsubmatch);
332	result->have_submatch = true;
333	break;
334
335	case kEngineDFA:
336	if (prog_ == NULL) {
337	result->skipped = true;
338	break;
339	}
340	result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
341	&result->skipped, NULL);
342	break;
343
344	case kEngineDFA1:
345	if (prog_ == NULL \|\| rprog_ == NULL) {
346	result->skipped = true;
347	break;
348	}
349	result->matched =
350	prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
351	&result->skipped, NULL);
352	// If anchored, no need for second run,
353	// but do it anyway to find more bugs.
354	if (result->matched) {
355	if (!rprog_->SearchDFA(result->submatch[`0`], context,
356	Prog::kAnchored, Prog::kLongestMatch,
357	result->submatch,
358	&result->skipped, NULL)) {
359	LOG(ERROR) << "Reverse DFA inconsistency: "
360	<< CEscape(regexp_str_)
361	<< " on " << CEscape(text);
362	result->matched = false;
363	}
364	}
365	result->have_submatch0 = true;
366	break;
367
368	case kEngineOnePass:
369	if (prog_ == NULL \|\|
370	!prog_->IsOnePass() \|\|
371	anchor == Prog::kUnanchored \|\|
372	nsubmatch > Prog::kMaxOnePassCapture) {
373	result->skipped = true;
374	break;
375	}
376	result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
377	result->submatch, nsubmatch);
378	result->have_submatch = true;
379	break;
380
381	case kEngineBitState:
382	if (prog_ == NULL \|\|
383	!prog_->CanBitState()) {
384	result->skipped = true;
385	break;
386	}
387	result->matched = prog_->SearchBitState(text, context, anchor, kind_,
388	result->submatch, nsubmatch);
389	result->have_submatch = true;
390	break;
391
392	case kEngineRE2:
393	case kEngineRE2a:
394	case kEngineRE2b: {
395	if (!re2_ \|\| text.end() != context.end()) {
396	result->skipped = true;
397	break;
398	}
399
400	RE2::Anchor re_anchor;
401	if (anchor == Prog::kAnchored)
402	re_anchor = RE2::ANCHOR_START;
403	else
404	re_anchor = RE2::UNANCHORED;
405	if (kind_ == Prog::kFullMatch)
406	re_anchor = RE2::ANCHOR_BOTH;
407
408	result->matched = re2_->Match(
409	context,
410	static_cast<size_t>(text.begin() - context.begin()),
411	static_cast<size_t>(text.end() - context.begin()),
412	re_anchor,
413	result->submatch,
414	nsubmatch);
415	result->have_submatch = nsubmatch > `0`;
416	break;
417	}
418
419	case kEnginePCRE: {
420	if (!re_ \|\| text.begin() != context.begin() \|\|
421	text.end() != context.end()) {
422	result->skipped = true;
423	break;
424	}
425
426	// In Perl/PCRE, \v matches any character considered vertical
427	// whitespace, not just vertical tab. Regexp::MimicsPCRE() is
428	// unable to handle all cases of this, unfortunately, so just
429	// catch them here. :(
430	if (regexp_str_.find("\\v") != StringPiece::npos &&
431	(text.find(`'\n'`) != StringPiece::npos \|\|
432	text.find(`'\f'`) != StringPiece::npos \|\|
433	text.find(`'\r'`) != StringPiece::npos)) {
434	result->skipped = true;
435	break;
436	}
437
438	// PCRE 8.34 or so started allowing vertical tab to match \s,
439	// following a change made in Perl 5.18. RE2 does not.
440	if ((regexp_str_.find("\\s") != StringPiece::npos \|\|
441	regexp_str_.find("\\S") != StringPiece::npos) &&
442	text.find(`'\v'`) != StringPiece::npos) {
443	result->skipped = true;
444	break;
445	}
446
447	const PCRE::Arg argptr = new** const PCRE::Arg*[nsubmatch];
448	PCRE::Arg a = new* PCRE::Arg[nsubmatch];
449	for (int i = `0`; i < nsubmatch; i++) {
450	a[i] = PCRE::Arg (&result->submatch[i]);
451	argptr[i] = &a[i];
452	}
453	size_t consumed;
454	PCRE::Anchor pcre_anchor;
455	if (anchor == Prog::kAnchored)
456	pcre_anchor = PCRE::ANCHOR_START;
457	else
458	pcre_anchor = PCRE::UNANCHORED;
459	if (kind_ == Prog::kFullMatch)
460	pcre_anchor = PCRE::ANCHOR_BOTH;
461	re_->ClearHitLimit();
462	result->matched =
463	re_->DoMatch(text,
464	pcre_anchor,
465	&consumed,
466	argptr, nsubmatch);
467	if (re_->HitLimit()) {
468	result->untrusted = true;
469	delete[] argptr;
470	delete[] a;
471	break;
472	}
473	result->have_submatch = true;
474	delete[] argptr;
475	delete[] a;
476	break;
477	}
478	}
479
480	if (!result->matched)
481	memset(result->submatch, `0`, sizeof result->submatch);
482	}
483
484	// Checks whether r is okay given that correct is the right answer.
485	// Specifically, r's answers have to match (but it doesn't have to
486	// claim to have all the answers).
487	static bool ResultOkay(const Result& r, const Result& correct) {
488	if (r.skipped)
489	return true;
490	if (r.matched != correct.matched)
491	return false;
492	if (r.have_submatch \|\| r.have_submatch0) {
493	for (int i = `0`; i < kMaxSubmatch; i++) {
494	if (correct.submatch[i].data() != r.submatch[i].data() \|\|
495	correct.submatch[i].size() != r.submatch[i].size())
496	return false;
497	if (!r.have_submatch)
498	break;
499	}
500	}
501	return true;
502	}
503
504	// Runs a single test.
505	bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
506	Prog::Anchor anchor) {
507	// Backtracking is the gold standard.
508	Result correct;
509	RunSearch(kEngineBacktrack, text, context, anchor, &correct);
510	if (correct.skipped) {
511	if (regexp_ == NULL)
512	return true;
513	LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
514	<< " " << FormatMode(flags_);
515	return false;
516	}
517	VLOG(`1`) << "Try: regexp " << CEscape(regexp_str_)
518	<< " text " << CEscape(text)
519	<< " (" << FormatKind(kind_)
520	<< ", " << FormatAnchor(anchor)
521	<< ", " << FormatMode(flags_)
522	<< ")";
523
524	// Compare the others.
525	bool all_okay = true;
526	for (Engine i = kEngineBacktrack +`1`; i < kEngineMax; i ++) {
527	if (!(Engines() & (`1`<<i)))
528	continue;
529
530	Result r;
531	RunSearch(i, text, context, anchor, &r);
532	if (ResultOkay(r, correct)) {
533	if (GetFlag(FLAGS_log_okay))
534	LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
535	continue;
536	}
537
538	// We disagree with PCRE on the meaning of some Unicode matches.
539	// In particular, we treat non-ASCII UTF-8 as non-word characters.
540	// We also treat "empty" character sets like [^\w\W] as being
541	// impossible to match, while PCRE apparently excludes some code
542	// points (e.g., 0x0080) from both \w and \W.
543	if (i == kEnginePCRE && NonASCII(text))
544	continue;
545
546	if (!r.untrusted)
547	all_okay = false;
548
549	LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
550	context, anchor);
551	if (r.matched != correct.matched) {
552	if (r.matched) {
553	LOG(INFO) << " Should not match (but does).";
554	} else {
555	LOG(INFO) << " Should match (but does not).";
556	continue;
557	}
558	}
559	for (int i = `0`; i < `1`+num_captures_; i++) {
560	if (r.submatch[i].data() != correct.submatch[i].data() \|\|
561	r.submatch[i].size() != correct.submatch[i].size()) {
562	LOG(INFO) <<
563	StringPrintf(" $%d: should be %s is %s",
564	i,
565	FormatCapture(text, correct.submatch[i]).c_str(),
566	FormatCapture(text, r.submatch[i]).c_str());
567	} else {
568	LOG(INFO) <<
569	StringPrintf(" $%d: %s ok", i,
570	FormatCapture(text, r.submatch[i]).c_str());
571	}
572	}
573	}
574
575	if (!all_okay) {
576	// This will be initialised once (after flags have been initialised)
577	// and that is desirable because we want to enforce a global limit.
578	static int max_regexp_failures = GetFlag(FLAGS_max_regexp_failures);
579	if (max_regexp_failures > `0` && --max_regexp_failures == `0`)
580	LOG(QFATAL) << "Too many regexp failures.";
581	}
582
583	return all_okay;
584	}
585
586	void TestInstance::LogMatch(const char* prefix, Engine e,
587	const StringPiece& text, const StringPiece& context,
588	Prog::Anchor anchor) {
589	LOG(INFO) << prefix
590	<< EngineName(e)
591	<< " regexp "
592	<< CEscape(regexp_str_)
593	<< " "
594	<< CEscape(regexp_->ToString())
595	<< " text "
596	<< CEscape(text)
597	<< " ("
598	<< text.begin() - context.begin()
599	<< ","
600	<< text.end() - context.begin()
601	<< ") of context "
602	<< CEscape(context)
603	<< " (" << FormatKind(kind_)
604	<< ", " << FormatAnchor(anchor)
605	<< ", " << FormatMode(flags_)
606	<< ")";
607	}
608
609	static Prog::MatchKind kinds[] = {
610	Prog::kFirstMatch,
611	Prog::kLongestMatch,
612	Prog::kFullMatch,
613	};
614
615	// Test all possible match kinds and parse modes.
616	Tester::Tester(const StringPiece& regexp) {
617	error_ = false;
618	for (size_t i = `0`; i < arraysize(kinds); i++) {
619	for (size_t j = `0`; j < arraysize(parse_modes); j++) {
620	TestInstance* t = new TestInstance (regexp, kinds[i],
621	parse_modes[j].parse_flags);
622	error_ \|= t->error();
623	v_.push_back(t);
624	}
625	}
626	}
627
628	Tester::~Tester() {
629	for (size_t i = `0`; i < v_.size(); i++)
630	delete v_[i];
631	}
632
633	bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
634	Prog::Anchor anchor) {
635	bool okay = true;
636	for (size_t i = `0`; i < v_.size(); i++)
637	okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
638	return okay;
639	}
640
641	static Prog::Anchor anchors[] = {
642	Prog::kAnchored,
643	Prog::kUnanchored
644	};
645
646	bool Tester::TestInput(const StringPiece& text) {
647	bool okay = TestInputInContext(text, text);
648	if (!text.empty()) {
649	StringPiece sp;
650	sp = text;
651	sp.remove_prefix(`1`);
652	okay &= TestInputInContext(sp, text);
653	sp = text;
654	sp.remove_suffix(`1`);
655	okay &= TestInputInContext(sp, text);
656	}
657	return okay;
658	}
659
660	bool Tester::TestInputInContext(const StringPiece& text,
661	const StringPiece& context) {
662	bool okay = true;
663	for (size_t i = `0`; i < arraysize(anchors); i++)
664	okay &= TestCase(text, context, anchors[i]);
665	return okay;
666	}
667
668	bool TestRegexpOnText(const StringPiece& regexp,
669	const StringPiece& text) {
670	Tester t(regexp);
671	return t.TestInput(text);
672	}
673
674	} // namespace re2
675

Browse the source code of RE2/re2/testing/tester.cc