re2.cc source code [RE2/re2/re2.cc]

1	// Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	// Regular expression interface RE2.
6	//
7	// Originally the PCRE C++ wrapper, but adapted to use
8	// the new automata-based regular expression engines.
9
10	#include "re2/re2.h"
11
12	#include <assert.h>
13	#include <ctype.h>
14	#include <errno.h>
15	#include <stdint.h>
16	#include <stdlib.h>
17	#include <string.h>
18	#include <algorithm>
19	#include <iterator>
20	#include <mutex>
21	#include <string>
22	#include <utility>
23	#include <vector>
24
25	#include "util/util.h"
26	#include "util/logging.h"
27	#include "util/strutil.h"
28	#include "util/utf.h"
29	#include "re2/prog.h"
30	#include "re2/regexp.h"
31	#include "re2/sparse_array.h"
32
33	namespace re2 {
34
35	// Maximum number of args we can set
36	static const int kMaxArgs = `16`;
37	static const int kVecSize = `1`+kMaxArgs;
38
39	const int RE2::Options::kDefaultMaxMem; // initialized in re2.h
40
41	RE2::Options::Options(RE2::CannedOptions opt)
42	: encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
43	posix_syntax_(opt == RE2::POSIX),
44	longest_match_(opt == RE2::POSIX),
45	log_errors_(opt != RE2::Quiet),
46	max_mem_(kDefaultMaxMem),
47	literal_(false),
48	never_nl_(false),
49	dot_nl_(false),
50	never_capture_(false),
51	case_sensitive_(true),
52	perl_classes_(false),
53	word_boundary_(false),
54	one_line_(false) {
55	}
56
57	// static empty objects for use as const references.
58	// To avoid global constructors, allocated in RE2::Init().
59	static const std::string* empty_string;
60	static const std::map<std::string, int>* empty_named_groups;
61	static const std::map<int, std::string>* empty_group_names;
62
63	// Converts from Regexp error code to RE2 error code.
64	// Maybe some day they will diverge. In any event, this
65	// hides the existence of Regexp from RE2 users.
66	static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
67	switch (code) {
68	case re2::kRegexpSuccess:
69	return RE2::NoError;
70	case re2::kRegexpInternalError:
71	return RE2::ErrorInternal;
72	case re2::kRegexpBadEscape:
73	return RE2::ErrorBadEscape;
74	case re2::kRegexpBadCharClass:
75	return RE2::ErrorBadCharClass;
76	case re2::kRegexpBadCharRange:
77	return RE2::ErrorBadCharRange;
78	case re2::kRegexpMissingBracket:
79	return RE2::ErrorMissingBracket;
80	case re2::kRegexpMissingParen:
81	return RE2::ErrorMissingParen;
82	case re2::kRegexpTrailingBackslash:
83	return RE2::ErrorTrailingBackslash;
84	case re2::kRegexpRepeatArgument:
85	return RE2::ErrorRepeatArgument;
86	case re2::kRegexpRepeatSize:
87	return RE2::ErrorRepeatSize;
88	case re2::kRegexpRepeatOp:
89	return RE2::ErrorRepeatOp;
90	case re2::kRegexpBadPerlOp:
91	return RE2::ErrorBadPerlOp;
92	case re2::kRegexpBadUTF8:
93	return RE2::ErrorBadUTF8;
94	case re2::kRegexpBadNamedCapture:
95	return RE2::ErrorBadNamedCapture;
96	}
97	return RE2::ErrorInternal;
98	}
99
100	static std::string trunc(const StringPiece& pattern) {
101	if (pattern.size() < `100`)
102	return std::string (pattern);
103	return std::string (pattern.substr(`0`, `100`)) + "...";
104	}
105
106
107	RE2::RE2(const char* pattern) {
108	Init(pattern, DefaultOptions);
109	}
110
111	RE2::RE2(const std::string& pattern) {
112	Init(pattern, DefaultOptions);
113	}
114
115	RE2::RE2(const StringPiece& pattern) {
116	Init(pattern, DefaultOptions);
117	}
118
119	RE2::RE2(const StringPiece& pattern, const Options& options) {
120	Init(pattern, options);
121	}
122
123	int RE2::Options::ParseFlags() const {
124	int flags = Regexp::ClassNL;
125	switch (encoding()) {
126	default:
127	if (log_errors())
128	LOG(ERROR) << "Unknown encoding " << encoding();
129	break;
130	case RE2::Options::EncodingUTF8:
131	break;
132	case RE2::Options::EncodingLatin1:
133	flags \|= Regexp::Latin1;
134	break;
135	}
136
137	if (!posix_syntax())
138	flags \|= Regexp::LikePerl;
139
140	if (literal())
141	flags \|= Regexp::Literal;
142
143	if (never_nl())
144	flags \|= Regexp::NeverNL;
145
146	if (dot_nl())
147	flags \|= Regexp::DotNL;
148
149	if (never_capture())
150	flags \|= Regexp::NeverCapture;
151
152	if (!case_sensitive())
153	flags \|= Regexp::FoldCase;
154
155	if (perl_classes())
156	flags \|= Regexp::PerlClasses;
157
158	if (word_boundary())
159	flags \|= Regexp::PerlB;
160
161	if (one_line())
162	flags \|= Regexp::OneLine;
163
164	return flags;
165	}
166
167	void RE2::Init(const StringPiece& pattern, const Options& options) {
168	static std::once_flag empty_once;
169	std::call_once(empty_once, []() {
170	empty_string = new std::string;
171	empty_named_groups = new std::map<std::string, int>;
172	empty_group_names = new std::map<int, std::string>;
173	});
174
175	pattern_ = std::string (pattern);
176	options_.Copy(options);
177	entire_regexp_ = NULL;
178	suffix_regexp_ = NULL;
179	prog_ = NULL;
180	num_captures_ = -`1`;
181	rprog_ = NULL;
182	error_ = empty_string;
183	error_code_ = NoError;
184	named_groups_ = NULL;
185	group_names_ = NULL;
186
187	RegexpStatus status;
188	entire_regexp_ = Regexp::Parse(
189	pattern_,
190	static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
191	&status);
192	if (entire_regexp_ == NULL) {
193	if (options_.log_errors()) {
194	LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
195	<< status.Text();
196	}
197	error_ = new std::string (status.Text());
198	error_code_ = RegexpErrorToRE2(status.code());
199	error_arg_ = std::string (status.error_arg());
200	return;
201	}
202
203	re2::Regexp* suffix;
204	if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
205	suffix_regexp_ = suffix;
206	else
207	suffix_regexp_ = entire_regexp_->Incref();
208
209	// Two thirds of the memory goes to the forward Prog,
210	// one third to the reverse prog, because the forward
211	// Prog has two DFAs but the reverse prog has one.
212	prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*`2`/`3`);
213	if (prog_ == NULL) {
214	if (options_.log_errors())
215	LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
216	error_ = new std::string ("pattern too large - compile failed");
217	error_code_ = RE2::ErrorPatternTooLarge;
218	return;
219	}
220
221	// We used to compute this lazily, but it's used during the
222	// typical control flow for a match call, so we now compute
223	// it eagerly, which avoids the overhead of std::once_flag.
224	num_captures_ = suffix_regexp_->NumCaptures();
225
226	// Could delay this until the first match call that
227	// cares about submatch information, but the one-pass
228	// machine's memory gets cut from the DFA memory budget,
229	// and that is harder to do if the DFA has already
230	// been built.
231	is_one_pass_ = prog_->IsOnePass();
232	}
233
234	// Returns rprog_, computing it if needed.
235	re2::Prog* RE2::ReverseProg() const {
236	std::call_once(rprog_once_, [](const RE2* re) {
237	re->rprog_ =
238	re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / `3`);
239	if (re->rprog_ == NULL) {
240	if (re->options_.log_errors())
241	LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'";
242	re->error_ =
243	new std::string ("pattern too large - reverse compile failed");
244	re->error_code_ = RE2::ErrorPatternTooLarge;
245	}
246	}, this);
247	return rprog_;
248	}
249
250	RE2::~RE2() {
251	if (suffix_regexp_)
252	suffix_regexp_->Decref();
253	if (entire_regexp_)
254	entire_regexp_->Decref();
255	delete prog_;
256	delete rprog_;
257	if (error_ != empty_string)
258	delete error_;
259	if (named_groups_ != NULL && named_groups_ != empty_named_groups)
260	delete named_groups_;
261	if (group_names_ != NULL && group_names_ != empty_group_names)
262	delete group_names_;
263	}
264
265	int RE2::ProgramSize() const {
266	if (prog_ == NULL)
267	return -`1`;
268	return prog_->size();
269	}
270
271	int RE2::ReverseProgramSize() const {
272	if (prog_ == NULL)
273	return -`1`;
274	Prog* prog = ReverseProg();
275	if (prog == NULL)
276	return -`1`;
277	return prog->size();
278	}
279
280	static int Fanout(Prog* prog, std::map<int, int>* histogram) {
281	SparseArray<int> fanout(prog->size());
282	prog->Fanout(&fanout);
283	histogram->clear();
284	for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
285	// TODO(junyer): Optimise this?
286	int bucket = `0`;
287	while (`1` << bucket < i->value()) {
288	bucket++;
289	}
290	(*histogram)[bucket]++;
291	}
292	return histogram->rbegin()->first;
293	}
294
295	int RE2::ProgramFanout(std::map<int, int>* histogram) const {
296	if (prog_ == NULL)
297	return -`1`;
298	return Fanout(prog_, histogram);
299	}
300
301	int RE2::ReverseProgramFanout(std::map<int, int>* histogram) const {
302	if (prog_ == NULL)
303	return -`1`;
304	Prog* prog = ReverseProg();
305	if (prog == NULL)
306	return -`1`;
307	return Fanout(prog, histogram);
308	}
309
310	// Returns named_groups_, computing it if needed.
311	const std::map<std::string, int>& RE2::NamedCapturingGroups() const {
312	std::call_once(named_groups_once_, [](const RE2* re) {
313	if (re->suffix_regexp_ != NULL)
314	re->named_groups_ = re->suffix_regexp_->NamedCaptures();
315	if (re->named_groups_ == NULL)
316	re->named_groups_ = empty_named_groups;
317	}, this);
318	return *named_groups_;
319	}
320
321	// Returns group_names_, computing it if needed.
322	const std::map<int, std::string>& RE2::CapturingGroupNames() const {
323	std::call_once(group_names_once_, [](const RE2* re) {
324	if (re->suffix_regexp_ != NULL)
325	re->group_names_ = re->suffix_regexp_->CaptureNames();
326	if (re->group_names_ == NULL)
327	re->group_names_ = empty_group_names;
328	}, this);
329	return *group_names_;
330	}
331
332	/** Convenience interfaces **/
333
334	bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
335	const Arg* const args[], int n) {
336	return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
337	}
338
339	bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
340	const Arg* const args[], int n) {
341	return re.DoMatch(text, UNANCHORED, NULL, args, n);
342	}
343
344	bool RE2::ConsumeN(StringPiece* input, const RE2& re,
345	const Arg* const args[], int n) {
346	size_t consumed;
347	if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
348	input->remove_prefix(consumed);
349	return true;
350	} else {
351	return false;
352	}
353	}
354
355	bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
356	const Arg* const args[], int n) {
357	size_t consumed;
358	if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
359	input->remove_prefix(consumed);
360	return true;
361	} else {
362	return false;
363	}
364	}
365
366	bool RE2::Replace(std::string* str,
367	const RE2& re,
368	const StringPiece& rewrite) {
369	StringPiece vec[kVecSize];
370	int nvec = `1` + MaxSubmatch(rewrite);
371	if (nvec > static_cast<int>(arraysize(vec)))
372	return false;
373	if (!re.Match(*str, `0`, str->size(), UNANCHORED, vec, nvec))
374	return false;
375
376	std::string s;
377	if (!re.Rewrite(&s, rewrite, vec, nvec))
378	return false;
379
380	assert(vec[`0`].data() >= str->data());
381	assert(vec[`0`].data() + vec[`0`].size() <= str->data() + str->size());
382	str->replace(vec[`0`].data() - str->data(), vec[`0`].size(), s);
383	return true;
384	}
385
386	int RE2::GlobalReplace(std::string* str,
387	const RE2& re,
388	const StringPiece& rewrite) {
389	StringPiece vec[kVecSize];
390	int nvec = `1` + MaxSubmatch(rewrite);
391	if (nvec > static_cast<int>(arraysize(vec)))
392	return false;
393
394	const char* p = str->data();
395	const char* ep = p + str->size();
396	const char* lastend = NULL;
397	std::string out;
398	int count = `0`;
399	#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
400	// Iterate just once when fuzzing. Otherwise, we easily get bogged down
401	// and coverage is unlikely to improve despite significant expense.
402	while (p == str->data()) {
403	#else
404	while (p <= ep) {
405	#endif
406	if (!re.Match(str, static_cast*<size_t>(p - str->data()),
407	str->size(), UNANCHORED, vec, nvec))
408	break;
409	if (p < vec[`0`].data())
410	out.append(p, vec[`0`].data() - p);
411	if (vec[`0`].data() == lastend && vec[`0`].empty()) {
412	// Disallow empty match at end of last match: skip ahead.
413	//
414	// fullrune() takes int, not ptrdiff_t. However, it just looks
415	// at the leading byte and treats any length >= 4 the same.
416	if (re.options().encoding() == RE2::Options::EncodingUTF8 &&
417	fullrune(p, static_cast<int>(std::min(ptrdiff_t{`4`}, ep - p)))) {
418	// re is in UTF-8 mode and there is enough left of str
419	// to allow us to advance by up to UTFmax bytes.
420	Rune r;
421	int n = chartorune(&r, p);
422	// Some copies of chartorune have a bug that accepts
423	// encodings of values in (10FFFF, 1FFFFF] as valid.
424	if (r > Runemax) {
425	n = `1`;
426	r = Runeerror;
427	}
428	if (!(n == `1` && r == Runeerror)) { // no decoding error
429	out.append(p, n);
430	p += n;
431	continue;
432	}
433	}
434	// Most likely, re is in Latin-1 mode. If it is in UTF-8 mode,
435	// we fell through from above and the GIGO principle applies.
436	if (p < ep)
437	out.append(p, `1`);
438	p++;
439	continue;
440	}
441	re.Rewrite(&out, rewrite, vec, nvec);
442	p = vec[`0`].data() + vec[`0`].size();
443	lastend = p;
444	count++;
445	}
446
447	if (count == `0`)
448	return `0`;
449
450	if (p < ep)
451	out.append(p, ep - p);
452	using std::swap;
453	swap(out, *str);
454	return count;
455	}
456
457	bool RE2::Extract(const StringPiece& text,
458	const RE2& re,
459	const StringPiece& rewrite,
460	std::string* out) {
461	StringPiece vec[kVecSize];
462	int nvec = `1` + MaxSubmatch(rewrite);
463	if (nvec > static_cast<int>(arraysize(vec)))
464	return false;
465
466	if (!re.Match(text, `0`, text.size(), UNANCHORED, vec, nvec))
467	return false;
468
469	out->clear();
470	return re.Rewrite(out, rewrite, vec, nvec);
471	}
472
473	std::string RE2::QuoteMeta(const StringPiece& unquoted) {
474	std::string result;
475	result.reserve(unquoted.size() << `1`);
476
477	// Escape any ascii character not in [A-Za-z_0-9].
478	//
479	// Note that it's legal to escape a character even if it has no
480	// special meaning in a regular expression -- so this function does
481	// that. (This also makes it identical to the perl function of the
482	// same name except for the null-character special case;
483	// see `perldoc -f quotemeta`.)
484	for (size_t ii = `0`; ii < unquoted.size(); ++ii) {
485	// Note that using 'isalnum' here raises the benchmark time from
486	// 32ns to 58ns:
487	if ((unquoted [ii] < `'a'` \|\| unquoted [ii] > `'z'`) &&
488	(unquoted [ii] < `'A'` \|\| unquoted [ii] > `'Z'`) &&
489	(unquoted [ii] < `'0'` \|\| unquoted [ii] > `'9'`) &&
490	unquoted [ii] != `'_'` &&
491	// If this is the part of a UTF8 or Latin1 character, we need
492	// to copy this byte without escaping. Experimentally this is
493	// what works correctly with the regexp library.
494	!(unquoted [ii] & `128`)) {
495	if (unquoted [ii] == `'\0'`) { // Special handling for null chars.
496	// Note that this special handling is not strictly required for RE2,
497	// but this quoting is required for other regexp libraries such as
498	// PCRE.
499	// Can't use "\\0" since the next character might be a digit.
500	result += "\\x00";
501	continue;
502	}
503	result += `'\\'`;
504	}
505	result += unquoted [ii];
506	}
507
508	return result;
509	}
510
511	bool RE2::PossibleMatchRange(std::string* min, std::string* max,
512	int maxlen) const {
513	if (prog_ == NULL)
514	return false;
515
516	int n = static_cast<int>(prefix_.size());
517	if (n > maxlen)
518	n = maxlen;
519
520	// Determine initial min max from prefix_ literal.
521	*min = prefix_.substr(`0`, n);
522	*max = prefix_.substr(`0`, n);
523	if (prefix_foldcase_) {
524	// prefix is ASCII lowercase; change min to uppercase.*
525	for (int i = `0`; i < n; i++) {
526	char& c = (*min)[i];
527	if (`'a'` <= c && c <= `'z'`)
528	c += `'A'` - `'a'`;
529	}
530	}
531
532	// Add to prefix min max using PossibleMatchRange on regexp.
533	std::string dmin, dmax;
534	maxlen -= n;
535	if (maxlen > `0` && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
536	min->append(dmin);
537	max->append(dmax);
538	} else if (!max->empty()) {
539	// prog_->PossibleMatchRange has failed us,
540	// but we still have useful information from prefix_.
541	// Round up max to allow any possible suffix.*
542	PrefixSuccessor(max);
543	} else {
544	// Nothing useful.
545	*min = "";
546	*max = "";
547	return false;
548	}
549
550	return true;
551	}
552
553	// Avoid possible locale nonsense in standard strcasecmp.
554	// The string a is known to be all lowercase.
555	static int ascii_strcasecmp(const char* a, const char* b, size_t len) {
556	const char* ae = a + len;
557
558	for (; a < ae; a++, b++) {
559	uint8_t x = *a;
560	uint8_t y = *b;
561	if (`'A'` <= y && y <= `'Z'`)
562	y += `'a'` - `'A'`;
563	if (x != y)
564	return x - y;
565	}
566	return `0`;
567	}
568
569
570	/** Actual matching and rewriting code **/
571
572	bool RE2::Match(const StringPiece& text,
573	size_t startpos,
574	size_t endpos,
575	Anchor re_anchor,
576	StringPiece* submatch,
577	int nsubmatch) const {
578	if (!ok()) {
579	if (options_.log_errors())
580	LOG(ERROR) << "Invalid RE2: " << *error_;
581	return false;
582	}
583
584	if (startpos > endpos \|\| endpos > text.size()) {
585	if (options_.log_errors())
586	LOG(ERROR) << "RE2: invalid startpos, endpos pair. ["
587	<< "startpos: " << startpos << ", "
588	<< "endpos: " << endpos << ", "
589	<< "text size: " << text.size() << "]";
590	return false;
591	}
592
593	StringPiece subtext = text;
594	subtext.remove_prefix(startpos);
595	subtext.remove_suffix(text.size() - endpos);
596
597	// Use DFAs to find exact location of match, filter out non-matches.
598
599	// Don't ask for the location if we won't use it.
600	// SearchDFA can do extra optimizations in that case.
601	StringPiece match;
602	StringPiece* matchp = &match;
603	if (nsubmatch == `0`)
604	matchp = NULL;
605
606	int ncap = `1` + NumberOfCapturingGroups();
607	if (ncap > nsubmatch)
608	ncap = nsubmatch;
609
610	// If the regexp is anchored explicitly, must not be in middle of text.
611	if (prog_->anchor_start() && startpos != `0`)
612	return false;
613
614	// If the regexp is anchored explicitly, update re_anchor
615	// so that we can potentially fall into a faster case below.
616	if (prog_->anchor_start() && prog_->anchor_end())
617	re_anchor = ANCHOR_BOTH;
618	else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
619	re_anchor = ANCHOR_START;
620
621	// Check for the required prefix, if any.
622	size_t prefixlen = `0`;
623	if (!prefix_.empty()) {
624	if (startpos != `0`)
625	return false;
626	prefixlen = prefix_.size();
627	if (prefixlen > subtext.size())
628	return false;
629	if (prefix_foldcase_) {
630	if (ascii_strcasecmp(&prefix_[`0`], subtext.data(), prefixlen) != `0`)
631	return false;
632	} else {
633	if (memcmp(&prefix_[`0`], subtext.data(), prefixlen) != `0`)
634	return false;
635	}
636	subtext.remove_prefix(prefixlen);
637	// If there is a required prefix, the anchor must be at least ANCHOR_START.
638	if (re_anchor != ANCHOR_BOTH)
639	re_anchor = ANCHOR_START;
640	}
641
642	Prog::Anchor anchor = Prog::kUnanchored;
643	Prog::MatchKind kind = Prog::kFirstMatch;
644	if (options_.longest_match())
645	kind = Prog::kLongestMatch;
646	bool skipped_test = false;
647
648	bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
649
650	// BitState allocates a bitmap of size prog_->list_count() text.size().*
651	// It also allocates a stack of 3-word structures which could potentially
652	// grow as large as prog_->list_count() text.size(), but in practice is*
653	// much smaller.
654	const int kMaxBitStateBitmapSize = `256``1024`; // bitmap size <= max (bits)*
655	bool can_bit_state = prog_->CanBitState();
656	size_t bit_state_text_max = kMaxBitStateBitmapSize / prog_->list_count();
657
658	bool dfa_failed = false;
659	switch (re_anchor) {
660	default:
661	case UNANCHORED: {
662	if (!prog_->SearchDFA(subtext, text, anchor, kind,
663	matchp, &dfa_failed, NULL)) {
664	if (dfa_failed) {
665	if (options_.log_errors())
666	LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
667	<< "bytemap range " << prog_->bytemap_range() << ", "
668	<< "list count " << prog_->list_count();
669	// Fall back to NFA below.
670	skipped_test = true;
671	break;
672	}
673	return false;
674	}
675	if (matchp == NULL) // Matched. Don't care where
676	return true;
677	// SearchDFA set match[0].end() but didn't know where the
678	// match started. Run the regexp backward from match[0].end()
679	// to find the longest possible match -- that's where it started.
680	Prog* prog = ReverseProg();
681	if (prog == NULL)
682	return false;
683	if (!prog->SearchDFA(match, text, Prog::kAnchored,
684	Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
685	if (dfa_failed) {
686	if (options_.log_errors())
687	LOG(ERROR) << "DFA out of memory: size " << prog->size() << ", "
688	<< "bytemap range " << prog->bytemap_range() << ", "
689	<< "list count " << prog->list_count();
690	// Fall back to NFA below.
691	skipped_test = true;
692	break;
693	}
694	if (options_.log_errors())
695	LOG(ERROR) << "SearchDFA inconsistency";
696	return false;
697	}
698	break;
699	}
700
701	case ANCHOR_BOTH:
702	case ANCHOR_START:
703	if (re_anchor == ANCHOR_BOTH)
704	kind = Prog::kFullMatch;
705	anchor = Prog::kAnchored;
706
707	// If only a small amount of text and need submatch
708	// information anyway and we're going to use OnePass or BitState
709	// to get it, we might as well not even bother with the DFA:
710	// OnePass or BitState will be fast enough.
711	// On tiny texts, OnePass outruns even the DFA, and
712	// it doesn't have the shared state and occasional mutex that
713	// the DFA does.
714	if (can_one_pass && text.size() <= `4096` &&
715	(ncap > `1` \|\| text.size() <= `8`)) {
716	skipped_test = true;
717	break;
718	}
719	if (can_bit_state && text.size() <= bit_state_text_max && ncap > `1`) {
720	skipped_test = true;
721	break;
722	}
723	if (!prog_->SearchDFA(subtext, text, anchor, kind,
724	&match, &dfa_failed, NULL)) {
725	if (dfa_failed) {
726	if (options_.log_errors())
727	LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
728	<< "bytemap range " << prog_->bytemap_range() << ", "
729	<< "list count " << prog_->list_count();
730	// Fall back to NFA below.
731	skipped_test = true;
732	break;
733	}
734	return false;
735	}
736	break;
737	}
738
739	if (!skipped_test && ncap <= `1`) {
740	// We know exactly where it matches. That's enough.
741	if (ncap == `1`)
742	submatch[`0`] = match;
743	} else {
744	StringPiece subtext1;
745	if (skipped_test) {
746	// DFA ran out of memory or was skipped:
747	// need to search in entire original text.
748	subtext1 = subtext;
749	} else {
750	// DFA found the exact match location:
751	// let NFA run an anchored, full match search
752	// to find submatch locations.
753	subtext1 = match;
754	anchor = Prog::kAnchored;
755	kind = Prog::kFullMatch;
756	}
757
758	if (can_one_pass && anchor != Prog::kUnanchored) {
759	if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
760	if (!skipped_test && options_.log_errors())
761	LOG(ERROR) << "SearchOnePass inconsistency";
762	return false;
763	}
764	} else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
765	if (!prog_->SearchBitState(subtext1, text, anchor,
766	kind, submatch, ncap)) {
767	if (!skipped_test && options_.log_errors())
768	LOG(ERROR) << "SearchBitState inconsistency";
769	return false;
770	}
771	} else {
772	if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
773	if (!skipped_test && options_.log_errors())
774	LOG(ERROR) << "SearchNFA inconsistency";
775	return false;
776	}
777	}
778	}
779
780	// Adjust overall match for required prefix that we stripped off.
781	if (prefixlen > `0` && nsubmatch > `0`)
782	submatch[`0`] = StringPiece (submatch[`0`].data() - prefixlen,
783	submatch[`0`].size() + prefixlen);
784
785	// Zero submatches that don't exist in the regexp.
786	for (int i = ncap; i < nsubmatch; i++)
787	submatch[i] = StringPiece ();
788	return true;
789	}
790
791	// Internal matcher - like Match() but takes Args not StringPieces.
792	bool RE2::DoMatch(const StringPiece& text,
793	Anchor re_anchor,
794	size_t* consumed,
795	const Arg* const* args,
796	int n) const {
797	if (!ok()) {
798	if (options_.log_errors())
799	LOG(ERROR) << "Invalid RE2: " << *error_;
800	return false;
801	}
802
803	if (NumberOfCapturingGroups() < n) {
804	// RE has fewer capturing groups than number of Arg pointers passed in.
805	return false;
806	}
807
808	// Count number of capture groups needed.
809	int nvec;
810	if (n == `0` && consumed == NULL)
811	nvec = `0`;
812	else
813	nvec = n+`1`;
814
815	StringPiece* vec;
816	StringPiece stkvec[kVecSize];
817	StringPiece* heapvec = NULL;
818
819	if (nvec <= static_cast<int>(arraysize(stkvec))) {
820	vec = stkvec;
821	} else {
822	vec = new StringPiece[nvec];
823	heapvec = vec;
824	}
825
826	if (!Match(text, `0`, text.size(), re_anchor, vec, nvec)) {
827	delete[] heapvec;
828	return false;
829	}
830
831	if (consumed != NULL)
832	consumed = static_cast*<size_t>(vec[`0`].end() - text.begin());
833
834	if (n == `0` \|\| args == NULL) {
835	// We are not interested in results
836	delete[] heapvec;
837	return true;
838	}
839
840	// If we got here, we must have matched the whole pattern.
841	for (int i = `0`; i < n; i++) {
842	const StringPiece& s = vec[i+`1`];
843	if (!args[i]->Parse(s.data(), s.size())) {
844	// TODO: Should we indicate what the error was?
845	delete[] heapvec;
846	return false;
847	}
848	}
849
850	delete[] heapvec;
851	return true;
852	}
853
854	// Checks that the rewrite string is well-formed with respect to this
855	// regular expression.
856	bool RE2::CheckRewriteString(const StringPiece& rewrite,
857	std::string* error) const {
858	int max_token = -`1`;
859	for (const char s = rewrite.data(), end = s + rewrite.size();
860	s < end; s++) {
861	int c = *s;
862	if (c != `'\\'`) {
863	continue;
864	}
865	if (++s == end) {
866	*error = "Rewrite schema error: '\\' not allowed at end.";
867	return false;
868	}
869	c = *s;
870	if (c == `'\\'`) {
871	continue;
872	}
873	if (!isdigit(c)) {
874	*error = "Rewrite schema error: "
875	"'\\' must be followed by a digit or '\\'.";
876	return false;
877	}
878	int n = (c - `'0'`);
879	if (max_token < n) {
880	max_token = n;
881	}
882	}
883
884	if (max_token > NumberOfCapturingGroups()) {
885	*error = StringPrintf(
886	"Rewrite schema requests %d matches, but the regexp only has %d "
887	"parenthesized subexpressions.",
888	max_token, NumberOfCapturingGroups());
889	return false;
890	}
891	return true;
892	}
893
894	// Returns the maximum submatch needed for the rewrite to be done by Replace().
895	// E.g. if rewrite == "foo \\2,\\1", returns 2.
896	int RE2::MaxSubmatch(const StringPiece& rewrite) {
897	int max = `0`;
898	for (const char s = rewrite.data(), end = s + rewrite.size();
899	s < end; s++) {
900	if (*s == `'\\'`) {
901	s++;
902	int c = (s < end) ? *s : -`1`;
903	if (isdigit(c)) {
904	int n = (c - `'0'`);
905	if (n > max)
906	max = n;
907	}
908	}
909	}
910	return max;
911	}
912
913	// Append the "rewrite" string, with backslash subsitutions from "vec",
914	// to string "out".
915	bool RE2::Rewrite(std::string* out,
916	const StringPiece& rewrite,
917	const StringPiece* vec,
918	int veclen) const {
919	for (const char s = rewrite.data(), end = s + rewrite.size();
920	s < end; s++) {
921	if (*s != `'\\'`) {
922	out->push_back(*s);
923	continue;
924	}
925	s++;
926	int c = (s < end) ? *s : -`1`;
927	if (isdigit(c)) {
928	int n = (c - `'0'`);
929	if (n >= veclen) {
930	if (options_.log_errors()) {
931	LOG(ERROR) << "requested group " << n
932	<< " in regexp " << rewrite.data();
933	}
934	return false;
935	}
936	StringPiece snip = vec[n];
937	if (!snip.empty())
938	out->append(snip.data(), snip.size());
939	} else if (c == `'\\'`) {
940	out->push_back(`'\\'`);
941	} else {
942	if (options_.log_errors())
943	LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
944	return false;
945	}
946	}
947	return true;
948	}
949
950	/** Parsers for various types **/
951
952	bool RE2::Arg::parse_null(const char* str, size_t n, void* dest) {
953	// We fail if somebody asked us to store into a non-NULL void pointer*
954	return (dest == NULL);
955	}
956
957	bool RE2::Arg::parse_string(const char* str, size_t n, void* dest) {
958	if (dest == NULL) return true;
959	reinterpret_cast<std::string*>(dest)->assign(str, n);
960	return true;
961	}
962
963	bool RE2::Arg::parse_stringpiece(const char* str, size_t n, void* dest) {
964	if (dest == NULL) return true;
965	(reinterpret_cast<StringPiece>(dest)) = StringPiece (str, n);
966	return true;
967	}
968
969	bool RE2::Arg::parse_char(const char* str, size_t n, void* dest) {
970	if (n != `1`) return false;
971	if (dest == NULL) return true;
972	(reinterpret_cast<char**>(dest)) = str[`0`];
973	return true;
974	}
975
976	bool RE2::Arg::parse_schar(const char* str, size_t n, void* dest) {
977	if (n != `1`) return false;
978	if (dest == NULL) return true;
979	(reinterpret_cast<signed* char*>(dest)) = str[`0`];
980	return true;
981	}
982
983	bool RE2::Arg::parse_uchar(const char* str, size_t n, void* dest) {
984	if (n != `1`) return false;
985	if (dest == NULL) return true;
986	(reinterpret_cast<unsigned* char*>(dest)) = str[`0`];
987	return true;
988	}
989
990	// Largest number spec that we are willing to parse
991	static const int kMaxNumberLength = `32`;
992
993	// REQUIRES "buf" must have length at least nbuf.
994	// Copies "str" into "buf" and null-terminates.
995	// Overwrites np with the new length.*
996	static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
997	size_t* np, bool accept_spaces) {
998	size_t n = *np;
999	if (n == `0`) return "";
1000	if (n > `0` && isspace(*str)) {
1001	// We are less forgiving than the strtoxxx() routines and do not
1002	// allow leading spaces. We do allow leading spaces for floats.
1003	if (!accept_spaces) {
1004	return "";
1005	}
1006	while (n > `0` && isspace(*str)) {
1007	n--;
1008	str++;
1009	}
1010	}
1011
1012	// Although buf has a fixed maximum size, we can still handle
1013	// arbitrarily large integers correctly by omitting leading zeros.
1014	// (Numbers that are still too long will be out of range.)
1015	// Before deciding whether str is too long,
1016	// remove leading zeros with s/000+/00/.
1017	// Leaving the leading two zeros in place means that
1018	// we don't change 0000x123 (invalid) into 0x123 (valid).
1019	// Skip over leading - before replacing.
1020	bool neg = false;
1021	if (n >= `1` && str[`0`] == `'-'`) {
1022	neg = true;
1023	n--;
1024	str++;
1025	}
1026
1027	if (n >= `3` && str[`0`] == `'0'` && str[`1`] == `'0'`) {
1028	while (n >= `3` && str[`2`] == `'0'`) {
1029	n--;
1030	str++;
1031	}
1032	}
1033
1034	if (neg) { // make room in buf for -
1035	n++;
1036	str--;
1037	}
1038
1039	if (n > nbuf-`1`) return "";
1040
1041	memmove(buf, str, n);
1042	if (neg) {
1043	buf[`0`] = `'-'`;
1044	}
1045	buf[n] = `'\0'`;
1046	*np = n;
1047	return buf;
1048	}
1049
1050	bool RE2::Arg::parse_long_radix(const char* str,
1051	size_t n,
1052	void* dest,
1053	int radix) {
1054	if (n == `0`) return false;
1055	char buf[kMaxNumberLength+`1`];
1056	str = TerminateNumber(buf, sizeof buf, str, &n, false);
1057	char* end;
1058	errno = `0`;
1059	long r = strtol(str, &end, radix);
1060	if (end != str + n) return false; // Leftover junk
1061	if (errno) return false;
1062	if (dest == NULL) return true;
1063	(reinterpret_cast<long**>(dest)) = r;
1064	return true;
1065	}
1066
1067	bool RE2::Arg::parse_ulong_radix(const char* str,
1068	size_t n,
1069	void* dest,
1070	int radix) {
1071	if (n == `0`) return false;
1072	char buf[kMaxNumberLength+`1`];
1073	str = TerminateNumber(buf, sizeof buf, str, &n, false);
1074	if (str[`0`] == `'-'`) {
1075	// strtoul() will silently accept negative numbers and parse
1076	// them. This module is more strict and treats them as errors.
1077	return false;
1078	}
1079
1080	char* end;
1081	errno = `0`;
1082	unsigned long r = strtoul(str, &end, radix);
1083	if (end != str + n) return false; // Leftover junk
1084	if (errno) return false;
1085	if (dest == NULL) return true;
1086	(reinterpret_cast<unsigned* long*>(dest)) = r;
1087	return true;
1088	}
1089
1090	bool RE2::Arg::parse_short_radix(const char* str,
1091	size_t n,
1092	void* dest,
1093	int radix) {
1094	long r;
1095	if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1096	if ((short)r != r) return false; // Out of range
1097	if (dest == NULL) return true;
1098	(reinterpret_cast<short*>(dest)) = (short*)r;
1099	return true;
1100	}
1101
1102	bool RE2::Arg::parse_ushort_radix(const char* str,
1103	size_t n,
1104	void* dest,
1105	int radix) {
1106	unsigned long r;
1107	if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1108	if ((unsigned short)r != r) return false; // Out of range
1109	if (dest == NULL) return true;
1110	(reinterpret_cast<unsigned* short>(dest)) = (unsigned* short)r;
1111	return true;
1112	}
1113
1114	bool RE2::Arg::parse_int_radix(const char* str,
1115	size_t n,
1116	void* dest,
1117	int radix) {
1118	long r;
1119	if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1120	if ((int)r != r) return false; // Out of range
1121	if (dest == NULL) return true;
1122	(reinterpret_cast<int*>(dest)) = (int*)r;
1123	return true;
1124	}
1125
1126	bool RE2::Arg::parse_uint_radix(const char* str,
1127	size_t n,
1128	void* dest,
1129	int radix) {
1130	unsigned long r;
1131	if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1132	if ((unsigned int)r != r) return false; // Out of range
1133	if (dest == NULL) return true;
1134	(reinterpret_cast<unsigned* int>(dest)) = (unsigned* int)r;
1135	return true;
1136	}
1137
1138	bool RE2::Arg::parse_longlong_radix(const char* str,
1139	size_t n,
1140	void* dest,
1141	int radix) {
1142	if (n == `0`) return false;
1143	char buf[kMaxNumberLength+`1`];
1144	str = TerminateNumber(buf, sizeof buf, str, &n, false);
1145	char* end;
1146	errno = `0`;
1147	long long r = strtoll(str, &end, radix);
1148	if (end != str + n) return false; // Leftover junk
1149	if (errno) return false;
1150	if (dest == NULL) return true;
1151	(reinterpret_cast<long* long*>(dest)) = r;
1152	return true;
1153	}
1154
1155	bool RE2::Arg::parse_ulonglong_radix(const char* str,
1156	size_t n,
1157	void* dest,
1158	int radix) {
1159	if (n == `0`) return false;
1160	char buf[kMaxNumberLength+`1`];
1161	str = TerminateNumber(buf, sizeof buf, str, &n, false);
1162	if (str[`0`] == `'-'`) {
1163	// strtoull() will silently accept negative numbers and parse
1164	// them. This module is more strict and treats them as errors.
1165	return false;
1166	}
1167	char* end;
1168	errno = `0`;
1169	unsigned long long r = strtoull(str, &end, radix);
1170	if (end != str + n) return false; // Leftover junk
1171	if (errno) return false;
1172	if (dest == NULL) return true;
1173	(reinterpret_cast<unsigned* long long*>(dest)) = r;
1174	return true;
1175	}
1176
1177	static bool parse_double_float(const char* str, size_t n, bool isfloat,
1178	void* dest) {
1179	if (n == `0`) return false;
1180	static const int kMaxLength = `200`;
1181	char buf[kMaxLength+`1`];
1182	str = TerminateNumber(buf, sizeof buf, str, &n, true);
1183	char* end;
1184	errno = `0`;
1185	double r;
1186	if (isfloat) {
1187	r = strtof(str, &end);
1188	} else {
1189	r = strtod(str, &end);
1190	}
1191	if (end != str + n) return false; // Leftover junk
1192	if (errno) return false;
1193	if (dest == NULL) return true;
1194	if (isfloat) {
1195	(reinterpret_cast<float*>(dest)) = (float*)r;
1196	} else {
1197	(reinterpret_cast<double**>(dest)) = r;
1198	}
1199	return true;
1200	}
1201
1202	bool RE2::Arg::parse_double(const char* str, size_t n, void* dest) {
1203	return parse_double_float(str, n, false, dest);
1204	}
1205
1206	bool RE2::Arg::parse_float(const char* str, size_t n, void* dest) {
1207	return parse_double_float(str, n, true, dest);
1208	}
1209
1210	#define DEFINE_INTEGER_PARSER(name) \
1211	bool RE2::Arg::parse_##name(const char* str, size_t n, void* dest) { \
1212	return parse_##name##_radix(str, n, dest, 10); \
1213	} \
1214	bool RE2::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \
1215	return parse_##name##_radix(str, n, dest, 16); \
1216	} \
1217	bool RE2::Arg::parse_##name##_octal(const char* str, size_t n, void* dest) { \
1218	return parse_##name##_radix(str, n, dest, 8); \
1219	} \
1220	bool RE2::Arg::parse_##name##_cradix(const char* str, size_t n, \
1221	void* dest) { \
1222	return parse_##name##_radix(str, n, dest, 0); \
1223	}
1224
1225	DEFINE_INTEGER_PARSER(short);
1226	DEFINE_INTEGER_PARSER(ushort);
1227	DEFINE_INTEGER_PARSER(int);
1228	DEFINE_INTEGER_PARSER(uint);
1229	DEFINE_INTEGER_PARSER(long);
1230	DEFINE_INTEGER_PARSER(ulong);
1231	DEFINE_INTEGER_PARSER(longlong);
1232	DEFINE_INTEGER_PARSER(ulonglong);
1233
1234	#undef DEFINE_INTEGER_PARSER
1235
1236	} // namespace re2
1237

Browse the source code of RE2/re2/re2.cc