1 | // Copyright 2003-2009 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // Regular expression interface RE2. |
6 | // |
7 | // Originally the PCRE C++ wrapper, but adapted to use |
8 | // the new automata-based regular expression engines. |
9 | |
10 | #include "re2/re2.h" |
11 | |
12 | #include <assert.h> |
13 | #include <ctype.h> |
14 | #include <errno.h> |
15 | #include <stdint.h> |
16 | #include <stdlib.h> |
17 | #include <string.h> |
18 | #include <algorithm> |
19 | #include <iterator> |
20 | #include <mutex> |
21 | #include <string> |
22 | #include <utility> |
23 | #include <vector> |
24 | |
25 | #include "util/util.h" |
26 | #include "util/logging.h" |
27 | #include "util/strutil.h" |
28 | #include "util/utf.h" |
29 | #include "re2/prog.h" |
30 | #include "re2/regexp.h" |
31 | #include "re2/sparse_array.h" |
32 | |
33 | namespace re2 { |
34 | |
35 | // Maximum number of args we can set |
36 | static const int kMaxArgs = 16; |
37 | static const int kVecSize = 1+kMaxArgs; |
38 | |
39 | const int RE2::Options::kDefaultMaxMem; // initialized in re2.h |
40 | |
41 | RE2::Options::Options(RE2::CannedOptions opt) |
42 | : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), |
43 | posix_syntax_(opt == RE2::POSIX), |
44 | longest_match_(opt == RE2::POSIX), |
45 | log_errors_(opt != RE2::Quiet), |
46 | max_mem_(kDefaultMaxMem), |
47 | literal_(false), |
48 | never_nl_(false), |
49 | dot_nl_(false), |
50 | never_capture_(false), |
51 | case_sensitive_(true), |
52 | perl_classes_(false), |
53 | word_boundary_(false), |
54 | one_line_(false) { |
55 | } |
56 | |
57 | // static empty objects for use as const references. |
58 | // To avoid global constructors, allocated in RE2::Init(). |
59 | static const std::string* empty_string; |
60 | static const std::map<std::string, int>* empty_named_groups; |
61 | static const std::map<int, std::string>* empty_group_names; |
62 | |
63 | // Converts from Regexp error code to RE2 error code. |
64 | // Maybe some day they will diverge. In any event, this |
65 | // hides the existence of Regexp from RE2 users. |
66 | static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) { |
67 | switch (code) { |
68 | case re2::kRegexpSuccess: |
69 | return RE2::NoError; |
70 | case re2::kRegexpInternalError: |
71 | return RE2::ErrorInternal; |
72 | case re2::kRegexpBadEscape: |
73 | return RE2::ErrorBadEscape; |
74 | case re2::kRegexpBadCharClass: |
75 | return RE2::ErrorBadCharClass; |
76 | case re2::kRegexpBadCharRange: |
77 | return RE2::ErrorBadCharRange; |
78 | case re2::kRegexpMissingBracket: |
79 | return RE2::ErrorMissingBracket; |
80 | case re2::kRegexpMissingParen: |
81 | return RE2::ErrorMissingParen; |
82 | case re2::kRegexpTrailingBackslash: |
83 | return RE2::ErrorTrailingBackslash; |
84 | case re2::kRegexpRepeatArgument: |
85 | return RE2::ErrorRepeatArgument; |
86 | case re2::kRegexpRepeatSize: |
87 | return RE2::ErrorRepeatSize; |
88 | case re2::kRegexpRepeatOp: |
89 | return RE2::ErrorRepeatOp; |
90 | case re2::kRegexpBadPerlOp: |
91 | return RE2::ErrorBadPerlOp; |
92 | case re2::kRegexpBadUTF8: |
93 | return RE2::ErrorBadUTF8; |
94 | case re2::kRegexpBadNamedCapture: |
95 | return RE2::ErrorBadNamedCapture; |
96 | } |
97 | return RE2::ErrorInternal; |
98 | } |
99 | |
100 | static std::string trunc(const StringPiece& pattern) { |
101 | if (pattern.size() < 100) |
102 | return std::string(pattern); |
103 | return std::string(pattern.substr(0, 100)) + "..." ; |
104 | } |
105 | |
106 | |
107 | RE2::RE2(const char* pattern) { |
108 | Init(pattern, DefaultOptions); |
109 | } |
110 | |
111 | RE2::RE2(const std::string& pattern) { |
112 | Init(pattern, DefaultOptions); |
113 | } |
114 | |
115 | RE2::RE2(const StringPiece& pattern) { |
116 | Init(pattern, DefaultOptions); |
117 | } |
118 | |
119 | RE2::RE2(const StringPiece& pattern, const Options& options) { |
120 | Init(pattern, options); |
121 | } |
122 | |
123 | int RE2::Options::ParseFlags() const { |
124 | int flags = Regexp::ClassNL; |
125 | switch (encoding()) { |
126 | default: |
127 | if (log_errors()) |
128 | LOG(ERROR) << "Unknown encoding " << encoding(); |
129 | break; |
130 | case RE2::Options::EncodingUTF8: |
131 | break; |
132 | case RE2::Options::EncodingLatin1: |
133 | flags |= Regexp::Latin1; |
134 | break; |
135 | } |
136 | |
137 | if (!posix_syntax()) |
138 | flags |= Regexp::LikePerl; |
139 | |
140 | if (literal()) |
141 | flags |= Regexp::Literal; |
142 | |
143 | if (never_nl()) |
144 | flags |= Regexp::NeverNL; |
145 | |
146 | if (dot_nl()) |
147 | flags |= Regexp::DotNL; |
148 | |
149 | if (never_capture()) |
150 | flags |= Regexp::NeverCapture; |
151 | |
152 | if (!case_sensitive()) |
153 | flags |= Regexp::FoldCase; |
154 | |
155 | if (perl_classes()) |
156 | flags |= Regexp::PerlClasses; |
157 | |
158 | if (word_boundary()) |
159 | flags |= Regexp::PerlB; |
160 | |
161 | if (one_line()) |
162 | flags |= Regexp::OneLine; |
163 | |
164 | return flags; |
165 | } |
166 | |
167 | void RE2::Init(const StringPiece& pattern, const Options& options) { |
168 | static std::once_flag empty_once; |
169 | std::call_once(empty_once, []() { |
170 | empty_string = new std::string; |
171 | empty_named_groups = new std::map<std::string, int>; |
172 | empty_group_names = new std::map<int, std::string>; |
173 | }); |
174 | |
175 | pattern_ = std::string(pattern); |
176 | options_.Copy(options); |
177 | entire_regexp_ = NULL; |
178 | suffix_regexp_ = NULL; |
179 | prog_ = NULL; |
180 | num_captures_ = -1; |
181 | rprog_ = NULL; |
182 | error_ = empty_string; |
183 | error_code_ = NoError; |
184 | named_groups_ = NULL; |
185 | group_names_ = NULL; |
186 | |
187 | RegexpStatus status; |
188 | entire_regexp_ = Regexp::Parse( |
189 | pattern_, |
190 | static_cast<Regexp::ParseFlags>(options_.ParseFlags()), |
191 | &status); |
192 | if (entire_regexp_ == NULL) { |
193 | if (options_.log_errors()) { |
194 | LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': " |
195 | << status.Text(); |
196 | } |
197 | error_ = new std::string(status.Text()); |
198 | error_code_ = RegexpErrorToRE2(status.code()); |
199 | error_arg_ = std::string(status.error_arg()); |
200 | return; |
201 | } |
202 | |
203 | re2::Regexp* suffix; |
204 | if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix)) |
205 | suffix_regexp_ = suffix; |
206 | else |
207 | suffix_regexp_ = entire_regexp_->Incref(); |
208 | |
209 | // Two thirds of the memory goes to the forward Prog, |
210 | // one third to the reverse prog, because the forward |
211 | // Prog has two DFAs but the reverse prog has one. |
212 | prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3); |
213 | if (prog_ == NULL) { |
214 | if (options_.log_errors()) |
215 | LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'" ; |
216 | error_ = new std::string("pattern too large - compile failed" ); |
217 | error_code_ = RE2::ErrorPatternTooLarge; |
218 | return; |
219 | } |
220 | |
221 | // We used to compute this lazily, but it's used during the |
222 | // typical control flow for a match call, so we now compute |
223 | // it eagerly, which avoids the overhead of std::once_flag. |
224 | num_captures_ = suffix_regexp_->NumCaptures(); |
225 | |
226 | // Could delay this until the first match call that |
227 | // cares about submatch information, but the one-pass |
228 | // machine's memory gets cut from the DFA memory budget, |
229 | // and that is harder to do if the DFA has already |
230 | // been built. |
231 | is_one_pass_ = prog_->IsOnePass(); |
232 | } |
233 | |
234 | // Returns rprog_, computing it if needed. |
235 | re2::Prog* RE2::ReverseProg() const { |
236 | std::call_once(rprog_once_, [](const RE2* re) { |
237 | re->rprog_ = |
238 | re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3); |
239 | if (re->rprog_ == NULL) { |
240 | if (re->options_.log_errors()) |
241 | LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'" ; |
242 | re->error_ = |
243 | new std::string("pattern too large - reverse compile failed" ); |
244 | re->error_code_ = RE2::ErrorPatternTooLarge; |
245 | } |
246 | }, this); |
247 | return rprog_; |
248 | } |
249 | |
250 | RE2::~RE2() { |
251 | if (suffix_regexp_) |
252 | suffix_regexp_->Decref(); |
253 | if (entire_regexp_) |
254 | entire_regexp_->Decref(); |
255 | delete prog_; |
256 | delete rprog_; |
257 | if (error_ != empty_string) |
258 | delete error_; |
259 | if (named_groups_ != NULL && named_groups_ != empty_named_groups) |
260 | delete named_groups_; |
261 | if (group_names_ != NULL && group_names_ != empty_group_names) |
262 | delete group_names_; |
263 | } |
264 | |
265 | int RE2::ProgramSize() const { |
266 | if (prog_ == NULL) |
267 | return -1; |
268 | return prog_->size(); |
269 | } |
270 | |
271 | int RE2::ReverseProgramSize() const { |
272 | if (prog_ == NULL) |
273 | return -1; |
274 | Prog* prog = ReverseProg(); |
275 | if (prog == NULL) |
276 | return -1; |
277 | return prog->size(); |
278 | } |
279 | |
280 | static int Fanout(Prog* prog, std::map<int, int>* histogram) { |
281 | SparseArray<int> fanout(prog->size()); |
282 | prog->Fanout(&fanout); |
283 | histogram->clear(); |
284 | for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) { |
285 | // TODO(junyer): Optimise this? |
286 | int bucket = 0; |
287 | while (1 << bucket < i->value()) { |
288 | bucket++; |
289 | } |
290 | (*histogram)[bucket]++; |
291 | } |
292 | return histogram->rbegin()->first; |
293 | } |
294 | |
295 | int RE2::ProgramFanout(std::map<int, int>* histogram) const { |
296 | if (prog_ == NULL) |
297 | return -1; |
298 | return Fanout(prog_, histogram); |
299 | } |
300 | |
301 | int RE2::ReverseProgramFanout(std::map<int, int>* histogram) const { |
302 | if (prog_ == NULL) |
303 | return -1; |
304 | Prog* prog = ReverseProg(); |
305 | if (prog == NULL) |
306 | return -1; |
307 | return Fanout(prog, histogram); |
308 | } |
309 | |
310 | // Returns named_groups_, computing it if needed. |
311 | const std::map<std::string, int>& RE2::NamedCapturingGroups() const { |
312 | std::call_once(named_groups_once_, [](const RE2* re) { |
313 | if (re->suffix_regexp_ != NULL) |
314 | re->named_groups_ = re->suffix_regexp_->NamedCaptures(); |
315 | if (re->named_groups_ == NULL) |
316 | re->named_groups_ = empty_named_groups; |
317 | }, this); |
318 | return *named_groups_; |
319 | } |
320 | |
321 | // Returns group_names_, computing it if needed. |
322 | const std::map<int, std::string>& RE2::CapturingGroupNames() const { |
323 | std::call_once(group_names_once_, [](const RE2* re) { |
324 | if (re->suffix_regexp_ != NULL) |
325 | re->group_names_ = re->suffix_regexp_->CaptureNames(); |
326 | if (re->group_names_ == NULL) |
327 | re->group_names_ = empty_group_names; |
328 | }, this); |
329 | return *group_names_; |
330 | } |
331 | |
332 | /***** Convenience interfaces *****/ |
333 | |
334 | bool RE2::FullMatchN(const StringPiece& text, const RE2& re, |
335 | const Arg* const args[], int n) { |
336 | return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n); |
337 | } |
338 | |
339 | bool RE2::PartialMatchN(const StringPiece& text, const RE2& re, |
340 | const Arg* const args[], int n) { |
341 | return re.DoMatch(text, UNANCHORED, NULL, args, n); |
342 | } |
343 | |
344 | bool RE2::ConsumeN(StringPiece* input, const RE2& re, |
345 | const Arg* const args[], int n) { |
346 | size_t consumed; |
347 | if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) { |
348 | input->remove_prefix(consumed); |
349 | return true; |
350 | } else { |
351 | return false; |
352 | } |
353 | } |
354 | |
355 | bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re, |
356 | const Arg* const args[], int n) { |
357 | size_t consumed; |
358 | if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) { |
359 | input->remove_prefix(consumed); |
360 | return true; |
361 | } else { |
362 | return false; |
363 | } |
364 | } |
365 | |
366 | bool RE2::Replace(std::string* str, |
367 | const RE2& re, |
368 | const StringPiece& rewrite) { |
369 | StringPiece vec[kVecSize]; |
370 | int nvec = 1 + MaxSubmatch(rewrite); |
371 | if (nvec > static_cast<int>(arraysize(vec))) |
372 | return false; |
373 | if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) |
374 | return false; |
375 | |
376 | std::string s; |
377 | if (!re.Rewrite(&s, rewrite, vec, nvec)) |
378 | return false; |
379 | |
380 | assert(vec[0].data() >= str->data()); |
381 | assert(vec[0].data() + vec[0].size() <= str->data() + str->size()); |
382 | str->replace(vec[0].data() - str->data(), vec[0].size(), s); |
383 | return true; |
384 | } |
385 | |
386 | int RE2::GlobalReplace(std::string* str, |
387 | const RE2& re, |
388 | const StringPiece& rewrite) { |
389 | StringPiece vec[kVecSize]; |
390 | int nvec = 1 + MaxSubmatch(rewrite); |
391 | if (nvec > static_cast<int>(arraysize(vec))) |
392 | return false; |
393 | |
394 | const char* p = str->data(); |
395 | const char* ep = p + str->size(); |
396 | const char* lastend = NULL; |
397 | std::string out; |
398 | int count = 0; |
399 | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
400 | // Iterate just once when fuzzing. Otherwise, we easily get bogged down |
401 | // and coverage is unlikely to improve despite significant expense. |
402 | while (p == str->data()) { |
403 | #else |
404 | while (p <= ep) { |
405 | #endif |
406 | if (!re.Match(*str, static_cast<size_t>(p - str->data()), |
407 | str->size(), UNANCHORED, vec, nvec)) |
408 | break; |
409 | if (p < vec[0].data()) |
410 | out.append(p, vec[0].data() - p); |
411 | if (vec[0].data() == lastend && vec[0].empty()) { |
412 | // Disallow empty match at end of last match: skip ahead. |
413 | // |
414 | // fullrune() takes int, not ptrdiff_t. However, it just looks |
415 | // at the leading byte and treats any length >= 4 the same. |
416 | if (re.options().encoding() == RE2::Options::EncodingUTF8 && |
417 | fullrune(p, static_cast<int>(std::min(ptrdiff_t{4}, ep - p)))) { |
418 | // re is in UTF-8 mode and there is enough left of str |
419 | // to allow us to advance by up to UTFmax bytes. |
420 | Rune r; |
421 | int n = chartorune(&r, p); |
422 | // Some copies of chartorune have a bug that accepts |
423 | // encodings of values in (10FFFF, 1FFFFF] as valid. |
424 | if (r > Runemax) { |
425 | n = 1; |
426 | r = Runeerror; |
427 | } |
428 | if (!(n == 1 && r == Runeerror)) { // no decoding error |
429 | out.append(p, n); |
430 | p += n; |
431 | continue; |
432 | } |
433 | } |
434 | // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode, |
435 | // we fell through from above and the GIGO principle applies. |
436 | if (p < ep) |
437 | out.append(p, 1); |
438 | p++; |
439 | continue; |
440 | } |
441 | re.Rewrite(&out, rewrite, vec, nvec); |
442 | p = vec[0].data() + vec[0].size(); |
443 | lastend = p; |
444 | count++; |
445 | } |
446 | |
447 | if (count == 0) |
448 | return 0; |
449 | |
450 | if (p < ep) |
451 | out.append(p, ep - p); |
452 | using std::swap; |
453 | swap(out, *str); |
454 | return count; |
455 | } |
456 | |
457 | bool RE2::(const StringPiece& text, |
458 | const RE2& re, |
459 | const StringPiece& rewrite, |
460 | std::string* out) { |
461 | StringPiece vec[kVecSize]; |
462 | int nvec = 1 + MaxSubmatch(rewrite); |
463 | if (nvec > static_cast<int>(arraysize(vec))) |
464 | return false; |
465 | |
466 | if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec)) |
467 | return false; |
468 | |
469 | out->clear(); |
470 | return re.Rewrite(out, rewrite, vec, nvec); |
471 | } |
472 | |
473 | std::string RE2::QuoteMeta(const StringPiece& unquoted) { |
474 | std::string result; |
475 | result.reserve(unquoted.size() << 1); |
476 | |
477 | // Escape any ascii character not in [A-Za-z_0-9]. |
478 | // |
479 | // Note that it's legal to escape a character even if it has no |
480 | // special meaning in a regular expression -- so this function does |
481 | // that. (This also makes it identical to the perl function of the |
482 | // same name except for the null-character special case; |
483 | // see `perldoc -f quotemeta`.) |
484 | for (size_t ii = 0; ii < unquoted.size(); ++ii) { |
485 | // Note that using 'isalnum' here raises the benchmark time from |
486 | // 32ns to 58ns: |
487 | if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && |
488 | (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && |
489 | (unquoted[ii] < '0' || unquoted[ii] > '9') && |
490 | unquoted[ii] != '_' && |
491 | // If this is the part of a UTF8 or Latin1 character, we need |
492 | // to copy this byte without escaping. Experimentally this is |
493 | // what works correctly with the regexp library. |
494 | !(unquoted[ii] & 128)) { |
495 | if (unquoted[ii] == '\0') { // Special handling for null chars. |
496 | // Note that this special handling is not strictly required for RE2, |
497 | // but this quoting is required for other regexp libraries such as |
498 | // PCRE. |
499 | // Can't use "\\0" since the next character might be a digit. |
500 | result += "\\x00" ; |
501 | continue; |
502 | } |
503 | result += '\\'; |
504 | } |
505 | result += unquoted[ii]; |
506 | } |
507 | |
508 | return result; |
509 | } |
510 | |
511 | bool RE2::PossibleMatchRange(std::string* min, std::string* max, |
512 | int maxlen) const { |
513 | if (prog_ == NULL) |
514 | return false; |
515 | |
516 | int n = static_cast<int>(prefix_.size()); |
517 | if (n > maxlen) |
518 | n = maxlen; |
519 | |
520 | // Determine initial min max from prefix_ literal. |
521 | *min = prefix_.substr(0, n); |
522 | *max = prefix_.substr(0, n); |
523 | if (prefix_foldcase_) { |
524 | // prefix is ASCII lowercase; change *min to uppercase. |
525 | for (int i = 0; i < n; i++) { |
526 | char& c = (*min)[i]; |
527 | if ('a' <= c && c <= 'z') |
528 | c += 'A' - 'a'; |
529 | } |
530 | } |
531 | |
532 | // Add to prefix min max using PossibleMatchRange on regexp. |
533 | std::string dmin, dmax; |
534 | maxlen -= n; |
535 | if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) { |
536 | min->append(dmin); |
537 | max->append(dmax); |
538 | } else if (!max->empty()) { |
539 | // prog_->PossibleMatchRange has failed us, |
540 | // but we still have useful information from prefix_. |
541 | // Round up *max to allow any possible suffix. |
542 | PrefixSuccessor(max); |
543 | } else { |
544 | // Nothing useful. |
545 | *min = "" ; |
546 | *max = "" ; |
547 | return false; |
548 | } |
549 | |
550 | return true; |
551 | } |
552 | |
553 | // Avoid possible locale nonsense in standard strcasecmp. |
554 | // The string a is known to be all lowercase. |
555 | static int ascii_strcasecmp(const char* a, const char* b, size_t len) { |
556 | const char* ae = a + len; |
557 | |
558 | for (; a < ae; a++, b++) { |
559 | uint8_t x = *a; |
560 | uint8_t y = *b; |
561 | if ('A' <= y && y <= 'Z') |
562 | y += 'a' - 'A'; |
563 | if (x != y) |
564 | return x - y; |
565 | } |
566 | return 0; |
567 | } |
568 | |
569 | |
570 | /***** Actual matching and rewriting code *****/ |
571 | |
572 | bool RE2::Match(const StringPiece& text, |
573 | size_t startpos, |
574 | size_t endpos, |
575 | Anchor re_anchor, |
576 | StringPiece* submatch, |
577 | int nsubmatch) const { |
578 | if (!ok()) { |
579 | if (options_.log_errors()) |
580 | LOG(ERROR) << "Invalid RE2: " << *error_; |
581 | return false; |
582 | } |
583 | |
584 | if (startpos > endpos || endpos > text.size()) { |
585 | if (options_.log_errors()) |
586 | LOG(ERROR) << "RE2: invalid startpos, endpos pair. [" |
587 | << "startpos: " << startpos << ", " |
588 | << "endpos: " << endpos << ", " |
589 | << "text size: " << text.size() << "]" ; |
590 | return false; |
591 | } |
592 | |
593 | StringPiece subtext = text; |
594 | subtext.remove_prefix(startpos); |
595 | subtext.remove_suffix(text.size() - endpos); |
596 | |
597 | // Use DFAs to find exact location of match, filter out non-matches. |
598 | |
599 | // Don't ask for the location if we won't use it. |
600 | // SearchDFA can do extra optimizations in that case. |
601 | StringPiece match; |
602 | StringPiece* matchp = &match; |
603 | if (nsubmatch == 0) |
604 | matchp = NULL; |
605 | |
606 | int ncap = 1 + NumberOfCapturingGroups(); |
607 | if (ncap > nsubmatch) |
608 | ncap = nsubmatch; |
609 | |
610 | // If the regexp is anchored explicitly, must not be in middle of text. |
611 | if (prog_->anchor_start() && startpos != 0) |
612 | return false; |
613 | |
614 | // If the regexp is anchored explicitly, update re_anchor |
615 | // so that we can potentially fall into a faster case below. |
616 | if (prog_->anchor_start() && prog_->anchor_end()) |
617 | re_anchor = ANCHOR_BOTH; |
618 | else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH) |
619 | re_anchor = ANCHOR_START; |
620 | |
621 | // Check for the required prefix, if any. |
622 | size_t prefixlen = 0; |
623 | if (!prefix_.empty()) { |
624 | if (startpos != 0) |
625 | return false; |
626 | prefixlen = prefix_.size(); |
627 | if (prefixlen > subtext.size()) |
628 | return false; |
629 | if (prefix_foldcase_) { |
630 | if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0) |
631 | return false; |
632 | } else { |
633 | if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0) |
634 | return false; |
635 | } |
636 | subtext.remove_prefix(prefixlen); |
637 | // If there is a required prefix, the anchor must be at least ANCHOR_START. |
638 | if (re_anchor != ANCHOR_BOTH) |
639 | re_anchor = ANCHOR_START; |
640 | } |
641 | |
642 | Prog::Anchor anchor = Prog::kUnanchored; |
643 | Prog::MatchKind kind = Prog::kFirstMatch; |
644 | if (options_.longest_match()) |
645 | kind = Prog::kLongestMatch; |
646 | bool skipped_test = false; |
647 | |
648 | bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture); |
649 | |
650 | // BitState allocates a bitmap of size prog_->list_count() * text.size(). |
651 | // It also allocates a stack of 3-word structures which could potentially |
652 | // grow as large as prog_->list_count() * text.size(), but in practice is |
653 | // much smaller. |
654 | const int kMaxBitStateBitmapSize = 256*1024; // bitmap size <= max (bits) |
655 | bool can_bit_state = prog_->CanBitState(); |
656 | size_t bit_state_text_max = kMaxBitStateBitmapSize / prog_->list_count(); |
657 | |
658 | bool dfa_failed = false; |
659 | switch (re_anchor) { |
660 | default: |
661 | case UNANCHORED: { |
662 | if (!prog_->SearchDFA(subtext, text, anchor, kind, |
663 | matchp, &dfa_failed, NULL)) { |
664 | if (dfa_failed) { |
665 | if (options_.log_errors()) |
666 | LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", " |
667 | << "bytemap range " << prog_->bytemap_range() << ", " |
668 | << "list count " << prog_->list_count(); |
669 | // Fall back to NFA below. |
670 | skipped_test = true; |
671 | break; |
672 | } |
673 | return false; |
674 | } |
675 | if (matchp == NULL) // Matched. Don't care where |
676 | return true; |
677 | // SearchDFA set match[0].end() but didn't know where the |
678 | // match started. Run the regexp backward from match[0].end() |
679 | // to find the longest possible match -- that's where it started. |
680 | Prog* prog = ReverseProg(); |
681 | if (prog == NULL) |
682 | return false; |
683 | if (!prog->SearchDFA(match, text, Prog::kAnchored, |
684 | Prog::kLongestMatch, &match, &dfa_failed, NULL)) { |
685 | if (dfa_failed) { |
686 | if (options_.log_errors()) |
687 | LOG(ERROR) << "DFA out of memory: size " << prog->size() << ", " |
688 | << "bytemap range " << prog->bytemap_range() << ", " |
689 | << "list count " << prog->list_count(); |
690 | // Fall back to NFA below. |
691 | skipped_test = true; |
692 | break; |
693 | } |
694 | if (options_.log_errors()) |
695 | LOG(ERROR) << "SearchDFA inconsistency" ; |
696 | return false; |
697 | } |
698 | break; |
699 | } |
700 | |
701 | case ANCHOR_BOTH: |
702 | case ANCHOR_START: |
703 | if (re_anchor == ANCHOR_BOTH) |
704 | kind = Prog::kFullMatch; |
705 | anchor = Prog::kAnchored; |
706 | |
707 | // If only a small amount of text and need submatch |
708 | // information anyway and we're going to use OnePass or BitState |
709 | // to get it, we might as well not even bother with the DFA: |
710 | // OnePass or BitState will be fast enough. |
711 | // On tiny texts, OnePass outruns even the DFA, and |
712 | // it doesn't have the shared state and occasional mutex that |
713 | // the DFA does. |
714 | if (can_one_pass && text.size() <= 4096 && |
715 | (ncap > 1 || text.size() <= 8)) { |
716 | skipped_test = true; |
717 | break; |
718 | } |
719 | if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) { |
720 | skipped_test = true; |
721 | break; |
722 | } |
723 | if (!prog_->SearchDFA(subtext, text, anchor, kind, |
724 | &match, &dfa_failed, NULL)) { |
725 | if (dfa_failed) { |
726 | if (options_.log_errors()) |
727 | LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", " |
728 | << "bytemap range " << prog_->bytemap_range() << ", " |
729 | << "list count " << prog_->list_count(); |
730 | // Fall back to NFA below. |
731 | skipped_test = true; |
732 | break; |
733 | } |
734 | return false; |
735 | } |
736 | break; |
737 | } |
738 | |
739 | if (!skipped_test && ncap <= 1) { |
740 | // We know exactly where it matches. That's enough. |
741 | if (ncap == 1) |
742 | submatch[0] = match; |
743 | } else { |
744 | StringPiece subtext1; |
745 | if (skipped_test) { |
746 | // DFA ran out of memory or was skipped: |
747 | // need to search in entire original text. |
748 | subtext1 = subtext; |
749 | } else { |
750 | // DFA found the exact match location: |
751 | // let NFA run an anchored, full match search |
752 | // to find submatch locations. |
753 | subtext1 = match; |
754 | anchor = Prog::kAnchored; |
755 | kind = Prog::kFullMatch; |
756 | } |
757 | |
758 | if (can_one_pass && anchor != Prog::kUnanchored) { |
759 | if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) { |
760 | if (!skipped_test && options_.log_errors()) |
761 | LOG(ERROR) << "SearchOnePass inconsistency" ; |
762 | return false; |
763 | } |
764 | } else if (can_bit_state && subtext1.size() <= bit_state_text_max) { |
765 | if (!prog_->SearchBitState(subtext1, text, anchor, |
766 | kind, submatch, ncap)) { |
767 | if (!skipped_test && options_.log_errors()) |
768 | LOG(ERROR) << "SearchBitState inconsistency" ; |
769 | return false; |
770 | } |
771 | } else { |
772 | if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) { |
773 | if (!skipped_test && options_.log_errors()) |
774 | LOG(ERROR) << "SearchNFA inconsistency" ; |
775 | return false; |
776 | } |
777 | } |
778 | } |
779 | |
780 | // Adjust overall match for required prefix that we stripped off. |
781 | if (prefixlen > 0 && nsubmatch > 0) |
782 | submatch[0] = StringPiece(submatch[0].data() - prefixlen, |
783 | submatch[0].size() + prefixlen); |
784 | |
785 | // Zero submatches that don't exist in the regexp. |
786 | for (int i = ncap; i < nsubmatch; i++) |
787 | submatch[i] = StringPiece(); |
788 | return true; |
789 | } |
790 | |
791 | // Internal matcher - like Match() but takes Args not StringPieces. |
792 | bool RE2::DoMatch(const StringPiece& text, |
793 | Anchor re_anchor, |
794 | size_t* consumed, |
795 | const Arg* const* args, |
796 | int n) const { |
797 | if (!ok()) { |
798 | if (options_.log_errors()) |
799 | LOG(ERROR) << "Invalid RE2: " << *error_; |
800 | return false; |
801 | } |
802 | |
803 | if (NumberOfCapturingGroups() < n) { |
804 | // RE has fewer capturing groups than number of Arg pointers passed in. |
805 | return false; |
806 | } |
807 | |
808 | // Count number of capture groups needed. |
809 | int nvec; |
810 | if (n == 0 && consumed == NULL) |
811 | nvec = 0; |
812 | else |
813 | nvec = n+1; |
814 | |
815 | StringPiece* vec; |
816 | StringPiece stkvec[kVecSize]; |
817 | StringPiece* heapvec = NULL; |
818 | |
819 | if (nvec <= static_cast<int>(arraysize(stkvec))) { |
820 | vec = stkvec; |
821 | } else { |
822 | vec = new StringPiece[nvec]; |
823 | heapvec = vec; |
824 | } |
825 | |
826 | if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) { |
827 | delete[] heapvec; |
828 | return false; |
829 | } |
830 | |
831 | if (consumed != NULL) |
832 | *consumed = static_cast<size_t>(vec[0].end() - text.begin()); |
833 | |
834 | if (n == 0 || args == NULL) { |
835 | // We are not interested in results |
836 | delete[] heapvec; |
837 | return true; |
838 | } |
839 | |
840 | // If we got here, we must have matched the whole pattern. |
841 | for (int i = 0; i < n; i++) { |
842 | const StringPiece& s = vec[i+1]; |
843 | if (!args[i]->Parse(s.data(), s.size())) { |
844 | // TODO: Should we indicate what the error was? |
845 | delete[] heapvec; |
846 | return false; |
847 | } |
848 | } |
849 | |
850 | delete[] heapvec; |
851 | return true; |
852 | } |
853 | |
854 | // Checks that the rewrite string is well-formed with respect to this |
855 | // regular expression. |
856 | bool RE2::CheckRewriteString(const StringPiece& rewrite, |
857 | std::string* error) const { |
858 | int max_token = -1; |
859 | for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
860 | s < end; s++) { |
861 | int c = *s; |
862 | if (c != '\\') { |
863 | continue; |
864 | } |
865 | if (++s == end) { |
866 | *error = "Rewrite schema error: '\\' not allowed at end." ; |
867 | return false; |
868 | } |
869 | c = *s; |
870 | if (c == '\\') { |
871 | continue; |
872 | } |
873 | if (!isdigit(c)) { |
874 | *error = "Rewrite schema error: " |
875 | "'\\' must be followed by a digit or '\\'." ; |
876 | return false; |
877 | } |
878 | int n = (c - '0'); |
879 | if (max_token < n) { |
880 | max_token = n; |
881 | } |
882 | } |
883 | |
884 | if (max_token > NumberOfCapturingGroups()) { |
885 | *error = StringPrintf( |
886 | "Rewrite schema requests %d matches, but the regexp only has %d " |
887 | "parenthesized subexpressions." , |
888 | max_token, NumberOfCapturingGroups()); |
889 | return false; |
890 | } |
891 | return true; |
892 | } |
893 | |
894 | // Returns the maximum submatch needed for the rewrite to be done by Replace(). |
895 | // E.g. if rewrite == "foo \\2,\\1", returns 2. |
896 | int RE2::MaxSubmatch(const StringPiece& rewrite) { |
897 | int max = 0; |
898 | for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
899 | s < end; s++) { |
900 | if (*s == '\\') { |
901 | s++; |
902 | int c = (s < end) ? *s : -1; |
903 | if (isdigit(c)) { |
904 | int n = (c - '0'); |
905 | if (n > max) |
906 | max = n; |
907 | } |
908 | } |
909 | } |
910 | return max; |
911 | } |
912 | |
913 | // Append the "rewrite" string, with backslash subsitutions from "vec", |
914 | // to string "out". |
915 | bool RE2::Rewrite(std::string* out, |
916 | const StringPiece& rewrite, |
917 | const StringPiece* vec, |
918 | int veclen) const { |
919 | for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
920 | s < end; s++) { |
921 | if (*s != '\\') { |
922 | out->push_back(*s); |
923 | continue; |
924 | } |
925 | s++; |
926 | int c = (s < end) ? *s : -1; |
927 | if (isdigit(c)) { |
928 | int n = (c - '0'); |
929 | if (n >= veclen) { |
930 | if (options_.log_errors()) { |
931 | LOG(ERROR) << "requested group " << n |
932 | << " in regexp " << rewrite.data(); |
933 | } |
934 | return false; |
935 | } |
936 | StringPiece snip = vec[n]; |
937 | if (!snip.empty()) |
938 | out->append(snip.data(), snip.size()); |
939 | } else if (c == '\\') { |
940 | out->push_back('\\'); |
941 | } else { |
942 | if (options_.log_errors()) |
943 | LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); |
944 | return false; |
945 | } |
946 | } |
947 | return true; |
948 | } |
949 | |
950 | /***** Parsers for various types *****/ |
951 | |
952 | bool RE2::Arg::parse_null(const char* str, size_t n, void* dest) { |
953 | // We fail if somebody asked us to store into a non-NULL void* pointer |
954 | return (dest == NULL); |
955 | } |
956 | |
957 | bool RE2::Arg::parse_string(const char* str, size_t n, void* dest) { |
958 | if (dest == NULL) return true; |
959 | reinterpret_cast<std::string*>(dest)->assign(str, n); |
960 | return true; |
961 | } |
962 | |
963 | bool RE2::Arg::parse_stringpiece(const char* str, size_t n, void* dest) { |
964 | if (dest == NULL) return true; |
965 | *(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n); |
966 | return true; |
967 | } |
968 | |
969 | bool RE2::Arg::parse_char(const char* str, size_t n, void* dest) { |
970 | if (n != 1) return false; |
971 | if (dest == NULL) return true; |
972 | *(reinterpret_cast<char*>(dest)) = str[0]; |
973 | return true; |
974 | } |
975 | |
976 | bool RE2::Arg::parse_schar(const char* str, size_t n, void* dest) { |
977 | if (n != 1) return false; |
978 | if (dest == NULL) return true; |
979 | *(reinterpret_cast<signed char*>(dest)) = str[0]; |
980 | return true; |
981 | } |
982 | |
983 | bool RE2::Arg::parse_uchar(const char* str, size_t n, void* dest) { |
984 | if (n != 1) return false; |
985 | if (dest == NULL) return true; |
986 | *(reinterpret_cast<unsigned char*>(dest)) = str[0]; |
987 | return true; |
988 | } |
989 | |
990 | // Largest number spec that we are willing to parse |
991 | static const int kMaxNumberLength = 32; |
992 | |
993 | // REQUIRES "buf" must have length at least nbuf. |
994 | // Copies "str" into "buf" and null-terminates. |
995 | // Overwrites *np with the new length. |
996 | static const char* TerminateNumber(char* buf, size_t nbuf, const char* str, |
997 | size_t* np, bool accept_spaces) { |
998 | size_t n = *np; |
999 | if (n == 0) return "" ; |
1000 | if (n > 0 && isspace(*str)) { |
1001 | // We are less forgiving than the strtoxxx() routines and do not |
1002 | // allow leading spaces. We do allow leading spaces for floats. |
1003 | if (!accept_spaces) { |
1004 | return "" ; |
1005 | } |
1006 | while (n > 0 && isspace(*str)) { |
1007 | n--; |
1008 | str++; |
1009 | } |
1010 | } |
1011 | |
1012 | // Although buf has a fixed maximum size, we can still handle |
1013 | // arbitrarily large integers correctly by omitting leading zeros. |
1014 | // (Numbers that are still too long will be out of range.) |
1015 | // Before deciding whether str is too long, |
1016 | // remove leading zeros with s/000+/00/. |
1017 | // Leaving the leading two zeros in place means that |
1018 | // we don't change 0000x123 (invalid) into 0x123 (valid). |
1019 | // Skip over leading - before replacing. |
1020 | bool neg = false; |
1021 | if (n >= 1 && str[0] == '-') { |
1022 | neg = true; |
1023 | n--; |
1024 | str++; |
1025 | } |
1026 | |
1027 | if (n >= 3 && str[0] == '0' && str[1] == '0') { |
1028 | while (n >= 3 && str[2] == '0') { |
1029 | n--; |
1030 | str++; |
1031 | } |
1032 | } |
1033 | |
1034 | if (neg) { // make room in buf for - |
1035 | n++; |
1036 | str--; |
1037 | } |
1038 | |
1039 | if (n > nbuf-1) return "" ; |
1040 | |
1041 | memmove(buf, str, n); |
1042 | if (neg) { |
1043 | buf[0] = '-'; |
1044 | } |
1045 | buf[n] = '\0'; |
1046 | *np = n; |
1047 | return buf; |
1048 | } |
1049 | |
1050 | bool RE2::Arg::parse_long_radix(const char* str, |
1051 | size_t n, |
1052 | void* dest, |
1053 | int radix) { |
1054 | if (n == 0) return false; |
1055 | char buf[kMaxNumberLength+1]; |
1056 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1057 | char* end; |
1058 | errno = 0; |
1059 | long r = strtol(str, &end, radix); |
1060 | if (end != str + n) return false; // Leftover junk |
1061 | if (errno) return false; |
1062 | if (dest == NULL) return true; |
1063 | *(reinterpret_cast<long*>(dest)) = r; |
1064 | return true; |
1065 | } |
1066 | |
1067 | bool RE2::Arg::parse_ulong_radix(const char* str, |
1068 | size_t n, |
1069 | void* dest, |
1070 | int radix) { |
1071 | if (n == 0) return false; |
1072 | char buf[kMaxNumberLength+1]; |
1073 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1074 | if (str[0] == '-') { |
1075 | // strtoul() will silently accept negative numbers and parse |
1076 | // them. This module is more strict and treats them as errors. |
1077 | return false; |
1078 | } |
1079 | |
1080 | char* end; |
1081 | errno = 0; |
1082 | unsigned long r = strtoul(str, &end, radix); |
1083 | if (end != str + n) return false; // Leftover junk |
1084 | if (errno) return false; |
1085 | if (dest == NULL) return true; |
1086 | *(reinterpret_cast<unsigned long*>(dest)) = r; |
1087 | return true; |
1088 | } |
1089 | |
1090 | bool RE2::Arg::parse_short_radix(const char* str, |
1091 | size_t n, |
1092 | void* dest, |
1093 | int radix) { |
1094 | long r; |
1095 | if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse |
1096 | if ((short)r != r) return false; // Out of range |
1097 | if (dest == NULL) return true; |
1098 | *(reinterpret_cast<short*>(dest)) = (short)r; |
1099 | return true; |
1100 | } |
1101 | |
1102 | bool RE2::Arg::parse_ushort_radix(const char* str, |
1103 | size_t n, |
1104 | void* dest, |
1105 | int radix) { |
1106 | unsigned long r; |
1107 | if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse |
1108 | if ((unsigned short)r != r) return false; // Out of range |
1109 | if (dest == NULL) return true; |
1110 | *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r; |
1111 | return true; |
1112 | } |
1113 | |
1114 | bool RE2::Arg::parse_int_radix(const char* str, |
1115 | size_t n, |
1116 | void* dest, |
1117 | int radix) { |
1118 | long r; |
1119 | if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse |
1120 | if ((int)r != r) return false; // Out of range |
1121 | if (dest == NULL) return true; |
1122 | *(reinterpret_cast<int*>(dest)) = (int)r; |
1123 | return true; |
1124 | } |
1125 | |
1126 | bool RE2::Arg::parse_uint_radix(const char* str, |
1127 | size_t n, |
1128 | void* dest, |
1129 | int radix) { |
1130 | unsigned long r; |
1131 | if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse |
1132 | if ((unsigned int)r != r) return false; // Out of range |
1133 | if (dest == NULL) return true; |
1134 | *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r; |
1135 | return true; |
1136 | } |
1137 | |
1138 | bool RE2::Arg::parse_longlong_radix(const char* str, |
1139 | size_t n, |
1140 | void* dest, |
1141 | int radix) { |
1142 | if (n == 0) return false; |
1143 | char buf[kMaxNumberLength+1]; |
1144 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1145 | char* end; |
1146 | errno = 0; |
1147 | long long r = strtoll(str, &end, radix); |
1148 | if (end != str + n) return false; // Leftover junk |
1149 | if (errno) return false; |
1150 | if (dest == NULL) return true; |
1151 | *(reinterpret_cast<long long*>(dest)) = r; |
1152 | return true; |
1153 | } |
1154 | |
1155 | bool RE2::Arg::parse_ulonglong_radix(const char* str, |
1156 | size_t n, |
1157 | void* dest, |
1158 | int radix) { |
1159 | if (n == 0) return false; |
1160 | char buf[kMaxNumberLength+1]; |
1161 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1162 | if (str[0] == '-') { |
1163 | // strtoull() will silently accept negative numbers and parse |
1164 | // them. This module is more strict and treats them as errors. |
1165 | return false; |
1166 | } |
1167 | char* end; |
1168 | errno = 0; |
1169 | unsigned long long r = strtoull(str, &end, radix); |
1170 | if (end != str + n) return false; // Leftover junk |
1171 | if (errno) return false; |
1172 | if (dest == NULL) return true; |
1173 | *(reinterpret_cast<unsigned long long*>(dest)) = r; |
1174 | return true; |
1175 | } |
1176 | |
1177 | static bool parse_double_float(const char* str, size_t n, bool isfloat, |
1178 | void* dest) { |
1179 | if (n == 0) return false; |
1180 | static const int kMaxLength = 200; |
1181 | char buf[kMaxLength+1]; |
1182 | str = TerminateNumber(buf, sizeof buf, str, &n, true); |
1183 | char* end; |
1184 | errno = 0; |
1185 | double r; |
1186 | if (isfloat) { |
1187 | r = strtof(str, &end); |
1188 | } else { |
1189 | r = strtod(str, &end); |
1190 | } |
1191 | if (end != str + n) return false; // Leftover junk |
1192 | if (errno) return false; |
1193 | if (dest == NULL) return true; |
1194 | if (isfloat) { |
1195 | *(reinterpret_cast<float*>(dest)) = (float)r; |
1196 | } else { |
1197 | *(reinterpret_cast<double*>(dest)) = r; |
1198 | } |
1199 | return true; |
1200 | } |
1201 | |
1202 | bool RE2::Arg::parse_double(const char* str, size_t n, void* dest) { |
1203 | return parse_double_float(str, n, false, dest); |
1204 | } |
1205 | |
1206 | bool RE2::Arg::parse_float(const char* str, size_t n, void* dest) { |
1207 | return parse_double_float(str, n, true, dest); |
1208 | } |
1209 | |
1210 | #define DEFINE_INTEGER_PARSER(name) \ |
1211 | bool RE2::Arg::parse_##name(const char* str, size_t n, void* dest) { \ |
1212 | return parse_##name##_radix(str, n, dest, 10); \ |
1213 | } \ |
1214 | bool RE2::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \ |
1215 | return parse_##name##_radix(str, n, dest, 16); \ |
1216 | } \ |
1217 | bool RE2::Arg::parse_##name##_octal(const char* str, size_t n, void* dest) { \ |
1218 | return parse_##name##_radix(str, n, dest, 8); \ |
1219 | } \ |
1220 | bool RE2::Arg::parse_##name##_cradix(const char* str, size_t n, \ |
1221 | void* dest) { \ |
1222 | return parse_##name##_radix(str, n, dest, 0); \ |
1223 | } |
1224 | |
1225 | DEFINE_INTEGER_PARSER(short); |
1226 | DEFINE_INTEGER_PARSER(ushort); |
1227 | DEFINE_INTEGER_PARSER(int); |
1228 | DEFINE_INTEGER_PARSER(uint); |
1229 | DEFINE_INTEGER_PARSER(long); |
1230 | DEFINE_INTEGER_PARSER(ulong); |
1231 | DEFINE_INTEGER_PARSER(longlong); |
1232 | DEFINE_INTEGER_PARSER(ulonglong); |
1233 | |
1234 | #undef DEFINE_INTEGER_PARSER |
1235 | |
1236 | } // namespace re2 |
1237 | |