1 | // Copyright 2006 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // Regular expression representation. |
6 | // Tested by parse_test.cc |
7 | |
8 | #include "re2/regexp.h" |
9 | |
10 | #include <stddef.h> |
11 | #include <stdint.h> |
12 | #include <string.h> |
13 | #include <algorithm> |
14 | #include <map> |
15 | #include <mutex> |
16 | #include <string> |
17 | #include <vector> |
18 | |
19 | #include "util/util.h" |
20 | #include "util/logging.h" |
21 | #include "util/mutex.h" |
22 | #include "util/utf.h" |
23 | #include "re2/stringpiece.h" |
24 | #include "re2/walker-inl.h" |
25 | |
26 | namespace re2 { |
27 | |
28 | // Constructor. Allocates vectors as appropriate for operator. |
29 | Regexp::Regexp(RegexpOp op, ParseFlags parse_flags) |
30 | : op_(static_cast<uint8_t>(op)), |
31 | simple_(false), |
32 | parse_flags_(static_cast<uint16_t>(parse_flags)), |
33 | ref_(1), |
34 | nsub_(0), |
35 | down_(NULL) { |
36 | subone_ = NULL; |
37 | memset(the_union_, 0, sizeof the_union_); |
38 | } |
39 | |
40 | // Destructor. Assumes already cleaned up children. |
41 | // Private: use Decref() instead of delete to destroy Regexps. |
42 | // Can't call Decref on the sub-Regexps here because |
43 | // that could cause arbitrarily deep recursion, so |
44 | // required Decref() to have handled them for us. |
45 | Regexp::~Regexp() { |
46 | if (nsub_ > 0) |
47 | LOG(DFATAL) << "Regexp not destroyed." ; |
48 | |
49 | switch (op_) { |
50 | default: |
51 | break; |
52 | case kRegexpCapture: |
53 | delete name_; |
54 | break; |
55 | case kRegexpLiteralString: |
56 | delete[] runes_; |
57 | break; |
58 | case kRegexpCharClass: |
59 | if (cc_) |
60 | cc_->Delete(); |
61 | delete ccb_; |
62 | break; |
63 | } |
64 | } |
65 | |
66 | // If it's possible to destroy this regexp without recurring, |
67 | // do so and return true. Else return false. |
68 | bool Regexp::QuickDestroy() { |
69 | if (nsub_ == 0) { |
70 | delete this; |
71 | return true; |
72 | } |
73 | return false; |
74 | } |
75 | |
76 | // Lazily allocated. |
77 | static Mutex* ref_mutex; |
78 | static std::map<Regexp*, int>* ref_map; |
79 | |
80 | int Regexp::Ref() { |
81 | if (ref_ < kMaxRef) |
82 | return ref_; |
83 | |
84 | MutexLock l(ref_mutex); |
85 | return (*ref_map)[this]; |
86 | } |
87 | |
88 | // Increments reference count, returns object as convenience. |
89 | Regexp* Regexp::Incref() { |
90 | if (ref_ >= kMaxRef-1) { |
91 | static std::once_flag ref_once; |
92 | std::call_once(ref_once, []() { |
93 | ref_mutex = new Mutex; |
94 | ref_map = new std::map<Regexp*, int>; |
95 | }); |
96 | |
97 | // Store ref count in overflow map. |
98 | MutexLock l(ref_mutex); |
99 | if (ref_ == kMaxRef) { |
100 | // already overflowed |
101 | (*ref_map)[this]++; |
102 | } else { |
103 | // overflowing now |
104 | (*ref_map)[this] = kMaxRef; |
105 | ref_ = kMaxRef; |
106 | } |
107 | return this; |
108 | } |
109 | |
110 | ref_++; |
111 | return this; |
112 | } |
113 | |
114 | // Decrements reference count and deletes this object if count reaches 0. |
115 | void Regexp::Decref() { |
116 | if (ref_ == kMaxRef) { |
117 | // Ref count is stored in overflow map. |
118 | MutexLock l(ref_mutex); |
119 | int r = (*ref_map)[this] - 1; |
120 | if (r < kMaxRef) { |
121 | ref_ = static_cast<uint16_t>(r); |
122 | ref_map->erase(this); |
123 | } else { |
124 | (*ref_map)[this] = r; |
125 | } |
126 | return; |
127 | } |
128 | ref_--; |
129 | if (ref_ == 0) |
130 | Destroy(); |
131 | } |
132 | |
133 | // Deletes this object; ref count has count reached 0. |
134 | void Regexp::Destroy() { |
135 | if (QuickDestroy()) |
136 | return; |
137 | |
138 | // Handle recursive Destroy with explicit stack |
139 | // to avoid arbitrarily deep recursion on process stack [sigh]. |
140 | down_ = NULL; |
141 | Regexp* stack = this; |
142 | while (stack != NULL) { |
143 | Regexp* re = stack; |
144 | stack = re->down_; |
145 | if (re->ref_ != 0) |
146 | LOG(DFATAL) << "Bad reference count " << re->ref_; |
147 | if (re->nsub_ > 0) { |
148 | Regexp** subs = re->sub(); |
149 | for (int i = 0; i < re->nsub_; i++) { |
150 | Regexp* sub = subs[i]; |
151 | if (sub == NULL) |
152 | continue; |
153 | if (sub->ref_ == kMaxRef) |
154 | sub->Decref(); |
155 | else |
156 | --sub->ref_; |
157 | if (sub->ref_ == 0 && !sub->QuickDestroy()) { |
158 | sub->down_ = stack; |
159 | stack = sub; |
160 | } |
161 | } |
162 | if (re->nsub_ > 1) |
163 | delete[] subs; |
164 | re->nsub_ = 0; |
165 | } |
166 | delete re; |
167 | } |
168 | } |
169 | |
170 | void Regexp::AddRuneToString(Rune r) { |
171 | DCHECK(op_ == kRegexpLiteralString); |
172 | if (nrunes_ == 0) { |
173 | // start with 8 |
174 | runes_ = new Rune[8]; |
175 | } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) { |
176 | // double on powers of two |
177 | Rune *old = runes_; |
178 | runes_ = new Rune[nrunes_ * 2]; |
179 | for (int i = 0; i < nrunes_; i++) |
180 | runes_[i] = old[i]; |
181 | delete[] old; |
182 | } |
183 | |
184 | runes_[nrunes_++] = r; |
185 | } |
186 | |
187 | Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) { |
188 | Regexp* re = new Regexp(kRegexpHaveMatch, flags); |
189 | re->match_id_ = match_id; |
190 | return re; |
191 | } |
192 | |
193 | Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) { |
194 | // Squash **, ++ and ??. |
195 | if (op == sub->op() && flags == sub->parse_flags()) |
196 | return sub; |
197 | |
198 | // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because |
199 | // op is Star/Plus/Quest, we just have to check that sub->op() is too. |
200 | if ((sub->op() == kRegexpStar || |
201 | sub->op() == kRegexpPlus || |
202 | sub->op() == kRegexpQuest) && |
203 | flags == sub->parse_flags()) { |
204 | // If sub is Star, no need to rewrite it. |
205 | if (sub->op() == kRegexpStar) |
206 | return sub; |
207 | |
208 | // Rewrite sub to Star. |
209 | Regexp* re = new Regexp(kRegexpStar, flags); |
210 | re->AllocSub(1); |
211 | re->sub()[0] = sub->sub()[0]->Incref(); |
212 | sub->Decref(); // We didn't consume the reference after all. |
213 | return re; |
214 | } |
215 | |
216 | Regexp* re = new Regexp(op, flags); |
217 | re->AllocSub(1); |
218 | re->sub()[0] = sub; |
219 | return re; |
220 | } |
221 | |
222 | Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) { |
223 | return StarPlusOrQuest(kRegexpPlus, sub, flags); |
224 | } |
225 | |
226 | Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) { |
227 | return StarPlusOrQuest(kRegexpStar, sub, flags); |
228 | } |
229 | |
230 | Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) { |
231 | return StarPlusOrQuest(kRegexpQuest, sub, flags); |
232 | } |
233 | |
234 | Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, |
235 | ParseFlags flags, bool can_factor) { |
236 | if (nsub == 1) |
237 | return sub[0]; |
238 | |
239 | if (nsub == 0) { |
240 | if (op == kRegexpAlternate) |
241 | return new Regexp(kRegexpNoMatch, flags); |
242 | else |
243 | return new Regexp(kRegexpEmptyMatch, flags); |
244 | } |
245 | |
246 | Regexp** subcopy = NULL; |
247 | if (op == kRegexpAlternate && can_factor) { |
248 | // Going to edit sub; make a copy so we don't step on caller. |
249 | subcopy = new Regexp*[nsub]; |
250 | memmove(subcopy, sub, nsub * sizeof sub[0]); |
251 | sub = subcopy; |
252 | nsub = FactorAlternation(sub, nsub, flags); |
253 | if (nsub == 1) { |
254 | Regexp* re = sub[0]; |
255 | delete[] subcopy; |
256 | return re; |
257 | } |
258 | } |
259 | |
260 | if (nsub > kMaxNsub) { |
261 | // Too many subexpressions to fit in a single Regexp. |
262 | // Make a two-level tree. Two levels gets us to 65535^2. |
263 | int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub; |
264 | Regexp* re = new Regexp(op, flags); |
265 | re->AllocSub(nbigsub); |
266 | Regexp** subs = re->sub(); |
267 | for (int i = 0; i < nbigsub - 1; i++) |
268 | subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false); |
269 | subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, |
270 | nsub - (nbigsub-1)*kMaxNsub, flags, |
271 | false); |
272 | delete[] subcopy; |
273 | return re; |
274 | } |
275 | |
276 | Regexp* re = new Regexp(op, flags); |
277 | re->AllocSub(nsub); |
278 | Regexp** subs = re->sub(); |
279 | for (int i = 0; i < nsub; i++) |
280 | subs[i] = sub[i]; |
281 | |
282 | delete[] subcopy; |
283 | return re; |
284 | } |
285 | |
286 | Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) { |
287 | return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false); |
288 | } |
289 | |
290 | Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) { |
291 | return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true); |
292 | } |
293 | |
294 | Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) { |
295 | return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false); |
296 | } |
297 | |
298 | Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) { |
299 | Regexp* re = new Regexp(kRegexpCapture, flags); |
300 | re->AllocSub(1); |
301 | re->sub()[0] = sub; |
302 | re->cap_ = cap; |
303 | return re; |
304 | } |
305 | |
306 | Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) { |
307 | Regexp* re = new Regexp(kRegexpRepeat, flags); |
308 | re->AllocSub(1); |
309 | re->sub()[0] = sub; |
310 | re->min_ = min; |
311 | re->max_ = max; |
312 | return re; |
313 | } |
314 | |
315 | Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) { |
316 | Regexp* re = new Regexp(kRegexpLiteral, flags); |
317 | re->rune_ = rune; |
318 | return re; |
319 | } |
320 | |
321 | Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) { |
322 | if (nrunes <= 0) |
323 | return new Regexp(kRegexpEmptyMatch, flags); |
324 | if (nrunes == 1) |
325 | return NewLiteral(runes[0], flags); |
326 | Regexp* re = new Regexp(kRegexpLiteralString, flags); |
327 | for (int i = 0; i < nrunes; i++) |
328 | re->AddRuneToString(runes[i]); |
329 | return re; |
330 | } |
331 | |
332 | Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) { |
333 | Regexp* re = new Regexp(kRegexpCharClass, flags); |
334 | re->cc_ = cc; |
335 | return re; |
336 | } |
337 | |
338 | void Regexp::Swap(Regexp* that) { |
339 | // Regexp is not trivially copyable, so we cannot freely copy it with |
340 | // memmove(3), but swapping objects like so is safe for our purposes. |
341 | char tmp[sizeof *this]; |
342 | void* vthis = reinterpret_cast<void*>(this); |
343 | void* vthat = reinterpret_cast<void*>(that); |
344 | memmove(tmp, vthis, sizeof *this); |
345 | memmove(vthis, vthat, sizeof *this); |
346 | memmove(vthat, tmp, sizeof *this); |
347 | } |
348 | |
349 | // Tests equality of all top-level structure but not subregexps. |
350 | static bool TopEqual(Regexp* a, Regexp* b) { |
351 | if (a->op() != b->op()) |
352 | return false; |
353 | |
354 | switch (a->op()) { |
355 | case kRegexpNoMatch: |
356 | case kRegexpEmptyMatch: |
357 | case kRegexpAnyChar: |
358 | case kRegexpAnyByte: |
359 | case kRegexpBeginLine: |
360 | case kRegexpEndLine: |
361 | case kRegexpWordBoundary: |
362 | case kRegexpNoWordBoundary: |
363 | case kRegexpBeginText: |
364 | return true; |
365 | |
366 | case kRegexpEndText: |
367 | // The parse flags remember whether it's \z or (?-m:$), |
368 | // which matters when testing against PCRE. |
369 | return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0; |
370 | |
371 | case kRegexpLiteral: |
372 | return a->rune() == b->rune() && |
373 | ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0; |
374 | |
375 | case kRegexpLiteralString: |
376 | return a->nrunes() == b->nrunes() && |
377 | ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 && |
378 | memcmp(a->runes(), b->runes(), |
379 | a->nrunes() * sizeof a->runes()[0]) == 0; |
380 | |
381 | case kRegexpAlternate: |
382 | case kRegexpConcat: |
383 | return a->nsub() == b->nsub(); |
384 | |
385 | case kRegexpStar: |
386 | case kRegexpPlus: |
387 | case kRegexpQuest: |
388 | return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0; |
389 | |
390 | case kRegexpRepeat: |
391 | return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 && |
392 | a->min() == b->min() && |
393 | a->max() == b->max(); |
394 | |
395 | case kRegexpCapture: |
396 | return a->cap() == b->cap() && a->name() == b->name(); |
397 | |
398 | case kRegexpHaveMatch: |
399 | return a->match_id() == b->match_id(); |
400 | |
401 | case kRegexpCharClass: { |
402 | CharClass* acc = a->cc(); |
403 | CharClass* bcc = b->cc(); |
404 | return acc->size() == bcc->size() && |
405 | acc->end() - acc->begin() == bcc->end() - bcc->begin() && |
406 | memcmp(acc->begin(), bcc->begin(), |
407 | (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0; |
408 | } |
409 | } |
410 | |
411 | LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op(); |
412 | return 0; |
413 | } |
414 | |
415 | bool Regexp::Equal(Regexp* a, Regexp* b) { |
416 | if (a == NULL || b == NULL) |
417 | return a == b; |
418 | |
419 | if (!TopEqual(a, b)) |
420 | return false; |
421 | |
422 | // Fast path: |
423 | // return without allocating vector if there are no subregexps. |
424 | switch (a->op()) { |
425 | case kRegexpAlternate: |
426 | case kRegexpConcat: |
427 | case kRegexpStar: |
428 | case kRegexpPlus: |
429 | case kRegexpQuest: |
430 | case kRegexpRepeat: |
431 | case kRegexpCapture: |
432 | break; |
433 | |
434 | default: |
435 | return true; |
436 | } |
437 | |
438 | // Committed to doing real work. |
439 | // The stack (vector) has pairs of regexps waiting to |
440 | // be compared. The regexps are only equal if |
441 | // all the pairs end up being equal. |
442 | std::vector<Regexp*> stk; |
443 | |
444 | for (;;) { |
445 | // Invariant: TopEqual(a, b) == true. |
446 | Regexp* a2; |
447 | Regexp* b2; |
448 | switch (a->op()) { |
449 | default: |
450 | break; |
451 | case kRegexpAlternate: |
452 | case kRegexpConcat: |
453 | for (int i = 0; i < a->nsub(); i++) { |
454 | a2 = a->sub()[i]; |
455 | b2 = b->sub()[i]; |
456 | if (!TopEqual(a2, b2)) |
457 | return false; |
458 | stk.push_back(a2); |
459 | stk.push_back(b2); |
460 | } |
461 | break; |
462 | |
463 | case kRegexpStar: |
464 | case kRegexpPlus: |
465 | case kRegexpQuest: |
466 | case kRegexpRepeat: |
467 | case kRegexpCapture: |
468 | a2 = a->sub()[0]; |
469 | b2 = b->sub()[0]; |
470 | if (!TopEqual(a2, b2)) |
471 | return false; |
472 | // Really: |
473 | // stk.push_back(a2); |
474 | // stk.push_back(b2); |
475 | // break; |
476 | // but faster to assign directly and loop. |
477 | a = a2; |
478 | b = b2; |
479 | continue; |
480 | } |
481 | |
482 | size_t n = stk.size(); |
483 | if (n == 0) |
484 | break; |
485 | |
486 | DCHECK_GE(n, 2); |
487 | a = stk[n-2]; |
488 | b = stk[n-1]; |
489 | stk.resize(n-2); |
490 | } |
491 | |
492 | return true; |
493 | } |
494 | |
495 | // Keep in sync with enum RegexpStatusCode in regexp.h |
496 | static const char *kErrorStrings[] = { |
497 | "no error" , |
498 | "unexpected error" , |
499 | "invalid escape sequence" , |
500 | "invalid character class" , |
501 | "invalid character class range" , |
502 | "missing ]" , |
503 | "missing )" , |
504 | "trailing \\" , |
505 | "no argument for repetition operator" , |
506 | "invalid repetition size" , |
507 | "bad repetition operator" , |
508 | "invalid perl operator" , |
509 | "invalid UTF-8" , |
510 | "invalid named capture group" , |
511 | }; |
512 | |
513 | std::string RegexpStatus::CodeText(enum RegexpStatusCode code) { |
514 | if (code < 0 || code >= arraysize(kErrorStrings)) |
515 | code = kRegexpInternalError; |
516 | return kErrorStrings[code]; |
517 | } |
518 | |
519 | std::string RegexpStatus::Text() const { |
520 | if (error_arg_.empty()) |
521 | return CodeText(code_); |
522 | std::string s; |
523 | s.append(CodeText(code_)); |
524 | s.append(": " ); |
525 | s.append(error_arg_.data(), error_arg_.size()); |
526 | return s; |
527 | } |
528 | |
529 | void RegexpStatus::Copy(const RegexpStatus& status) { |
530 | code_ = status.code_; |
531 | error_arg_ = status.error_arg_; |
532 | } |
533 | |
534 | typedef int Ignored; // Walker<void> doesn't exist |
535 | |
536 | // Walker subclass to count capturing parens in regexp. |
537 | class NumCapturesWalker : public Regexp::Walker<Ignored> { |
538 | public: |
539 | NumCapturesWalker() : ncapture_(0) {} |
540 | int ncapture() { return ncapture_; } |
541 | |
542 | virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { |
543 | if (re->op() == kRegexpCapture) |
544 | ncapture_++; |
545 | return ignored; |
546 | } |
547 | virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { |
548 | // Should never be called: we use Walk not WalkExponential. |
549 | LOG(DFATAL) << "NumCapturesWalker::ShortVisit called" ; |
550 | return ignored; |
551 | } |
552 | |
553 | private: |
554 | int ncapture_; |
555 | |
556 | NumCapturesWalker(const NumCapturesWalker&) = delete; |
557 | NumCapturesWalker& operator=(const NumCapturesWalker&) = delete; |
558 | }; |
559 | |
560 | int Regexp::NumCaptures() { |
561 | NumCapturesWalker w; |
562 | w.Walk(this, 0); |
563 | return w.ncapture(); |
564 | } |
565 | |
566 | // Walker class to build map of named capture groups and their indices. |
567 | class NamedCapturesWalker : public Regexp::Walker<Ignored> { |
568 | public: |
569 | NamedCapturesWalker() : map_(NULL) {} |
570 | ~NamedCapturesWalker() { delete map_; } |
571 | |
572 | std::map<std::string, int>* TakeMap() { |
573 | std::map<std::string, int>* m = map_; |
574 | map_ = NULL; |
575 | return m; |
576 | } |
577 | |
578 | Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { |
579 | if (re->op() == kRegexpCapture && re->name() != NULL) { |
580 | // Allocate map once we find a name. |
581 | if (map_ == NULL) |
582 | map_ = new std::map<std::string, int>; |
583 | |
584 | // Record first occurrence of each name. |
585 | // (The rule is that if you have the same name |
586 | // multiple times, only the leftmost one counts.) |
587 | if (map_->find(*re->name()) == map_->end()) |
588 | (*map_)[*re->name()] = re->cap(); |
589 | } |
590 | return ignored; |
591 | } |
592 | |
593 | virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { |
594 | // Should never be called: we use Walk not WalkExponential. |
595 | LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called" ; |
596 | return ignored; |
597 | } |
598 | |
599 | private: |
600 | std::map<std::string, int>* map_; |
601 | |
602 | NamedCapturesWalker(const NamedCapturesWalker&) = delete; |
603 | NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete; |
604 | }; |
605 | |
606 | std::map<std::string, int>* Regexp::NamedCaptures() { |
607 | NamedCapturesWalker w; |
608 | w.Walk(this, 0); |
609 | return w.TakeMap(); |
610 | } |
611 | |
612 | // Walker class to build map from capture group indices to their names. |
613 | class CaptureNamesWalker : public Regexp::Walker<Ignored> { |
614 | public: |
615 | CaptureNamesWalker() : map_(NULL) {} |
616 | ~CaptureNamesWalker() { delete map_; } |
617 | |
618 | std::map<int, std::string>* TakeMap() { |
619 | std::map<int, std::string>* m = map_; |
620 | map_ = NULL; |
621 | return m; |
622 | } |
623 | |
624 | Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { |
625 | if (re->op() == kRegexpCapture && re->name() != NULL) { |
626 | // Allocate map once we find a name. |
627 | if (map_ == NULL) |
628 | map_ = new std::map<int, std::string>; |
629 | |
630 | (*map_)[re->cap()] = *re->name(); |
631 | } |
632 | return ignored; |
633 | } |
634 | |
635 | virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { |
636 | // Should never be called: we use Walk not WalkExponential. |
637 | LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called" ; |
638 | return ignored; |
639 | } |
640 | |
641 | private: |
642 | std::map<int, std::string>* map_; |
643 | |
644 | CaptureNamesWalker(const CaptureNamesWalker&) = delete; |
645 | CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete; |
646 | }; |
647 | |
648 | std::map<int, std::string>* Regexp::CaptureNames() { |
649 | CaptureNamesWalker w; |
650 | w.Walk(this, 0); |
651 | return w.TakeMap(); |
652 | } |
653 | |
654 | // Determines whether regexp matches must be anchored |
655 | // with a fixed string prefix. If so, returns the prefix and |
656 | // the regexp that remains after the prefix. The prefix might |
657 | // be ASCII case-insensitive. |
658 | bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase, |
659 | Regexp** suffix) { |
660 | // No need for a walker: the regexp must be of the form |
661 | // 1. some number of ^ anchors |
662 | // 2. a literal char or string |
663 | // 3. the rest |
664 | prefix->clear(); |
665 | *foldcase = false; |
666 | *suffix = NULL; |
667 | if (op_ != kRegexpConcat) |
668 | return false; |
669 | |
670 | // Some number of anchors, then a literal or concatenation. |
671 | int i = 0; |
672 | Regexp** sub = this->sub(); |
673 | while (i < nsub_ && sub[i]->op_ == kRegexpBeginText) |
674 | i++; |
675 | if (i == 0 || i >= nsub_) |
676 | return false; |
677 | |
678 | Regexp* re = sub[i]; |
679 | switch (re->op_) { |
680 | default: |
681 | return false; |
682 | |
683 | case kRegexpLiteralString: |
684 | // Convert to string in proper encoding. |
685 | if (re->parse_flags() & Latin1) { |
686 | prefix->resize(re->nrunes_); |
687 | for (int j = 0; j < re->nrunes_; j++) |
688 | (*prefix)[j] = static_cast<char>(re->runes_[j]); |
689 | } else { |
690 | // Convert to UTF-8 in place. |
691 | // Assume worst-case space and then trim. |
692 | prefix->resize(re->nrunes_ * UTFmax); |
693 | char *p = &(*prefix)[0]; |
694 | for (int j = 0; j < re->nrunes_; j++) { |
695 | Rune r = re->runes_[j]; |
696 | if (r < Runeself) |
697 | *p++ = static_cast<char>(r); |
698 | else |
699 | p += runetochar(p, &r); |
700 | } |
701 | prefix->resize(p - &(*prefix)[0]); |
702 | } |
703 | break; |
704 | |
705 | case kRegexpLiteral: |
706 | if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) { |
707 | prefix->append(1, static_cast<char>(re->rune_)); |
708 | } else { |
709 | char buf[UTFmax]; |
710 | prefix->append(buf, runetochar(buf, &re->rune_)); |
711 | } |
712 | break; |
713 | } |
714 | *foldcase = (sub[i]->parse_flags() & FoldCase) != 0; |
715 | i++; |
716 | |
717 | // The rest. |
718 | if (i < nsub_) { |
719 | for (int j = i; j < nsub_; j++) |
720 | sub[j]->Incref(); |
721 | re = Concat(sub + i, nsub_ - i, parse_flags()); |
722 | } else { |
723 | re = new Regexp(kRegexpEmptyMatch, parse_flags()); |
724 | } |
725 | *suffix = re; |
726 | return true; |
727 | } |
728 | |
729 | // Character class builder is a balanced binary tree (STL set) |
730 | // containing non-overlapping, non-abutting RuneRanges. |
731 | // The less-than operator used in the tree treats two |
732 | // ranges as equal if they overlap at all, so that |
733 | // lookups for a particular Rune are possible. |
734 | |
735 | CharClassBuilder::CharClassBuilder() { |
736 | nrunes_ = 0; |
737 | upper_ = 0; |
738 | lower_ = 0; |
739 | } |
740 | |
741 | // Add lo-hi to the class; return whether class got bigger. |
742 | bool CharClassBuilder::AddRange(Rune lo, Rune hi) { |
743 | if (hi < lo) |
744 | return false; |
745 | |
746 | if (lo <= 'z' && hi >= 'A') { |
747 | // Overlaps some alpha, maybe not all. |
748 | // Update bitmaps telling which ASCII letters are in the set. |
749 | Rune lo1 = std::max<Rune>(lo, 'A'); |
750 | Rune hi1 = std::min<Rune>(hi, 'Z'); |
751 | if (lo1 <= hi1) |
752 | upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A'); |
753 | |
754 | lo1 = std::max<Rune>(lo, 'a'); |
755 | hi1 = std::min<Rune>(hi, 'z'); |
756 | if (lo1 <= hi1) |
757 | lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a'); |
758 | } |
759 | |
760 | { // Check whether lo, hi is already in the class. |
761 | iterator it = ranges_.find(RuneRange(lo, lo)); |
762 | if (it != end() && it->lo <= lo && hi <= it->hi) |
763 | return false; |
764 | } |
765 | |
766 | // Look for a range abutting lo on the left. |
767 | // If it exists, take it out and increase our range. |
768 | if (lo > 0) { |
769 | iterator it = ranges_.find(RuneRange(lo-1, lo-1)); |
770 | if (it != end()) { |
771 | lo = it->lo; |
772 | if (it->hi > hi) |
773 | hi = it->hi; |
774 | nrunes_ -= it->hi - it->lo + 1; |
775 | ranges_.erase(it); |
776 | } |
777 | } |
778 | |
779 | // Look for a range abutting hi on the right. |
780 | // If it exists, take it out and increase our range. |
781 | if (hi < Runemax) { |
782 | iterator it = ranges_.find(RuneRange(hi+1, hi+1)); |
783 | if (it != end()) { |
784 | hi = it->hi; |
785 | nrunes_ -= it->hi - it->lo + 1; |
786 | ranges_.erase(it); |
787 | } |
788 | } |
789 | |
790 | // Look for ranges between lo and hi. Take them out. |
791 | // This is only safe because the set has no overlapping ranges. |
792 | // We've already removed any ranges abutting lo and hi, so |
793 | // any that overlap [lo, hi] must be contained within it. |
794 | for (;;) { |
795 | iterator it = ranges_.find(RuneRange(lo, hi)); |
796 | if (it == end()) |
797 | break; |
798 | nrunes_ -= it->hi - it->lo + 1; |
799 | ranges_.erase(it); |
800 | } |
801 | |
802 | // Finally, add [lo, hi]. |
803 | nrunes_ += hi - lo + 1; |
804 | ranges_.insert(RuneRange(lo, hi)); |
805 | return true; |
806 | } |
807 | |
808 | void CharClassBuilder::AddCharClass(CharClassBuilder *cc) { |
809 | for (iterator it = cc->begin(); it != cc->end(); ++it) |
810 | AddRange(it->lo, it->hi); |
811 | } |
812 | |
813 | bool CharClassBuilder::Contains(Rune r) { |
814 | return ranges_.find(RuneRange(r, r)) != end(); |
815 | } |
816 | |
817 | // Does the character class behave the same on A-Z as on a-z? |
818 | bool CharClassBuilder::FoldsASCII() { |
819 | return ((upper_ ^ lower_) & AlphaMask) == 0; |
820 | } |
821 | |
822 | CharClassBuilder* CharClassBuilder::Copy() { |
823 | CharClassBuilder* cc = new CharClassBuilder; |
824 | for (iterator it = begin(); it != end(); ++it) |
825 | cc->ranges_.insert(RuneRange(it->lo, it->hi)); |
826 | cc->upper_ = upper_; |
827 | cc->lower_ = lower_; |
828 | cc->nrunes_ = nrunes_; |
829 | return cc; |
830 | } |
831 | |
832 | |
833 | |
834 | void CharClassBuilder::RemoveAbove(Rune r) { |
835 | if (r >= Runemax) |
836 | return; |
837 | |
838 | if (r < 'z') { |
839 | if (r < 'a') |
840 | lower_ = 0; |
841 | else |
842 | lower_ &= AlphaMask >> ('z' - r); |
843 | } |
844 | |
845 | if (r < 'Z') { |
846 | if (r < 'A') |
847 | upper_ = 0; |
848 | else |
849 | upper_ &= AlphaMask >> ('Z' - r); |
850 | } |
851 | |
852 | for (;;) { |
853 | |
854 | iterator it = ranges_.find(RuneRange(r + 1, Runemax)); |
855 | if (it == end()) |
856 | break; |
857 | RuneRange rr = *it; |
858 | ranges_.erase(it); |
859 | nrunes_ -= rr.hi - rr.lo + 1; |
860 | if (rr.lo <= r) { |
861 | rr.hi = r; |
862 | ranges_.insert(rr); |
863 | nrunes_ += rr.hi - rr.lo + 1; |
864 | } |
865 | } |
866 | } |
867 | |
868 | void CharClassBuilder::Negate() { |
869 | // Build up negation and then copy in. |
870 | // Could edit ranges in place, but C++ won't let me. |
871 | std::vector<RuneRange> v; |
872 | v.reserve(ranges_.size() + 1); |
873 | |
874 | // In negation, first range begins at 0, unless |
875 | // the current class begins at 0. |
876 | iterator it = begin(); |
877 | if (it == end()) { |
878 | v.push_back(RuneRange(0, Runemax)); |
879 | } else { |
880 | int nextlo = 0; |
881 | if (it->lo == 0) { |
882 | nextlo = it->hi + 1; |
883 | ++it; |
884 | } |
885 | for (; it != end(); ++it) { |
886 | v.push_back(RuneRange(nextlo, it->lo - 1)); |
887 | nextlo = it->hi + 1; |
888 | } |
889 | if (nextlo <= Runemax) |
890 | v.push_back(RuneRange(nextlo, Runemax)); |
891 | } |
892 | |
893 | ranges_.clear(); |
894 | for (size_t i = 0; i < v.size(); i++) |
895 | ranges_.insert(v[i]); |
896 | |
897 | upper_ = AlphaMask & ~upper_; |
898 | lower_ = AlphaMask & ~lower_; |
899 | nrunes_ = Runemax+1 - nrunes_; |
900 | } |
901 | |
902 | // Character class is a sorted list of ranges. |
903 | // The ranges are allocated in the same block as the header, |
904 | // necessitating a special allocator and Delete method. |
905 | |
906 | CharClass* CharClass::New(int maxranges) { |
907 | CharClass* cc; |
908 | uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]]; |
909 | cc = reinterpret_cast<CharClass*>(data); |
910 | cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc); |
911 | cc->nranges_ = 0; |
912 | cc->folds_ascii_ = false; |
913 | cc->nrunes_ = 0; |
914 | return cc; |
915 | } |
916 | |
917 | void CharClass::Delete() { |
918 | uint8_t* data = reinterpret_cast<uint8_t*>(this); |
919 | delete[] data; |
920 | } |
921 | |
922 | CharClass* CharClass::Negate() { |
923 | CharClass* cc = CharClass::New(nranges_+1); |
924 | cc->folds_ascii_ = folds_ascii_; |
925 | cc->nrunes_ = Runemax + 1 - nrunes_; |
926 | int n = 0; |
927 | int nextlo = 0; |
928 | for (CharClass::iterator it = begin(); it != end(); ++it) { |
929 | if (it->lo == nextlo) { |
930 | nextlo = it->hi + 1; |
931 | } else { |
932 | cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1); |
933 | nextlo = it->hi + 1; |
934 | } |
935 | } |
936 | if (nextlo <= Runemax) |
937 | cc->ranges_[n++] = RuneRange(nextlo, Runemax); |
938 | cc->nranges_ = n; |
939 | return cc; |
940 | } |
941 | |
942 | bool CharClass::Contains(Rune r) { |
943 | RuneRange* rr = ranges_; |
944 | int n = nranges_; |
945 | while (n > 0) { |
946 | int m = n/2; |
947 | if (rr[m].hi < r) { |
948 | rr += m+1; |
949 | n -= m+1; |
950 | } else if (r < rr[m].lo) { |
951 | n = m; |
952 | } else { // rr[m].lo <= r && r <= rr[m].hi |
953 | return true; |
954 | } |
955 | } |
956 | return false; |
957 | } |
958 | |
959 | CharClass* CharClassBuilder::GetCharClass() { |
960 | CharClass* cc = CharClass::New(static_cast<int>(ranges_.size())); |
961 | int n = 0; |
962 | for (iterator it = begin(); it != end(); ++it) |
963 | cc->ranges_[n++] = *it; |
964 | cc->nranges_ = n; |
965 | DCHECK_LE(n, static_cast<int>(ranges_.size())); |
966 | cc->nrunes_ = nrunes_; |
967 | cc->folds_ascii_ = FoldsASCII(); |
968 | return cc; |
969 | } |
970 | |
971 | } // namespace re2 |
972 | |