1 | // Copyright 2006 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | #ifndef RE2_REGEXP_H_ |
6 | #define RE2_REGEXP_H_ |
7 | |
8 | // --- SPONSORED LINK -------------------------------------------------- |
9 | // If you want to use this library for regular expression matching, |
10 | // you should use re2/re2.h, which provides a class RE2 that |
11 | // mimics the PCRE interface provided by PCRE's C++ wrappers. |
12 | // This header describes the low-level interface used to implement RE2 |
13 | // and may change in backwards-incompatible ways from time to time. |
14 | // In contrast, RE2's interface will not. |
15 | // --------------------------------------------------------------------- |
16 | |
17 | // Regular expression library: parsing, execution, and manipulation |
18 | // of regular expressions. |
19 | // |
20 | // Any operation that traverses the Regexp structures should be written |
21 | // using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested |
22 | // regular expressions such as x++++++++++++++++++++... might cause recursive |
23 | // traversals to overflow the stack. |
24 | // |
25 | // It is the caller's responsibility to provide appropriate mutual exclusion |
26 | // around manipulation of the regexps. RE2 does this. |
27 | // |
28 | // PARSING |
29 | // |
30 | // Regexp::Parse parses regular expressions encoded in UTF-8. |
31 | // The default syntax is POSIX extended regular expressions, |
32 | // with the following changes: |
33 | // |
34 | // 1. Backreferences (optional in POSIX EREs) are not supported. |
35 | // (Supporting them precludes the use of DFA-based |
36 | // matching engines.) |
37 | // |
38 | // 2. Collating elements and collation classes are not supported. |
39 | // (No one has needed or wanted them.) |
40 | // |
41 | // The exact syntax accepted can be modified by passing flags to |
42 | // Regexp::Parse. In particular, many of the basic Perl additions |
43 | // are available. The flags are documented below (search for LikePerl). |
44 | // |
45 | // If parsed with the flag Regexp::Latin1, both the regular expression |
46 | // and the input to the matching routines are assumed to be encoded in |
47 | // Latin-1, not UTF-8. |
48 | // |
49 | // EXECUTION |
50 | // |
51 | // Once Regexp has parsed a regular expression, it provides methods |
52 | // to search text using that regular expression. These methods are |
53 | // implemented via calling out to other regular expression libraries. |
54 | // (Let's call them the sublibraries.) |
55 | // |
56 | // To call a sublibrary, Regexp does not simply prepare a |
57 | // string version of the regular expression and hand it to the |
58 | // sublibrary. Instead, Regexp prepares, from its own parsed form, the |
59 | // corresponding internal representation used by the sublibrary. |
60 | // This has the drawback of needing to know the internal representation |
61 | // used by the sublibrary, but it has two important benefits: |
62 | // |
63 | // 1. The syntax and meaning of regular expressions is guaranteed |
64 | // to be that used by Regexp's parser, not the syntax expected |
65 | // by the sublibrary. Regexp might accept a restricted or |
66 | // expanded syntax for regular expressions as compared with |
67 | // the sublibrary. As long as Regexp can translate from its |
68 | // internal form into the sublibrary's, clients need not know |
69 | // exactly which sublibrary they are using. |
70 | // |
71 | // 2. The sublibrary parsers are bypassed. For whatever reason, |
72 | // sublibrary regular expression parsers often have security |
73 | // problems. For example, plan9grep's regular expression parser |
74 | // has a buffer overflow in its handling of large character |
75 | // classes, and PCRE's parser has had buffer overflow problems |
76 | // in the past. Security-team requires sandboxing of sublibrary |
77 | // regular expression parsers. Avoiding the sublibrary parsers |
78 | // avoids the sandbox. |
79 | // |
80 | // The execution methods we use now are provided by the compiled form, |
81 | // Prog, described in prog.h |
82 | // |
83 | // MANIPULATION |
84 | // |
85 | // Unlike other regular expression libraries, Regexp makes its parsed |
86 | // form accessible to clients, so that client code can analyze the |
87 | // parsed regular expressions. |
88 | |
89 | #include <stdint.h> |
90 | #include <map> |
91 | #include <set> |
92 | #include <string> |
93 | |
94 | #include "util/util.h" |
95 | #include "util/logging.h" |
96 | #include "util/utf.h" |
97 | #include "re2/stringpiece.h" |
98 | |
99 | namespace re2 { |
100 | |
101 | // Keep in sync with string list kOpcodeNames[] in testing/dump.cc |
102 | enum RegexpOp { |
103 | // Matches no strings. |
104 | kRegexpNoMatch = 1, |
105 | |
106 | // Matches empty string. |
107 | kRegexpEmptyMatch, |
108 | |
109 | // Matches rune_. |
110 | kRegexpLiteral, |
111 | |
112 | // Matches runes_. |
113 | kRegexpLiteralString, |
114 | |
115 | // Matches concatenation of sub_[0..nsub-1]. |
116 | kRegexpConcat, |
117 | // Matches union of sub_[0..nsub-1]. |
118 | kRegexpAlternate, |
119 | |
120 | // Matches sub_[0] zero or more times. |
121 | kRegexpStar, |
122 | // Matches sub_[0] one or more times. |
123 | kRegexpPlus, |
124 | // Matches sub_[0] zero or one times. |
125 | kRegexpQuest, |
126 | |
127 | // Matches sub_[0] at least min_ times, at most max_ times. |
128 | // max_ == -1 means no upper limit. |
129 | kRegexpRepeat, |
130 | |
131 | // Parenthesized (capturing) subexpression. Index is cap_. |
132 | // Optionally, capturing name is name_. |
133 | kRegexpCapture, |
134 | |
135 | // Matches any character. |
136 | kRegexpAnyChar, |
137 | |
138 | // Matches any byte [sic]. |
139 | kRegexpAnyByte, |
140 | |
141 | // Matches empty string at beginning of line. |
142 | kRegexpBeginLine, |
143 | // Matches empty string at end of line. |
144 | kRegexpEndLine, |
145 | |
146 | // Matches word boundary "\b". |
147 | kRegexpWordBoundary, |
148 | // Matches not-a-word boundary "\B". |
149 | kRegexpNoWordBoundary, |
150 | |
151 | // Matches empty string at beginning of text. |
152 | kRegexpBeginText, |
153 | // Matches empty string at end of text. |
154 | kRegexpEndText, |
155 | |
156 | // Matches character class given by cc_. |
157 | kRegexpCharClass, |
158 | |
159 | // Forces match of entire expression right now, |
160 | // with match ID match_id_ (used by RE2::Set). |
161 | kRegexpHaveMatch, |
162 | |
163 | kMaxRegexpOp = kRegexpHaveMatch, |
164 | }; |
165 | |
166 | // Keep in sync with string list in regexp.cc |
167 | enum RegexpStatusCode { |
168 | // No error |
169 | kRegexpSuccess = 0, |
170 | |
171 | // Unexpected error |
172 | kRegexpInternalError, |
173 | |
174 | // Parse errors |
175 | kRegexpBadEscape, // bad escape sequence |
176 | kRegexpBadCharClass, // bad character class |
177 | kRegexpBadCharRange, // bad character class range |
178 | kRegexpMissingBracket, // missing closing ] |
179 | kRegexpMissingParen, // missing closing ) |
180 | kRegexpTrailingBackslash, // at end of regexp |
181 | kRegexpRepeatArgument, // repeat argument missing, e.g. "*" |
182 | kRegexpRepeatSize, // bad repetition argument |
183 | kRegexpRepeatOp, // bad repetition operator |
184 | kRegexpBadPerlOp, // bad perl operator |
185 | kRegexpBadUTF8, // invalid UTF-8 in regexp |
186 | kRegexpBadNamedCapture, // bad named capture |
187 | }; |
188 | |
189 | // Error status for certain operations. |
190 | class RegexpStatus { |
191 | public: |
192 | RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {} |
193 | ~RegexpStatus() { delete tmp_; } |
194 | |
195 | void set_code(RegexpStatusCode code) { code_ = code; } |
196 | void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; } |
197 | void set_tmp(std::string* tmp) { delete tmp_; tmp_ = tmp; } |
198 | RegexpStatusCode code() const { return code_; } |
199 | const StringPiece& error_arg() const { return error_arg_; } |
200 | bool ok() const { return code() == kRegexpSuccess; } |
201 | |
202 | // Copies state from status. |
203 | void Copy(const RegexpStatus& status); |
204 | |
205 | // Returns text equivalent of code, e.g.: |
206 | // "Bad character class" |
207 | static std::string CodeText(RegexpStatusCode code); |
208 | |
209 | // Returns text describing error, e.g.: |
210 | // "Bad character class: [z-a]" |
211 | std::string Text() const; |
212 | |
213 | private: |
214 | RegexpStatusCode code_; // Kind of error |
215 | StringPiece error_arg_; // Piece of regexp containing syntax error. |
216 | std::string* tmp_; // Temporary storage, possibly where error_arg_ is. |
217 | |
218 | RegexpStatus(const RegexpStatus&) = delete; |
219 | RegexpStatus& operator=(const RegexpStatus&) = delete; |
220 | }; |
221 | |
222 | // Compiled form; see prog.h |
223 | class Prog; |
224 | |
225 | struct RuneRange { |
226 | RuneRange() : lo(0), hi(0) { } |
227 | RuneRange(int l, int h) : lo(l), hi(h) { } |
228 | Rune lo; |
229 | Rune hi; |
230 | }; |
231 | |
232 | // Less-than on RuneRanges treats a == b if they overlap at all. |
233 | // This lets us look in a set to find the range covering a particular Rune. |
234 | struct RuneRangeLess { |
235 | bool operator()(const RuneRange& a, const RuneRange& b) const { |
236 | return a.hi < b.lo; |
237 | } |
238 | }; |
239 | |
240 | class CharClassBuilder; |
241 | |
242 | class CharClass { |
243 | public: |
244 | void Delete(); |
245 | |
246 | typedef RuneRange* iterator; |
247 | iterator begin() { return ranges_; } |
248 | iterator end() { return ranges_ + nranges_; } |
249 | |
250 | int size() { return nrunes_; } |
251 | bool empty() { return nrunes_ == 0; } |
252 | bool full() { return nrunes_ == Runemax+1; } |
253 | bool FoldsASCII() { return folds_ascii_; } |
254 | |
255 | bool Contains(Rune r); |
256 | CharClass* Negate(); |
257 | |
258 | private: |
259 | CharClass(); // not implemented |
260 | ~CharClass(); // not implemented |
261 | static CharClass* New(int maxranges); |
262 | |
263 | friend class CharClassBuilder; |
264 | |
265 | bool folds_ascii_; |
266 | int nrunes_; |
267 | RuneRange *ranges_; |
268 | int nranges_; |
269 | |
270 | CharClass(const CharClass&) = delete; |
271 | CharClass& operator=(const CharClass&) = delete; |
272 | }; |
273 | |
274 | class Regexp { |
275 | public: |
276 | |
277 | // Flags for parsing. Can be ORed together. |
278 | enum ParseFlags { |
279 | NoParseFlags = 0, |
280 | FoldCase = 1<<0, // Fold case during matching (case-insensitive). |
281 | Literal = 1<<1, // Treat s as literal string instead of a regexp. |
282 | ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s |
283 | // and [[:space:]] to match newline. |
284 | DotNL = 1<<3, // Allow . to match newline. |
285 | MatchNL = ClassNL | DotNL, |
286 | OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and |
287 | // end of text, not around embedded newlines. |
288 | // (Perl's default) |
289 | Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8. |
290 | NonGreedy = 1<<6, // Repetition operators are non-greedy by default. |
291 | PerlClasses = 1<<7, // Allow Perl character classes like \d. |
292 | PerlB = 1<<8, // Allow Perl's \b and \B. |
293 | PerlX = 1<<9, // Perl extensions: |
294 | // non-capturing parens - (?: ) |
295 | // non-greedy operators - *? +? ?? {}? |
296 | // flag edits - (?i) (?-i) (?i: ) |
297 | // i - FoldCase |
298 | // m - !OneLine |
299 | // s - DotNL |
300 | // U - NonGreedy |
301 | // line ends: \A \z |
302 | // \Q and \E to disable/enable metacharacters |
303 | // (?P<name>expr) for named captures |
304 | // \C to match any single byte |
305 | UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group |
306 | // and \P{Han} for its negation. |
307 | NeverNL = 1<<11, // Never match NL, even if the regexp mentions |
308 | // it explicitly. |
309 | NeverCapture = 1<<12, // Parse all parens as non-capturing. |
310 | |
311 | // As close to Perl as we can get. |
312 | LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX | |
313 | UnicodeGroups, |
314 | |
315 | // Internal use only. |
316 | WasDollar = 1<<13, // on kRegexpEndText: was $ in regexp text |
317 | AllParseFlags = (1<<14)-1, |
318 | }; |
319 | |
320 | // Get. No set, Regexps are logically immutable once created. |
321 | RegexpOp op() { return static_cast<RegexpOp>(op_); } |
322 | int nsub() { return nsub_; } |
323 | bool simple() { return simple_ != 0; } |
324 | ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); } |
325 | int Ref(); // For testing. |
326 | |
327 | Regexp** sub() { |
328 | if(nsub_ <= 1) |
329 | return &subone_; |
330 | else |
331 | return submany_; |
332 | } |
333 | |
334 | int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; } |
335 | int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; } |
336 | Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; } |
337 | CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; } |
338 | int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; } |
339 | const std::string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; } |
340 | Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; } |
341 | int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; } |
342 | int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; } |
343 | |
344 | // Increments reference count, returns object as convenience. |
345 | Regexp* Incref(); |
346 | |
347 | // Decrements reference count and deletes this object if count reaches 0. |
348 | void Decref(); |
349 | |
350 | // Parses string s to produce regular expression, returned. |
351 | // Caller must release return value with re->Decref(). |
352 | // On failure, sets *status (if status != NULL) and returns NULL. |
353 | static Regexp* Parse(const StringPiece& s, ParseFlags flags, |
354 | RegexpStatus* status); |
355 | |
356 | // Returns a _new_ simplified version of the current regexp. |
357 | // Does not edit the current regexp. |
358 | // Caller must release return value with re->Decref(). |
359 | // Simplified means that counted repetition has been rewritten |
360 | // into simpler terms and all Perl/POSIX features have been |
361 | // removed. The result will capture exactly the same |
362 | // subexpressions the original did, unless formatted with ToString. |
363 | Regexp* Simplify(); |
364 | friend class CoalesceWalker; |
365 | friend class SimplifyWalker; |
366 | |
367 | // Parses the regexp src and then simplifies it and sets *dst to the |
368 | // string representation of the simplified form. Returns true on success. |
369 | // Returns false and sets *status (if status != NULL) on parse error. |
370 | static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags, |
371 | std::string* dst, RegexpStatus* status); |
372 | |
373 | // Returns the number of capturing groups in the regexp. |
374 | int NumCaptures(); |
375 | friend class NumCapturesWalker; |
376 | |
377 | // Returns a map from names to capturing group indices, |
378 | // or NULL if the regexp contains no named capture groups. |
379 | // The caller is responsible for deleting the map. |
380 | std::map<std::string, int>* NamedCaptures(); |
381 | |
382 | // Returns a map from capturing group indices to capturing group |
383 | // names or NULL if the regexp contains no named capture groups. The |
384 | // caller is responsible for deleting the map. |
385 | std::map<int, std::string>* CaptureNames(); |
386 | |
387 | // Returns a string representation of the current regexp, |
388 | // using as few parentheses as possible. |
389 | std::string ToString(); |
390 | |
391 | // Convenience functions. They consume the passed reference, |
392 | // so in many cases you should use, e.g., Plus(re->Incref(), flags). |
393 | // They do not consume allocated arrays like subs or runes. |
394 | static Regexp* Plus(Regexp* sub, ParseFlags flags); |
395 | static Regexp* Star(Regexp* sub, ParseFlags flags); |
396 | static Regexp* Quest(Regexp* sub, ParseFlags flags); |
397 | static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags); |
398 | static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags); |
399 | static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap); |
400 | static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max); |
401 | static Regexp* NewLiteral(Rune rune, ParseFlags flags); |
402 | static Regexp* NewCharClass(CharClass* cc, ParseFlags flags); |
403 | static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags); |
404 | static Regexp* HaveMatch(int match_id, ParseFlags flags); |
405 | |
406 | // Like Alternate but does not factor out common prefixes. |
407 | static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags); |
408 | |
409 | // Debugging function. Returns string format for regexp |
410 | // that makes structure clear. Does NOT use regexp syntax. |
411 | std::string Dump(); |
412 | |
413 | // Helper traversal class, defined fully in walker-inl.h. |
414 | template<typename T> class Walker; |
415 | |
416 | // Compile to Prog. See prog.h |
417 | // Reverse prog expects to be run over text backward. |
418 | // Construction and execution of prog will |
419 | // stay within approximately max_mem bytes of memory. |
420 | // If max_mem <= 0, a reasonable default is used. |
421 | Prog* CompileToProg(int64_t max_mem); |
422 | Prog* CompileToReverseProg(int64_t max_mem); |
423 | |
424 | // Whether to expect this library to find exactly the same answer as PCRE |
425 | // when running this regexp. Most regexps do mimic PCRE exactly, but a few |
426 | // obscure cases behave differently. Technically this is more a property |
427 | // of the Prog than the Regexp, but the computation is much easier to do |
428 | // on the Regexp. See mimics_pcre.cc for the exact conditions. |
429 | bool MimicsPCRE(); |
430 | |
431 | // Benchmarking function. |
432 | void NullWalk(); |
433 | |
434 | // Whether every match of this regexp must be anchored and |
435 | // begin with a non-empty fixed string (perhaps after ASCII |
436 | // case-folding). If so, returns the prefix and the sub-regexp that |
437 | // follows it. |
438 | // Callers should expect *prefix, *foldcase and *suffix to be "zeroed" |
439 | // regardless of the return value. |
440 | bool RequiredPrefix(std::string* prefix, bool* foldcase, |
441 | Regexp** suffix); |
442 | |
443 | private: |
444 | // Constructor allocates vectors as appropriate for operator. |
445 | explicit Regexp(RegexpOp op, ParseFlags parse_flags); |
446 | |
447 | // Use Decref() instead of delete to release Regexps. |
448 | // This is private to catch deletes at compile time. |
449 | ~Regexp(); |
450 | void Destroy(); |
451 | bool QuickDestroy(); |
452 | |
453 | // Helpers for Parse. Listed here so they can edit Regexps. |
454 | class ParseState; |
455 | |
456 | friend class ParseState; |
457 | friend bool ParseCharClass(StringPiece* s, Regexp** out_re, |
458 | RegexpStatus* status); |
459 | |
460 | // Helper for testing [sic]. |
461 | friend bool RegexpEqualTestingOnly(Regexp*, Regexp*); |
462 | |
463 | // Computes whether Regexp is already simple. |
464 | bool ComputeSimple(); |
465 | |
466 | // Constructor that generates a Star, Plus or Quest, |
467 | // squashing the pair if sub is also a Star, Plus or Quest. |
468 | static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags); |
469 | |
470 | // Constructor that generates a concatenation or alternation, |
471 | // enforcing the limit on the number of subexpressions for |
472 | // a particular Regexp. |
473 | static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs, |
474 | ParseFlags flags, bool can_factor); |
475 | |
476 | // Returns the leading string that re starts with. |
477 | // The returned Rune* points into a piece of re, |
478 | // so it must not be used after the caller calls re->Decref(). |
479 | static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags); |
480 | |
481 | // Removes the first n leading runes from the beginning of re. |
482 | // Edits re in place. |
483 | static void RemoveLeadingString(Regexp* re, int n); |
484 | |
485 | // Returns the leading regexp in re's top-level concatenation. |
486 | // The returned Regexp* points at re or a sub-expression of re, |
487 | // so it must not be used after the caller calls re->Decref(). |
488 | static Regexp* LeadingRegexp(Regexp* re); |
489 | |
490 | // Removes LeadingRegexp(re) from re and returns the remainder. |
491 | // Might edit re in place. |
492 | static Regexp* RemoveLeadingRegexp(Regexp* re); |
493 | |
494 | // Simplifies an alternation of literal strings by factoring out |
495 | // common prefixes. |
496 | static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags); |
497 | friend class FactorAlternationImpl; |
498 | |
499 | // Is a == b? Only efficient on regexps that have not been through |
500 | // Simplify yet - the expansion of a kRegexpRepeat will make this |
501 | // take a long time. Do not call on such regexps, hence private. |
502 | static bool Equal(Regexp* a, Regexp* b); |
503 | |
504 | // Allocate space for n sub-regexps. |
505 | void AllocSub(int n) { |
506 | DCHECK(n >= 0 && static_cast<uint16_t>(n) == n); |
507 | if (n > 1) |
508 | submany_ = new Regexp*[n]; |
509 | nsub_ = static_cast<uint16_t>(n); |
510 | } |
511 | |
512 | // Add Rune to LiteralString |
513 | void AddRuneToString(Rune r); |
514 | |
515 | // Swaps this with that, in place. |
516 | void Swap(Regexp *that); |
517 | |
518 | // Operator. See description of operators above. |
519 | // uint8_t instead of RegexpOp to control space usage. |
520 | uint8_t op_; |
521 | |
522 | // Is this regexp structure already simple |
523 | // (has it been returned by Simplify)? |
524 | // uint8_t instead of bool to control space usage. |
525 | uint8_t simple_; |
526 | |
527 | // Flags saved from parsing and used during execution. |
528 | // (Only FoldCase is used.) |
529 | // uint16_t instead of ParseFlags to control space usage. |
530 | uint16_t parse_flags_; |
531 | |
532 | // Reference count. Exists so that SimplifyRegexp can build |
533 | // regexp structures that are dags rather than trees to avoid |
534 | // exponential blowup in space requirements. |
535 | // uint16_t to control space usage. |
536 | // The standard regexp routines will never generate a |
537 | // ref greater than the maximum repeat count (kMaxRepeat), |
538 | // but even so, Incref and Decref consult an overflow map |
539 | // when ref_ reaches kMaxRef. |
540 | uint16_t ref_; |
541 | static const uint16_t kMaxRef = 0xffff; |
542 | |
543 | // Subexpressions. |
544 | // uint16_t to control space usage. |
545 | // Concat and Alternate handle larger numbers of subexpressions |
546 | // by building concatenation or alternation trees. |
547 | // Other routines should call Concat or Alternate instead of |
548 | // filling in sub() by hand. |
549 | uint16_t nsub_; |
550 | static const uint16_t kMaxNsub = 0xffff; |
551 | union { |
552 | Regexp** submany_; // if nsub_ > 1 |
553 | Regexp* subone_; // if nsub_ == 1 |
554 | }; |
555 | |
556 | // Extra space for parse and teardown stacks. |
557 | Regexp* down_; |
558 | |
559 | // Arguments to operator. See description of operators above. |
560 | union { |
561 | struct { // Repeat |
562 | int max_; |
563 | int min_; |
564 | }; |
565 | struct { // Capture |
566 | int cap_; |
567 | std::string* name_; |
568 | }; |
569 | struct { // LiteralString |
570 | int nrunes_; |
571 | Rune* runes_; |
572 | }; |
573 | struct { // CharClass |
574 | // These two could be in separate union members, |
575 | // but it wouldn't save any space (there are other two-word structs) |
576 | // and keeping them separate avoids confusion during parsing. |
577 | CharClass* cc_; |
578 | CharClassBuilder* ccb_; |
579 | }; |
580 | Rune rune_; // Literal |
581 | int match_id_; // HaveMatch |
582 | void *the_union_[2]; // as big as any other element, for memset |
583 | }; |
584 | |
585 | Regexp(const Regexp&) = delete; |
586 | Regexp& operator=(const Regexp&) = delete; |
587 | }; |
588 | |
589 | // Character class set: contains non-overlapping, non-abutting RuneRanges. |
590 | typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet; |
591 | |
592 | class CharClassBuilder { |
593 | public: |
594 | CharClassBuilder(); |
595 | |
596 | typedef RuneRangeSet::iterator iterator; |
597 | iterator begin() { return ranges_.begin(); } |
598 | iterator end() { return ranges_.end(); } |
599 | |
600 | int size() { return nrunes_; } |
601 | bool empty() { return nrunes_ == 0; } |
602 | bool full() { return nrunes_ == Runemax+1; } |
603 | |
604 | bool Contains(Rune r); |
605 | bool FoldsASCII(); |
606 | bool AddRange(Rune lo, Rune hi); // returns whether class changed |
607 | CharClassBuilder* Copy(); |
608 | void AddCharClass(CharClassBuilder* cc); |
609 | void Negate(); |
610 | void RemoveAbove(Rune r); |
611 | CharClass* GetCharClass(); |
612 | void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags); |
613 | |
614 | private: |
615 | static const uint32_t AlphaMask = (1<<26) - 1; |
616 | uint32_t upper_; // bitmap of A-Z |
617 | uint32_t lower_; // bitmap of a-z |
618 | int nrunes_; |
619 | RuneRangeSet ranges_; |
620 | |
621 | CharClassBuilder(const CharClassBuilder&) = delete; |
622 | CharClassBuilder& operator=(const CharClassBuilder&) = delete; |
623 | }; |
624 | |
625 | // Bitwise ops on ParseFlags produce ParseFlags. |
626 | inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, |
627 | Regexp::ParseFlags b) { |
628 | return static_cast<Regexp::ParseFlags>( |
629 | static_cast<int>(a) | static_cast<int>(b)); |
630 | } |
631 | |
632 | inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, |
633 | Regexp::ParseFlags b) { |
634 | return static_cast<Regexp::ParseFlags>( |
635 | static_cast<int>(a) ^ static_cast<int>(b)); |
636 | } |
637 | |
638 | inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, |
639 | Regexp::ParseFlags b) { |
640 | return static_cast<Regexp::ParseFlags>( |
641 | static_cast<int>(a) & static_cast<int>(b)); |
642 | } |
643 | |
644 | inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) { |
645 | // Attempting to produce a value out of enum's range has undefined behaviour. |
646 | return static_cast<Regexp::ParseFlags>( |
647 | ~static_cast<int>(a) & static_cast<int>(Regexp::AllParseFlags)); |
648 | } |
649 | |
650 | } // namespace re2 |
651 | |
652 | #endif // RE2_REGEXP_H_ |
653 | |