1 | // Licensed to the .NET Foundation under one or more agreements. |
2 | // The .NET Foundation licenses this file to you under the MIT license. |
3 | // See the LICENSE file in the project root for more information. |
4 | // |
5 | // Provides basic interpreted regular expression matching. This is meant as a debugging tool, |
6 | // and if regular expressions become necessary in a non-debug scenario great care should be |
7 | // used to ensure that performance is not impaired, and a more thorough review of this could |
8 | // would also be a good thing. This file does not include any concrete instantiations but |
9 | // instead provides the basic building blocks. Some concrete instantiations can be found in |
10 | // regex_util.h. |
11 | // |
12 | // NOTE: See code:clr::regex::RegExBase (below) for description of supported regex language. |
13 | // |
14 | // NOTE: we had to forego standard options such as tr1::regex |
15 | // (http://en.wikipedia.org/wiki/Technical_Report_1#Regular_Expressions) and Microsoft's |
16 | // internal GRETA regular expressions (http://toolbox/sites/987/default.aspx) because they |
17 | // both rely heavily on the STL, which can not currently be used within the CLR. |
18 | // |
19 | // NOTE: If this becomes non-debug-only, then read the comment on WCHARItemTraits for what |
20 | // what needs fixing. |
21 | // |
22 | |
23 | // |
24 | |
25 | #ifndef _REGEX_BASE_H_ |
26 | #define _REGEX_BASE_H_ |
27 | |
28 | // Forward declare namespace so that it is not debug-only (even if currently there is nothing |
29 | // but debug-only code in the namespace). This enables a "using namespace clr;" line in a |
30 | // header file without having to worry about whether or not it's in a debug-only block. |
31 | namespace clr {} |
32 | |
33 | #ifdef _DEBUG |
34 | |
35 | #include "utilcode.h" // for string hash functions |
36 | #include "sstring.h" |
37 | |
38 | namespace clr { |
39 | namespace regex { |
40 | |
41 | // Implementation details. Code contained in any "imp" namespace should never be directly used |
42 | // by clients of RegEx. |
43 | namespace imp { |
44 | |
45 | //=================================================================================================== |
46 | // Helper for clr::regex::RegExBase. See class definition for clr::regex::RegExBase below for more |
47 | // information. |
48 | |
49 | template <typename ITEM_TRAITS, typename GROUP_CONTAINER> |
50 | class RegExBaseHelper : protected ITEM_TRAITS |
51 | { |
52 | public: |
53 | typedef typename ITEM_TRAITS::RegexIterator RegexIterator; |
54 | typedef typename ITEM_TRAITS::InputIterator InputIterator; |
55 | |
56 | typedef typename ITEM_TRAITS::MatchFlags MatchFlags; |
57 | static const MatchFlags DefaultMatchFlags = ITEM_TRAITS::DefaultMatchFlags; |
58 | |
59 | // Arguments: |
60 | // regex : marks the start of the regular expression string. |
61 | // regexEnd : marks the end of the regular expression string. |
62 | // input : marks the start of the input string against which regex will be matched. |
63 | // inputEnd : marks the end of the input string. |
64 | // groups : recipient of regular expression groups. |
65 | // |
66 | // Returns true if the regular expression was successfully matched against the input string; |
67 | // otherwise false. |
68 | |
69 | RegExBaseHelper(const RegexIterator& regex, |
70 | const RegexIterator& regexEnd, |
71 | const InputIterator& input, |
72 | const InputIterator& inputEnd, |
73 | GROUP_CONTAINER& groups, |
74 | const MatchFlags flags = DefaultMatchFlags); |
75 | |
76 | // The main entrypoint to RegExBaseHelper, Match will attempt to match the regular expression |
77 | // defined by [regex,regexEnd) against the input defined by [input,inputEnd). |
78 | bool Match() |
79 | { WRAPPER_NO_CONTRACT; return DoMatch(m_regex, m_input); } |
80 | |
81 | protected: |
82 | typedef typename ITEM_TRAITS::Item Item; |
83 | typedef typename ITEM_TRAITS::ItemType ItemType; |
84 | |
85 | // Try to match regex at any point within input, starting with the first character and moving |
86 | // along one at a time until a match is found or the end of the input is encountered, whichever |
87 | // comes first. |
88 | bool DoMatch( |
89 | const RegexIterator& regex, |
90 | InputIterator input); |
91 | |
92 | // Try to match regex starting exactly at input. |
93 | bool DoMatchHere( |
94 | const RegexIterator& regex, |
95 | const InputIterator& input); |
96 | |
97 | // The function returns true if a match is found consisting of zero or more items c followed by a |
98 | // successful match of regex on the remaining input; otherwise false is returned. This is a |
99 | // conservative match, so it starts with trying to match zero items followed by regex, |
100 | // and will then try to match one item followed by regex. |
101 | bool DoMatchStar( |
102 | const Item& c, |
103 | const RegexIterator& regex, |
104 | InputIterator input); |
105 | |
106 | // The function returns true if a match is found consisting of zero or more items c followed by a |
107 | // successful match of regex on the remaining input; otherwise false is returned. This is a |
108 | // greedy match, so it starts with trying to match as many items as it can followed by regex, |
109 | // and on failure will try again with one less items matched. |
110 | bool DoMatchStarEagerly( |
111 | const Item& c, |
112 | const RegexIterator& regex, |
113 | InputIterator input); |
114 | |
115 | // Convenience method. |
116 | Item GetItem( |
117 | const RegexIterator ®ex) |
118 | { WRAPPER_NO_CONTRACT; return ITEM_TRAITS::GetItem(regex, m_regexEnd, m_flags); } |
119 | |
120 | // Convenience method. |
121 | bool MatchItem( |
122 | const Item& c, |
123 | const InputIterator& input) |
124 | { WRAPPER_NO_CONTRACT; return ITEM_TRAITS::MatchItem(c, input, m_inputEnd, m_flags); } |
125 | |
126 | // Declared as protected to prevent direct instantiation. |
127 | RegExBaseHelper() |
128 | {} |
129 | |
130 | RegexIterator m_regex; |
131 | RegexIterator m_regexEnd; |
132 | InputIterator m_input; |
133 | InputIterator m_inputEnd; |
134 | GROUP_CONTAINER& m_groups; |
135 | MatchFlags m_flags; |
136 | }; |
137 | |
138 | //--------------------------------------------------------------------------------------------------- |
139 | // This method simply stores the end iterators for the regular expression and the input strings, as |
140 | // well as the group collection object and flags, and forwards the call to DoMatch. |
141 | |
142 | template <typename ITEM_TRAITS, typename GROUP_CONTAINER> |
143 | RegExBaseHelper<ITEM_TRAITS, GROUP_CONTAINER>::RegExBaseHelper( |
144 | const RegexIterator& regex, |
145 | const RegexIterator& regexEnd, |
146 | const InputIterator& input, |
147 | const InputIterator& inputEnd, |
148 | GROUP_CONTAINER& groups, |
149 | const MatchFlags flags) |
150 | : m_regex(regex), |
151 | m_regexEnd(regexEnd), |
152 | m_input(input), |
153 | m_inputEnd(inputEnd), |
154 | m_groups(groups), |
155 | m_flags(flags) |
156 | { WRAPPER_NO_CONTRACT; } |
157 | |
158 | //--------------------------------------------------------------------------------------------------- |
159 | // This method checks if the regular expression starts with a caret, indicating that any match must |
160 | // be anchored at the start of the input string. If such a caret exists, one match attempt is made |
161 | // on the input starting with the first character and the result is returned. If the regex does not |
162 | // start with a caret, the method attempts to match against the input string, starting at the first |
163 | // character and moving one character over for each successive attempt, until a match is found or |
164 | // the end of the input is encountered, whichever comes first. |
165 | |
166 | template <typename ITEM_TRAITS, typename GROUP_CONTAINER> |
167 | inline bool |
168 | RegExBaseHelper<ITEM_TRAITS, GROUP_CONTAINER>::DoMatch( |
169 | const RegexIterator& regex, |
170 | InputIterator input) |
171 | { |
172 | WRAPPER_NO_CONTRACT; |
173 | |
174 | if (GetItem(regex).GetType() == ITEM_TRAITS::CARET) |
175 | { // Match must occur from the beginning of the line |
176 | m_groups.OpenGroup(input, m_inputEnd); |
177 | bool res = DoMatchHere(regex+1, input); |
178 | if (!res) |
179 | m_groups.CancelGroup(); |
180 | return res; |
181 | } |
182 | else |
183 | { // Match can happen anywhere in the string |
184 | do |
185 | { // Attempt to match against each substring [x,inputEnd) for x = 0...inputEnd |
186 | |
187 | // Begin the group that contains the entire match. |
188 | m_groups.OpenGroup(input, m_inputEnd); |
189 | |
190 | if (DoMatchHere(regex, input)) |
191 | { // Success. Note that the entire match group is closed elsewhere on a |
192 | // successful match. |
193 | return true; |
194 | } |
195 | |
196 | // On failure, cancel the group so that it can be reopened on the new substring |
197 | m_groups.CancelGroup(); |
198 | } while (input++ != m_inputEnd); |
199 | } |
200 | |
201 | // No successful match found. |
202 | return false; |
203 | } |
204 | |
205 | //------------------------------------------------------------------------------------------------------- |
206 | // This is the main loop, which handles grouping constructs, repetition directives (*, *?, +, +?), and |
207 | // EOL matches ($), delegating all character matching to ITEM_TRAITS::MatchItem |
208 | // The general algorithm is: |
209 | // 1. Get the next item. |
210 | // 2. If the item is a DOLLAR type, check to see if we're at the end of the retular expression and |
211 | // the input string, and if so return success. |
212 | // 3. If the item is a grouping construct, open or close the appropriate group and continue matching. |
213 | // On failure, roll back the grouping change so that subsequent attemts will have correct state. |
214 | // 4. Check to see if the item following the current is a repetition directive, and if so take the |
215 | // appropriate action. |
216 | // 5. Otherwise defer to ITEM_TRAITS::MatchItem and if successful continue to match the remaining |
217 | // regular expression and input string; otherwise return failure. |
218 | |
219 | template <typename ITEM_TRAITS, typename GROUP_CONTAINER> |
220 | inline bool |
221 | RegExBaseHelper<ITEM_TRAITS, GROUP_CONTAINER>::DoMatchHere( |
222 | const RegexIterator& regex, |
223 | const InputIterator& input) |
224 | { |
225 | WRAPPER_NO_CONTRACT; |
226 | |
227 | if (regex == m_regexEnd) |
228 | { // Reached the end of the regular expression without ever returning false, |
229 | // implying a successful match. Close the overall match group and return. |
230 | m_groups.CloseGroup(input); |
231 | return true; |
232 | } |
233 | |
234 | Item c0 = GetItem(regex); |
235 | if (c0.GetType() == ITEM_TRAITS::DOLLAR && (c0.GetNext() == m_regexEnd)) |
236 | { // Matches EOL if a '$' is encountered at the end of the input. |
237 | m_groups.CloseGroup(input); |
238 | // Success only if we're actually at the end of the input string. |
239 | return input == m_inputEnd; |
240 | } |
241 | if (c0.GetType() == ITEM_TRAITS::PAREN_OPEN) |
242 | { // Encountered an open parenthesis ('('); open a new grouping. |
243 | m_groups.OpenGroup(input, m_inputEnd); |
244 | bool res = DoMatchHere(c0.GetNext(), input); |
245 | if (!res) |
246 | { // If a match fails, there could be further attempts (such as if |
247 | // there is an active repetition matching frame beneath us), so |
248 | // need to cancel the group we just opened so that the grouping |
249 | // structure remains consistent. |
250 | m_groups.CancelGroup(); |
251 | } |
252 | return res; |
253 | } |
254 | if (c0.GetType() == ITEM_TRAITS::PAREN_CLOSE) |
255 | { // Close the most recent open grouping. |
256 | COUNT_T i = m_groups.CloseGroup(input); |
257 | bool res = DoMatchHere(c0.GetNext(), input); |
258 | if (!res) |
259 | { // For the same reasons as the need to cancel an opened group |
260 | // explained above, we need to reopen the closed group if a |
261 | // match fails. |
262 | m_groups.ReopenGroup(i, m_inputEnd); |
263 | } |
264 | return res; |
265 | } |
266 | |
267 | if (c0.GetNext() != m_regexEnd) |
268 | { // If there is another item in the regex string following the current one, get |
269 | // it to see if it is a repetition matching directive. |
270 | Item c1 = GetItem(c0.GetNext()); |
271 | if (c1.GetType() == ITEM_TRAITS::STAR) |
272 | { // '*' matching directive encountered |
273 | if (c1.GetNext() != m_regexEnd) |
274 | { |
275 | Item c2 = GetItem(c1.GetNext()); |
276 | if (c2.GetType() == ITEM_TRAITS::QUESTION_MARK) |
277 | { // conservative matching semantics requested |
278 | return DoMatchStar(c0, c2.GetNext(), input); |
279 | } |
280 | } |
281 | // Eager matching |
282 | return DoMatchStarEagerly(c0, c1.GetNext(), input); |
283 | } |
284 | if (c1.GetType() == ITEM_TRAITS::PLUS) |
285 | { // '+' matching directive encountered. |
286 | if (c1.GetNext() != m_regexEnd) |
287 | { |
288 | Item c2 = GetItem(c1.GetNext()); |
289 | if (c2.GetType() == ITEM_TRAITS::QUESTION_MARK) |
290 | { // conservative matching semantics requested |
291 | return MatchItem(c0, input) && DoMatchStar(c0, c2.GetNext(), input+1); |
292 | } |
293 | } |
294 | // Eager matching |
295 | return MatchItem(c0, input) && DoMatchStarEagerly(c0, c1.GetNext(), input+1); |
296 | } |
297 | if (c1.GetType() == ITEM_TRAITS::QUESTION_MARK) |
298 | { // '?' matching directive encountered |
299 | return (MatchItem(c0, input) && DoMatchHere(c1.GetNext(), input+1)) || DoMatchHere(c1.GetNext(), input); |
300 | } |
301 | } |
302 | |
303 | // No special matching semantics encountered, delegate the matching to ITEM_TRAITS::MatchItem |
304 | return MatchItem(c0, input) && DoMatchHere(c0.GetNext(), input+1); |
305 | } |
306 | |
307 | //------------------------------------------------------------------------------------------------------- |
308 | // Conservative '*' repetition matching. This attempts to match zero items c followed by a match of |
309 | // regex. If this fails, attempt to match one item c followed by a match of regex. Repeat until item c |
310 | // does not match or a successful match is found. |
311 | |
312 | template <typename ITEM_TRAITS, typename GROUP_CONTAINER> |
313 | inline bool |
314 | RegExBaseHelper<ITEM_TRAITS, GROUP_CONTAINER>::DoMatchStar( |
315 | const Item& c, |
316 | const RegexIterator& regex, |
317 | InputIterator input) |
318 | { |
319 | WRAPPER_NO_CONTRACT; |
320 | CONSISTENCY_CHECK(input != m_inputEnd); |
321 | |
322 | do { |
323 | if (DoMatchHere(regex, input)) |
324 | { // A successful match is found! |
325 | return true; |
326 | } |
327 | // No successful match, so try to match one more item and then attempt to match regex on the |
328 | // remaining input. |
329 | } while (input != m_inputEnd && MatchItem(c, input++)); |
330 | return false; |
331 | } |
332 | |
333 | //------------------------------------------------------------------------------------------------------- |
334 | // Similar to DoMatchStar above, except this algorithm matches as many items c as possible first followed |
335 | // by regex on the remaining input, and on failure tries again with a match against one less item c |
336 | // followed by regex on the remaining input, and repeats until there are no items c remaining to match |
337 | // and the zero item match followed by a match of regex on the entire remaining input fails. If any of |
338 | // the match attempts succeed, return success. |
339 | |
340 | template <typename ITEM_TRAITS, typename GROUP_CONTAINER> |
341 | inline bool |
342 | RegExBaseHelper<ITEM_TRAITS, GROUP_CONTAINER>::DoMatchStarEagerly( |
343 | const Item& c, |
344 | const RegexIterator& regex, |
345 | InputIterator input) |
346 | { |
347 | WRAPPER_NO_CONTRACT; |
348 | |
349 | // Make sure we keep a hold of how far back we can unwind. |
350 | InputIterator inputOrig = input; |
351 | |
352 | // First, determine the maximum number of matches against item c. |
353 | while (input != m_inputEnd && MatchItem(c, input)) |
354 | { |
355 | ++input; |
356 | } |
357 | |
358 | do |
359 | { // Work backwards from the maximum number of matches of item c until a match is found |
360 | // or until we have backed right up to the starting value of input (saved in inputOrig), |
361 | // at which time we admit failure. |
362 | if (DoMatchHere(regex, input)) |
363 | return true; |
364 | } while (inputOrig != input--); |
365 | return false; |
366 | } |
367 | |
368 | } // namespace imp |
369 | |
370 | //======================================================================================================= |
371 | // Represents a matched group using iterators to denote the string contained by [Begin(),End()). |
372 | |
373 | template<typename INPUT_ITERATOR> |
374 | class Group |
375 | { |
376 | public: |
377 | typedef INPUT_ITERATOR InputIterator; |
378 | |
379 | // |
380 | // Functions for accessing group properties |
381 | // |
382 | |
383 | // Returns the iterator indicating the start of the group |
384 | const InputIterator& Begin() const |
385 | { LIMITED_METHOD_CONTRACT; return m_begin; } |
386 | |
387 | // Returns the iterator indicating the first non-member of the group |
388 | const InputIterator& End() const |
389 | { LIMITED_METHOD_CONTRACT; return m_end; } |
390 | |
391 | // It is possible that m_end - m_begin could be greater than the maximum of COUNT_T. m_end and |
392 | // m_begin are the end and start of a string, so is entirely unlikely to overflow a COUNT_T. |
393 | // Conbined with the fact that this is debug-only code, opting not to replace all COUNT_T |
394 | // uses with SIZE_T. |
395 | COUNT_T Length() const |
396 | { WRAPPER_NO_CONTRACT; return static_cast<COUNT_T>(m_end - m_begin); } |
397 | |
398 | // |
399 | // Functions used by RegExBaseHelper to create grouping constructs. |
400 | // |
401 | |
402 | Group() |
403 | : m_isClosed(false), m_begin(), m_end() |
404 | { WRAPPER_NO_CONTRACT; } |
405 | |
406 | Group(const InputIterator& start, const InputIterator& end, bool isClosed = false) |
407 | : m_isClosed(isClosed), m_begin(start), m_end(end) |
408 | { WRAPPER_NO_CONTRACT; } |
409 | |
410 | void SetBegin(const InputIterator& start) |
411 | { WRAPPER_NO_CONTRACT; m_begin = start; } |
412 | |
413 | void SetEnd(const InputIterator& end) |
414 | { WRAPPER_NO_CONTRACT; m_end = end; } |
415 | |
416 | bool IsClosed() const |
417 | { LIMITED_METHOD_CONTRACT; return m_isClosed; } |
418 | |
419 | void SetIsClosed(bool isClosed) |
420 | { WRAPPER_NO_CONTRACT; m_isClosed = isClosed; } |
421 | |
422 | protected: |
423 | bool m_isClosed; |
424 | InputIterator m_begin; |
425 | InputIterator m_end; |
426 | }; |
427 | |
428 | //======================================================================================================= |
429 | // Represents a generic container of groups, defaulting to using Group<INPUT_ITERATOR> as its element |
430 | // type. This container satisfies the method requrements of RegExBase. When a match is successful, the |
431 | // match groups may be accessed using the index operator or the iterators definin the matched groups |
432 | // [Begin(), End()). |
433 | |
434 | template <typename INPUT_ITERATOR, typename GROUP_TYPE = Group<INPUT_ITERATOR> > |
435 | class GroupContainer |
436 | { |
437 | public: |
438 | typedef typename SArray<GROUP_TYPE>::Iterator Iterator; |
439 | |
440 | // |
441 | // Functions for enumerating groups |
442 | // |
443 | |
444 | GROUP_TYPE & operator[](COUNT_T idx) |
445 | { |
446 | WRAPPER_NO_CONTRACT; |
447 | CONSISTENCY_CHECK(((COUNT_T)(COUNT_T)idx) == idx); |
448 | return m_array[idx]; |
449 | } |
450 | |
451 | // Returns an iterator to the first matched group (which is always the string for the |
452 | // entire successfully matched string. Specific groups start at Begin()+1 and continue |
453 | // to End()-1. |
454 | Iterator Begin() |
455 | { WRAPPER_NO_CONTRACT; return m_array.Begin(); } |
456 | |
457 | // Returns the first invalid iterator value. |
458 | Iterator End() |
459 | { WRAPPER_NO_CONTRACT; return m_array.End(); } |
460 | |
461 | // |
462 | COUNT_T Count() const |
463 | { WRAPPER_NO_CONTRACT; return m_array.GetCount(); } |
464 | |
465 | // |
466 | // Functions used by RegExBaseHelper to create grouping constructs. |
467 | // |
468 | |
469 | // Note: OpenGroup takes an end iterator so that the group will have a valid (if possibly |
470 | // incorrect) endpoint in the case that the regular expression has unbalanced grouping |
471 | // parentheses. |
472 | void OpenGroup(const INPUT_ITERATOR& start, const INPUT_ITERATOR& end) |
473 | { WRAPPER_NO_CONTRACT; m_array.Append(GROUP_TYPE(start, end, false)); } |
474 | |
475 | COUNT_T CloseGroup(const INPUT_ITERATOR& end); |
476 | |
477 | void ReopenGroup(COUNT_T i, const INPUT_ITERATOR& end); |
478 | |
479 | void CancelGroup() |
480 | { WRAPPER_NO_CONTRACT; m_array.Delete(m_array.End() - 1); } |
481 | |
482 | private: |
483 | SArray<GROUP_TYPE> m_array; |
484 | }; |
485 | |
486 | //------------------------------------------------------------------------------------------------------- |
487 | // Works backwards from the most recently created group looking for an open group to close. Returns |
488 | // the index of the group that was closed, which is used in the event that a group needs to be |
489 | // reopened. |
490 | |
491 | template <typename INPUT_ITERATOR, typename GROUP_TYPE> |
492 | COUNT_T |
493 | GroupContainer<INPUT_ITERATOR, GROUP_TYPE>::CloseGroup( |
494 | const INPUT_ITERATOR& end) |
495 | { |
496 | WRAPPER_NO_CONTRACT; |
497 | |
498 | for (COUNT_T i = (COUNT_T)Count(); i > 0; --i) |
499 | { |
500 | if (!m_array[i-1].IsClosed()) |
501 | { |
502 | m_array[i-1].SetEnd(end); |
503 | m_array[i-1].SetIsClosed(true); |
504 | return i-1; |
505 | } |
506 | } |
507 | |
508 | _ASSERTE(!"Unmatched grouping constructs!" ); |
509 | return 0; |
510 | } |
511 | |
512 | //------------------------------------------------------------------------------------------------------- |
513 | // Reopen a group at the given index, using 'end' to overwrite the current end. |
514 | |
515 | template <typename INPUT_ITERATOR, typename GROUP_TYPE> |
516 | void |
517 | GroupContainer<INPUT_ITERATOR, GROUP_TYPE>::ReopenGroup( |
518 | COUNT_T i, |
519 | const INPUT_ITERATOR& end) |
520 | { |
521 | WRAPPER_NO_CONTRACT; |
522 | CONSISTENCY_CHECK(i > 0 && i < Count()); |
523 | |
524 | if (i > 0 && i < Count()) |
525 | { |
526 | m_array[i].SetEnd(end); |
527 | m_array[i].SetIsClosed(false); |
528 | } |
529 | } |
530 | |
531 | //======================================================================================================= |
532 | // Empty group container that satisfies the method requirements of RegExBase but has empty bodies. This |
533 | // allows for non-allocating matches when grouping is not required. |
534 | |
535 | template <typename INPUT_ITERATOR> |
536 | class NullGroupContainer |
537 | { |
538 | public: |
539 | void OpenGroup(INPUT_ITERATOR, INPUT_ITERATOR) {} |
540 | COUNT_T CloseGroup(INPUT_ITERATOR) { return 0; } |
541 | void ReopenGroup(COUNT_T, INPUT_ITERATOR) {} |
542 | void CancelGroup() {} |
543 | }; |
544 | |
545 | //======================================================================================================= |
546 | // This mini-implementation of regular expression matching supports the |
547 | // following constructs: |
548 | // ^ matches the beginning of the input string |
549 | // $ matches the end of the input string |
550 | // * matches zero or more occurrences of the previous item eagerly |
551 | // *? matches zero or more occurrences of the previous item conservatively |
552 | // + matches 1 or more occurrences of the previous item eagerly |
553 | // +? matches 1 or more occurrences of the previous item conservatively |
554 | // ? matches 0 or 1 occurences of the previous item |
555 | // ( starts a grouping |
556 | // ) ends a grouping |
557 | // |
558 | // IMPORTANT: These are just anchoring and grouping constructs. See the definition for ItemTraitsBase |
559 | // below for information on the default character classes that are supported. (The intent of |
560 | // this separation is to allow customization of the character classes where required.) |
561 | |
562 | // ITEM_TRAITS provides traits for individual tokens in a regular expression, as well as a mechanism for |
563 | // matching said individual components with the target string. RegexBase derives from ITEM_TRAITS in a |
564 | // protected fashion, and is responsible for providing the following: |
565 | // 1. "RegexIterator" typedef |
566 | // Used as an iterator into the regular expression, and used as arguments to indicate the start |
567 | // and the end of the regular expression string. |
568 | // 2. "InputIterator" typedef |
569 | // Used as an iterator into the input string, and used as arguments to indicate the start |
570 | // and the end of the input string. |
571 | // (NOTE: RegexIterator and InputIterator are often typedef'ed to be the same thing.) |
572 | // 3. "Item" typedef. |
573 | // This will be used with methods GetItem and MatchItem (see below). Item must |
574 | // define the the following methods: |
575 | // ItemType GetType() : returns the type of the item. See below for explanation of ItemType |
576 | // const RegexIterator& GetNext() : iterator pointing to the start of the next item. |
577 | // 4. "MatchFlags" typedef, and "static const DefaultMatchFlags" value. |
578 | // Provided for calls to "Match" and "Matches", and passed on to calls "GetItem" and "MatchItem". |
579 | // 5. enum ItemType |
580 | // Defines the following minimum values: |
581 | // DOT |
582 | // CARET |
583 | // DOLLAR |
584 | // STAR |
585 | // QUESTION_MARK |
586 | // PLUS |
587 | // PAREN_OPEN |
588 | // PAREN_CLOSE |
589 | // ItemType may include more values, and may even choose to ignore the above enum types, all of |
590 | // which must be recognized by GetItem and MatchItem (see below). |
591 | // 6. static Item GetItem(const RegexIterator& regex, |
592 | // const RegexIterator& regexEnd, |
593 | // const MatchFlags& flags) |
594 | // This method takes a regular expression iterator and returns the next regular expression |
595 | // element (Item) pointed to by the iterator. |
596 | // 7. static bool MatchItem(const Item& c, |
597 | // const InputIterator& input, |
598 | // const InputIterator& inputEnd, |
599 | // const MatchFlags &flags) |
600 | |
601 | // GROUP_CONTAINER provides functionality for keeping track of regular expression groups. This is a generic |
602 | // argument to Match, and the type of the object must support the following methods: |
603 | // 1. void OpenGroup(const InputIterator& start, const InputIterator& end); |
604 | // Called when a PAREN_OPEN item is encountered. |
605 | // 2. COUNT_T CloseGroup(const InputIterator& end); |
606 | // Called when a PAREN_CLOSE item is encountered. Returns the index of the group that was closed. |
607 | // 3. void ReopenGroup(COUNT_T i, const InputIterator& end); |
608 | // Called when a match following a call to CloseGroup fails, essentially requesting a rollback |
609 | // of the call to CloseGroup. |
610 | // 4. void CancelGroup(); |
611 | // Called when a match following a call to OpenGroup fails, essentially requesting a rollback |
612 | // of the call to OpenGroup. |
613 | |
614 | template <typename ITEM_TRAITS> |
615 | class RegExBase : public ITEM_TRAITS |
616 | { |
617 | public: |
618 | typedef typename ITEM_TRAITS::RegexIterator RegexIterator; |
619 | typedef typename ITEM_TRAITS::InputIterator InputIterator; |
620 | |
621 | // This is a convenience typedef that allows a caller to easily declare a grouping container |
622 | // to be passed to a call to Match. An example would be (see regex_util.h for a definition of |
623 | // SStringRegEx): |
624 | // |
625 | // SString input(SL"Simmons"); |
626 | // SStringRegEx::GroupingContainer container; |
627 | // if (SStringRegEx::Match(SL"(Sim+on)", input, container)) { |
628 | // printf("%S", container[1].GetSString(input).GetUnicode()); |
629 | // } |
630 | // |
631 | typedef GroupContainer<InputIterator, Group<InputIterator> > GroupingContainer; |
632 | |
633 | // Pulls down the typedef for MatchFlags and initialized a static representing the default flags. |
634 | typedef typename ITEM_TRAITS::MatchFlags MatchFlags; |
635 | static const MatchFlags DefaultMatchFlags = ITEM_TRAITS::DefaultMatchFlags; |
636 | |
637 | template <typename GROUP_CONTAINER> |
638 | static bool Match(RegexIterator regex, |
639 | RegexIterator regexEnd, |
640 | InputIterator input, |
641 | InputIterator inputEnd, |
642 | GROUP_CONTAINER& groups, |
643 | MatchFlags flags = DefaultMatchFlags) |
644 | { |
645 | imp::RegExBaseHelper<ITEM_TRAITS, GROUP_CONTAINER> |
646 | re(regex, regexEnd, input, inputEnd, groups, flags); |
647 | return re.Match(); |
648 | } |
649 | |
650 | static bool Matches(RegexIterator regex, |
651 | RegexIterator regexEnd, |
652 | InputIterator input, |
653 | InputIterator inputEnd, |
654 | MatchFlags flags = DefaultMatchFlags) |
655 | { |
656 | NullGroupContainer<InputIterator> ngc; |
657 | return Match(regex, regexEnd, input, inputEnd, ngc, flags); |
658 | } |
659 | }; |
660 | |
661 | //======================================================================================================= |
662 | // In addition to requirements specified on RegExBase, StandardItemTraits provides the following |
663 | // additinal regular expression item types. |
664 | // c matches any literal character c |
665 | // . matches any single character |
666 | // \d any literal digit character |
667 | // \w any alpha character |
668 | // \s any whitespace character |
669 | // |
670 | // Child types of ItemTraitsBase must implement GetItem and MatchItem (see below for full |
671 | // signature requirements). Current child type implementations permit a backslash ('\') to escape |
672 | // special characters ('.', '$', '*', etc.) and allow them to be interpreted as literal characters. |
673 | // |
674 | // This type describes a particular behaviour, but must be subtyped for the particular target character |
675 | // needed, and GetItem and MatchItem must be implemented. |
676 | // |
677 | |
678 | template <typename ITERATOR_TYPE, typename CHAR_TYPE> |
679 | class ItemTraitsBase |
680 | { |
681 | public: |
682 | typedef ITERATOR_TYPE RegexIterator; |
683 | typedef ITERATOR_TYPE InputIterator; |
684 | |
685 | enum MatchFlags |
686 | { |
687 | MF_NONE = 0x00, |
688 | MF_CASE_INSENSITIVE = 0x01 // Match character literals as case insensitive. |
689 | }; |
690 | |
691 | static const MatchFlags DefaultMatchFlags = MF_NONE; |
692 | |
693 | protected: |
694 | ItemTraitsBase() |
695 | {} |
696 | |
697 | enum ItemType |
698 | { |
699 | // REQUIRED, as described in RegExBase |
700 | CARET, |
701 | DOLLAR, |
702 | STAR, |
703 | QUESTION_MARK, |
704 | PLUS, |
705 | PAREN_OPEN, |
706 | PAREN_CLOSE, |
707 | // ADDITIONAL |
708 | DOT, // any literal character |
709 | DIGIT, // any digit |
710 | ALPHA, // any alpha character, upper or lower case |
711 | WHITESPACE, // any whitespace character |
712 | NON_WHITESPACE, // any non-whitespace character |
713 | CHARACTER, // a specific literal character |
714 | }; |
715 | |
716 | class Item |
717 | { |
718 | public: |
719 | Item(ItemType type, CHAR_TYPE val, const RegexIterator& next) |
720 | : m_type(type), m_val(val), m_next(next) |
721 | { WRAPPER_NO_CONTRACT; } |
722 | |
723 | Item(ItemType type, const RegexIterator& next) |
724 | : m_type(type), m_val(0), m_next(next) |
725 | { WRAPPER_NO_CONTRACT; } |
726 | |
727 | ItemType GetType() const |
728 | { LIMITED_METHOD_CONTRACT; return m_type; } |
729 | |
730 | const RegexIterator& GetNext() const |
731 | { LIMITED_METHOD_CONTRACT; return m_next; } |
732 | |
733 | CHAR_TYPE GetValue() const |
734 | { LIMITED_METHOD_CONTRACT; return m_val; } |
735 | |
736 | protected: |
737 | ItemType m_type; |
738 | CHAR_TYPE m_val; |
739 | RegexIterator m_next; |
740 | }; |
741 | |
742 | // All deriving types must add the following methods: |
743 | // static Item GetItem(const RegexIterator& regex, const RegexIterator& regexEnd); |
744 | // static bool MatchItem(const Item& c, const InputIterator& input, const InputIterator& inputEnd); |
745 | }; |
746 | |
747 | //======================================================================================================= |
748 | // Implements ItemTraitsBase, provides matching for UNICODE characters. |
749 | // |
750 | // !!!IMPORTANT!!! |
751 | // This is not a complete unicode implementation - only the equivalent of ASCII alpha characters are |
752 | // consider to be part of the alpha set, and this is also the only set on which case insensitive |
753 | // operations will correctly work. If RegEx is moved out of DEBUG ONLY, then this will have to be fixed |
754 | // to properly address these issues. |
755 | // !!!IMPORTANT!!! |
756 | |
757 | template <typename ITERATOR_TYPE> |
758 | class WCHARItemTraits : public ItemTraitsBase<ITERATOR_TYPE, WCHAR> |
759 | { |
760 | public: |
761 | typedef ItemTraitsBase<ITERATOR_TYPE, WCHAR> PARENT_TYPE; |
762 | typedef typename PARENT_TYPE::RegexIterator RegexIterator; |
763 | typedef typename PARENT_TYPE::InputIterator InputIterator; |
764 | typedef typename PARENT_TYPE::Item Item; |
765 | typedef typename PARENT_TYPE::MatchFlags MatchFlags; |
766 | |
767 | static Item GetItem(const RegexIterator& regex, const RegexIterator& regexEnd, MatchFlags flags); |
768 | static bool MatchItem(const Item& c, const InputIterator& input, const InputIterator& inputEnd, MatchFlags flags); |
769 | |
770 | protected: |
771 | WCHARItemTraits() |
772 | {} |
773 | |
774 | private: |
775 | static inline bool IS_UPPER_A_TO_Z(WCHAR x) |
776 | { WRAPPER_NO_CONTRACT; return (((x) >= W('A')) && ((x) <= W('Z'))); } |
777 | |
778 | static inline bool IS_LOWER_A_TO_Z(WCHAR x) |
779 | { WRAPPER_NO_CONTRACT; return (((x) >= W('a')) && ((x) <= W('z'))); } |
780 | |
781 | static inline WCHAR UPCASE(WCHAR x) |
782 | { WRAPPER_NO_CONTRACT; return (IS_LOWER_A_TO_Z(x) ? ((x) - W('a') + W('A')) : (x)); } |
783 | |
784 | static inline WCHAR DOWNCASE(WCHAR x) |
785 | { WRAPPER_NO_CONTRACT; return (IS_UPPER_A_TO_Z(x) ? ((x) - W('A') + W('a')) : (x)); } |
786 | |
787 | static bool MatchCharacter(WCHAR c1, WCHAR c2, MatchFlags flags) |
788 | { WRAPPER_NO_CONTRACT; return (flags & PARENT_TYPE::MF_CASE_INSENSITIVE) ? (DOWNCASE(c1) == DOWNCASE(c2)) : (c1 == c2); } |
789 | }; |
790 | |
791 | //------------------------------------------------------------------------------------------------------- |
792 | // Reads the next item from regex, recognizing special characters outlined in ItemTraitsBase. |
793 | |
794 | template <typename ITERATOR_TYPE> |
795 | typename WCHARItemTraits<ITERATOR_TYPE>::Item |
796 | WCHARItemTraits<ITERATOR_TYPE>::GetItem( |
797 | const RegexIterator& regex, |
798 | const RegexIterator& regexEnd, |
799 | MatchFlags flags) |
800 | { |
801 | WRAPPER_NO_CONTRACT; |
802 | |
803 | if (*regex == W('\\')) |
804 | { |
805 | const RegexIterator regexNext = regex+1; |
806 | if (regexNext == regexEnd) |
807 | return Item(PARENT_TYPE::CHARACTER, W('\\'), regexNext); |
808 | if (*regexNext == W('d')) |
809 | return Item(PARENT_TYPE::DIGIT, regexNext+1); |
810 | if (*regexNext == W('w')) |
811 | return Item(PARENT_TYPE::ALPHA, regexNext+1); |
812 | if (*regexNext == W('s')) |
813 | return Item(PARENT_TYPE::WHITESPACE, regexNext+1); |
814 | if (*regexNext == W('S')) |
815 | return Item(PARENT_TYPE::NON_WHITESPACE, regexNext+1); |
816 | return Item(PARENT_TYPE::CHARACTER, *regexNext, regexNext+1); |
817 | } |
818 | else if (*regex == W('.')) |
819 | return Item(PARENT_TYPE::DOT, W('.'), regex+1); |
820 | else if (*regex == W('^')) |
821 | return Item(PARENT_TYPE::CARET, W('^'), regex+1); |
822 | else if (*regex == W('$')) |
823 | return Item(PARENT_TYPE::DOLLAR, W('$'), regex+1); |
824 | else if (*regex == W('*')) |
825 | return Item(PARENT_TYPE::STAR, W('*'), regex+1); |
826 | else if (*regex == W('?')) |
827 | return Item(PARENT_TYPE::QUESTION_MARK, W('?'), regex+1); |
828 | else if (*regex == W('+')) |
829 | return Item(PARENT_TYPE::PLUS, W('+'), regex+1); |
830 | else if (*regex == W('(')) |
831 | return Item(PARENT_TYPE::PAREN_OPEN, W('('), regex+1); |
832 | else if (*regex == W(')')) |
833 | return Item(PARENT_TYPE::PAREN_CLOSE, W(')'), regex+1); |
834 | else |
835 | return Item(PARENT_TYPE::CHARACTER, *regex, regex + 1); |
836 | } |
837 | |
838 | //------------------------------------------------------------------------------------------------------- |
839 | // Returns true if the next character point to by input matches the character class described by c. |
840 | |
841 | template <typename ITERATOR_TYPE> |
842 | bool |
843 | WCHARItemTraits<ITERATOR_TYPE>::MatchItem( |
844 | const Item& c, |
845 | const InputIterator& input, |
846 | const InputIterator& inputEnd, |
847 | MatchFlags flags) |
848 | { |
849 | WRAPPER_NO_CONTRACT; |
850 | |
851 | if (c.GetType() == PARENT_TYPE::DIGIT) |
852 | return *input >= W('0') && *input <= W('9'); |
853 | else if (c.GetType() == PARENT_TYPE::ALPHA) |
854 | return (*input >= W('a') && *input <= W('z')) || (*input >= W('A') && *input <= W('Z')); |
855 | else if (c.GetType() == PARENT_TYPE::WHITESPACE) |
856 | return *input == W(' ') || *input == W('\t'); |
857 | else if (c.GetType() == PARENT_TYPE::NON_WHITESPACE) |
858 | return !(*input == W(' ') || *input == W('\t')); |
859 | else |
860 | return c.GetType() == PARENT_TYPE::DOT || MatchCharacter(c.GetValue(), *input, flags); |
861 | } |
862 | |
863 | //======================================================================================================= |
864 | // Implements ItemTraitsBase, provides matching for ASCII (*not* UTF8) characters. |
865 | |
866 | template <typename ITERATOR_TYPE> |
867 | class CHARItemTraits : public ItemTraitsBase<ITERATOR_TYPE, CHAR> |
868 | { |
869 | public: |
870 | typedef ItemTraitsBase<ITERATOR_TYPE, CHAR> PARENT_TYPE; |
871 | typedef typename PARENT_TYPE::RegexIterator RegexIterator; |
872 | typedef typename PARENT_TYPE::InputIterator InputIterator; |
873 | typedef typename PARENT_TYPE::Item Item; |
874 | typedef typename PARENT_TYPE::MatchFlags MatchFlags; |
875 | |
876 | static Item GetItem(const RegexIterator& regex, const RegexIterator& regexEnd, MatchFlags flags); |
877 | static bool MatchItem(const Item& c, const InputIterator& input, const InputIterator& inputEnd, MatchFlags flags); |
878 | |
879 | protected: |
880 | CHARItemTraits() |
881 | {} |
882 | |
883 | private: |
884 | static inline bool IS_UPPER_A_TO_Z(CHAR x) |
885 | { WRAPPER_NO_CONTRACT; return (((x) >= 'A') && ((x) <= 'Z')); } |
886 | |
887 | static inline bool IS_LOWER_A_TO_Z(CHAR x) |
888 | { WRAPPER_NO_CONTRACT; return (((x) >= 'a') && ((x) <= 'z')); } |
889 | |
890 | static inline CHAR UPCASE(CHAR x) |
891 | { WRAPPER_NO_CONTRACT; return (IS_LOWER_A_TO_Z(x) ? ((x) - 'a' + 'A') : (x)); } |
892 | |
893 | static inline CHAR DOWNCASE(CHAR x) |
894 | { WRAPPER_NO_CONTRACT; return (IS_UPPER_A_TO_Z(x) ? ((x) - 'A' + 'a') : (x)); } |
895 | |
896 | static bool MatchCharacter(CHAR c1, CHAR c2, MatchFlags flags) |
897 | { WRAPPER_NO_CONTRACT; return (flags & PARENT_TYPE::MF_CASE_INSENSITIVE) ? (DOWNCASE(c1) == DOWNCASE(c2)) : (c1 == c2); } |
898 | }; |
899 | |
900 | //------------------------------------------------------------------------------------------------------- |
901 | // Reads the next item from regex, recognizing special characters outlined in ItemTraitsBase. |
902 | |
903 | template <typename ITERATOR_TYPE> |
904 | typename CHARItemTraits<ITERATOR_TYPE>::Item |
905 | CHARItemTraits<ITERATOR_TYPE>::GetItem( |
906 | const RegexIterator& regex, |
907 | const RegexIterator& regexEnd, |
908 | MatchFlags flags) |
909 | { |
910 | WRAPPER_NO_CONTRACT; |
911 | |
912 | if (*regex == '\\') |
913 | { |
914 | const RegexIterator regexNext = regex+1; |
915 | if (regexNext == regexEnd) |
916 | return Item(PARENT_TYPE::CHARACTER, W('\\'), regexNext); |
917 | if (*regexNext == 'd') |
918 | return Item(PARENT_TYPE::DIGIT, regexNext+1); |
919 | if (*regexNext == 'w') |
920 | return Item(PARENT_TYPE::ALPHA, regexNext+1); |
921 | if (*regexNext == 's') |
922 | return Item(PARENT_TYPE::WHITESPACE, regexNext+1); |
923 | return Item(PARENT_TYPE::CHARACTER, *regexNext, regexNext+1); |
924 | } |
925 | else if (*regex == '.') |
926 | return Item(PARENT_TYPE::DOT, '.', regex+1); |
927 | else if (*regex == '^') |
928 | return Item(PARENT_TYPE::CARET, '^', regex+1); |
929 | else if (*regex == '$') |
930 | return Item(PARENT_TYPE::DOLLAR, '$', regex+1); |
931 | else if (*regex == '*') |
932 | return Item(PARENT_TYPE::STAR, '*', regex+1); |
933 | else if (*regex == '?') |
934 | return Item(PARENT_TYPE::QUESTION_MARK, '?', regex+1); |
935 | else if (*regex == '+') |
936 | return Item(PARENT_TYPE::PLUS, '+', regex+1); |
937 | else if (*regex == '(') |
938 | return Item(PARENT_TYPE::PAREN_OPEN, '(', regex+1); |
939 | else if (*regex == ')') |
940 | return Item(PARENT_TYPE::PAREN_CLOSE, ')', regex+1); |
941 | else |
942 | return Item(PARENT_TYPE::CHARACTER, *regex, regex + 1); |
943 | } |
944 | |
945 | //------------------------------------------------------------------------------------------------------- |
946 | // Returns true if the next character point to by input matches the character class described by c. |
947 | |
948 | template <typename ITERATOR_TYPE> |
949 | bool |
950 | CHARItemTraits<ITERATOR_TYPE>::MatchItem( |
951 | const Item& c, |
952 | const InputIterator& input, |
953 | const InputIterator& inputEnd, |
954 | MatchFlags flags) |
955 | { |
956 | WRAPPER_NO_CONTRACT; |
957 | |
958 | if (c.GetType() == PARENT_TYPE::DIGIT) |
959 | return *input >= W('0') && *input <= W('9'); |
960 | else if (c.GetType() == PARENT_TYPE::ALPHA) |
961 | return (*input >= W('a') && *input <= W('z')) || (*input >= W('A') && *input <= W('Z')); |
962 | else if (c.GetType() == PARENT_TYPE::WHITESPACE) |
963 | return *input == W(' ') || *input == W('\t'); |
964 | else |
965 | return c.GetType() == PARENT_TYPE::DOT || MatchCharacter(c.GetValue(), *input, flags); |
966 | } |
967 | |
968 | } /* namespace regex */ |
969 | } /* namespace clr */ |
970 | |
971 | #endif // _DEBUG |
972 | |
973 | #endif // _REGEX_BASE_H_ |
974 | |