1// Licensed to the .NET Foundation under one or more agreements.
2// The .NET Foundation licenses this file to you under the MIT license.
3// See the LICENSE file in the project root for more information.
4//
5// Provides basic interpreted regular expression matching. This is meant as a debugging tool,
6// and if regular expressions become necessary in a non-debug scenario great care should be
7// used to ensure that performance is not impaired, and a more thorough review of this could
8// would also be a good thing. This file does not include any concrete instantiations but
9// instead provides the basic building blocks. Some concrete instantiations can be found in
10// regex_util.h.
11//
12// NOTE: See code:clr::regex::RegExBase (below) for description of supported regex language.
13//
14// NOTE: we had to forego standard options such as tr1::regex
15// (http://en.wikipedia.org/wiki/Technical_Report_1#Regular_Expressions) and Microsoft's
16// internal GRETA regular expressions (http://toolbox/sites/987/default.aspx) because they
17// both rely heavily on the STL, which can not currently be used within the CLR.
18//
19// NOTE: If this becomes non-debug-only, then read the comment on WCHARItemTraits for what
20// what needs fixing.
21//
22
23//
24
25#ifndef _REGEX_BASE_H_
26#define _REGEX_BASE_H_
27
28// Forward declare namespace so that it is not debug-only (even if currently there is nothing
29// but debug-only code in the namespace). This enables a "using namespace clr;" line in a
30// header file without having to worry about whether or not it's in a debug-only block.
31namespace clr {}
32
33#ifdef _DEBUG
34
35#include "utilcode.h" // for string hash functions
36#include "sstring.h"
37
38namespace clr {
39namespace regex {
40
41// Implementation details. Code contained in any "imp" namespace should never be directly used
42// by clients of RegEx.
43namespace imp {
44
45 //===================================================================================================
46 // Helper for clr::regex::RegExBase. See class definition for clr::regex::RegExBase below for more
47 // information.
48
49 template <typename ITEM_TRAITS, typename GROUP_CONTAINER>
50 class RegExBaseHelper : protected ITEM_TRAITS
51 {
52 public:
53 typedef typename ITEM_TRAITS::RegexIterator RegexIterator;
54 typedef typename ITEM_TRAITS::InputIterator InputIterator;
55
56 typedef typename ITEM_TRAITS::MatchFlags MatchFlags;
57 static const MatchFlags DefaultMatchFlags = ITEM_TRAITS::DefaultMatchFlags;
58
59 // Arguments:
60 // regex : marks the start of the regular expression string.
61 // regexEnd : marks the end of the regular expression string.
62 // input : marks the start of the input string against which regex will be matched.
63 // inputEnd : marks the end of the input string.
64 // groups : recipient of regular expression groups.
65 //
66 // Returns true if the regular expression was successfully matched against the input string;
67 // otherwise false.
68
69 RegExBaseHelper(const RegexIterator& regex,
70 const RegexIterator& regexEnd,
71 const InputIterator& input,
72 const InputIterator& inputEnd,
73 GROUP_CONTAINER& groups,
74 const MatchFlags flags = DefaultMatchFlags);
75
76 // The main entrypoint to RegExBaseHelper, Match will attempt to match the regular expression
77 // defined by [regex,regexEnd) against the input defined by [input,inputEnd).
78 bool Match()
79 { WRAPPER_NO_CONTRACT; return DoMatch(m_regex, m_input); }
80
81 protected:
82 typedef typename ITEM_TRAITS::Item Item;
83 typedef typename ITEM_TRAITS::ItemType ItemType;
84
85 // Try to match regex at any point within input, starting with the first character and moving
86 // along one at a time until a match is found or the end of the input is encountered, whichever
87 // comes first.
88 bool DoMatch(
89 const RegexIterator& regex,
90 InputIterator input);
91
92 // Try to match regex starting exactly at input.
93 bool DoMatchHere(
94 const RegexIterator& regex,
95 const InputIterator& input);
96
97 // The function returns true if a match is found consisting of zero or more items c followed by a
98 // successful match of regex on the remaining input; otherwise false is returned. This is a
99 // conservative match, so it starts with trying to match zero items followed by regex,
100 // and will then try to match one item followed by regex.
101 bool DoMatchStar(
102 const Item& c,
103 const RegexIterator& regex,
104 InputIterator input);
105
106 // The function returns true if a match is found consisting of zero or more items c followed by a
107 // successful match of regex on the remaining input; otherwise false is returned. This is a
108 // greedy match, so it starts with trying to match as many items as it can followed by regex,
109 // and on failure will try again with one less items matched.
110 bool DoMatchStarEagerly(
111 const Item& c,
112 const RegexIterator& regex,
113 InputIterator input);
114
115 // Convenience method.
116 Item GetItem(
117 const RegexIterator &regex)
118 { WRAPPER_NO_CONTRACT; return ITEM_TRAITS::GetItem(regex, m_regexEnd, m_flags); }
119
120 // Convenience method.
121 bool MatchItem(
122 const Item& c,
123 const InputIterator& input)
124 { WRAPPER_NO_CONTRACT; return ITEM_TRAITS::MatchItem(c, input, m_inputEnd, m_flags); }
125
126 // Declared as protected to prevent direct instantiation.
127 RegExBaseHelper()
128 {}
129
130 RegexIterator m_regex;
131 RegexIterator m_regexEnd;
132 InputIterator m_input;
133 InputIterator m_inputEnd;
134 GROUP_CONTAINER& m_groups;
135 MatchFlags m_flags;
136 };
137
138 //---------------------------------------------------------------------------------------------------
139 // This method simply stores the end iterators for the regular expression and the input strings, as
140 // well as the group collection object and flags, and forwards the call to DoMatch.
141
142 template <typename ITEM_TRAITS, typename GROUP_CONTAINER>
143 RegExBaseHelper<ITEM_TRAITS, GROUP_CONTAINER>::RegExBaseHelper(
144 const RegexIterator& regex,
145 const RegexIterator& regexEnd,
146 const InputIterator& input,
147 const InputIterator& inputEnd,
148 GROUP_CONTAINER& groups,
149 const MatchFlags flags)
150 : m_regex(regex),
151 m_regexEnd(regexEnd),
152 m_input(input),
153 m_inputEnd(inputEnd),
154 m_groups(groups),
155 m_flags(flags)
156 { WRAPPER_NO_CONTRACT; }
157
158 //---------------------------------------------------------------------------------------------------
159 // This method checks if the regular expression starts with a caret, indicating that any match must
160 // be anchored at the start of the input string. If such a caret exists, one match attempt is made
161 // on the input starting with the first character and the result is returned. If the regex does not
162 // start with a caret, the method attempts to match against the input string, starting at the first
163 // character and moving one character over for each successive attempt, until a match is found or
164 // the end of the input is encountered, whichever comes first.
165
166 template <typename ITEM_TRAITS, typename GROUP_CONTAINER>
167 inline bool
168 RegExBaseHelper<ITEM_TRAITS, GROUP_CONTAINER>::DoMatch(
169 const RegexIterator& regex,
170 InputIterator input)
171 {
172 WRAPPER_NO_CONTRACT;
173
174 if (GetItem(regex).GetType() == ITEM_TRAITS::CARET)
175 { // Match must occur from the beginning of the line
176 m_groups.OpenGroup(input, m_inputEnd);
177 bool res = DoMatchHere(regex+1, input);
178 if (!res)
179 m_groups.CancelGroup();
180 return res;
181 }
182 else
183 { // Match can happen anywhere in the string
184 do
185 { // Attempt to match against each substring [x,inputEnd) for x = 0...inputEnd
186
187 // Begin the group that contains the entire match.
188 m_groups.OpenGroup(input, m_inputEnd);
189
190 if (DoMatchHere(regex, input))
191 { // Success. Note that the entire match group is closed elsewhere on a
192 // successful match.
193 return true;
194 }
195
196 // On failure, cancel the group so that it can be reopened on the new substring
197 m_groups.CancelGroup();
198 } while (input++ != m_inputEnd);
199 }
200
201 // No successful match found.
202 return false;
203 }
204
205 //-------------------------------------------------------------------------------------------------------
206 // This is the main loop, which handles grouping constructs, repetition directives (*, *?, +, +?), and
207 // EOL matches ($), delegating all character matching to ITEM_TRAITS::MatchItem
208 // The general algorithm is:
209 // 1. Get the next item.
210 // 2. If the item is a DOLLAR type, check to see if we're at the end of the retular expression and
211 // the input string, and if so return success.
212 // 3. If the item is a grouping construct, open or close the appropriate group and continue matching.
213 // On failure, roll back the grouping change so that subsequent attemts will have correct state.
214 // 4. Check to see if the item following the current is a repetition directive, and if so take the
215 // appropriate action.
216 // 5. Otherwise defer to ITEM_TRAITS::MatchItem and if successful continue to match the remaining
217 // regular expression and input string; otherwise return failure.
218
219 template <typename ITEM_TRAITS, typename GROUP_CONTAINER>
220 inline bool
221 RegExBaseHelper<ITEM_TRAITS, GROUP_CONTAINER>::DoMatchHere(
222 const RegexIterator& regex,
223 const InputIterator& input)
224 {
225 WRAPPER_NO_CONTRACT;
226
227 if (regex == m_regexEnd)
228 { // Reached the end of the regular expression without ever returning false,
229 // implying a successful match. Close the overall match group and return.
230 m_groups.CloseGroup(input);
231 return true;
232 }
233
234 Item c0 = GetItem(regex);
235 if (c0.GetType() == ITEM_TRAITS::DOLLAR && (c0.GetNext() == m_regexEnd))
236 { // Matches EOL if a '$' is encountered at the end of the input.
237 m_groups.CloseGroup(input);
238 // Success only if we're actually at the end of the input string.
239 return input == m_inputEnd;
240 }
241 if (c0.GetType() == ITEM_TRAITS::PAREN_OPEN)
242 { // Encountered an open parenthesis ('('); open a new grouping.
243 m_groups.OpenGroup(input, m_inputEnd);
244 bool res = DoMatchHere(c0.GetNext(), input);
245 if (!res)
246 { // If a match fails, there could be further attempts (such as if
247 // there is an active repetition matching frame beneath us), so
248 // need to cancel the group we just opened so that the grouping
249 // structure remains consistent.
250 m_groups.CancelGroup();
251 }
252 return res;
253 }
254 if (c0.GetType() == ITEM_TRAITS::PAREN_CLOSE)
255 { // Close the most recent open grouping.
256 COUNT_T i = m_groups.CloseGroup(input);
257 bool res = DoMatchHere(c0.GetNext(), input);
258 if (!res)
259 { // For the same reasons as the need to cancel an opened group
260 // explained above, we need to reopen the closed group if a
261 // match fails.
262 m_groups.ReopenGroup(i, m_inputEnd);
263 }
264 return res;
265 }
266
267 if (c0.GetNext() != m_regexEnd)
268 { // If there is another item in the regex string following the current one, get
269 // it to see if it is a repetition matching directive.
270 Item c1 = GetItem(c0.GetNext());
271 if (c1.GetType() == ITEM_TRAITS::STAR)
272 { // '*' matching directive encountered
273 if (c1.GetNext() != m_regexEnd)
274 {
275 Item c2 = GetItem(c1.GetNext());
276 if (c2.GetType() == ITEM_TRAITS::QUESTION_MARK)
277 { // conservative matching semantics requested
278 return DoMatchStar(c0, c2.GetNext(), input);
279 }
280 }
281 // Eager matching
282 return DoMatchStarEagerly(c0, c1.GetNext(), input);
283 }
284 if (c1.GetType() == ITEM_TRAITS::PLUS)
285 { // '+' matching directive encountered.
286 if (c1.GetNext() != m_regexEnd)
287 {
288 Item c2 = GetItem(c1.GetNext());
289 if (c2.GetType() == ITEM_TRAITS::QUESTION_MARK)
290 { // conservative matching semantics requested
291 return MatchItem(c0, input) && DoMatchStar(c0, c2.GetNext(), input+1);
292 }
293 }
294 // Eager matching
295 return MatchItem(c0, input) && DoMatchStarEagerly(c0, c1.GetNext(), input+1);
296 }
297 if (c1.GetType() == ITEM_TRAITS::QUESTION_MARK)
298 { // '?' matching directive encountered
299 return (MatchItem(c0, input) && DoMatchHere(c1.GetNext(), input+1)) || DoMatchHere(c1.GetNext(), input);
300 }
301 }
302
303 // No special matching semantics encountered, delegate the matching to ITEM_TRAITS::MatchItem
304 return MatchItem(c0, input) && DoMatchHere(c0.GetNext(), input+1);
305 }
306
307 //-------------------------------------------------------------------------------------------------------
308 // Conservative '*' repetition matching. This attempts to match zero items c followed by a match of
309 // regex. If this fails, attempt to match one item c followed by a match of regex. Repeat until item c
310 // does not match or a successful match is found.
311
312 template <typename ITEM_TRAITS, typename GROUP_CONTAINER>
313 inline bool
314 RegExBaseHelper<ITEM_TRAITS, GROUP_CONTAINER>::DoMatchStar(
315 const Item& c,
316 const RegexIterator& regex,
317 InputIterator input)
318 {
319 WRAPPER_NO_CONTRACT;
320 CONSISTENCY_CHECK(input != m_inputEnd);
321
322 do {
323 if (DoMatchHere(regex, input))
324 { // A successful match is found!
325 return true;
326 }
327 // No successful match, so try to match one more item and then attempt to match regex on the
328 // remaining input.
329 } while (input != m_inputEnd && MatchItem(c, input++));
330 return false;
331 }
332
333 //-------------------------------------------------------------------------------------------------------
334 // Similar to DoMatchStar above, except this algorithm matches as many items c as possible first followed
335 // by regex on the remaining input, and on failure tries again with a match against one less item c
336 // followed by regex on the remaining input, and repeats until there are no items c remaining to match
337 // and the zero item match followed by a match of regex on the entire remaining input fails. If any of
338 // the match attempts succeed, return success.
339
340 template <typename ITEM_TRAITS, typename GROUP_CONTAINER>
341 inline bool
342 RegExBaseHelper<ITEM_TRAITS, GROUP_CONTAINER>::DoMatchStarEagerly(
343 const Item& c,
344 const RegexIterator& regex,
345 InputIterator input)
346 {
347 WRAPPER_NO_CONTRACT;
348
349 // Make sure we keep a hold of how far back we can unwind.
350 InputIterator inputOrig = input;
351
352 // First, determine the maximum number of matches against item c.
353 while (input != m_inputEnd && MatchItem(c, input))
354 {
355 ++input;
356 }
357
358 do
359 { // Work backwards from the maximum number of matches of item c until a match is found
360 // or until we have backed right up to the starting value of input (saved in inputOrig),
361 // at which time we admit failure.
362 if (DoMatchHere(regex, input))
363 return true;
364 } while (inputOrig != input--);
365 return false;
366 }
367
368} // namespace imp
369
370//=======================================================================================================
371// Represents a matched group using iterators to denote the string contained by [Begin(),End()).
372
373template<typename INPUT_ITERATOR>
374class Group
375{
376public:
377 typedef INPUT_ITERATOR InputIterator;
378
379 //
380 // Functions for accessing group properties
381 //
382
383 // Returns the iterator indicating the start of the group
384 const InputIterator& Begin() const
385 { LIMITED_METHOD_CONTRACT; return m_begin; }
386
387 // Returns the iterator indicating the first non-member of the group
388 const InputIterator& End() const
389 { LIMITED_METHOD_CONTRACT; return m_end; }
390
391 // It is possible that m_end - m_begin could be greater than the maximum of COUNT_T. m_end and
392 // m_begin are the end and start of a string, so is entirely unlikely to overflow a COUNT_T.
393 // Conbined with the fact that this is debug-only code, opting not to replace all COUNT_T
394 // uses with SIZE_T.
395 COUNT_T Length() const
396 { WRAPPER_NO_CONTRACT; return static_cast<COUNT_T>(m_end - m_begin); }
397
398 //
399 // Functions used by RegExBaseHelper to create grouping constructs.
400 //
401
402 Group()
403 : m_isClosed(false), m_begin(), m_end()
404 { WRAPPER_NO_CONTRACT; }
405
406 Group(const InputIterator& start, const InputIterator& end, bool isClosed = false)
407 : m_isClosed(isClosed), m_begin(start), m_end(end)
408 { WRAPPER_NO_CONTRACT; }
409
410 void SetBegin(const InputIterator& start)
411 { WRAPPER_NO_CONTRACT; m_begin = start; }
412
413 void SetEnd(const InputIterator& end)
414 { WRAPPER_NO_CONTRACT; m_end = end; }
415
416 bool IsClosed() const
417 { LIMITED_METHOD_CONTRACT; return m_isClosed; }
418
419 void SetIsClosed(bool isClosed)
420 { WRAPPER_NO_CONTRACT; m_isClosed = isClosed; }
421
422protected:
423 bool m_isClosed;
424 InputIterator m_begin;
425 InputIterator m_end;
426};
427
428//=======================================================================================================
429// Represents a generic container of groups, defaulting to using Group<INPUT_ITERATOR> as its element
430// type. This container satisfies the method requrements of RegExBase. When a match is successful, the
431// match groups may be accessed using the index operator or the iterators definin the matched groups
432// [Begin(), End()).
433
434template <typename INPUT_ITERATOR, typename GROUP_TYPE = Group<INPUT_ITERATOR> >
435class GroupContainer
436{
437public:
438 typedef typename SArray<GROUP_TYPE>::Iterator Iterator;
439
440 //
441 // Functions for enumerating groups
442 //
443
444 GROUP_TYPE & operator[](COUNT_T idx)
445 {
446 WRAPPER_NO_CONTRACT;
447 CONSISTENCY_CHECK(((COUNT_T)(COUNT_T)idx) == idx);
448 return m_array[idx];
449 }
450
451 // Returns an iterator to the first matched group (which is always the string for the
452 // entire successfully matched string. Specific groups start at Begin()+1 and continue
453 // to End()-1.
454 Iterator Begin()
455 { WRAPPER_NO_CONTRACT; return m_array.Begin(); }
456
457 // Returns the first invalid iterator value.
458 Iterator End()
459 { WRAPPER_NO_CONTRACT; return m_array.End(); }
460
461 //
462 COUNT_T Count() const
463 { WRAPPER_NO_CONTRACT; return m_array.GetCount(); }
464
465 //
466 // Functions used by RegExBaseHelper to create grouping constructs.
467 //
468
469 // Note: OpenGroup takes an end iterator so that the group will have a valid (if possibly
470 // incorrect) endpoint in the case that the regular expression has unbalanced grouping
471 // parentheses.
472 void OpenGroup(const INPUT_ITERATOR& start, const INPUT_ITERATOR& end)
473 { WRAPPER_NO_CONTRACT; m_array.Append(GROUP_TYPE(start, end, false)); }
474
475 COUNT_T CloseGroup(const INPUT_ITERATOR& end);
476
477 void ReopenGroup(COUNT_T i, const INPUT_ITERATOR& end);
478
479 void CancelGroup()
480 { WRAPPER_NO_CONTRACT; m_array.Delete(m_array.End() - 1); }
481
482private:
483 SArray<GROUP_TYPE> m_array;
484};
485
486//-------------------------------------------------------------------------------------------------------
487// Works backwards from the most recently created group looking for an open group to close. Returns
488// the index of the group that was closed, which is used in the event that a group needs to be
489// reopened.
490
491template <typename INPUT_ITERATOR, typename GROUP_TYPE>
492COUNT_T
493GroupContainer<INPUT_ITERATOR, GROUP_TYPE>::CloseGroup(
494 const INPUT_ITERATOR& end)
495{
496 WRAPPER_NO_CONTRACT;
497
498 for (COUNT_T i = (COUNT_T)Count(); i > 0; --i)
499 {
500 if (!m_array[i-1].IsClosed())
501 {
502 m_array[i-1].SetEnd(end);
503 m_array[i-1].SetIsClosed(true);
504 return i-1;
505 }
506 }
507
508 _ASSERTE(!"Unmatched grouping constructs!");
509 return 0;
510}
511
512//-------------------------------------------------------------------------------------------------------
513// Reopen a group at the given index, using 'end' to overwrite the current end.
514
515template <typename INPUT_ITERATOR, typename GROUP_TYPE>
516void
517GroupContainer<INPUT_ITERATOR, GROUP_TYPE>::ReopenGroup(
518 COUNT_T i,
519 const INPUT_ITERATOR& end)
520{
521 WRAPPER_NO_CONTRACT;
522 CONSISTENCY_CHECK(i > 0 && i < Count());
523
524 if (i > 0 && i < Count())
525 {
526 m_array[i].SetEnd(end);
527 m_array[i].SetIsClosed(false);
528 }
529}
530
531//=======================================================================================================
532// Empty group container that satisfies the method requirements of RegExBase but has empty bodies. This
533// allows for non-allocating matches when grouping is not required.
534
535template <typename INPUT_ITERATOR>
536class NullGroupContainer
537{
538public:
539 void OpenGroup(INPUT_ITERATOR, INPUT_ITERATOR) {}
540 COUNT_T CloseGroup(INPUT_ITERATOR) { return 0; }
541 void ReopenGroup(COUNT_T, INPUT_ITERATOR) {}
542 void CancelGroup() {}
543};
544
545//=======================================================================================================
546// This mini-implementation of regular expression matching supports the
547// following constructs:
548// ^ matches the beginning of the input string
549// $ matches the end of the input string
550// * matches zero or more occurrences of the previous item eagerly
551// *? matches zero or more occurrences of the previous item conservatively
552// + matches 1 or more occurrences of the previous item eagerly
553// +? matches 1 or more occurrences of the previous item conservatively
554// ? matches 0 or 1 occurences of the previous item
555// ( starts a grouping
556// ) ends a grouping
557//
558// IMPORTANT: These are just anchoring and grouping constructs. See the definition for ItemTraitsBase
559// below for information on the default character classes that are supported. (The intent of
560// this separation is to allow customization of the character classes where required.)
561
562// ITEM_TRAITS provides traits for individual tokens in a regular expression, as well as a mechanism for
563// matching said individual components with the target string. RegexBase derives from ITEM_TRAITS in a
564// protected fashion, and is responsible for providing the following:
565// 1. "RegexIterator" typedef
566// Used as an iterator into the regular expression, and used as arguments to indicate the start
567// and the end of the regular expression string.
568// 2. "InputIterator" typedef
569// Used as an iterator into the input string, and used as arguments to indicate the start
570// and the end of the input string.
571// (NOTE: RegexIterator and InputIterator are often typedef'ed to be the same thing.)
572// 3. "Item" typedef.
573// This will be used with methods GetItem and MatchItem (see below). Item must
574// define the the following methods:
575// ItemType GetType() : returns the type of the item. See below for explanation of ItemType
576// const RegexIterator& GetNext() : iterator pointing to the start of the next item.
577// 4. "MatchFlags" typedef, and "static const DefaultMatchFlags" value.
578// Provided for calls to "Match" and "Matches", and passed on to calls "GetItem" and "MatchItem".
579// 5. enum ItemType
580// Defines the following minimum values:
581// DOT
582// CARET
583// DOLLAR
584// STAR
585// QUESTION_MARK
586// PLUS
587// PAREN_OPEN
588// PAREN_CLOSE
589// ItemType may include more values, and may even choose to ignore the above enum types, all of
590// which must be recognized by GetItem and MatchItem (see below).
591// 6. static Item GetItem(const RegexIterator& regex,
592// const RegexIterator& regexEnd,
593// const MatchFlags& flags)
594// This method takes a regular expression iterator and returns the next regular expression
595// element (Item) pointed to by the iterator.
596// 7. static bool MatchItem(const Item& c,
597// const InputIterator& input,
598// const InputIterator& inputEnd,
599// const MatchFlags &flags)
600
601// GROUP_CONTAINER provides functionality for keeping track of regular expression groups. This is a generic
602// argument to Match, and the type of the object must support the following methods:
603// 1. void OpenGroup(const InputIterator& start, const InputIterator& end);
604// Called when a PAREN_OPEN item is encountered.
605// 2. COUNT_T CloseGroup(const InputIterator& end);
606// Called when a PAREN_CLOSE item is encountered. Returns the index of the group that was closed.
607// 3. void ReopenGroup(COUNT_T i, const InputIterator& end);
608// Called when a match following a call to CloseGroup fails, essentially requesting a rollback
609// of the call to CloseGroup.
610// 4. void CancelGroup();
611// Called when a match following a call to OpenGroup fails, essentially requesting a rollback
612// of the call to OpenGroup.
613
614template <typename ITEM_TRAITS>
615class RegExBase : public ITEM_TRAITS
616{
617public:
618 typedef typename ITEM_TRAITS::RegexIterator RegexIterator;
619 typedef typename ITEM_TRAITS::InputIterator InputIterator;
620
621 // This is a convenience typedef that allows a caller to easily declare a grouping container
622 // to be passed to a call to Match. An example would be (see regex_util.h for a definition of
623 // SStringRegEx):
624 //
625 // SString input(SL"Simmons");
626 // SStringRegEx::GroupingContainer container;
627 // if (SStringRegEx::Match(SL"(Sim+on)", input, container)) {
628 // printf("%S", container[1].GetSString(input).GetUnicode());
629 // }
630 //
631 typedef GroupContainer<InputIterator, Group<InputIterator> > GroupingContainer;
632
633 // Pulls down the typedef for MatchFlags and initialized a static representing the default flags.
634 typedef typename ITEM_TRAITS::MatchFlags MatchFlags;
635 static const MatchFlags DefaultMatchFlags = ITEM_TRAITS::DefaultMatchFlags;
636
637 template <typename GROUP_CONTAINER>
638 static bool Match(RegexIterator regex,
639 RegexIterator regexEnd,
640 InputIterator input,
641 InputIterator inputEnd,
642 GROUP_CONTAINER& groups,
643 MatchFlags flags = DefaultMatchFlags)
644 {
645 imp::RegExBaseHelper<ITEM_TRAITS, GROUP_CONTAINER>
646 re(regex, regexEnd, input, inputEnd, groups, flags);
647 return re.Match();
648 }
649
650 static bool Matches(RegexIterator regex,
651 RegexIterator regexEnd,
652 InputIterator input,
653 InputIterator inputEnd,
654 MatchFlags flags = DefaultMatchFlags)
655 {
656 NullGroupContainer<InputIterator> ngc;
657 return Match(regex, regexEnd, input, inputEnd, ngc, flags);
658 }
659};
660
661//=======================================================================================================
662// In addition to requirements specified on RegExBase, StandardItemTraits provides the following
663// additinal regular expression item types.
664// c matches any literal character c
665// . matches any single character
666// \d any literal digit character
667// \w any alpha character
668// \s any whitespace character
669//
670// Child types of ItemTraitsBase must implement GetItem and MatchItem (see below for full
671// signature requirements). Current child type implementations permit a backslash ('\') to escape
672// special characters ('.', '$', '*', etc.) and allow them to be interpreted as literal characters.
673//
674// This type describes a particular behaviour, but must be subtyped for the particular target character
675// needed, and GetItem and MatchItem must be implemented.
676//
677
678template <typename ITERATOR_TYPE, typename CHAR_TYPE>
679class ItemTraitsBase
680{
681public:
682 typedef ITERATOR_TYPE RegexIterator;
683 typedef ITERATOR_TYPE InputIterator;
684
685 enum MatchFlags
686 {
687 MF_NONE = 0x00,
688 MF_CASE_INSENSITIVE = 0x01 // Match character literals as case insensitive.
689 };
690
691 static const MatchFlags DefaultMatchFlags = MF_NONE;
692
693protected:
694 ItemTraitsBase()
695 {}
696
697 enum ItemType
698 {
699 // REQUIRED, as described in RegExBase
700 CARET,
701 DOLLAR,
702 STAR,
703 QUESTION_MARK,
704 PLUS,
705 PAREN_OPEN,
706 PAREN_CLOSE,
707 // ADDITIONAL
708 DOT, // any literal character
709 DIGIT, // any digit
710 ALPHA, // any alpha character, upper or lower case
711 WHITESPACE, // any whitespace character
712 NON_WHITESPACE, // any non-whitespace character
713 CHARACTER, // a specific literal character
714 };
715
716 class Item
717 {
718 public:
719 Item(ItemType type, CHAR_TYPE val, const RegexIterator& next)
720 : m_type(type), m_val(val), m_next(next)
721 { WRAPPER_NO_CONTRACT; }
722
723 Item(ItemType type, const RegexIterator& next)
724 : m_type(type), m_val(0), m_next(next)
725 { WRAPPER_NO_CONTRACT; }
726
727 ItemType GetType() const
728 { LIMITED_METHOD_CONTRACT; return m_type; }
729
730 const RegexIterator& GetNext() const
731 { LIMITED_METHOD_CONTRACT; return m_next; }
732
733 CHAR_TYPE GetValue() const
734 { LIMITED_METHOD_CONTRACT; return m_val; }
735
736 protected:
737 ItemType m_type;
738 CHAR_TYPE m_val;
739 RegexIterator m_next;
740 };
741
742 // All deriving types must add the following methods:
743 // static Item GetItem(const RegexIterator& regex, const RegexIterator& regexEnd);
744 // static bool MatchItem(const Item& c, const InputIterator& input, const InputIterator& inputEnd);
745};
746
747//=======================================================================================================
748// Implements ItemTraitsBase, provides matching for UNICODE characters.
749//
750// !!!IMPORTANT!!!
751// This is not a complete unicode implementation - only the equivalent of ASCII alpha characters are
752// consider to be part of the alpha set, and this is also the only set on which case insensitive
753// operations will correctly work. If RegEx is moved out of DEBUG ONLY, then this will have to be fixed
754// to properly address these issues.
755// !!!IMPORTANT!!!
756
757template <typename ITERATOR_TYPE>
758class WCHARItemTraits : public ItemTraitsBase<ITERATOR_TYPE, WCHAR>
759{
760public:
761 typedef ItemTraitsBase<ITERATOR_TYPE, WCHAR> PARENT_TYPE;
762 typedef typename PARENT_TYPE::RegexIterator RegexIterator;
763 typedef typename PARENT_TYPE::InputIterator InputIterator;
764 typedef typename PARENT_TYPE::Item Item;
765 typedef typename PARENT_TYPE::MatchFlags MatchFlags;
766
767 static Item GetItem(const RegexIterator& regex, const RegexIterator& regexEnd, MatchFlags flags);
768 static bool MatchItem(const Item& c, const InputIterator& input, const InputIterator& inputEnd, MatchFlags flags);
769
770protected:
771 WCHARItemTraits()
772 {}
773
774private:
775 static inline bool IS_UPPER_A_TO_Z(WCHAR x)
776 { WRAPPER_NO_CONTRACT; return (((x) >= W('A')) && ((x) <= W('Z'))); }
777
778 static inline bool IS_LOWER_A_TO_Z(WCHAR x)
779 { WRAPPER_NO_CONTRACT; return (((x) >= W('a')) && ((x) <= W('z'))); }
780
781 static inline WCHAR UPCASE(WCHAR x)
782 { WRAPPER_NO_CONTRACT; return (IS_LOWER_A_TO_Z(x) ? ((x) - W('a') + W('A')) : (x)); }
783
784 static inline WCHAR DOWNCASE(WCHAR x)
785 { WRAPPER_NO_CONTRACT; return (IS_UPPER_A_TO_Z(x) ? ((x) - W('A') + W('a')) : (x)); }
786
787 static bool MatchCharacter(WCHAR c1, WCHAR c2, MatchFlags flags)
788 { WRAPPER_NO_CONTRACT; return (flags & PARENT_TYPE::MF_CASE_INSENSITIVE) ? (DOWNCASE(c1) == DOWNCASE(c2)) : (c1 == c2); }
789};
790
791//-------------------------------------------------------------------------------------------------------
792// Reads the next item from regex, recognizing special characters outlined in ItemTraitsBase.
793
794template <typename ITERATOR_TYPE>
795typename WCHARItemTraits<ITERATOR_TYPE>::Item
796WCHARItemTraits<ITERATOR_TYPE>::GetItem(
797 const RegexIterator& regex,
798 const RegexIterator& regexEnd,
799 MatchFlags flags)
800{
801 WRAPPER_NO_CONTRACT;
802
803 if (*regex == W('\\'))
804 {
805 const RegexIterator regexNext = regex+1;
806 if (regexNext == regexEnd)
807 return Item(PARENT_TYPE::CHARACTER, W('\\'), regexNext);
808 if (*regexNext == W('d'))
809 return Item(PARENT_TYPE::DIGIT, regexNext+1);
810 if (*regexNext == W('w'))
811 return Item(PARENT_TYPE::ALPHA, regexNext+1);
812 if (*regexNext == W('s'))
813 return Item(PARENT_TYPE::WHITESPACE, regexNext+1);
814 if (*regexNext == W('S'))
815 return Item(PARENT_TYPE::NON_WHITESPACE, regexNext+1);
816 return Item(PARENT_TYPE::CHARACTER, *regexNext, regexNext+1);
817 }
818 else if (*regex == W('.'))
819 return Item(PARENT_TYPE::DOT, W('.'), regex+1);
820 else if (*regex == W('^'))
821 return Item(PARENT_TYPE::CARET, W('^'), regex+1);
822 else if (*regex == W('$'))
823 return Item(PARENT_TYPE::DOLLAR, W('$'), regex+1);
824 else if (*regex == W('*'))
825 return Item(PARENT_TYPE::STAR, W('*'), regex+1);
826 else if (*regex == W('?'))
827 return Item(PARENT_TYPE::QUESTION_MARK, W('?'), regex+1);
828 else if (*regex == W('+'))
829 return Item(PARENT_TYPE::PLUS, W('+'), regex+1);
830 else if (*regex == W('('))
831 return Item(PARENT_TYPE::PAREN_OPEN, W('('), regex+1);
832 else if (*regex == W(')'))
833 return Item(PARENT_TYPE::PAREN_CLOSE, W(')'), regex+1);
834 else
835 return Item(PARENT_TYPE::CHARACTER, *regex, regex + 1);
836}
837
838//-------------------------------------------------------------------------------------------------------
839// Returns true if the next character point to by input matches the character class described by c.
840
841template <typename ITERATOR_TYPE>
842bool
843WCHARItemTraits<ITERATOR_TYPE>::MatchItem(
844 const Item& c,
845 const InputIterator& input,
846 const InputIterator& inputEnd,
847 MatchFlags flags)
848{
849 WRAPPER_NO_CONTRACT;
850
851 if (c.GetType() == PARENT_TYPE::DIGIT)
852 return *input >= W('0') && *input <= W('9');
853 else if (c.GetType() == PARENT_TYPE::ALPHA)
854 return (*input >= W('a') && *input <= W('z')) || (*input >= W('A') && *input <= W('Z'));
855 else if (c.GetType() == PARENT_TYPE::WHITESPACE)
856 return *input == W(' ') || *input == W('\t');
857 else if (c.GetType() == PARENT_TYPE::NON_WHITESPACE)
858 return !(*input == W(' ') || *input == W('\t'));
859 else
860 return c.GetType() == PARENT_TYPE::DOT || MatchCharacter(c.GetValue(), *input, flags);
861}
862
863//=======================================================================================================
864// Implements ItemTraitsBase, provides matching for ASCII (*not* UTF8) characters.
865
866template <typename ITERATOR_TYPE>
867class CHARItemTraits : public ItemTraitsBase<ITERATOR_TYPE, CHAR>
868{
869public:
870 typedef ItemTraitsBase<ITERATOR_TYPE, CHAR> PARENT_TYPE;
871 typedef typename PARENT_TYPE::RegexIterator RegexIterator;
872 typedef typename PARENT_TYPE::InputIterator InputIterator;
873 typedef typename PARENT_TYPE::Item Item;
874 typedef typename PARENT_TYPE::MatchFlags MatchFlags;
875
876 static Item GetItem(const RegexIterator& regex, const RegexIterator& regexEnd, MatchFlags flags);
877 static bool MatchItem(const Item& c, const InputIterator& input, const InputIterator& inputEnd, MatchFlags flags);
878
879protected:
880 CHARItemTraits()
881 {}
882
883private:
884 static inline bool IS_UPPER_A_TO_Z(CHAR x)
885 { WRAPPER_NO_CONTRACT; return (((x) >= 'A') && ((x) <= 'Z')); }
886
887 static inline bool IS_LOWER_A_TO_Z(CHAR x)
888 { WRAPPER_NO_CONTRACT; return (((x) >= 'a') && ((x) <= 'z')); }
889
890 static inline CHAR UPCASE(CHAR x)
891 { WRAPPER_NO_CONTRACT; return (IS_LOWER_A_TO_Z(x) ? ((x) - 'a' + 'A') : (x)); }
892
893 static inline CHAR DOWNCASE(CHAR x)
894 { WRAPPER_NO_CONTRACT; return (IS_UPPER_A_TO_Z(x) ? ((x) - 'A' + 'a') : (x)); }
895
896 static bool MatchCharacter(CHAR c1, CHAR c2, MatchFlags flags)
897 { WRAPPER_NO_CONTRACT; return (flags & PARENT_TYPE::MF_CASE_INSENSITIVE) ? (DOWNCASE(c1) == DOWNCASE(c2)) : (c1 == c2); }
898};
899
900//-------------------------------------------------------------------------------------------------------
901// Reads the next item from regex, recognizing special characters outlined in ItemTraitsBase.
902
903template <typename ITERATOR_TYPE>
904typename CHARItemTraits<ITERATOR_TYPE>::Item
905CHARItemTraits<ITERATOR_TYPE>::GetItem(
906 const RegexIterator& regex,
907 const RegexIterator& regexEnd,
908 MatchFlags flags)
909{
910 WRAPPER_NO_CONTRACT;
911
912 if (*regex == '\\')
913 {
914 const RegexIterator regexNext = regex+1;
915 if (regexNext == regexEnd)
916 return Item(PARENT_TYPE::CHARACTER, W('\\'), regexNext);
917 if (*regexNext == 'd')
918 return Item(PARENT_TYPE::DIGIT, regexNext+1);
919 if (*regexNext == 'w')
920 return Item(PARENT_TYPE::ALPHA, regexNext+1);
921 if (*regexNext == 's')
922 return Item(PARENT_TYPE::WHITESPACE, regexNext+1);
923 return Item(PARENT_TYPE::CHARACTER, *regexNext, regexNext+1);
924 }
925 else if (*regex == '.')
926 return Item(PARENT_TYPE::DOT, '.', regex+1);
927 else if (*regex == '^')
928 return Item(PARENT_TYPE::CARET, '^', regex+1);
929 else if (*regex == '$')
930 return Item(PARENT_TYPE::DOLLAR, '$', regex+1);
931 else if (*regex == '*')
932 return Item(PARENT_TYPE::STAR, '*', regex+1);
933 else if (*regex == '?')
934 return Item(PARENT_TYPE::QUESTION_MARK, '?', regex+1);
935 else if (*regex == '+')
936 return Item(PARENT_TYPE::PLUS, '+', regex+1);
937 else if (*regex == '(')
938 return Item(PARENT_TYPE::PAREN_OPEN, '(', regex+1);
939 else if (*regex == ')')
940 return Item(PARENT_TYPE::PAREN_CLOSE, ')', regex+1);
941 else
942 return Item(PARENT_TYPE::CHARACTER, *regex, regex + 1);
943}
944
945//-------------------------------------------------------------------------------------------------------
946// Returns true if the next character point to by input matches the character class described by c.
947
948template <typename ITERATOR_TYPE>
949bool
950CHARItemTraits<ITERATOR_TYPE>::MatchItem(
951 const Item& c,
952 const InputIterator& input,
953 const InputIterator& inputEnd,
954 MatchFlags flags)
955{
956 WRAPPER_NO_CONTRACT;
957
958 if (c.GetType() == PARENT_TYPE::DIGIT)
959 return *input >= W('0') && *input <= W('9');
960 else if (c.GetType() == PARENT_TYPE::ALPHA)
961 return (*input >= W('a') && *input <= W('z')) || (*input >= W('A') && *input <= W('Z'));
962 else if (c.GetType() == PARENT_TYPE::WHITESPACE)
963 return *input == W(' ') || *input == W('\t');
964 else
965 return c.GetType() == PARENT_TYPE::DOT || MatchCharacter(c.GetValue(), *input, flags);
966}
967
968} /* namespace regex */
969} /* namespace clr */
970
971#endif // _DEBUG
972
973#endif // _REGEX_BASE_H_
974