1/*
2 *
3 * Copyright (c) 2004
4 * John Maddock
5 *
6 * Use, modification and distribution are subject to the
7 * Boost Software License, Version 1.0. (See accompanying file
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9 *
10 */
11
12 /*
13 * LOCATION: see http://www.boost.org for most recent version.
 * FILE basic_regex_parser.hpp
15 * VERSION see <boost/version.hpp>
16 * DESCRIPTION: Declares template class basic_regex_parser.
17 */
18
19#ifndef BOOST_REGEX_V4_BASIC_REGEX_PARSER_HPP
20#define BOOST_REGEX_V4_BASIC_REGEX_PARSER_HPP
21
22#ifdef BOOST_MSVC
23#pragma warning(push)
24#pragma warning(disable: 4103)
25#endif
26#ifdef BOOST_HAS_ABI_HEADERS
27# include BOOST_ABI_PREFIX
28#endif
29#ifdef BOOST_MSVC
30#pragma warning(pop)
31#endif
32
33namespace boost{
34namespace BOOST_REGEX_DETAIL_NS{
35
36#ifdef BOOST_MSVC
37#pragma warning(push)
38#pragma warning(disable:4244 4800)
39#endif
40
41inline boost::intmax_t umax(mpl::false_ const&)
42{
43 // Get out clause here, just in case numeric_limits is unspecialized:
44 return std::numeric_limits<boost::intmax_t>::is_specialized ? (std::numeric_limits<boost::intmax_t>::max)() : INT_MAX;
45}
46inline boost::intmax_t umax(mpl::true_ const&)
47{
48 return (std::numeric_limits<std::size_t>::max)();
49}
50
51inline boost::intmax_t umax()
52{
53 return umax(mpl::bool_<std::numeric_limits<boost::intmax_t>::digits >= std::numeric_limits<std::size_t>::digits>());
54}
55
56template <class charT, class traits>
57class basic_regex_parser : public basic_regex_creator<charT, traits>
58{
59public:
60 basic_regex_parser(regex_data<charT, traits>* data);
61 void parse(const charT* p1, const charT* p2, unsigned flags);
62 void fail(regex_constants::error_type error_code, std::ptrdiff_t position);
63 void fail(regex_constants::error_type error_code, std::ptrdiff_t position, std::string message, std::ptrdiff_t start_pos);
64 void fail(regex_constants::error_type error_code, std::ptrdiff_t position, const std::string& message)
65 {
66 fail(error_code, position, message, position);
67 }
68
69 bool parse_all();
70 bool parse_basic();
71 bool parse_extended();
72 bool parse_literal();
73 bool parse_open_paren();
74 bool parse_basic_escape();
75 bool parse_extended_escape();
76 bool parse_match_any();
77 bool parse_repeat(std::size_t low = 0, std::size_t high = (std::numeric_limits<std::size_t>::max)());
78 bool parse_repeat_range(bool isbasic);
79 bool parse_alt();
80 bool parse_set();
81 bool parse_backref();
82 void parse_set_literal(basic_char_set<charT, traits>& char_set);
83 bool parse_inner_set(basic_char_set<charT, traits>& char_set);
84 bool parse_QE();
85 bool parse_perl_extension();
86 bool parse_perl_verb();
87 bool match_verb(const char*);
88 bool add_emacs_code(bool negate);
89 bool unwind_alts(std::ptrdiff_t last_paren_start);
90 digraph<charT> get_next_set_literal(basic_char_set<charT, traits>& char_set);
91 charT unescape_character();
92 regex_constants::syntax_option_type parse_options();
93
94private:
95 typedef bool (basic_regex_parser::*parser_proc_type)();
96 typedef typename traits::string_type string_type;
97 typedef typename traits::char_class_type char_class_type;
98 parser_proc_type m_parser_proc; // the main parser to use
99 const charT* m_base; // the start of the string being parsed
100 const charT* m_end; // the end of the string being parsed
101 const charT* m_position; // our current parser position
102 unsigned m_mark_count; // how many sub-expressions we have
103 int m_mark_reset; // used to indicate that we're inside a (?|...) block.
104 unsigned m_max_mark; // largest mark count seen inside a (?|...) block.
105 std::ptrdiff_t m_paren_start; // where the last seen ')' began (where repeats are inserted).
106 std::ptrdiff_t m_alt_insert_point; // where to insert the next alternative
107 bool m_has_case_change; // true if somewhere in the current block the case has changed
108#if defined(BOOST_MSVC) && defined(_M_IX86)
109 // This is an ugly warning suppression workaround (for warnings *inside* std::vector
110 // that can not otherwise be suppressed)...
111 BOOST_STATIC_ASSERT(sizeof(long) >= sizeof(void*));
112 std::vector<long> m_alt_jumps; // list of alternative in the current scope.
113#else
114 std::vector<std::ptrdiff_t> m_alt_jumps; // list of alternative in the current scope.
115#endif
116
117 basic_regex_parser& operator=(const basic_regex_parser&);
118 basic_regex_parser(const basic_regex_parser&);
119};
120
121template <class charT, class traits>
122basic_regex_parser<charT, traits>::basic_regex_parser(regex_data<charT, traits>* data)
123 : basic_regex_creator<charT, traits>(data), m_mark_count(0), m_mark_reset(-1), m_max_mark(0), m_paren_start(0), m_alt_insert_point(0), m_has_case_change(false)
124{
125}
126
127template <class charT, class traits>
128void basic_regex_parser<charT, traits>::parse(const charT* p1, const charT* p2, unsigned l_flags)
129{
130 // pass l_flags on to base class:
131 this->init(l_flags);
132 // set up pointers:
133 m_position = m_base = p1;
134 m_end = p2;
135 // empty strings are errors:
136 if((p1 == p2) &&
137 (
138 ((l_flags & regbase::main_option_type) != regbase::perl_syntax_group)
139 || (l_flags & regbase::no_empty_expressions)
140 )
141 )
142 {
143 fail(regex_constants::error_empty, 0);
144 return;
145 }
146 // select which parser to use:
147 switch(l_flags & regbase::main_option_type)
148 {
149 case regbase::perl_syntax_group:
150 {
151 m_parser_proc = &basic_regex_parser<charT, traits>::parse_extended;
152 //
153 // Add a leading paren with index zero to give recursions a target:
154 //
155 re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
156 br->index = 0;
157 br->icase = this->flags() & regbase::icase;
158 break;
159 }
160 case regbase::basic_syntax_group:
161 m_parser_proc = &basic_regex_parser<charT, traits>::parse_basic;
162 break;
163 case regbase::literal:
164 m_parser_proc = &basic_regex_parser<charT, traits>::parse_literal;
165 break;
166 default:
167 // Ooops, someone has managed to set more than one of the main option flags,
168 // so this must be an error:
169 fail(regex_constants::error_unknown, 0, "An invalid combination of regular expression syntax flags was used.");
170 return;
171 }
172
173 // parse all our characters:
174 bool result = parse_all();
175 //
176 // Unwind our alternatives:
177 //
178 unwind_alts(-1);
179 // reset l_flags as a global scope (?imsx) may have altered them:
180 this->flags(l_flags);
181 // if we haven't gobbled up all the characters then we must
182 // have had an unexpected ')' :
183 if(!result)
184 {
185 fail(regex_constants::error_paren, ::boost::BOOST_REGEX_DETAIL_NS::distance(m_base, m_position), "Found a closing ) with no corresponding openening parenthesis.");
186 return;
187 }
188 // if an error has been set then give up now:
189 if(this->m_pdata->m_status)
190 return;
191 // fill in our sub-expression count:
192 this->m_pdata->m_mark_count = 1 + m_mark_count;
193 this->finalize(p1, p2);
194}
195
196template <class charT, class traits>
197void basic_regex_parser<charT, traits>::fail(regex_constants::error_type error_code, std::ptrdiff_t position)
198{
199 // get the error message:
200 std::string message = this->m_pdata->m_ptraits->error_string(error_code);
201 fail(error_code, position, message);
202}
203
204template <class charT, class traits>
205void basic_regex_parser<charT, traits>::fail(regex_constants::error_type error_code, std::ptrdiff_t position, std::string message, std::ptrdiff_t start_pos)
206{
207 if(0 == this->m_pdata->m_status) // update the error code if not already set
208 this->m_pdata->m_status = error_code;
209 m_position = m_end; // don't bother parsing anything else
210
211#ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
212 //
213 // Augment error message with the regular expression text:
214 //
215 if(start_pos == position)
216 start_pos = (std::max)(static_cast<std::ptrdiff_t>(0), position - static_cast<std::ptrdiff_t>(10));
217 std::ptrdiff_t end_pos = (std::min)(position + static_cast<std::ptrdiff_t>(10), static_cast<std::ptrdiff_t>(m_end - m_base));
218 if(error_code != regex_constants::error_empty)
219 {
220 if((start_pos != 0) || (end_pos != (m_end - m_base)))
221 message += " The error occurred while parsing the regular expression fragment: '";
222 else
223 message += " The error occurred while parsing the regular expression: '";
224 if(start_pos != end_pos)
225 {
226 message += std::string(m_base + start_pos, m_base + position);
227 message += ">>>HERE>>>";
228 message += std::string(m_base + position, m_base + end_pos);
229 }
230 message += "'.";
231 }
232#endif
233
234#ifndef BOOST_NO_EXCEPTIONS
235 if(0 == (this->flags() & regex_constants::no_except))
236 {
237 boost::regex_error e(message, error_code, position);
238 e.raise();
239 }
240#else
241 (void)position; // suppress warnings.
242#endif
243}
244
245template <class charT, class traits>
246bool basic_regex_parser<charT, traits>::parse_all()
247{
248 bool result = true;
249 while(result && (m_position != m_end))
250 {
251 result = (this->*m_parser_proc)();
252 }
253 return result;
254}
255
256#ifdef BOOST_MSVC
257#pragma warning(push)
258#pragma warning(disable:4702)
259#endif
260template <class charT, class traits>
261bool basic_regex_parser<charT, traits>::parse_basic()
262{
263 switch(this->m_traits.syntax_type(*m_position))
264 {
265 case regex_constants::syntax_escape:
266 return parse_basic_escape();
267 case regex_constants::syntax_dot:
268 return parse_match_any();
269 case regex_constants::syntax_caret:
270 ++m_position;
271 this->append_state(syntax_element_start_line);
272 break;
273 case regex_constants::syntax_dollar:
274 ++m_position;
275 this->append_state(syntax_element_end_line);
276 break;
277 case regex_constants::syntax_star:
278 if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line))
279 return parse_literal();
280 else
281 {
282 ++m_position;
283 return parse_repeat();
284 }
285 case regex_constants::syntax_plus:
286 if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line) || !(this->flags() & regbase::emacs_ex))
287 return parse_literal();
288 else
289 {
290 ++m_position;
291 return parse_repeat(1);
292 }
293 case regex_constants::syntax_question:
294 if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line) || !(this->flags() & regbase::emacs_ex))
295 return parse_literal();
296 else
297 {
298 ++m_position;
299 return parse_repeat(0, 1);
300 }
301 case regex_constants::syntax_open_set:
302 return parse_set();
303 case regex_constants::syntax_newline:
304 if(this->flags() & regbase::newline_alt)
305 return parse_alt();
306 else
307 return parse_literal();
308 default:
309 return parse_literal();
310 }
311 return true;
312}
313
314template <class charT, class traits>
315bool basic_regex_parser<charT, traits>::parse_extended()
316{
317 bool result = true;
318 switch(this->m_traits.syntax_type(*m_position))
319 {
320 case regex_constants::syntax_open_mark:
321 return parse_open_paren();
322 case regex_constants::syntax_close_mark:
323 return false;
324 case regex_constants::syntax_escape:
325 return parse_extended_escape();
326 case regex_constants::syntax_dot:
327 return parse_match_any();
328 case regex_constants::syntax_caret:
329 ++m_position;
330 this->append_state(
331 (this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_start : syntax_element_start_line));
332 break;
333 case regex_constants::syntax_dollar:
334 ++m_position;
335 this->append_state(
336 (this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_end : syntax_element_end_line));
337 break;
338 case regex_constants::syntax_star:
339 if(m_position == this->m_base)
340 {
341 fail(regex_constants::error_badrepeat, 0, "The repeat operator \"*\" cannot start a regular expression.");
342 return false;
343 }
344 ++m_position;
345 return parse_repeat();
346 case regex_constants::syntax_question:
347 if(m_position == this->m_base)
348 {
349 fail(regex_constants::error_badrepeat, 0, "The repeat operator \"?\" cannot start a regular expression.");
350 return false;
351 }
352 ++m_position;
353 return parse_repeat(0,1);
354 case regex_constants::syntax_plus:
355 if(m_position == this->m_base)
356 {
357 fail(regex_constants::error_badrepeat, 0, "The repeat operator \"+\" cannot start a regular expression.");
358 return false;
359 }
360 ++m_position;
361 return parse_repeat(1);
362 case regex_constants::syntax_open_brace:
363 ++m_position;
364 return parse_repeat_range(false);
365 case regex_constants::syntax_close_brace:
366 if((this->flags() & regbase::no_perl_ex) == regbase::no_perl_ex)
367 {
368 fail(regex_constants::error_brace, this->m_position - this->m_base, "Found a closing repetition operator } with no corresponding {.");
369 return false;
370 }
371 result = parse_literal();
372 break;
373 case regex_constants::syntax_or:
374 return parse_alt();
375 case regex_constants::syntax_open_set:
376 return parse_set();
377 case regex_constants::syntax_newline:
378 if(this->flags() & regbase::newline_alt)
379 return parse_alt();
380 else
381 return parse_literal();
382 case regex_constants::syntax_hash:
383 //
384 // If we have a mod_x flag set, then skip until
385 // we get to a newline character:
386 //
387 if((this->flags()
388 & (regbase::no_perl_ex|regbase::mod_x))
389 == regbase::mod_x)
390 {
391 while((m_position != m_end) && !is_separator(*m_position++)){}
392 return true;
393 }
394 BOOST_FALLTHROUGH;
395 default:
396 result = parse_literal();
397 break;
398 }
399 return result;
400}
401#ifdef BOOST_MSVC
402#pragma warning(pop)
403#endif
404
405template <class charT, class traits>
406bool basic_regex_parser<charT, traits>::parse_literal()
407{
408 // append this as a literal provided it's not a space character
409 // or the perl option regbase::mod_x is not set:
410 if(
411 ((this->flags()
412 & (regbase::main_option_type|regbase::mod_x|regbase::no_perl_ex))
413 != regbase::mod_x)
414 || !this->m_traits.isctype(*m_position, this->m_mask_space))
415 this->append_literal(*m_position);
416 ++m_position;
417 return true;
418}
419
420template <class charT, class traits>
421bool basic_regex_parser<charT, traits>::parse_open_paren()
422{
423 //
424 // skip the '(' and error check:
425 //
426 if(++m_position == m_end)
427 {
428 fail(regex_constants::error_paren, m_position - m_base);
429 return false;
430 }
431 //
432 // begin by checking for a perl-style (?...) extension:
433 //
434 if(
435 ((this->flags() & (regbase::main_option_type | regbase::no_perl_ex)) == 0)
436 || ((this->flags() & (regbase::main_option_type | regbase::emacs_ex)) == (regbase::basic_syntax_group|regbase::emacs_ex))
437 )
438 {
439 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question)
440 return parse_perl_extension();
441 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_star)
442 return parse_perl_verb();
443 }
444 //
445 // update our mark count, and append the required state:
446 //
447 unsigned markid = 0;
448 if(0 == (this->flags() & regbase::nosubs))
449 {
450 markid = ++m_mark_count;
451#ifndef BOOST_NO_STD_DISTANCE
452 if(this->flags() & regbase::save_subexpression_location)
453 this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>(std::distance(m_base, m_position) - 1, 0));
454#else
455 if(this->flags() & regbase::save_subexpression_location)
456 this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>((m_position - m_base) - 1, 0));
457#endif
458 }
459 re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
460 pb->index = markid;
461 pb->icase = this->flags() & regbase::icase;
462 std::ptrdiff_t last_paren_start = this->getoffset(pb);
463 // back up insertion point for alternations, and set new point:
464 std::ptrdiff_t last_alt_point = m_alt_insert_point;
465 this->m_pdata->m_data.align();
466 m_alt_insert_point = this->m_pdata->m_data.size();
467 //
468 // back up the current flags in case we have a nested (?imsx) group:
469 //
470 regex_constants::syntax_option_type opts = this->flags();
471 bool old_case_change = m_has_case_change;
472 m_has_case_change = false; // no changes to this scope as yet...
473 //
474 // Back up branch reset data in case we have a nested (?|...)
475 //
476 int mark_reset = m_mark_reset;
477 m_mark_reset = -1;
478 //
479 // now recursively add more states, this will terminate when we get to a
480 // matching ')' :
481 //
482 parse_all();
483 //
484 // Unwind pushed alternatives:
485 //
486 if(0 == unwind_alts(last_paren_start))
487 return false;
488 //
489 // restore flags:
490 //
491 if(m_has_case_change)
492 {
493 // the case has changed in one or more of the alternatives
494 // within the scoped (...) block: we have to add a state
495 // to reset the case sensitivity:
496 static_cast<re_case*>(
497 this->append_state(syntax_element_toggle_case, sizeof(re_case))
498 )->icase = opts & regbase::icase;
499 }
500 this->flags(opts);
501 m_has_case_change = old_case_change;
502 //
503 // restore branch reset:
504 //
505 m_mark_reset = mark_reset;
506 //
507 // we either have a ')' or we have run out of characters prematurely:
508 //
509 if(m_position == m_end)
510 {
511 this->fail(regex_constants::error_paren, ::boost::BOOST_REGEX_DETAIL_NS::distance(m_base, m_end));
512 return false;
513 }
514 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
515 return false;
516#ifndef BOOST_NO_STD_DISTANCE
517 if(markid && (this->flags() & regbase::save_subexpression_location))
518 this->m_pdata->m_subs.at(markid - 1).second = std::distance(m_base, m_position);
519#else
520 if(markid && (this->flags() & regbase::save_subexpression_location))
521 this->m_pdata->m_subs.at(markid - 1).second = (m_position - m_base);
522#endif
523 ++m_position;
524 //
525 // append closing parenthesis state:
526 //
527 pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
528 pb->index = markid;
529 pb->icase = this->flags() & regbase::icase;
530 this->m_paren_start = last_paren_start;
531 //
532 // restore the alternate insertion point:
533 //
534 this->m_alt_insert_point = last_alt_point;
535 //
536 // allow backrefs to this mark:
537 //
538 if((markid > 0) && (markid < sizeof(unsigned) * CHAR_BIT))
539 this->m_backrefs |= 1u << (markid - 1);
540
541 return true;
542}
543
544template <class charT, class traits>
545bool basic_regex_parser<charT, traits>::parse_basic_escape()
546{
547 if(++m_position == m_end)
548 {
549 fail(regex_constants::error_paren, m_position - m_base);
550 return false;
551 }
552 bool result = true;
553 switch(this->m_traits.escape_syntax_type(*m_position))
554 {
555 case regex_constants::syntax_open_mark:
556 return parse_open_paren();
557 case regex_constants::syntax_close_mark:
558 return false;
559 case regex_constants::syntax_plus:
560 if(this->flags() & regex_constants::bk_plus_qm)
561 {
562 ++m_position;
563 return parse_repeat(1);
564 }
565 else
566 return parse_literal();
567 case regex_constants::syntax_question:
568 if(this->flags() & regex_constants::bk_plus_qm)
569 {
570 ++m_position;
571 return parse_repeat(0, 1);
572 }
573 else
574 return parse_literal();
575 case regex_constants::syntax_open_brace:
576 if(this->flags() & regbase::no_intervals)
577 return parse_literal();
578 ++m_position;
579 return parse_repeat_range(true);
580 case regex_constants::syntax_close_brace:
581 if(this->flags() & regbase::no_intervals)
582 return parse_literal();
583 fail(regex_constants::error_brace, this->m_position - this->m_base, "Found a closing repetition operator } with no corresponding {.");
584 return false;
585 case regex_constants::syntax_or:
586 if(this->flags() & regbase::bk_vbar)
587 return parse_alt();
588 else
589 result = parse_literal();
590 break;
591 case regex_constants::syntax_digit:
592 return parse_backref();
593 case regex_constants::escape_type_start_buffer:
594 if(this->flags() & regbase::emacs_ex)
595 {
596 ++m_position;
597 this->append_state(syntax_element_buffer_start);
598 }
599 else
600 result = parse_literal();
601 break;
602 case regex_constants::escape_type_end_buffer:
603 if(this->flags() & regbase::emacs_ex)
604 {
605 ++m_position;
606 this->append_state(syntax_element_buffer_end);
607 }
608 else
609 result = parse_literal();
610 break;
611 case regex_constants::escape_type_word_assert:
612 if(this->flags() & regbase::emacs_ex)
613 {
614 ++m_position;
615 this->append_state(syntax_element_word_boundary);
616 }
617 else
618 result = parse_literal();
619 break;
620 case regex_constants::escape_type_not_word_assert:
621 if(this->flags() & regbase::emacs_ex)
622 {
623 ++m_position;
624 this->append_state(syntax_element_within_word);
625 }
626 else
627 result = parse_literal();
628 break;
629 case regex_constants::escape_type_left_word:
630 if(this->flags() & regbase::emacs_ex)
631 {
632 ++m_position;
633 this->append_state(syntax_element_word_start);
634 }
635 else
636 result = parse_literal();
637 break;
638 case regex_constants::escape_type_right_word:
639 if(this->flags() & regbase::emacs_ex)
640 {
641 ++m_position;
642 this->append_state(syntax_element_word_end);
643 }
644 else
645 result = parse_literal();
646 break;
647 default:
648 if(this->flags() & regbase::emacs_ex)
649 {
650 bool negate = true;
651 switch(*m_position)
652 {
653 case 'w':
654 negate = false;
655 BOOST_FALLTHROUGH;
656 case 'W':
657 {
658 basic_char_set<charT, traits> char_set;
659 if(negate)
660 char_set.negate();
661 char_set.add_class(this->m_word_mask);
662 if(0 == this->append_set(char_set))
663 {
664 fail(regex_constants::error_ctype, m_position - m_base);
665 return false;
666 }
667 ++m_position;
668 return true;
669 }
670 case 's':
671 negate = false;
672 BOOST_FALLTHROUGH;
673 case 'S':
674 return add_emacs_code(negate);
675 case 'c':
676 case 'C':
677 // not supported yet:
678 fail(regex_constants::error_escape, m_position - m_base, "The \\c and \\C escape sequences are not supported by POSIX basic regular expressions: try the Perl syntax instead.");
679 return false;
680 default:
681 break;
682 }
683 }
684 result = parse_literal();
685 break;
686 }
687 return result;
688}
689
690template <class charT, class traits>
691bool basic_regex_parser<charT, traits>::parse_extended_escape()
692{
693 ++m_position;
694 if(m_position == m_end)
695 {
696 fail(regex_constants::error_escape, m_position - m_base, "Incomplete escape sequence found.");
697 return false;
698 }
699 bool negate = false; // in case this is a character class escape: \w \d etc
700 switch(this->m_traits.escape_syntax_type(*m_position))
701 {
702 case regex_constants::escape_type_not_class:
703 negate = true;
704 BOOST_FALLTHROUGH;
705 case regex_constants::escape_type_class:
706 {
707escape_type_class_jump:
708 typedef typename traits::char_class_type m_type;
709 m_type m = this->m_traits.lookup_classname(m_position, m_position+1);
710 if(m != 0)
711 {
712 basic_char_set<charT, traits> char_set;
713 if(negate)
714 char_set.negate();
715 char_set.add_class(m);
716 if(0 == this->append_set(char_set))
717 {
718 fail(regex_constants::error_ctype, m_position - m_base);
719 return false;
720 }
721 ++m_position;
722 return true;
723 }
724 //
725 // not a class, just a regular unknown escape:
726 //
727 this->append_literal(unescape_character());
728 break;
729 }
730 case regex_constants::syntax_digit:
731 return parse_backref();
732 case regex_constants::escape_type_left_word:
733 ++m_position;
734 this->append_state(syntax_element_word_start);
735 break;
736 case regex_constants::escape_type_right_word:
737 ++m_position;
738 this->append_state(syntax_element_word_end);
739 break;
740 case regex_constants::escape_type_start_buffer:
741 ++m_position;
742 this->append_state(syntax_element_buffer_start);
743 break;
744 case regex_constants::escape_type_end_buffer:
745 ++m_position;
746 this->append_state(syntax_element_buffer_end);
747 break;
748 case regex_constants::escape_type_word_assert:
749 ++m_position;
750 this->append_state(syntax_element_word_boundary);
751 break;
752 case regex_constants::escape_type_not_word_assert:
753 ++m_position;
754 this->append_state(syntax_element_within_word);
755 break;
756 case regex_constants::escape_type_Z:
757 ++m_position;
758 this->append_state(syntax_element_soft_buffer_end);
759 break;
760 case regex_constants::escape_type_Q:
761 return parse_QE();
762 case regex_constants::escape_type_C:
763 return parse_match_any();
764 case regex_constants::escape_type_X:
765 ++m_position;
766 this->append_state(syntax_element_combining);
767 break;
768 case regex_constants::escape_type_G:
769 ++m_position;
770 this->append_state(syntax_element_restart_continue);
771 break;
772 case regex_constants::escape_type_not_property:
773 negate = true;
774 BOOST_FALLTHROUGH;
775 case regex_constants::escape_type_property:
776 {
777 ++m_position;
778 char_class_type m;
779 if(m_position == m_end)
780 {
781 fail(regex_constants::error_escape, m_position - m_base, "Incomplete property escape found.");
782 return false;
783 }
784 // maybe have \p{ddd}
785 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
786 {
787 const charT* base = m_position;
788 // skip forward until we find enclosing brace:
789 while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
790 ++m_position;
791 if(m_position == m_end)
792 {
793 fail(regex_constants::error_escape, m_position - m_base, "Closing } missing from property escape sequence.");
794 return false;
795 }
796 m = this->m_traits.lookup_classname(++base, m_position++);
797 }
798 else
799 {
800 m = this->m_traits.lookup_classname(m_position, m_position+1);
801 ++m_position;
802 }
803 if(m != 0)
804 {
805 basic_char_set<charT, traits> char_set;
806 if(negate)
807 char_set.negate();
808 char_set.add_class(m);
809 if(0 == this->append_set(char_set))
810 {
811 fail(regex_constants::error_ctype, m_position - m_base);
812 return false;
813 }
814 return true;
815 }
816 fail(regex_constants::error_ctype, m_position - m_base, "Escape sequence was neither a valid property nor a valid character class name.");
817 return false;
818 }
819 case regex_constants::escape_type_reset_start_mark:
820 if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
821 {
822 re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
823 pb->index = -5;
824 pb->icase = this->flags() & regbase::icase;
825 this->m_pdata->m_data.align();
826 ++m_position;
827 return true;
828 }
829 goto escape_type_class_jump;
830 case regex_constants::escape_type_line_ending:
831 if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
832 {
833 const charT* e = get_escape_R_string<charT>();
834 const charT* old_position = m_position;
835 const charT* old_end = m_end;
836 const charT* old_base = m_base;
837 m_position = e;
838 m_base = e;
839 m_end = e + traits::length(e);
840 bool r = parse_all();
841 m_position = ++old_position;
842 m_end = old_end;
843 m_base = old_base;
844 return r;
845 }
846 goto escape_type_class_jump;
847 case regex_constants::escape_type_extended_backref:
848 if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
849 {
850 bool have_brace = false;
851 bool negative = false;
852 static const char* incomplete_message = "Incomplete \\g escape found.";
853 if(++m_position == m_end)
854 {
855 fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
856 return false;
857 }
858 // maybe have \g{ddd}
859 regex_constants::syntax_type syn = this->m_traits.syntax_type(*m_position);
860 regex_constants::syntax_type syn_end = 0;
861 if((syn == regex_constants::syntax_open_brace)
862 || (syn == regex_constants::escape_type_left_word)
863 || (syn == regex_constants::escape_type_end_buffer))
864 {
865 if(++m_position == m_end)
866 {
867 fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
868 return false;
869 }
870 have_brace = true;
871 switch(syn)
872 {
873 case regex_constants::syntax_open_brace:
874 syn_end = regex_constants::syntax_close_brace;
875 break;
876 case regex_constants::escape_type_left_word:
877 syn_end = regex_constants::escape_type_right_word;
878 break;
879 default:
880 syn_end = regex_constants::escape_type_end_buffer;
881 break;
882 }
883 }
884 negative = (*m_position == static_cast<charT>('-'));
885 if((negative) && (++m_position == m_end))
886 {
887 fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
888 return false;
889 }
890 const charT* pc = m_position;
891 boost::intmax_t i = this->m_traits.toi(pc, m_end, 10);
892 if((i < 0) && syn_end)
893 {
894 // Check for a named capture, get the leftmost one if there is more than one:
895 const charT* base = m_position;
896 while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != syn_end))
897 {
898 ++m_position;
899 }
900 i = hash_value_from_capture_name(base, m_position);
901 pc = m_position;
902 }
903 if(negative)
904 i = 1 + m_mark_count - i;
905 if(((i > 0) && (i < std::numeric_limits<unsigned>::digits) && (i - 1 < static_cast<boost::intmax_t>(sizeof(unsigned) * CHAR_BIT)) && (this->m_backrefs & (1u << (i-1)))) || ((i > 10000) && (this->m_pdata->get_id(i) > 0) && (this->m_pdata->get_id(i)-1 < static_cast<boost::intmax_t>(sizeof(unsigned) * CHAR_BIT)) && (this->m_backrefs & (1u << (this->m_pdata->get_id(i)-1)))))
906 {
907 m_position = pc;
908 re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
909 pb->index = i;
910 pb->icase = this->flags() & regbase::icase;
911 }
912 else
913 {
914 fail(regex_constants::error_backref, m_position - m_base);
915 return false;
916 }
917 m_position = pc;
918 if(have_brace)
919 {
920 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != syn_end))
921 {
922 fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
923 return false;
924 }
925 ++m_position;
926 }
927 return true;
928 }
929 goto escape_type_class_jump;
930 case regex_constants::escape_type_control_v:
931 if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
932 goto escape_type_class_jump;
933 BOOST_FALLTHROUGH;
934 default:
935 this->append_literal(unescape_character());
936 break;
937 }
938 return true;
939}
940
941template <class charT, class traits>
942bool basic_regex_parser<charT, traits>::parse_match_any()
943{
944 //
945 // we have a '.' that can match any character:
946 //
947 ++m_position;
948 static_cast<re_dot*>(
949 this->append_state(syntax_element_wild, sizeof(re_dot))
950 )->mask = static_cast<unsigned char>(this->flags() & regbase::no_mod_s
951 ? BOOST_REGEX_DETAIL_NS::force_not_newline
952 : this->flags() & regbase::mod_s ?
953 BOOST_REGEX_DETAIL_NS::force_newline : BOOST_REGEX_DETAIL_NS::dont_care);
954 return true;
955}
956
957template <class charT, class traits>
958bool basic_regex_parser<charT, traits>::parse_repeat(std::size_t low, std::size_t high)
959{
960 bool greedy = true;
961 bool pocessive = false;
962 std::size_t insert_point;
963 //
964 // when we get to here we may have a non-greedy ? mark still to come:
965 //
966 if((m_position != m_end)
967 && (
968 (0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
969 || ((regbase::basic_syntax_group|regbase::emacs_ex) == (this->flags() & (regbase::main_option_type | regbase::emacs_ex)))
970 )
971 )
972 {
973 // OK we have a perl or emacs regex, check for a '?':
974 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question)
975 {
976 greedy = false;
977 ++m_position;
978 }
979 // for perl regexes only check for pocessive ++ repeats.
980 if((m_position != m_end)
981 && (0 == (this->flags() & regbase::main_option_type))
982 && (this->m_traits.syntax_type(*m_position) == regex_constants::syntax_plus))
983 {
984 pocessive = true;
985 ++m_position;
986 }
987 }
988 if(0 == this->m_last_state)
989 {
990 fail(regex_constants::error_badrepeat, ::boost::BOOST_REGEX_DETAIL_NS::distance(m_base, m_position), "Nothing to repeat.");
991 return false;
992 }
993 if(this->m_last_state->type == syntax_element_endmark)
994 {
995 // insert a repeat before the '(' matching the last ')':
996 insert_point = this->m_paren_start;
997 }
998 else if((this->m_last_state->type == syntax_element_literal) && (static_cast<re_literal*>(this->m_last_state)->length > 1))
999 {
1000 // the last state was a literal with more than one character, split it in two:
1001 re_literal* lit = static_cast<re_literal*>(this->m_last_state);
1002 charT c = (static_cast<charT*>(static_cast<void*>(lit+1)))[lit->length - 1];
1003 lit->length -= 1;
1004 // now append new state:
1005 lit = static_cast<re_literal*>(this->append_state(syntax_element_literal, sizeof(re_literal) + sizeof(charT)));
1006 lit->length = 1;
1007 (static_cast<charT*>(static_cast<void*>(lit+1)))[0] = c;
1008 insert_point = this->getoffset(this->m_last_state);
1009 }
1010 else
1011 {
1012 // repeat the last state whatever it was, need to add some error checking here:
1013 switch(this->m_last_state->type)
1014 {
1015 case syntax_element_start_line:
1016 case syntax_element_end_line:
1017 case syntax_element_word_boundary:
1018 case syntax_element_within_word:
1019 case syntax_element_word_start:
1020 case syntax_element_word_end:
1021 case syntax_element_buffer_start:
1022 case syntax_element_buffer_end:
1023 case syntax_element_alt:
1024 case syntax_element_soft_buffer_end:
1025 case syntax_element_restart_continue:
1026 case syntax_element_jump:
1027 case syntax_element_startmark:
1028 case syntax_element_backstep:
1029 // can't legally repeat any of the above:
1030 fail(regex_constants::error_badrepeat, m_position - m_base);
1031 return false;
1032 default:
1033 // do nothing...
1034 break;
1035 }
1036 insert_point = this->getoffset(this->m_last_state);
1037 }
1038 //
1039 // OK we now know what to repeat, so insert the repeat around it:
1040 //
1041 re_repeat* rep = static_cast<re_repeat*>(this->insert_state(insert_point, syntax_element_rep, re_repeater_size));
1042 rep->min = low;
1043 rep->max = high;
1044 rep->greedy = greedy;
1045 rep->leading = false;
1046 // store our repeater position for later:
1047 std::ptrdiff_t rep_off = this->getoffset(rep);
1048 // and append a back jump to the repeat:
1049 re_jump* jmp = static_cast<re_jump*>(this->append_state(syntax_element_jump, sizeof(re_jump)));
1050 jmp->alt.i = rep_off - this->getoffset(jmp);
1051 this->m_pdata->m_data.align();
1052 // now fill in the alt jump for the repeat:
1053 rep = static_cast<re_repeat*>(this->getaddress(rep_off));
1054 rep->alt.i = this->m_pdata->m_data.size() - rep_off;
1055 //
1056 // If the repeat is pocessive then bracket the repeat with a (?>...)
1057 // independent sub-expression construct:
1058 //
1059 if(pocessive)
1060 {
1061 if(m_position != m_end)
1062 {
1063 //
1064 // Check for illegal following quantifier, we have to do this here, because
1065 // the extra states we insert below circumvents our usual error checking :-(
1066 //
1067 switch(this->m_traits.syntax_type(*m_position))
1068 {
1069 case regex_constants::syntax_star:
1070 case regex_constants::syntax_plus:
1071 case regex_constants::syntax_question:
1072 case regex_constants::syntax_open_brace:
1073 fail(regex_constants::error_badrepeat, m_position - m_base);
1074 return false;
1075 }
1076 }
1077 re_brace* pb = static_cast<re_brace*>(this->insert_state(insert_point, syntax_element_startmark, sizeof(re_brace)));
1078 pb->index = -3;
1079 pb->icase = this->flags() & regbase::icase;
1080 jmp = static_cast<re_jump*>(this->insert_state(insert_point + sizeof(re_brace), syntax_element_jump, sizeof(re_jump)));
1081 this->m_pdata->m_data.align();
1082 jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp);
1083 pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
1084 pb->index = -3;
1085 pb->icase = this->flags() & regbase::icase;
1086 }
1087 return true;
1088}
1089
1090template <class charT, class traits>
1091bool basic_regex_parser<charT, traits>::parse_repeat_range(bool isbasic)
1092{
1093 static const char* incomplete_message = "Missing } in quantified repetition.";
1094 //
1095 // parse a repeat-range:
1096 //
1097 std::size_t min, max;
1098 boost::intmax_t v;
1099 // skip whitespace:
1100 while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1101 ++m_position;
1102 if(this->m_position == this->m_end)
1103 {
1104 if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1105 {
1106 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1107 return false;
1108 }
1109 // Treat the opening '{' as a literal character, rewind to start of error:
1110 --m_position;
1111 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1112 return parse_literal();
1113 }
1114 // get min:
1115 v = this->m_traits.toi(m_position, m_end, 10);
1116 // skip whitespace:
1117 if((v < 0) || (v > umax()))
1118 {
1119 if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1120 {
1121 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1122 return false;
1123 }
1124 // Treat the opening '{' as a literal character, rewind to start of error:
1125 --m_position;
1126 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1127 return parse_literal();
1128 }
1129 while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1130 ++m_position;
1131 if(this->m_position == this->m_end)
1132 {
1133 if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1134 {
1135 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1136 return false;
1137 }
1138 // Treat the opening '{' as a literal character, rewind to start of error:
1139 --m_position;
1140 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1141 return parse_literal();
1142 }
1143 min = static_cast<std::size_t>(v);
1144 // see if we have a comma:
1145 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_comma)
1146 {
1147 // move on and error check:
1148 ++m_position;
1149 // skip whitespace:
1150 while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1151 ++m_position;
1152 if(this->m_position == this->m_end)
1153 {
1154 if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1155 {
1156 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1157 return false;
1158 }
1159 // Treat the opening '{' as a literal character, rewind to start of error:
1160 --m_position;
1161 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1162 return parse_literal();
1163 }
1164 // get the value if any:
1165 v = this->m_traits.toi(m_position, m_end, 10);
1166 max = ((v >= 0) && (v < umax())) ? (std::size_t)v : (std::numeric_limits<std::size_t>::max)();
1167 }
1168 else
1169 {
1170 // no comma, max = min:
1171 max = min;
1172 }
1173 // skip whitespace:
1174 while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1175 ++m_position;
1176 // OK now check trailing }:
1177 if(this->m_position == this->m_end)
1178 {
1179 if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1180 {
1181 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1182 return false;
1183 }
1184 // Treat the opening '{' as a literal character, rewind to start of error:
1185 --m_position;
1186 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1187 return parse_literal();
1188 }
1189 if(isbasic)
1190 {
1191 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_escape)
1192 {
1193 ++m_position;
1194 if(this->m_position == this->m_end)
1195 {
1196 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1197 return false;
1198 }
1199 }
1200 else
1201 {
1202 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1203 return false;
1204 }
1205 }
1206 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_brace)
1207 ++m_position;
1208 else
1209 {
1210 // Treat the opening '{' as a literal character, rewind to start of error:
1211 --m_position;
1212 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1213 return parse_literal();
1214 }
1215 //
1216 // finally go and add the repeat, unless error:
1217 //
1218 if(min > max)
1219 {
1220 // Backtrack to error location:
1221 m_position -= 2;
1222 while(this->m_traits.isctype(*m_position, this->m_word_mask)) --m_position;
1223 ++m_position;
1224 fail(regex_constants::error_badbrace, m_position - m_base);
1225 return false;
1226 }
1227 return parse_repeat(min, max);
1228}
1229
1230template <class charT, class traits>
1231bool basic_regex_parser<charT, traits>::parse_alt()
1232{
1233 //
1234 // error check: if there have been no previous states,
1235 // or if the last state was a '(' then error:
1236 //
1237 if(
1238 ((this->m_last_state == 0) || (this->m_last_state->type == syntax_element_startmark))
1239 &&
1240 !(
1241 ((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group)
1242 &&
1243 ((this->flags() & regbase::no_empty_expressions) == 0)
1244 )
1245 )
1246 {
1247 fail(regex_constants::error_empty, this->m_position - this->m_base, "A regular expression cannot start with the alternation operator |.");
1248 return false;
1249 }
1250 //
1251 // Reset mark count if required:
1252 //
1253 if(m_max_mark < m_mark_count)
1254 m_max_mark = m_mark_count;
1255 if(m_mark_reset >= 0)
1256 m_mark_count = m_mark_reset;
1257
1258 ++m_position;
1259 //
1260 // we need to append a trailing jump:
1261 //
1262 re_syntax_base* pj = this->append_state(BOOST_REGEX_DETAIL_NS::syntax_element_jump, sizeof(re_jump));
1263 std::ptrdiff_t jump_offset = this->getoffset(pj);
1264 //
1265 // now insert the alternative:
1266 //
1267 re_alt* palt = static_cast<re_alt*>(this->insert_state(this->m_alt_insert_point, syntax_element_alt, re_alt_size));
1268 jump_offset += re_alt_size;
1269 this->m_pdata->m_data.align();
1270 palt->alt.i = this->m_pdata->m_data.size() - this->getoffset(palt);
1271 //
1272 // update m_alt_insert_point so that the next alternate gets
1273 // inserted at the start of the second of the two we've just created:
1274 //
1275 this->m_alt_insert_point = this->m_pdata->m_data.size();
1276 //
1277 // the start of this alternative must have a case changes state
1278 // if the current block has messed around with case changes:
1279 //
1280 if(m_has_case_change)
1281 {
1282 static_cast<re_case*>(
1283 this->append_state(syntax_element_toggle_case, sizeof(re_case))
1284 )->icase = this->m_icase;
1285 }
1286 //
1287 // push the alternative onto our stack, a recursive
1288 // implementation here is easier to understand (and faster
1289 // as it happens), but causes all kinds of stack overflow problems
1290 // on programs with small stacks (COM+).
1291 //
1292 m_alt_jumps.push_back(jump_offset);
1293 return true;
1294}
1295
1296template <class charT, class traits>
1297bool basic_regex_parser<charT, traits>::parse_set()
1298{
1299 static const char* incomplete_message = "Character set declaration starting with [ terminated prematurely - either no ] was found or the set had no content.";
1300 ++m_position;
1301 if(m_position == m_end)
1302 {
1303 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1304 return false;
1305 }
1306 basic_char_set<charT, traits> char_set;
1307
1308 const charT* base = m_position; // where the '[' was
1309 const charT* item_base = m_position; // where the '[' or '^' was
1310
1311 while(m_position != m_end)
1312 {
1313 switch(this->m_traits.syntax_type(*m_position))
1314 {
1315 case regex_constants::syntax_caret:
1316 if(m_position == base)
1317 {
1318 char_set.negate();
1319 ++m_position;
1320 item_base = m_position;
1321 }
1322 else
1323 parse_set_literal(char_set);
1324 break;
1325 case regex_constants::syntax_close_set:
1326 if(m_position == item_base)
1327 {
1328 parse_set_literal(char_set);
1329 break;
1330 }
1331 else
1332 {
1333 ++m_position;
1334 if(0 == this->append_set(char_set))
1335 {
1336 fail(regex_constants::error_ctype, m_position - m_base);
1337 return false;
1338 }
1339 }
1340 return true;
1341 case regex_constants::syntax_open_set:
1342 if(parse_inner_set(char_set))
1343 break;
1344 return true;
1345 case regex_constants::syntax_escape:
1346 {
1347 //
1348 // look ahead and see if this is a character class shortcut
1349 // \d \w \s etc...
1350 //
1351 ++m_position;
1352 if(this->m_traits.escape_syntax_type(*m_position)
1353 == regex_constants::escape_type_class)
1354 {
1355 char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
1356 if(m != 0)
1357 {
1358 char_set.add_class(m);
1359 ++m_position;
1360 break;
1361 }
1362 }
1363 else if(this->m_traits.escape_syntax_type(*m_position)
1364 == regex_constants::escape_type_not_class)
1365 {
1366 // negated character class:
1367 char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
1368 if(m != 0)
1369 {
1370 char_set.add_negated_class(m);
1371 ++m_position;
1372 break;
1373 }
1374 }
1375 // not a character class, just a regular escape:
1376 --m_position;
1377 parse_set_literal(char_set);
1378 break;
1379 }
1380 default:
1381 parse_set_literal(char_set);
1382 break;
1383 }
1384 }
1385 return m_position != m_end;
1386}
1387
1388template <class charT, class traits>
1389bool basic_regex_parser<charT, traits>::parse_inner_set(basic_char_set<charT, traits>& char_set)
1390{
1391 static const char* incomplete_message = "Character class declaration starting with [ terminated prematurely - either no ] was found or the set had no content.";
1392 //
1393 // we have either a character class [:name:]
1394 // a collating element [.name.]
1395 // or an equivalence class [=name=]
1396 //
1397 if(m_end == ++m_position)
1398 {
1399 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1400 return false;
1401 }
1402 switch(this->m_traits.syntax_type(*m_position))
1403 {
1404 case regex_constants::syntax_dot:
1405 //
1406 // a collating element is treated as a literal:
1407 //
1408 --m_position;
1409 parse_set_literal(char_set);
1410 return true;
1411 case regex_constants::syntax_colon:
1412 {
1413 // check that character classes are actually enabled:
1414 if((this->flags() & (regbase::main_option_type | regbase::no_char_classes))
1415 == (regbase::basic_syntax_group | regbase::no_char_classes))
1416 {
1417 --m_position;
1418 parse_set_literal(char_set);
1419 return true;
1420 }
1421 // skip the ':'
1422 if(m_end == ++m_position)
1423 {
1424 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1425 return false;
1426 }
1427 const charT* name_first = m_position;
1428 // skip at least one character, then find the matching ':]'
1429 if(m_end == ++m_position)
1430 {
1431 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1432 return false;
1433 }
1434 while((m_position != m_end)
1435 && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_colon))
1436 ++m_position;
1437 const charT* name_last = m_position;
1438 if(m_end == m_position)
1439 {
1440 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1441 return false;
1442 }
1443 if((m_end == ++m_position)
1444 || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1445 {
1446 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1447 return false;
1448 }
1449 //
1450 // check for negated class:
1451 //
1452 bool negated = false;
1453 if(this->m_traits.syntax_type(*name_first) == regex_constants::syntax_caret)
1454 {
1455 ++name_first;
1456 negated = true;
1457 }
1458 typedef typename traits::char_class_type m_type;
1459 m_type m = this->m_traits.lookup_classname(name_first, name_last);
1460 if(m == 0)
1461 {
1462 if(char_set.empty() && (name_last - name_first == 1))
1463 {
1464 // maybe a special case:
1465 ++m_position;
1466 if( (m_position != m_end)
1467 && (this->m_traits.syntax_type(*m_position)
1468 == regex_constants::syntax_close_set))
1469 {
1470 if(this->m_traits.escape_syntax_type(*name_first)
1471 == regex_constants::escape_type_left_word)
1472 {
1473 ++m_position;
1474 this->append_state(syntax_element_word_start);
1475 return false;
1476 }
1477 if(this->m_traits.escape_syntax_type(*name_first)
1478 == regex_constants::escape_type_right_word)
1479 {
1480 ++m_position;
1481 this->append_state(syntax_element_word_end);
1482 return false;
1483 }
1484 }
1485 }
1486 fail(regex_constants::error_ctype, name_first - m_base);
1487 return false;
1488 }
1489 if(negated == false)
1490 char_set.add_class(m);
1491 else
1492 char_set.add_negated_class(m);
1493 ++m_position;
1494 break;
1495 }
1496 case regex_constants::syntax_equal:
1497 {
1498 // skip the '='
1499 if(m_end == ++m_position)
1500 {
1501 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1502 return false;
1503 }
1504 const charT* name_first = m_position;
1505 // skip at least one character, then find the matching '=]'
1506 if(m_end == ++m_position)
1507 {
1508 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1509 return false;
1510 }
1511 while((m_position != m_end)
1512 && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal))
1513 ++m_position;
1514 const charT* name_last = m_position;
1515 if(m_end == m_position)
1516 {
1517 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1518 return false;
1519 }
1520 if((m_end == ++m_position)
1521 || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1522 {
1523 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1524 return false;
1525 }
1526 string_type m = this->m_traits.lookup_collatename(name_first, name_last);
1527 if((0 == m.size()) || (m.size() > 2))
1528 {
1529 fail(regex_constants::error_collate, name_first - m_base);
1530 return false;
1531 }
1532 digraph<charT> d;
1533 d.first = m[0];
1534 if(m.size() > 1)
1535 d.second = m[1];
1536 else
1537 d.second = 0;
1538 char_set.add_equivalent(d);
1539 ++m_position;
1540 break;
1541 }
1542 default:
1543 --m_position;
1544 parse_set_literal(char_set);
1545 break;
1546 }
1547 return true;
1548}
1549
1550template <class charT, class traits>
1551void basic_regex_parser<charT, traits>::parse_set_literal(basic_char_set<charT, traits>& char_set)
1552{
1553 digraph<charT> start_range(get_next_set_literal(char_set));
1554 if(m_end == m_position)
1555 {
1556 fail(regex_constants::error_brack, m_position - m_base);
1557 return;
1558 }
1559 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
1560 {
1561 // we have a range:
1562 if(m_end == ++m_position)
1563 {
1564 fail(regex_constants::error_brack, m_position - m_base);
1565 return;
1566 }
1567 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set)
1568 {
1569 digraph<charT> end_range = get_next_set_literal(char_set);
1570 char_set.add_range(start_range, end_range);
1571 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
1572 {
1573 if(m_end == ++m_position)
1574 {
1575 fail(regex_constants::error_brack, m_position - m_base);
1576 return;
1577 }
1578 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_set)
1579 {
1580 // trailing - :
1581 --m_position;
1582 return;
1583 }
1584 fail(regex_constants::error_range, m_position - m_base);
1585 return;
1586 }
1587 return;
1588 }
1589 --m_position;
1590 }
1591 char_set.add_single(start_range);
1592}
1593
1594template <class charT, class traits>
1595digraph<charT> basic_regex_parser<charT, traits>::get_next_set_literal(basic_char_set<charT, traits>& char_set)
1596{
1597 digraph<charT> result;
1598 switch(this->m_traits.syntax_type(*m_position))
1599 {
1600 case regex_constants::syntax_dash:
1601 if(!char_set.empty())
1602 {
1603 // see if we are at the end of the set:
1604 if((++m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1605 {
1606 fail(regex_constants::error_range, m_position - m_base);
1607 return result;
1608 }
1609 --m_position;
1610 }
1611 result.first = *m_position++;
1612 return result;
1613 case regex_constants::syntax_escape:
1614 // check to see if escapes are supported first:
1615 if(this->flags() & regex_constants::no_escape_in_lists)
1616 {
1617 result = *m_position++;
1618 break;
1619 }
1620 ++m_position;
1621 result = unescape_character();
1622 break;
1623 case regex_constants::syntax_open_set:
1624 {
1625 if(m_end == ++m_position)
1626 {
1627 fail(regex_constants::error_collate, m_position - m_base);
1628 return result;
1629 }
1630 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot)
1631 {
1632 --m_position;
1633 result.first = *m_position;
1634 ++m_position;
1635 return result;
1636 }
1637 if(m_end == ++m_position)
1638 {
1639 fail(regex_constants::error_collate, m_position - m_base);
1640 return result;
1641 }
1642 const charT* name_first = m_position;
1643 // skip at least one character, then find the matching ':]'
1644 if(m_end == ++m_position)
1645 {
1646 fail(regex_constants::error_collate, name_first - m_base);
1647 return result;
1648 }
1649 while((m_position != m_end)
1650 && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot))
1651 ++m_position;
1652 const charT* name_last = m_position;
1653 if(m_end == m_position)
1654 {
1655 fail(regex_constants::error_collate, name_first - m_base);
1656 return result;
1657 }
1658 if((m_end == ++m_position)
1659 || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1660 {
1661 fail(regex_constants::error_collate, name_first - m_base);
1662 return result;
1663 }
1664 ++m_position;
1665 string_type s = this->m_traits.lookup_collatename(name_first, name_last);
1666 if(s.empty() || (s.size() > 2))
1667 {
1668 fail(regex_constants::error_collate, name_first - m_base);
1669 return result;
1670 }
1671 result.first = s[0];
1672 if(s.size() > 1)
1673 result.second = s[1];
1674 else
1675 result.second = 0;
1676 return result;
1677 }
1678 default:
1679 result = *m_position++;
1680 }
1681 return result;
1682}
1683
1684//
1685// does a value fit in the specified charT type?
1686//
1687template <class charT>
1688bool valid_value(charT, boost::intmax_t v, const mpl::true_&)
1689{
1690 return (v >> (sizeof(charT) * CHAR_BIT)) == 0;
1691}
1692template <class charT>
1693bool valid_value(charT, boost::intmax_t, const mpl::false_&)
1694{
1695 return true; // v will alsways fit in a charT
1696}
1697template <class charT>
1698bool valid_value(charT c, boost::intmax_t v)
1699{
1700 return valid_value(c, v, mpl::bool_<(sizeof(charT) < sizeof(boost::intmax_t))>());
1701}
1702
1703template <class charT, class traits>
1704charT basic_regex_parser<charT, traits>::unescape_character()
1705{
1706#ifdef BOOST_MSVC
1707#pragma warning(push)
1708#pragma warning(disable:4127)
1709#endif
1710 charT result(0);
1711 if(m_position == m_end)
1712 {
1713 fail(regex_constants::error_escape, m_position - m_base, "Escape sequence terminated prematurely.");
1714 return false;
1715 }
1716 switch(this->m_traits.escape_syntax_type(*m_position))
1717 {
1718 case regex_constants::escape_type_control_a:
1719 result = charT('\a');
1720 break;
1721 case regex_constants::escape_type_e:
1722 result = charT(27);
1723 break;
1724 case regex_constants::escape_type_control_f:
1725 result = charT('\f');
1726 break;
1727 case regex_constants::escape_type_control_n:
1728 result = charT('\n');
1729 break;
1730 case regex_constants::escape_type_control_r:
1731 result = charT('\r');
1732 break;
1733 case regex_constants::escape_type_control_t:
1734 result = charT('\t');
1735 break;
1736 case regex_constants::escape_type_control_v:
1737 result = charT('\v');
1738 break;
1739 case regex_constants::escape_type_word_assert:
1740 result = charT('\b');
1741 break;
1742 case regex_constants::escape_type_ascii_control:
1743 ++m_position;
1744 if(m_position == m_end)
1745 {
1746 // Rewind to start of escape:
1747 --m_position;
1748 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1749 fail(regex_constants::error_escape, m_position - m_base, "ASCII escape sequence terminated prematurely.");
1750 return result;
1751 }
1752 result = static_cast<charT>(*m_position % 32);
1753 break;
1754 case regex_constants::escape_type_hex:
1755 ++m_position;
1756 if(m_position == m_end)
1757 {
1758 // Rewind to start of escape:
1759 --m_position;
1760 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1761 fail(regex_constants::error_escape, m_position - m_base, "Hexadecimal escape sequence terminated prematurely.");
1762 return result;
1763 }
1764 // maybe have \x{ddd}
1765 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
1766 {
1767 ++m_position;
1768 if(m_position == m_end)
1769 {
1770 // Rewind to start of escape:
1771 --m_position;
1772 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1773 fail(regex_constants::error_escape, m_position - m_base, "Missing } in hexadecimal escape sequence.");
1774 return result;
1775 }
1776 boost::intmax_t i = this->m_traits.toi(m_position, m_end, 16);
1777 if((m_position == m_end)
1778 || (i < 0)
1779 || ((std::numeric_limits<charT>::is_specialized) && (i > (boost::intmax_t)(std::numeric_limits<charT>::max)()))
1780 || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
1781 {
1782 // Rewind to start of escape:
1783 --m_position;
1784 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1785 fail(regex_constants::error_badbrace, m_position - m_base, "Hexadecimal escape sequence was invalid.");
1786 return result;
1787 }
1788 ++m_position;
1789 result = charT(i);
1790 }
1791 else
1792 {
1793 std::ptrdiff_t len = (std::min)(static_cast<std::ptrdiff_t>(2), static_cast<std::ptrdiff_t>(m_end - m_position));
1794 boost::intmax_t i = this->m_traits.toi(m_position, m_position + len, 16);
1795 if((i < 0)
1796 || !valid_value(charT(0), i))
1797 {
1798 // Rewind to start of escape:
1799 --m_position;
1800 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1801 fail(regex_constants::error_escape, m_position - m_base, "Escape sequence did not encode a valid character.");
1802 return result;
1803 }
1804 result = charT(i);
1805 }
1806 return result;
1807 case regex_constants::syntax_digit:
1808 {
1809 // an octal escape sequence, the first character must be a zero
1810 // followed by up to 3 octal digits:
1811 std::ptrdiff_t len = (std::min)(::boost::BOOST_REGEX_DETAIL_NS::distance(m_position, m_end), static_cast<std::ptrdiff_t>(4));
1812 const charT* bp = m_position;
1813 boost::intmax_t val = this->m_traits.toi(bp, bp + 1, 8);
1814 if(val != 0)
1815 {
1816 // Rewind to start of escape:
1817 --m_position;
1818 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1819 // Oops not an octal escape after all:
1820 fail(regex_constants::error_escape, m_position - m_base, "Invalid octal escape sequence.");
1821 return result;
1822 }
1823 val = this->m_traits.toi(m_position, m_position + len, 8);
1824 if((val < 0) || (val > (boost::intmax_t)(std::numeric_limits<charT>::max)()))
1825 {
1826 // Rewind to start of escape:
1827 --m_position;
1828 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1829 fail(regex_constants::error_escape, m_position - m_base, "Octal escape sequence is invalid.");
1830 return result;
1831 }
1832 return static_cast<charT>(val);
1833 }
1834 case regex_constants::escape_type_named_char:
1835 {
1836 ++m_position;
1837 if(m_position == m_end)
1838 {
1839 // Rewind to start of escape:
1840 --m_position;
1841 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1842 fail(regex_constants::error_escape, m_position - m_base);
1843 return false;
1844 }
1845 // maybe have \N{name}
1846 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
1847 {
1848 const charT* base = m_position;
1849 // skip forward until we find enclosing brace:
1850 while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
1851 ++m_position;
1852 if(m_position == m_end)
1853 {
1854 // Rewind to start of escape:
1855 --m_position;
1856 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1857 fail(regex_constants::error_escape, m_position - m_base);
1858 return false;
1859 }
1860 string_type s = this->m_traits.lookup_collatename(++base, m_position++);
1861 if(s.empty())
1862 {
1863 // Rewind to start of escape:
1864 --m_position;
1865 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1866 fail(regex_constants::error_collate, m_position - m_base);
1867 return false;
1868 }
1869 if(s.size() == 1)
1870 {
1871 return s[0];
1872 }
1873 }
1874 // fall through is a failure:
1875 // Rewind to start of escape:
1876 --m_position;
1877 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1878 fail(regex_constants::error_escape, m_position - m_base);
1879 return false;
1880 }
1881 default:
1882 result = *m_position;
1883 break;
1884 }
1885 ++m_position;
1886 return result;
1887#ifdef BOOST_MSVC
1888#pragma warning(pop)
1889#endif
1890}
1891
1892template <class charT, class traits>
1893bool basic_regex_parser<charT, traits>::parse_backref()
1894{
1895 BOOST_ASSERT(m_position != m_end);
1896 const charT* pc = m_position;
1897 boost::intmax_t i = this->m_traits.toi(pc, pc + 1, 10);
1898 if((i == 0) || (((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group) && (this->flags() & regbase::no_bk_refs)))
1899 {
1900 // not a backref at all but an octal escape sequence:
1901 charT c = unescape_character();
1902 this->append_literal(c);
1903 }
1904 else if((i > 0) && (this->m_backrefs & (1u << (i-1))))
1905 {
1906 m_position = pc;
1907 re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
1908 pb->index = i;
1909 pb->icase = this->flags() & regbase::icase;
1910 }
1911 else
1912 {
1913 // Rewind to start of escape:
1914 --m_position;
1915 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1916 fail(regex_constants::error_backref, m_position - m_base);
1917 return false;
1918 }
1919 return true;
1920}
1921
1922template <class charT, class traits>
1923bool basic_regex_parser<charT, traits>::parse_QE()
1924{
1925#ifdef BOOST_MSVC
1926#pragma warning(push)
1927#pragma warning(disable:4127)
1928#endif
1929 //
1930 // parse a \Q...\E sequence:
1931 //
1932 ++m_position; // skip the Q
1933 const charT* start = m_position;
1934 const charT* end;
1935 do
1936 {
1937 while((m_position != m_end)
1938 && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape))
1939 ++m_position;
1940 if(m_position == m_end)
1941 {
1942 // a \Q...\E sequence may terminate with the end of the expression:
1943 end = m_position;
1944 break;
1945 }
1946 if(++m_position == m_end) // skip the escape
1947 {
1948 fail(regex_constants::error_escape, m_position - m_base, "Unterminated \\Q...\\E sequence.");
1949 return false;
1950 }
1951 // check to see if it's a \E:
1952 if(this->m_traits.escape_syntax_type(*m_position) == regex_constants::escape_type_E)
1953 {
1954 ++m_position;
1955 end = m_position - 2;
1956 break;
1957 }
1958 // otherwise go round again:
1959 }while(true);
1960 //
1961 // now add all the character between the two escapes as literals:
1962 //
1963 while(start != end)
1964 {
1965 this->append_literal(*start);
1966 ++start;
1967 }
1968 return true;
1969#ifdef BOOST_MSVC
1970#pragma warning(pop)
1971#endif
1972}
1973
//
// Parses a Perl-style "(?...)" extension group.
//
// The function first advances m_position by one and then uses the character
// found there to select the kind of extension (NOTE(review): this implies the
// caller leaves m_position on the '?' - confirm against the call site).
//
// Apart from "(?#...)" comments, which are skipped without emitting anything,
// a startmark state is appended up front, the selected extension is set up,
// the body is parsed with parse_all(), and a matching endmark state is
// appended.  The local `markid` records what kind of group was opened:
//     > 0  a (named) capturing group,
//       0  a non-capturing / option / recursion group,
//      -1  an '=' assertion,        -2  a '!' assertion,
//      -3  an independent sub-expression,
//      -4  a conditional expression.
//
// Parser state that the group temporarily overrides (alternation insert
// point, flags, case-change marker, mark-reset data) is saved in locals at
// the top and restored at the bottom.
//
// Returns true on success.  Every error path rewinds m_position to the
// nearest preceding open-mark, reports through fail() and returns false.
//
template <class charT, class traits>
bool basic_regex_parser<charT, traits>::parse_perl_extension()
{
   if(++m_position == m_end)
   {
      // Rewind to start of (? sequence:
      --m_position;
      while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
      fail(regex_constants::error_perl_extension, m_position - m_base);
      return false;
   }
   //
   // treat comments as a special case, as these
   // are the only ones that don't start with a leading
   // startmark state:
   //
   if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_hash)
   {
      // consume up to and including the closing mark (or to the end of the expression):
      while((m_position != m_end)
         && (this->m_traits.syntax_type(*m_position++) != regex_constants::syntax_close_mark))
      {}
      return true;
   }
   //
   // backup some state, and prepare the way:
   //
   int markid = 0;
   // offset of the jump state emitted for assertions / independent sub-expressions, 0 if none:
   std::ptrdiff_t jump_offset = 0;
   re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
   pb->icase = this->flags() & regbase::icase;
   std::ptrdiff_t last_paren_start = this->getoffset(pb);
   // back up insertion point for alternations, and set new point:
   std::ptrdiff_t last_alt_point = m_alt_insert_point;
   this->m_pdata->m_data.align();
   m_alt_insert_point = this->m_pdata->m_data.size();
   // remembered separately because m_alt_insert_point may be moved again below;
   // this is where a conditional expression expects to find its alt state:
   std::ptrdiff_t expected_alt_point = m_alt_insert_point;
   bool restore_flags = true;
   regex_constants::syntax_option_type old_flags = this->flags();
   bool old_case_change = m_has_case_change;
   m_has_case_change = false;
   charT name_delim;
   // save the mark-reset data; it is put back once the group has been closed:
   int mark_reset = m_mark_reset;
   int max_mark = m_max_mark;
   m_mark_reset = -1;
   m_max_mark = m_mark_count;
   // scratch value: a group number, a relative offset, or a hashed group name depending on the case:
   boost::intmax_t v;
   //
   // select the actual extension used:
   //
   switch(this->m_traits.syntax_type(*m_position))
   {
   case regex_constants::syntax_or:
      // record the current mark count as the reset point, then treat as non-capturing:
      m_mark_reset = m_mark_count;
      BOOST_FALLTHROUGH;
   case regex_constants::syntax_colon:
      //
      // a non-capturing mark:
      //
      pb->index = markid = 0;
      ++m_position;
      break;
   case regex_constants::syntax_digit:
      {
      //
      // a recursive subexpression:
      //
      v = this->m_traits.toi(m_position, m_end, 10);
      if((v < 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
      {
         // Rewind to start of (? sequence:
         --m_position;
         while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
         fail(regex_constants::error_perl_extension, m_position - m_base, "The recursive sub-expression refers to an invalid marking group, or is unterminated.");
         return false;
      }
      // Shared tail for every form of recursion: the other cases compute v and jump here.
insert_recursion:
      pb->index = markid = 0;
      re_recurse* pr = static_cast<re_recurse*>(this->append_state(syntax_element_recurse, sizeof(re_recurse)));
      pr->alt.i = v;
      pr->state_id = 0;
      // follow the recursion with a toggle_case state carrying the current icase setting:
      static_cast<re_case*>(
            this->append_state(syntax_element_toggle_case, sizeof(re_case))
            )->icase = this->flags() & regbase::icase;
      break;
      }
   case regex_constants::syntax_plus:
      //
      // A forward-relative recursive subexpression:
      //
      ++m_position;
      v = this->m_traits.toi(m_position, m_end, 10);
      if((v <= 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
      {
         // Rewind to start of (? sequence:
         --m_position;
         while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
         fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression.");
         return false;
      }
      // convert the relative offset into an absolute group number:
      v += m_mark_count;
      goto insert_recursion;
   case regex_constants::syntax_dash:
      //
      // Possibly a backward-relative recursive subexpression:
      //
      ++m_position;
      v = this->m_traits.toi(m_position, m_end, 10);
      if(v <= 0)
      {
         --m_position;
         // Oops not a relative recursion at all, but a (?-imsx) group:
         goto option_group_jump;
      }
      // convert the relative offset into an absolute group number:
      v = m_mark_count + 1 - v;
      if(v <= 0)
      {
         // Rewind to start of (? sequence:
         --m_position;
         while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
         fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression.");
         return false;
      }
      goto insert_recursion;
   case regex_constants::syntax_equal:
      // an '=' assertion: emit a jump state (patched once the group is closed)
      // and move the alternation insert point past it:
      pb->index = markid = -1;
      ++m_position;
      jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
      this->m_pdata->m_data.align();
      m_alt_insert_point = this->m_pdata->m_data.size();
      break;
   case regex_constants::syntax_not:
      // a '!' assertion: same layout as the '=' case, distinguished by markid:
      pb->index = markid = -2;
      ++m_position;
      jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
      this->m_pdata->m_data.align();
      m_alt_insert_point = this->m_pdata->m_data.size();
      break;
   case regex_constants::escape_type_left_word:
      {
         // a lookbehind assertion:
         if(++m_position == m_end)
         {
            // Rewind to start of (? sequence:
            --m_position;
            while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
            fail(regex_constants::error_perl_extension, m_position - m_base);
            return false;
         }
         regex_constants::syntax_type t = this->m_traits.syntax_type(*m_position);
         if(t == regex_constants::syntax_not)
            pb->index = markid = -2;
         else if(t == regex_constants::syntax_equal)
            pb->index = markid = -1;
         else
         {
            // Probably a named capture which also starts (?< :
            name_delim = '>';
            // step back so that the shared named-capture code sees the same position as its own case:
            --m_position;
            goto named_capture_jump;
         }
         ++m_position;
         jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
         // lookbehind additionally needs a backstep state after the jump:
         this->append_state(syntax_element_backstep, sizeof(re_brace));
         this->m_pdata->m_data.align();
         m_alt_insert_point = this->m_pdata->m_data.size();
         break;
      }
   case regex_constants::escape_type_right_word:
      //
      // an independent sub-expression:
      //
      pb->index = markid = -3;
      ++m_position;
      jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
      this->m_pdata->m_data.align();
      m_alt_insert_point = this->m_pdata->m_data.size();
      break;
   case regex_constants::syntax_open_mark:
      {
      // a conditional expression:
      pb->index = markid = -4;
      if(++m_position == m_end)
      {
         // Rewind to start of (? sequence:
         --m_position;
         while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
         fail(regex_constants::error_perl_extension, m_position - m_base);
         return false;
      }
      // try a numeric condition first; the branches below cover the other forms
      // and the numeric result is only used by the final "v > 0" branch:
      v = this->m_traits.toi(m_position, m_end, 10);
      if(m_position == m_end)
      {
         // Rewind to start of (? sequence:
         --m_position;
         while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
         fail(regex_constants::error_perl_extension, m_position - m_base);
         return false;
      }
      if(*m_position == charT('R'))
      {
         // condition starts with 'R': optionally followed by "&name" or a number
         if(++m_position == m_end)
         {
            // Rewind to start of (? sequence:
            --m_position;
            while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
            fail(regex_constants::error_perl_extension, m_position - m_base);
            return false;
         }
         if(*m_position == charT('&'))
         {
            const charT* base = ++m_position;
            while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
               ++m_position;
            if(m_position == m_end)
            {
               // Rewind to start of (? sequence:
               --m_position;
               while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
               fail(regex_constants::error_perl_extension, m_position - m_base);
               return false;
            }
            // the name is stored as the negated hash of its characters:
            v = -static_cast<int>(hash_value_from_capture_name(base, m_position));
         }
         else
         {
            v = -this->m_traits.toi(m_position, m_end, 10);
         }
         re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
         // negative v is shifted down by one more, so index 0 is left for the case where v is not negative:
         br->index = v < 0 ? (v - 1) : 0;
         if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
         {
            // Rewind to start of (? sequence:
            --m_position;
            while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
            fail(regex_constants::error_perl_extension, m_position - m_base);
            return false;
         }
         if(++m_position == m_end)
         {
            // Rewind to start of (? sequence:
            --m_position;
            while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
            fail(regex_constants::error_perl_extension, m_position - m_base);
            return false;
         }
      }
      else if((*m_position == charT('\'')) || (*m_position == charT('<')))
      {
         // condition is a quoted or angle-bracketed group name:
         const charT* base = ++m_position;
         while((m_position != m_end) && (*m_position != charT('>')) && (*m_position != charT('\'')))
            ++m_position;
         if(m_position == m_end)
         {
            // Rewind to start of (? sequence:
            --m_position;
            while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
            fail(regex_constants::error_perl_extension, m_position - m_base);
            return false;
         }
         v = static_cast<int>(hash_value_from_capture_name(base, m_position));
         re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
         br->index = v;
         if(((*m_position != charT('>')) && (*m_position != charT('\''))) || (++m_position == m_end))
         {
            // Rewind to start of (? sequence:
            --m_position;
            while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
            fail(regex_constants::error_perl_extension, m_position - m_base, "Unterminated named capture.");
            return false;
         }
         if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
         {
            // Rewind to start of (? sequence:
            --m_position;
            while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
            fail(regex_constants::error_perl_extension, m_position - m_base);
            return false;
         }
         if(++m_position == m_end)
         {
            // Rewind to start of (? sequence:
            --m_position;
            while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
            fail(regex_constants::error_perl_extension, m_position - m_base);
            return false;
         }
      }
      else if(*m_position == charT('D'))
      {
         // the only accepted condition starting with 'D' is the literal word DEFINE:
         const char* def = "DEFINE";
         while(*def && (m_position != m_end) && (*m_position == charT(*def)))
            ++m_position, ++def;
         if((m_position == m_end) || *def)
         {
            // Rewind to start of (? sequence:
            --m_position;
            while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
            fail(regex_constants::error_perl_extension, m_position - m_base);
            return false;
         }
         re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
         br->index = 9999; // special magic value!
         if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
         {
            // Rewind to start of (? sequence:
            --m_position;
            while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
            fail(regex_constants::error_perl_extension, m_position - m_base);
            return false;
         }
         if(++m_position == m_end)
         {
            // Rewind to start of (? sequence:
            --m_position;
            while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
            fail(regex_constants::error_perl_extension, m_position - m_base);
            return false;
         }
      }
      else if(v > 0)
      {
         // condition is a plain group number:
         re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
         br->index = v;
         if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
         {
            // Rewind to start of (? sequence:
            --m_position;
            while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
            fail(regex_constants::error_perl_extension, m_position - m_base);
            return false;
         }
         if(++m_position == m_end)
         {
            // Rewind to start of (? sequence:
            --m_position;
            while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
            fail(regex_constants::error_perl_extension, m_position - m_base);
            return false;
         }
      }
      else
      {
         // verify that we have a lookahead or lookbehind assert:
         if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_question)
         {
            // Rewind to start of (? sequence:
            --m_position;
            while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
            fail(regex_constants::error_perl_extension, m_position - m_base);
            return false;
         }
         if(++m_position == m_end)
         {
            // Rewind to start of (? sequence:
            --m_position;
            while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
            fail(regex_constants::error_perl_extension, m_position - m_base);
            return false;
         }
         if(this->m_traits.syntax_type(*m_position) == regex_constants::escape_type_left_word)
         {
            if(++m_position == m_end)
            {
               // Rewind to start of (? sequence:
               --m_position;
               while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
               fail(regex_constants::error_perl_extension, m_position - m_base);
               return false;
            }
            if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
               && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
            {
               // Rewind to start of (? sequence:
               --m_position;
               while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
               fail(regex_constants::error_perl_extension, m_position - m_base);
               return false;
            }
            // step back over the three characters just inspected, to the '(' that
            // introduces the assertion, so the parse_all() call below starts there:
            m_position -= 3;
         }
         else
         {
            if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
               && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
            {
               // Rewind to start of (? sequence:
               --m_position;
               while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
               fail(regex_constants::error_perl_extension, m_position - m_base);
               return false;
            }
            // as above, but only two characters were inspected after the '(':
            m_position -= 2;
         }
      }
      break;
      }
   case regex_constants::syntax_close_mark:
      // an empty "(?)" is not a valid extension.
      // Rewind to start of (? sequence:
      --m_position;
      while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
      fail(regex_constants::error_perl_extension, m_position - m_base);
      return false;
   case regex_constants::escape_type_end_buffer:
      {
      // a named capture whose name is terminated by a repeat of the selecting character:
      name_delim = *m_position;
      // Also entered from the "(?<" case above, with name_delim preset to '>'.
named_capture_jump:
      markid = 0;
      if(0 == (this->flags() & regbase::nosubs))
      {
         markid = ++m_mark_count;
   #ifndef BOOST_NO_STD_DISTANCE
         if(this->flags() & regbase::save_subexpression_location)
            this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>(std::distance(m_base, m_position) - 2, 0));
   #else
         if(this->flags() & regbase::save_subexpression_location)
            this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>((m_position - m_base) - 2, 0));
   #endif
      }
      pb->index = markid;
      // base marks the first character of the name:
      const charT* base = ++m_position;
      if(m_position == m_end)
      {
         // Rewind to start of (? sequence:
         --m_position;
         while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
         fail(regex_constants::error_perl_extension, m_position - m_base);
         return false;
      }
      while((m_position != m_end) && (*m_position != name_delim))
         ++m_position;
      if(m_position == m_end)
      {
         // Rewind to start of (? sequence:
         --m_position;
         while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
         fail(regex_constants::error_perl_extension, m_position - m_base);
         return false;
      }
      // register [base, m_position) as the name of group markid, then skip the delimiter:
      this->m_pdata->set_name(base, m_position, markid);
      ++m_position;
      break;
      }
   default:
      if(*m_position == charT('R'))
      {
         // "(?R)": recursion with target 0
         ++m_position;
         v = 0;
         if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
         {
            // Rewind to start of (? sequence:
            --m_position;
            while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
            fail(regex_constants::error_perl_extension, m_position - m_base);
            return false;
         }
         goto insert_recursion;
      }
      if(*m_position == charT('&'))
      {
         // "(?&name)": recursion to a named group, identified by the hash of its name
         ++m_position;
         const charT* base = m_position;
         while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
            ++m_position;
         if(m_position == m_end)
         {
            // Rewind to start of (? sequence:
            --m_position;
            while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
            fail(regex_constants::error_perl_extension, m_position - m_base);
            return false;
         }
         v = static_cast<int>(hash_value_from_capture_name(base, m_position));
         goto insert_recursion;
      }
      if(*m_position == charT('P'))
      {
         // "(?P>name)": alternative spelling of recursion to a named group.
         // Any other character after the 'P' falls through to the option parser below.
         ++m_position;
         if(m_position == m_end)
         {
            // Rewind to start of (? sequence:
            --m_position;
            while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
            fail(regex_constants::error_perl_extension, m_position - m_base);
            return false;
         }
         if(*m_position == charT('>'))
         {
            ++m_position;
            const charT* base = m_position;
            while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
               ++m_position;
            if(m_position == m_end)
            {
               // Rewind to start of (? sequence:
               --m_position;
               while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
               fail(regex_constants::error_perl_extension, m_position - m_base);
               return false;
            }
            v = static_cast<int>(hash_value_from_capture_name(base, m_position));
            goto insert_recursion;
         }
      }
      //
      // lets assume that we have a (?imsx) group and try and parse it:
      //
      // Also entered from the '-' case above when no number follows the dash.
option_group_jump:
      regex_constants::syntax_option_type opts = parse_options();
      if(m_position == m_end)
      {
         // Rewind to start of (? sequence:
         --m_position;
         while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
         fail(regex_constants::error_perl_extension, m_position - m_base);
         return false;
      }
      // make a note of whether we have a case change:
      m_has_case_change = ((opts & regbase::icase) != (this->flags() & regbase::icase));
      pb->index = markid = 0;
      if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark)
      {
         // update flags and carry on as normal:
         this->flags(opts);
         // the new flags stay in force after this group closes:
         restore_flags = false;
         old_case_change |= m_has_case_change; // defer end of scope by one ')'
      }
      else if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_colon)
      {
         // update flags and carry on until the matching ')' is found:
         this->flags(opts);
         ++m_position;
      }
      else
      {
         // Rewind to start of (? sequence:
         --m_position;
         while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
         fail(regex_constants::error_perl_extension, m_position - m_base);
         return false;
      }

      // finally append a case change state if we need it:
      if(m_has_case_change)
      {
         static_cast<re_case*>(
            this->append_state(syntax_element_toggle_case, sizeof(re_case))
            )->icase = opts & regbase::icase;
      }

   }
   //
   // now recursively add more states, this will terminate when we get to a
   // matching ')' :
   //
   parse_all();
   //
   // Unwind alternatives:
   //
   if(0 == unwind_alts(last_paren_start))
   {
      // Rewind to start of (? sequence:
      --m_position;
      while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
      fail(regex_constants::error_perl_extension, m_position - m_base, "Invalid alternation operators within (?...) block.");
      return false;
   }
   //
   // we either have a ')' or we have run out of characters prematurely:
   //
   if(m_position == m_end)
   {
      // Rewind to start of (? sequence:
      --m_position;
      while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
      // note: the position reported here is the end of the expression, not the rewound position:
      this->fail(regex_constants::error_paren, ::boost::BOOST_REGEX_DETAIL_NS::distance(m_base, m_end));
      return false;
   }
   BOOST_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark);
   ++m_position;
   //
   // restore the flags:
   //
   if(restore_flags)
   {
      // append a case change state if we need it:
      if(m_has_case_change)
      {
         static_cast<re_case*>(
            this->append_state(syntax_element_toggle_case, sizeof(re_case))
            )->icase = old_flags & regbase::icase;
      }
      this->flags(old_flags);
   }
   //
   // set up the jump pointer if we have one:
   //
   if(jump_offset)
   {
      this->m_pdata->m_data.align();
      re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
      // the jump target is stored relative to the jump state itself and points at the current end of the data:
      jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp);
      if((this->m_last_state == jmp) && (markid != -2))
      {
         // Oops... we didn't have anything inside the assertion.
         // Note we don't get here for negated forward lookahead as (?!)
         // does have some uses.
         // Rewind to start of (? sequence:
         --m_position;
         while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
         fail(regex_constants::error_perl_extension, m_position - m_base, "Invalid or empty zero width assertion.");
         return false;
      }
   }
   //
   // verify that if this is conditional expression, that we do have
   // an alternative, if not add one:
   //
   if(markid == -4)
   {
      re_syntax_base* b = this->getaddress(expected_alt_point);
      // Make sure we have exactly one alternative following this state:
      if(b->type != syntax_element_alt)
      {
         re_alt* alt = static_cast<re_alt*>(this->insert_state(expected_alt_point, syntax_element_alt, sizeof(re_alt)));
         alt->alt.i = this->m_pdata->m_data.size() - this->getoffset(alt);
      }
      else if(((std::ptrdiff_t)this->m_pdata->m_data.size() > (static_cast<re_alt*>(b)->alt.i + this->getoffset(b))) && (static_cast<re_alt*>(b)->alt.i > 0) && this->getaddress(static_cast<re_alt*>(b)->alt.i, b)->type == syntax_element_alt)
      {
         // Can't have seen more than one alternative:
         // Rewind to start of (? sequence:
         --m_position;
         while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
         fail(regex_constants::error_bad_pattern, m_position - m_base, "More than one alternation operator | was encountered inside a conditional expression.");
         return false;
      }
      else
      {
         // We must *not* have seen an alternative inside a (DEFINE) block:
         b = this->getaddress(b->next.i, b);
         // 9999 is the index stored for the DEFINE condition above:
         if((b->type == syntax_element_assert_backref) && (static_cast<re_brace*>(b)->index == 9999))
         {
            // Rewind to start of (? sequence:
            --m_position;
            while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
            fail(regex_constants::error_bad_pattern, m_position - m_base, "Alternation operators are not allowed inside a DEFINE block.");
            return false;
         }
      }
      // check for invalid repetition of next state:
      b = this->getaddress(expected_alt_point);
      b = this->getaddress(static_cast<re_alt*>(b)->next.i, b);
      if((b->type != syntax_element_assert_backref)
         && (b->type != syntax_element_startmark))
      {
         // Rewind to start of (? sequence:
         --m_position;
         while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
         fail(regex_constants::error_badrepeat, m_position - m_base, "A repetition operator cannot be applied to a zero-width assertion.");
         return false;
      }
   }
   //
   // append closing parenthesis state:
   //
   pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
   pb->index = markid;
   pb->icase = this->flags() & regbase::icase;
   this->m_paren_start = last_paren_start;
   //
   // restore the alternate insertion point:
   //
   this->m_alt_insert_point = last_alt_point;
   //
   // and the case change data:
   //
   m_has_case_change = old_case_change;
   //
   // And the mark_reset data:
   //
   if(m_max_mark > m_mark_count)
   {
      m_mark_count = m_max_mark;
   }
   m_mark_reset = mark_reset;
   m_max_mark = max_mark;


   if(markid > 0)
   {
      // record where the capturing group ends, when location tracking was requested:
#ifndef BOOST_NO_STD_DISTANCE
      if(this->flags() & regbase::save_subexpression_location)
         this->m_pdata->m_subs.at(markid - 1).second = std::distance(m_base, m_position) - 1;
#else
      if(this->flags() & regbase::save_subexpression_location)
         this->m_pdata->m_subs.at(markid - 1).second = (m_position - m_base) - 1;
#endif
      //
      // allow backrefs to this mark:
      //
      // (m_backrefs is used as a bit mask here, so only marks that fit in an unsigned are recorded)
      if(markid < (int)(sizeof(unsigned) * CHAR_BIT))
         this->m_backrefs |= 1u << (markid - 1);
   }
   return true;
}
2678
2679template <class charT, class traits>
2680bool basic_regex_parser<charT, traits>::match_verb(const char* verb)
2681{
2682 while(*verb)
2683 {
2684 if(static_cast<charT>(*verb) != *m_position)
2685 {
2686 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2687 fail(regex_constants::error_perl_extension, m_position - m_base);
2688 return false;
2689 }
2690 if(++m_position == m_end)
2691 {
2692 --m_position;
2693 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2694 fail(regex_constants::error_perl_extension, m_position - m_base);
2695 return false;
2696 }
2697 ++verb;
2698 }
2699 return true;
2700}
2701
2702template <class charT, class traits>
2703bool basic_regex_parser<charT, traits>::parse_perl_verb()
2704{
2705 if(++m_position == m_end)
2706 {
2707 // Rewind to start of (* sequence:
2708 --m_position;
2709 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2710 fail(regex_constants::error_perl_extension, m_position - m_base);
2711 return false;
2712 }
2713 switch(*m_position)
2714 {
2715 case 'F':
2716 if(++m_position == m_end)
2717 {
2718 // Rewind to start of (* sequence:
2719 --m_position;
2720 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2721 fail(regex_constants::error_perl_extension, m_position - m_base);
2722 return false;
2723 }
2724 if((this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark) || match_verb("AIL"))
2725 {
2726 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2727 {
2728 // Rewind to start of (* sequence:
2729 --m_position;
2730 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2731 fail(regex_constants::error_perl_extension, m_position - m_base);
2732 return false;
2733 }
2734 ++m_position;
2735 this->append_state(syntax_element_fail);
2736 return true;
2737 }
2738 break;
2739 case 'A':
2740 if(++m_position == m_end)
2741 {
2742 // Rewind to start of (* sequence:
2743 --m_position;
2744 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2745 fail(regex_constants::error_perl_extension, m_position - m_base);
2746 return false;
2747 }
2748 if(match_verb("CCEPT"))
2749 {
2750 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2751 {
2752 // Rewind to start of (* sequence:
2753 --m_position;
2754 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2755 fail(regex_constants::error_perl_extension, m_position - m_base);
2756 return false;
2757 }
2758 ++m_position;
2759 this->append_state(syntax_element_accept);
2760 return true;
2761 }
2762 break;
2763 case 'C':
2764 if(++m_position == m_end)
2765 {
2766 // Rewind to start of (* sequence:
2767 --m_position;
2768 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2769 fail(regex_constants::error_perl_extension, m_position - m_base);
2770 return false;
2771 }
2772 if(match_verb("OMMIT"))
2773 {
2774 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2775 {
2776 // Rewind to start of (* sequence:
2777 --m_position;
2778 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2779 fail(regex_constants::error_perl_extension, m_position - m_base);
2780 return false;
2781 }
2782 ++m_position;
2783 static_cast<re_commit*>(this->append_state(syntax_element_commit, sizeof(re_commit)))->action = commit_commit;
2784 this->m_pdata->m_disable_match_any = true;
2785 return true;
2786 }
2787 break;
2788 case 'P':
2789 if(++m_position == m_end)
2790 {
2791 // Rewind to start of (* sequence:
2792 --m_position;
2793 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2794 fail(regex_constants::error_perl_extension, m_position - m_base);
2795 return false;
2796 }
2797 if(match_verb("RUNE"))
2798 {
2799 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2800 {
2801 // Rewind to start of (* sequence:
2802 --m_position;
2803 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2804 fail(regex_constants::error_perl_extension, m_position - m_base);
2805 return false;
2806 }
2807 ++m_position;
2808 static_cast<re_commit*>(this->append_state(syntax_element_commit, sizeof(re_commit)))->action = commit_prune;
2809 this->m_pdata->m_disable_match_any = true;
2810 return true;
2811 }
2812 break;
2813 case 'S':
2814 if(++m_position == m_end)
2815 {
2816 // Rewind to start of (* sequence:
2817 --m_position;
2818 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2819 fail(regex_constants::error_perl_extension, m_position - m_base);
2820 return false;
2821 }
2822 if(match_verb("KIP"))
2823 {
2824 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2825 {
2826 // Rewind to start of (* sequence:
2827 --m_position;
2828 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2829 fail(regex_constants::error_perl_extension, m_position - m_base);
2830 return false;
2831 }
2832 ++m_position;
2833 static_cast<re_commit*>(this->append_state(syntax_element_commit, sizeof(re_commit)))->action = commit_skip;
2834 this->m_pdata->m_disable_match_any = true;
2835 return true;
2836 }
2837 break;
2838 case 'T':
2839 if(++m_position == m_end)
2840 {
2841 // Rewind to start of (* sequence:
2842 --m_position;
2843 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2844 fail(regex_constants::error_perl_extension, m_position - m_base);
2845 return false;
2846 }
2847 if(match_verb("HEN"))
2848 {
2849 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2850 {
2851 // Rewind to start of (* sequence:
2852 --m_position;
2853 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2854 fail(regex_constants::error_perl_extension, m_position - m_base);
2855 return false;
2856 }
2857 ++m_position;
2858 this->append_state(syntax_element_then);
2859 this->m_pdata->m_disable_match_any = true;
2860 return true;
2861 }
2862 break;
2863 }
2864 // Rewind to start of (* sequence:
2865 --m_position;
2866 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2867 fail(regex_constants::error_perl_extension, m_position - m_base);
2868 return false;
2869}
2870
2871template <class charT, class traits>
2872bool basic_regex_parser<charT, traits>::add_emacs_code(bool negate)
2873{
2874 //
2875 // parses an emacs style \sx or \Sx construct.
2876 //
2877 if(++m_position == m_end)
2878 {
2879 // Rewind to start of sequence:
2880 --m_position;
2881 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
2882 fail(regex_constants::error_escape, m_position - m_base);
2883 return false;
2884 }
2885 basic_char_set<charT, traits> char_set;
2886 if(negate)
2887 char_set.negate();
2888
2889 static const charT s_punct[5] = { 'p', 'u', 'n', 'c', 't', };
2890
2891 switch(*m_position)
2892 {
2893 case 's':
2894 case ' ':
2895 char_set.add_class(this->m_mask_space);
2896 break;
2897 case 'w':
2898 char_set.add_class(this->m_word_mask);
2899 break;
2900 case '_':
2901 char_set.add_single(digraph<charT>(charT('$')));
2902 char_set.add_single(digraph<charT>(charT('&')));
2903 char_set.add_single(digraph<charT>(charT('*')));
2904 char_set.add_single(digraph<charT>(charT('+')));
2905 char_set.add_single(digraph<charT>(charT('-')));
2906 char_set.add_single(digraph<charT>(charT('_')));
2907 char_set.add_single(digraph<charT>(charT('<')));
2908 char_set.add_single(digraph<charT>(charT('>')));
2909 break;
2910 case '.':
2911 char_set.add_class(this->m_traits.lookup_classname(s_punct, s_punct+5));
2912 break;
2913 case '(':
2914 char_set.add_single(digraph<charT>(charT('(')));
2915 char_set.add_single(digraph<charT>(charT('[')));
2916 char_set.add_single(digraph<charT>(charT('{')));
2917 break;
2918 case ')':
2919 char_set.add_single(digraph<charT>(charT(')')));
2920 char_set.add_single(digraph<charT>(charT(']')));
2921 char_set.add_single(digraph<charT>(charT('}')));
2922 break;
2923 case '"':
2924 char_set.add_single(digraph<charT>(charT('"')));
2925 char_set.add_single(digraph<charT>(charT('\'')));
2926 char_set.add_single(digraph<charT>(charT('`')));
2927 break;
2928 case '\'':
2929 char_set.add_single(digraph<charT>(charT('\'')));
2930 char_set.add_single(digraph<charT>(charT(',')));
2931 char_set.add_single(digraph<charT>(charT('#')));
2932 break;
2933 case '<':
2934 char_set.add_single(digraph<charT>(charT(';')));
2935 break;
2936 case '>':
2937 char_set.add_single(digraph<charT>(charT('\n')));
2938 char_set.add_single(digraph<charT>(charT('\f')));
2939 break;
2940 default:
2941 fail(regex_constants::error_ctype, m_position - m_base);
2942 return false;
2943 }
2944 if(0 == this->append_set(char_set))
2945 {
2946 fail(regex_constants::error_ctype, m_position - m_base);
2947 return false;
2948 }
2949 ++m_position;
2950 return true;
2951}
2952
2953template <class charT, class traits>
2954regex_constants::syntax_option_type basic_regex_parser<charT, traits>::parse_options()
2955{
2956 // we have a (?imsx-imsx) group, convert it into a set of flags:
2957 regex_constants::syntax_option_type f = this->flags();
2958 bool breakout = false;
2959 do
2960 {
2961 switch(*m_position)
2962 {
2963 case 's':
2964 f |= regex_constants::mod_s;
2965 f &= ~regex_constants::no_mod_s;
2966 break;
2967 case 'm':
2968 f &= ~regex_constants::no_mod_m;
2969 break;
2970 case 'i':
2971 f |= regex_constants::icase;
2972 break;
2973 case 'x':
2974 f |= regex_constants::mod_x;
2975 break;
2976 default:
2977 breakout = true;
2978 continue;
2979 }
2980 if(++m_position == m_end)
2981 {
2982 // Rewind to start of (? sequence:
2983 --m_position;
2984 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2985 fail(regex_constants::error_paren, m_position - m_base);
2986 return false;
2987 }
2988 }
2989 while(!breakout);
2990
2991 breakout = false;
2992
2993 if(*m_position == static_cast<charT>('-'))
2994 {
2995 if(++m_position == m_end)
2996 {
2997 // Rewind to start of (? sequence:
2998 --m_position;
2999 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
3000 fail(regex_constants::error_paren, m_position - m_base);
3001 return false;
3002 }
3003 do
3004 {
3005 switch(*m_position)
3006 {
3007 case 's':
3008 f &= ~regex_constants::mod_s;
3009 f |= regex_constants::no_mod_s;
3010 break;
3011 case 'm':
3012 f |= regex_constants::no_mod_m;
3013 break;
3014 case 'i':
3015 f &= ~regex_constants::icase;
3016 break;
3017 case 'x':
3018 f &= ~regex_constants::mod_x;
3019 break;
3020 default:
3021 breakout = true;
3022 continue;
3023 }
3024 if(++m_position == m_end)
3025 {
3026 // Rewind to start of (? sequence:
3027 --m_position;
3028 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
3029 fail(regex_constants::error_paren, m_position - m_base);
3030 return false;
3031 }
3032 }
3033 while(!breakout);
3034 }
3035 return f;
3036}
3037
3038template <class charT, class traits>
3039bool basic_regex_parser<charT, traits>::unwind_alts(std::ptrdiff_t last_paren_start)
3040{
3041 //
3042 // If we didn't actually add any states after the last
3043 // alternative then that's an error:
3044 //
3045 if((this->m_alt_insert_point == static_cast<std::ptrdiff_t>(this->m_pdata->m_data.size()))
3046 && m_alt_jumps.size() && (m_alt_jumps.back() > last_paren_start)
3047 &&
3048 !(
3049 ((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group)
3050 &&
3051 ((this->flags() & regbase::no_empty_expressions) == 0)
3052 )
3053 )
3054 {
3055 fail(regex_constants::error_empty, this->m_position - this->m_base, "Can't terminate a sub-expression with an alternation operator |.");
3056 return false;
3057 }
3058 //
3059 // Fix up our alternatives:
3060 //
3061 while(m_alt_jumps.size() && (m_alt_jumps.back() > last_paren_start))
3062 {
3063 //
3064 // fix up the jump to point to the end of the states
3065 // that we've just added:
3066 //
3067 std::ptrdiff_t jump_offset = m_alt_jumps.back();
3068 m_alt_jumps.pop_back();
3069 this->m_pdata->m_data.align();
3070 re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
3071 BOOST_ASSERT(jmp->type == syntax_element_jump);
3072 jmp->alt.i = this->m_pdata->m_data.size() - jump_offset;
3073 }
3074 return true;
3075}
3076
3077#ifdef BOOST_MSVC
3078#pragma warning(pop)
3079#endif
3080
3081} // namespace BOOST_REGEX_DETAIL_NS
3082} // namespace boost
3083
3084#ifdef BOOST_MSVC
3085#pragma warning(push)
3086#pragma warning(disable: 4103)
3087#endif
3088#ifdef BOOST_HAS_ABI_HEADERS
3089# include BOOST_ABI_SUFFIX
3090#endif
3091#ifdef BOOST_MSVC
3092#pragma warning(pop)
3093#endif
3094
3095#endif
3096