1 | /* |
2 | * |
3 | * Copyright (c) 2002 |
4 | * John Maddock |
5 | * |
6 | * Use, modification and distribution are subject to the |
7 | * Boost Software License, Version 1.0. (See accompanying file |
8 | * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
9 | * |
10 | */ |
11 | |
12 | #ifndef BOOST_REGEX_MATCHER_HPP |
13 | #define BOOST_REGEX_MATCHER_HPP |
14 | |
15 | #include <boost/regex/v4/iterator_category.hpp> |
16 | |
17 | #ifdef BOOST_MSVC |
18 | #pragma warning(push) |
19 | #pragma warning(disable: 4103) |
20 | #endif |
21 | #ifdef BOOST_HAS_ABI_HEADERS |
22 | # include BOOST_ABI_PREFIX |
23 | #endif |
24 | #ifdef BOOST_MSVC |
25 | #pragma warning(pop) |
26 | #endif |
27 | |
28 | #ifdef BOOST_MSVC |
29 | # pragma warning(push) |
30 | #if BOOST_MSVC < 1910 |
31 | #pragma warning(disable:4800) |
32 | #endif |
33 | #endif |
34 | |
35 | namespace boost{ |
36 | namespace BOOST_REGEX_DETAIL_NS{ |
37 | |
38 | // |
39 | // error checking API: |
40 | // |
41 | BOOST_REGEX_DECL void BOOST_REGEX_CALL verify_options(boost::regex_constants::syntax_option_type ef, match_flag_type mf); |
42 | // |
43 | // function can_start: |
44 | // |
45 | template <class charT> |
46 | inline bool can_start(charT c, const unsigned char* map, unsigned char mask) |
47 | { |
48 | return ((c < static_cast<charT>(0)) ? true : ((c >= static_cast<charT>(1 << CHAR_BIT)) ? true : map[c] & mask)); |
49 | } |
50 | inline bool can_start(char c, const unsigned char* map, unsigned char mask) |
51 | { |
52 | return map[(unsigned char)c] & mask; |
53 | } |
54 | inline bool can_start(signed char c, const unsigned char* map, unsigned char mask) |
55 | { |
56 | return map[(unsigned char)c] & mask; |
57 | } |
58 | inline bool can_start(unsigned char c, const unsigned char* map, unsigned char mask) |
59 | { |
60 | return map[c] & mask; |
61 | } |
62 | inline bool can_start(unsigned short c, const unsigned char* map, unsigned char mask) |
63 | { |
64 | return ((c >= (1 << CHAR_BIT)) ? true : map[c] & mask); |
65 | } |
66 | #if !defined(__hpux) && !defined(__WINSCW__)// WCHAR_MIN not usable in pp-directives. |
67 | #if defined(WCHAR_MIN) && (WCHAR_MIN == 0) && !defined(BOOST_NO_INTRINSIC_WCHAR_T) |
68 | inline bool can_start(wchar_t c, const unsigned char* map, unsigned char mask) |
69 | { |
70 | return ((c >= static_cast<wchar_t>(1u << CHAR_BIT)) ? true : map[c] & mask); |
71 | } |
72 | #endif |
73 | #endif |
74 | #if !defined(BOOST_NO_INTRINSIC_WCHAR_T) |
75 | inline bool can_start(unsigned int c, const unsigned char* map, unsigned char mask) |
76 | { |
77 | return (((c >= static_cast<unsigned int>(1u << CHAR_BIT)) ? true : map[c] & mask)); |
78 | } |
79 | #endif |
80 | |
81 | |
82 | // |
83 | // Unfortunately Rogue Waves standard library appears to have a bug |
84 | // in std::basic_string::compare that results in eroneous answers |
85 | // in some cases (tested with Borland C++ 5.1, Rogue Wave lib version |
86 | // 0x020101) the test case was: |
87 | // {39135,0} < {0xff,0} |
88 | // which succeeds when it should not. |
89 | // |
90 | #ifndef _RWSTD_VER |
91 | template <class C, class T, class A> |
92 | inline int string_compare(const std::basic_string<C,T,A>& s, const C* p) |
93 | { |
94 | if(0 == *p) |
95 | { |
96 | if(s.empty() || ((s.size() == 1) && (s[0] == 0))) |
97 | return 0; |
98 | } |
99 | return s.compare(p); |
100 | } |
101 | #else |
102 | template <class C, class T, class A> |
103 | inline int string_compare(const std::basic_string<C,T,A>& s, const C* p) |
104 | { |
105 | if(0 == *p) |
106 | { |
107 | if(s.empty() || ((s.size() == 1) && (s[0] == 0))) |
108 | return 0; |
109 | } |
110 | return s.compare(p); |
111 | } |
112 | inline int string_compare(const std::string& s, const char* p) |
113 | { return std::strcmp(s.c_str(), p); } |
114 | # ifndef BOOST_NO_WREGEX |
115 | inline int string_compare(const std::wstring& s, const wchar_t* p) |
116 | { return std::wcscmp(s.c_str(), p); } |
117 | #endif |
118 | #endif |
119 | template <class Seq, class C> |
120 | inline int string_compare(const Seq& s, const C* p) |
121 | { |
122 | std::size_t i = 0; |
123 | while((i < s.size()) && (p[i] == s[i])) |
124 | { |
125 | ++i; |
126 | } |
127 | return (i == s.size()) ? -(int)p[i] : (int)s[i] - (int)p[i]; |
128 | } |
129 | # define STR_COMP(s,p) string_compare(s,p) |
130 | |
131 | template<class charT> |
132 | inline const charT* re_skip_past_null(const charT* p) |
133 | { |
134 | while (*p != static_cast<charT>(0)) ++p; |
135 | return ++p; |
136 | } |
137 | |
138 | template <class iterator, class charT, class traits_type, class char_classT> |
139 | iterator BOOST_REGEX_CALL re_is_set_member(iterator next, |
140 | iterator last, |
141 | const re_set_long<char_classT>* set_, |
142 | const regex_data<charT, traits_type>& e, bool icase) |
143 | { |
144 | const charT* p = reinterpret_cast<const charT*>(set_+1); |
145 | iterator ptr; |
146 | unsigned int i; |
147 | //bool icase = e.m_flags & regex_constants::icase; |
148 | |
149 | if(next == last) return next; |
150 | |
151 | typedef typename traits_type::string_type traits_string_type; |
152 | const ::boost::regex_traits_wrapper<traits_type>& traits_inst = *(e.m_ptraits); |
153 | |
154 | // dwa 9/13/00 suppress incorrect MSVC warning - it claims this is never |
155 | // referenced |
156 | (void)traits_inst; |
157 | |
158 | // try and match a single character, could be a multi-character |
159 | // collating element... |
160 | for(i = 0; i < set_->csingles; ++i) |
161 | { |
162 | ptr = next; |
163 | if(*p == static_cast<charT>(0)) |
164 | { |
165 | // treat null string as special case: |
166 | if(traits_inst.translate(*ptr, icase)) |
167 | { |
168 | ++p; |
169 | continue; |
170 | } |
171 | return set_->isnot ? next : (ptr == next) ? ++next : ptr; |
172 | } |
173 | else |
174 | { |
175 | while(*p && (ptr != last)) |
176 | { |
177 | if(traits_inst.translate(*ptr, icase) != *p) |
178 | break; |
179 | ++p; |
180 | ++ptr; |
181 | } |
182 | |
183 | if(*p == static_cast<charT>(0)) // if null we've matched |
184 | return set_->isnot ? next : (ptr == next) ? ++next : ptr; |
185 | |
186 | p = re_skip_past_null(p); // skip null |
187 | } |
188 | } |
189 | |
190 | charT col = traits_inst.translate(*next, icase); |
191 | |
192 | |
193 | if(set_->cranges || set_->cequivalents) |
194 | { |
195 | traits_string_type s1; |
196 | // |
197 | // try and match a range, NB only a single character can match |
198 | if(set_->cranges) |
199 | { |
200 | if((e.m_flags & regex_constants::collate) == 0) |
201 | s1.assign(1, col); |
202 | else |
203 | { |
204 | charT a[2] = { col, charT(0), }; |
205 | s1 = traits_inst.transform(a, a + 1); |
206 | } |
207 | for(i = 0; i < set_->cranges; ++i) |
208 | { |
209 | if(STR_COMP(s1, p) >= 0) |
210 | { |
211 | do{ ++p; }while(*p); |
212 | ++p; |
213 | if(STR_COMP(s1, p) <= 0) |
214 | return set_->isnot ? next : ++next; |
215 | } |
216 | else |
217 | { |
218 | // skip first string |
219 | do{ ++p; }while(*p); |
220 | ++p; |
221 | } |
222 | // skip second string |
223 | do{ ++p; }while(*p); |
224 | ++p; |
225 | } |
226 | } |
227 | // |
228 | // try and match an equivalence class, NB only a single character can match |
229 | if(set_->cequivalents) |
230 | { |
231 | charT a[2] = { col, charT(0), }; |
232 | s1 = traits_inst.transform_primary(a, a +1); |
233 | for(i = 0; i < set_->cequivalents; ++i) |
234 | { |
235 | if(STR_COMP(s1, p) == 0) |
236 | return set_->isnot ? next : ++next; |
237 | // skip string |
238 | do{ ++p; }while(*p); |
239 | ++p; |
240 | } |
241 | } |
242 | } |
243 | if(traits_inst.isctype(col, set_->cclasses) == true) |
244 | return set_->isnot ? next : ++next; |
245 | if((set_->cnclasses != 0) && (traits_inst.isctype(col, set_->cnclasses) == false)) |
246 | return set_->isnot ? next : ++next; |
247 | return set_->isnot ? ++next : next; |
248 | } |
249 | |
250 | template <class BidiIterator> |
251 | class repeater_count |
252 | { |
253 | repeater_count** stack; |
254 | repeater_count* next; |
255 | int state_id; |
256 | std::size_t count; // the number of iterations so far |
257 | BidiIterator start_pos; // where the last repeat started |
258 | |
259 | repeater_count* unwind_until(int n, repeater_count* p, int current_recursion_id) |
260 | { |
261 | while(p && (p->state_id != n)) |
262 | { |
263 | if(-2 - current_recursion_id == p->state_id) |
264 | return 0; |
265 | p = p->next; |
266 | if(p && (p->state_id < 0)) |
267 | { |
268 | p = unwind_until(p->state_id, p, current_recursion_id); |
269 | if(!p) |
270 | return p; |
271 | p = p->next; |
272 | } |
273 | } |
274 | return p; |
275 | } |
276 | public: |
277 | repeater_count(repeater_count** s) : stack(s), next(0), state_id(-1), count(0), start_pos() {} |
278 | |
279 | repeater_count(int i, repeater_count** s, BidiIterator start, int current_recursion_id) |
280 | : start_pos(start) |
281 | { |
282 | state_id = i; |
283 | stack = s; |
284 | next = *stack; |
285 | *stack = this; |
286 | if((state_id > next->state_id) && (next->state_id >= 0)) |
287 | count = 0; |
288 | else |
289 | { |
290 | repeater_count* p = next; |
291 | p = unwind_until(state_id, p, current_recursion_id); |
292 | if(p) |
293 | { |
294 | count = p->count; |
295 | start_pos = p->start_pos; |
296 | } |
297 | else |
298 | count = 0; |
299 | } |
300 | } |
301 | ~repeater_count() |
302 | { |
303 | if(next) |
304 | *stack = next; |
305 | } |
306 | std::size_t get_count() { return count; } |
307 | int get_id() { return state_id; } |
308 | std::size_t operator++() { return ++count; } |
309 | bool check_null_repeat(const BidiIterator& pos, std::size_t max) |
310 | { |
311 | // this is called when we are about to start a new repeat, |
312 | // if the last one was NULL move our count to max, |
313 | // otherwise save the current position. |
314 | bool result = (count == 0) ? false : (pos == start_pos); |
315 | if(result) |
316 | count = max; |
317 | else |
318 | start_pos = pos; |
319 | return result; |
320 | } |
321 | }; |
322 | |
323 | struct saved_state; |
324 | |
325 | enum saved_state_type |
326 | { |
327 | saved_type_end = 0, |
328 | saved_type_paren = 1, |
329 | saved_type_recurse = 2, |
330 | saved_type_assertion = 3, |
331 | saved_state_alt = 4, |
332 | saved_state_repeater_count = 5, |
333 | = 6, |
334 | saved_state_greedy_single_repeat = 7, |
335 | saved_state_rep_slow_dot = 8, |
336 | saved_state_rep_fast_dot = 9, |
337 | saved_state_rep_char = 10, |
338 | saved_state_rep_short_set = 11, |
339 | saved_state_rep_long_set = 12, |
340 | saved_state_non_greedy_long_repeat = 13, |
341 | saved_state_count = 14 |
342 | }; |
343 | |
344 | template <class Results> |
345 | struct recursion_info |
346 | { |
347 | typedef typename Results::value_type value_type; |
348 | typedef typename value_type::iterator iterator; |
349 | int idx; |
350 | const re_syntax_base* preturn_address; |
351 | Results results; |
352 | repeater_count<iterator>* repeater_stack; |
353 | iterator location_of_start; |
354 | }; |
355 | |
356 | #ifdef BOOST_MSVC |
357 | #pragma warning(push) |
358 | #pragma warning(disable : 4251) |
359 | #if BOOST_MSVC < 1700 |
360 | # pragma warning(disable : 4231) |
361 | #endif |
362 | # if BOOST_MSVC < 1600 |
363 | # pragma warning(disable : 4660) |
364 | # endif |
365 | #endif |
366 | |
367 | template <class BidiIterator, class Allocator, class traits> |
368 | class perl_matcher |
369 | { |
370 | public: |
371 | typedef typename traits::char_type char_type; |
372 | typedef perl_matcher<BidiIterator, Allocator, traits> self_type; |
373 | typedef bool (self_type::*matcher_proc_type)(void); |
374 | typedef std::size_t traits_size_type; |
375 | typedef typename is_byte<char_type>::width_type width_type; |
376 | typedef typename regex_iterator_traits<BidiIterator>::difference_type difference_type; |
377 | typedef match_results<BidiIterator, Allocator> results_type; |
378 | |
379 | perl_matcher(BidiIterator first, BidiIterator end, |
380 | match_results<BidiIterator, Allocator>& what, |
381 | const basic_regex<char_type, traits>& e, |
382 | match_flag_type f, |
383 | BidiIterator l_base) |
384 | : m_result(what), base(first), last(end), |
385 | position(first), backstop(l_base), re(e), traits_inst(e.get_traits()), |
386 | m_independent(false), next_count(&rep_obj), rep_obj(&next_count) |
387 | #ifdef BOOST_REGEX_NON_RECURSIVE |
388 | , m_recursions(0) |
389 | #endif |
390 | { |
391 | construct_init(e, f); |
392 | } |
393 | |
394 | bool match(); |
395 | bool find(); |
396 | |
397 | void setf(match_flag_type f) |
398 | { m_match_flags |= f; } |
399 | void unsetf(match_flag_type f) |
400 | { m_match_flags &= ~f; } |
401 | |
402 | private: |
403 | void construct_init(const basic_regex<char_type, traits>& e, match_flag_type f); |
404 | |
405 | bool find_imp(); |
406 | bool match_imp(); |
407 | #ifdef BOOST_REGEX_HAS_MS_STACK_GUARD |
408 | typedef bool (perl_matcher::*protected_proc_type)(); |
409 | bool protected_call(protected_proc_type); |
410 | #endif |
411 | void estimate_max_state_count(std::random_access_iterator_tag*); |
412 | void estimate_max_state_count(void*); |
413 | bool match_prefix(); |
414 | bool match_all_states(); |
415 | |
416 | // match procs, stored in s_match_vtable: |
417 | bool match_startmark(); |
418 | bool match_endmark(); |
419 | bool match_literal(); |
420 | bool match_start_line(); |
421 | bool match_end_line(); |
422 | bool match_wild(); |
423 | bool match_match(); |
424 | bool match_word_boundary(); |
425 | bool match_within_word(); |
426 | bool match_word_start(); |
427 | bool match_word_end(); |
428 | bool match_buffer_start(); |
429 | bool match_buffer_end(); |
430 | bool match_backref(); |
431 | bool match_long_set(); |
432 | bool match_set(); |
433 | bool match_jump(); |
434 | bool match_alt(); |
435 | bool match_rep(); |
436 | bool match_combining(); |
437 | bool match_soft_buffer_end(); |
438 | bool match_restart_continue(); |
439 | bool match_long_set_repeat(); |
440 | bool match_set_repeat(); |
441 | bool match_char_repeat(); |
442 | bool match_dot_repeat_fast(); |
443 | bool match_dot_repeat_slow(); |
444 | bool match_dot_repeat_dispatch() |
445 | { |
446 | return ::boost::is_random_access_iterator<BidiIterator>::value ? match_dot_repeat_fast() : match_dot_repeat_slow(); |
447 | } |
448 | bool match_backstep(); |
449 | bool match_assert_backref(); |
450 | bool match_toggle_case(); |
451 | #ifdef BOOST_REGEX_RECURSIVE |
452 | bool backtrack_till_match(std::size_t count); |
453 | #endif |
454 | bool match_recursion(); |
455 | bool match_fail(); |
456 | bool match_accept(); |
457 | bool match_commit(); |
458 | bool match_then(); |
459 | bool skip_until_paren(int index, bool match = true); |
460 | |
461 | // find procs stored in s_find_vtable: |
462 | bool find_restart_any(); |
463 | bool find_restart_word(); |
464 | bool find_restart_line(); |
465 | bool find_restart_buf(); |
466 | bool find_restart_lit(); |
467 | |
468 | private: |
469 | // final result structure to be filled in: |
470 | match_results<BidiIterator, Allocator>& m_result; |
471 | // temporary result for POSIX matches: |
472 | scoped_ptr<match_results<BidiIterator, Allocator> > m_temp_match; |
473 | // pointer to actual result structure to fill in: |
474 | match_results<BidiIterator, Allocator>* m_presult; |
475 | // start of sequence being searched: |
476 | BidiIterator base; |
477 | // end of sequence being searched: |
478 | BidiIterator last; |
479 | // current character being examined: |
480 | BidiIterator position; |
481 | // where to restart next search after failed match attempt: |
482 | BidiIterator restart; |
483 | // where the current search started from, acts as base for $` during grep: |
484 | BidiIterator search_base; |
485 | // how far we can go back when matching lookbehind: |
486 | BidiIterator backstop; |
487 | // the expression being examined: |
488 | const basic_regex<char_type, traits>& re; |
489 | // the expression's traits class: |
490 | const ::boost::regex_traits_wrapper<traits>& traits_inst; |
491 | // the next state in the machine being matched: |
492 | const re_syntax_base* pstate; |
493 | // matching flags in use: |
494 | match_flag_type m_match_flags; |
495 | // how many states we have examined so far: |
496 | std::ptrdiff_t state_count; |
497 | // max number of states to examine before giving up: |
498 | std::ptrdiff_t max_state_count; |
499 | // whether we should ignore case or not: |
500 | bool icase; |
501 | // set to true when (position == last), indicates that we may have a partial match: |
502 | bool m_has_partial_match; |
503 | // set to true whenever we get a match: |
504 | bool m_has_found_match; |
505 | // set to true whenever we're inside an independent sub-expression: |
506 | bool m_independent; |
507 | // the current repeat being examined: |
508 | repeater_count<BidiIterator>* next_count; |
509 | // the first repeat being examined (top of linked list): |
510 | repeater_count<BidiIterator> rep_obj; |
511 | // the mask to pass when matching word boundaries: |
512 | typename traits::char_class_type m_word_mask; |
513 | // the bitmask to use when determining whether a match_any matches a newline or not: |
514 | unsigned char match_any_mask; |
515 | // recursion information: |
516 | std::vector<recursion_info<results_type> > recursion_stack; |
517 | #ifdef BOOST_REGEX_RECURSIVE |
518 | // Set to false by a (*COMMIT): |
519 | bool m_can_backtrack; |
520 | bool m_have_accept; |
521 | bool m_have_then; |
522 | #endif |
523 | #ifdef BOOST_REGEX_NON_RECURSIVE |
524 | // |
525 | // additional members for non-recursive version: |
526 | // |
527 | typedef bool (self_type::*unwind_proc_type)(bool); |
528 | |
529 | void extend_stack(); |
530 | bool unwind(bool); |
531 | bool unwind_end(bool); |
532 | bool unwind_paren(bool); |
533 | bool unwind_recursion_stopper(bool); |
534 | bool unwind_assertion(bool); |
535 | bool unwind_alt(bool); |
536 | bool unwind_repeater_counter(bool); |
537 | bool (bool); |
538 | bool unwind_greedy_single_repeat(bool); |
539 | bool unwind_slow_dot_repeat(bool); |
540 | bool unwind_fast_dot_repeat(bool); |
541 | bool unwind_char_repeat(bool); |
542 | bool unwind_short_set_repeat(bool); |
543 | bool unwind_long_set_repeat(bool); |
544 | bool unwind_non_greedy_repeat(bool); |
545 | bool unwind_recursion(bool); |
546 | bool unwind_recursion_pop(bool); |
547 | bool unwind_commit(bool); |
548 | bool unwind_then(bool); |
549 | bool unwind_case(bool); |
550 | void destroy_single_repeat(); |
551 | void push_matched_paren(int index, const sub_match<BidiIterator>& sub); |
552 | void push_recursion_stopper(); |
553 | void push_assertion(const re_syntax_base* ps, bool positive); |
554 | void push_alt(const re_syntax_base* ps); |
555 | void push_repeater_count(int i, repeater_count<BidiIterator>** s); |
556 | void push_single_repeat(std::size_t c, const re_repeat* r, BidiIterator last_position, int state_id); |
557 | void push_non_greedy_repeat(const re_syntax_base* ps); |
558 | void push_recursion(int idx, const re_syntax_base* p, results_type* presults, results_type* presults2); |
559 | void push_recursion_pop(); |
560 | void push_case_change(bool); |
561 | |
562 | // pointer to base of stack: |
563 | saved_state* m_stack_base; |
564 | // pointer to current stack position: |
565 | saved_state* m_backup_state; |
566 | // how many memory blocks have we used up?: |
567 | unsigned used_block_count; |
568 | // determines what value to return when unwinding from recursion, |
569 | // allows for mixed recursive/non-recursive algorithm: |
570 | bool m_recursive_result; |
571 | // We have unwound to a lookahead/lookbehind, used by COMMIT/PRUNE/SKIP: |
572 | bool m_unwound_lookahead; |
573 | // We have unwound to an alternative, used by THEN: |
574 | bool m_unwound_alt; |
575 | // We are unwinding a commit - used by independent subs to determine whether to stop there or carry on unwinding: |
576 | //bool m_unwind_commit; |
577 | // Recursion limit: |
578 | unsigned m_recursions; |
579 | #endif |
580 | |
581 | // these operations aren't allowed, so are declared private, |
582 | // bodies are provided to keep explicit-instantiation requests happy: |
583 | perl_matcher& operator=(const perl_matcher&) |
584 | { |
585 | return *this; |
586 | } |
587 | perl_matcher(const perl_matcher& that) |
588 | : m_result(that.m_result), re(that.re), traits_inst(that.traits_inst), rep_obj(0) {} |
589 | }; |
590 | |
591 | #ifdef BOOST_MSVC |
592 | #pragma warning(pop) |
593 | #endif |
594 | |
595 | } // namespace BOOST_REGEX_DETAIL_NS |
596 | |
597 | #ifdef BOOST_MSVC |
598 | #pragma warning(push) |
599 | #pragma warning(disable: 4103) |
600 | #endif |
601 | #ifdef BOOST_HAS_ABI_HEADERS |
602 | # include BOOST_ABI_SUFFIX |
603 | #endif |
604 | #ifdef BOOST_MSVC |
605 | #pragma warning(pop) |
606 | #endif |
607 | |
608 | } // namespace boost |
609 | |
610 | #ifdef BOOST_MSVC |
611 | # pragma warning(pop) |
612 | #endif |
613 | |
614 | // |
615 | // include the implementation of perl_matcher: |
616 | // |
617 | #ifdef BOOST_REGEX_RECURSIVE |
618 | #include <boost/regex/v4/perl_matcher_recursive.hpp> |
619 | #else |
620 | #include <boost/regex/v4/perl_matcher_non_recursive.hpp> |
621 | #endif |
622 | // this one has to be last: |
623 | #include <boost/regex/v4/perl_matcher_common.hpp> |
624 | |
625 | #endif |
626 | |
627 | |