regex_compiler.tcc source code [include/c++/8/bits/regex_compiler.tcc]

1	// class template regex -- C++ --
2
3	// Copyright (C) 2013-2018 Free Software Foundation, Inc.
4	//
5	// This file is part of the GNU ISO C++ Library. This library is free
6	// software; you can redistribute it and/or modify it under the
7	// terms of the GNU General Public License as published by the
8	// Free Software Foundation; either version 3, or (at your option)
9	// any later version.
10
11	// This library is distributed in the hope that it will be useful,
12	// but WITHOUT ANY WARRANTY; without even the implied warranty of
13	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	// GNU General Public License for more details.
15
16	// Under Section 7 of GPL version 3, you are granted additional
17	// permissions described in the GCC Runtime Library Exception, version
18	// 3.1, as published by the Free Software Foundation.
19
20	// You should have received a copy of the GNU General Public License and
21	// a copy of the GCC Runtime Library Exception along with this program;
22	// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23	// <http://www.gnu.org/licenses/>.
24
25	/**
26	* @file bits/regex_compiler.tcc
27	* This is an internal header file, included by other library headers.
28	* Do not attempt to use it directly. @headername{regex}
29	*/
30
31	// FIXME make comments doxygen format.
32
33	/*
34	// This compiler refers to "Regular Expression Matching Can Be Simple And Fast"
35	// (http://swtch.com/~rsc/regexp/regexp1.html),
36	// but doesn't strictly follow it.
37	//
38	// When compiling, states are chained* instead of tree- or graph-constructed.*
39	// It's more like structured programs: there's if statement and loop statement.
40	//
41	// For alternative structure (say "a\|b"), aka "if statement", two branches
42	// should be constructed. However, these two shall merge to an "end_tag" at
43	// the end of this operator:
44	//
45	// branch1
46	// / \
47	// => begin_tag end_tag =>
48	// \ /
49	// branch2
50	//
51	// This is the difference between this implementation and that in Russ's
52	// article.
53	//
54	// That's why we introduced dummy node here ------ "end_tag" is a dummy node.
55	// All dummy nodes will be eliminated at the end of compilation.
56	*/
57
58	namespace std _GLIBCXX_VISIBILITY(default)
59	{
60	_GLIBCXX_BEGIN_NAMESPACE_VERSION
61
62	namespace __detail
63	{
64	template<typename _TraitsT>
65	_Compiler<_TraitsT>::
66	_Compiler(_IterT __b, _IterT __e,
67	const typename _TraitsT::locale_type& __loc, _FlagT __flags)
68	: _M_flags((__flags
69	& (regex_constants::ECMAScript
70	\| regex_constants::basic
71	\| regex_constants::extended
72	\| regex_constants::grep
73	\| regex_constants::egrep
74	\| regex_constants::awk))
75	? __flags
76	: __flags \| regex_constants::ECMAScript),
77	_M_scanner(__b, __e, _M_flags, __loc),
78	_M_nfa(make_shared<_RegexT>(__loc, _M_flags)),
79	_M_traits(_M_nfa->_M_traits),
80	_M_ctype(std::use_facet<_CtypeT>(__loc))
81	{
82	_StateSeqT __r(*_M_nfa, _M_nfa->_M_start());
83	__r._M_append(_M_nfa->_M_insert_subexpr_begin());
84	this->_M_disjunction();
85	if (!_M_match_token(_ScannerT::_S_token_eof))
86	__throw_regex_error(regex_constants::error_paren);
87	__r._M_append(_M_pop());
88	__glibcxx_assert(_M_stack.empty());
89	__r._M_append(_M_nfa->_M_insert_subexpr_end());
90	__r._M_append(_M_nfa->_M_insert_accept());
91	_M_nfa->_M_eliminate_dummy();
92	}
93
94	template<typename _TraitsT>
95	void
96	_Compiler<_TraitsT>::
97	_M_disjunction()
98	{
99	this->_M_alternative();
100	while (_M_match_token(_ScannerT::_S_token_or))
101	{
102	_StateSeqT __alt1 = _M_pop();
103	this->_M_alternative();
104	_StateSeqT __alt2 = _M_pop();
105	auto __end = _M_nfa->_M_insert_dummy();
106	__alt1._M_append(__end);
107	__alt2._M_append(__end);
108	// __alt2 is state._M_next, __alt1 is state._M_alt. The executor
109	// executes _M_alt before _M_next, as well as executing left
110	// alternative before right one.
111	_M_stack.push(_StateSeqT(*_M_nfa,
112	_M_nfa->_M_insert_alt(
113	__alt2._M_start, __alt1._M_start, false),
114	__end));
115	}
116	}
117
118	template<typename _TraitsT>
119	void
120	_Compiler<_TraitsT>::
121	_M_alternative()
122	{
123	if (this->_M_term())
124	{
125	_StateSeqT __re = _M_pop();
126	this->_M_alternative();
127	__re._M_append(_M_pop());
128	_M_stack.push(__re);
129	}
130	else
131	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_dummy()));
132	}
133
134	template<typename _TraitsT>
135	bool
136	_Compiler<_TraitsT>::
137	_M_term()
138	{
139	if (this->_M_assertion())
140	return true;
141	if (this->_M_atom())
142	{
143	while (this->_M_quantifier());
144	return true;
145	}
146	return false;
147	}
148
149	template<typename _TraitsT>
150	bool
151	_Compiler<_TraitsT>::
152	_M_assertion()
153	{
154	if (_M_match_token(_ScannerT::_S_token_line_begin))
155	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_begin()));
156	else if (_M_match_token(_ScannerT::_S_token_line_end))
157	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_end()));
158	else if (_M_match_token(_ScannerT::_S_token_word_bound))
159	// _M_value[0] == 'n' means it's negative, say "not word boundary".
160	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
161	_M_insert_word_bound(_M_value[`0`] == `'n'`)));
162	else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin))
163	{
164	auto __neg = _M_value[`0`] == `'n'`;
165	this->_M_disjunction();
166	if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
167	__throw_regex_error(regex_constants::error_paren,
168	"Parenthesis is not closed.");
169	auto __tmp = _M_pop();
170	__tmp._M_append(_M_nfa->_M_insert_accept());
171	_M_stack.push(
172	_StateSeqT(
173	*_M_nfa,
174	_M_nfa->_M_insert_lookahead(__tmp._M_start, __neg)));
175	}
176	else
177	return false;
178	return true;
179	}
180
181	template<typename _TraitsT>
182	bool
183	_Compiler<_TraitsT>::
184	_M_quantifier()
185	{
186	bool __neg = (_M_flags & regex_constants::ECMAScript);
187	auto __init = [this, &__neg]()
188	{
189	if (_M_stack.empty())
190	__throw_regex_error(regex_constants::error_badrepeat,
191	"Nothing to repeat before a quantifier.");
192	__neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
193	};
194	if (_M_match_token(_ScannerT::_S_token_closure0))
195	{
196	__init();
197	auto __e = _M_pop();
198	_StateSeqT __r(*_M_nfa,
199	_M_nfa->_M_insert_repeat(_S_invalid_state_id,
200	__e._M_start, __neg));
201	__e._M_append(__r);
202	_M_stack.push(__r);
203	}
204	else if (_M_match_token(_ScannerT::_S_token_closure1))
205	{
206	__init();
207	auto __e = _M_pop();
208	__e._M_append(_M_nfa->_M_insert_repeat(_S_invalid_state_id,
209	__e._M_start, __neg));
210	_M_stack.push(__e);
211	}
212	else if (_M_match_token(_ScannerT::_S_token_opt))
213	{
214	__init();
215	auto __e = _M_pop();
216	auto __end = _M_nfa->_M_insert_dummy();
217	_StateSeqT __r(*_M_nfa,
218	_M_nfa->_M_insert_repeat(_S_invalid_state_id,
219	__e._M_start, __neg));
220	__e._M_append(__end);
221	__r._M_append(__end);
222	_M_stack.push(__r);
223	}
224	else if (_M_match_token(_ScannerT::_S_token_interval_begin))
225	{
226	if (_M_stack.empty())
227	__throw_regex_error(regex_constants::error_badrepeat,
228	"Nothing to repeat before a quantifier.");
229	if (!_M_match_token(_ScannerT::_S_token_dup_count))
230	__throw_regex_error(regex_constants::error_badbrace,
231	"Unexpected token in brace expression.");
232	_StateSeqT __r(_M_pop());
233	_StateSeqT __e(*_M_nfa, _M_nfa->_M_insert_dummy());
234	long __min_rep = _M_cur_int_value(`10`);
235	bool __infi = false;
236	long __n;
237
238	// {3
239	if (_M_match_token(_ScannerT::_S_token_comma))
240	if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7}
241	__n = _M_cur_int_value(`10`) - __min_rep;
242	else
243	__infi = true;
244	else
245	__n = `0`;
246	if (!_M_match_token(_ScannerT::_S_token_interval_end))
247	__throw_regex_error(regex_constants::error_brace,
248	"Unexpected end of brace expression.");
249
250	__neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
251
252	for (long __i = `0`; __i < __min_rep; ++__i)
253	__e._M_append(__r._M_clone());
254
255	if (__infi)
256	{
257	auto __tmp = __r._M_clone();
258	_StateSeqT __s(*_M_nfa,
259	_M_nfa->_M_insert_repeat(_S_invalid_state_id,
260	__tmp._M_start, __neg));
261	__tmp._M_append(__s);
262	__e._M_append(__s);
263	}
264	else
265	{
266	if (__n < `0`)
267	__throw_regex_error(regex_constants::error_badbrace,
268	"Invalid range in brace expression.");
269	auto __end = _M_nfa->_M_insert_dummy();
270	// _M_alt is the "match more" branch, and _M_next is the
271	// "match less" one. Switch _M_alt and _M_next of all created
272	// nodes. This is a hack but IMO works well.
273	std::stack<_StateIdT> __stack;
274	for (long __i = `0`; __i < __n; ++__i)
275	{
276	auto __tmp = __r._M_clone();
277	auto __alt = _M_nfa->_M_insert_repeat(__tmp._M_start,
278	__end, __neg);
279	__stack.push(__alt);
280	__e._M_append(_StateSeqT(*_M_nfa, __alt, __tmp._M_end));
281	}
282	__e._M_append(__end);
283	while (!__stack.empty())
284	{
285	auto& __tmp = (*_M_nfa)[__stack.top()];
286	__stack.pop();
287	std::swap(__tmp._M_next, __tmp._M_alt);
288	}
289	}
290	_M_stack.push(__e);
291	}
292	else
293	return false;
294	return true;
295	}
296
297	#define __INSERT_REGEX_MATCHER(__func, ...)\
298	do {\
299	if (!(_M_flags & regex_constants::icase))\
300	if (!(_M_flags & regex_constants::collate))\
301	__func<false, false>(__VA_ARGS__);\
302	else\
303	__func<false, true>(__VA_ARGS__);\
304	else\
305	if (!(_M_flags & regex_constants::collate))\
306	__func<true, false>(__VA_ARGS__);\
307	else\
308	__func<true, true>(__VA_ARGS__);\
309	} while (false)
310
311	template<typename _TraitsT>
312	bool
313	_Compiler<_TraitsT>::
314	_M_atom()
315	{
316	if (_M_match_token(_ScannerT::_S_token_anychar))
317	{
318	if (!(_M_flags & regex_constants::ECMAScript))
319	__INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix);
320	else
321	__INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma);
322	}
323	else if (_M_try_char())
324	__INSERT_REGEX_MATCHER(_M_insert_char_matcher);
325	else if (_M_match_token(_ScannerT::_S_token_backref))
326	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
327	_M_insert_backref(_M_cur_int_value(`10`))));
328	else if (_M_match_token(_ScannerT::_S_token_quoted_class))
329	__INSERT_REGEX_MATCHER(_M_insert_character_class_matcher);
330	else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin))
331	{
332	_StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_dummy());
333	this->_M_disjunction();
334	if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
335	__throw_regex_error(regex_constants::error_paren,
336	"Parenthesis is not closed.");
337	__r._M_append(_M_pop());
338	_M_stack.push(__r);
339	}
340	else if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
341	{
342	_StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_subexpr_begin());
343	this->_M_disjunction();
344	if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
345	__throw_regex_error(regex_constants::error_paren,
346	"Parenthesis is not closed.");
347	__r._M_append(_M_pop());
348	__r._M_append(_M_nfa->_M_insert_subexpr_end());
349	_M_stack.push(__r);
350	}
351	else if (!_M_bracket_expression())
352	return false;
353	return true;
354	}
355
356	template<typename _TraitsT>
357	bool
358	_Compiler<_TraitsT>::
359	_M_bracket_expression()
360	{
361	bool __neg =
362	_M_match_token(_ScannerT::_S_token_bracket_neg_begin);
363	if (!(__neg \|\| _M_match_token(_ScannerT::_S_token_bracket_begin)))
364	return false;
365	__INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg);
366	return true;
367	}
368	#undef __INSERT_REGEX_MATCHER
369
370	template<typename _TraitsT>
371	template<bool __icase, bool __collate>
372	void
373	_Compiler<_TraitsT>::
374	_M_insert_any_matcher_ecma()
375	{
376	_M_stack.push(_StateSeqT(*_M_nfa,
377	_M_nfa->_M_insert_matcher
378	(_AnyMatcher<_TraitsT, true, __icase, __collate>
379	(_M_traits))));
380	}
381
382	template<typename _TraitsT>
383	template<bool __icase, bool __collate>
384	void
385	_Compiler<_TraitsT>::
386	_M_insert_any_matcher_posix()
387	{
388	_M_stack.push(_StateSeqT(*_M_nfa,
389	_M_nfa->_M_insert_matcher
390	(_AnyMatcher<_TraitsT, false, __icase, __collate>
391	(_M_traits))));
392	}
393
394	template<typename _TraitsT>
395	template<bool __icase, bool __collate>
396	void
397	_Compiler<_TraitsT>::
398	_M_insert_char_matcher()
399	{
400	_M_stack.push(_StateSeqT(*_M_nfa,
401	_M_nfa->_M_insert_matcher
402	(_CharMatcher<_TraitsT, __icase, __collate>
403	(_M_value[`0`], _M_traits))));
404	}
405
406	template<typename _TraitsT>
407	template<bool __icase, bool __collate>
408	void
409	_Compiler<_TraitsT>::
410	_M_insert_character_class_matcher()
411	{
412	__glibcxx_assert(_M_value.size() == `1`);
413	_BracketMatcher<_TraitsT, __icase, __collate> __matcher
414	(_M_ctype.is(_CtypeT::upper, _M_value[`0`]), _M_traits);
415	__matcher._M_add_character_class(_M_value, false);
416	__matcher._M_ready();
417	_M_stack.push(_StateSeqT(*_M_nfa,
418	_M_nfa->_M_insert_matcher(std::move(__matcher))));
419	}
420
421	template<typename _TraitsT>
422	template<bool __icase, bool __collate>
423	void
424	_Compiler<_TraitsT>::
425	_M_insert_bracket_matcher(bool __neg)
426	{
427	_BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits);
428	pair<bool, _CharT> __last_char; // Optional<_CharT>
429	__last_char.first = false;
430	if (!(_M_flags & regex_constants::ECMAScript))
431	{
432	if (_M_try_char())
433	{
434	__last_char.first = true;
435	__last_char.second = _M_value[`0`];
436	}
437	else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
438	{
439	__last_char.first = true;
440	__last_char.second = `'-'`;
441	}
442	}
443	while (_M_expression_term(__last_char, __matcher));
444	if (__last_char.first)
445	__matcher._M_add_char(__last_char.second);
446	__matcher._M_ready();
447	_M_stack.push(_StateSeqT(
448	*_M_nfa,
449	_M_nfa->_M_insert_matcher(std::move(__matcher))));
450	}
451
452	template<typename _TraitsT>
453	template<bool __icase, bool __collate>
454	bool
455	_Compiler<_TraitsT>::
456	_M_expression_term(pair<bool, _CharT>& __last_char,
457	_BracketMatcher<_TraitsT, __icase, __collate>& __matcher)
458	{
459	if (_M_match_token(_ScannerT::_S_token_bracket_end))
460	return false;
461
462	const auto __push_char = [&](_CharT __ch)
463	{
464	if (__last_char.first)
465	__matcher._M_add_char(__last_char.second);
466	else
467	__last_char.first = true;
468	__last_char.second = __ch;
469	};
470	const auto __flush = [&]
471	{
472	if (__last_char.first)
473	{
474	__matcher._M_add_char(__last_char.second);
475	__last_char.first = false;
476	}
477	};
478
479	if (_M_match_token(_ScannerT::_S_token_collsymbol))
480	{
481	auto __symbol = __matcher._M_add_collate_element(_M_value);
482	if (__symbol.size() == `1`)
483	__push_char(__symbol[`0`]);
484	else
485	__flush();
486	}
487	else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
488	{
489	__flush();
490	__matcher._M_add_equivalence_class(_M_value);
491	}
492	else if (_M_match_token(_ScannerT::_S_token_char_class_name))
493	{
494	__flush();
495	__matcher._M_add_character_class(_M_value, false);
496	}
497	else if (_M_try_char())
498	__push_char(_M_value[`0`]);
499	// POSIX doesn't allow '-' as a start-range char (say [a-z--0]),
500	// except when the '-' is the first or last character in the bracket
501	// expression ([--0]). ECMAScript treats all '-' after a range as a
502	// normal character. Also see above, where _M_expression_term gets called.
503	//
504	// As a result, POSIX rejects [-----], but ECMAScript doesn't.
505	// Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax.
506	// Clang (3.5) always uses ECMAScript style even in its POSIX syntax.
507	//
508	// It turns out that no one reads BNFs ;)
509	else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
510	{
511	if (!__last_char.first)
512	{
513	if (!(_M_flags & regex_constants::ECMAScript))
514	{
515	if (_M_match_token(_ScannerT::_S_token_bracket_end))
516	{
517	__push_char(`'-'`);
518	return false;
519	}
520	__throw_regex_error(
521	regex_constants::error_range,
522	"Unexpected dash in bracket expression. For POSIX syntax, "
523	"a dash is not treated literally only when it is at "
524	"beginning or end.");
525	}
526	__push_char(`'-'`);
527	}
528	else
529	{
530	if (_M_try_char())
531	{
532	__matcher._M_make_range(__last_char.second, _M_value[`0`]);
533	__last_char.first = false;
534	}
535	else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
536	{
537	__matcher._M_make_range(__last_char.second, `'-'`);
538	__last_char.first = false;
539	}
540	else
541	{
542	if (_M_scanner._M_get_token()
543	!= _ScannerT::_S_token_bracket_end)
544	__throw_regex_error(
545	regex_constants::error_range,
546	"Character is expected after a dash.");
547	__push_char(`'-'`);
548	}
549	}
550	}
551	else if (_M_match_token(_ScannerT::_S_token_quoted_class))
552	{
553	__flush();
554	__matcher._M_add_character_class(_M_value,
555	_M_ctype.is(_CtypeT::upper,
556	_M_value[`0`]));
557	}
558	else
559	__throw_regex_error(regex_constants::error_brack,
560	"Unexpected character in bracket expression.");
561
562	return true;
563	}
564
565	template<typename _TraitsT>
566	bool
567	_Compiler<_TraitsT>::
568	_M_try_char()
569	{
570	bool __is_char = false;
571	if (_M_match_token(_ScannerT::_S_token_oct_num))
572	{
573	__is_char = true;
574	_M_value.assign(`1`, _M_cur_int_value(`8`));
575	}
576	else if (_M_match_token(_ScannerT::_S_token_hex_num))
577	{
578	__is_char = true;
579	_M_value.assign(`1`, _M_cur_int_value(`16`));
580	}
581	else if (_M_match_token(_ScannerT::_S_token_ord_char))
582	__is_char = true;
583	return __is_char;
584	}
585
586	template<typename _TraitsT>
587	bool
588	_Compiler<_TraitsT>::
589	_M_match_token(_TokenT token)
590	{
591	if (token == _M_scanner._M_get_token())
592	{
593	_M_value = _M_scanner._M_get_value();
594	_M_scanner._M_advance();
595	return true;
596	}
597	return false;
598	}
599
600	template<typename _TraitsT>
601	int
602	_Compiler<_TraitsT>::
603	_M_cur_int_value(int __radix)
604	{
605	long __v = `0`;
606	for (typename _StringT::size_type __i = `0`;
607	__i < _M_value.length(); ++__i)
608	__v =__v * __radix + _M_traits.value(_M_value[__i], __radix);
609	return __v;
610	}
611
612	template<typename _TraitsT, bool __icase, bool __collate>
613	bool
614	_BracketMatcher<_TraitsT, __icase, __collate>::
615	_M_apply(_CharT __ch, false_type) const
616	{
617	return [this, __ch]
618	{
619	if (std::binary_search(_M_char_set.begin(), _M_char_set.end(),
620	_M_translator._M_translate(__ch)))
621	return true;
622	auto __s = _M_translator._M_transform(__ch);
623	for (auto& __it : _M_range_set)
624	if (_M_translator._M_match_range(__it.first, __it.second, __s))
625	return true;
626	if (_M_traits.isctype(__ch, _M_class_set))
627	return true;
628	if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(),
629	_M_traits.transform_primary(&__ch, &__ch+`1`))
630	!= _M_equiv_set.end())
631	return true;
632	for (auto& __it : _M_neg_class_set)
633	if (!_M_traits.isctype(__ch, __it))
634	return true;
635	return false;
636	}() ^ _M_is_non_matching;
637	}
638	} // namespace __detail
639
640	_GLIBCXX_END_NAMESPACE_VERSION
641	} // namespace
642

Browse the source code of include/c++/8/bits/regex_compiler.tcc