states.hpp source code [include/boost/regex/v4/states.hpp]

1	/*
2	*
3	* Copyright (c) 1998-2002
4	* John Maddock
5	*
6	* Use, modification and distribution are subject to the
7	* Boost Software License, Version 1.0. (See accompanying file
8	* LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9	*
10	*/
11
12	/*
13	* LOCATION: see http://www.boost.org for most recent version.
14	* FILE states.cpp
15	* VERSION see <boost/version.hpp>
16	* DESCRIPTION: Declares internal state machine structures.
17	*/
18
19	#ifndef BOOST_REGEX_V4_STATES_HPP
20	#define BOOST_REGEX_V4_STATES_HPP
21
22	#ifdef BOOST_MSVC
23	#pragma warning(push)
24	#pragma warning(disable: 4103)
25	#endif
26	#ifdef BOOST_HAS_ABI_HEADERS
27	# include BOOST_ABI_PREFIX
28	#endif
29	#ifdef BOOST_MSVC
30	#pragma warning(pop)
31	#endif
32
33	namespace boost{
34	namespace BOOST_REGEX_DETAIL_NS{
35
36	/* mask_type *****************************************************
37	Whenever we have a choice of two alternatives, we use an array of bytes
38	to indicate which of the two alternatives it is possible to take for any
39	given input character. If mask_take is set, then we can take the next
40	state, and if mask_skip is set then we can take the alternative.
41	***********************************************************************/
42	enum mask_type
43	{
44	mask_take = `1`,
45	mask_skip = `2`,
46	mask_init = `4`,
47	mask_any = mask_skip \| mask_take,
48	mask_all = mask_any
49	};
50
51	/* helpers ********************************************************
52	These helpers let us use function overload resolution to detect whether
53	we have narrow or wide character strings:
54	***********************************************************************/
55	struct _narrow_type{};
56	struct _wide_type{};
57	template <class charT> struct is_byte;
58	template<> struct is_byte<char> { typedef _narrow_type width_type; };
59	template<> struct is_byte<unsigned char>{ typedef _narrow_type width_type; };
60	template<> struct is_byte<signed char> { typedef _narrow_type width_type; };
61	template <class charT> struct is_byte { typedef _wide_type width_type; };
62
63	/* enum syntax_element_type ****************************************
64	Every record in the state machine falls into one of the following types:
65	***********************************************************************/
66	enum syntax_element_type
67	{
68	// start of a marked sub-expression, or perl-style (?...) extension
69	syntax_element_startmark = `0`,
70	// end of a marked sub-expression, or perl-style (?...) extension
71	syntax_element_endmark = syntax_element_startmark + `1`,
72	// any sequence of literal characters
73	syntax_element_literal = syntax_element_endmark + `1`,
74	// start of line assertion: ^
75	syntax_element_start_line = syntax_element_literal + `1`,
76	// end of line assertion $
77	syntax_element_end_line = syntax_element_start_line + `1`,
78	// match any character: .
79	syntax_element_wild = syntax_element_end_line + `1`,
80	// end of expression: we have a match when we get here
81	syntax_element_match = syntax_element_wild + `1`,
82	// perl style word boundary: \b
83	syntax_element_word_boundary = syntax_element_match + `1`,
84	// perl style within word boundary: \B
85	syntax_element_within_word = syntax_element_word_boundary + `1`,
86	// start of word assertion: \<
87	syntax_element_word_start = syntax_element_within_word + `1`,
88	// end of word assertion: \>
89	syntax_element_word_end = syntax_element_word_start + `1`,
90	// start of buffer assertion: \`
91	syntax_element_buffer_start = syntax_element_word_end + `1`,
92	// end of buffer assertion: \'
93	syntax_element_buffer_end = syntax_element_buffer_start + `1`,
94	// backreference to previously matched sub-expression
95	syntax_element_backref = syntax_element_buffer_end + `1`,
96	// either a wide character set [..] or one with multicharacter collating elements:
97	syntax_element_long_set = syntax_element_backref + `1`,
98	// narrow character set: [...]
99	syntax_element_set = syntax_element_long_set + `1`,
100	// jump to a new state in the machine:
101	syntax_element_jump = syntax_element_set + `1`,
102	// choose between two production states:
103	syntax_element_alt = syntax_element_jump + `1`,
104	// a repeat
105	syntax_element_rep = syntax_element_alt + `1`,
106	// match a combining character sequence
107	syntax_element_combining = syntax_element_rep + `1`,
108	// perl style soft buffer end: \z
109	syntax_element_soft_buffer_end = syntax_element_combining + `1`,
110	// perl style continuation: \G
111	syntax_element_restart_continue = syntax_element_soft_buffer_end + `1`,
112	// single character repeats:
113	syntax_element_dot_rep = syntax_element_restart_continue + `1`,
114	syntax_element_char_rep = syntax_element_dot_rep + `1`,
115	syntax_element_short_set_rep = syntax_element_char_rep + `1`,
116	syntax_element_long_set_rep = syntax_element_short_set_rep + `1`,
117	// a backstep for lookbehind repeats:
118	syntax_element_backstep = syntax_element_long_set_rep + `1`,
119	// an assertion that a mark was matched:
120	syntax_element_assert_backref = syntax_element_backstep + `1`,
121	syntax_element_toggle_case = syntax_element_assert_backref + `1`,
122	// a recursive expression:
123	syntax_element_recurse = syntax_element_toggle_case + `1`,
124	// Verbs:
125	syntax_element_fail = syntax_element_recurse + `1`,
126	syntax_element_accept = syntax_element_fail + `1`,
127	syntax_element_commit = syntax_element_accept + `1`,
128	syntax_element_then = syntax_element_commit + `1`
129	};
130
131	#ifdef BOOST_REGEX_DEBUG
132	// dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion
133	std::ostream& operator<<(std::ostream&, syntax_element_type);
134	#endif
135
136	struct re_syntax_base;
137
138	/* union offset_type **********************************************
139	Points to another state in the machine. During machine construction
140	we use integral offsets, but these are converted to pointers before
141	execution of the machine.
142	***********************************************************************/
143	union offset_type
144	{
145	re_syntax_base* p;
146	std::ptrdiff_t i;
147	};
148
149	/* struct re_syntax_base ******************************************
150	Base class for all states in the machine.
151	***********************************************************************/
152	struct re_syntax_base
153	{
154	syntax_element_type type; // what kind of state this is
155	offset_type next; // next state in the machine
156	};
157
158	/* struct re_brace ************************************************
159	A marked parenthesis.
160	***********************************************************************/
161	struct re_brace : public re_syntax_base
162	{
163	// The index to match, can be zero (don't mark the sub-expression)
164	// or negative (for perl style (?...) extentions):
165	int index;
166	bool icase;
167	};
168
169	/* struct re_dot ************************************************
170	Match anything.
171	***********************************************************************/
172	enum
173	{
174	dont_care = `1`,
175	force_not_newline = `0`,
176	force_newline = `2`,
177
178	test_not_newline = `2`,
179	test_newline = `3`
180	};
181	struct re_dot : public re_syntax_base
182	{
183	unsigned char mask;
184	};
185
186	/* struct re_literal **********************************************
187	A string of literals, following this structure will be an
188	array of characters: charT[length]
189	***********************************************************************/
190	struct re_literal : public re_syntax_base
191	{
192	unsigned int length;
193	};
194
195	/* struct re_case **********************************************
196	Indicates whether we are moving to a case insensive block or not
197	***********************************************************************/
198	struct re_case : public re_syntax_base
199	{
200	bool icase;
201	};
202
203	/* struct re_set_long *********************************************
204	A wide character set of characters, following this structure will be
205	an array of type charT:
206	First csingles null-terminated strings
207	Then 2 cranges NULL terminated strings*
208	Then cequivalents NULL terminated strings
209	***********************************************************************/
210	template <class mask_type>
211	struct re_set_long : public re_syntax_base
212	{
213	unsigned int csingles, cranges, cequivalents;
214	mask_type cclasses;
215	mask_type cnclasses;
216	bool isnot;
217	bool singleton;
218	};
219
220	/* struct re_set **************************************************
221	A set of narrow-characters, matches any of _map which is none-zero
222	***********************************************************************/
223	struct re_set : public re_syntax_base
224	{
225	unsigned char _map[`1` << CHAR_BIT];
226	};
227
228	/* struct re_jump *************************************************
229	Jump to a new location in the machine (not next).
230	***********************************************************************/
231	struct re_jump : public re_syntax_base
232	{
233	offset_type alt; // location to jump to
234	};
235
236	/* struct re_alt *************************************************
237	Jump to a new location in the machine (possibly next).
238	***********************************************************************/
239	struct re_alt : public re_jump
240	{
241	unsigned char _map[`1` << CHAR_BIT]; // which characters can take the jump
242	unsigned int can_be_null; // true if we match a NULL string
243	};
244
245	/* struct re_repeat ***********************************************
246	Repeat a section of the machine
247	***********************************************************************/
248	struct re_repeat : public re_alt
249	{
250	std::size_t min, max; // min and max allowable repeats
251	int state_id; // Unique identifier for this repeat
252	bool leading; // True if this repeat is at the start of the machine (lets us optimize some searches)
253	bool greedy; // True if this is a greedy repeat
254	};
255
256	/* struct re_recurse **********************************************
257	Recurse to a particular subexpression.
258	**********************************************************************/
259	struct re_recurse : public re_jump
260	{
261	int state_id; // identifier of first nested repeat within the recursion.
262	};
263
264	/* struct re_commit ***********************************************
265	Used for the PRUNE, SKIP and COMMIT verbs which basically differ only in what happens
266	if no match is found and we start searching forward.
267	**********************************************************************/
268	enum commit_type
269	{
270	commit_prune,
271	commit_skip,
272	commit_commit
273	};
274	struct re_commit : public re_syntax_base
275	{
276	commit_type action;
277	};
278
279	/* enum re_jump_size_type *****************************************
280	Provides compiled size of re_jump structure (allowing for trailing alignment).
281	We provide this so we know how manybytes to insert when constructing the machine
282	(The value of padding_mask is defined in regex_raw_buffer.hpp).
283	***********************************************************************/
284	enum re_jump_size_type
285	{
286	re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask),
287	re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask),
288	re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask)
289	};
290
291	/* proc re_is_set_member *******************************************
292	Forward declaration: we'll need this one later...
293	***********************************************************************/
294
295	template<class charT, class traits>
296	struct regex_data;
297
298	template <class iterator, class charT, class traits_type, class char_classT>
299	iterator BOOST_REGEX_CALL re_is_set_member(iterator next,
300	iterator last,
301	const re_set_long<char_classT>* set_,
302	const regex_data<charT, traits_type>& e, bool icase);
303
304	} // namespace BOOST_REGEX_DETAIL_NS
305
306	} // namespace boost
307
308	#ifdef BOOST_MSVC
309	#pragma warning(push)
310	#pragma warning(disable: 4103)
311	#endif
312	#ifdef BOOST_HAS_ABI_HEADERS
313	# include BOOST_ABI_SUFFIX
314	#endif
315	#ifdef BOOST_MSVC
316	#pragma warning(pop)
317	#endif
318
319	#endif
320
321
322

Browse the source code of include/boost/regex/v4/states.hpp