regcomp.c source code [Glibc/posix/regcomp.c]

1	/ Extended regular expression matching and search library.*
2	Copyright (C) 2002-2020 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, see
18	<https://www.gnu.org/licenses/>. /*
19
20	#ifdef _LIBC
21	# include <locale/weight.h>
22	#endif
23
24	static reg_errcode_t re_compile_internal (regex_t preg, const* char * pattern,
25	size_t length, reg_syntax_t syntax);
26	static void re_compile_fastmap_iter (regex_t *bufp,
27	const re_dfastate_t *init_state,
28	char *fastmap);
29	static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30	#ifdef RE_ENABLE_I18N
31	static void free_charset (re_charset_t *cset);
32	#endif /* RE_ENABLE_I18N */
33	static void free_workarea_compile (regex_t *preg);
34	static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35	#ifdef RE_ENABLE_I18N
36	static void optimize_utf8 (re_dfa_t *dfa);
37	#endif
38	static reg_errcode_t analyze (regex_t *preg);
39	static reg_errcode_t preorder (bin_tree_t *root,
40	reg_errcode_t (fn (void , bin_tree_t )),
41	void *extra);
42	static reg_errcode_t postorder (bin_tree_t *root,
43	reg_errcode_t (fn (void , bin_tree_t )),
44	void *extra);
45	static reg_errcode_t optimize_subexps (void extra, bin_tree_t node);
46	static reg_errcode_t lower_subexps (void extra, bin_tree_t node);
47	static bin_tree_t lower_subexp (reg_errcode_t err, regex_t *preg,
48	bin_tree_t *node);
49	static reg_errcode_t calc_first (void extra, bin_tree_t node);
50	static reg_errcode_t calc_next (void extra, bin_tree_t node);
51	static reg_errcode_t link_nfa_nodes (void extra, bin_tree_t node);
52	static Idx duplicate_node (re_dfa_t dfa, Idx org_idx, unsigned* int constraint);
53	static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54	unsigned int constraint);
55	static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56	static reg_errcode_t calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa,
57	Idx node, bool root);
58	static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59	static Idx fetch_number (re_string_t input, re_token_t token,
60	reg_syntax_t syntax);
61	static int peek_token (re_token_t token, re_string_t input,
62	reg_syntax_t syntax);
63	static bin_tree_t parse (re_string_t regexp, regex_t *preg,
64	reg_syntax_t syntax, reg_errcode_t *err);
65	static bin_tree_t parse_reg_exp (re_string_t regexp, regex_t *preg,
66	re_token_t *token, reg_syntax_t syntax,
67	Idx nest, reg_errcode_t *err);
68	static bin_tree_t parse_branch (re_string_t regexp, regex_t *preg,
69	re_token_t *token, reg_syntax_t syntax,
70	Idx nest, reg_errcode_t *err);
71	static bin_tree_t parse_expression (re_string_t regexp, regex_t *preg,
72	re_token_t *token, reg_syntax_t syntax,
73	Idx nest, reg_errcode_t *err);
74	static bin_tree_t parse_sub_exp (re_string_t regexp, regex_t *preg,
75	re_token_t *token, reg_syntax_t syntax,
76	Idx nest, reg_errcode_t *err);
77	static bin_tree_t parse_dup_op (bin_tree_t dup_elem, re_string_t *regexp,
78	re_dfa_t dfa, re_token_t token,
79	reg_syntax_t syntax, reg_errcode_t *err);
80	static bin_tree_t parse_bracket_exp (re_string_t regexp, re_dfa_t *dfa,
81	re_token_t *token, reg_syntax_t syntax,
82	reg_errcode_t *err);
83	static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84	re_string_t *regexp,
85	re_token_t token, int* token_len,
86	re_dfa_t *dfa,
87	reg_syntax_t syntax,
88	bool accept_hyphen);
89	static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90	re_string_t *regexp,
91	re_token_t *token);
92	#ifdef RE_ENABLE_I18N
93	static reg_errcode_t build_equiv_class (bitset_t sbcset,
94	re_charset_t *mbcset,
95	Idx *equiv_class_alloc,
96	const unsigned char *name);
97	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98	bitset_t sbcset,
99	re_charset_t *mbcset,
100	Idx *char_class_alloc,
101	const char *class_name,
102	reg_syntax_t syntax);
103	#else /* not RE_ENABLE_I18N */
104	static reg_errcode_t build_equiv_class (bitset_t sbcset,
105	const unsigned char *name);
106	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107	bitset_t sbcset,
108	const char *class_name,
109	reg_syntax_t syntax);
110	#endif /* not RE_ENABLE_I18N */
111	static bin_tree_t build_charclass_op (re_dfa_t dfa,
112	RE_TRANSLATE_TYPE trans,
113	const char *class_name,
114	const char *extra,
115	bool non_match, reg_errcode_t *err);
116	static bin_tree_t create_tree (re_dfa_t dfa,
117	bin_tree_t left, bin_tree_t right,
118	re_token_type_t type);
119	static bin_tree_t create_token_tree (re_dfa_t dfa,
120	bin_tree_t left, bin_tree_t right,
121	const re_token_t *token);
122	static bin_tree_t duplicate_tree (const* bin_tree_t src, re_dfa_t dfa);
123	static void free_token (re_token_t *node);
124	static reg_errcode_t free_tree (void extra, bin_tree_t node);
125	static reg_errcode_t mark_opt_subexp (void extra, bin_tree_t node);
126
127	/ This table gives an error message for each of the error codes listed*
128	in regex.h. Obviously the order here has to be same as there.
129	POSIX doesn't require that we do anything for REG_NOERROR,
130	but why not be nice? /*
131
132	static const char __re_error_msgid[] =
133	{
134	#define REG_NOERROR_IDX 0
135	gettext_noop ("Success") / REG_NOERROR /
136	"\0"
137	#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138	gettext_noop ("No match") / REG_NOMATCH /
139	"\0"
140	#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141	gettext_noop ("Invalid regular expression") / REG_BADPAT /
142	"\0"
143	#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144	gettext_noop ("Invalid collation character") / REG_ECOLLATE /
145	"\0"
146	#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147	gettext_noop ("Invalid character class name") / REG_ECTYPE /
148	"\0"
149	#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150	gettext_noop ("Trailing backslash") / REG_EESCAPE /
151	"\0"
152	#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153	gettext_noop ("Invalid back reference") / REG_ESUBREG /
154	"\0"
155	#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156	gettext_noop ("Unmatched [, [^, [:, [., or [=") / REG_EBRACK /
157	"\0"
158	#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159	gettext_noop ("Unmatched ( or \\(") / REG_EPAREN /
160	"\0"
161	#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162	gettext_noop ("Unmatched \\{") / REG_EBRACE /
163	"\0"
164	#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165	gettext_noop ("Invalid content of \\{\\}") / REG_BADBR /
166	"\0"
167	#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168	gettext_noop ("Invalid range end") / REG_ERANGE /
169	"\0"
170	#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171	gettext_noop ("Memory exhausted") / REG_ESPACE /
172	"\0"
173	#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174	gettext_noop ("Invalid preceding regular expression") / REG_BADRPT /
175	"\0"
176	#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177	gettext_noop ("Premature end of regular expression") / REG_EEND /
178	"\0"
179	#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180	gettext_noop ("Regular expression too big") / REG_ESIZE /
181	"\0"
182	#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183	gettext_noop ("Unmatched ) or \\)") / REG_ERPAREN /
184	};
185
186	static const size_t __re_error_msgid_idx[] =
187	{
188	REG_NOERROR_IDX,
189	REG_NOMATCH_IDX,
190	REG_BADPAT_IDX,
191	REG_ECOLLATE_IDX,
192	REG_ECTYPE_IDX,
193	REG_EESCAPE_IDX,
194	REG_ESUBREG_IDX,
195	REG_EBRACK_IDX,
196	REG_EPAREN_IDX,
197	REG_EBRACE_IDX,
198	REG_BADBR_IDX,
199	REG_ERANGE_IDX,
200	REG_ESPACE_IDX,
201	REG_BADRPT_IDX,
202	REG_EEND_IDX,
203	REG_ESIZE_IDX,
204	REG_ERPAREN_IDX
205	};
206
207	/ Entry points for GNU code. /
208
209	/ re_compile_pattern is the GNU regular expression compiler: it*
210	compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211	Returns 0 if the pattern was valid, otherwise an error string.
212
213	Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214	are set in BUFP on entry. /*
215
216	const char *
217	re_compile_pattern (const char *pattern, size_t length,
218	struct re_pattern_buffer *bufp)
219	{
220	reg_errcode_t ret;
221
222	/ And GNU code determines whether or not to get register information*
223	by passing null for the REGS argument to re_match, etc., not by
224	setting no_sub, unless RE_NO_SUB is set. /*
225	bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
226
227	/ Match anchors at newline. /
228	bufp->newline_anchor = `1`;
229
230	ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231
232	if (!ret)
233	return NULL;
234	return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235	}
236	weak_alias (__re_compile_pattern, re_compile_pattern)
237
238	/ Set by 're_set_syntax' to the current regexp syntax to recognize. Can*
239	also be assigned to arbitrarily: each pattern buffer stores its own
240	syntax, so it can be changed between regex compilations. /*
241	/ This has no initializer because initialized variables in Emacs*
242	become read-only after dumping. /*
243	reg_syntax_t re_syntax_options;
244
245
246	/ Specify the precise syntax of regexps for compilation. This provides*
247	for compatibility for various utilities which historically have
248	different, incompatible syntaxes.
249
250	The argument SYNTAX is a bit mask comprised of the various bits
251	defined in regex.h. We return the old syntax. /*
252
253	reg_syntax_t
254	re_set_syntax (reg_syntax_t syntax)
255	{
256	reg_syntax_t ret = re_syntax_options;
257
258	re_syntax_options = syntax;
259	return ret;
260	}
261	weak_alias (__re_set_syntax, re_set_syntax)
262
263	int
264	re_compile_fastmap (struct re_pattern_buffer *bufp)
265	{
266	re_dfa_t *dfa = bufp->buffer;
267	char *fastmap = bufp->fastmap;
268
269	memset (fastmap, `'\0'`, sizeof (char) * SBC_MAX);
270	re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271	if (dfa->init_state != dfa->init_state_word)
272	re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273	if (dfa->init_state != dfa->init_state_nl)
274	re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275	if (dfa->init_state != dfa->init_state_begbuf)
276	re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277	bufp->fastmap_accurate = `1`;
278	return `0`;
279	}
280	weak_alias (__re_compile_fastmap, re_compile_fastmap)
281
282	static inline void
283	__attribute__ ((always_inline))
284	re_set_fastmap (char fastmap, bool icase, int* ch)
285	{
286	fastmap[ch] = `1`;
287	if (icase)
288	fastmap[tolower (ch)] = `1`;
289	}
290
291	/ Helper function for re_compile_fastmap.*
292	Compile fastmap for the initial_state INIT_STATE. /*
293
294	static void
295	re_compile_fastmap_iter (regex_t bufp, const* re_dfastate_t *init_state,
296	char *fastmap)
297	{
298	re_dfa_t *dfa = bufp->buffer;
299	Idx node_cnt;
300	bool icase = (dfa->mb_cur_max == `1` && (bufp->syntax & RE_ICASE));
301	for (node_cnt = `0`; node_cnt < init_state->nodes.nelem; ++node_cnt)
302	{
303	Idx node = init_state->nodes.elems[node_cnt];
304	re_token_type_t type = dfa->nodes[node].type;
305
306	if (type == CHARACTER)
307	{
308	re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
309	#ifdef RE_ENABLE_I18N
310	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > `1`)
311	{
312	unsigned char buf[MB_LEN_MAX];
313	unsigned char *p;
314	wchar_t wc;
315	mbstate_t state;
316
317	p = buf;
318	*p++ = dfa->nodes[node].opr.c;
319	while (++node < dfa->nodes_len
320	&& dfa->nodes[node].type == CHARACTER
321	&& dfa->nodes[node].mb_partial)
322	*p++ = dfa->nodes[node].opr.c;
323	memset (&state, `'\0'`, sizeof (state));
324	if (__mbrtowc (&wc, (const char *) buf, p - buf,
325	&state) == p - buf
326	&& (__wcrtomb ((char *) buf, __towlower (wc), &state)
327	!= (size_t) -`1`))
328	re_set_fastmap (fastmap, false, buf[`0`]);
329	}
330	#endif
331	}
332	else if (type == SIMPLE_BRACKET)
333	{
334	int i, ch;
335	for (i = `0`, ch = `0`; i < BITSET_WORDS; ++i)
336	{
337	int j;
338	bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
339	for (j = `0`; j < BITSET_WORD_BITS; ++j, ++ch)
340	if (w & ((bitset_word_t) `1` << j))
341	re_set_fastmap (fastmap, icase, ch);
342	}
343	}
344	#ifdef RE_ENABLE_I18N
345	else if (type == COMPLEX_BRACKET)
346	{
347	re_charset_t *cset = dfa->nodes[node].opr.mbcset;
348	Idx i;
349
350	# ifdef _LIBC
351	/ See if we have to try all bytes which start multiple collation*
352	elements.
353	e.g. In da_DK, we want to catch 'a' since "aa" is a valid
354	collation element, and don't catch 'b' since 'b' is
355	the only collation element which starts from 'b' (and
356	it is caught by SIMPLE_BRACKET). /*
357	if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != `0`
358	&& (cset->ncoll_syms \|\| cset->nranges))
359	{
360	const int32_t table = (const* int32_t *)
361	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
362	for (i = `0`; i < SBC_MAX; ++i)
363	if (table[i] < `0`)
364	re_set_fastmap (fastmap, icase, i);
365	}
366	# endif /* _LIBC */
367
368	/ See if we have to start the match at all multibyte characters,*
369	i.e. where we would not find an invalid sequence. This only
370	applies to multibyte character sets; for single byte character
371	sets, the SIMPLE_BRACKET again suffices. /*
372	if (dfa->mb_cur_max > `1`
373	&& (cset->nchar_classes \|\| cset->non_match \|\| cset->nranges
374	# ifdef _LIBC
375	\|\| cset->nequiv_classes
376	# endif /* _LIBC */
377	))
378	{
379	unsigned char c = `0`;
380	do
381	{
382	mbstate_t mbs;
383	memset (&mbs, `0`, sizeof (mbs));
384	if (__mbrtowc (NULL, (char *) &c, `1`, &mbs) == (size_t) -`2`)
385	re_set_fastmap (fastmap, false, (int) c);
386	}
387	while (++c != `0`);
388	}
389
390	else
391	{
392	/ ... Else catch all bytes which can start the mbchars. /
393	for (i = `0`; i < cset->nmbchars; ++i)
394	{
395	char buf[`256`];
396	mbstate_t state;
397	memset (&state, `'\0'`, sizeof (state));
398	if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -`1`)
399	re_set_fastmap (fastmap, icase, (unsigned* char *) buf);
400	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > `1`)
401	{
402	if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
403	!= (size_t) -`1`)
404	re_set_fastmap (fastmap, false, (unsigned* char *) buf);
405	}
406	}
407	}
408	}
409	#endif /* RE_ENABLE_I18N */
410	else if (type == OP_PERIOD
411	#ifdef RE_ENABLE_I18N
412	\|\| type == OP_UTF8_PERIOD
413	#endif /* RE_ENABLE_I18N */
414	\|\| type == END_OF_RE)
415	{
416	memset (fastmap, `'\1'`, sizeof (char) * SBC_MAX);
417	if (type == END_OF_RE)
418	bufp->can_be_null = `1`;
419	return;
420	}
421	}
422	}
423
424	/ Entry point for POSIX code. /
425	/ regcomp takes a regular expression as a string and compiles it.*
426
427	PREG is a regex_t . We do not expect any fields to be initialized,*
428	since POSIX says we shouldn't. Thus, we set
429
430	'buffer' to the compiled pattern;
431	'used' to the length of the compiled pattern;
432	'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
433	REG_EXTENDED bit in CFLAGS is set; otherwise, to
434	RE_SYNTAX_POSIX_BASIC;
435	'newline_anchor' to REG_NEWLINE being set in CFLAGS;
436	'fastmap' to an allocated space for the fastmap;
437	'fastmap_accurate' to zero;
438	're_nsub' to the number of subexpressions in PATTERN.
439
440	PATTERN is the address of the pattern string.
441
442	CFLAGS is a series of bits which affect compilation.
443
444	If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
445	use POSIX basic syntax.
446
447	If REG_NEWLINE is set, then . and [^...] don't match newline.
448	Also, regexec will try a match beginning after every newline.
449
450	If REG_ICASE is set, then we considers upper- and lowercase
451	versions of letters to be equivalent when matching.
452
453	If REG_NOSUB is set, then when PREG is passed to regexec, that
454	routine will report only success or failure, and nothing about the
455	registers.
456
457	It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
458	the return codes and their meanings.) /*
459
460	int
461	regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
462	{
463	reg_errcode_t ret;
464	reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
465	: RE_SYNTAX_POSIX_BASIC);
466
467	preg->buffer = NULL;
468	preg->allocated = `0`;
469	preg->used = `0`;
470
471	/ Try to allocate space for the fastmap. /
472	preg->fastmap = re_malloc (char, SBC_MAX);
473	if (__glibc_unlikely (preg->fastmap == NULL))
474	return REG_ESPACE;
475
476	syntax \|= (cflags & REG_ICASE) ? RE_ICASE : `0`;
477
478	/ If REG_NEWLINE is set, newlines are treated differently. /
479	if (cflags & REG_NEWLINE)
480	{ / REG_NEWLINE implies neither . nor [^...] match newline. /
481	syntax &= ~RE_DOT_NEWLINE;
482	syntax \|= RE_HAT_LISTS_NOT_NEWLINE;
483	/ It also changes the matching behavior. /
484	preg->newline_anchor = `1`;
485	}
486	else
487	preg->newline_anchor = `0`;
488	preg->no_sub = !!(cflags & REG_NOSUB);
489	preg->translate = NULL;
490
491	ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
492
493	/ POSIX doesn't distinguish between an unmatched open-group and an*
494	unmatched close-group: both are REG_EPAREN. /*
495	if (ret == REG_ERPAREN)
496	ret = REG_EPAREN;
497
498	/ We have already checked preg->fastmap != NULL. /
499	if (__glibc_likely (ret == REG_NOERROR))
500	/ Compute the fastmap now, since regexec cannot modify the pattern*
501	buffer. This function never fails in this implementation. /*
502	(void) re_compile_fastmap (preg);
503	else
504	{
505	/ Some error occurred while compiling the expression. /
506	re_free (preg->fastmap);
507	preg->fastmap = NULL;
508	}
509
510	return (int) ret;
511	}
512	libc_hidden_def (__regcomp)
513	weak_alias (__regcomp, regcomp)
514
515	/ Returns a message corresponding to an error code, ERRCODE, returned*
516	from either regcomp or regexec. We don't use PREG here. /*
517
518	size_t
519	regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
520	size_t errbuf_size)
521	{
522	const char *msg;
523	size_t msg_size;
524	int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[`0`];
525
526	if (__glibc_unlikely (errcode < `0` \|\| errcode >= nerrcodes))
527	/ Only error codes returned by the rest of the code should be passed*
528	to this routine. If we are given anything else, or if other regex
529	code generates an invalid error code, then the program has a bug.
530	Dump core so we can fix it. /*
531	abort ();
532
533	msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
534
535	msg_size = strlen (msg) + `1`; / Includes the null. /
536
537	if (__glibc_likely (errbuf_size != `0`))
538	{
539	size_t cpy_size = msg_size;
540	if (__glibc_unlikely (msg_size > errbuf_size))
541	{
542	cpy_size = errbuf_size - `1`;
543	errbuf[cpy_size] = `'\0'`;
544	}
545	memcpy (errbuf, msg, cpy_size);
546	}
547
548	return msg_size;
549	}
550	weak_alias (__regerror, regerror)
551
552
553	#ifdef RE_ENABLE_I18N
554	/ This static array is used for the map to single-byte characters when*
555	UTF-8 is used. Otherwise we would allocate memory just to initialize
556	it the same all the time. UTF-8 is the preferred encoding so this is
557	a worthwhile optimization. /*
558	static const bitset_t utf8_sb_map =
559	{
560	/ Set the first 128 bits. /
561	# if defined __GNUC__ && !defined __STRICT_ANSI__
562	[`0` ... `0x80` / BITSET_WORD_BITS - `1`] = BITSET_WORD_MAX
563	# else
564	# if 4 * BITSET_WORD_BITS < ASCII_CHARS
565	# error "bitset_word_t is narrower than 32 bits"
566	# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
567	BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
568	# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
569	BITSET_WORD_MAX, BITSET_WORD_MAX,
570	# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
571	BITSET_WORD_MAX,
572	# endif
573	(BITSET_WORD_MAX
574	>> (SBC_MAX % BITSET_WORD_BITS == `0`
575	? `0`
576	: BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
577	# endif
578	};
579	#endif
580
581
582	static void
583	free_dfa_content (re_dfa_t *dfa)
584	{
585	Idx i, j;
586
587	if (dfa->nodes)
588	for (i = `0`; i < dfa->nodes_len; ++i)
589	free_token (dfa->nodes + i);
590	re_free (dfa->nexts);
591	for (i = `0`; i < dfa->nodes_len; ++i)
592	{
593	if (dfa->eclosures != NULL)
594	re_node_set_free (dfa->eclosures + i);
595	if (dfa->inveclosures != NULL)
596	re_node_set_free (dfa->inveclosures + i);
597	if (dfa->edests != NULL)
598	re_node_set_free (dfa->edests + i);
599	}
600	re_free (dfa->edests);
601	re_free (dfa->eclosures);
602	re_free (dfa->inveclosures);
603	re_free (dfa->nodes);
604
605	if (dfa->state_table)
606	for (i = `0`; i <= dfa->state_hash_mask; ++i)
607	{
608	struct re_state_table_entry *entry = dfa->state_table + i;
609	for (j = `0`; j < entry->num; ++j)
610	{
611	re_dfastate_t *state = entry->array[j];
612	free_state (state);
613	}
614	re_free (entry->array);
615	}
616	re_free (dfa->state_table);
617	#ifdef RE_ENABLE_I18N
618	if (dfa->sb_char != utf8_sb_map)
619	re_free (dfa->sb_char);
620	#endif
621	re_free (dfa->subexp_map);
622	#ifdef DEBUG
623	re_free (dfa->re_str);
624	#endif
625
626	re_free (dfa);
627	}
628
629
630	/ Free dynamically allocated space used by PREG. /
631
632	void
633	regfree (regex_t *preg)
634	{
635	re_dfa_t *dfa = preg->buffer;
636	if (__glibc_likely (dfa != NULL))
637	{
638	lock_fini (dfa->lock);
639	free_dfa_content (dfa);
640	}
641	preg->buffer = NULL;
642	preg->allocated = `0`;
643
644	re_free (preg->fastmap);
645	preg->fastmap = NULL;
646
647	re_free (preg->translate);
648	preg->translate = NULL;
649	}
650	libc_hidden_def (__regfree)
651	weak_alias (__regfree, regfree)
652
653	/ Entry points compatible with 4.2 BSD regex library. We don't define*
654	them unless specifically requested. /*
655
656	#if defined _REGEX_RE_COMP \|\| defined _LIBC
657
658	/ BSD has one and only one pattern buffer. /
659	static struct re_pattern_buffer re_comp_buf;
660
661	char *
662	# ifdef _LIBC
663	/ Make these definitions weak in libc, so POSIX programs can redefine*
664	these names if they don't use our functions, and still use
665	regcomp/regexec above without link errors. /*
666	weak_function
667	# endif
668	re_comp (const char *s)
669	{
670	reg_errcode_t ret;
671	char *fastmap;
672
673	if (!s)
674	{
675	if (!re_comp_buf.buffer)
676	return gettext ("No previous regular expression");
677	return `0`;
678	}
679
680	if (re_comp_buf.buffer)
681	{
682	fastmap = re_comp_buf.fastmap;
683	re_comp_buf.fastmap = NULL;
684	__regfree (&re_comp_buf);
685	memset (&re_comp_buf, `'\0'`, sizeof (re_comp_buf));
686	re_comp_buf.fastmap = fastmap;
687	}
688
689	if (re_comp_buf.fastmap == NULL)
690	{
691	re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
692	if (re_comp_buf.fastmap == NULL)
693	return (char *) gettext (__re_error_msgid
694	+ __re_error_msgid_idx[(int) REG_ESPACE]);
695	}
696
697	/ Since 're_exec' always passes NULL for the 'regs' argument, we*
698	don't need to initialize the pattern buffer fields which affect it. /*
699
700	/ Match anchors at newlines. /
701	re_comp_buf.newline_anchor = `1`;
702
703	ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
704
705	if (!ret)
706	return NULL;
707
708	/ Yes, we're discarding 'const' here if !HAVE_LIBINTL. /
709	return (char ) gettext (__re_error_msgid + __re_error_msgid_idx[(int*) ret]);
710	}
711
712	#ifdef _LIBC
713	libc_freeres_fn (free_mem)
714	{
715	__regfree (&re_comp_buf);
716	}
717	#endif
718
719	#endif /* _REGEX_RE_COMP */
720
721	/ Internal entry point.*
722	Compile the regular expression PATTERN, whose length is LENGTH.
723	SYNTAX indicate regular expression's syntax. /*
724
725	static reg_errcode_t
726	re_compile_internal (regex_t preg, const* char * pattern, size_t length,
727	reg_syntax_t syntax)
728	{
729	reg_errcode_t err = REG_NOERROR;
730	re_dfa_t *dfa;
731	re_string_t regexp;
732
733	/ Initialize the pattern buffer. /
734	preg->fastmap_accurate = `0`;
735	preg->syntax = syntax;
736	preg->not_bol = preg->not_eol = `0`;
737	preg->used = `0`;
738	preg->re_nsub = `0`;
739	preg->can_be_null = `0`;
740	preg->regs_allocated = REGS_UNALLOCATED;
741
742	/ Initialize the dfa. /
743	dfa = preg->buffer;
744	if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
745	{
746	/ If zero allocated, but buffer is non-null, try to realloc*
747	enough space. This loses if buffer's address is bogus, but
748	that is the user's responsibility. If ->buffer is NULL this
749	is a simple allocation. /*
750	dfa = re_realloc (preg->buffer, re_dfa_t, `1`);
751	if (dfa == NULL)
752	return REG_ESPACE;
753	preg->allocated = sizeof (re_dfa_t);
754	preg->buffer = dfa;
755	}
756	preg->used = sizeof (re_dfa_t);
757
758	err = init_dfa (dfa, length);
759	if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != `0`))
760	err = REG_ESPACE;
761	if (__glibc_unlikely (err != REG_NOERROR))
762	{
763	free_dfa_content (dfa);
764	preg->buffer = NULL;
765	preg->allocated = `0`;
766	return err;
767	}
768	#ifdef DEBUG
769	/ Note: length+1 will not overflow since it is checked in init_dfa. /
770	dfa->re_str = re_malloc (char, length + `1`);
771	strncpy (dfa->re_str, pattern, length + `1`);
772	#endif
773
774	err = re_string_construct (&regexp, pattern, length, preg->translate,
775	(syntax & RE_ICASE) != `0`, dfa);
776	if (__glibc_unlikely (err != REG_NOERROR))
777	{
778	re_compile_internal_free_return:
779	free_workarea_compile (preg);
780	re_string_destruct (&regexp);
781	lock_fini (dfa->lock);
782	free_dfa_content (dfa);
783	preg->buffer = NULL;
784	preg->allocated = `0`;
785	return err;
786	}
787
788	/ Parse the regular expression, and build a structure tree. /
789	preg->re_nsub = `0`;
790	dfa->str_tree = parse (&regexp, preg, syntax, &err);
791	if (__glibc_unlikely (dfa->str_tree == NULL))
792	goto re_compile_internal_free_return;
793
794	/ Analyze the tree and create the nfa. /
795	err = analyze (preg);
796	if (__glibc_unlikely (err != REG_NOERROR))
797	goto re_compile_internal_free_return;
798
799	#ifdef RE_ENABLE_I18N
800	/ If possible, do searching in single byte encoding to speed things up. /
801	if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
802	optimize_utf8 (dfa);
803	#endif
804
805	/ Then create the initial state of the dfa. /
806	err = create_initial_state (dfa);
807
808	/ Release work areas. /
809	free_workarea_compile (preg);
810	re_string_destruct (&regexp);
811
812	if (__glibc_unlikely (err != REG_NOERROR))
813	{
814	lock_fini (dfa->lock);
815	free_dfa_content (dfa);
816	preg->buffer = NULL;
817	preg->allocated = `0`;
818	}
819
820	return err;
821	}
822
823	/ Initialize DFA. We use the length of the regular expression PAT_LEN*
824	as the initial length of some arrays. /*
825
826	static reg_errcode_t
827	init_dfa (re_dfa_t *dfa, size_t pat_len)
828	{
829	__re_size_t table_size;
830	#ifndef _LIBC
831	const char *codeset_name;
832	#endif
833	#ifdef RE_ENABLE_I18N
834	size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
835	#else
836	size_t max_i18n_object_size = `0`;
837	#endif
838	size_t max_object_size =
839	MAX (sizeof (struct re_state_table_entry),
840	MAX (sizeof (re_token_t),
841	MAX (sizeof (re_node_set),
842	MAX (sizeof (regmatch_t),
843	max_i18n_object_size))));
844
845	memset (dfa, `'\0'`, sizeof (re_dfa_t));
846
847	/ Force allocation of str_tree_storage the first time. /
848	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
849
850	/ Avoid overflows. The extra "/ 2" is for the table_size doubling*
851	calculation below, and for similar doubling calculations
852	elsewhere. And it's <= rather than <, because some of the
853	doubling calculations add 1 afterwards. /*
854	if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / `2`
855	<= pat_len))
856	return REG_ESPACE;
857
858	dfa->nodes_alloc = pat_len + `1`;
859	dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
860
861	/ table_size = 2 ^ ceil(log pat_len) /
862	for (table_size = `1`; ; table_size <<= `1`)
863	if (table_size > pat_len)
864	break;
865
866	dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
867	dfa->state_hash_mask = table_size - `1`;
868
869	dfa->mb_cur_max = MB_CUR_MAX;
870	#ifdef _LIBC
871	if (dfa->mb_cur_max == `6`
872	&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == `0`)
873	dfa->is_utf8 = `1`;
874	dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
875	!= `0`);
876	#else
877	codeset_name = nl_langinfo (CODESET);
878	if ((codeset_name[`0`] == `'U'` \|\| codeset_name[`0`] == `'u'`)
879	&& (codeset_name[`1`] == `'T'` \|\| codeset_name[`1`] == `'t'`)
880	&& (codeset_name[`2`] == `'F'` \|\| codeset_name[`2`] == `'f'`)
881	&& strcmp (codeset_name + `3` + (codeset_name[`3`] == `'-'`), "8") == `0`)
882	dfa->is_utf8 = `1`;
883
884	/ We check exhaustively in the loop below if this charset is a*
885	superset of ASCII. /*
886	dfa->map_notascii = `0`;
887	#endif
888
889	#ifdef RE_ENABLE_I18N
890	if (dfa->mb_cur_max > `1`)
891	{
892	if (dfa->is_utf8)
893	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
894	else
895	{
896	int i, j, ch;
897
898	dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), `1`);
899	if (__glibc_unlikely (dfa->sb_char == NULL))
900	return REG_ESPACE;
901
902	/ Set the bits corresponding to single byte chars. /
903	for (i = `0`, ch = `0`; i < BITSET_WORDS; ++i)
904	for (j = `0`; j < BITSET_WORD_BITS; ++j, ++ch)
905	{
906	wint_t wch = __btowc (ch);
907	if (wch != WEOF)
908	dfa->sb_char[i] \|= (bitset_word_t) `1` << j;
909	# ifndef _LIBC
910	if (isascii (ch) && wch != ch)
911	dfa->map_notascii = `1`;
912	# endif
913	}
914	}
915	}
916	#endif
917
918	if (__glibc_unlikely (dfa->nodes == NULL \|\| dfa->state_table == NULL))
919	return REG_ESPACE;
920	return REG_NOERROR;
921	}
922
923	/ Initialize WORD_CHAR table, which indicate which character is*
924	"word". In this case "word" means that it is the word construction
925	character used by some operators like "\<", "\>", etc. /*
926
927	static void
928	init_word_char (re_dfa_t *dfa)
929	{
930	int i = `0`;
931	int j;
932	int ch = `0`;
933	dfa->word_ops_used = `1`;
934	if (__glibc_likely (dfa->map_notascii == `0`))
935	{
936	/ Avoid uint32_t and uint64_t as some non-GCC platforms lack*
937	them, an issue when this code is used in Gnulib. /*
938	bitset_word_t bits0 = `0x00000000`;
939	bitset_word_t bits1 = `0x03ff0000`;
940	bitset_word_t bits2 = `0x87fffffe`;
941	bitset_word_t bits3 = `0x07fffffe`;
942	if (BITSET_WORD_BITS == `64`)
943	{
944	/ Pacify gcc -Woverflow on 32-bit platformns. /
945	dfa->word_char[`0`] = bits1 << `31` << `1` \| bits0;
946	dfa->word_char[`1`] = bits3 << `31` << `1` \| bits2;
947	i = `2`;
948	}
949	else if (BITSET_WORD_BITS == `32`)
950	{
951	dfa->word_char[`0`] = bits0;
952	dfa->word_char[`1`] = bits1;
953	dfa->word_char[`2`] = bits2;
954	dfa->word_char[`3`] = bits3;
955	i = `4`;
956	}
957	else
958	goto general_case;
959	ch = `128`;
960
961	if (__glibc_likely (dfa->is_utf8))
962	{
963	memset (&dfa->word_char[i], `'\0'`, (SBC_MAX - ch) / `8`);
964	return;
965	}
966	}
967
968	general_case:
969	for (; i < BITSET_WORDS; ++i)
970	for (j = `0`; j < BITSET_WORD_BITS; ++j, ++ch)
971	if (isalnum (ch) \|\| ch == `'_'`)
972	dfa->word_char[i] \|= (bitset_word_t) `1` << j;
973	}
974
975	/ Free the work area which are only used while compiling. /
976
977	static void
978	free_workarea_compile (regex_t *preg)
979	{
980	re_dfa_t *dfa = preg->buffer;
981	bin_tree_storage_t storage, next;
982	for (storage = dfa->str_tree_storage; storage; storage = next)
983	{
984	next = storage->next;
985	re_free (storage);
986	}
987	dfa->str_tree_storage = NULL;
988	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
989	dfa->str_tree = NULL;
990	re_free (dfa->org_indices);
991	dfa->org_indices = NULL;
992	}
993
994	/ Create initial states for all contexts. /
995
996	static reg_errcode_t
997	create_initial_state (re_dfa_t *dfa)
998	{
999	Idx first, i;
1000	reg_errcode_t err;
1001	re_node_set init_nodes;
1002
1003	/ Initial states have the epsilon closure of the node which is*
1004	the first node of the regular expression. /*
1005	first = dfa->str_tree->first->node_idx;
1006	dfa->init_node = first;
1007	err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1008	if (__glibc_unlikely (err != REG_NOERROR))
1009	return err;
1010
1011	/ The back-references which are in initial states can epsilon transit,*
1012	since in this case all of the subexpressions can be null.
1013	Then we add epsilon closures of the nodes which are the next nodes of
1014	the back-references. /*
1015	if (dfa->nbackref > `0`)
1016	for (i = `0`; i < init_nodes.nelem; ++i)
1017	{
1018	Idx node_idx = init_nodes.elems[i];
1019	re_token_type_t type = dfa->nodes[node_idx].type;
1020
1021	Idx clexp_idx;
1022	if (type != OP_BACK_REF)
1023	continue;
1024	for (clexp_idx = `0`; clexp_idx < init_nodes.nelem; ++clexp_idx)
1025	{
1026	re_token_t *clexp_node;
1027	clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1028	if (clexp_node->type == OP_CLOSE_SUBEXP
1029	&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1030	break;
1031	}
1032	if (clexp_idx == init_nodes.nelem)
1033	continue;
1034
1035	if (type == OP_BACK_REF)
1036	{
1037	Idx dest_idx = dfa->edests[node_idx].elems[`0`];
1038	if (!re_node_set_contains (&init_nodes, dest_idx))
1039	{
1040	reg_errcode_t merge_err
1041	= re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1042	if (merge_err != REG_NOERROR)
1043	return merge_err;
1044	i = `0`;
1045	}
1046	}
1047	}
1048
1049	/ It must be the first time to invoke acquire_state. /
1050	dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, `0`);
1051	/ We don't check ERR here, since the initial state must not be NULL. /
1052	if (__glibc_unlikely (dfa->init_state == NULL))
1053	return err;
1054	if (dfa->init_state->has_constraint)
1055	{
1056	dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1057	CONTEXT_WORD);
1058	dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1059	CONTEXT_NEWLINE);
1060	dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1061	&init_nodes,
1062	CONTEXT_NEWLINE
1063	\| CONTEXT_BEGBUF);
1064	if (__glibc_unlikely (dfa->init_state_word == NULL
1065	\|\| dfa->init_state_nl == NULL
1066	\|\| dfa->init_state_begbuf == NULL))
1067	return err;
1068	}
1069	else
1070	dfa->init_state_word = dfa->init_state_nl
1071	= dfa->init_state_begbuf = dfa->init_state;
1072
1073	re_node_set_free (&init_nodes);
1074	return REG_NOERROR;
1075	}
1076
1077	#ifdef RE_ENABLE_I18N
1078	/ If it is possible to do searching in single byte encoding instead of UTF-8*
1079	to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1080	DFA nodes where needed. /*
1081
1082	static void
1083	optimize_utf8 (re_dfa_t *dfa)
1084	{
1085	Idx node;
1086	int i;
1087	bool mb_chars = false;
1088	bool has_period = false;
1089
1090	for (node = `0`; node < dfa->nodes_len; ++node)
1091	switch (dfa->nodes[node].type)
1092	{
1093	case CHARACTER:
1094	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1095	mb_chars = true;
1096	break;
1097	case ANCHOR:
1098	switch (dfa->nodes[node].opr.ctx_type)
1099	{
1100	case LINE_FIRST:
1101	case LINE_LAST:
1102	case BUF_FIRST:
1103	case BUF_LAST:
1104	break;
1105	default:
1106	/ Word anchors etc. cannot be handled. It's okay to test*
1107	opr.ctx_type since constraints (for all DFA nodes) are
1108	created by ORing one or more opr.ctx_type values. /*
1109	return;
1110	}
1111	break;
1112	case OP_PERIOD:
1113	has_period = true;
1114	break;
1115	case OP_BACK_REF:
1116	case OP_ALT:
1117	case END_OF_RE:
1118	case OP_DUP_ASTERISK:
1119	case OP_OPEN_SUBEXP:
1120	case OP_CLOSE_SUBEXP:
1121	break;
1122	case COMPLEX_BRACKET:
1123	return;
1124	case SIMPLE_BRACKET:
1125	/ Just double check. /
1126	{
1127	int rshift = (ASCII_CHARS % BITSET_WORD_BITS == `0`
1128	? `0`
1129	: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1130	for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1131	{
1132	if (dfa->nodes[node].opr.sbcset[i] >> rshift != `0`)
1133	return;
1134	rshift = `0`;
1135	}
1136	}
1137	break;
1138	default:
1139	abort ();
1140	}
1141
1142	if (mb_chars \|\| has_period)
1143	for (node = `0`; node < dfa->nodes_len; ++node)
1144	{
1145	if (dfa->nodes[node].type == CHARACTER
1146	&& dfa->nodes[node].opr.c >= ASCII_CHARS)
1147	dfa->nodes[node].mb_partial = `0`;
1148	else if (dfa->nodes[node].type == OP_PERIOD)
1149	dfa->nodes[node].type = OP_UTF8_PERIOD;
1150	}
1151
1152	/ The search can be in single byte locale. /
1153	dfa->mb_cur_max = `1`;
1154	dfa->is_utf8 = `0`;
1155	dfa->has_mb_node = dfa->nbackref > `0` \|\| has_period;
1156	}
1157	#endif
1158
1159	/ Analyze the structure tree, and calculate "first", "next", "edest",*
1160	"eclosure", and "inveclosure". /*
1161
1162	static reg_errcode_t
1163	analyze (regex_t *preg)
1164	{
1165	re_dfa_t *dfa = preg->buffer;
1166	reg_errcode_t ret;
1167
1168	/ Allocate arrays. /
1169	dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1170	dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1171	dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1172	dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1173	if (__glibc_unlikely (dfa->nexts == NULL \|\| dfa->org_indices == NULL
1174	\|\| dfa->edests == NULL \|\| dfa->eclosures == NULL))
1175	return REG_ESPACE;
1176
1177	dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1178	if (dfa->subexp_map != NULL)
1179	{
1180	Idx i;
1181	for (i = `0`; i < preg->re_nsub; i++)
1182	dfa->subexp_map[i] = i;
1183	preorder (dfa->str_tree, optimize_subexps, dfa);
1184	for (i = `0`; i < preg->re_nsub; i++)
1185	if (dfa->subexp_map[i] != i)
1186	break;
1187	if (i == preg->re_nsub)
1188	{
1189	re_free (dfa->subexp_map);
1190	dfa->subexp_map = NULL;
1191	}
1192	}
1193
1194	ret = postorder (dfa->str_tree, lower_subexps, preg);
1195	if (__glibc_unlikely (ret != REG_NOERROR))
1196	return ret;
1197	ret = postorder (dfa->str_tree, calc_first, dfa);
1198	if (__glibc_unlikely (ret != REG_NOERROR))
1199	return ret;
1200	preorder (dfa->str_tree, calc_next, dfa);
1201	ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1202	if (__glibc_unlikely (ret != REG_NOERROR))
1203	return ret;
1204	ret = calc_eclosure (dfa);
1205	if (__glibc_unlikely (ret != REG_NOERROR))
1206	return ret;
1207
1208	/ We only need this during the prune_impossible_nodes pass in regexec.c;*
1209	skip it if p_i_n will not run, as calc_inveclosure can be quadratic. /*
1210	if ((!preg->no_sub && preg->re_nsub > `0` && dfa->has_plural_match)
1211	\|\| dfa->nbackref)
1212	{
1213	dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1214	if (__glibc_unlikely (dfa->inveclosures == NULL))
1215	return REG_ESPACE;
1216	ret = calc_inveclosure (dfa);
1217	}
1218
1219	return ret;
1220	}
1221
1222	/ Our parse trees are very unbalanced, so we cannot use a stack to*
1223	implement parse tree visits. Instead, we use parent pointers and
1224	some hairy code in these two functions. /*
1225	static reg_errcode_t
1226	postorder (bin_tree_t root, reg_errcode_t (fn (void* , bin_tree_t )),
1227	void *extra)
1228	{
1229	bin_tree_t node, prev;
1230
1231	for (node = root; ; )
1232	{
1233	/ Descend down the tree, preferably to the left (or to the right*
1234	if that's the only child). /*
1235	while (node->left \|\| node->right)
1236	if (node->left)
1237	node = node->left;
1238	else
1239	node = node->right;
1240
1241	do
1242	{
1243	reg_errcode_t err = fn (extra, node);
1244	if (__glibc_unlikely (err != REG_NOERROR))
1245	return err;
1246	if (node->parent == NULL)
1247	return REG_NOERROR;
1248	prev = node;
1249	node = node->parent;
1250	}
1251	/ Go up while we have a node that is reached from the right. /
1252	while (node->right == prev \|\| node->right == NULL);
1253	node = node->right;
1254	}
1255	}
1256
1257	static reg_errcode_t
1258	preorder (bin_tree_t root, reg_errcode_t (fn (void* , bin_tree_t )),
1259	void *extra)
1260	{
1261	bin_tree_t *node;
1262
1263	for (node = root; ; )
1264	{
1265	reg_errcode_t err = fn (extra, node);
1266	if (__glibc_unlikely (err != REG_NOERROR))
1267	return err;
1268
1269	/ Go to the left node, or up and to the right. /
1270	if (node->left)
1271	node = node->left;
1272	else
1273	{
1274	bin_tree_t *prev = NULL;
1275	while (node->right == prev \|\| node->right == NULL)
1276	{
1277	prev = node;
1278	node = node->parent;
1279	if (!node)
1280	return REG_NOERROR;
1281	}
1282	node = node->right;
1283	}
1284	}
1285	}
1286
1287	/ Optimization pass: if a SUBEXP is entirely contained, strip it and tell*
1288	re_search_internal to map the inner one's opr.idx to this one's. Adjust
1289	backreferences as well. Requires a preorder visit. /*
1290	static reg_errcode_t
1291	optimize_subexps (void extra, bin_tree_t node)
1292	{
1293	re_dfa_t dfa = (re_dfa_t ) extra;
1294
1295	if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1296	{
1297	int idx = node->token.opr.idx;
1298	node->token.opr.idx = dfa->subexp_map[idx];
1299	dfa->used_bkref_map \|= `1` << node->token.opr.idx;
1300	}
1301
1302	else if (node->token.type == SUBEXP
1303	&& node->left && node->left->token.type == SUBEXP)
1304	{
1305	Idx other_idx = node->left->token.opr.idx;
1306
1307	node->left = node->left->left;
1308	if (node->left)
1309	node->left->parent = node;
1310
1311	dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1312	if (other_idx < BITSET_WORD_BITS)
1313	dfa->used_bkref_map &= ~((bitset_word_t) `1` << other_idx);
1314	}
1315
1316	return REG_NOERROR;
1317	}
1318
1319	/ Lowering pass: Turn each SUBEXP node into the appropriate concatenation*
1320	of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. /*
1321	static reg_errcode_t
1322	lower_subexps (void extra, bin_tree_t node)
1323	{
1324	regex_t preg = (regex_t ) extra;
1325	reg_errcode_t err = REG_NOERROR;
1326
1327	if (node->left && node->left->token.type == SUBEXP)
1328	{
1329	node->left = lower_subexp (&err, preg, node->left);
1330	if (node->left)
1331	node->left->parent = node;
1332	}
1333	if (node->right && node->right->token.type == SUBEXP)
1334	{
1335	node->right = lower_subexp (&err, preg, node->right);
1336	if (node->right)
1337	node->right->parent = node;
1338	}
1339
1340	return err;
1341	}
1342
1343	static bin_tree_t *
1344	lower_subexp (reg_errcode_t err, regex_t preg, bin_tree_t *node)
1345	{
1346	re_dfa_t *dfa = preg->buffer;
1347	bin_tree_t *body = node->left;
1348	bin_tree_t op, cls, tree1, tree;
1349
1350	if (preg->no_sub
1351	/ We do not optimize empty subexpressions, because otherwise we may*
1352	have bad CONCAT nodes with NULL children. This is obviously not
1353	very common, so we do not lose much. An example that triggers
1354	this case is the sed "script" //x. /*
1355	&& node->left != NULL
1356	&& (node->token.opr.idx >= BITSET_WORD_BITS
1357	\|\| !(dfa->used_bkref_map
1358	& ((bitset_word_t) `1` << node->token.opr.idx))))
1359	return node->left;
1360
1361	/ Convert the SUBEXP node to the concatenation of an*
1362	OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. /*
1363	op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1364	cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1365	tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1366	tree = create_tree (dfa, op, tree1, CONCAT);
1367	if (__glibc_unlikely (tree == NULL \|\| tree1 == NULL
1368	\|\| op == NULL \|\| cls == NULL))
1369	{
1370	*err = REG_ESPACE;
1371	return NULL;
1372	}
1373
1374	op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1375	op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1376	return tree;
1377	}
1378
1379	/ Pass 1 in building the NFA: compute FIRST and create unlinked automaton*
1380	nodes. Requires a postorder visit. /*
1381	static reg_errcode_t
1382	calc_first (void extra, bin_tree_t node)
1383	{
1384	re_dfa_t dfa = (re_dfa_t ) extra;
1385	if (node->token.type == CONCAT)
1386	{
1387	node->first = node->left->first;
1388	node->node_idx = node->left->node_idx;
1389	}
1390	else
1391	{
1392	node->first = node;
1393	node->node_idx = re_dfa_add_node (dfa, node->token);
1394	if (__glibc_unlikely (node->node_idx == -`1`))
1395	return REG_ESPACE;
1396	if (node->token.type == ANCHOR)
1397	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1398	}
1399	return REG_NOERROR;
1400	}
1401
1402	/ Pass 2: compute NEXT on the tree. Preorder visit. /
1403	static reg_errcode_t
1404	calc_next (void extra, bin_tree_t node)
1405	{
1406	switch (node->token.type)
1407	{
1408	case OP_DUP_ASTERISK:
1409	node->left->next = node;
1410	break;
1411	case CONCAT:
1412	node->left->next = node->right->first;
1413	node->right->next = node->next;
1414	break;
1415	default:
1416	if (node->left)
1417	node->left->next = node->next;
1418	if (node->right)
1419	node->right->next = node->next;
1420	break;
1421	}
1422	return REG_NOERROR;
1423	}
1424
1425	/ Pass 3: link all DFA nodes to their NEXT node (any order will do). /
1426	static reg_errcode_t
1427	link_nfa_nodes (void extra, bin_tree_t node)
1428	{
1429	re_dfa_t dfa = (re_dfa_t ) extra;
1430	Idx idx = node->node_idx;
1431	reg_errcode_t err = REG_NOERROR;
1432
1433	switch (node->token.type)
1434	{
1435	case CONCAT:
1436	break;
1437
1438	case END_OF_RE:
1439	DEBUG_ASSERT (node->next == NULL);
1440	break;
1441
1442	case OP_DUP_ASTERISK:
1443	case OP_ALT:
1444	{
1445	Idx left, right;
1446	dfa->has_plural_match = `1`;
1447	if (node->left != NULL)
1448	left = node->left->first->node_idx;
1449	else
1450	left = node->next->node_idx;
1451	if (node->right != NULL)
1452	right = node->right->first->node_idx;
1453	else
1454	right = node->next->node_idx;
1455	DEBUG_ASSERT (left > -`1`);
1456	DEBUG_ASSERT (right > -`1`);
1457	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1458	}
1459	break;
1460
1461	case ANCHOR:
1462	case OP_OPEN_SUBEXP:
1463	case OP_CLOSE_SUBEXP:
1464	err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1465	break;
1466
1467	case OP_BACK_REF:
1468	dfa->nexts[idx] = node->next->node_idx;
1469	if (node->token.type == OP_BACK_REF)
1470	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1471	break;
1472
1473	default:
1474	DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1475	dfa->nexts[idx] = node->next->node_idx;
1476	break;
1477	}
1478
1479	return err;
1480	}
1481
1482	/ Duplicate the epsilon closure of the node ROOT_NODE.*
1483	Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1484	to their own constraint. /*
1485
1486	static reg_errcode_t
1487	duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1488	Idx root_node, unsigned int init_constraint)
1489	{
1490	Idx org_node, clone_node;
1491	bool ok;
1492	unsigned int constraint = init_constraint;
1493	for (org_node = top_org_node, clone_node = top_clone_node;;)
1494	{
1495	Idx org_dest, clone_dest;
1496	if (dfa->nodes[org_node].type == OP_BACK_REF)
1497	{
1498	/ If the back reference epsilon-transit, its destination must*
1499	also have the constraint. Then duplicate the epsilon closure
1500	of the destination of the back reference, and store it in
1501	edests of the back reference. /*
1502	org_dest = dfa->nexts[org_node];
1503	re_node_set_empty (dfa->edests + clone_node);
1504	clone_dest = duplicate_node (dfa, org_dest, constraint);
1505	if (__glibc_unlikely (clone_dest == -`1`))
1506	return REG_ESPACE;
1507	dfa->nexts[clone_node] = dfa->nexts[org_node];
1508	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1509	if (__glibc_unlikely (! ok))
1510	return REG_ESPACE;
1511	}
1512	else if (dfa->edests[org_node].nelem == `0`)
1513	{
1514	/ In case of the node can't epsilon-transit, don't duplicate the*
1515	destination and store the original destination as the
1516	destination of the node. /*
1517	dfa->nexts[clone_node] = dfa->nexts[org_node];
1518	break;
1519	}
1520	else if (dfa->edests[org_node].nelem == `1`)
1521	{
1522	/ In case of the node can epsilon-transit, and it has only one*
1523	destination. /*
1524	org_dest = dfa->edests[org_node].elems[`0`];
1525	re_node_set_empty (dfa->edests + clone_node);
1526	/ If the node is root_node itself, it means the epsilon closure*
1527	has a loop. Then tie it to the destination of the root_node. /*
1528	if (org_node == root_node && clone_node != org_node)
1529	{
1530	ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1531	if (__glibc_unlikely (! ok))
1532	return REG_ESPACE;
1533	break;
1534	}
1535	/ In case the node has another constraint, append it. /
1536	constraint \|= dfa->nodes[org_node].constraint;
1537	clone_dest = duplicate_node (dfa, org_dest, constraint);
1538	if (__glibc_unlikely (clone_dest == -`1`))
1539	return REG_ESPACE;
1540	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1541	if (__glibc_unlikely (! ok))
1542	return REG_ESPACE;
1543	}
1544	else / dfa->edests[org_node].nelem == 2 /
1545	{
1546	/ In case of the node can epsilon-transit, and it has two*
1547	destinations. In the bin_tree_t and DFA, that's '\|' and ''. /
1548	org_dest = dfa->edests[org_node].elems[`0`];
1549	re_node_set_empty (dfa->edests + clone_node);
1550	/ Search for a duplicated node which satisfies the constraint. /
1551	clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1552	if (clone_dest == -`1`)
1553	{
1554	/ There is no such duplicated node, create a new one. /
1555	reg_errcode_t err;
1556	clone_dest = duplicate_node (dfa, org_dest, constraint);
1557	if (__glibc_unlikely (clone_dest == -`1`))
1558	return REG_ESPACE;
1559	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560	if (__glibc_unlikely (! ok))
1561	return REG_ESPACE;
1562	err = duplicate_node_closure (dfa, org_dest, clone_dest,
1563	root_node, constraint);
1564	if (__glibc_unlikely (err != REG_NOERROR))
1565	return err;
1566	}
1567	else
1568	{
1569	/ There is a duplicated node which satisfies the constraint,*
1570	use it to avoid infinite loop. /*
1571	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1572	if (__glibc_unlikely (! ok))
1573	return REG_ESPACE;
1574	}
1575
1576	org_dest = dfa->edests[org_node].elems[`1`];
1577	clone_dest = duplicate_node (dfa, org_dest, constraint);
1578	if (__glibc_unlikely (clone_dest == -`1`))
1579	return REG_ESPACE;
1580	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1581	if (__glibc_unlikely (! ok))
1582	return REG_ESPACE;
1583	}
1584	org_node = org_dest;
1585	clone_node = clone_dest;
1586	}
1587	return REG_NOERROR;
1588	}
1589
1590	/ Search for a node which is duplicated from the node ORG_NODE, and*
1591	satisfies the constraint CONSTRAINT. /*
1592
1593	static Idx
1594	search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1595	unsigned int constraint)
1596	{
1597	Idx idx;
1598	for (idx = dfa->nodes_len - `1`; dfa->nodes[idx].duplicated && idx > `0`; --idx)
1599	{
1600	if (org_node == dfa->org_indices[idx]
1601	&& constraint == dfa->nodes[idx].constraint)
1602	return idx; / Found. /
1603	}
1604	return -`1`; / Not found. /
1605	}
1606
1607	/ Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.*
1608	Return the index of the new node, or -1 if insufficient storage is
1609	available. /*
1610
1611	static Idx
1612	duplicate_node (re_dfa_t dfa, Idx org_idx, unsigned* int constraint)
1613	{
1614	Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1615	if (__glibc_likely (dup_idx != -`1`))
1616	{
1617	dfa->nodes[dup_idx].constraint = constraint;
1618	dfa->nodes[dup_idx].constraint \|= dfa->nodes[org_idx].constraint;
1619	dfa->nodes[dup_idx].duplicated = `1`;
1620
1621	/ Store the index of the original node. /
1622	dfa->org_indices[dup_idx] = org_idx;
1623	}
1624	return dup_idx;
1625	}
1626
1627	static reg_errcode_t
1628	calc_inveclosure (re_dfa_t *dfa)
1629	{
1630	Idx src, idx;
1631	bool ok;
1632	for (idx = `0`; idx < dfa->nodes_len; ++idx)
1633	re_node_set_init_empty (dfa->inveclosures + idx);
1634
1635	for (src = `0`; src < dfa->nodes_len; ++src)
1636	{
1637	Idx *elems = dfa->eclosures[src].elems;
1638	for (idx = `0`; idx < dfa->eclosures[src].nelem; ++idx)
1639	{
1640	ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1641	if (__glibc_unlikely (! ok))
1642	return REG_ESPACE;
1643	}
1644	}
1645
1646	return REG_NOERROR;
1647	}
1648
1649	/ Calculate "eclosure" for all the node in DFA. /
1650
1651	static reg_errcode_t
1652	calc_eclosure (re_dfa_t *dfa)
1653	{
1654	Idx node_idx;
1655	bool incomplete;
1656	DEBUG_ASSERT (dfa->nodes_len > `0`);
1657	incomplete = false;
1658	/ For each nodes, calculate epsilon closure. /
1659	for (node_idx = `0`; ; ++node_idx)
1660	{
1661	reg_errcode_t err;
1662	re_node_set eclosure_elem;
1663	if (node_idx == dfa->nodes_len)
1664	{
1665	if (!incomplete)
1666	break;
1667	incomplete = false;
1668	node_idx = `0`;
1669	}
1670
1671	DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -`1`);
1672
1673	/ If we have already calculated, skip it. /
1674	if (dfa->eclosures[node_idx].nelem != `0`)
1675	continue;
1676	/ Calculate epsilon closure of 'node_idx'. /
1677	err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1678	if (__glibc_unlikely (err != REG_NOERROR))
1679	return err;
1680
1681	if (dfa->eclosures[node_idx].nelem == `0`)
1682	{
1683	incomplete = true;
1684	re_node_set_free (&eclosure_elem);
1685	}
1686	}
1687	return REG_NOERROR;
1688	}
1689
1690	/ Calculate epsilon closure of NODE. /
1691
1692	static reg_errcode_t
1693	calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa, Idx node, bool root)
1694	{
1695	reg_errcode_t err;
1696	Idx i;
1697	re_node_set eclosure;
1698	bool ok;
1699	bool incomplete = false;
1700	err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + `1`);
1701	if (__glibc_unlikely (err != REG_NOERROR))
1702	return err;
1703
1704	/ This indicates that we are calculating this node now.*
1705	We reference this value to avoid infinite loop. /*
1706	dfa->eclosures[node].nelem = -`1`;
1707
1708	/ If the current node has constraints, duplicate all nodes*
1709	since they must inherit the constraints. /*
1710	if (dfa->nodes[node].constraint
1711	&& dfa->edests[node].nelem
1712	&& !dfa->nodes[dfa->edests[node].elems[`0`]].duplicated)
1713	{
1714	err = duplicate_node_closure (dfa, node, node, node,
1715	dfa->nodes[node].constraint);
1716	if (__glibc_unlikely (err != REG_NOERROR))
1717	return err;
1718	}
1719
1720	/ Expand each epsilon destination nodes. /
1721	if (IS_EPSILON_NODE(dfa->nodes[node].type))
1722	for (i = `0`; i < dfa->edests[node].nelem; ++i)
1723	{
1724	re_node_set eclosure_elem;
1725	Idx edest = dfa->edests[node].elems[i];
1726	/ If calculating the epsilon closure of 'edest' is in progress,*
1727	return intermediate result. /*
1728	if (dfa->eclosures[edest].nelem == -`1`)
1729	{
1730	incomplete = true;
1731	continue;
1732	}
1733	/ If we haven't calculated the epsilon closure of 'edest' yet,*
1734	calculate now. Otherwise use calculated epsilon closure. /*
1735	if (dfa->eclosures[edest].nelem == `0`)
1736	{
1737	err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1738	if (__glibc_unlikely (err != REG_NOERROR))
1739	return err;
1740	}
1741	else
1742	eclosure_elem = dfa->eclosures[edest];
1743	/ Merge the epsilon closure of 'edest'. /
1744	err = re_node_set_merge (&eclosure, &eclosure_elem);
1745	if (__glibc_unlikely (err != REG_NOERROR))
1746	return err;
1747	/ If the epsilon closure of 'edest' is incomplete,*
1748	the epsilon closure of this node is also incomplete. /*
1749	if (dfa->eclosures[edest].nelem == `0`)
1750	{
1751	incomplete = true;
1752	re_node_set_free (&eclosure_elem);
1753	}
1754	}
1755
1756	/ An epsilon closure includes itself. /
1757	ok = re_node_set_insert (&eclosure, node);
1758	if (__glibc_unlikely (! ok))
1759	return REG_ESPACE;
1760	if (incomplete && !root)
1761	dfa->eclosures[node].nelem = `0`;
1762	else
1763	dfa->eclosures[node] = eclosure;
1764	*new_set = eclosure;
1765	return REG_NOERROR;
1766	}
1767
1768	/ Functions for token which are used in the parser. /
1769
1770	/ Fetch a token from INPUT.*
1771	We must not use this function inside bracket expressions. /*
1772
1773	static void
1774	fetch_token (re_token_t result, re_string_t input, reg_syntax_t syntax)
1775	{
1776	re_string_skip_bytes (input, peek_token (result, input, syntax));
1777	}
1778
1779	/ Peek a token from INPUT, and return the length of the token.*
1780	We must not use this function inside bracket expressions. /*
1781
1782	static int
1783	peek_token (re_token_t token, re_string_t input, reg_syntax_t syntax)
1784	{
1785	unsigned char c;
1786
1787	if (re_string_eoi (input))
1788	{
1789	token->type = END_OF_RE;
1790	return `0`;
1791	}
1792
1793	c = re_string_peek_byte (input, `0`);
1794	token->opr.c = c;
1795
1796	token->word_char = `0`;
1797	#ifdef RE_ENABLE_I18N
1798	token->mb_partial = `0`;
1799	if (input->mb_cur_max > `1`
1800	&& !re_string_first_byte (input, re_string_cur_idx (input)))
1801	{
1802	token->type = CHARACTER;
1803	token->mb_partial = `1`;
1804	return `1`;
1805	}
1806	#endif
1807	if (c == `'\\'`)
1808	{
1809	unsigned char c2;
1810	if (re_string_cur_idx (input) + `1` >= re_string_length (input))
1811	{
1812	token->type = BACK_SLASH;
1813	return `1`;
1814	}
1815
1816	c2 = re_string_peek_byte_case (input, `1`);
1817	token->opr.c = c2;
1818	token->type = CHARACTER;
1819	#ifdef RE_ENABLE_I18N
1820	if (input->mb_cur_max > `1`)
1821	{
1822	wint_t wc = re_string_wchar_at (input,
1823	re_string_cur_idx (input) + `1`);
1824	token->word_char = IS_WIDE_WORD_CHAR (wc) != `0`;
1825	}
1826	else
1827	#endif
1828	token->word_char = IS_WORD_CHAR (c2) != `0`;
1829
1830	switch (c2)
1831	{
1832	case `'\|'`:
1833	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1834	token->type = OP_ALT;
1835	break;
1836	case `'1'`: case `'2'`: case `'3'`: case `'4'`: case `'5'`:
1837	case `'6'`: case `'7'`: case `'8'`: case `'9'`:
1838	if (!(syntax & RE_NO_BK_REFS))
1839	{
1840	token->type = OP_BACK_REF;
1841	token->opr.idx = c2 - `'1'`;
1842	}
1843	break;
1844	case `'<'`:
1845	if (!(syntax & RE_NO_GNU_OPS))
1846	{
1847	token->type = ANCHOR;
1848	token->opr.ctx_type = WORD_FIRST;
1849	}
1850	break;
1851	case `'>'`:
1852	if (!(syntax & RE_NO_GNU_OPS))
1853	{
1854	token->type = ANCHOR;
1855	token->opr.ctx_type = WORD_LAST;
1856	}
1857	break;
1858	case `'b'`:
1859	if (!(syntax & RE_NO_GNU_OPS))
1860	{
1861	token->type = ANCHOR;
1862	token->opr.ctx_type = WORD_DELIM;
1863	}
1864	break;
1865	case `'B'`:
1866	if (!(syntax & RE_NO_GNU_OPS))
1867	{
1868	token->type = ANCHOR;
1869	token->opr.ctx_type = NOT_WORD_DELIM;
1870	}
1871	break;
1872	case `'w'`:
1873	if (!(syntax & RE_NO_GNU_OPS))
1874	token->type = OP_WORD;
1875	break;
1876	case `'W'`:
1877	if (!(syntax & RE_NO_GNU_OPS))
1878	token->type = OP_NOTWORD;
1879	break;
1880	case `'s'`:
1881	if (!(syntax & RE_NO_GNU_OPS))
1882	token->type = OP_SPACE;
1883	break;
1884	case `'S'`:
1885	if (!(syntax & RE_NO_GNU_OPS))
1886	token->type = OP_NOTSPACE;
1887	break;
1888	case '`':
1889	if (!(syntax & RE_NO_GNU_OPS))
1890	{
1891	token->type = ANCHOR;
1892	token->opr.ctx_type = BUF_FIRST;
1893	}
1894	break;
1895	case `'\''`:
1896	if (!(syntax & RE_NO_GNU_OPS))
1897	{
1898	token->type = ANCHOR;
1899	token->opr.ctx_type = BUF_LAST;
1900	}
1901	break;
1902	case `'('`:
1903	if (!(syntax & RE_NO_BK_PARENS))
1904	token->type = OP_OPEN_SUBEXP;
1905	break;
1906	case `')'`:
1907	if (!(syntax & RE_NO_BK_PARENS))
1908	token->type = OP_CLOSE_SUBEXP;
1909	break;
1910	case `'+'`:
1911	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1912	token->type = OP_DUP_PLUS;
1913	break;
1914	case `'?'`:
1915	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1916	token->type = OP_DUP_QUESTION;
1917	break;
1918	case `'{'`:
1919	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1920	token->type = OP_OPEN_DUP_NUM;
1921	break;
1922	case `'}'`:
1923	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1924	token->type = OP_CLOSE_DUP_NUM;
1925	break;
1926	default:
1927	break;
1928	}
1929	return `2`;
1930	}
1931
1932	token->type = CHARACTER;
1933	#ifdef RE_ENABLE_I18N
1934	if (input->mb_cur_max > `1`)
1935	{
1936	wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1937	token->word_char = IS_WIDE_WORD_CHAR (wc) != `0`;
1938	}
1939	else
1940	#endif
1941	token->word_char = IS_WORD_CHAR (token->opr.c);
1942
1943	switch (c)
1944	{
1945	case `'\n'`:
1946	if (syntax & RE_NEWLINE_ALT)
1947	token->type = OP_ALT;
1948	break;
1949	case `'\|'`:
1950	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1951	token->type = OP_ALT;
1952	break;
1953	case `'*'`:
1954	token->type = OP_DUP_ASTERISK;
1955	break;
1956	case `'+'`:
1957	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1958	token->type = OP_DUP_PLUS;
1959	break;
1960	case `'?'`:
1961	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1962	token->type = OP_DUP_QUESTION;
1963	break;
1964	case `'{'`:
1965	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1966	token->type = OP_OPEN_DUP_NUM;
1967	break;
1968	case `'}'`:
1969	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1970	token->type = OP_CLOSE_DUP_NUM;
1971	break;
1972	case `'('`:
1973	if (syntax & RE_NO_BK_PARENS)
1974	token->type = OP_OPEN_SUBEXP;
1975	break;
1976	case `')'`:
1977	if (syntax & RE_NO_BK_PARENS)
1978	token->type = OP_CLOSE_SUBEXP;
1979	break;
1980	case `'['`:
1981	token->type = OP_OPEN_BRACKET;
1982	break;
1983	case `'.'`:
1984	token->type = OP_PERIOD;
1985	break;
1986	case `'^'`:
1987	if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS \| RE_CARET_ANCHORS_HERE))
1988	&& re_string_cur_idx (input) != `0`)
1989	{
1990	char prev = re_string_peek_byte (input, -`1`);
1991	if (!(syntax & RE_NEWLINE_ALT) \|\| prev != `'\n'`)
1992	break;
1993	}
1994	token->type = ANCHOR;
1995	token->opr.ctx_type = LINE_FIRST;
1996	break;
1997	case `'$'`:
1998	if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1999	&& re_string_cur_idx (input) + `1` != re_string_length (input))
2000	{
2001	re_token_t next;
2002	re_string_skip_bytes (input, `1`);
2003	peek_token (&next, input, syntax);
2004	re_string_skip_bytes (input, -`1`);
2005	if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2006	break;
2007	}
2008	token->type = ANCHOR;
2009	token->opr.ctx_type = LINE_LAST;
2010	break;
2011	default:
2012	break;
2013	}
2014	return `1`;
2015	}
2016
2017	/ Peek a token from INPUT, and return the length of the token.*
2018	We must not use this function out of bracket expressions. /*
2019
2020	static int
2021	peek_token_bracket (re_token_t token, re_string_t input, reg_syntax_t syntax)
2022	{
2023	unsigned char c;
2024	if (re_string_eoi (input))
2025	{
2026	token->type = END_OF_RE;
2027	return `0`;
2028	}
2029	c = re_string_peek_byte (input, `0`);
2030	token->opr.c = c;
2031
2032	#ifdef RE_ENABLE_I18N
2033	if (input->mb_cur_max > `1`
2034	&& !re_string_first_byte (input, re_string_cur_idx (input)))
2035	{
2036	token->type = CHARACTER;
2037	return `1`;
2038	}
2039	#endif /* RE_ENABLE_I18N */
2040
2041	if (c == `'\\'` && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2042	&& re_string_cur_idx (input) + `1` < re_string_length (input))
2043	{
2044	/ In this case, '\' escape a character. /
2045	unsigned char c2;
2046	re_string_skip_bytes (input, `1`);
2047	c2 = re_string_peek_byte (input, `0`);
2048	token->opr.c = c2;
2049	token->type = CHARACTER;
2050	return `1`;
2051	}
2052	if (c == `'['`) / '[' is a special char in a bracket exps. /
2053	{
2054	unsigned char c2;
2055	int token_len;
2056	if (re_string_cur_idx (input) + `1` < re_string_length (input))
2057	c2 = re_string_peek_byte (input, `1`);
2058	else
2059	c2 = `0`;
2060	token->opr.c = c2;
2061	token_len = `2`;
2062	switch (c2)
2063	{
2064	case `'.'`:
2065	token->type = OP_OPEN_COLL_ELEM;
2066	break;
2067
2068	case `'='`:
2069	token->type = OP_OPEN_EQUIV_CLASS;
2070	break;
2071
2072	case `':'`:
2073	if (syntax & RE_CHAR_CLASSES)
2074	{
2075	token->type = OP_OPEN_CHAR_CLASS;
2076	break;
2077	}
2078	FALLTHROUGH;
2079	default:
2080	token->type = CHARACTER;
2081	token->opr.c = c;
2082	token_len = `1`;
2083	break;
2084	}
2085	return token_len;
2086	}
2087	switch (c)
2088	{
2089	case `'-'`:
2090	token->type = OP_CHARSET_RANGE;
2091	break;
2092	case `']'`:
2093	token->type = OP_CLOSE_BRACKET;
2094	break;
2095	case `'^'`:
2096	token->type = OP_NON_MATCH_LIST;
2097	break;
2098	default:
2099	token->type = CHARACTER;
2100	}
2101	return `1`;
2102	}
2103
2104	/ Functions for parser. /
2105
2106	/ Entry point of the parser.*
2107	Parse the regular expression REGEXP and return the structure tree.
2108	If an error occurs, ERR is set by error code, and return NULL.
2109	This function build the following tree, from regular expression <reg_exp>:
2110	CAT
2111	/ \
2112	/ \
2113	<reg_exp> EOR
2114
2115	CAT means concatenation.
2116	EOR means end of regular expression. /*
2117
2118	static bin_tree_t *
2119	parse (re_string_t regexp, regex_t preg, reg_syntax_t syntax,
2120	reg_errcode_t *err)
2121	{
2122	re_dfa_t *dfa = preg->buffer;
2123	bin_tree_t tree, eor, *root;
2124	re_token_t current_token;
2125	dfa->syntax = syntax;
2126	fetch_token (&current_token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2127	tree = parse_reg_exp (regexp, preg, &current_token, syntax, `0`, err);
2128	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2129	return NULL;
2130	eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2131	if (tree != NULL)
2132	root = create_tree (dfa, tree, eor, CONCAT);
2133	else
2134	root = eor;
2135	if (__glibc_unlikely (eor == NULL \|\| root == NULL))
2136	{
2137	*err = REG_ESPACE;
2138	return NULL;
2139	}
2140	return root;
2141	}
2142
2143	/ This function build the following tree, from regular expression*
2144	<branch1>\|<branch2>:
2145	ALT
2146	/ \
2147	/ \
2148	<branch1> <branch2>
2149
2150	ALT means alternative, which represents the operator '\|'. /*
2151
2152	static bin_tree_t *
2153	parse_reg_exp (re_string_t regexp, regex_t preg, re_token_t *token,
2154	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2155	{
2156	re_dfa_t *dfa = preg->buffer;
2157	bin_tree_t tree, branch = NULL;
2158	bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2159	tree = parse_branch (regexp, preg, token, syntax, nest, err);
2160	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2161	return NULL;
2162
2163	while (token->type == OP_ALT)
2164	{
2165	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2166	if (token->type != OP_ALT && token->type != END_OF_RE
2167	&& (nest == `0` \|\| token->type != OP_CLOSE_SUBEXP))
2168	{
2169	bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2170	dfa->completed_bkref_map = initial_bkref_map;
2171	branch = parse_branch (regexp, preg, token, syntax, nest, err);
2172	if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2173	{
2174	if (tree != NULL)
2175	postorder (tree, free_tree, NULL);
2176	return NULL;
2177	}
2178	dfa->completed_bkref_map \|= accumulated_bkref_map;
2179	}
2180	else
2181	branch = NULL;
2182	tree = create_tree (dfa, tree, branch, OP_ALT);
2183	if (__glibc_unlikely (tree == NULL))
2184	{
2185	*err = REG_ESPACE;
2186	return NULL;
2187	}
2188	}
2189	return tree;
2190	}
2191
2192	/ This function build the following tree, from regular expression*
2193	<exp1><exp2>:
2194	CAT
2195	/ \
2196	/ \
2197	<exp1> <exp2>
2198
2199	CAT means concatenation. /*
2200
2201	static bin_tree_t *
2202	parse_branch (re_string_t regexp, regex_t preg, re_token_t *token,
2203	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2204	{
2205	bin_tree_t tree, expr;
2206	re_dfa_t *dfa = preg->buffer;
2207	tree = parse_expression (regexp, preg, token, syntax, nest, err);
2208	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2209	return NULL;
2210
2211	while (token->type != OP_ALT && token->type != END_OF_RE
2212	&& (nest == `0` \|\| token->type != OP_CLOSE_SUBEXP))
2213	{
2214	expr = parse_expression (regexp, preg, token, syntax, nest, err);
2215	if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2216	{
2217	if (tree != NULL)
2218	postorder (tree, free_tree, NULL);
2219	return NULL;
2220	}
2221	if (tree != NULL && expr != NULL)
2222	{
2223	bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2224	if (newtree == NULL)
2225	{
2226	postorder (expr, free_tree, NULL);
2227	postorder (tree, free_tree, NULL);
2228	*err = REG_ESPACE;
2229	return NULL;
2230	}
2231	tree = newtree;
2232	}
2233	else if (tree == NULL)
2234	tree = expr;
2235	/ Otherwise expr == NULL, we don't need to create new tree. /
2236	}
2237	return tree;
2238	}
2239
2240	/ This function build the following tree, from regular expression a:
2241	*
2242	\|
2243	a
2244	*/
2245
2246	static bin_tree_t *
2247	parse_expression (re_string_t regexp, regex_t preg, re_token_t *token,
2248	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2249	{
2250	re_dfa_t *dfa = preg->buffer;
2251	bin_tree_t *tree;
2252	switch (token->type)
2253	{
2254	case CHARACTER:
2255	tree = create_token_tree (dfa, NULL, NULL, token);
2256	if (__glibc_unlikely (tree == NULL))
2257	{
2258	*err = REG_ESPACE;
2259	return NULL;
2260	}
2261	#ifdef RE_ENABLE_I18N
2262	if (dfa->mb_cur_max > `1`)
2263	{
2264	while (!re_string_eoi (regexp)
2265	&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2266	{
2267	bin_tree_t *mbc_remain;
2268	fetch_token (token, regexp, syntax);
2269	mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2270	tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2271	if (__glibc_unlikely (mbc_remain == NULL \|\| tree == NULL))
2272	{
2273	*err = REG_ESPACE;
2274	return NULL;
2275	}
2276	}
2277	}
2278	#endif
2279	break;
2280
2281	case OP_OPEN_SUBEXP:
2282	tree = parse_sub_exp (regexp, preg, token, syntax, nest + `1`, err);
2283	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2284	return NULL;
2285	break;
2286
2287	case OP_OPEN_BRACKET:
2288	tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2289	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2290	return NULL;
2291	break;
2292
2293	case OP_BACK_REF:
2294	if (!__glibc_likely (dfa->completed_bkref_map & (`1` << token->opr.idx)))
2295	{
2296	*err = REG_ESUBREG;
2297	return NULL;
2298	}
2299	dfa->used_bkref_map \|= `1` << token->opr.idx;
2300	tree = create_token_tree (dfa, NULL, NULL, token);
2301	if (__glibc_unlikely (tree == NULL))
2302	{
2303	*err = REG_ESPACE;
2304	return NULL;
2305	}
2306	++dfa->nbackref;
2307	dfa->has_mb_node = `1`;
2308	break;
2309
2310	case OP_OPEN_DUP_NUM:
2311	if (syntax & RE_CONTEXT_INVALID_DUP)
2312	{
2313	*err = REG_BADRPT;
2314	return NULL;
2315	}
2316	FALLTHROUGH;
2317	case OP_DUP_ASTERISK:
2318	case OP_DUP_PLUS:
2319	case OP_DUP_QUESTION:
2320	if (syntax & RE_CONTEXT_INVALID_OPS)
2321	{
2322	*err = REG_BADRPT;
2323	return NULL;
2324	}
2325	else if (syntax & RE_CONTEXT_INDEP_OPS)
2326	{
2327	fetch_token (token, regexp, syntax);
2328	return parse_expression (regexp, preg, token, syntax, nest, err);
2329	}
2330	FALLTHROUGH;
2331	case OP_CLOSE_SUBEXP:
2332	if ((token->type == OP_CLOSE_SUBEXP)
2333	&& !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2334	{
2335	*err = REG_ERPAREN;
2336	return NULL;
2337	}
2338	FALLTHROUGH;
2339	case OP_CLOSE_DUP_NUM:
2340	/ We treat it as a normal character. /
2341
2342	/ Then we can these characters as normal characters. /
2343	token->type = CHARACTER;
2344	/ mb_partial and word_char bits should be initialized already*
2345	by peek_token. /*
2346	tree = create_token_tree (dfa, NULL, NULL, token);
2347	if (__glibc_unlikely (tree == NULL))
2348	{
2349	*err = REG_ESPACE;
2350	return NULL;
2351	}
2352	break;
2353
2354	case ANCHOR:
2355	if ((token->opr.ctx_type
2356	& (WORD_DELIM \| NOT_WORD_DELIM \| WORD_FIRST \| WORD_LAST))
2357	&& dfa->word_ops_used == `0`)
2358	init_word_char (dfa);
2359	if (token->opr.ctx_type == WORD_DELIM
2360	\|\| token->opr.ctx_type == NOT_WORD_DELIM)
2361	{
2362	bin_tree_t tree_first, tree_last;
2363	if (token->opr.ctx_type == WORD_DELIM)
2364	{
2365	token->opr.ctx_type = WORD_FIRST;
2366	tree_first = create_token_tree (dfa, NULL, NULL, token);
2367	token->opr.ctx_type = WORD_LAST;
2368	}
2369	else
2370	{
2371	token->opr.ctx_type = INSIDE_WORD;
2372	tree_first = create_token_tree (dfa, NULL, NULL, token);
2373	token->opr.ctx_type = INSIDE_NOTWORD;
2374	}
2375	tree_last = create_token_tree (dfa, NULL, NULL, token);
2376	tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2377	if (__glibc_unlikely (tree_first == NULL \|\| tree_last == NULL
2378	\|\| tree == NULL))
2379	{
2380	*err = REG_ESPACE;
2381	return NULL;
2382	}
2383	}
2384	else
2385	{
2386	tree = create_token_tree (dfa, NULL, NULL, token);
2387	if (__glibc_unlikely (tree == NULL))
2388	{
2389	*err = REG_ESPACE;
2390	return NULL;
2391	}
2392	}
2393	/ We must return here, since ANCHORs can't be followed*
2394	by repetition operators.
2395	eg. RE"^" is invalid or "<ANCHOR(^)><CHAR()>",
2396	it must not be "<ANCHOR(^)><REPEAT()>". /
2397	fetch_token (token, regexp, syntax);
2398	return tree;
2399
2400	case OP_PERIOD:
2401	tree = create_token_tree (dfa, NULL, NULL, token);
2402	if (__glibc_unlikely (tree == NULL))
2403	{
2404	*err = REG_ESPACE;
2405	return NULL;
2406	}
2407	if (dfa->mb_cur_max > `1`)
2408	dfa->has_mb_node = `1`;
2409	break;
2410
2411	case OP_WORD:
2412	case OP_NOTWORD:
2413	tree = build_charclass_op (dfa, regexp->trans,
2414	"alnum",
2415	"_",
2416	token->type == OP_NOTWORD, err);
2417	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2418	return NULL;
2419	break;
2420
2421	case OP_SPACE:
2422	case OP_NOTSPACE:
2423	tree = build_charclass_op (dfa, regexp->trans,
2424	"space",
2425	"",
2426	token->type == OP_NOTSPACE, err);
2427	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2428	return NULL;
2429	break;
2430
2431	case OP_ALT:
2432	case END_OF_RE:
2433	return NULL;
2434
2435	case BACK_SLASH:
2436	*err = REG_EESCAPE;
2437	return NULL;
2438
2439	default:
2440	/ Must not happen? /
2441	DEBUG_ASSERT (false);
2442	return NULL;
2443	}
2444	fetch_token (token, regexp, syntax);
2445
2446	while (token->type == OP_DUP_ASTERISK \|\| token->type == OP_DUP_PLUS
2447	\|\| token->type == OP_DUP_QUESTION \|\| token->type == OP_OPEN_DUP_NUM)
2448	{
2449	bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2450	syntax, err);
2451	if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2452	{
2453	if (tree != NULL)
2454	postorder (tree, free_tree, NULL);
2455	return NULL;
2456	}
2457	tree = dup_tree;
2458	/ In BRE consecutive duplications are not allowed. /
2459	if ((syntax & RE_CONTEXT_INVALID_DUP)
2460	&& (token->type == OP_DUP_ASTERISK
2461	\|\| token->type == OP_OPEN_DUP_NUM))
2462	{
2463	if (tree != NULL)
2464	postorder (tree, free_tree, NULL);
2465	*err = REG_BADRPT;
2466	return NULL;
2467	}
2468	}
2469
2470	return tree;
2471	}
2472
2473	/ This function build the following tree, from regular expression*
2474	(<reg_exp>):
2475	SUBEXP
2476	\|
2477	<reg_exp>
2478	*/
2479
2480	static bin_tree_t *
2481	parse_sub_exp (re_string_t regexp, regex_t preg, re_token_t *token,
2482	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2483	{
2484	re_dfa_t *dfa = preg->buffer;
2485	bin_tree_t *tree;
2486	size_t cur_nsub;
2487	cur_nsub = preg->re_nsub++;
2488
2489	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2490
2491	/ The subexpression may be a null string. /
2492	if (token->type == OP_CLOSE_SUBEXP)
2493	tree = NULL;
2494	else
2495	{
2496	tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2497	if (__glibc_unlikely (*err == REG_NOERROR
2498	&& token->type != OP_CLOSE_SUBEXP))
2499	{
2500	if (tree != NULL)
2501	postorder (tree, free_tree, NULL);
2502	*err = REG_EPAREN;
2503	}
2504	if (__glibc_unlikely (*err != REG_NOERROR))
2505	return NULL;
2506	}
2507
2508	if (cur_nsub <= `'9'` - `'1'`)
2509	dfa->completed_bkref_map \|= `1` << cur_nsub;
2510
2511	tree = create_tree (dfa, tree, NULL, SUBEXP);
2512	if (__glibc_unlikely (tree == NULL))
2513	{
2514	*err = REG_ESPACE;
2515	return NULL;
2516	}
2517	tree->token.opr.idx = cur_nsub;
2518	return tree;
2519	}
2520
2521	/ This function parse repetition operators like "", "+", "{1,3}" etc. /*
2522
2523	static bin_tree_t *
2524	parse_dup_op (bin_tree_t elem, re_string_t regexp, re_dfa_t *dfa,
2525	re_token_t token, reg_syntax_t syntax, reg_errcode_t err)
2526	{
2527	bin_tree_t tree = NULL, old_tree = NULL;
2528	Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2529	re_token_t start_token = *token;
2530
2531	if (token->type == OP_OPEN_DUP_NUM)
2532	{
2533	end = `0`;
2534	start = fetch_number (regexp, token, syntax);
2535	if (start == -`1`)
2536	{
2537	if (token->type == CHARACTER && token->opr.c == `','`)
2538	start = `0`; / We treat "{,m}" as "{0,m}". /
2539	else
2540	{
2541	err = REG_BADBR; /* <re>{} is invalid. /
2542	return NULL;
2543	}
2544	}
2545	if (__glibc_likely (start != -`2`))
2546	{
2547	/ We treat "{n}" as "{n,n}". /
2548	end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2549	: ((token->type == CHARACTER && token->opr.c == `','`)
2550	? fetch_number (regexp, token, syntax) : -`2`));
2551	}
2552	if (__glibc_unlikely (start == -`2` \|\| end == -`2`))
2553	{
2554	/ Invalid sequence. /
2555	if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2556	{
2557	if (token->type == END_OF_RE)
2558	*err = REG_EBRACE;
2559	else
2560	*err = REG_BADBR;
2561
2562	return NULL;
2563	}
2564
2565	/ If the syntax bit is set, rollback. /
2566	re_string_set_index (regexp, start_idx);
2567	*token = start_token;
2568	token->type = CHARACTER;
2569	/ mb_partial and word_char bits should be already initialized by*
2570	peek_token. /*
2571	return elem;
2572	}
2573
2574	if (__glibc_unlikely ((end != -`1` && start > end)
2575	\|\| token->type != OP_CLOSE_DUP_NUM))
2576	{
2577	/ First number greater than second. /
2578	*err = REG_BADBR;
2579	return NULL;
2580	}
2581
2582	if (__glibc_unlikely (RE_DUP_MAX < (end == -`1` ? start : end)))
2583	{
2584	*err = REG_ESIZE;
2585	return NULL;
2586	}
2587	}
2588	else
2589	{
2590	start = (token->type == OP_DUP_PLUS) ? `1` : `0`;
2591	end = (token->type == OP_DUP_QUESTION) ? `1` : -`1`;
2592	}
2593
2594	fetch_token (token, regexp, syntax);
2595
2596	if (__glibc_unlikely (elem == NULL))
2597	return NULL;
2598	if (__glibc_unlikely (start == `0` && end == `0`))
2599	{
2600	postorder (elem, free_tree, NULL);
2601	return NULL;
2602	}
2603
2604	/ Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". /
2605	if (__glibc_unlikely (start > `0`))
2606	{
2607	tree = elem;
2608	for (i = `2`; i <= start; ++i)
2609	{
2610	elem = duplicate_tree (elem, dfa);
2611	tree = create_tree (dfa, tree, elem, CONCAT);
2612	if (__glibc_unlikely (elem == NULL \|\| tree == NULL))
2613	goto parse_dup_op_espace;
2614	}
2615
2616	if (start == end)
2617	return tree;
2618
2619	/ Duplicate ELEM before it is marked optional. /
2620	elem = duplicate_tree (elem, dfa);
2621	if (__glibc_unlikely (elem == NULL))
2622	goto parse_dup_op_espace;
2623	old_tree = tree;
2624	}
2625	else
2626	old_tree = NULL;
2627
2628	if (elem->token.type == SUBEXP)
2629	{
2630	uintptr_t subidx = elem->token.opr.idx;
2631	postorder (elem, mark_opt_subexp, (void *) subidx);
2632	}
2633
2634	tree = create_tree (dfa, elem, NULL,
2635	(end == -`1` ? OP_DUP_ASTERISK : OP_ALT));
2636	if (__glibc_unlikely (tree == NULL))
2637	goto parse_dup_op_espace;
2638
2639	/ This loop is actually executed only when end != -1,*
2640	to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2641	already created the start+1-th copy. /*
2642	if (TYPE_SIGNED (Idx) \|\| end != -`1`)
2643	for (i = start + `2`; i <= end; ++i)
2644	{
2645	elem = duplicate_tree (elem, dfa);
2646	tree = create_tree (dfa, tree, elem, CONCAT);
2647	if (__glibc_unlikely (elem == NULL \|\| tree == NULL))
2648	goto parse_dup_op_espace;
2649
2650	tree = create_tree (dfa, tree, NULL, OP_ALT);
2651	if (__glibc_unlikely (tree == NULL))
2652	goto parse_dup_op_espace;
2653	}
2654
2655	if (old_tree)
2656	tree = create_tree (dfa, old_tree, tree, CONCAT);
2657
2658	return tree;
2659
2660	parse_dup_op_espace:
2661	*err = REG_ESPACE;
2662	return NULL;
2663	}
2664
2665	/ Size of the names for collating symbol/equivalence_class/character_class.*
2666	I'm not sure, but maybe enough. /*
2667	#define BRACKET_NAME_BUF_SIZE 32
2668
2669	#ifndef _LIBC
2670
2671	# ifdef RE_ENABLE_I18N
2672	/ Convert the byte B to the corresponding wide character. In a*
2673	unibyte locale, treat B as itself. In a multibyte locale, return
2674	WEOF if B is an encoding error. /*
2675	static wint_t
2676	parse_byte (unsigned char b, re_charset_t *mbcset)
2677	{
2678	return mbcset == NULL ? b : __btowc (b);
2679	}
2680	# endif
2681
2682	/ Local function for parse_bracket_exp only used in case of NOT _LIBC.*
2683	Build the range expression which starts from START_ELEM, and ends
2684	at END_ELEM. The result are written to MBCSET and SBCSET.
2685	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2686	mbcset->range_ends, is a pointer argument since we may
2687	update it. /*
2688
2689	static reg_errcode_t
2690	# ifdef RE_ENABLE_I18N
2691	build_range_exp (const reg_syntax_t syntax,
2692	bitset_t sbcset,
2693	re_charset_t *mbcset,
2694	Idx *range_alloc,
2695	const bracket_elem_t *start_elem,
2696	const bracket_elem_t *end_elem)
2697	# else /* not RE_ENABLE_I18N */
2698	build_range_exp (const reg_syntax_t syntax,
2699	bitset_t sbcset,
2700	const bracket_elem_t *start_elem,
2701	const bracket_elem_t *end_elem)
2702	# endif /* not RE_ENABLE_I18N */
2703	{
2704	unsigned int start_ch, end_ch;
2705	/ Equivalence Classes and Character Classes can't be a range start/end. /
2706	if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2707	\|\| start_elem->type == CHAR_CLASS
2708	\|\| end_elem->type == EQUIV_CLASS
2709	\|\| end_elem->type == CHAR_CLASS))
2710	return REG_ERANGE;
2711
2712	/ We can handle no multi character collating elements without libc*
2713	support. /*
2714	if (__glibc_unlikely ((start_elem->type == COLL_SYM
2715	&& strlen ((char *) start_elem->opr.name) > `1`)
2716	\|\| (end_elem->type == COLL_SYM
2717	&& strlen ((char *) end_elem->opr.name) > `1`)))
2718	return REG_ECOLLATE;
2719
2720	# ifdef RE_ENABLE_I18N
2721	{
2722	wchar_t wc;
2723	wint_t start_wc;
2724	wint_t end_wc;
2725
2726	start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2727	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[`0`]
2728	: `0`));
2729	end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2730	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[`0`]
2731	: `0`));
2732	start_wc = ((start_elem->type == SB_CHAR \|\| start_elem->type == COLL_SYM)
2733	? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2734	end_wc = ((end_elem->type == SB_CHAR \|\| end_elem->type == COLL_SYM)
2735	? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2736	if (start_wc == WEOF \|\| end_wc == WEOF)
2737	return REG_ECOLLATE;
2738	else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2739	&& start_wc > end_wc))
2740	return REG_ERANGE;
2741
2742	/ Got valid collation sequence values, add them as a new entry.*
2743	However, for !_LIBC we have no collation elements: if the
2744	character set is single byte, the single byte character set
2745	that we build below suffices. parse_bracket_exp passes
2746	no MBCSET if dfa->mb_cur_max == 1. /*
2747	if (mbcset)
2748	{
2749	/ Check the space of the arrays. /
2750	if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2751	{
2752	/ There is not enough space, need realloc. /
2753	wchar_t new_array_start, new_array_end;
2754	Idx new_nranges;
2755
2756	/ +1 in case of mbcset->nranges is 0. /
2757	new_nranges = `2` * mbcset->nranges + `1`;
2758	/ Use realloc since mbcset->range_starts and mbcset->range_ends*
2759	are NULL if range_alloc == 0. /
2760	new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2761	new_nranges);
2762	new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2763	new_nranges);
2764
2765	if (__glibc_unlikely (new_array_start == NULL
2766	\|\| new_array_end == NULL))
2767	{
2768	re_free (new_array_start);
2769	re_free (new_array_end);
2770	return REG_ESPACE;
2771	}
2772
2773	mbcset->range_starts = new_array_start;
2774	mbcset->range_ends = new_array_end;
2775	*range_alloc = new_nranges;
2776	}
2777
2778	mbcset->range_starts[mbcset->nranges] = start_wc;
2779	mbcset->range_ends[mbcset->nranges++] = end_wc;
2780	}
2781
2782	/ Build the table for single byte characters. /
2783	for (wc = `0`; wc < SBC_MAX; ++wc)
2784	{
2785	if (start_wc <= wc && wc <= end_wc)
2786	bitset_set (sbcset, wc);
2787	}
2788	}
2789	# else /* not RE_ENABLE_I18N */
2790	{
2791	unsigned int ch;
2792	start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2793	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[`0`]
2794	: `0`));
2795	end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2796	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[`0`]
2797	: `0`));
2798	if (start_ch > end_ch)
2799	return REG_ERANGE;
2800	/ Build the table for single byte characters. /
2801	for (ch = `0`; ch < SBC_MAX; ++ch)
2802	if (start_ch <= ch && ch <= end_ch)
2803	bitset_set (sbcset, ch);
2804	}
2805	# endif /* not RE_ENABLE_I18N */
2806	return REG_NOERROR;
2807	}
2808	#endif /* not _LIBC */
2809
2810	#ifndef _LIBC
2811	/ Helper function for parse_bracket_exp only used in case of NOT _LIBC..*
2812	Build the collating element which is represented by NAME.
2813	The result are written to MBCSET and SBCSET.
2814	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2815	pointer argument since we may update it. /*
2816
2817	static reg_errcode_t
2818	# ifdef RE_ENABLE_I18N
2819	build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2820	Idx coll_sym_alloc, const* unsigned char *name)
2821	# else /* not RE_ENABLE_I18N */
2822	build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2823	# endif /* not RE_ENABLE_I18N */
2824	{
2825	size_t name_len = strlen ((const char *) name);
2826	if (__glibc_unlikely (name_len != `1`))
2827	return REG_ECOLLATE;
2828	else
2829	{
2830	bitset_set (sbcset, name[`0`]);
2831	return REG_NOERROR;
2832	}
2833	}
2834	#endif /* not _LIBC */
2835
2836	/ This function parse bracket expression like "[abc]", "[a-c]",*
2837	"[[.a-a.]]" etc. /*
2838
2839	static bin_tree_t *
2840	parse_bracket_exp (re_string_t regexp, re_dfa_t dfa, re_token_t *token,
2841	reg_syntax_t syntax, reg_errcode_t *err)
2842	{
2843	#ifdef _LIBC
2844	const unsigned char *collseqmb;
2845	const char *collseqwc;
2846	uint32_t nrules;
2847	int32_t table_size;
2848	const int32_t *symb_table;
2849	const unsigned char *extra;
2850
2851	/ Local function for parse_bracket_exp used in _LIBC environment.*
2852	Seek the collating symbol entry corresponding to NAME.
2853	Return the index of the symbol in the SYMB_TABLE,
2854	or -1 if not found. /*
2855
2856	auto inline int32_t
2857	__attribute__ ((always_inline))
2858	seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2859	{
2860	int32_t elem;
2861
2862	for (elem = `0`; elem < table_size; elem++)
2863	if (symb_table[`2` * elem] != `0`)
2864	{
2865	int32_t idx = symb_table[`2` * elem + `1`];
2866	/ Skip the name of collating element name. /
2867	idx += `1` + extra[idx];
2868	if (/ Compare the length of the name. /
2869	name_len == extra[idx]
2870	/ Compare the name. /
2871	&& memcmp (name, &extra[idx + `1`], name_len) == `0`)
2872	/ Yep, this is the entry. /
2873	return elem;
2874	}
2875	return -`1`;
2876	}
2877
2878	/ Local function for parse_bracket_exp used in _LIBC environment.*
2879	Look up the collation sequence value of BR_ELEM.
2880	Return the value if succeeded, UINT_MAX otherwise. /*
2881
2882	auto inline unsigned int
2883	__attribute__ ((always_inline))
2884	lookup_collation_sequence_value (bracket_elem_t *br_elem)
2885	{
2886	if (br_elem->type == SB_CHAR)
2887	{
2888	/*
2889	if (MB_CUR_MAX == 1)
2890	*/
2891	if (nrules == `0`)
2892	return collseqmb[br_elem->opr.ch];
2893	else
2894	{
2895	wint_t wc = __btowc (br_elem->opr.ch);
2896	return __collseq_table_lookup (collseqwc, wc);
2897	}
2898	}
2899	else if (br_elem->type == MB_CHAR)
2900	{
2901	if (nrules != `0`)
2902	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2903	}
2904	else if (br_elem->type == COLL_SYM)
2905	{
2906	size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2907	if (nrules != `0`)
2908	{
2909	int32_t elem, idx;
2910	elem = seek_collating_symbol_entry (br_elem->opr.name,
2911	sym_name_len);
2912	if (elem != -`1`)
2913	{
2914	/ We found the entry. /
2915	idx = symb_table[`2` * elem + `1`];
2916	/ Skip the name of collating element name. /
2917	idx += `1` + extra[idx];
2918	/ Skip the byte sequence of the collating element. /
2919	idx += `1` + extra[idx];
2920	/ Adjust for the alignment. /
2921	idx = (idx + `3`) & ~`3`;
2922	/ Skip the multibyte collation sequence value. /
2923	idx += sizeof (unsigned int);
2924	/ Skip the wide char sequence of the collating element. /
2925	idx += sizeof (unsigned int) *
2926	(`1` + (unsigned* int *) (extra + idx));
2927	/ Return the collation sequence value. /
2928	return (unsigned* int *) (extra + idx);
2929	}
2930	else if (sym_name_len == `1`)
2931	{
2932	/ No valid character. Match it as a single byte*
2933	character. /*
2934	return collseqmb[br_elem->opr.name[`0`]];
2935	}
2936	}
2937	else if (sym_name_len == `1`)
2938	return collseqmb[br_elem->opr.name[`0`]];
2939	}
2940	return UINT_MAX;
2941	}
2942
2943	/ Local function for parse_bracket_exp used in _LIBC environment.*
2944	Build the range expression which starts from START_ELEM, and ends
2945	at END_ELEM. The result are written to MBCSET and SBCSET.
2946	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2947	mbcset->range_ends, is a pointer argument since we may
2948	update it. /*
2949
2950	auto inline reg_errcode_t
2951	__attribute__ ((always_inline))
2952	build_range_exp (bitset_t sbcset, re_charset_t mbcset, int* *range_alloc,
2953	bracket_elem_t start_elem, bracket_elem_t end_elem)
2954	{
2955	unsigned int ch;
2956	uint32_t start_collseq;
2957	uint32_t end_collseq;
2958
2959	/ Equivalence Classes and Character Classes can't be a range*
2960	start/end. /*
2961	if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2962	\|\| start_elem->type == CHAR_CLASS
2963	\|\| end_elem->type == EQUIV_CLASS
2964	\|\| end_elem->type == CHAR_CLASS))
2965	return REG_ERANGE;
2966
2967	/ FIXME: Implement rational ranges here, too. /
2968	start_collseq = lookup_collation_sequence_value (start_elem);
2969	end_collseq = lookup_collation_sequence_value (end_elem);
2970	/ Check start/end collation sequence values. /
2971	if (__glibc_unlikely (start_collseq == UINT_MAX
2972	\|\| end_collseq == UINT_MAX))
2973	return REG_ECOLLATE;
2974	if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2975	&& start_collseq > end_collseq))
2976	return REG_ERANGE;
2977
2978	/ Got valid collation sequence values, add them as a new entry.*
2979	However, if we have no collation elements, and the character set
2980	is single byte, the single byte character set that we
2981	build below suffices. /*
2982	if (nrules > `0` \|\| dfa->mb_cur_max > `1`)
2983	{
2984	/ Check the space of the arrays. /
2985	if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2986	{
2987	/ There is not enough space, need realloc. /
2988	uint32_t *new_array_start;
2989	uint32_t *new_array_end;
2990	Idx new_nranges;
2991
2992	/ +1 in case of mbcset->nranges is 0. /
2993	new_nranges = `2` * mbcset->nranges + `1`;
2994	new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2995	new_nranges);
2996	new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2997	new_nranges);
2998
2999	if (__glibc_unlikely (new_array_start == NULL
3000	\|\| new_array_end == NULL))
3001	return REG_ESPACE;
3002
3003	mbcset->range_starts = new_array_start;
3004	mbcset->range_ends = new_array_end;
3005	*range_alloc = new_nranges;
3006	}
3007
3008	mbcset->range_starts[mbcset->nranges] = start_collseq;
3009	mbcset->range_ends[mbcset->nranges++] = end_collseq;
3010	}
3011
3012	/ Build the table for single byte characters. /
3013	for (ch = `0`; ch < SBC_MAX; ch++)
3014	{
3015	uint32_t ch_collseq;
3016	/*
3017	if (MB_CUR_MAX == 1)
3018	*/
3019	if (nrules == `0`)
3020	ch_collseq = collseqmb[ch];
3021	else
3022	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3023	if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3024	bitset_set (sbcset, ch);
3025	}
3026	return REG_NOERROR;
3027	}
3028
3029	/ Local function for parse_bracket_exp used in _LIBC environment.*
3030	Build the collating element which is represented by NAME.
3031	The result are written to MBCSET and SBCSET.
3032	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3033	pointer argument since we may update it. /*
3034
3035	auto inline reg_errcode_t
3036	__attribute__ ((always_inline))
3037	build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3038	Idx coll_sym_alloc, const* unsigned char *name)
3039	{
3040	int32_t elem, idx;
3041	size_t name_len = strlen ((const char *) name);
3042	if (nrules != `0`)
3043	{
3044	elem = seek_collating_symbol_entry (name, name_len);
3045	if (elem != -`1`)
3046	{
3047	/ We found the entry. /
3048	idx = symb_table[`2` * elem + `1`];
3049	/ Skip the name of collating element name. /
3050	idx += `1` + extra[idx];
3051	}
3052	else if (name_len == `1`)
3053	{
3054	/ No valid character, treat it as a normal*
3055	character. /*
3056	bitset_set (sbcset, name[`0`]);
3057	return REG_NOERROR;
3058	}
3059	else
3060	return REG_ECOLLATE;
3061
3062	/ Got valid collation sequence, add it as a new entry. /
3063	/ Check the space of the arrays. /
3064	if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3065	{
3066	/ Not enough, realloc it. /
3067	/ +1 in case of mbcset->ncoll_syms is 0. /
3068	Idx new_coll_sym_alloc = `2` * mbcset->ncoll_syms + `1`;
3069	/ Use realloc since mbcset->coll_syms is NULL*
3070	if alloc == 0. /
3071	int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3072	new_coll_sym_alloc);
3073	if (__glibc_unlikely (new_coll_syms == NULL))
3074	return REG_ESPACE;
3075	mbcset->coll_syms = new_coll_syms;
3076	*coll_sym_alloc = new_coll_sym_alloc;
3077	}
3078	mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3079	return REG_NOERROR;
3080	}
3081	else
3082	{
3083	if (__glibc_unlikely (name_len != `1`))
3084	return REG_ECOLLATE;
3085	else
3086	{
3087	bitset_set (sbcset, name[`0`]);
3088	return REG_NOERROR;
3089	}
3090	}
3091	}
3092	#endif
3093
3094	re_token_t br_token;
3095	re_bitset_ptr_t sbcset;
3096	#ifdef RE_ENABLE_I18N
3097	re_charset_t *mbcset;
3098	Idx coll_sym_alloc = `0`, range_alloc = `0`, mbchar_alloc = `0`;
3099	Idx equiv_class_alloc = `0`, char_class_alloc = `0`;
3100	#endif /* not RE_ENABLE_I18N */
3101	bool non_match = false;
3102	bin_tree_t *work_tree;
3103	int token_len;
3104	bool first_round = true;
3105	#ifdef _LIBC
3106	collseqmb = (const unsigned char *)
3107	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3108	nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3109	if (nrules)
3110	{
3111	/*
3112	if (MB_CUR_MAX > 1)
3113	*/
3114	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3115	table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3116	symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3117	_NL_COLLATE_SYMB_TABLEMB);
3118	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3119	_NL_COLLATE_SYMB_EXTRAMB);
3120	}
3121	#endif
3122	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), `1`);
3123	#ifdef RE_ENABLE_I18N
3124	mbcset = (re_charset_t ) calloc (sizeof* (re_charset_t), `1`);
3125	#endif /* RE_ENABLE_I18N */
3126	#ifdef RE_ENABLE_I18N
3127	if (__glibc_unlikely (sbcset == NULL \|\| mbcset == NULL))
3128	#else
3129	if (__glibc_unlikely (sbcset == NULL))
3130	#endif /* RE_ENABLE_I18N */
3131	{
3132	re_free (sbcset);
3133	#ifdef RE_ENABLE_I18N
3134	re_free (mbcset);
3135	#endif
3136	*err = REG_ESPACE;
3137	return NULL;
3138	}
3139
3140	token_len = peek_token_bracket (token, regexp, syntax);
3141	if (__glibc_unlikely (token->type == END_OF_RE))
3142	{
3143	*err = REG_BADPAT;
3144	goto parse_bracket_exp_free_return;
3145	}
3146	if (token->type == OP_NON_MATCH_LIST)
3147	{
3148	#ifdef RE_ENABLE_I18N
3149	mbcset->non_match = `1`;
3150	#endif /* not RE_ENABLE_I18N */
3151	non_match = true;
3152	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3153	bitset_set (sbcset, `'\n'`);
3154	re_string_skip_bytes (regexp, token_len); / Skip a token. /
3155	token_len = peek_token_bracket (token, regexp, syntax);
3156	if (__glibc_unlikely (token->type == END_OF_RE))
3157	{
3158	*err = REG_BADPAT;
3159	goto parse_bracket_exp_free_return;
3160	}
3161	}
3162
3163	/ We treat the first ']' as a normal character. /
3164	if (token->type == OP_CLOSE_BRACKET)
3165	token->type = CHARACTER;
3166
3167	while (`1`)
3168	{
3169	bracket_elem_t start_elem, end_elem;
3170	unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3171	unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3172	reg_errcode_t ret;
3173	int token_len2 = `0`;
3174	bool is_range_exp = false;
3175	re_token_t token2;
3176
3177	start_elem.opr.name = start_name_buf;
3178	start_elem.type = COLL_SYM;
3179	ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3180	syntax, first_round);
3181	if (__glibc_unlikely (ret != REG_NOERROR))
3182	{
3183	*err = ret;
3184	goto parse_bracket_exp_free_return;
3185	}
3186	first_round = false;
3187
3188	/ Get information about the next token. We need it in any case. /
3189	token_len = peek_token_bracket (token, regexp, syntax);
3190
3191	/ Do not check for ranges if we know they are not allowed. /
3192	if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3193	{
3194	if (__glibc_unlikely (token->type == END_OF_RE))
3195	{
3196	*err = REG_EBRACK;
3197	goto parse_bracket_exp_free_return;
3198	}
3199	if (token->type == OP_CHARSET_RANGE)
3200	{
3201	re_string_skip_bytes (regexp, token_len); / Skip '-'. /
3202	token_len2 = peek_token_bracket (&token2, regexp, syntax);
3203	if (__glibc_unlikely (token2.type == END_OF_RE))
3204	{
3205	*err = REG_EBRACK;
3206	goto parse_bracket_exp_free_return;
3207	}
3208	if (token2.type == OP_CLOSE_BRACKET)
3209	{
3210	/ We treat the last '-' as a normal character. /
3211	re_string_skip_bytes (regexp, -token_len);
3212	token->type = CHARACTER;
3213	}
3214	else
3215	is_range_exp = true;
3216	}
3217	}
3218
3219	if (is_range_exp == true)
3220	{
3221	end_elem.opr.name = end_name_buf;
3222	end_elem.type = COLL_SYM;
3223	ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3224	dfa, syntax, true);
3225	if (__glibc_unlikely (ret != REG_NOERROR))
3226	{
3227	*err = ret;
3228	goto parse_bracket_exp_free_return;
3229	}
3230
3231	token_len = peek_token_bracket (token, regexp, syntax);
3232
3233	#ifdef _LIBC
3234	*err = build_range_exp (sbcset, mbcset, &range_alloc,
3235	&start_elem, &end_elem);
3236	#else
3237	# ifdef RE_ENABLE_I18N
3238	*err = build_range_exp (syntax, sbcset,
3239	dfa->mb_cur_max > `1` ? mbcset : NULL,
3240	&range_alloc, &start_elem, &end_elem);
3241	# else
3242	*err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3243	# endif
3244	#endif /* RE_ENABLE_I18N */
3245	if (__glibc_unlikely (*err != REG_NOERROR))
3246	goto parse_bracket_exp_free_return;
3247	}
3248	else
3249	{
3250	switch (start_elem.type)
3251	{
3252	case SB_CHAR:
3253	bitset_set (sbcset, start_elem.opr.ch);
3254	break;
3255	#ifdef RE_ENABLE_I18N
3256	case MB_CHAR:
3257	/ Check whether the array has enough space. /
3258	if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3259	{
3260	wchar_t *new_mbchars;
3261	/ Not enough, realloc it. /
3262	/ +1 in case of mbcset->nmbchars is 0. /
3263	mbchar_alloc = `2` * mbcset->nmbchars + `1`;
3264	/ Use realloc since array is NULL if alloc == 0. /*
3265	new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3266	mbchar_alloc);
3267	if (__glibc_unlikely (new_mbchars == NULL))
3268	goto parse_bracket_exp_espace;
3269	mbcset->mbchars = new_mbchars;
3270	}
3271	mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3272	break;
3273	#endif /* RE_ENABLE_I18N */
3274	case EQUIV_CLASS:
3275	*err = build_equiv_class (sbcset,
3276	#ifdef RE_ENABLE_I18N
3277	mbcset, &equiv_class_alloc,
3278	#endif /* RE_ENABLE_I18N */
3279	start_elem.opr.name);
3280	if (__glibc_unlikely (*err != REG_NOERROR))
3281	goto parse_bracket_exp_free_return;
3282	break;
3283	case COLL_SYM:
3284	*err = build_collating_symbol (sbcset,
3285	#ifdef RE_ENABLE_I18N
3286	mbcset, &coll_sym_alloc,
3287	#endif /* RE_ENABLE_I18N */
3288	start_elem.opr.name);
3289	if (__glibc_unlikely (*err != REG_NOERROR))
3290	goto parse_bracket_exp_free_return;
3291	break;
3292	case CHAR_CLASS:
3293	*err = build_charclass (regexp->trans, sbcset,
3294	#ifdef RE_ENABLE_I18N
3295	mbcset, &char_class_alloc,
3296	#endif /* RE_ENABLE_I18N */
3297	(const char *) start_elem.opr.name,
3298	syntax);
3299	if (__glibc_unlikely (*err != REG_NOERROR))
3300	goto parse_bracket_exp_free_return;
3301	break;
3302	default:
3303	DEBUG_ASSERT (false);
3304	break;
3305	}
3306	}
3307	if (__glibc_unlikely (token->type == END_OF_RE))
3308	{
3309	*err = REG_EBRACK;
3310	goto parse_bracket_exp_free_return;
3311	}
3312	if (token->type == OP_CLOSE_BRACKET)
3313	break;
3314	}
3315
3316	re_string_skip_bytes (regexp, token_len); / Skip a token. /
3317
3318	/ If it is non-matching list. /
3319	if (non_match)
3320	bitset_not (sbcset);
3321
3322	#ifdef RE_ENABLE_I18N
3323	/ Ensure only single byte characters are set. /
3324	if (dfa->mb_cur_max > `1`)
3325	bitset_mask (sbcset, dfa->sb_char);
3326
3327	if (mbcset->nmbchars \|\| mbcset->ncoll_syms \|\| mbcset->nequiv_classes
3328	\|\| mbcset->nranges \|\| (dfa->mb_cur_max > `1` && (mbcset->nchar_classes
3329	\|\| mbcset->non_match)))
3330	{
3331	bin_tree_t *mbc_tree;
3332	int sbc_idx;
3333	/ Build a tree for complex bracket. /
3334	dfa->has_mb_node = `1`;
3335	br_token.type = COMPLEX_BRACKET;
3336	br_token.opr.mbcset = mbcset;
3337	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3338	if (__glibc_unlikely (mbc_tree == NULL))
3339	goto parse_bracket_exp_espace;
3340	for (sbc_idx = `0`; sbc_idx < BITSET_WORDS; ++sbc_idx)
3341	if (sbcset[sbc_idx])
3342	break;
3343	/ If there are no bits set in sbcset, there is no point*
3344	of having both SIMPLE_BRACKET and COMPLEX_BRACKET. /*
3345	if (sbc_idx < BITSET_WORDS)
3346	{
3347	/ Build a tree for simple bracket. /
3348	br_token.type = SIMPLE_BRACKET;
3349	br_token.opr.sbcset = sbcset;
3350	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3351	if (__glibc_unlikely (work_tree == NULL))
3352	goto parse_bracket_exp_espace;
3353
3354	/ Then join them by ALT node. /
3355	work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3356	if (__glibc_unlikely (work_tree == NULL))
3357	goto parse_bracket_exp_espace;
3358	}
3359	else
3360	{
3361	re_free (sbcset);
3362	work_tree = mbc_tree;
3363	}
3364	}
3365	else
3366	#endif /* not RE_ENABLE_I18N */
3367	{
3368	#ifdef RE_ENABLE_I18N
3369	free_charset (mbcset);
3370	#endif
3371	/ Build a tree for simple bracket. /
3372	br_token.type = SIMPLE_BRACKET;
3373	br_token.opr.sbcset = sbcset;
3374	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3375	if (__glibc_unlikely (work_tree == NULL))
3376	goto parse_bracket_exp_espace;
3377	}
3378	return work_tree;
3379
3380	parse_bracket_exp_espace:
3381	*err = REG_ESPACE;
3382	parse_bracket_exp_free_return:
3383	re_free (sbcset);
3384	#ifdef RE_ENABLE_I18N
3385	free_charset (mbcset);
3386	#endif /* RE_ENABLE_I18N */
3387	return NULL;
3388	}
3389
3390	/ Parse an element in the bracket expression. /
3391
3392	static reg_errcode_t
3393	parse_bracket_element (bracket_elem_t elem, re_string_t regexp,
3394	re_token_t token, int* token_len, re_dfa_t *dfa,
3395	reg_syntax_t syntax, bool accept_hyphen)
3396	{
3397	#ifdef RE_ENABLE_I18N
3398	int cur_char_size;
3399	cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3400	if (cur_char_size > `1`)
3401	{
3402	elem->type = MB_CHAR;
3403	elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3404	re_string_skip_bytes (regexp, cur_char_size);
3405	return REG_NOERROR;
3406	}
3407	#endif /* RE_ENABLE_I18N */
3408	re_string_skip_bytes (regexp, token_len); / Skip a token. /
3409	if (token->type == OP_OPEN_COLL_ELEM \|\| token->type == OP_OPEN_CHAR_CLASS
3410	\|\| token->type == OP_OPEN_EQUIV_CLASS)
3411	return parse_bracket_symbol (elem, regexp, token);
3412	if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3413	{
3414	/ A '-' must only appear as anything but a range indicator before*
3415	the closing bracket. Everything else is an error. /*
3416	re_token_t token2;
3417	(void) peek_token_bracket (&token2, regexp, syntax);
3418	if (token2.type != OP_CLOSE_BRACKET)
3419	/ The actual error value is not standardized since this whole*
3420	case is undefined. But ERANGE makes good sense. /*
3421	return REG_ERANGE;
3422	}
3423	elem->type = SB_CHAR;
3424	elem->opr.ch = token->opr.c;
3425	return REG_NOERROR;
3426	}
3427
3428	/ Parse a bracket symbol in the bracket expression. Bracket symbols are*
3429	such as [:<character_class>:], [.<collating_element>.], and
3430	[=<equivalent_class>=]. /*
3431
3432	static reg_errcode_t
3433	parse_bracket_symbol (bracket_elem_t elem, re_string_t regexp,
3434	re_token_t *token)
3435	{
3436	unsigned char ch, delim = token->opr.c;
3437	int i = `0`;
3438	if (re_string_eoi(regexp))
3439	return REG_EBRACK;
3440	for (;; ++i)
3441	{
3442	if (i >= BRACKET_NAME_BUF_SIZE)
3443	return REG_EBRACK;
3444	if (token->type == OP_OPEN_CHAR_CLASS)
3445	ch = re_string_fetch_byte_case (regexp);
3446	else
3447	ch = re_string_fetch_byte (regexp);
3448	if (re_string_eoi(regexp))
3449	return REG_EBRACK;
3450	if (ch == delim && re_string_peek_byte (regexp, `0`) == `']'`)
3451	break;
3452	elem->opr.name[i] = ch;
3453	}
3454	re_string_skip_bytes (regexp, `1`);
3455	elem->opr.name[i] = `'\0'`;
3456	switch (token->type)
3457	{
3458	case OP_OPEN_COLL_ELEM:
3459	elem->type = COLL_SYM;
3460	break;
3461	case OP_OPEN_EQUIV_CLASS:
3462	elem->type = EQUIV_CLASS;
3463	break;
3464	case OP_OPEN_CHAR_CLASS:
3465	elem->type = CHAR_CLASS;
3466	break;
3467	default:
3468	break;
3469	}
3470	return REG_NOERROR;
3471	}
3472
3473	/ Helper function for parse_bracket_exp.*
3474	Build the equivalence class which is represented by NAME.
3475	The result are written to MBCSET and SBCSET.
3476	EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3477	is a pointer argument since we may update it. /*
3478
3479	static reg_errcode_t
3480	#ifdef RE_ENABLE_I18N
3481	build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3482	Idx equiv_class_alloc, const* unsigned char *name)
3483	#else /* not RE_ENABLE_I18N */
3484	build_equiv_class (bitset_t sbcset, const unsigned char *name)
3485	#endif /* not RE_ENABLE_I18N */
3486	{
3487	#ifdef _LIBC
3488	uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3489	if (nrules != `0`)
3490	{
3491	const int32_t table, indirect;
3492	const unsigned char weights, extra, *cp;
3493	unsigned char char_buf[`2`];
3494	int32_t idx1, idx2;
3495	unsigned int ch;
3496	size_t len;
3497	/ Calculate the index for equivalence class. /
3498	cp = name;
3499	table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3500	weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3501	_NL_COLLATE_WEIGHTMB);
3502	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3503	_NL_COLLATE_EXTRAMB);
3504	indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3505	_NL_COLLATE_INDIRECTMB);
3506	idx1 = findidx (table, indirect, extra, &cp, -`1`);
3507	if (__glibc_unlikely (idx1 == `0` \|\| *cp != `'\0'`))
3508	/ This isn't a valid character. /
3509	return REG_ECOLLATE;
3510
3511	/ Build single byte matching table for this equivalence class. /
3512	len = weights[idx1 & `0xffffff`];
3513	for (ch = `0`; ch < SBC_MAX; ++ch)
3514	{
3515	char_buf[`0`] = ch;
3516	cp = char_buf;
3517	idx2 = findidx (table, indirect, extra, &cp, `1`);
3518	/*
3519	idx2 = table[ch];
3520	*/
3521	if (idx2 == `0`)
3522	/ This isn't a valid character. /
3523	continue;
3524	/ Compare only if the length matches and the collation rule*
3525	index is the same. /*
3526	if (len == weights[idx2 & `0xffffff`] && (idx1 >> `24`) == (idx2 >> `24`)
3527	&& memcmp (weights + (idx1 & `0xffffff`) + `1`,
3528	weights + (idx2 & `0xffffff`) + `1`, len) == `0`)
3529	bitset_set (sbcset, ch);
3530	}
3531	/ Check whether the array has enough space. /
3532	if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3533	{
3534	/ Not enough, realloc it. /
3535	/ +1 in case of mbcset->nequiv_classes is 0. /
3536	Idx new_equiv_class_alloc = `2` * mbcset->nequiv_classes + `1`;
3537	/ Use realloc since the array is NULL if alloc == 0. /*
3538	int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3539	int32_t,
3540	new_equiv_class_alloc);
3541	if (__glibc_unlikely (new_equiv_classes == NULL))
3542	return REG_ESPACE;
3543	mbcset->equiv_classes = new_equiv_classes;
3544	*equiv_class_alloc = new_equiv_class_alloc;
3545	}
3546	mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3547	}
3548	else
3549	#endif /* _LIBC */
3550	{
3551	if (__glibc_unlikely (strlen ((const char *) name) != `1`))
3552	return REG_ECOLLATE;
3553	bitset_set (sbcset, *name);
3554	}
3555	return REG_NOERROR;
3556	}
3557
3558	/ Helper function for parse_bracket_exp.*
3559	Build the character class which is represented by NAME.
3560	The result are written to MBCSET and SBCSET.
3561	CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3562	is a pointer argument since we may update it. /*
3563
3564	static reg_errcode_t
3565	#ifdef RE_ENABLE_I18N
3566	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3567	re_charset_t mbcset, Idx char_class_alloc,
3568	const char *class_name, reg_syntax_t syntax)
3569	#else /* not RE_ENABLE_I18N */
3570	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3571	const char *class_name, reg_syntax_t syntax)
3572	#endif /* not RE_ENABLE_I18N */
3573	{
3574	int i;
3575	const char *name = class_name;
3576
3577	/ In case of REG_ICASE "upper" and "lower" match the both of*
3578	upper and lower cases. /*
3579	if ((syntax & RE_ICASE)
3580	&& (strcmp (name, "upper") == `0` \|\| strcmp (name, "lower") == `0`))
3581	name = "alpha";
3582
3583	#ifdef RE_ENABLE_I18N
3584	/ Check the space of the arrays. /
3585	if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3586	{
3587	/ Not enough, realloc it. /
3588	/ +1 in case of mbcset->nchar_classes is 0. /
3589	Idx new_char_class_alloc = `2` * mbcset->nchar_classes + `1`;
3590	/ Use realloc since array is NULL if alloc == 0. /*
3591	wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3592	new_char_class_alloc);
3593	if (__glibc_unlikely (new_char_classes == NULL))
3594	return REG_ESPACE;
3595	mbcset->char_classes = new_char_classes;
3596	*char_class_alloc = new_char_class_alloc;
3597	}
3598	mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3599	#endif /* RE_ENABLE_I18N */
3600
3601	#define BUILD_CHARCLASS_LOOP(ctype_func) \
3602	do { \
3603	if (__glibc_unlikely (trans != NULL)) \
3604	{ \
3605	for (i = 0; i < SBC_MAX; ++i) \
3606	if (ctype_func (i)) \
3607	bitset_set (sbcset, trans[i]); \
3608	} \
3609	else \
3610	{ \
3611	for (i = 0; i < SBC_MAX; ++i) \
3612	if (ctype_func (i)) \
3613	bitset_set (sbcset, i); \
3614	} \
3615	} while (0)
3616
3617	if (strcmp (name, "alnum") == `0`)
3618	BUILD_CHARCLASS_LOOP (isalnum);
3619	else if (strcmp (name, "cntrl") == `0`)
3620	BUILD_CHARCLASS_LOOP (iscntrl);
3621	else if (strcmp (name, "lower") == `0`)
3622	BUILD_CHARCLASS_LOOP (islower);
3623	else if (strcmp (name, "space") == `0`)
3624	BUILD_CHARCLASS_LOOP (isspace);
3625	else if (strcmp (name, "alpha") == `0`)
3626	BUILD_CHARCLASS_LOOP (isalpha);
3627	else if (strcmp (name, "digit") == `0`)
3628	BUILD_CHARCLASS_LOOP (isdigit);
3629	else if (strcmp (name, "print") == `0`)
3630	BUILD_CHARCLASS_LOOP (isprint);
3631	else if (strcmp (name, "upper") == `0`)
3632	BUILD_CHARCLASS_LOOP (isupper);
3633	else if (strcmp (name, "blank") == `0`)
3634	BUILD_CHARCLASS_LOOP (isblank);
3635	else if (strcmp (name, "graph") == `0`)
3636	BUILD_CHARCLASS_LOOP (isgraph);
3637	else if (strcmp (name, "punct") == `0`)
3638	BUILD_CHARCLASS_LOOP (ispunct);
3639	else if (strcmp (name, "xdigit") == `0`)
3640	BUILD_CHARCLASS_LOOP (isxdigit);
3641	else
3642	return REG_ECTYPE;
3643
3644	return REG_NOERROR;
3645	}
3646
3647	static bin_tree_t *
3648	build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3649	const char *class_name,
3650	const char *extra, bool non_match,
3651	reg_errcode_t *err)
3652	{
3653	re_bitset_ptr_t sbcset;
3654	#ifdef RE_ENABLE_I18N
3655	re_charset_t *mbcset;
3656	Idx alloc = `0`;
3657	#endif /* not RE_ENABLE_I18N */
3658	reg_errcode_t ret;
3659	bin_tree_t *tree;
3660
3661	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), `1`);
3662	if (__glibc_unlikely (sbcset == NULL))
3663	{
3664	*err = REG_ESPACE;
3665	return NULL;
3666	}
3667	#ifdef RE_ENABLE_I18N
3668	mbcset = (re_charset_t ) calloc (sizeof* (re_charset_t), `1`);
3669	if (__glibc_unlikely (mbcset == NULL))
3670	{
3671	re_free (sbcset);
3672	*err = REG_ESPACE;
3673	return NULL;
3674	}
3675	mbcset->non_match = non_match;
3676	#endif /* RE_ENABLE_I18N */
3677
3678	/ We don't care the syntax in this case. /
3679	ret = build_charclass (trans, sbcset,
3680	#ifdef RE_ENABLE_I18N
3681	mbcset, &alloc,
3682	#endif /* RE_ENABLE_I18N */
3683	class_name, `0`);
3684
3685	if (__glibc_unlikely (ret != REG_NOERROR))
3686	{
3687	re_free (sbcset);
3688	#ifdef RE_ENABLE_I18N
3689	free_charset (mbcset);
3690	#endif /* RE_ENABLE_I18N */
3691	*err = ret;
3692	return NULL;
3693	}
3694	/ \w match '_' also. /
3695	for (; *extra; extra++)
3696	bitset_set (sbcset, *extra);
3697
3698	/ If it is non-matching list. /
3699	if (non_match)
3700	bitset_not (sbcset);
3701
3702	#ifdef RE_ENABLE_I18N
3703	/ Ensure only single byte characters are set. /
3704	if (dfa->mb_cur_max > `1`)
3705	bitset_mask (sbcset, dfa->sb_char);
3706	#endif
3707
3708	/ Build a tree for simple bracket. /
3709	re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3710	tree = create_token_tree (dfa, NULL, NULL, &br_token);
3711	if (__glibc_unlikely (tree == NULL))
3712	goto build_word_op_espace;
3713
3714	#ifdef RE_ENABLE_I18N
3715	if (dfa->mb_cur_max > `1`)
3716	{
3717	bin_tree_t *mbc_tree;
3718	/ Build a tree for complex bracket. /
3719	br_token.type = COMPLEX_BRACKET;
3720	br_token.opr.mbcset = mbcset;
3721	dfa->has_mb_node = `1`;
3722	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3723	if (__glibc_unlikely (mbc_tree == NULL))
3724	goto build_word_op_espace;
3725	/ Then join them by ALT node. /
3726	tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3727	if (__glibc_likely (mbc_tree != NULL))
3728	return tree;
3729	}
3730	else
3731	{
3732	free_charset (mbcset);
3733	return tree;
3734	}
3735	#else /* not RE_ENABLE_I18N */
3736	return tree;
3737	#endif /* not RE_ENABLE_I18N */
3738
3739	build_word_op_espace:
3740	re_free (sbcset);
3741	#ifdef RE_ENABLE_I18N
3742	free_charset (mbcset);
3743	#endif /* RE_ENABLE_I18N */
3744	*err = REG_ESPACE;
3745	return NULL;
3746	}
3747
3748	/ This is intended for the expressions like "a{1,3}".*
3749	Fetch a number from 'input', and return the number.
3750	Return -1 if the number field is empty like "{,1}".
3751	Return RE_DUP_MAX + 1 if the number field is too large.
3752	Return -2 if an error occurred. /*
3753
3754	static Idx
3755	fetch_number (re_string_t input, re_token_t token, reg_syntax_t syntax)
3756	{
3757	Idx num = -`1`;
3758	unsigned char c;
3759	while (`1`)
3760	{
3761	fetch_token (token, input, syntax);
3762	c = token->opr.c;
3763	if (__glibc_unlikely (token->type == END_OF_RE))
3764	return -`2`;
3765	if (token->type == OP_CLOSE_DUP_NUM \|\| c == `','`)
3766	break;
3767	num = ((token->type != CHARACTER \|\| c < `'0'` \|\| `'9'` < c \|\| num == -`2`)
3768	? -`2`
3769	: num == -`1`
3770	? c - `'0'`
3771	: MIN (RE_DUP_MAX + `1`, num * `10` + c - `'0'`));
3772	}
3773	return num;
3774	}
3775
3776	#ifdef RE_ENABLE_I18N
3777	static void
3778	free_charset (re_charset_t *cset)
3779	{
3780	re_free (cset->mbchars);
3781	# ifdef _LIBC
3782	re_free (cset->coll_syms);
3783	re_free (cset->equiv_classes);
3784	# endif
3785	re_free (cset->range_starts);
3786	re_free (cset->range_ends);
3787	re_free (cset->char_classes);
3788	re_free (cset);
3789	}
3790	#endif /* RE_ENABLE_I18N */
3791
3792	/ Functions for binary tree operation. /
3793
3794	/ Create a tree node. /
3795
3796	static bin_tree_t *
3797	create_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
3798	re_token_type_t type)
3799	{
3800	re_token_t t = { .type = type };
3801	return create_token_tree (dfa, left, right, &t);
3802	}
3803
3804	static bin_tree_t *
3805	create_token_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
3806	const re_token_t *token)
3807	{
3808	bin_tree_t *tree;
3809	if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3810	{
3811	bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, `1`);
3812
3813	if (storage == NULL)
3814	return NULL;
3815	storage->next = dfa->str_tree_storage;
3816	dfa->str_tree_storage = storage;
3817	dfa->str_tree_storage_idx = `0`;
3818	}
3819	tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3820
3821	tree->parent = NULL;
3822	tree->left = left;
3823	tree->right = right;
3824	tree->token = *token;
3825	tree->token.duplicated = `0`;
3826	tree->token.opt_subexp = `0`;
3827	tree->first = NULL;
3828	tree->next = NULL;
3829	tree->node_idx = -`1`;
3830
3831	if (left != NULL)
3832	left->parent = tree;
3833	if (right != NULL)
3834	right->parent = tree;
3835	return tree;
3836	}
3837
3838	/ Mark the tree SRC as an optional subexpression.*
3839	To be called from preorder or postorder. /*
3840
3841	static reg_errcode_t
3842	mark_opt_subexp (void extra, bin_tree_t node)
3843	{
3844	Idx idx = (uintptr_t) extra;
3845	if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3846	node->token.opt_subexp = `1`;
3847
3848	return REG_NOERROR;
3849	}
3850
3851	/ Free the allocated memory inside NODE. /
3852
3853	static void
3854	free_token (re_token_t *node)
3855	{
3856	#ifdef RE_ENABLE_I18N
3857	if (node->type == COMPLEX_BRACKET && node->duplicated == `0`)
3858	free_charset (node->opr.mbcset);
3859	else
3860	#endif /* RE_ENABLE_I18N */
3861	if (node->type == SIMPLE_BRACKET && node->duplicated == `0`)
3862	re_free (node->opr.sbcset);
3863	}
3864
3865	/ Worker function for tree walking. Free the allocated memory inside NODE*
3866	and its children. /*
3867
3868	static reg_errcode_t
3869	free_tree (void extra, bin_tree_t node)
3870	{
3871	free_token (&node->token);
3872	return REG_NOERROR;
3873	}
3874
3875
3876	/ Duplicate the node SRC, and return new node. This is a preorder*
3877	visit similar to the one implemented by the generic visitor, but
3878	we need more infrastructure to maintain two parallel trees --- so,
3879	it's easier to duplicate. /*
3880
3881	static bin_tree_t *
3882	duplicate_tree (const bin_tree_t root, re_dfa_t dfa)
3883	{
3884	const bin_tree_t *node;
3885	bin_tree_t *dup_root;
3886	bin_tree_t *p_new = &dup_root, dup_node = root->parent;
3887
3888	for (node = root; ; )
3889	{
3890	/ Create a new tree and link it back to the current parent. /
3891	*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3892	if (*p_new == NULL)
3893	return NULL;
3894	(*p_new)->parent = dup_node;
3895	(*p_new)->token.duplicated = `1`;
3896	dup_node = *p_new;
3897
3898	/ Go to the left node, or up and to the right. /
3899	if (node->left)
3900	{
3901	node = node->left;
3902	p_new = &dup_node->left;
3903	}
3904	else
3905	{
3906	const bin_tree_t *prev = NULL;
3907	while (node->right == prev \|\| node->right == NULL)
3908	{
3909	prev = node;
3910	node = node->parent;
3911	dup_node = dup_node->parent;
3912	if (!node)
3913	return dup_root;
3914	}
3915	node = node->right;
3916	p_new = &dup_node->right;
3917	}
3918	}
3919	}
3920

Browse the source code of Glibc/posix/regcomp.c