regex_internal.c source code [Glibc/posix/regex_internal.c]

1	/ Extended regular expression matching and search library.*
2	Copyright (C) 2002-2020 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, see
18	<https://www.gnu.org/licenses/>. /*
19
20	static void re_string_construct_common (const char *str, Idx len,
21	re_string_t *pstr,
22	RE_TRANSLATE_TYPE trans, bool icase,
23	const re_dfa_t *dfa);
24	static re_dfastate_t create_ci_newstate (const* re_dfa_t *dfa,
25	const re_node_set *nodes,
26	re_hashval_t hash);
27	static re_dfastate_t create_cd_newstate (const* re_dfa_t *dfa,
28	const re_node_set *nodes,
29	unsigned int context,
30	re_hashval_t hash);
31	static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32	Idx new_buf_len);
33	#ifdef RE_ENABLE_I18N
34	static void build_wcs_buffer (re_string_t *pstr);
35	static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
36	#endif /* RE_ENABLE_I18N */
37	static void build_upper_buffer (re_string_t *pstr);
38	static void re_string_translate_buffer (re_string_t *pstr);
39	static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
40	int eflags) __attribute__ ((pure));
41
42	/ Functions for string operation. /
43
44	/ This function allocate the buffers. It is necessary to call*
45	re_string_reconstruct before using the object. /*
46
47	static reg_errcode_t
48	__attribute_warn_unused_result__
49	re_string_allocate (re_string_t pstr, const* char *str, Idx len, Idx init_len,
50	RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
51	{
52	reg_errcode_t ret;
53	Idx init_buf_len;
54
55	/ Ensure at least one character fits into the buffers. /
56	if (init_len < dfa->mb_cur_max)
57	init_len = dfa->mb_cur_max;
58	init_buf_len = (len + `1` < init_len) ? len + `1`: init_len;
59	re_string_construct_common (str, len, pstr, trans, icase, dfa);
60
61	ret = re_string_realloc_buffers (pstr, init_buf_len);
62	if (__glibc_unlikely (ret != REG_NOERROR))
63	return ret;
64
65	pstr->word_char = dfa->word_char;
66	pstr->word_ops_used = dfa->word_ops_used;
67	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
68	pstr->valid_len = (pstr->mbs_allocated \|\| dfa->mb_cur_max > `1`) ? `0` : len;
69	pstr->valid_raw_len = pstr->valid_len;
70	return REG_NOERROR;
71	}
72
73	/ This function allocate the buffers, and initialize them. /
74
75	static reg_errcode_t
76	__attribute_warn_unused_result__
77	re_string_construct (re_string_t pstr, const* char *str, Idx len,
78	RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
79	{
80	reg_errcode_t ret;
81	memset (pstr, `'\0'`, sizeof (re_string_t));
82	re_string_construct_common (str, len, pstr, trans, icase, dfa);
83
84	if (len > `0`)
85	{
86	ret = re_string_realloc_buffers (pstr, len + `1`);
87	if (__glibc_unlikely (ret != REG_NOERROR))
88	return ret;
89	}
90	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
91
92	if (icase)
93	{
94	#ifdef RE_ENABLE_I18N
95	if (dfa->mb_cur_max > `1`)
96	{
97	while (`1`)
98	{
99	ret = build_wcs_upper_buffer (pstr);
100	if (__glibc_unlikely (ret != REG_NOERROR))
101	return ret;
102	if (pstr->valid_raw_len >= len)
103	break;
104	if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
105	break;
106	ret = re_string_realloc_buffers (pstr, pstr->bufs_len * `2`);
107	if (__glibc_unlikely (ret != REG_NOERROR))
108	return ret;
109	}
110	}
111	else
112	#endif /* RE_ENABLE_I18N */
113	build_upper_buffer (pstr);
114	}
115	else
116	{
117	#ifdef RE_ENABLE_I18N
118	if (dfa->mb_cur_max > `1`)
119	build_wcs_buffer (pstr);
120	else
121	#endif /* RE_ENABLE_I18N */
122	{
123	if (trans != NULL)
124	re_string_translate_buffer (pstr);
125	else
126	{
127	pstr->valid_len = pstr->bufs_len;
128	pstr->valid_raw_len = pstr->bufs_len;
129	}
130	}
131	}
132
133	return REG_NOERROR;
134	}
135
136	/ Helper functions for re_string_allocate, and re_string_construct. /
137
138	static reg_errcode_t
139	__attribute_warn_unused_result__
140	re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
141	{
142	#ifdef RE_ENABLE_I18N
143	if (pstr->mb_cur_max > `1`)
144	{
145	wint_t *new_wcs;
146
147	/ Avoid overflow in realloc. /
148	const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
149	if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
150	< new_buf_len))
151	return REG_ESPACE;
152
153	new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
154	if (__glibc_unlikely (new_wcs == NULL))
155	return REG_ESPACE;
156	pstr->wcs = new_wcs;
157	if (pstr->offsets != NULL)
158	{
159	Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
160	if (__glibc_unlikely (new_offsets == NULL))
161	return REG_ESPACE;
162	pstr->offsets = new_offsets;
163	}
164	}
165	#endif /* RE_ENABLE_I18N */
166	if (pstr->mbs_allocated)
167	{
168	unsigned char new_mbs = re_realloc (pstr->mbs, unsigned* char,
169	new_buf_len);
170	if (__glibc_unlikely (new_mbs == NULL))
171	return REG_ESPACE;
172	pstr->mbs = new_mbs;
173	}
174	pstr->bufs_len = new_buf_len;
175	return REG_NOERROR;
176	}
177
178
179	static void
180	re_string_construct_common (const char str, Idx len, re_string_t pstr,
181	RE_TRANSLATE_TYPE trans, bool icase,
182	const re_dfa_t *dfa)
183	{
184	pstr->raw_mbs = (const unsigned char *) str;
185	pstr->len = len;
186	pstr->raw_len = len;
187	pstr->trans = trans;
188	pstr->icase = icase;
189	pstr->mbs_allocated = (trans != NULL \|\| icase);
190	pstr->mb_cur_max = dfa->mb_cur_max;
191	pstr->is_utf8 = dfa->is_utf8;
192	pstr->map_notascii = dfa->map_notascii;
193	pstr->stop = pstr->len;
194	pstr->raw_stop = pstr->stop;
195	}
196
197	#ifdef RE_ENABLE_I18N
198
199	/ Build wide character buffer PSTR->WCS.*
200	If the byte sequence of the string are:
201	<mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
202	Then wide character buffer will be:
203	<wc1> , WEOF , <wc2> , WEOF , <wc3>
204	We use WEOF for padding, they indicate that the position isn't
205	a first byte of a multibyte character.
206
207	Note that this function assumes PSTR->VALID_LEN elements are already
208	built and starts from PSTR->VALID_LEN. /*
209
210	static void
211	build_wcs_buffer (re_string_t *pstr)
212	{
213	#ifdef _LIBC
214	unsigned char buf[MB_LEN_MAX];
215	DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
216	#else
217	unsigned char buf[`64`];
218	#endif
219	mbstate_t prev_st;
220	Idx byte_idx, end_idx, remain_len;
221	size_t mbclen;
222
223	/ Build the buffers from pstr->valid_len to either pstr->len or*
224	pstr->bufs_len. /*
225	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
226	for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
227	{
228	wchar_t wc;
229	const char *p;
230
231	remain_len = end_idx - byte_idx;
232	prev_st = pstr->cur_state;
233	/ Apply the translation if we need. /
234	if (__glibc_unlikely (pstr->trans != NULL))
235	{
236	int i, ch;
237
238	for (i = `0`; i < pstr->mb_cur_max && i < remain_len; ++i)
239	{
240	ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
241	buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
242	}
243	p = (const char *) buf;
244	}
245	else
246	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
247	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
248	if (__glibc_unlikely (mbclen == (size_t) -`1` \|\| mbclen == `0`
249	\|\| (mbclen == (size_t) -`2`
250	&& pstr->bufs_len >= pstr->len)))
251	{
252	/ We treat these cases as a singlebyte character. /
253	mbclen = `1`;
254	wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
255	if (__glibc_unlikely (pstr->trans != NULL))
256	wc = pstr->trans[wc];
257	pstr->cur_state = prev_st;
258	}
259	else if (__glibc_unlikely (mbclen == (size_t) -`2`))
260	{
261	/ The buffer doesn't have enough space, finish to build. /
262	pstr->cur_state = prev_st;
263	break;
264	}
265
266	/ Write wide character and padding. /
267	pstr->wcs[byte_idx++] = wc;
268	/ Write paddings. /
269	for (remain_len = byte_idx + mbclen - `1`; byte_idx < remain_len ;)
270	pstr->wcs[byte_idx++] = WEOF;
271	}
272	pstr->valid_len = byte_idx;
273	pstr->valid_raw_len = byte_idx;
274	}
275
276	/ Build wide character buffer PSTR->WCS like build_wcs_buffer,*
277	but for REG_ICASE. /*
278
279	static reg_errcode_t
280	__attribute_warn_unused_result__
281	build_wcs_upper_buffer (re_string_t *pstr)
282	{
283	mbstate_t prev_st;
284	Idx src_idx, byte_idx, end_idx, remain_len;
285	size_t mbclen;
286	#ifdef _LIBC
287	char buf[MB_LEN_MAX];
288	DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
289	#else
290	char buf[`64`];
291	#endif
292
293	byte_idx = pstr->valid_len;
294	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
295
296	/ The following optimization assumes that ASCII characters can be*
297	mapped to wide characters with a simple cast. /*
298	if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
299	{
300	while (byte_idx < end_idx)
301	{
302	wchar_t wc;
303
304	if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
305	&& mbsinit (&pstr->cur_state))
306	{
307	/ In case of a singlebyte character. /
308	pstr->mbs[byte_idx]
309	= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
310	/ The next step uses the assumption that wchar_t is encoded*
311	ASCII-safe: all ASCII values can be converted like this. /*
312	pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
313	++byte_idx;
314	continue;
315	}
316
317	remain_len = end_idx - byte_idx;
318	prev_st = pstr->cur_state;
319	mbclen = __mbrtowc (&wc,
320	((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
321	+ byte_idx), remain_len, &pstr->cur_state);
322	if (__glibc_likely (`0` < mbclen && mbclen < (size_t) -`2`))
323	{
324	wchar_t wcu = __towupper (wc);
325	if (wcu != wc)
326	{
327	size_t mbcdlen;
328
329	mbcdlen = __wcrtomb (buf, wcu, &prev_st);
330	if (__glibc_likely (mbclen == mbcdlen))
331	memcpy (pstr->mbs + byte_idx, buf, mbclen);
332	else
333	{
334	src_idx = byte_idx;
335	goto offsets_needed;
336	}
337	}
338	else
339	memcpy (pstr->mbs + byte_idx,
340	pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
341	pstr->wcs[byte_idx++] = wcu;
342	/ Write paddings. /
343	for (remain_len = byte_idx + mbclen - `1`; byte_idx < remain_len ;)
344	pstr->wcs[byte_idx++] = WEOF;
345	}
346	else if (mbclen == (size_t) -`1` \|\| mbclen == `0`
347	\|\| (mbclen == (size_t) -`2` && pstr->bufs_len >= pstr->len))
348	{
349	/ It is an invalid character, an incomplete character*
350	at the end of the string, or '\0'. Just use the byte. /*
351	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
352	pstr->mbs[byte_idx] = ch;
353	/ And also cast it to wide char. /
354	pstr->wcs[byte_idx++] = (wchar_t) ch;
355	if (__glibc_unlikely (mbclen == (size_t) -`1`))
356	pstr->cur_state = prev_st;
357	}
358	else
359	{
360	/ The buffer doesn't have enough space, finish to build. /
361	pstr->cur_state = prev_st;
362	break;
363	}
364	}
365	pstr->valid_len = byte_idx;
366	pstr->valid_raw_len = byte_idx;
367	return REG_NOERROR;
368	}
369	else
370	for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
371	{
372	wchar_t wc;
373	const char *p;
374	offsets_needed:
375	remain_len = end_idx - byte_idx;
376	prev_st = pstr->cur_state;
377	if (__glibc_unlikely (pstr->trans != NULL))
378	{
379	int i, ch;
380
381	for (i = `0`; i < pstr->mb_cur_max && i < remain_len; ++i)
382	{
383	ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
384	buf[i] = pstr->trans[ch];
385	}
386	p = (const char *) buf;
387	}
388	else
389	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
390	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
391	if (__glibc_likely (`0` < mbclen && mbclen < (size_t) -`2`))
392	{
393	wchar_t wcu = __towupper (wc);
394	if (wcu != wc)
395	{
396	size_t mbcdlen;
397
398	mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
399	if (__glibc_likely (mbclen == mbcdlen))
400	memcpy (pstr->mbs + byte_idx, buf, mbclen);
401	else if (mbcdlen != (size_t) -`1`)
402	{
403	size_t i;
404
405	if (byte_idx + mbcdlen > pstr->bufs_len)
406	{
407	pstr->cur_state = prev_st;
408	break;
409	}
410
411	if (pstr->offsets == NULL)
412	{
413	pstr->offsets = re_malloc (Idx, pstr->bufs_len);
414
415	if (pstr->offsets == NULL)
416	return REG_ESPACE;
417	}
418	if (!pstr->offsets_needed)
419	{
420	for (i = `0`; i < (size_t) byte_idx; ++i)
421	pstr->offsets[i] = i;
422	pstr->offsets_needed = `1`;
423	}
424
425	memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
426	pstr->wcs[byte_idx] = wcu;
427	pstr->offsets[byte_idx] = src_idx;
428	for (i = `1`; i < mbcdlen; ++i)
429	{
430	pstr->offsets[byte_idx + i]
431	= src_idx + (i < mbclen ? i : mbclen - `1`);
432	pstr->wcs[byte_idx + i] = WEOF;
433	}
434	pstr->len += mbcdlen - mbclen;
435	if (pstr->raw_stop > src_idx)
436	pstr->stop += mbcdlen - mbclen;
437	end_idx = (pstr->bufs_len > pstr->len)
438	? pstr->len : pstr->bufs_len;
439	byte_idx += mbcdlen;
440	src_idx += mbclen;
441	continue;
442	}
443	else
444	memcpy (pstr->mbs + byte_idx, p, mbclen);
445	}
446	else
447	memcpy (pstr->mbs + byte_idx, p, mbclen);
448
449	if (__glibc_unlikely (pstr->offsets_needed != `0`))
450	{
451	size_t i;
452	for (i = `0`; i < mbclen; ++i)
453	pstr->offsets[byte_idx + i] = src_idx + i;
454	}
455	src_idx += mbclen;
456
457	pstr->wcs[byte_idx++] = wcu;
458	/ Write paddings. /
459	for (remain_len = byte_idx + mbclen - `1`; byte_idx < remain_len ;)
460	pstr->wcs[byte_idx++] = WEOF;
461	}
462	else if (mbclen == (size_t) -`1` \|\| mbclen == `0`
463	\|\| (mbclen == (size_t) -`2` && pstr->bufs_len >= pstr->len))
464	{
465	/ It is an invalid character or '\0'. Just use the byte. /
466	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
467
468	if (__glibc_unlikely (pstr->trans != NULL))
469	ch = pstr->trans [ch];
470	pstr->mbs[byte_idx] = ch;
471
472	if (__glibc_unlikely (pstr->offsets_needed != `0`))
473	pstr->offsets[byte_idx] = src_idx;
474	++src_idx;
475
476	/ And also cast it to wide char. /
477	pstr->wcs[byte_idx++] = (wchar_t) ch;
478	if (__glibc_unlikely (mbclen == (size_t) -`1`))
479	pstr->cur_state = prev_st;
480	}
481	else
482	{
483	/ The buffer doesn't have enough space, finish to build. /
484	pstr->cur_state = prev_st;
485	break;
486	}
487	}
488	pstr->valid_len = byte_idx;
489	pstr->valid_raw_len = src_idx;
490	return REG_NOERROR;
491	}
492
493	/ Skip characters until the index becomes greater than NEW_RAW_IDX.*
494	Return the index. /*
495
496	static Idx
497	re_string_skip_chars (re_string_t pstr, Idx new_raw_idx, wint_t last_wc)
498	{
499	mbstate_t prev_st;
500	Idx rawbuf_idx;
501	size_t mbclen;
502	wint_t wc = WEOF;
503
504	/ Skip the characters which are not necessary to check. /
505	for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
506	rawbuf_idx < new_raw_idx;)
507	{
508	wchar_t wc2;
509	Idx remain_len = pstr->raw_len - rawbuf_idx;
510	prev_st = pstr->cur_state;
511	mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
512	remain_len, &pstr->cur_state);
513	if (__glibc_unlikely (mbclen == (size_t) -`2` \|\| mbclen == (size_t) -`1`
514	\|\| mbclen == `0`))
515	{
516	/ We treat these cases as a single byte character. /
517	if (mbclen == `0` \|\| remain_len == `0`)
518	wc = L`'\0'`;
519	else
520	wc = (unsigned* char *) (pstr->raw_mbs + rawbuf_idx);
521	mbclen = `1`;
522	pstr->cur_state = prev_st;
523	}
524	else
525	wc = wc2;
526	/ Then proceed the next character. /
527	rawbuf_idx += mbclen;
528	}
529	*last_wc = wc;
530	return rawbuf_idx;
531	}
532	#endif /* RE_ENABLE_I18N */
533
534	/ Build the buffer PSTR->MBS, and apply the translation if we need.*
535	This function is used in case of REG_ICASE. /*
536
537	static void
538	build_upper_buffer (re_string_t *pstr)
539	{
540	Idx char_idx, end_idx;
541	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
542
543	for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
544	{
545	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
546	if (__glibc_unlikely (pstr->trans != NULL))
547	ch = pstr->trans[ch];
548	pstr->mbs[char_idx] = toupper (ch);
549	}
550	pstr->valid_len = char_idx;
551	pstr->valid_raw_len = char_idx;
552	}
553
554	/ Apply TRANS to the buffer in PSTR. /
555
556	static void
557	re_string_translate_buffer (re_string_t *pstr)
558	{
559	Idx buf_idx, end_idx;
560	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
561
562	for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
563	{
564	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
565	pstr->mbs[buf_idx] = pstr->trans[ch];
566	}
567
568	pstr->valid_len = buf_idx;
569	pstr->valid_raw_len = buf_idx;
570	}
571
572	/ This function re-construct the buffers.*
573	Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
574	convert to upper case in case of REG_ICASE, apply translation. /*
575
576	static reg_errcode_t
577	__attribute_warn_unused_result__
578	re_string_reconstruct (re_string_t pstr, Idx idx, int* eflags)
579	{
580	Idx offset;
581
582	if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
583	offset = idx - pstr->raw_mbs_idx;
584	else
585	{
586	/ Reset buffer. /
587	#ifdef RE_ENABLE_I18N
588	if (pstr->mb_cur_max > `1`)
589	memset (&pstr->cur_state, `'\0'`, sizeof (mbstate_t));
590	#endif /* RE_ENABLE_I18N */
591	pstr->len = pstr->raw_len;
592	pstr->stop = pstr->raw_stop;
593	pstr->valid_len = `0`;
594	pstr->raw_mbs_idx = `0`;
595	pstr->valid_raw_len = `0`;
596	pstr->offsets_needed = `0`;
597	pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
598	: CONTEXT_NEWLINE \| CONTEXT_BEGBUF);
599	if (!pstr->mbs_allocated)
600	pstr->mbs = (unsigned char *) pstr->raw_mbs;
601	offset = idx;
602	}
603
604	if (__glibc_likely (offset != `0`))
605	{
606	/ Should the already checked characters be kept? /
607	if (__glibc_likely (offset < pstr->valid_raw_len))
608	{
609	/ Yes, move them to the front of the buffer. /
610	#ifdef RE_ENABLE_I18N
611	if (__glibc_unlikely (pstr->offsets_needed))
612	{
613	Idx low = `0`, high = pstr->valid_len, mid;
614	do
615	{
616	mid = (high + low) / `2`;
617	if (pstr->offsets[mid] > offset)
618	high = mid;
619	else if (pstr->offsets[mid] < offset)
620	low = mid + `1`;
621	else
622	break;
623	}
624	while (low < high);
625	if (pstr->offsets[mid] < offset)
626	++mid;
627	pstr->tip_context = re_string_context_at (pstr, mid - `1`,
628	eflags);
629	/ This can be quite complicated, so handle specially*
630	only the common and easy case where the character with
631	different length representation of lower and upper
632	case is present at or after offset. /*
633	if (pstr->valid_len > offset
634	&& mid == offset && pstr->offsets[mid] == offset)
635	{
636	memmove (pstr->wcs, pstr->wcs + offset,
637	(pstr->valid_len - offset) * sizeof (wint_t));
638	memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
639	pstr->valid_len -= offset;
640	pstr->valid_raw_len -= offset;
641	for (low = `0`; low < pstr->valid_len; low++)
642	pstr->offsets[low] = pstr->offsets[low + offset] - offset;
643	}
644	else
645	{
646	/ Otherwise, just find out how long the partial multibyte*
647	character at offset is and fill it with WEOF/255. /*
648	pstr->len = pstr->raw_len - idx + offset;
649	pstr->stop = pstr->raw_stop - idx + offset;
650	pstr->offsets_needed = `0`;
651	while (mid > `0` && pstr->offsets[mid - `1`] == offset)
652	--mid;
653	while (mid < pstr->valid_len)
654	if (pstr->wcs[mid] != WEOF)
655	break;
656	else
657	++mid;
658	if (mid == pstr->valid_len)
659	pstr->valid_len = `0`;
660	else
661	{
662	pstr->valid_len = pstr->offsets[mid] - offset;
663	if (pstr->valid_len)
664	{
665	for (low = `0`; low < pstr->valid_len; ++low)
666	pstr->wcs[low] = WEOF;
667	memset (pstr->mbs, `255`, pstr->valid_len);
668	}
669	}
670	pstr->valid_raw_len = pstr->valid_len;
671	}
672	}
673	else
674	#endif
675	{
676	pstr->tip_context = re_string_context_at (pstr, offset - `1`,
677	eflags);
678	#ifdef RE_ENABLE_I18N
679	if (pstr->mb_cur_max > `1`)
680	memmove (pstr->wcs, pstr->wcs + offset,
681	(pstr->valid_len - offset) * sizeof (wint_t));
682	#endif /* RE_ENABLE_I18N */
683	if (__glibc_unlikely (pstr->mbs_allocated))
684	memmove (pstr->mbs, pstr->mbs + offset,
685	pstr->valid_len - offset);
686	pstr->valid_len -= offset;
687	pstr->valid_raw_len -= offset;
688	DEBUG_ASSERT (pstr->valid_len > `0`);
689	}
690	}
691	else
692	{
693	#ifdef RE_ENABLE_I18N
694	/ No, skip all characters until IDX. /
695	Idx prev_valid_len = pstr->valid_len;
696
697	if (__glibc_unlikely (pstr->offsets_needed))
698	{
699	pstr->len = pstr->raw_len - idx + offset;
700	pstr->stop = pstr->raw_stop - idx + offset;
701	pstr->offsets_needed = `0`;
702	}
703	#endif
704	pstr->valid_len = `0`;
705	#ifdef RE_ENABLE_I18N
706	if (pstr->mb_cur_max > `1`)
707	{
708	Idx wcs_idx;
709	wint_t wc = WEOF;
710
711	if (pstr->is_utf8)
712	{
713	const unsigned char raw, p, *end;
714
715	/ Special case UTF-8. Multi-byte chars start with any*
716	byte other than 0x80 - 0xbf. /*
717	raw = pstr->raw_mbs + pstr->raw_mbs_idx;
718	end = raw + (offset - pstr->mb_cur_max);
719	if (end < pstr->raw_mbs)
720	end = pstr->raw_mbs;
721	p = raw + offset - `1`;
722	#ifdef _LIBC
723	/ We know the wchar_t encoding is UCS4, so for the simple*
724	case, ASCII characters, skip the conversion step. /*
725	if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
726	{
727	memset (&pstr->cur_state, `'\0'`, sizeof (mbstate_t));
728	/ pstr->valid_len = 0; /
729	wc = (wchar_t) *p;
730	}
731	else
732	#endif
733	for (; p >= end; --p)
734	if ((*p & `0xc0`) != `0x80`)
735	{
736	mbstate_t cur_state;
737	wchar_t wc2;
738	Idx mlen = raw + pstr->len - p;
739	unsigned char buf[`6`];
740	size_t mbclen;
741
742	const unsigned char *pp = p;
743	if (__glibc_unlikely (pstr->trans != NULL))
744	{
745	int i = mlen < `6` ? mlen : `6`;
746	while (--i >= `0`)
747	buf[i] = pstr->trans[p[i]];
748	pp = buf;
749	}
750	/ XXX Don't use mbrtowc, we know which conversion*
751	to use (UTF-8 -> UCS4). /*
752	memset (&cur_state, `0`, sizeof (cur_state));
753	mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
754	&cur_state);
755	if (raw + offset - p <= mbclen
756	&& mbclen < (size_t) -`2`)
757	{
758	memset (&pstr->cur_state, `'\0'`,
759	sizeof (mbstate_t));
760	pstr->valid_len = mbclen - (raw + offset - p);
761	wc = wc2;
762	}
763	break;
764	}
765	}
766
767	if (wc == WEOF)
768	pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
769	if (wc == WEOF)
770	pstr->tip_context
771	= re_string_context_at (pstr, prev_valid_len - `1`, eflags);
772	else
773	pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != `0`)
774	&& IS_WIDE_WORD_CHAR (wc))
775	? CONTEXT_WORD
776	: ((IS_WIDE_NEWLINE (wc)
777	&& pstr->newline_anchor)
778	? CONTEXT_NEWLINE : `0`));
779	if (__glibc_unlikely (pstr->valid_len))
780	{
781	for (wcs_idx = `0`; wcs_idx < pstr->valid_len; ++wcs_idx)
782	pstr->wcs[wcs_idx] = WEOF;
783	if (pstr->mbs_allocated)
784	memset (pstr->mbs, `255`, pstr->valid_len);
785	}
786	pstr->valid_raw_len = pstr->valid_len;
787	}
788	else
789	#endif /* RE_ENABLE_I18N */
790	{
791	int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - `1`];
792	pstr->valid_raw_len = `0`;
793	if (pstr->trans)
794	c = pstr->trans[c];
795	pstr->tip_context = (bitset_contain (pstr->word_char, c)
796	? CONTEXT_WORD
797	: ((IS_NEWLINE (c) && pstr->newline_anchor)
798	? CONTEXT_NEWLINE : `0`));
799	}
800	}
801	if (!__glibc_unlikely (pstr->mbs_allocated))
802	pstr->mbs += offset;
803	}
804	pstr->raw_mbs_idx = idx;
805	pstr->len -= offset;
806	pstr->stop -= offset;
807
808	/ Then build the buffers. /
809	#ifdef RE_ENABLE_I18N
810	if (pstr->mb_cur_max > `1`)
811	{
812	if (pstr->icase)
813	{
814	reg_errcode_t ret = build_wcs_upper_buffer (pstr);
815	if (__glibc_unlikely (ret != REG_NOERROR))
816	return ret;
817	}
818	else
819	build_wcs_buffer (pstr);
820	}
821	else
822	#endif /* RE_ENABLE_I18N */
823	if (__glibc_unlikely (pstr->mbs_allocated))
824	{
825	if (pstr->icase)
826	build_upper_buffer (pstr);
827	else if (pstr->trans != NULL)
828	re_string_translate_buffer (pstr);
829	}
830	else
831	pstr->valid_len = pstr->len;
832
833	pstr->cur_idx = `0`;
834	return REG_NOERROR;
835	}
836
837	static unsigned char
838	__attribute__ ((pure))
839	re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
840	{
841	int ch;
842	Idx off;
843
844	/ Handle the common (easiest) cases first. /
845	if (__glibc_likely (!pstr->mbs_allocated))
846	return re_string_peek_byte (pstr, idx);
847
848	#ifdef RE_ENABLE_I18N
849	if (pstr->mb_cur_max > `1`
850	&& ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
851	return re_string_peek_byte (pstr, idx);
852	#endif
853
854	off = pstr->cur_idx + idx;
855	#ifdef RE_ENABLE_I18N
856	if (pstr->offsets_needed)
857	off = pstr->offsets[off];
858	#endif
859
860	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
861
862	#ifdef RE_ENABLE_I18N
863	/ Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I*
864	this function returns CAPITAL LETTER I instead of first byte of
865	DOTLESS SMALL LETTER I. The latter would confuse the parser,
866	since peek_byte_case doesn't advance cur_idx in any way. /*
867	if (pstr->offsets_needed && !isascii (ch))
868	return re_string_peek_byte (pstr, idx);
869	#endif
870
871	return ch;
872	}
873
874	static unsigned char
875	re_string_fetch_byte_case (re_string_t *pstr)
876	{
877	if (__glibc_likely (!pstr->mbs_allocated))
878	return re_string_fetch_byte (pstr);
879
880	#ifdef RE_ENABLE_I18N
881	if (pstr->offsets_needed)
882	{
883	Idx off;
884	int ch;
885
886	/ For tr_TR.UTF-8 [[:islower:]] there is*
887	[[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
888	in that case the whole multi-byte character and return
889	the original letter. On the other side, with
890	[[: DOTLESS SMALL LETTER I return [[:I, as doing
891	anything else would complicate things too much. /*
892
893	if (!re_string_first_byte (pstr, pstr->cur_idx))
894	return re_string_fetch_byte (pstr);
895
896	off = pstr->offsets[pstr->cur_idx];
897	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
898
899	if (! isascii (ch))
900	return re_string_fetch_byte (pstr);
901
902	re_string_skip_bytes (pstr,
903	re_string_char_size_at (pstr, pstr->cur_idx));
904	return ch;
905	}
906	#endif
907
908	return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
909	}
910
911	static void
912	re_string_destruct (re_string_t *pstr)
913	{
914	#ifdef RE_ENABLE_I18N
915	re_free (pstr->wcs);
916	re_free (pstr->offsets);
917	#endif /* RE_ENABLE_I18N */
918	if (pstr->mbs_allocated)
919	re_free (pstr->mbs);
920	}
921
922	/ Return the context at IDX in INPUT. /
923
924	static unsigned int
925	re_string_context_at (const re_string_t input, Idx idx, int* eflags)
926	{
927	int c;
928	if (__glibc_unlikely (idx < `0`))
929	/ In this case, we use the value stored in input->tip_context,*
930	since we can't know the character in input->mbs[-1] here. /*
931	return input->tip_context;
932	if (__glibc_unlikely (idx == input->len))
933	return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
934	: CONTEXT_NEWLINE \| CONTEXT_ENDBUF);
935	#ifdef RE_ENABLE_I18N
936	if (input->mb_cur_max > `1`)
937	{
938	wint_t wc;
939	Idx wc_idx = idx;
940	while(input->wcs[wc_idx] == WEOF)
941	{
942	DEBUG_ASSERT (wc_idx >= `0`);
943	--wc_idx;
944	if (wc_idx < `0`)
945	return input->tip_context;
946	}
947	wc = input->wcs[wc_idx];
948	if (__glibc_unlikely (input->word_ops_used != `0`)
949	&& IS_WIDE_WORD_CHAR (wc))
950	return CONTEXT_WORD;
951	return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
952	? CONTEXT_NEWLINE : `0`);
953	}
954	else
955	#endif
956	{
957	c = re_string_byte_at (input, idx);
958	if (bitset_contain (input->word_char, c))
959	return CONTEXT_WORD;
960	return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : `0`;
961	}
962	}
963
964	/ Functions for set operation. /
965
966	static reg_errcode_t
967	__attribute_warn_unused_result__
968	re_node_set_alloc (re_node_set *set, Idx size)
969	{
970	set->alloc = size;
971	set->nelem = `0`;
972	set->elems = re_malloc (Idx, size);
973	if (__glibc_unlikely (set->elems == NULL)
974	&& (MALLOC_0_IS_NONNULL \|\| size != `0`))
975	return REG_ESPACE;
976	return REG_NOERROR;
977	}
978
979	static reg_errcode_t
980	__attribute_warn_unused_result__
981	re_node_set_init_1 (re_node_set *set, Idx elem)
982	{
983	set->alloc = `1`;
984	set->nelem = `1`;
985	set->elems = re_malloc (Idx, `1`);
986	if (__glibc_unlikely (set->elems == NULL))
987	{
988	set->alloc = set->nelem = `0`;
989	return REG_ESPACE;
990	}
991	set->elems[`0`] = elem;
992	return REG_NOERROR;
993	}
994
995	static reg_errcode_t
996	__attribute_warn_unused_result__
997	re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
998	{
999	set->alloc = `2`;
1000	set->elems = re_malloc (Idx, `2`);
1001	if (__glibc_unlikely (set->elems == NULL))
1002	return REG_ESPACE;
1003	if (elem1 == elem2)
1004	{
1005	set->nelem = `1`;
1006	set->elems[`0`] = elem1;
1007	}
1008	else
1009	{
1010	set->nelem = `2`;
1011	if (elem1 < elem2)
1012	{
1013	set->elems[`0`] = elem1;
1014	set->elems[`1`] = elem2;
1015	}
1016	else
1017	{
1018	set->elems[`0`] = elem2;
1019	set->elems[`1`] = elem1;
1020	}
1021	}
1022	return REG_NOERROR;
1023	}
1024
1025	static reg_errcode_t
1026	__attribute_warn_unused_result__
1027	re_node_set_init_copy (re_node_set dest, const* re_node_set *src)
1028	{
1029	dest->nelem = src->nelem;
1030	if (src->nelem > `0`)
1031	{
1032	dest->alloc = dest->nelem;
1033	dest->elems = re_malloc (Idx, dest->alloc);
1034	if (__glibc_unlikely (dest->elems == NULL))
1035	{
1036	dest->alloc = dest->nelem = `0`;
1037	return REG_ESPACE;
1038	}
1039	memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1040	}
1041	else
1042	re_node_set_init_empty (dest);
1043	return REG_NOERROR;
1044	}
1045
1046	/ Calculate the intersection of the sets SRC1 and SRC2. And merge it to*
1047	DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1048	Note: We assume dest->elems is NULL, when dest->alloc is 0. /*
1049
1050	static reg_errcode_t
1051	__attribute_warn_unused_result__
1052	re_node_set_add_intersect (re_node_set dest, const* re_node_set *src1,
1053	const re_node_set *src2)
1054	{
1055	Idx i1, i2, is, id, delta, sbase;
1056	if (src1->nelem == `0` \|\| src2->nelem == `0`)
1057	return REG_NOERROR;
1058
1059	/ We need dest->nelem + 2 * elems_in_intersection; this is a*
1060	conservative estimate. /*
1061	if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1062	{
1063	Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1064	Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1065	if (__glibc_unlikely (new_elems == NULL))
1066	return REG_ESPACE;
1067	dest->elems = new_elems;
1068	dest->alloc = new_alloc;
1069	}
1070
1071	/ Find the items in the intersection of SRC1 and SRC2, and copy*
1072	into the top of DEST those that are not already in DEST itself. /*
1073	sbase = dest->nelem + src1->nelem + src2->nelem;
1074	i1 = src1->nelem - `1`;
1075	i2 = src2->nelem - `1`;
1076	id = dest->nelem - `1`;
1077	for (;;)
1078	{
1079	if (src1->elems[i1] == src2->elems[i2])
1080	{
1081	/ Try to find the item in DEST. Maybe we could binary search? /
1082	while (id >= `0` && dest->elems[id] > src1->elems[i1])
1083	--id;
1084
1085	if (id < `0` \|\| dest->elems[id] != src1->elems[i1])
1086	dest->elems[--sbase] = src1->elems[i1];
1087
1088	if (--i1 < `0` \|\| --i2 < `0`)
1089	break;
1090	}
1091
1092	/ Lower the highest of the two items. /
1093	else if (src1->elems[i1] < src2->elems[i2])
1094	{
1095	if (--i2 < `0`)
1096	break;
1097	}
1098	else
1099	{
1100	if (--i1 < `0`)
1101	break;
1102	}
1103	}
1104
1105	id = dest->nelem - `1`;
1106	is = dest->nelem + src1->nelem + src2->nelem - `1`;
1107	delta = is - sbase + `1`;
1108
1109	/ Now copy. When DELTA becomes zero, the remaining*
1110	DEST elements are already in place; this is more or
1111	less the same loop that is in re_node_set_merge. /*
1112	dest->nelem += delta;
1113	if (delta > `0` && id >= `0`)
1114	for (;;)
1115	{
1116	if (dest->elems[is] > dest->elems[id])
1117	{
1118	/ Copy from the top. /
1119	dest->elems[id + delta--] = dest->elems[is--];
1120	if (delta == `0`)
1121	break;
1122	}
1123	else
1124	{
1125	/ Slide from the bottom. /
1126	dest->elems[id + delta] = dest->elems[id];
1127	if (--id < `0`)
1128	break;
1129	}
1130	}
1131
1132	/ Copy remaining SRC elements. /
1133	memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1134
1135	return REG_NOERROR;
1136	}
1137
1138	/ Calculate the union set of the sets SRC1 and SRC2. And store it to*
1139	DEST. Return value indicate the error code or REG_NOERROR if succeeded. /*
1140
1141	static reg_errcode_t
1142	__attribute_warn_unused_result__
1143	re_node_set_init_union (re_node_set dest, const* re_node_set *src1,
1144	const re_node_set *src2)
1145	{
1146	Idx i1, i2, id;
1147	if (src1 != NULL && src1->nelem > `0` && src2 != NULL && src2->nelem > `0`)
1148	{
1149	dest->alloc = src1->nelem + src2->nelem;
1150	dest->elems = re_malloc (Idx, dest->alloc);
1151	if (__glibc_unlikely (dest->elems == NULL))
1152	return REG_ESPACE;
1153	}
1154	else
1155	{
1156	if (src1 != NULL && src1->nelem > `0`)
1157	return re_node_set_init_copy (dest, src1);
1158	else if (src2 != NULL && src2->nelem > `0`)
1159	return re_node_set_init_copy (dest, src2);
1160	else
1161	re_node_set_init_empty (dest);
1162	return REG_NOERROR;
1163	}
1164	for (i1 = i2 = id = `0` ; i1 < src1->nelem && i2 < src2->nelem ;)
1165	{
1166	if (src1->elems[i1] > src2->elems[i2])
1167	{
1168	dest->elems[id++] = src2->elems[i2++];
1169	continue;
1170	}
1171	if (src1->elems[i1] == src2->elems[i2])
1172	++i2;
1173	dest->elems[id++] = src1->elems[i1++];
1174	}
1175	if (i1 < src1->nelem)
1176	{
1177	memcpy (dest->elems + id, src1->elems + i1,
1178	(src1->nelem - i1) * sizeof (Idx));
1179	id += src1->nelem - i1;
1180	}
1181	else if (i2 < src2->nelem)
1182	{
1183	memcpy (dest->elems + id, src2->elems + i2,
1184	(src2->nelem - i2) * sizeof (Idx));
1185	id += src2->nelem - i2;
1186	}
1187	dest->nelem = id;
1188	return REG_NOERROR;
1189	}
1190
1191	/ Calculate the union set of the sets DEST and SRC. And store it to*
1192	DEST. Return value indicate the error code or REG_NOERROR if succeeded. /*
1193
1194	static reg_errcode_t
1195	__attribute_warn_unused_result__
1196	re_node_set_merge (re_node_set dest, const* re_node_set *src)
1197	{
1198	Idx is, id, sbase, delta;
1199	if (src == NULL \|\| src->nelem == `0`)
1200	return REG_NOERROR;
1201	if (dest->alloc < `2` * src->nelem + dest->nelem)
1202	{
1203	Idx new_alloc = `2` * (src->nelem + dest->alloc);
1204	Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1205	if (__glibc_unlikely (new_buffer == NULL))
1206	return REG_ESPACE;
1207	dest->elems = new_buffer;
1208	dest->alloc = new_alloc;
1209	}
1210
1211	if (__glibc_unlikely (dest->nelem == `0`))
1212	{
1213	dest->nelem = src->nelem;
1214	memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1215	return REG_NOERROR;
1216	}
1217
1218	/ Copy into the top of DEST the items of SRC that are not*
1219	found in DEST. Maybe we could binary search in DEST? /*
1220	for (sbase = dest->nelem + `2` * src->nelem,
1221	is = src->nelem - `1`, id = dest->nelem - `1`; is >= `0` && id >= `0`; )
1222	{
1223	if (dest->elems[id] == src->elems[is])
1224	is--, id--;
1225	else if (dest->elems[id] < src->elems[is])
1226	dest->elems[--sbase] = src->elems[is--];
1227	else / if (dest->elems[id] > src->elems[is]) /
1228	--id;
1229	}
1230
1231	if (is >= `0`)
1232	{
1233	/ If DEST is exhausted, the remaining items of SRC must be unique. /
1234	sbase -= is + `1`;
1235	memcpy (dest->elems + sbase, src->elems, (is + `1`) * sizeof (Idx));
1236	}
1237
1238	id = dest->nelem - `1`;
1239	is = dest->nelem + `2` * src->nelem - `1`;
1240	delta = is - sbase + `1`;
1241	if (delta == `0`)
1242	return REG_NOERROR;
1243
1244	/ Now copy. When DELTA becomes zero, the remaining*
1245	DEST elements are already in place. /*
1246	dest->nelem += delta;
1247	for (;;)
1248	{
1249	if (dest->elems[is] > dest->elems[id])
1250	{
1251	/ Copy from the top. /
1252	dest->elems[id + delta--] = dest->elems[is--];
1253	if (delta == `0`)
1254	break;
1255	}
1256	else
1257	{
1258	/ Slide from the bottom. /
1259	dest->elems[id + delta] = dest->elems[id];
1260	if (--id < `0`)
1261	{
1262	/ Copy remaining SRC elements. /
1263	memcpy (dest->elems, dest->elems + sbase,
1264	delta * sizeof (Idx));
1265	break;
1266	}
1267	}
1268	}
1269
1270	return REG_NOERROR;
1271	}
1272
1273	/ Insert the new element ELEM to the re_node_set* SET.*
1274	SET should not already have ELEM.
1275	Return true if successful. /*
1276
1277	static bool
1278	__attribute_warn_unused_result__
1279	re_node_set_insert (re_node_set *set, Idx elem)
1280	{
1281	Idx idx;
1282	/ In case the set is empty. /
1283	if (set->alloc == `0`)
1284	return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1285
1286	if (__glibc_unlikely (set->nelem) == `0`)
1287	{
1288	/ We already guaranteed above that set->alloc != 0. /
1289	set->elems[`0`] = elem;
1290	++set->nelem;
1291	return true;
1292	}
1293
1294	/ Realloc if we need. /
1295	if (set->alloc == set->nelem)
1296	{
1297	Idx *new_elems;
1298	set->alloc = set->alloc * `2`;
1299	new_elems = re_realloc (set->elems, Idx, set->alloc);
1300	if (__glibc_unlikely (new_elems == NULL))
1301	return false;
1302	set->elems = new_elems;
1303	}
1304
1305	/ Move the elements which follows the new element. Test the*
1306	first element separately to skip a check in the inner loop. /*
1307	if (elem < set->elems[`0`])
1308	{
1309	for (idx = set->nelem; idx > `0`; idx--)
1310	set->elems[idx] = set->elems[idx - `1`];
1311	}
1312	else
1313	{
1314	for (idx = set->nelem; set->elems[idx - `1`] > elem; idx--)
1315	set->elems[idx] = set->elems[idx - `1`];
1316	}
1317
1318	/ Insert the new element. /
1319	set->elems[idx] = elem;
1320	++set->nelem;
1321	return true;
1322	}
1323
1324	/ Insert the new element ELEM to the re_node_set* SET.*
1325	SET should not already have any element greater than or equal to ELEM.
1326	Return true if successful. /*
1327
1328	static bool
1329	__attribute_warn_unused_result__
1330	re_node_set_insert_last (re_node_set *set, Idx elem)
1331	{
1332	/ Realloc if we need. /
1333	if (set->alloc == set->nelem)
1334	{
1335	Idx *new_elems;
1336	set->alloc = (set->alloc + `1`) * `2`;
1337	new_elems = re_realloc (set->elems, Idx, set->alloc);
1338	if (__glibc_unlikely (new_elems == NULL))
1339	return false;
1340	set->elems = new_elems;
1341	}
1342
1343	/ Insert the new element. /
1344	set->elems[set->nelem++] = elem;
1345	return true;
1346	}
1347
1348	/ Compare two node sets SET1 and SET2.*
1349	Return true if SET1 and SET2 are equivalent. /*
1350
1351	static bool
1352	__attribute__ ((pure))
1353	re_node_set_compare (const re_node_set set1, const* re_node_set *set2)
1354	{
1355	Idx i;
1356	if (set1 == NULL \|\| set2 == NULL \|\| set1->nelem != set2->nelem)
1357	return false;
1358	for (i = set1->nelem ; --i >= `0` ; )
1359	if (set1->elems[i] != set2->elems[i])
1360	return false;
1361	return true;
1362	}
1363
1364	/ Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. /
1365
1366	static Idx
1367	__attribute__ ((pure))
1368	re_node_set_contains (const re_node_set *set, Idx elem)
1369	{
1370	__re_size_t idx, right, mid;
1371	if (set->nelem <= `0`)
1372	return `0`;
1373
1374	/ Binary search the element. /
1375	idx = `0`;
1376	right = set->nelem - `1`;
1377	while (idx < right)
1378	{
1379	mid = (idx + right) / `2`;
1380	if (set->elems[mid] < elem)
1381	idx = mid + `1`;
1382	else
1383	right = mid;
1384	}
1385	return set->elems[idx] == elem ? idx + `1` : `0`;
1386	}
1387
1388	static void
1389	re_node_set_remove_at (re_node_set *set, Idx idx)
1390	{
1391	if (idx < `0` \|\| idx >= set->nelem)
1392	return;
1393	--set->nelem;
1394	for (; idx < set->nelem; idx++)
1395	set->elems[idx] = set->elems[idx + `1`];
1396	}
1397
1398
1399	/ Add the token TOKEN to dfa->nodes, and return the index of the token.*
1400	Or return -1 if an error occurred. /*
1401
1402	static Idx
1403	re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1404	{
1405	if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1406	{
1407	size_t new_nodes_alloc = dfa->nodes_alloc * `2`;
1408	Idx new_nexts, new_indices;
1409	re_node_set new_edests, new_eclosures;
1410	re_token_t *new_nodes;
1411
1412	/ Avoid overflows in realloc. /
1413	const size_t max_object_size = MAX (sizeof (re_token_t),
1414	MAX (sizeof (re_node_set),
1415	sizeof (Idx)));
1416	if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1417	< new_nodes_alloc))
1418	return -`1`;
1419
1420	new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1421	if (__glibc_unlikely (new_nodes == NULL))
1422	return -`1`;
1423	dfa->nodes = new_nodes;
1424	new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1425	new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1426	new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1427	new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1428	if (__glibc_unlikely (new_nexts == NULL \|\| new_indices == NULL
1429	\|\| new_edests == NULL \|\| new_eclosures == NULL))
1430	{
1431	re_free (new_nexts);
1432	re_free (new_indices);
1433	re_free (new_edests);
1434	re_free (new_eclosures);
1435	return -`1`;
1436	}
1437	dfa->nexts = new_nexts;
1438	dfa->org_indices = new_indices;
1439	dfa->edests = new_edests;
1440	dfa->eclosures = new_eclosures;
1441	dfa->nodes_alloc = new_nodes_alloc;
1442	}
1443	dfa->nodes[dfa->nodes_len] = token;
1444	dfa->nodes[dfa->nodes_len].constraint = `0`;
1445	#ifdef RE_ENABLE_I18N
1446	dfa->nodes[dfa->nodes_len].accept_mb =
1447	((token.type == OP_PERIOD && dfa->mb_cur_max > `1`)
1448	\|\| token.type == COMPLEX_BRACKET);
1449	#endif
1450	dfa->nexts[dfa->nodes_len] = -`1`;
1451	re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1452	re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1453	return dfa->nodes_len++;
1454	}
1455
1456	static re_hashval_t
1457	calc_state_hash (const re_node_set nodes, unsigned* int context)
1458	{
1459	re_hashval_t hash = nodes->nelem + context;
1460	Idx i;
1461	for (i = `0` ; i < nodes->nelem ; i++)
1462	hash += nodes->elems[i];
1463	return hash;
1464	}
1465
1466	/ Search for the state whose node_set is equivalent to NODES.*
1467	Return the pointer to the state, if we found it in the DFA.
1468	Otherwise create the new one and return it. In case of an error
1469	return NULL and set the error code in ERR.
1470	Note: - We assume NULL as the invalid state, then it is possible that
1471	return value is NULL and ERR is REG_NOERROR.
1472	- We never return non-NULL value in case of any errors, it is for
1473	optimization. /*
1474
1475	static re_dfastate_t *
1476	__attribute_warn_unused_result__
1477	re_acquire_state (reg_errcode_t err, const* re_dfa_t *dfa,
1478	const re_node_set *nodes)
1479	{
1480	re_hashval_t hash;
1481	re_dfastate_t *new_state;
1482	struct re_state_table_entry *spot;
1483	Idx i;
1484	#if defined GCC_LINT \|\| defined lint
1485	/ Suppress bogus uninitialized-variable warnings. /
1486	*err = REG_NOERROR;
1487	#endif
1488	if (__glibc_unlikely (nodes->nelem == `0`))
1489	{
1490	*err = REG_NOERROR;
1491	return NULL;
1492	}
1493	hash = calc_state_hash (nodes, `0`);
1494	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1495
1496	for (i = `0` ; i < spot->num ; i++)
1497	{
1498	re_dfastate_t *state = spot->array[i];
1499	if (hash != state->hash)
1500	continue;
1501	if (re_node_set_compare (&state->nodes, nodes))
1502	return state;
1503	}
1504
1505	/ There are no appropriate state in the dfa, create the new one. /
1506	new_state = create_ci_newstate (dfa, nodes, hash);
1507	if (__glibc_unlikely (new_state == NULL))
1508	*err = REG_ESPACE;
1509
1510	return new_state;
1511	}
1512
1513	/ Search for the state whose node_set is equivalent to NODES and*
1514	whose context is equivalent to CONTEXT.
1515	Return the pointer to the state, if we found it in the DFA.
1516	Otherwise create the new one and return it. In case of an error
1517	return NULL and set the error code in ERR.
1518	Note: - We assume NULL as the invalid state, then it is possible that
1519	return value is NULL and ERR is REG_NOERROR.
1520	- We never return non-NULL value in case of any errors, it is for
1521	optimization. /*
1522
1523	static re_dfastate_t *
1524	__attribute_warn_unused_result__
1525	re_acquire_state_context (reg_errcode_t err, const* re_dfa_t *dfa,
1526	const re_node_set nodes, unsigned* int context)
1527	{
1528	re_hashval_t hash;
1529	re_dfastate_t *new_state;
1530	struct re_state_table_entry *spot;
1531	Idx i;
1532	#if defined GCC_LINT \|\| defined lint
1533	/ Suppress bogus uninitialized-variable warnings. /
1534	*err = REG_NOERROR;
1535	#endif
1536	if (nodes->nelem == `0`)
1537	{
1538	*err = REG_NOERROR;
1539	return NULL;
1540	}
1541	hash = calc_state_hash (nodes, context);
1542	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1543
1544	for (i = `0` ; i < spot->num ; i++)
1545	{
1546	re_dfastate_t *state = spot->array[i];
1547	if (state->hash == hash
1548	&& state->context == context
1549	&& re_node_set_compare (state->entrance_nodes, nodes))
1550	return state;
1551	}
1552	/ There are no appropriate state in 'dfa', create the new one. /
1553	new_state = create_cd_newstate (dfa, nodes, context, hash);
1554	if (__glibc_unlikely (new_state == NULL))
1555	*err = REG_ESPACE;
1556
1557	return new_state;
1558	}
1559
1560	/ Finish initialization of the new state NEWSTATE, and using its hash value*
1561	HASH put in the appropriate bucket of DFA's state table. Return value
1562	indicates the error code if failed. /*
1563
1564	static reg_errcode_t
1565	__attribute_warn_unused_result__
1566	register_state (const re_dfa_t dfa, re_dfastate_t newstate,
1567	re_hashval_t hash)
1568	{
1569	struct re_state_table_entry *spot;
1570	reg_errcode_t err;
1571	Idx i;
1572
1573	newstate->hash = hash;
1574	err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1575	if (__glibc_unlikely (err != REG_NOERROR))
1576	return REG_ESPACE;
1577	for (i = `0`; i < newstate->nodes.nelem; i++)
1578	{
1579	Idx elem = newstate->nodes.elems[i];
1580	if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1581	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1582	return REG_ESPACE;
1583	}
1584
1585	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1586	if (__glibc_unlikely (spot->alloc <= spot->num))
1587	{
1588	Idx new_alloc = `2` * spot->num + `2`;
1589	re_dfastate_t *new_array = re_realloc (spot->array, re_dfastate_t ,
1590	new_alloc);
1591	if (__glibc_unlikely (new_array == NULL))
1592	return REG_ESPACE;
1593	spot->array = new_array;
1594	spot->alloc = new_alloc;
1595	}
1596	spot->array[spot->num++] = newstate;
1597	return REG_NOERROR;
1598	}
1599
1600	static void
1601	free_state (re_dfastate_t *state)
1602	{
1603	re_node_set_free (&state->non_eps_nodes);
1604	re_node_set_free (&state->inveclosure);
1605	if (state->entrance_nodes != &state->nodes)
1606	{
1607	re_node_set_free (state->entrance_nodes);
1608	re_free (state->entrance_nodes);
1609	}
1610	re_node_set_free (&state->nodes);
1611	re_free (state->word_trtable);
1612	re_free (state->trtable);
1613	re_free (state);
1614	}
1615
1616	/ Create the new state which is independent of contexts.*
1617	Return the new state if succeeded, otherwise return NULL. /*
1618
1619	static re_dfastate_t *
1620	__attribute_warn_unused_result__
1621	create_ci_newstate (const re_dfa_t dfa, const* re_node_set *nodes,
1622	re_hashval_t hash)
1623	{
1624	Idx i;
1625	reg_errcode_t err;
1626	re_dfastate_t *newstate;
1627
1628	newstate = (re_dfastate_t ) calloc (sizeof* (re_dfastate_t), `1`);
1629	if (__glibc_unlikely (newstate == NULL))
1630	return NULL;
1631	err = re_node_set_init_copy (&newstate->nodes, nodes);
1632	if (__glibc_unlikely (err != REG_NOERROR))
1633	{
1634	re_free (newstate);
1635	return NULL;
1636	}
1637
1638	newstate->entrance_nodes = &newstate->nodes;
1639	for (i = `0` ; i < nodes->nelem ; i++)
1640	{
1641	re_token_t *node = dfa->nodes + nodes->elems[i];
1642	re_token_type_t type = node->type;
1643	if (type == CHARACTER && !node->constraint)
1644	continue;
1645	#ifdef RE_ENABLE_I18N
1646	newstate->accept_mb \|= node->accept_mb;
1647	#endif /* RE_ENABLE_I18N */
1648
1649	/ If the state has the halt node, the state is a halt state. /
1650	if (type == END_OF_RE)
1651	newstate->halt = `1`;
1652	else if (type == OP_BACK_REF)
1653	newstate->has_backref = `1`;
1654	else if (type == ANCHOR \|\| node->constraint)
1655	newstate->has_constraint = `1`;
1656	}
1657	err = register_state (dfa, newstate, hash);
1658	if (__glibc_unlikely (err != REG_NOERROR))
1659	{
1660	free_state (newstate);
1661	newstate = NULL;
1662	}
1663	return newstate;
1664	}
1665
1666	/ Create the new state which is depend on the context CONTEXT.*
1667	Return the new state if succeeded, otherwise return NULL. /*
1668
1669	static re_dfastate_t *
1670	__attribute_warn_unused_result__
1671	create_cd_newstate (const re_dfa_t dfa, const* re_node_set *nodes,
1672	unsigned int context, re_hashval_t hash)
1673	{
1674	Idx i, nctx_nodes = `0`;
1675	reg_errcode_t err;
1676	re_dfastate_t *newstate;
1677
1678	newstate = (re_dfastate_t ) calloc (sizeof* (re_dfastate_t), `1`);
1679	if (__glibc_unlikely (newstate == NULL))
1680	return NULL;
1681	err = re_node_set_init_copy (&newstate->nodes, nodes);
1682	if (__glibc_unlikely (err != REG_NOERROR))
1683	{
1684	re_free (newstate);
1685	return NULL;
1686	}
1687
1688	newstate->context = context;
1689	newstate->entrance_nodes = &newstate->nodes;
1690
1691	for (i = `0` ; i < nodes->nelem ; i++)
1692	{
1693	re_token_t *node = dfa->nodes + nodes->elems[i];
1694	re_token_type_t type = node->type;
1695	unsigned int constraint = node->constraint;
1696
1697	if (type == CHARACTER && !constraint)
1698	continue;
1699	#ifdef RE_ENABLE_I18N
1700	newstate->accept_mb \|= node->accept_mb;
1701	#endif /* RE_ENABLE_I18N */
1702
1703	/ If the state has the halt node, the state is a halt state. /
1704	if (type == END_OF_RE)
1705	newstate->halt = `1`;
1706	else if (type == OP_BACK_REF)
1707	newstate->has_backref = `1`;
1708
1709	if (constraint)
1710	{
1711	if (newstate->entrance_nodes == &newstate->nodes)
1712	{
1713	re_node_set *entrance_nodes = re_malloc (re_node_set, `1`);
1714	if (__glibc_unlikely (entrance_nodes == NULL))
1715	{
1716	free_state (newstate);
1717	return NULL;
1718	}
1719	newstate->entrance_nodes = entrance_nodes;
1720	if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1721	!= REG_NOERROR)
1722	{
1723	free_state (newstate);
1724	return NULL;
1725	}
1726	nctx_nodes = `0`;
1727	newstate->has_constraint = `1`;
1728	}
1729
1730	if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1731	{
1732	re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1733	++nctx_nodes;
1734	}
1735	}
1736	}
1737	err = register_state (dfa, newstate, hash);
1738	if (__glibc_unlikely (err != REG_NOERROR))
1739	{
1740	free_state (newstate);
1741	newstate = NULL;
1742	}
1743	return newstate;
1744	}
1745

Browse the source code of Glibc/posix/regex_internal.c