pcre_study.c source code [POCO/Foundation/src/pcre_study.c]

1	/*************************************************
2	* Perl-Compatible Regular Expressions *
3	*************************************************/
4
5	/ PCRE is a library of functions to support regular expressions whose syntax*
6	and semantics are as close as possible to those of the Perl 5 language.
7
8	Written by Philip Hazel
9	Copyright (c) 1997-2012 University of Cambridge
10
11	-----------------------------------------------------------------------------
12	Redistribution and use in source and binary forms, with or without
13	modification, are permitted provided that the following conditions are met:
14
15	* Redistributions of source code must retain the above copyright notice,
16	this list of conditions and the following disclaimer.
17
18	* Redistributions in binary form must reproduce the above copyright
19	notice, this list of conditions and the following disclaimer in the
20	documentation and/or other materials provided with the distribution.
21
22	* Neither the name of the University of Cambridge nor the names of its
23	contributors may be used to endorse or promote products derived from
24	this software without specific prior written permission.
25
26	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36	POSSIBILITY OF SUCH DAMAGE.
37	-----------------------------------------------------------------------------
38	*/
39
40	#pragma warning( disable : 4244) // conversion from 'int' to 'unsigned short', possible loss of data
41
42	/ This module contains the external function pcre_study(), along with local*
43	supporting functions. /*
44
45	#include "pcre_config.h"
46	#include "pcre_internal.h"
47
48	#define SET_BIT(c) start_bits[c/8] \|= (1 << (c&7))
49
50	/ Returns from set_start_bits() /
51
52	enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
53
54
55
56	/*************************************************
57	* Find the minimum subject length for a group *
58	*************************************************/
59
60	/ Scan a parenthesized group and compute the minimum length of subject that*
61	is needed to match it. This is a lower bound; it does not mean there is a
62	string of that length that matches. In UTF8 mode, the result is in characters
63	rather than bytes.
64
65	Arguments:
66	re compiled pattern block
67	code pointer to start of group (the bracket)
68	startcode pointer to start of the whole pattern's code
69	options the compiling options
70	recurses chain of recurse_check to catch mutual recursion
71	countptr pointer to call count (to catch over complexity)
72
73	Returns: the minimum length
74	-1 if \C in UTF-8 mode or (ACCEPT) was encountered*
75	-2 internal error (missing capturing bracket)
76	-3 internal error (opcode not listed)
77	*/
78
79	static int
80	find_minlength(const REAL_PCRE re, const* pcre_uchar *code,
81	const pcre_uchar startcode, int* options, recurse_check *recurses,
82	int *countptr)
83	{
84	int length = -`1`;
85	/ PCRE_UTF16 has the same value as PCRE_UTF8. /
86	BOOL utf = (options & PCRE_UTF8) != `0`;
87	BOOL had_recurse = FALSE;
88	recurse_check this_recurse;
89	register int branchlength = `0`;
90	register pcre_uchar cc = (pcre_uchar )code + `1` + LINK_SIZE;
91
92	if ((countptr)++ > `1000`) return* -`1`; / too complex /
93
94	if (code == OP_CBRA \|\| code == OP_SCBRA \|\|
95	code == OP_CBRAPOS \|\| code == OP_SCBRAPOS) cc += IMM2_SIZE;
96
97	/ Scan along the opcodes for this branch. If we get to the end of the*
98	branch, check the length against that of the other branches. /*
99
100	for (;;)
101	{
102	int d, min;
103	pcre_uchar cs, ce;
104	register pcre_uchar op = *cc;
105
106	switch (op)
107	{
108	case OP_COND:
109	case OP_SCOND:
110
111	/ If there is only one branch in a condition, the implied branch has zero*
112	length, so we don't add anything. This covers the DEFINE "condition"
113	automatically. /*
114
115	cs = cc + GET(cc, `1`);
116	if (*cs != OP_ALT)
117	{
118	cc = cs + `1` + LINK_SIZE;
119	break;
120	}
121
122	/ Otherwise we can fall through and treat it the same as any other*
123	subpattern. /*
124
125	case OP_CBRA:
126	case OP_SCBRA:
127	case OP_BRA:
128	case OP_SBRA:
129	case OP_CBRAPOS:
130	case OP_SCBRAPOS:
131	case OP_BRAPOS:
132	case OP_SBRAPOS:
133	case OP_ONCE:
134	case OP_ONCE_NC:
135	d = find_minlength(re, cc, startcode, options, recurses, countptr);
136	if (d < `0`) return d;
137	branchlength += d;
138	do cc += GET(cc, `1`); while (*cc == OP_ALT);
139	cc += `1` + LINK_SIZE;
140	break;
141
142	/ ACCEPT makes things far too complicated; we have to give up. /
143
144	case OP_ACCEPT:
145	case OP_ASSERT_ACCEPT:
146	return -`1`;
147
148	/ Reached end of a branch; if it's a ket it is the end of a nested*
149	call. If it's ALT it is an alternation in a nested call. If it is END it's
150	the end of the outer call. All can be handled by the same code. If an
151	ACCEPT was previously encountered, use the length that was in force at that
152	time, and pass back the shortest ACCEPT length. /*
153
154	case OP_ALT:
155	case OP_KET:
156	case OP_KETRMAX:
157	case OP_KETRMIN:
158	case OP_KETRPOS:
159	case OP_END:
160	if (length < `0` \|\| (!had_recurse && branchlength < length))
161	length = branchlength;
162	if (op != OP_ALT) return length;
163	cc += `1` + LINK_SIZE;
164	branchlength = `0`;
165	had_recurse = FALSE;
166	break;
167
168	/ Skip over assertive subpatterns /
169
170	case OP_ASSERT:
171	case OP_ASSERT_NOT:
172	case OP_ASSERTBACK:
173	case OP_ASSERTBACK_NOT:
174	do cc += GET(cc, `1`); while (*cc == OP_ALT);
175	/ Fall through /
176
177	/ Skip over things that don't match chars /
178
179	case OP_REVERSE:
180	case OP_CREF:
181	case OP_DNCREF:
182	case OP_RREF:
183	case OP_DNRREF:
184	case OP_DEF:
185	case OP_CALLOUT:
186	case OP_SOD:
187	case OP_SOM:
188	case OP_EOD:
189	case OP_EODN:
190	case OP_CIRC:
191	case OP_CIRCM:
192	case OP_DOLL:
193	case OP_DOLLM:
194	case OP_NOT_WORD_BOUNDARY:
195	case OP_WORD_BOUNDARY:
196	cc += PRIV(OP_lengths)[*cc];
197	break;
198
199	/ Skip over a subpattern that has a {0} or {0,x} quantifier /
200
201	case OP_BRAZERO:
202	case OP_BRAMINZERO:
203	case OP_BRAPOSZERO:
204	case OP_SKIPZERO:
205	cc += PRIV(OP_lengths)[*cc];
206	do cc += GET(cc, `1`); while (*cc == OP_ALT);
207	cc += `1` + LINK_SIZE;
208	break;
209
210	/ Handle literal characters and + repetitions /
211
212	case OP_CHAR:
213	case OP_CHARI:
214	case OP_NOT:
215	case OP_NOTI:
216	case OP_PLUS:
217	case OP_PLUSI:
218	case OP_MINPLUS:
219	case OP_MINPLUSI:
220	case OP_POSPLUS:
221	case OP_POSPLUSI:
222	case OP_NOTPLUS:
223	case OP_NOTPLUSI:
224	case OP_NOTMINPLUS:
225	case OP_NOTMINPLUSI:
226	case OP_NOTPOSPLUS:
227	case OP_NOTPOSPLUSI:
228	branchlength++;
229	cc += `2`;
230	#ifdef SUPPORT_UTF
231	if (utf && HAS_EXTRALEN(cc[-`1`])) cc += GET_EXTRALEN(cc[-`1`]);
232	#endif
233	break;
234
235	case OP_TYPEPLUS:
236	case OP_TYPEMINPLUS:
237	case OP_TYPEPOSPLUS:
238	branchlength++;
239	cc += (cc[`1`] == OP_PROP \|\| cc[`1`] == OP_NOTPROP)? `4` : `2`;
240	break;
241
242	/ Handle exact repetitions. The count is already in characters, but we*
243	need to skip over a multibyte character in UTF8 mode. /*
244
245	case OP_EXACT:
246	case OP_EXACTI:
247	case OP_NOTEXACT:
248	case OP_NOTEXACTI:
249	branchlength += GET2(cc,`1`);
250	cc += `2` + IMM2_SIZE;
251	#ifdef SUPPORT_UTF
252	if (utf && HAS_EXTRALEN(cc[-`1`])) cc += GET_EXTRALEN(cc[-`1`]);
253	#endif
254	break;
255
256	case OP_TYPEEXACT:
257	branchlength += GET2(cc,`1`);
258	cc += `2` + IMM2_SIZE + ((cc[`1` + IMM2_SIZE] == OP_PROP
259	\|\| cc[`1` + IMM2_SIZE] == OP_NOTPROP)? `2` : `0`);
260	break;
261
262	/ Handle single-char non-literal matchers /
263
264	case OP_PROP:
265	case OP_NOTPROP:
266	cc += `2`;
267	/ Fall through /
268
269	case OP_NOT_DIGIT:
270	case OP_DIGIT:
271	case OP_NOT_WHITESPACE:
272	case OP_WHITESPACE:
273	case OP_NOT_WORDCHAR:
274	case OP_WORDCHAR:
275	case OP_ANY:
276	case OP_ALLANY:
277	case OP_EXTUNI:
278	case OP_HSPACE:
279	case OP_NOT_HSPACE:
280	case OP_VSPACE:
281	case OP_NOT_VSPACE:
282	branchlength++;
283	cc++;
284	break;
285
286	/ "Any newline" might match two characters, but it also might match just*
287	one. /*
288
289	case OP_ANYNL:
290	branchlength += `1`;
291	cc++;
292	break;
293
294	/ The single-byte matcher means we can't proceed in UTF-8 mode. (In*
295	non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
296	appear, but leave the code, just in case.) /*
297
298	case OP_ANYBYTE:
299	#ifdef SUPPORT_UTF
300	if (utf) return -`1`;
301	#endif
302	branchlength++;
303	cc++;
304	break;
305
306	/ For repeated character types, we have to test for \p and \P, which have*
307	an extra two bytes of parameters. /*
308
309	case OP_TYPESTAR:
310	case OP_TYPEMINSTAR:
311	case OP_TYPEQUERY:
312	case OP_TYPEMINQUERY:
313	case OP_TYPEPOSSTAR:
314	case OP_TYPEPOSQUERY:
315	if (cc[`1`] == OP_PROP \|\| cc[`1`] == OP_NOTPROP) cc += `2`;
316	cc += PRIV(OP_lengths)[op];
317	break;
318
319	case OP_TYPEUPTO:
320	case OP_TYPEMINUPTO:
321	case OP_TYPEPOSUPTO:
322	if (cc[`1` + IMM2_SIZE] == OP_PROP
323	\|\| cc[`1` + IMM2_SIZE] == OP_NOTPROP) cc += `2`;
324	cc += PRIV(OP_lengths)[op];
325	break;
326
327	/ Check a class for variable quantification /
328
329	case OP_CLASS:
330	case OP_NCLASS:
331	#if defined SUPPORT_UTF \|\| defined COMPILE_PCRE16 \|\| defined COMPILE_PCRE32
332	case OP_XCLASS:
333	/ The original code caused an unsigned overflow in 64 bit systems,*
334	so now we use a conditional statement. /*
335	if (op == OP_XCLASS)
336	cc += GET(cc, `1`);
337	else
338	cc += PRIV(OP_lengths)[OP_CLASS];
339	#else
340	cc += PRIV(OP_lengths)[OP_CLASS];
341	#endif
342
343	switch (*cc)
344	{
345	case OP_CRPLUS:
346	case OP_CRMINPLUS:
347	case OP_CRPOSPLUS:
348	branchlength++;
349	/ Fall through /
350
351	case OP_CRSTAR:
352	case OP_CRMINSTAR:
353	case OP_CRQUERY:
354	case OP_CRMINQUERY:
355	case OP_CRPOSSTAR:
356	case OP_CRPOSQUERY:
357	cc++;
358	break;
359
360	case OP_CRRANGE:
361	case OP_CRMINRANGE:
362	case OP_CRPOSRANGE:
363	branchlength += GET2(cc,`1`);
364	cc += `1` + `2` * IMM2_SIZE;
365	break;
366
367	default:
368	branchlength++;
369	break;
370	}
371	break;
372
373	/ Backreferences and subroutine calls are treated in the same way: we find*
374	the minimum length for the subpattern. A recursion, however, causes an
375	a flag to be set that causes the length of this branch to be ignored. The
376	logic is that a recursion can only make sense if there is another
377	alternation that stops the recursing. That will provide the minimum length
378	(when no recursion happens). A backreference within the group that it is
379	referencing behaves in the same way.
380
381	If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
382	matches an empty string (by default it causes a matching failure), so in
383	that case we must set the minimum length to zero. /*
384
385	case OP_DNREF: / Duplicate named pattern back reference /
386	case OP_DNREFI:
387	if ((options & PCRE_JAVASCRIPT_COMPAT) == `0`)
388	{
389	int count = GET2(cc, `1`+IMM2_SIZE);
390	pcre_uchar slot = (pcre_uchar )re +
391	re->name_table_offset + GET2(cc, `1`) * re->name_entry_size;
392	d = INT_MAX;
393	while (count-- > `0`)
394	{
395	ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, `0`));
396	if (cs == NULL) return -`2`;
397	do ce += GET(ce, `1`); while (*ce == OP_ALT);
398	if (cc > cs && cc < ce) / Simple recursion /
399	{
400	d = `0`;
401	had_recurse = TRUE;
402	break;
403	}
404	else
405	{
406	recurse_check *r = recurses;
407	for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
408	if (r != NULL) / Mutual recursion /
409	{
410	d = `0`;
411	had_recurse = TRUE;
412	break;
413	}
414	else
415	{
416	int dd;
417	this_recurse.prev = recurses;
418	this_recurse.group = cs;
419	dd = find_minlength(re, cs, startcode, options, &this_recurse,
420	countptr);
421	if (dd < d) d = dd;
422	}
423	}
424	slot += re->name_entry_size;
425	}
426	}
427	else d = `0`;
428	cc += `1` + `2`*IMM2_SIZE;
429	goto REPEAT_BACK_REFERENCE;
430
431	case OP_REF: / Single back reference /
432	case OP_REFI:
433	if ((options & PCRE_JAVASCRIPT_COMPAT) == `0`)
434	{
435	ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, `1`));
436	if (cs == NULL) return -`2`;
437	do ce += GET(ce, `1`); while (*ce == OP_ALT);
438	if (cc > cs && cc < ce) / Simple recursion /
439	{
440	d = `0`;
441	had_recurse = TRUE;
442	}
443	else
444	{
445	recurse_check *r = recurses;
446	for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
447	if (r != NULL) / Mutual recursion /
448	{
449	d = `0`;
450	had_recurse = TRUE;
451	}
452	else
453	{
454	this_recurse.prev = recurses;
455	this_recurse.group = cs;
456	d = find_minlength(re, cs, startcode, options, &this_recurse,
457	countptr);
458	}
459	}
460	}
461	else d = `0`;
462	cc += `1` + IMM2_SIZE;
463
464	/ Handle repeated back references /
465
466	REPEAT_BACK_REFERENCE:
467	switch (*cc)
468	{
469	case OP_CRSTAR:
470	case OP_CRMINSTAR:
471	case OP_CRQUERY:
472	case OP_CRMINQUERY:
473	case OP_CRPOSSTAR:
474	case OP_CRPOSQUERY:
475	min = `0`;
476	cc++;
477	break;
478
479	case OP_CRPLUS:
480	case OP_CRMINPLUS:
481	case OP_CRPOSPLUS:
482	min = `1`;
483	cc++;
484	break;
485
486	case OP_CRRANGE:
487	case OP_CRMINRANGE:
488	case OP_CRPOSRANGE:
489	min = GET2(cc, `1`);
490	cc += `1` + `2` * IMM2_SIZE;
491	break;
492
493	default:
494	min = `1`;
495	break;
496	}
497
498	branchlength += min * d;
499	break;
500
501	/ We can easily detect direct recursion, but not mutual recursion. This is*
502	caught by a recursion depth count. /*
503
504	case OP_RECURSE:
505	cs = ce = (pcre_uchar *)startcode + GET(cc, `1`);
506	do ce += GET(ce, `1`); while (*ce == OP_ALT);
507	if (cc > cs && cc < ce) / Simple recursion /
508	had_recurse = TRUE;
509	else
510	{
511	recurse_check *r = recurses;
512	for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
513	if (r != NULL) / Mutual recursion /
514	had_recurse = TRUE;
515	else
516	{
517	this_recurse.prev = recurses;
518	this_recurse.group = cs;
519	branchlength += find_minlength(re, cs, startcode, options,
520	&this_recurse, countptr);
521	}
522	}
523	cc += `1` + LINK_SIZE;
524	break;
525
526	/ Anything else does not or need not match a character. We can get the*
527	item's length from the table, but for those that can match zero occurrences
528	of a character, we must take special action for UTF-8 characters. As it
529	happens, the "NOT" versions of these opcodes are used at present only for
530	ASCII characters, so they could be omitted from this list. However, in
531	future that may change, so we include them here so as not to leave a
532	gotcha for a future maintainer. /*
533
534	case OP_UPTO:
535	case OP_UPTOI:
536	case OP_NOTUPTO:
537	case OP_NOTUPTOI:
538	case OP_MINUPTO:
539	case OP_MINUPTOI:
540	case OP_NOTMINUPTO:
541	case OP_NOTMINUPTOI:
542	case OP_POSUPTO:
543	case OP_POSUPTOI:
544	case OP_NOTPOSUPTO:
545	case OP_NOTPOSUPTOI:
546
547	case OP_STAR:
548	case OP_STARI:
549	case OP_NOTSTAR:
550	case OP_NOTSTARI:
551	case OP_MINSTAR:
552	case OP_MINSTARI:
553	case OP_NOTMINSTAR:
554	case OP_NOTMINSTARI:
555	case OP_POSSTAR:
556	case OP_POSSTARI:
557	case OP_NOTPOSSTAR:
558	case OP_NOTPOSSTARI:
559
560	case OP_QUERY:
561	case OP_QUERYI:
562	case OP_NOTQUERY:
563	case OP_NOTQUERYI:
564	case OP_MINQUERY:
565	case OP_MINQUERYI:
566	case OP_NOTMINQUERY:
567	case OP_NOTMINQUERYI:
568	case OP_POSQUERY:
569	case OP_POSQUERYI:
570	case OP_NOTPOSQUERY:
571	case OP_NOTPOSQUERYI:
572
573	cc += PRIV(OP_lengths)[op];
574	#ifdef SUPPORT_UTF
575	if (utf && HAS_EXTRALEN(cc[-`1`])) cc += GET_EXTRALEN(cc[-`1`]);
576	#endif
577	break;
578
579	/ Skip these, but we need to add in the name length. /
580
581	case OP_MARK:
582	case OP_PRUNE_ARG:
583	case OP_SKIP_ARG:
584	case OP_THEN_ARG:
585	cc += PRIV(OP_lengths)[op] + cc[`1`];
586	break;
587
588	/ The remaining opcodes are just skipped over. /
589
590	case OP_CLOSE:
591	case OP_COMMIT:
592	case OP_FAIL:
593	case OP_PRUNE:
594	case OP_SET_SOM:
595	case OP_SKIP:
596	case OP_THEN:
597	cc += PRIV(OP_lengths)[op];
598	break;
599
600	/ This should not occur: we list all opcodes explicitly so that when*
601	new ones get added they are properly considered. /*
602
603	default:
604	return -`3`;
605	}
606	}
607	/ Control never gets here /
608	}
609
610
611
612	/*************************************************
613	* Set a bit and maybe its alternate case *
614	*************************************************/
615
616	/ Given a character, set its first byte's bit in the table, and also the*
617	corresponding bit for the other version of a letter if we are caseless. In
618	UTF-8 mode, for characters greater than 127, we can only do the caseless thing
619	when Unicode property support is available.
620
621	Arguments:
622	start_bits points to the bit map
623	p points to the character
624	caseless the caseless flag
625	cd the block with char table pointers
626	utf TRUE for UTF-8 / UTF-16 / UTF-32 mode
627
628	Returns: pointer after the character
629	*/
630
631	static const pcre_uchar *
632	set_table_bit(pcre_uint8 start_bits, const* pcre_uchar *p, BOOL caseless,
633	compile_data *cd, BOOL utf)
634	{
635	pcre_uint32 c = *p;
636
637	#ifdef COMPILE_PCRE8
638	SET_BIT(c);
639
640	#ifdef SUPPORT_UTF
641	if (utf && c > `127`)
642	{
643	GETCHARINC(c, p);
644	#ifdef SUPPORT_UCP
645	if (caseless)
646	{
647	pcre_uchar buff[`6`];
648	c = UCD_OTHERCASE(c);
649	(void)PRIV(ord2utf)(c, buff);
650	SET_BIT(buff[`0`]);
651	}
652	#endif /* Not SUPPORT_UCP */
653	return p;
654	}
655	#else /* Not SUPPORT_UTF */
656	(void)(utf); / Stops warning for unused parameter /
657	#endif /* SUPPORT_UTF */
658
659	/ Not UTF-8 mode, or character is less than 127. /
660
661	if (caseless && (cd->ctypes[c] & ctype_letter) != `0`) SET_BIT(cd->fcc[c]);
662	return p + `1`;
663	#endif /* COMPILE_PCRE8 */
664
665	#if defined COMPILE_PCRE16 \|\| defined COMPILE_PCRE32
666	if (c > `0xff`)
667	{
668	c = `0xff`;
669	caseless = FALSE;
670	}
671	SET_BIT(c);
672
673	#ifdef SUPPORT_UTF
674	if (utf && c > `127`)
675	{
676	GETCHARINC(c, p);
677	#ifdef SUPPORT_UCP
678	if (caseless)
679	{
680	c = UCD_OTHERCASE(c);
681	if (c > `0xff`)
682	c = `0xff`;
683	SET_BIT(c);
684	}
685	#endif /* SUPPORT_UCP */
686	return p;
687	}
688	#else /* Not SUPPORT_UTF */
689	(void)(utf); / Stops warning for unused parameter /
690	#endif /* SUPPORT_UTF */
691
692	if (caseless && (cd->ctypes[c] & ctype_letter) != `0`) SET_BIT(cd->fcc[c]);
693	return p + `1`;
694	#endif
695	}
696
697
698
699	/*************************************************
700	* Set bits for a positive character type *
701	*************************************************/
702
703	/ This function sets starting bits for a character type. In UTF-8 mode, we can*
704	only do a direct setting for bytes less than 128, as otherwise there can be
705	confusion with bytes in the middle of UTF-8 characters. In a "traditional"
706	environment, the tables will only recognize ASCII characters anyway, but in at
707	least one Windows environment, some higher bytes bits were set in the tables.
708	So we deal with that case by considering the UTF-8 encoding.
709
710	Arguments:
711	start_bits the starting bitmap
712	cbit type the type of character wanted
713	table_limit 32 for non-UTF-8; 16 for UTF-8
714	cd the block with char table pointers
715
716	Returns: nothing
717	*/
718
719	static void
720	set_type_bits(pcre_uint8 start_bits, int* cbit_type, unsigned int table_limit,
721	compile_data *cd)
722	{
723	register pcre_uint32 c;
724	for (c = `0`; c < table_limit; c++) start_bits[c] \|= cd->cbits[c+cbit_type];
725	#if defined SUPPORT_UTF && defined COMPILE_PCRE8
726	if (table_limit == `32`) return;
727	for (c = `128`; c < `256`; c++)
728	{
729	if ((cd->cbits[c/`8`] & (`1` << (c&`7`))) != `0`)
730	{
731	pcre_uchar buff[`6`];
732	(void)PRIV(ord2utf)(c, buff);
733	SET_BIT(buff[`0`]);
734	}
735	}
736	#endif
737	}
738
739
740	/*************************************************
741	* Set bits for a negative character type *
742	*************************************************/
743
744	/ This function sets starting bits for a negative character type such as \D.*
745	In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
746	otherwise there can be confusion with bytes in the middle of UTF-8 characters.
747	Unlike in the positive case, where we can set appropriate starting bits for
748	specific high-valued UTF-8 characters, in this case we have to set the bits for
749	all high-valued characters. The lowest is 0xc2, but we overkill by starting at
750	0xc0 (192) for simplicity.
751
752	Arguments:
753	start_bits the starting bitmap
754	cbit type the type of character wanted
755	table_limit 32 for non-UTF-8; 16 for UTF-8
756	cd the block with char table pointers
757
758	Returns: nothing
759	*/
760
761	static void
762	set_nottype_bits(pcre_uint8 start_bits, int* cbit_type, unsigned int table_limit,
763	compile_data *cd)
764	{
765	register pcre_uint32 c;
766	for (c = `0`; c < table_limit; c++) start_bits[c] \|= ~cd->cbits[c+cbit_type];
767	#if defined SUPPORT_UTF && defined COMPILE_PCRE8
768	if (table_limit != `32`) for (c = `24`; c < `32`; c++) start_bits[c] = `0xff`;
769	#endif
770	}
771
772
773
774	/*************************************************
775	* Create bitmap of starting bytes *
776	*************************************************/
777
778	/ This function scans a compiled unanchored expression recursively and*
779	attempts to build a bitmap of the set of possible starting bytes. As time goes
780	by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
781	useful for parenthesized groups in patterns such as (a)b where the group*
782	provides some optional starting bytes but scanning must continue at the outer
783	level to find at least one mandatory byte. At the outermost level, this
784	function fails unless the result is SSB_DONE.
785
786	Arguments:
787	code points to an expression
788	start_bits points to a 32-byte table, initialized to 0
789	utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
790	cd the block with char table pointers
791
792	Returns: SSB_FAIL => Failed to find any starting bytes
793	SSB_DONE => Found mandatory starting bytes
794	SSB_CONTINUE => Found optional starting bytes
795	SSB_UNKNOWN => Hit an unrecognized opcode
796	*/
797
798	static int
799	set_start_bits(const pcre_uchar code, pcre_uint8 start_bits, BOOL utf,
800	compile_data *cd)
801	{
802	register pcre_uint32 c;
803	int yield = SSB_DONE;
804	#if defined SUPPORT_UTF && defined COMPILE_PCRE8
805	int table_limit = utf? `16`:`32`;
806	#else
807	int table_limit = `32`;
808	#endif
809
810	#if 0
811	/ ========================================================================= /
812	/ The following comment and code was inserted in January 1999. In May 2006,*
813	when it was observed to cause compiler warnings about unused values, I took it
814	out again. If anybody is still using OS/2, they will have to put it back
815	manually. /*
816
817	/ This next statement and the later reference to dummy are here in order to*
818	trick the optimizer of the IBM C compiler for OS/2 into generating correct
819	code. Apparently IBM isn't going to fix the problem, and we would rather not
820	disable optimization (in this module it actually makes a big difference, and
821	the pcre module can use all the optimization it can get). /*
822
823	volatile int dummy;
824	/ ========================================================================= /
825	#endif
826
827	do
828	{
829	BOOL try_next = TRUE;
830	const pcre_uchar *tcode = code + `1` + LINK_SIZE;
831
832	if (code == OP_CBRA \|\| code == OP_SCBRA \|\|
833	code == OP_CBRAPOS \|\| code == OP_SCBRAPOS) tcode += IMM2_SIZE;
834
835	while (try_next) / Loop for items in this branch /
836	{
837	int rc;
838
839	switch(*tcode)
840	{
841	/ If we reach something we don't understand, it means a new opcode has*
842	been created that hasn't been added to this code. Hopefully this problem
843	will be discovered during testing. /*
844
845	default:
846	return SSB_UNKNOWN;
847
848	/ Fail for a valid opcode that implies no starting bits. /
849
850	case OP_ACCEPT:
851	case OP_ASSERT_ACCEPT:
852	case OP_ALLANY:
853	case OP_ANY:
854	case OP_ANYBYTE:
855	case OP_CIRC:
856	case OP_CIRCM:
857	case OP_CLOSE:
858	case OP_COMMIT:
859	case OP_COND:
860	case OP_CREF:
861	case OP_DEF:
862	case OP_DNCREF:
863	case OP_DNREF:
864	case OP_DNREFI:
865	case OP_DNRREF:
866	case OP_DOLL:
867	case OP_DOLLM:
868	case OP_END:
869	case OP_EOD:
870	case OP_EODN:
871	case OP_EXTUNI:
872	case OP_FAIL:
873	case OP_MARK:
874	case OP_NOT:
875	case OP_NOTEXACT:
876	case OP_NOTEXACTI:
877	case OP_NOTI:
878	case OP_NOTMINPLUS:
879	case OP_NOTMINPLUSI:
880	case OP_NOTMINQUERY:
881	case OP_NOTMINQUERYI:
882	case OP_NOTMINSTAR:
883	case OP_NOTMINSTARI:
884	case OP_NOTMINUPTO:
885	case OP_NOTMINUPTOI:
886	case OP_NOTPLUS:
887	case OP_NOTPLUSI:
888	case OP_NOTPOSPLUS:
889	case OP_NOTPOSPLUSI:
890	case OP_NOTPOSQUERY:
891	case OP_NOTPOSQUERYI:
892	case OP_NOTPOSSTAR:
893	case OP_NOTPOSSTARI:
894	case OP_NOTPOSUPTO:
895	case OP_NOTPOSUPTOI:
896	case OP_NOTPROP:
897	case OP_NOTQUERY:
898	case OP_NOTQUERYI:
899	case OP_NOTSTAR:
900	case OP_NOTSTARI:
901	case OP_NOTUPTO:
902	case OP_NOTUPTOI:
903	case OP_NOT_HSPACE:
904	case OP_NOT_VSPACE:
905	case OP_PRUNE:
906	case OP_PRUNE_ARG:
907	case OP_RECURSE:
908	case OP_REF:
909	case OP_REFI:
910	case OP_REVERSE:
911	case OP_RREF:
912	case OP_SCOND:
913	case OP_SET_SOM:
914	case OP_SKIP:
915	case OP_SKIP_ARG:
916	case OP_SOD:
917	case OP_SOM:
918	case OP_THEN:
919	case OP_THEN_ARG:
920	return SSB_FAIL;
921
922	/ A "real" property test implies no starting bits, but the fake property*
923	PT_CLIST identifies a list of characters. These lists are short, as they
924	are used for characters with more than one "other case", so there is no
925	point in recognizing them for OP_NOTPROP. /*
926
927	case OP_PROP:
928	if (tcode[`1`] != PT_CLIST) return SSB_FAIL;
929	{
930	const pcre_uint32 *p = PRIV(ucd_caseless_sets) + tcode[`2`];
931	while ((c = *p++) < NOTACHAR)
932	{
933	#if defined SUPPORT_UTF && defined COMPILE_PCRE8
934	if (utf)
935	{
936	pcre_uchar buff[`6`];
937	(void)PRIV(ord2utf)(c, buff);
938	c = buff[`0`];
939	}
940	#endif
941	if (c > `0xff`) SET_BIT(`0xff`); else SET_BIT(c);
942	}
943	}
944	try_next = FALSE;
945	break;
946
947	/ We can ignore word boundary tests. /
948
949	case OP_WORD_BOUNDARY:
950	case OP_NOT_WORD_BOUNDARY:
951	tcode++;
952	break;
953
954	/ If we hit a bracket or a positive lookahead assertion, recurse to set*
955	bits from within the subpattern. If it can't find anything, we have to
956	give up. If it finds some mandatory character(s), we are done for this
957	branch. Otherwise, carry on scanning after the subpattern. /*
958
959	case OP_BRA:
960	case OP_SBRA:
961	case OP_CBRA:
962	case OP_SCBRA:
963	case OP_BRAPOS:
964	case OP_SBRAPOS:
965	case OP_CBRAPOS:
966	case OP_SCBRAPOS:
967	case OP_ONCE:
968	case OP_ONCE_NC:
969	case OP_ASSERT:
970	rc = set_start_bits(tcode, start_bits, utf, cd);
971	if (rc == SSB_FAIL \|\| rc == SSB_UNKNOWN) return rc;
972	if (rc == SSB_DONE) try_next = FALSE; else
973	{
974	do tcode += GET(tcode, `1`); while (*tcode == OP_ALT);
975	tcode += `1` + LINK_SIZE;
976	}
977	break;
978
979	/ If we hit ALT or KET, it means we haven't found anything mandatory in*
980	this branch, though we might have found something optional. For ALT, we
981	continue with the next alternative, but we have to arrange that the final
982	result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
983	return SSB_CONTINUE: if this is the top level, that indicates failure,
984	but after a nested subpattern, it causes scanning to continue. /*
985
986	case OP_ALT:
987	yield = SSB_CONTINUE;
988	try_next = FALSE;
989	break;
990
991	case OP_KET:
992	case OP_KETRMAX:
993	case OP_KETRMIN:
994	case OP_KETRPOS:
995	return SSB_CONTINUE;
996
997	/ Skip over callout /
998
999	case OP_CALLOUT:
1000	tcode += `2` + `2`*LINK_SIZE;
1001	break;
1002
1003	/ Skip over lookbehind and negative lookahead assertions /
1004
1005	case OP_ASSERT_NOT:
1006	case OP_ASSERTBACK:
1007	case OP_ASSERTBACK_NOT:
1008	do tcode += GET(tcode, `1`); while (*tcode == OP_ALT);
1009	tcode += `1` + LINK_SIZE;
1010	break;
1011
1012	/ BRAZERO does the bracket, but carries on. /
1013
1014	case OP_BRAZERO:
1015	case OP_BRAMINZERO:
1016	case OP_BRAPOSZERO:
1017	rc = set_start_bits(++tcode, start_bits, utf, cd);
1018	if (rc == SSB_FAIL \|\| rc == SSB_UNKNOWN) return rc;
1019	/ =========================================================================*
1020	See the comment at the head of this function concerning the next line,
1021	which was an old fudge for the benefit of OS/2.
1022	dummy = 1;
1023	========================================================================= /*
1024	do tcode += GET(tcode,`1`); while (*tcode == OP_ALT);
1025	tcode += `1` + LINK_SIZE;
1026	break;
1027
1028	/ SKIPZERO skips the bracket. /
1029
1030	case OP_SKIPZERO:
1031	tcode++;
1032	do tcode += GET(tcode,`1`); while (*tcode == OP_ALT);
1033	tcode += `1` + LINK_SIZE;
1034	break;
1035
1036	/ Single-char * or ? sets the bit and tries the next item /
1037
1038	case OP_STAR:
1039	case OP_MINSTAR:
1040	case OP_POSSTAR:
1041	case OP_QUERY:
1042	case OP_MINQUERY:
1043	case OP_POSQUERY:
1044	tcode = set_table_bit(start_bits, tcode + `1`, FALSE, cd, utf);
1045	break;
1046
1047	case OP_STARI:
1048	case OP_MINSTARI:
1049	case OP_POSSTARI:
1050	case OP_QUERYI:
1051	case OP_MINQUERYI:
1052	case OP_POSQUERYI:
1053	tcode = set_table_bit(start_bits, tcode + `1`, TRUE, cd, utf);
1054	break;
1055
1056	/ Single-char upto sets the bit and tries the next /
1057
1058	case OP_UPTO:
1059	case OP_MINUPTO:
1060	case OP_POSUPTO:
1061	tcode = set_table_bit(start_bits, tcode + `1` + IMM2_SIZE, FALSE, cd, utf);
1062	break;
1063
1064	case OP_UPTOI:
1065	case OP_MINUPTOI:
1066	case OP_POSUPTOI:
1067	tcode = set_table_bit(start_bits, tcode + `1` + IMM2_SIZE, TRUE, cd, utf);
1068	break;
1069
1070	/ At least one single char sets the bit and stops /
1071
1072	case OP_EXACT:
1073	tcode += IMM2_SIZE;
1074	/ Fall through /
1075	case OP_CHAR:
1076	case OP_PLUS:
1077	case OP_MINPLUS:
1078	case OP_POSPLUS:
1079	(void)set_table_bit(start_bits, tcode + `1`, FALSE, cd, utf);
1080	try_next = FALSE;
1081	break;
1082
1083	case OP_EXACTI:
1084	tcode += IMM2_SIZE;
1085	/ Fall through /
1086	case OP_CHARI:
1087	case OP_PLUSI:
1088	case OP_MINPLUSI:
1089	case OP_POSPLUSI:
1090	(void)set_table_bit(start_bits, tcode + `1`, TRUE, cd, utf);
1091	try_next = FALSE;
1092	break;
1093
1094	/ Special spacing and line-terminating items. These recognize specific*
1095	lists of characters. The difference between VSPACE and ANYNL is that the
1096	latter can match the two-character CRLF sequence, but that is not
1097	relevant for finding the first character, so their code here is
1098	identical. /*
1099
1100	case OP_HSPACE:
1101	SET_BIT(CHAR_HT);
1102	SET_BIT(CHAR_SPACE);
1103	#ifdef SUPPORT_UTF
1104	if (utf)
1105	{
1106	#ifdef COMPILE_PCRE8
1107	SET_BIT(`0xC2`); / For U+00A0 /
1108	SET_BIT(`0xE1`); / For U+1680, U+180E /
1109	SET_BIT(`0xE2`); / For U+2000 - U+200A, U+202F, U+205F /
1110	SET_BIT(`0xE3`); / For U+3000 /
1111	#elif defined COMPILE_PCRE16 \|\| defined COMPILE_PCRE32
1112	SET_BIT(`0xA0`);
1113	SET_BIT(`0xFF`); / For characters > 255 /
1114	#endif /* COMPILE_PCRE[8\|16\|32] */
1115	}
1116	else
1117	#endif /* SUPPORT_UTF */
1118	{
1119	#ifndef EBCDIC
1120	SET_BIT(`0xA0`);
1121	#endif /* Not EBCDIC */
1122	#if defined COMPILE_PCRE16 \|\| defined COMPILE_PCRE32
1123	SET_BIT(`0xFF`); / For characters > 255 /
1124	#endif /* COMPILE_PCRE[16\|32] */
1125	}
1126	try_next = FALSE;
1127	break;
1128
1129	case OP_ANYNL:
1130	case OP_VSPACE:
1131	SET_BIT(CHAR_LF);
1132	SET_BIT(CHAR_VT);
1133	SET_BIT(CHAR_FF);
1134	SET_BIT(CHAR_CR);
1135	#ifdef SUPPORT_UTF
1136	if (utf)
1137	{
1138	#ifdef COMPILE_PCRE8
1139	SET_BIT(`0xC2`); / For U+0085 /
1140	SET_BIT(`0xE2`); / For U+2028, U+2029 /
1141	#elif defined COMPILE_PCRE16 \|\| defined COMPILE_PCRE32
1142	SET_BIT(CHAR_NEL);
1143	SET_BIT(`0xFF`); / For characters > 255 /
1144	#endif /* COMPILE_PCRE[8\|16\|32] */
1145	}
1146	else
1147	#endif /* SUPPORT_UTF */
1148	{
1149	SET_BIT(CHAR_NEL);
1150	#if defined COMPILE_PCRE16 \|\| defined COMPILE_PCRE32
1151	SET_BIT(`0xFF`); / For characters > 255 /
1152	#endif
1153	}
1154	try_next = FALSE;
1155	break;
1156
1157	/ Single character types set the bits and stop. Note that if PCRE_UCP*
1158	is set, we do not see these op codes because \d etc are converted to
1159	properties. Therefore, these apply in the case when only characters less
1160	than 256 are recognized to match the types. /*
1161
1162	case OP_NOT_DIGIT:
1163	set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1164	try_next = FALSE;
1165	break;
1166
1167	case OP_DIGIT:
1168	set_type_bits(start_bits, cbit_digit, table_limit, cd);
1169	try_next = FALSE;
1170	break;
1171
1172	/ The cbit_space table has vertical tab as whitespace; we no longer*
1173	have to play fancy tricks because Perl added VT to its whitespace at
1174	release 5.18. PCRE added it at release 8.34. /*
1175
1176	case OP_NOT_WHITESPACE:
1177	set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1178	try_next = FALSE;
1179	break;
1180
1181	case OP_WHITESPACE:
1182	set_type_bits(start_bits, cbit_space, table_limit, cd);
1183	try_next = FALSE;
1184	break;
1185
1186	case OP_NOT_WORDCHAR:
1187	set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1188	try_next = FALSE;
1189	break;
1190
1191	case OP_WORDCHAR:
1192	set_type_bits(start_bits, cbit_word, table_limit, cd);
1193	try_next = FALSE;
1194	break;
1195
1196	/ One or more character type fudges the pointer and restarts, knowing*
1197	it will hit a single character type and stop there. /*
1198
1199	case OP_TYPEPLUS:
1200	case OP_TYPEMINPLUS:
1201	case OP_TYPEPOSPLUS:
1202	tcode++;
1203	break;
1204
1205	case OP_TYPEEXACT:
1206	tcode += `1` + IMM2_SIZE;
1207	break;
1208
1209	/ Zero or more repeats of character types set the bits and then*
1210	try again. /*
1211
1212	case OP_TYPEUPTO:
1213	case OP_TYPEMINUPTO:
1214	case OP_TYPEPOSUPTO:
1215	tcode += IMM2_SIZE; / Fall through /
1216
1217	case OP_TYPESTAR:
1218	case OP_TYPEMINSTAR:
1219	case OP_TYPEPOSSTAR:
1220	case OP_TYPEQUERY:
1221	case OP_TYPEMINQUERY:
1222	case OP_TYPEPOSQUERY:
1223	switch(tcode[`1`])
1224	{
1225	default:
1226	case OP_ANY:
1227	case OP_ALLANY:
1228	return SSB_FAIL;
1229
1230	case OP_HSPACE:
1231	SET_BIT(CHAR_HT);
1232	SET_BIT(CHAR_SPACE);
1233	#ifdef SUPPORT_UTF
1234	if (utf)
1235	{
1236	#ifdef COMPILE_PCRE8
1237	SET_BIT(`0xC2`); / For U+00A0 /
1238	SET_BIT(`0xE1`); / For U+1680, U+180E /
1239	SET_BIT(`0xE2`); / For U+2000 - U+200A, U+202F, U+205F /
1240	SET_BIT(`0xE3`); / For U+3000 /
1241	#elif defined COMPILE_PCRE16 \|\| defined COMPILE_PCRE32
1242	SET_BIT(`0xA0`);
1243	SET_BIT(`0xFF`); / For characters > 255 /
1244	#endif /* COMPILE_PCRE[8\|16\|32] */
1245	}
1246	else
1247	#endif /* SUPPORT_UTF */
1248	#ifndef EBCDIC
1249	SET_BIT(`0xA0`);
1250	#endif /* Not EBCDIC */
1251	break;
1252
1253	case OP_ANYNL:
1254	case OP_VSPACE:
1255	SET_BIT(CHAR_LF);
1256	SET_BIT(CHAR_VT);
1257	SET_BIT(CHAR_FF);
1258	SET_BIT(CHAR_CR);
1259	#ifdef SUPPORT_UTF
1260	if (utf)
1261	{
1262	#ifdef COMPILE_PCRE8
1263	SET_BIT(`0xC2`); / For U+0085 /
1264	SET_BIT(`0xE2`); / For U+2028, U+2029 /
1265	#elif defined COMPILE_PCRE16 \|\| defined COMPILE_PCRE32
1266	SET_BIT(CHAR_NEL);
1267	SET_BIT(`0xFF`); / For characters > 255 /
1268	#endif /* COMPILE_PCRE16 */
1269	}
1270	else
1271	#endif /* SUPPORT_UTF */
1272	SET_BIT(CHAR_NEL);
1273	break;
1274
1275	case OP_NOT_DIGIT:
1276	set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1277	break;
1278
1279	case OP_DIGIT:
1280	set_type_bits(start_bits, cbit_digit, table_limit, cd);
1281	break;
1282
1283	/ The cbit_space table has vertical tab as whitespace; we no longer*
1284	have to play fancy tricks because Perl added VT to its whitespace at
1285	release 5.18. PCRE added it at release 8.34. /*
1286
1287	case OP_NOT_WHITESPACE:
1288	set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1289	break;
1290
1291	case OP_WHITESPACE:
1292	set_type_bits(start_bits, cbit_space, table_limit, cd);
1293	break;
1294
1295	case OP_NOT_WORDCHAR:
1296	set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1297	break;
1298
1299	case OP_WORDCHAR:
1300	set_type_bits(start_bits, cbit_word, table_limit, cd);
1301	break;
1302	}
1303
1304	tcode += `2`;
1305	break;
1306
1307	/ Character class where all the information is in a bit map: set the*
1308	bits and either carry on or not, according to the repeat count. If it was
1309	a negative class, and we are operating with UTF-8 characters, any byte
1310	with a value >= 0xc4 is a potentially valid starter because it starts a
1311	character with a value > 255. /*
1312
1313	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
1314	case OP_XCLASS:
1315	if ((tcode[`1` + LINK_SIZE] & XCL_HASPROP) != `0`)
1316	return SSB_FAIL;
1317	/ All bits are set. /
1318	if ((tcode[`1` + LINK_SIZE] & XCL_MAP) == `0` && (tcode[`1` + LINK_SIZE] & XCL_NOT) != `0`)
1319	return SSB_FAIL;
1320	#endif
1321	/ Fall through /
1322
1323	case OP_NCLASS:
1324	#if defined SUPPORT_UTF && defined COMPILE_PCRE8
1325	if (utf)
1326	{
1327	start_bits[`24`] \|= `0xf0`; / Bits for 0xc4 - 0xc8 /
1328	memset(start_bits+`25`, `0xff`, `7`); / Bits for 0xc9 - 0xff /
1329	}
1330	#endif
1331	#if defined COMPILE_PCRE16 \|\| defined COMPILE_PCRE32
1332	SET_BIT(`0xFF`); / For characters > 255 /
1333	#endif
1334	/ Fall through /
1335
1336	case OP_CLASS:
1337	{
1338	pcre_uint8 *map;
1339	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
1340	map = NULL;
1341	if (*tcode == OP_XCLASS)
1342	{
1343	if ((tcode[`1` + LINK_SIZE] & XCL_MAP) != `0`)
1344	map = (pcre_uint8 *)(tcode + `1` + LINK_SIZE + `1`);
1345	tcode += GET(tcode, `1`);
1346	}
1347	else
1348	#endif
1349	{
1350	tcode++;
1351	map = (pcre_uint8 *)tcode;
1352	tcode += `32` / sizeof(pcre_uchar);
1353	}
1354
1355	/ In UTF-8 mode, the bits in a bit map correspond to character*
1356	values, not to byte values. However, the bit map we are constructing is
1357	for byte values. So we have to do a conversion for characters whose
1358	value is > 127. In fact, there are only two possible starting bytes for
1359	characters in the range 128 - 255. /*
1360
1361	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
1362	if (map != NULL)
1363	#endif
1364	{
1365	#if defined SUPPORT_UTF && defined COMPILE_PCRE8
1366	if (utf)
1367	{
1368	for (c = `0`; c < `16`; c++) start_bits[c] \|= map[c];
1369	for (c = `128`; c < `256`; c++)
1370	{
1371	if ((map[c/`8`] & (`1` << (c&`7`))) != `0`)
1372	{
1373	int d = (c >> `6`) \| `0xc0`; / Set bit for this starter /
1374	start_bits[d/`8`] \|= (`1` << (d&`7`)); / and then skip on to the /
1375	c = (c & `0xc0`) + `0x40` - `1`; / next relevant character. /
1376	}
1377	}
1378	}
1379	else
1380	#endif
1381	{
1382	/ In non-UTF-8 mode, the two bit maps are completely compatible. /
1383	for (c = `0`; c < `32`; c++) start_bits[c] \|= map[c];
1384	}
1385	}
1386
1387	/ Advance past the bit map, and act on what follows. For a zero*
1388	minimum repeat, continue; otherwise stop processing. /*
1389
1390	switch (*tcode)
1391	{
1392	case OP_CRSTAR:
1393	case OP_CRMINSTAR:
1394	case OP_CRQUERY:
1395	case OP_CRMINQUERY:
1396	case OP_CRPOSSTAR:
1397	case OP_CRPOSQUERY:
1398	tcode++;
1399	break;
1400
1401	case OP_CRRANGE:
1402	case OP_CRMINRANGE:
1403	case OP_CRPOSRANGE:
1404	if (GET2(tcode, `1`) == `0`) tcode += `1` + `2` * IMM2_SIZE;
1405	else try_next = FALSE;
1406	break;
1407
1408	default:
1409	try_next = FALSE;
1410	break;
1411	}
1412	}
1413	break; / End of bitmap class handling /
1414
1415	} / End of switch /
1416	} / End of try_next loop /
1417
1418	code += GET(code, `1`); / Advance to next branch /
1419	}
1420	while (*code == OP_ALT);
1421	return yield;
1422	}
1423
1424
1425
1426
1427
1428	/*************************************************
1429	* Study a compiled expression *
1430	*************************************************/
1431
1432	/ This function is handed a compiled expression that it must study to produce*
1433	information that will speed up the matching. It returns a pcre[16]_extra block
1434	which then gets handed back to pcre_exec().
1435
1436	Arguments:
1437	re points to the compiled expression
1438	options contains option bits
1439	errorptr points to where to place error messages;
1440	set NULL unless error
1441
1442	Returns: pointer to a pcre[16]_extra block, with study_data filled in and
1443	the appropriate flags set;
1444	NULL on error or if no optimization possible
1445	*/
1446
1447	#if defined COMPILE_PCRE8
1448	PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1449	pcre_study(const pcre external_re, int* options, const char **errorptr)
1450	#elif defined COMPILE_PCRE16
1451	PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1452	pcre16_study(const pcre16 external_re, int* options, const char **errorptr)
1453	#elif defined COMPILE_PCRE32
1454	PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1455	pcre32_study(const pcre32 external_re, int* options, const char **errorptr)
1456	#endif
1457	{
1458	int min;
1459	int count = `0`;
1460	BOOL bits_set = FALSE;
1461	pcre_uint8 start_bits[`32`];
1462	PUBL(extra) *extra = NULL;
1463	pcre_study_data *study;
1464	const pcre_uint8 *tables;
1465	pcre_uchar *code;
1466	compile_data compile_block;
1467	const REAL_PCRE re = (const* REAL_PCRE *)external_re;
1468
1469
1470	*errorptr = NULL;
1471
1472	if (re == NULL \|\| re->magic_number != MAGIC_NUMBER)
1473	{
1474	*errorptr = "argument is not a compiled regular expression";
1475	return NULL;
1476	}
1477
1478	if ((re->flags & PCRE_MODE) == `0`)
1479	{
1480	#if defined COMPILE_PCRE8
1481	*errorptr = "argument not compiled in 8 bit mode";
1482	#elif defined COMPILE_PCRE16
1483	*errorptr = "argument not compiled in 16 bit mode";
1484	#elif defined COMPILE_PCRE32
1485	*errorptr = "argument not compiled in 32 bit mode";
1486	#endif
1487	return NULL;
1488	}
1489
1490	if ((options & ~PUBLIC_STUDY_OPTIONS) != `0`)
1491	{
1492	*errorptr = "unknown or incorrect option bit(s) set";
1493	return NULL;
1494	}
1495
1496	code = (pcre_uchar *)re + re->name_table_offset +
1497	(re->name_count * re->name_entry_size);
1498
1499	/ For an anchored pattern, or an unanchored pattern that has a first char, or*
1500	a multiline pattern that matches only at "line starts", there is no point in
1501	seeking a list of starting bytes. /*
1502
1503	if ((re->options & PCRE_ANCHORED) == `0` &&
1504	(re->flags & (PCRE_FIRSTSET\|PCRE_STARTLINE)) == `0`)
1505	{
1506	int rc;
1507
1508	/ Set the character tables in the block that is passed around /
1509
1510	tables = re->tables;
1511
1512	#if defined COMPILE_PCRE8
1513	if (tables == NULL)
1514	(void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1515	(void *)(&tables));
1516	#elif defined COMPILE_PCRE16
1517	if (tables == NULL)
1518	(void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1519	(void *)(&tables));
1520	#elif defined COMPILE_PCRE32
1521	if (tables == NULL)
1522	(void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1523	(void *)(&tables));
1524	#endif
1525
1526	compile_block.lcc = tables + lcc_offset;
1527	compile_block.fcc = tables + fcc_offset;
1528	compile_block.cbits = tables + cbits_offset;
1529	compile_block.ctypes = tables + ctypes_offset;
1530
1531	/ See if we can find a fixed set of initial characters for the pattern. /
1532
1533	memset(start_bits, `0`, `32` * sizeof(pcre_uint8));
1534	rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != `0`,
1535	&compile_block);
1536	bits_set = rc == SSB_DONE;
1537	if (rc == SSB_UNKNOWN)
1538	{
1539	*errorptr = "internal error: opcode not recognized";
1540	return NULL;
1541	}
1542	}
1543
1544	/ Find the minimum length of subject string. /
1545
1546	switch(min = find_minlength(re, code, code, re->options, NULL, &count))
1547	{
1548	case -`2`: errorptr = "internal error: missing capturing bracket"; return* NULL;
1549	case -`3`: errorptr = "internal error: opcode not recognized"; return* NULL;
1550	default: break;
1551	}
1552
1553	/ If a set of starting bytes has been identified, or if the minimum length is*
1554	greater than zero, or if JIT optimization has been requested, or if
1555	PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1556	pcre_study_data block. The study data is put in the latter, which is pointed to
1557	by the former, which may also get additional data set later by the calling
1558	program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1559	save it in a field for returning via the pcre_fullinfo() function so that if it
1560	becomes variable in the future, we don't have to change that code. /*
1561
1562	if (bits_set \|\| min > `0` \|\| (options & (
1563	#ifdef SUPPORT_JIT
1564	PCRE_STUDY_JIT_COMPILE \| PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE \|
1565	PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE \|
1566	#endif
1567	PCRE_STUDY_EXTRA_NEEDED)) != `0`)
1568	{
1569	extra = (PUBL(extra) *)(PUBL(malloc))
1570	(sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1571	if (extra == NULL)
1572	{
1573	*errorptr = "failed to get memory";
1574	return NULL;
1575	}
1576
1577	study = (pcre_study_data )((char* )extra + sizeof*(PUBL(extra)));
1578	extra->flags = PCRE_EXTRA_STUDY_DATA;
1579	extra->study_data = study;
1580
1581	study->size = sizeof(pcre_study_data);
1582	study->flags = `0`;
1583
1584	/ Set the start bits always, to avoid unset memory errors if the*
1585	study data is written to a file, but set the flag only if any of the bits
1586	are set, to save time looking when none are. /*
1587
1588	if (bits_set)
1589	{
1590	study->flags \|= PCRE_STUDY_MAPPED;
1591	memcpy(study->start_bits, start_bits, sizeof(start_bits));
1592	}
1593	else memset(study->start_bits, `0`, `32` * sizeof(pcre_uint8));
1594
1595	#ifdef PCRE_DEBUG
1596	if (bits_set)
1597	{
1598	pcre_uint8 *ptr = start_bits;
1599	int i;
1600
1601	printf("Start bits:\n");
1602	for (i = `0`; i < `32`; i++)
1603	printf("%3d: %02x%s", i * `8`, *ptr++, ((i + `1`) & `0x7`) != `0`? " " : "\n");
1604	}
1605	#endif
1606
1607	/ Always set the minlength value in the block, because the JIT compiler*
1608	makes use of it. However, don't set the bit unless the length is greater than
1609	zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1610	checking the zero case. /*
1611
1612	if (min > `0`)
1613	{
1614	study->flags \|= PCRE_STUDY_MINLEN;
1615	study->minlength = min;
1616	}
1617	else study->minlength = `0`;
1618
1619	/ If JIT support was compiled and requested, attempt the JIT compilation.*
1620	If no starting bytes were found, and the minimum length is zero, and JIT
1621	compilation fails, abandon the extra block and return NULL, unless
1622	PCRE_STUDY_EXTRA_NEEDED is set. /*
1623
1624	#ifdef SUPPORT_JIT
1625	extra->executable_jit = NULL;
1626	if ((options & PCRE_STUDY_JIT_COMPILE) != `0`)
1627	PRIV(jit_compile)(re, extra, JIT_COMPILE);
1628	if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != `0`)
1629	PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1630	if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != `0`)
1631	PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1632
1633	if (study->flags == `0` && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == `0` &&
1634	(options & PCRE_STUDY_EXTRA_NEEDED) == `0`)
1635	{
1636	#if defined COMPILE_PCRE8
1637	pcre_free_study(extra);
1638	#elif defined COMPILE_PCRE16
1639	pcre16_free_study(extra);
1640	#elif defined COMPILE_PCRE32
1641	pcre32_free_study(extra);
1642	#endif
1643	extra = NULL;
1644	}
1645	#endif
1646	}
1647
1648	return extra;
1649	}
1650
1651
1652	/*************************************************
1653	* Free the study data *
1654	*************************************************/
1655
1656	/ This function frees the memory that was obtained by pcre_study().*
1657
1658	Argument: a pointer to the pcre[16]_extra block
1659	Returns: nothing
1660	*/
1661
1662	#if defined COMPILE_PCRE8
1663	PCRE_EXP_DEFN void
1664	pcre_free_study(pcre_extra *extra)
1665	#elif defined COMPILE_PCRE16
1666	PCRE_EXP_DEFN void
1667	pcre16_free_study(pcre16_extra *extra)
1668	#elif defined COMPILE_PCRE32
1669	PCRE_EXP_DEFN void
1670	pcre32_free_study(pcre32_extra *extra)
1671	#endif
1672	{
1673	if (extra == NULL)
1674	return;
1675	#ifdef SUPPORT_JIT
1676	if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != `0` &&
1677	extra->executable_jit != NULL)
1678	PRIV(jit_free)(extra->executable_jit);
1679	#endif
1680	PUBL(free)(extra);
1681	}
1682
1683	/ End of pcre_study.c /
1684

Browse the source code of POCO/Foundation/src/pcre_study.c