pcre_compile.c source code [ClickHouse/contrib/poco/Foundation/src/pcre_compile.c]

1	/*************************************************
2	* Perl-Compatible Regular Expressions *
3	*************************************************/
4
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
7
8	Written by Philip Hazel
9	Copyright (c) 1997-2016 University of Cambridge
10
11	-----------------------------------------------------------------------------
12	Redistribution and use in source and binary forms, with or without
13	modification, are permitted provided that the following conditions are met:
14
15	* Redistributions of source code must retain the above copyright notice,
16	this list of conditions and the following disclaimer.
17
18	* Redistributions in binary form must reproduce the above copyright
19	notice, this list of conditions and the following disclaimer in the
20	documentation and/or other materials provided with the distribution.
21
22	* Neither the name of the University of Cambridge nor the names of its
23	contributors may be used to endorse or promote products derived from
24	this software without specific prior written permission.
25
26	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36	POSSIBILITY OF SUCH DAMAGE.
37	-----------------------------------------------------------------------------
38	*/
39
/* "#pragma warning" is specific to Microsoft's compiler. Guarding the block
keeps other compilers from reporting unknown-pragma diagnostics while leaving
the MSVC build exactly as before. */

#if defined(_MSC_VER)
#pragma warning( disable : 4018) // '<' : signed/unsigned mismatch
#pragma warning( disable : 4127) // conditional expression is constant
#pragma warning( disable : 4244) // conversion from 'int' to 'unsigned short', possible loss of data
#pragma warning( disable : 4701) // local variable 'othercase' may be used without having been initialized
#pragma warning( disable : 4702) // unreachable code
#endif
45
/* This module contains the external function pcre_compile(), along with
supporting internal functions that are not used by other modules. */
48
49	#include "pcre_config.h"
50
51	#define NLBLOCK cd /* Block containing newline information */
52	#define PSSTART start_pattern /* Field containing pattern start */
53	#define PSEND end_pattern /* Field containing pattern end */
54
55	#include "pcre_internal.h"
56
57
/* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
is also used by pcretest. PCRE_DEBUG is not defined when building a production
library. We do not need to select pcre16_printint.c specially, because the
COMPILE_PCREx macro will already be appropriately set. */
62
63	#ifdef PCRE_DEBUG
/* pcre_printint.c should not include any headers */
65	#define PCRE_INCLUDED
66	#include "pcre_printint.c"
67	#undef PCRE_INCLUDED
68	#endif
69
70
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte array a (byte b/8, bit b&7). Both the array argument and the whole
expansion are parenthesized so that the macro stays a single expression even
when it is given an expression such as "map + 1" or is used inside a larger
expression. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
74
/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX to allow for adding in group terminating bytes, so that we don't have
to check them every time. */

#define OFLOW_MAX (INT_MAX - 20)
81
82	/ Definitions to allow mutual recursion /
83
84	static int
85	add_list_to_class(pcre_uint8 , pcre_uchar , int, compile_data ,
86	const pcre_uint32 , unsigned* int);
87
88	static BOOL
89	compile_regex(int, pcre_uchar *, const* pcre_uchar *, int* , BOOL, BOOL, int, int*,
90	pcre_uint32 , pcre_int32 , pcre_uint32 , pcre_int32 , branch_chain *,
91	compile_data , int* *);
92
93
94
95	/*************************************************
96	* Code parameters and static tables *
97	*************************************************/
98
/* This value specifies the size of stack workspace that is used during the
first pre-compile phase that determines how much memory is required. The regex
is partly compiled into this space, but the compiled parts are discarded as
soon as they can be, so that hopefully there will never be an overrun. The code
does, however, check for an overrun. The largest amount I've seen used is 218,
so this number is very generous.

The same workspace is used during the second, actual compile phase for
remembering forward references to groups so that they can be filled in at the
end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
is 4 there is plenty of room for most patterns. However, the memory can get
filled up by repetitions of forward references, for example patterns like
/(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
that the workspace is expanded using malloc() in this situation. The value
below is therefore a minimum, and we put a maximum on it for safety. The
minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
kicks in at the same number of forward references in all cases. */

#define COMPILE_WORK_SIZE (2048*LINK_SIZE)
#define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
119
/* This value determines the size of the initial vector that is used for
remembering named groups during the pre-compile. It is allocated on the stack,
but if it is too small, it is expanded using malloc(), in a similar way to the
workspace. The value is the number of slots in the list. */

#define NAMED_GROUP_LIST_SIZE  20
126
/* The overrun tests check for a slightly smaller size so that they detect the
overrun before it actually does run off the end of the data block. */

#define WORK_SIZE_SAFETY_MARGIN (100)
131
/* Private flags added to firstchar and reqchar. */

#define REQ_CASELESS    (1 << 0)        /* Indicates caselessness */
#define REQ_VARY        (1 << 1)        /* Reqchar followed non-literal item */
/* Negative values for the firstchar and reqchar flags */
#define REQ_UNSET       (-2)
#define REQ_NONE        (-1)
139
/* Repeated character flags. */

/* The constant keeps its long type; the suffix is written as an upper-case L
because a lower-case one is easily misread as the digit 1. */

#define UTF_LENGTH     0x10000000L      /* The char contains its length. */
143
144	/ Table for handling escaped characters in the range '0'-'z'. Positive returns*
145	are simple data values; negative values are for special things like \d and so
146	on. Zero means further processing is needed (for things like \x), or the escape
147	is invalid. /*
148
149	#ifndef EBCDIC
150
151	/ This is the "normal" table for ASCII systems or for EBCDIC systems running*
152	in UTF-8 mode. /*
153
154	static const short int escapes[] = {
155	`0`, `0`,
156	`0`, `0`,
157	`0`, `0`,
158	`0`, `0`,
159	`0`, `0`,
160	CHAR_COLON, CHAR_SEMICOLON,
161	CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
162	CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
163	CHAR_COMMERCIAL_AT, -ESC_A,
164	-ESC_B, -ESC_C,
165	-ESC_D, -ESC_E,
166	`0`, -ESC_G,
167	-ESC_H, `0`,
168	`0`, -ESC_K,
169	`0`, `0`,
170	-ESC_N, `0`,
171	-ESC_P, -ESC_Q,
172	-ESC_R, -ESC_S,
173	`0`, `0`,
174	-ESC_V, -ESC_W,
175	-ESC_X, `0`,
176	-ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
177	CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
178	CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
179	CHAR_GRAVE_ACCENT, ESC_a,
180	-ESC_b, `0`,
181	-ESC_d, ESC_e,
182	ESC_f, `0`,
183	-ESC_h, `0`,
184	`0`, -ESC_k,
185	`0`, `0`,
186	ESC_n, `0`,
187	-ESC_p, `0`,
188	ESC_r, -ESC_s,
189	ESC_tee, `0`,
190	-ESC_v, -ESC_w,
191	`0`, `0`,
192	-ESC_z
193	};
194
195	#else
196
197	/ This is the "abnormal" table for EBCDIC systems without UTF-8 support. /
198
199	static const short int escapes[] = {
200	/ 48 / `0`, `0`, `0`, `'.'`, `'<'`, `'('`, `'+'`, `'\|'`,
201	/ 50 / `'&'`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
202	/ 58 / `0`, `0`, `'!'`, `'$'`, `'*'`, `')'`, `';'`, `'~'`,
203	/ 60 / `'-'`, `'/'`, `0`, `0`, `0`, `0`, `0`, `0`,
204	/ 68 / `0`, `0`, `'\|'`, `','`, `'%'`, `'_'`, `'>'`, `'?'`,
205	/ 70 / `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
206	/ 78 / `0`, '`', `':'`, `'#'`, `'@'`, `'\''`, `'='`, `'"'`,
207	/ 80 / `0`, ESC_a, -ESC_b, `0`, -ESC_d, ESC_e, ESC_f, `0`,
208	/ 88 /-ESC_h, `0`, `0`, `'{'`, `0`, `0`, `0`, `0`,
209	/ 90 / `0`, `0`, -ESC_k, `0`, `0`, ESC_n, `0`, -ESC_p,
210	/ 98 / `0`, ESC_r, `0`, `'}'`, `0`, `0`, `0`, `0`,
211	/ A0 / `0`, `'~'`, -ESC_s, ESC_tee, `0`,-ESC_v, -ESC_w, `0`,
212	/ A8 / `0`,-ESC_z, `0`, `0`, `0`, `'['`, `0`, `0`,
213	/ B0 / `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
214	/ B8 / `0`, `0`, `0`, `0`, `0`, `']'`, `'='`, `'-'`,
215	/ C0 / `'{'`,-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, `0`, -ESC_G,
216	/ C8 /-ESC_H, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
217	/ D0 / `'}'`, `0`, -ESC_K, `0`, `0`,-ESC_N, `0`, -ESC_P,
218	/ D8 /-ESC_Q,-ESC_R, `0`, `0`, `0`, `0`, `0`, `0`,
219	/ E0 / `'\\'`, `0`, -ESC_S, `0`, `0`,-ESC_V, -ESC_W, -ESC_X,
220	/ E8 / `0`,-ESC_Z, `0`, `0`, `0`, `0`, `0`, `0`,
221	/ F0 / `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
222	/ F8 / `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`
223	};
224
225	/ We also need a table of characters that may follow \c in an EBCDIC*
226	environment for characters 0-31. /*
227
228	static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
229
230	#endif
231
232
/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
searched linearly. Put all the names into a single string, in order to reduce
the number of relocations when a shared library is dynamically linked. The
string is built from string macros so that it works in UTF-8 mode on EBCDIC
platforms. */

typedef struct verbitem {
  int   len;                 /* Length of verb name */
  int   op;                  /* Op when no arg, or -1 if arg mandatory */
  int   op_arg;              /* Op when arg present, or -1 if not allowed */
} verbitem;
244
245	static const char verbnames[] =
246	"\0" / Empty name is a shorthand for MARK /
247	STRING_MARK0
248	STRING_ACCEPT0
249	STRING_COMMIT0
250	STRING_F0
251	STRING_FAIL0
252	STRING_PRUNE0
253	STRING_SKIP0
254	STRING_THEN;
255
256	static const verbitem verbs[] = {
257	{ `0`, -`1`, OP_MARK },
258	{ `4`, -`1`, OP_MARK },
259	{ `6`, OP_ACCEPT, -`1` },
260	{ `6`, OP_COMMIT, -`1` },
261	{ `1`, OP_FAIL, -`1` },
262	{ `4`, OP_FAIL, -`1` },
263	{ `5`, OP_PRUNE, OP_PRUNE_ARG },
264	{ `4`, OP_SKIP, OP_SKIP_ARG },
265	{ `4`, OP_THEN, OP_THEN_ARG }
266	};
267
268	static const int verbcount = sizeof(verbs)/sizeof(verbitem);
269
270
271	/ Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in*
272	another regex library. /*
273
274	static const pcre_uchar sub_start_of_word[] = {
275	CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
276	CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, `'\0'` };
277
278	static const pcre_uchar sub_end_of_word[] = {
279	CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
280	CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
281	CHAR_RIGHT_PARENTHESIS, `'\0'` };
282
283
284	/ Tables of names of POSIX character classes and their lengths. The names are*
285	now all in a single string, to reduce the number of relocations when a shared
286	library is dynamically loaded. The list of lengths is terminated by a zero
287	length entry. The first three must be alpha, lower, upper, as this is assumed
288	for handling case independence. The indices for graph, print, and punct are
289	needed, so identify them. /*
290
291	static const char posix_names[] =
292	STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
293	STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
294	STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
295	STRING_word0 STRING_xdigit;
296
297	static const pcre_uint8 posix_name_lengths[] = {
298	`5`, `5`, `5`, `5`, `5`, `5`, `5`, `5`, `5`, `5`, `5`, `5`, `4`, `6`, `0` };
299
/* Positions of graph, print, and punct in the POSIX class name list (counting
from zero). */

#define PC_GRAPH  8
#define PC_PRINT  9
#define PC_PUNCT 10
303
304
305	/ Table of class bit maps for each POSIX class. Each class is formed from a*
306	base map, with an optional addition or removal of another map. Then, for some
307	classes, there is some additional tweaking: for [:blank:] the vertical space
308	characters are removed, and for [:alpha:] and [:alnum:] the underscore
309	character is removed. The triples in the table consist of the base map offset,
310	second map offset or -1 if no second map, and a non-negative value for map
311	addition or a negative value for map subtraction (if there are two maps). The
312	absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
313	remove vertical space characters, 2 => remove underscore. /*
314
315	static const int posix_class_maps[] = {
316	cbit_word, cbit_digit, -`2`, / alpha /
317	cbit_lower, -`1`, `0`, / lower /
318	cbit_upper, -`1`, `0`, / upper /
319	cbit_word, -`1`, `2`, / alnum - word without underscore /
320	cbit_print, cbit_cntrl, `0`, / ascii /
321	cbit_space, -`1`, `1`, / blank - a GNU extension /
322	cbit_cntrl, -`1`, `0`, / cntrl /
323	cbit_digit, -`1`, `0`, / digit /
324	cbit_graph, -`1`, `0`, / graph /
325	cbit_print, -`1`, `0`, / print /
326	cbit_punct, -`1`, `0`, / punct /
327	cbit_space, -`1`, `0`, / space /
328	cbit_word, -`1`, `0`, / word - a Perl extension /
329	cbit_xdigit,-`1`, `0` / xdigit /
330	};
331
/* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
Unicode property escapes. */

#ifdef SUPPORT_UCP
static const pcre_uchar string_PNd[]  = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pNd[]  = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXsp[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXsp[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXwd[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXwd[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };

static const pcre_uchar *substitutes[] = {
  string_PNd,           /* \D */
  string_pNd,           /* \d */
  string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
  string_pXsp,          /* \s */   /* space and POSIX space are the same. */
  string_PXwd,          /* \W */
  string_pXwd           /* \w */
};

/* The POSIX class substitutes must be in the order of the POSIX class names,
defined above, and there are both positive and negative cases. NULL means no
general substitute of a Unicode property escape (\p or \P). However, for some
POSIX classes (e.g. graph, print, punct) a special property code is compiled
directly. */

static const pcre_uchar string_pL[] =   {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pLl[] =  {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pLu[] =  {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXan[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_h[] =    {
  CHAR_BACKSLASH, CHAR_h, '\0' };
static const pcre_uchar string_pXps[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PL[] =   {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PLl[] =  {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PLu[] =  {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXan[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_H[] =    {
  CHAR_BACKSLASH, CHAR_H, '\0' };
static const pcre_uchar string_PXps[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };

static const pcre_uchar *posix_substitutes[] = {
  string_pL,            /* alpha */
  string_pLl,           /* lower */
  string_pLu,           /* upper */
  string_pXan,          /* alnum */
  NULL,                 /* ascii */
  string_h,             /* blank */
  NULL,                 /* cntrl */
  string_pNd,           /* digit */
  NULL,                 /* graph */
  NULL,                 /* print */
  NULL,                 /* punct */
  string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
  string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
  NULL,                 /* xdigit */
  /* Negated cases */
  string_PL,            /* ^alpha */
  string_PLl,           /* ^lower */
  string_PLu,           /* ^upper */
  string_PXan,          /* ^alnum */
  NULL,                 /* ^ascii */
  string_H,             /* ^blank */
  NULL,                 /* ^cntrl */
  string_PNd,           /* ^digit */
  NULL,                 /* ^graph */
  NULL,                 /* ^print */
  NULL,                 /* ^punct */
  string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
  string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
  NULL                  /* ^xdigit */
};
#define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
#endif
438
/* Two-level stringification: XSTRING(s) expands the macro s first and then
turns the result into a string literal, whereas STRING(a) stringifies its
argument exactly as written. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. Do not ever re-use any error number, because
they are documented. Always add a new error instead. Messages marked DEAD below
are no longer used. This used to be a table of strings, but in order to reduce
the number of relocations needed when a shared library is loaded dynamically,
it is now one long string. We cannot use a table of offsets, because the
lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
simply count through to the one we want - this isn't a performance issue
because these strings are used only when there is a compilation error.

Each substring ends with \0 to insert a null character. This includes the final
substring, so that the whole string ends with \0\0, which can be detected when
counting through. */

static const char error_texts[] =
  "no error\0"
  "\\ at end of pattern\0"
  "\\c at end of pattern\0"
  "unrecognized character follows \\\0"
  "numbers out of order in {} quantifier\0"
  /* 5 */
  "number too big in {} quantifier\0"
  "missing terminating ] for character class\0"
  "invalid escape sequence in character class\0"
  "range out of order in character class\0"
  "nothing to repeat\0"
  /* 10 */
  "internal error: invalid forward reference offset\0"
  "internal error: unexpected repeat\0"
  "unrecognized character after (? or (?-\0"
  "POSIX named classes are supported only within a class\0"
  "missing )\0"
  /* 15 */
  "reference to non-existent subpattern\0"
  "erroffset passed as NULL\0"
  "unknown option bit(s) set\0"
  "missing ) after comment\0"
  "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */
  "regular expression is too large\0"
  "failed to get memory\0"
  "unmatched parentheses\0"
  "internal error: code overflow\0"
  "unrecognized character after (?<\0"
  /* 25 */
  "lookbehind assertion is not fixed length\0"
  "malformed number or name after (?(\0"
  "conditional group contains more than two branches\0"
  "assertion expected after (?( or (?(?C)\0"
  "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */
  "unknown POSIX class name\0"
  "POSIX collating elements are not supported\0"
  "this version of PCRE is compiled without UTF support\0"
  "spare error\0"  /** DEAD **/
  "character value in \\x{} or \\o{} is too large\0"
  /* 35 */
  "invalid condition (?(0)\0"
  "\\C not allowed in lookbehind assertion\0"
  "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
  "number after (?C is > 255\0"
  "closing ) for (?C expected\0"
  /* 40 */
  "recursive call could loop indefinitely\0"
  "unrecognized character after (?P\0"
  "syntax error in subpattern name (missing terminator)\0"
  "two named subpatterns have the same name\0"
  "invalid UTF-8 string\0"
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled\0"
  "malformed \\P or \\p sequence\0"
  "unknown property name after \\P or \\p\0"
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */
  "repeated subpattern is too long\0"    /** DEAD **/
  "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
  "internal error: overran compiling workspace\0"
  "internal error: previously-checked referenced subpattern not found\0"
  "DEFINE group contains more than one branch\0"
  /* 55 */
  "repeating a DEFINE group is not allowed\0"  /** DEAD **/
  "inconsistent NEWLINE options\0"
  "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  "a numbered reference must not be zero\0"
  /* The verb names in the next message (and in message 75) are written with
  their leading asterisk, as the verbs appear in a pattern. */
  "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
  /* 60 */
  "(*VERB) not recognized or malformed\0"
  "number is too big\0"
  "subpattern name expected\0"
  "digit expected after (?+\0"
  "] is an invalid data character in JavaScript compatibility mode\0"
  /* 65 */
  "different names for subpatterns of the same number are not allowed\0"
  "(*MARK) must have an argument\0"
  "this version of PCRE is not compiled with Unicode property support\0"
#ifndef EBCDIC
  "\\c must be followed by an ASCII character\0"
#else
  "\\c must be followed by a letter or one of [\\]^_?\0"
#endif
  "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
  /* 70 */
  "internal error: unknown opcode in find_fixedlength()\0"
  "\\N is not supported in a class\0"
  "too many forward references\0"
  "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
  "invalid UTF-16 string\0"
  /* 75 */
  "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
  "character value in \\u.... sequence is too large\0"
  "invalid UTF-32 string\0"
  "setting UTF is disabled by the application\0"
  "non-hex character in \\x{} (closing brace missing?)\0"
  /* 80 */
  "non-octal character in \\o{} (closing brace missing?)\0"
  "missing opening brace after \\o\0"
  "parentheses are too deeply nested\0"
  "invalid range in character class\0"
  "group name must start with a non-digit\0"
  /* 85 */
  "parentheses are too deeply nested (stack check)\0"
  "digits missing in \\x{} or \\o{}\0"
  "regular expression is too complicated\0"
  ;
567
/* Table to identify digits and hex digits. This is used when compiling
patterns. Note that the tables in chartables are dependent on the locale, and
may mark arbitrary characters as digits - but the PCRE compiling code expects
to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
a private table here. It costs 256 bytes, but it is a lot faster than doing
character value tests (at least in some simple cases I timed), and in some
applications one wants PCRE to compile efficiently as well as match
efficiently.

For convenience, we use the same bit definitions as in chartables:

  0x04   decimal digit
  0x08   hexadecimal digit

Then we can use ctype_digit and ctype_xdigit in the code. */
583
/* Using a simple comparison for decimal numbers rather than a memory read
is much faster, and the resulting code is simpler (the compiler turns it
into a subtraction and unsigned comparison). Note that the argument is
evaluated twice, so it must not have side effects. */

#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
589
590	#ifndef EBCDIC
591
592	/ This is the "normal" case, for ASCII systems, and EBCDIC systems running in*
593	UTF-8 mode. /*
594
595	static const pcre_uint8 digitab[] =
596	{
597	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 0- 7 /
598	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 8- 15 /
599	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 16- 23 /
600	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 24- 31 /
601	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / - ' /
602	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / ( - / /
603	`0x0c`,`0x0c`,`0x0c`,`0x0c`,`0x0c`,`0x0c`,`0x0c`,`0x0c`, / 0 - 7 /
604	`0x0c`,`0x0c`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 8 - ? /
605	`0x00`,`0x08`,`0x08`,`0x08`,`0x08`,`0x08`,`0x08`,`0x00`, / @ - G /
606	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / H - O /
607	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / P - W /
608	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / X - _ /
609	`0x00`,`0x08`,`0x08`,`0x08`,`0x08`,`0x08`,`0x08`,`0x00`, / ` - g /
610	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / h - o /
611	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / p - w /
612	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / x -127 /
613	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 128-135 /
614	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 136-143 /
615	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 144-151 /
616	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 152-159 /
617	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 160-167 /
618	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 168-175 /
619	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 176-183 /
620	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 184-191 /
621	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 192-199 /
622	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 200-207 /
623	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 208-215 /
624	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 216-223 /
625	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 224-231 /
626	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 232-239 /
627	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 240-247 /
628	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`};/ 248-255 /
629
630	#else
631
632	/ This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. /
633
634	static const pcre_uint8 digitab[] =
635	{
636	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 0- 7 0 /
637	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 8- 15 /
638	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 16- 23 10 /
639	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 24- 31 /
640	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 32- 39 20 /
641	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 40- 47 /
642	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 48- 55 30 /
643	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 56- 63 /
644	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / - 71 40 /
645	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 72- \| /
646	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / & - 87 50 /
647	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 88- 95 /
648	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / - -103 60 /
649	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 104- ? /
650	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 112-119 70 /
651	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 120- " /
652	`0x00`,`0x08`,`0x08`,`0x08`,`0x08`,`0x08`,`0x08`,`0x00`, / 128- g 80 /
653	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / h -143 /
654	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 144- p 90 /
655	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / q -159 /
656	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 160- x A0 /
657	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / y -175 /
658	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / ^ -183 B0 /
659	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 184-191 /
660	`0x00`,`0x08`,`0x08`,`0x08`,`0x08`,`0x08`,`0x08`,`0x00`, / { - G C0 /
661	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / H -207 /
662	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / } - P D0 /
663	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / Q -223 /
664	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / \ - X E0 /
665	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / Y -239 /
666	`0x0c`,`0x0c`,`0x0c`,`0x0c`,`0x0c`,`0x0c`,`0x0c`,`0x0c`, / 0 - 7 F0 /
667	`0x0c`,`0x0c`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`};/ 8 -255 /
668
669	static const pcre_uint8 ebcdic_chartab[] = { / chartable partial dup /
670	`0x80`,`0x00`,`0x00`,`0x00`,`0x00`,`0x01`,`0x00`,`0x00`, / 0- 7 /
671	`0x00`,`0x00`,`0x00`,`0x00`,`0x01`,`0x01`,`0x00`,`0x00`, / 8- 15 /
672	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x01`,`0x00`,`0x00`, / 16- 23 /
673	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 24- 31 /
674	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x01`,`0x00`,`0x00`, / 32- 39 /
675	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 40- 47 /
676	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 48- 55 /
677	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 56- 63 /
678	`0x01`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / - 71 /
679	`0x00`,`0x00`,`0x00`,`0x80`,`0x00`,`0x80`,`0x80`,`0x80`, / 72- \| /
680	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / & - 87 /
681	`0x00`,`0x00`,`0x00`,`0x80`,`0x80`,`0x80`,`0x00`,`0x00`, / 88- 95 /
682	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / - -103 /
683	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x10`,`0x00`,`0x80`, / 104- ? /
684	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 112-119 /
685	`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 120- " /
686	`0x00`,`0x1a`,`0x1a`,`0x1a`,`0x1a`,`0x1a`,`0x1a`,`0x12`, / 128- g /
687	`0x12`,`0x12`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / h -143 /
688	`0x00`,`0x12`,`0x12`,`0x12`,`0x12`,`0x12`,`0x12`,`0x12`, / 144- p /
689	`0x12`,`0x12`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / q -159 /
690	`0x00`,`0x00`,`0x12`,`0x12`,`0x12`,`0x12`,`0x12`,`0x12`, / 160- x /
691	`0x12`,`0x12`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / y -175 /
692	`0x80`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / ^ -183 /
693	`0x00`,`0x00`,`0x80`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / 184-191 /
694	`0x80`,`0x1a`,`0x1a`,`0x1a`,`0x1a`,`0x1a`,`0x1a`,`0x12`, / { - G /
695	`0x12`,`0x12`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / H -207 /
696	`0x00`,`0x12`,`0x12`,`0x12`,`0x12`,`0x12`,`0x12`,`0x12`, / } - P /
697	`0x12`,`0x12`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / Q -223 /
698	`0x00`,`0x00`,`0x12`,`0x12`,`0x12`,`0x12`,`0x12`,`0x12`, / \ - X /
699	`0x12`,`0x12`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`, / Y -239 /
700	`0x1c`,`0x1c`,`0x1c`,`0x1c`,`0x1c`,`0x1c`,`0x1c`,`0x1c`, / 0 - 7 /
701	`0x1c`,`0x1c`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`,`0x00`};/ 8 -255 /
702	#endif
703
704
705	/ This table is used to check whether auto-possessification is possible*
706	between adjacent character-type opcodes. The left-hand (repeated) opcode is
707	used to select the row, and the right-hand opcode is use to select the column.
708	A value of 1 means that auto-possessification is OK. For example, the second
709	value in the first row means that \D+\d can be turned into \D++\d.
710
711	The Unicode property types (\P and \p) have to be present to fill out the table
712	because of what their opcode values are, but the table values should always be
713	zero because property types are handled separately in the code. The last four
714	columns apply to items that cannot be repeated, so there is no need to have
715	rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
716	not set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
717
718	#define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
719	#define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
720
721	static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
722	/ \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M /
723	{ `0`, `1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `0`, `0` }, / \D /
724	{ `1`, `0`, `0`, `1`, `1`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `1`, `1`, `1` }, / \d /
725	{ `0`, `0`, `0`, `1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `1`, `1`, `1` }, / \S /
726	{ `0`, `1`, `1`, `0`, `0`, `1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `0`, `0` }, / \s /
727	{ `0`, `1`, `0`, `0`, `0`, `1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `0`, `0` }, / \W /
728	{ `0`, `0`, `0`, `1`, `1`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `1`, `1`, `1` }, / \w /
729	{ `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `0`, `0` }, / . /
730	{ `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `0`, `0` }, / .+ /
731	{ `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `0`, `0` }, / \C /
732	{ `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0` }, / \P /
733	{ `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0` }, / \p /
734	{ `0`, `1`, `0`, `1`, `0`, `1`, `1`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `0`, `0`, `0`, `0`, `1`, `0`, `0` }, / \R /
735	{ `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `0`, `0`, `0`, `0`, `1`, `0`, `0` }, / \H /
736	{ `0`, `1`, `1`, `0`, `0`, `1`, `0`, `0`, `0`, `0`, `0`, `1`, `1`, `0`, `0`, `1`, `0`, `0`, `1`, `0`, `0` }, / \h /
737	{ `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `0`, `0`, `0`, `1`, `0`, `0`, `1`, `0`, `0` }, / \V /
738	{ `0`, `1`, `1`, `0`, `0`, `1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `1`, `0`, `0`, `0`, `1`, `0`, `0` }, / \v /
739	{ `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `0`, `0` } / \X /
740	};
741
742
743	/ This table is used to check whether auto-possessification is possible*
744	between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
745	left-hand (repeated) opcode is used to select the row, and the right-hand
746	opcode is used to select the column. The values are as follows:
747
748	0 Always return FALSE (never auto-possessify)
749	1 Character groups are distinct (possessify if both are OP_PROP)
750	2 Check character categories in the same group (general or particular)
751	3 TRUE if the two opcodes are not the same (PROP vs NOTPROP)
752
753	4 Check left general category vs right particular category
754	5 Check right general category vs left particular category
755
756	6 Left alphanum vs right general category
757	7 Left space vs right general category
758	8 Left word vs right general category
759
760	9 Right alphanum vs left general category
761	10 Right space vs left general category
762	11 Right word vs left general category
763
764	12 Left alphanum vs right particular category
765	13 Left space vs right particular category
766	14 Left word vs right particular category
767
768	15 Right alphanum vs left particular category
769	16 Right space vs left particular category
770	17 Right word vs left particular category
771	*/
772
773	static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
774	/ ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC /
775	{ `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0` }, / PT_ANY /
776	{ `0`, `3`, `0`, `0`, `0`, `3`, `1`, `1`, `0`, `0`, `0` }, / PT_LAMP /
777	{ `0`, `0`, `2`, `4`, `0`, `9`, `10`, `10`, `11`, `0`, `0` }, / PT_GC /
778	{ `0`, `0`, `5`, `2`, `0`, `15`, `16`, `16`, `17`, `0`, `0` }, / PT_PC /
779	{ `0`, `0`, `0`, `0`, `2`, `0`, `0`, `0`, `0`, `0`, `0` }, / PT_SC /
780	{ `0`, `3`, `6`, `12`, `0`, `3`, `1`, `1`, `0`, `0`, `0` }, / PT_ALNUM /
781	{ `0`, `1`, `7`, `13`, `0`, `1`, `3`, `3`, `1`, `0`, `0` }, / PT_SPACE /
782	{ `0`, `1`, `7`, `13`, `0`, `1`, `3`, `3`, `1`, `0`, `0` }, / PT_PXSPACE /
783	{ `0`, `0`, `8`, `14`, `0`, `0`, `1`, `1`, `3`, `0`, `0` }, / PT_WORD /
784	{ `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0` }, / PT_CLIST /
785	{ `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `3` } / PT_UCNC /
786	};
787
788	/ This table is used to check whether auto-possessification is possible*
789	between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
790	specifies a general category and the other specifies a particular category. The
791	row is selected by the general category and the column by the particular
792	category. The value is 1 if the particular category is not part of the general
793	category. /*
794
795	static const pcre_uint8 catposstab[`7`][`30`] = {
796	/ Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs /
797	{ `0`, `0`, `0`, `0`, `0`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1` }, / C /
798	{ `1`, `1`, `1`, `1`, `1`, `0`, `0`, `0`, `0`, `0`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1` }, / L /
799	{ `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `0`, `0`, `0`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1` }, / M /
800	{ `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `0`, `0`, `0`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1` }, / N /
801	{ `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `1`, `1`, `1`, `1`, `1`, `1` }, / P /
802	{ `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `0`, `0`, `0`, `0`, `1`, `1`, `1` }, / S /
803	{ `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `0`, `0`, `0` } / Z /
804	};
805
806	/ This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against*
807	a general or particular category. The properties in each row are those
808	that apply to the character set in question. Duplication means that a little
809	unnecessary work is done when checking, but this keeps things much simpler
810	because they can all use the same code. For more details see the comment where
811	this table is used.
812
813	Note: SPACE and PXSPACE used to be different because Perl excluded VT from
814	"space", but from Perl 5.18 it's included, so both categories are treated the
815	same here. /*
816
817	static const pcre_uint8 posspropstab[`3`][`4`] = {
818	{ ucp_L, ucp_N, ucp_N, ucp_Nl }, / ALNUM, 3rd and 4th values redundant /
819	{ ucp_Z, ucp_Z, ucp_C, ucp_Cc }, / SPACE and PXSPACE, 2nd value redundant /
820	{ ucp_L, ucp_N, ucp_P, ucp_Po } / WORD /
821	};
822
823	/ This table is used when converting repeating opcodes into possessified*
824	versions as a result of an explicit possessive quantifier such as ++. A zero
825	value means there is no possessified version - in those cases the item in
826	question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
827	because all relevant opcodes are less than that. /*
828
829	static const pcre_uint8 opcode_possessify[] = {
830	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, / 0 - 15 /
831	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, / 16 - 31 /
832
833	`0`, / NOTI /
834	OP_POSSTAR, `0`, / STAR, MINSTAR /
835	OP_POSPLUS, `0`, / PLUS, MINPLUS /
836	OP_POSQUERY, `0`, / QUERY, MINQUERY /
837	OP_POSUPTO, `0`, / UPTO, MINUPTO /
838	`0`, / EXACT /
839	`0`, `0`, `0`, `0`, / POS{STAR,PLUS,QUERY,UPTO} /
840
841	OP_POSSTARI, `0`, / STARI, MINSTARI /
842	OP_POSPLUSI, `0`, / PLUSI, MINPLUSI /
843	OP_POSQUERYI, `0`, / QUERYI, MINQUERYI /
844	OP_POSUPTOI, `0`, / UPTOI, MINUPTOI /
845	`0`, / EXACTI /
846	`0`, `0`, `0`, `0`, / POS{STARI,PLUSI,QUERYI,UPTOI} /
847
848	OP_NOTPOSSTAR, `0`, / NOTSTAR, NOTMINSTAR /
849	OP_NOTPOSPLUS, `0`, / NOTPLUS, NOTMINPLUS /
850	OP_NOTPOSQUERY, `0`, / NOTQUERY, NOTMINQUERY /
851	OP_NOTPOSUPTO, `0`, / NOTUPTO, NOTMINUPTO /
852	`0`, / NOTEXACT /
853	`0`, `0`, `0`, `0`, / NOTPOS{STAR,PLUS,QUERY,UPTO} /
854
855	OP_NOTPOSSTARI, `0`, / NOTSTARI, NOTMINSTARI /
856	OP_NOTPOSPLUSI, `0`, / NOTPLUSI, NOTMINPLUSI /
857	OP_NOTPOSQUERYI, `0`, / NOTQUERYI, NOTMINQUERYI /
858	OP_NOTPOSUPTOI, `0`, / NOTUPTOI, NOTMINUPTOI /
859	`0`, / NOTEXACTI /
860	`0`, `0`, `0`, `0`, / NOTPOS{STARI,PLUSI,QUERYI,UPTOI} /
861
862	OP_TYPEPOSSTAR, `0`, / TYPESTAR, TYPEMINSTAR /
863	OP_TYPEPOSPLUS, `0`, / TYPEPLUS, TYPEMINPLUS /
864	OP_TYPEPOSQUERY, `0`, / TYPEQUERY, TYPEMINQUERY /
865	OP_TYPEPOSUPTO, `0`, / TYPEUPTO, TYPEMINUPTO /
866	`0`, / TYPEEXACT /
867	`0`, `0`, `0`, `0`, / TYPEPOS{STAR,PLUS,QUERY,UPTO} /
868
869	OP_CRPOSSTAR, `0`, / CRSTAR, CRMINSTAR /
870	OP_CRPOSPLUS, `0`, / CRPLUS, CRMINPLUS /
871	OP_CRPOSQUERY, `0`, / CRQUERY, CRMINQUERY /
872	OP_CRPOSRANGE, `0`, / CRRANGE, CRMINRANGE /
873	`0`, `0`, `0`, `0`, / CRPOS{STAR,PLUS,QUERY,RANGE} /
874
875	`0`, `0`, `0`, / CLASS, NCLASS, XCLASS /
876	`0`, `0`, / REF, REFI /
877	`0`, `0`, / DNREF, DNREFI /
878	`0`, `0` / RECURSE, CALLOUT /
879	};
880
881
882
883	/*************************************************
884	* Find an error text *
885	*************************************************/
886
887	/ The error texts are now all in one long string, to save on relocations. As*
888	some of the text is of unknown length, we can't use a table of offsets.
889	Instead, just count through the strings. This is not a performance issue
890	because it happens only when there has been a compilation error.
891
892	Argument: the error number
893	Returns: pointer to the error string
894	*/
895
896	static const char *
897	find_error_text(int n)
898	{
899	const char *s = error_texts;
900	for (; n > `0`; n--)
901	{
902	while (*s++ != CHAR_NULL) {};
903	if (s == CHAR_NULL) return* "Error text not found (please report)";
904	}
905	return s;
906	}
907
908
909
910	/*************************************************
911	* Expand the workspace *
912	*************************************************/
913
914	/ This function is called during the second compiling phase, if the number of*
915	forward references fills the existing workspace, which is originally a block on
916	the stack. A larger block is obtained from malloc() unless the ultimate limit
917	has been reached or the increase will be rather small.
918
919	Argument: pointer to the compile data block
920	Returns: 0 if all went well, else an error number
921	*/
922
923	static int
924	expand_workspace(compile_data *cd)
925	{
926	pcre_uchar *newspace;
927	int newsize = cd->workspace_size * `2`;
928
929	if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
930	if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX \|\|
931	newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
932	return ERR72;
933
934	newspace = (PUBL(malloc))(IN_UCHARS(newsize));
935	if (newspace == NULL) return ERR21;
936	memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
937	cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
938	if (cd->workspace_size > COMPILE_WORK_SIZE)
939	(PUBL(free))((void *)cd->start_workspace);
940	cd->start_workspace = newspace;
941	cd->workspace_size = newsize;
942	return `0`;
943	}
944
945
946
947	/*************************************************
948	* Check for counted repeat *
949	*************************************************/
950
951	/ This function is called when a '{' is encountered in a place where it might*
952	start a quantifier. It looks ahead to see if it really is a quantifier or not.
953	It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
954	where the ddds are digits.
955
956	Arguments:
957	p pointer to the first char after '{'
958
959	Returns: TRUE or FALSE
960	*/
961
962	static BOOL
963	is_counted_repeat(const pcre_uchar *p)
964	{
965	if (!IS_DIGIT(p)) return* FALSE;
966	p++;
967	while (IS_DIGIT(*p)) p++;
968	if (p == CHAR_RIGHT_CURLY_BRACKET) return* TRUE;
969
970	if (p++ != CHAR_COMMA) return* FALSE;
971	if (p == CHAR_RIGHT_CURLY_BRACKET) return* TRUE;
972
973	if (!IS_DIGIT(p)) return* FALSE;
974	p++;
975	while (IS_DIGIT(*p)) p++;
976
977	return (*p == CHAR_RIGHT_CURLY_BRACKET);
978	}
979
980
981
982	/*************************************************
983	* Handle escapes *
984	*************************************************/
985
986	/ This function is called when a \ has been encountered. It either returns a*
987	positive value for a simple escape such as \n, or 0 for a data character which
988	will be placed in chptr. A backreference to group n is returned as negative n.
989	When UTF-8 is enabled, a positive value greater than 255 may be returned in
990	chptr. On entry, ptr is pointing at the \. On exit, it is on the final
991	character of the escape sequence.
992
993	Arguments:
994	ptrptr points to the pattern position pointer
995	chptr points to a returned data character
996	errorcodeptr points to the errorcode variable
997	bracount number of previous extracting brackets
998	options the options bits
999	isclass TRUE if inside a character class
1000
1001	Returns: zero => a data character
1002	positive => a special escape sequence
1003	negative => a back reference
1004	on error, errorcodeptr is set
1005	*/
1006
1007	static int
1008	check_escape(const pcre_uchar *ptrptr, pcre_uint32 chptr, int *errorcodeptr,
1009	int bracount, int options, BOOL isclass)
1010	{
1011	/ PCRE_UTF16 has the same value as PCRE_UTF8. /
1012	BOOL utf = (options & PCRE_UTF8) != `0`;
1013	const pcre_uchar ptr = ptrptr + `1`;
1014	pcre_uint32 c;
1015	int escape = `0`;
1016	int i;
1017
1018	GETCHARINCTEST(c, ptr); / Get character value, increment pointer /
1019	ptr--; / Set pointer back to the last byte /
1020
1021	/ If backslash is at the end of the pattern, it's an error. /
1022
1023	if (c == CHAR_NULL) *errorcodeptr = ERR1;
1024
1025	/ Non-alphanumerics are literals. For digits or letters, do an initial lookup*
1026	in a table. A non-zero result is something that can be returned immediately.
1027	Otherwise further processing may be required. /*
1028
1029	#ifndef EBCDIC /* ASCII/UTF-8 coding */
1030	/ Not alphanumeric /
1031	else if (c < CHAR_0 \|\| c > CHAR_z) {}
1032	else if ((i = escapes[c - CHAR_0]) != `0`)
1033	{ if (i > `0`) c = (pcre_uint32)i; else escape = -i; }
1034
1035	#else /* EBCDIC coding */
1036	/ Not alphanumeric /
1037	else if (c < CHAR_a \|\| (!MAX_255(c) \|\| (ebcdic_chartab[c] & `0x0E`) == `0`)) {}
1038	else if ((i = escapes[c - `0x48`]) != `0`) { if (i > `0`) c = (pcre_uint32)i; else escape = -i; }
1039	#endif
1040
1041	/ Escapes that need further processing, or are illegal. /
1042
1043	else
1044	{
1045	const pcre_uchar *oldptr;
1046	BOOL braced, negated, overflow;
1047	int s;
1048
1049	switch (c)
1050	{
1051	/ A number of Perl escapes are not handled by PCRE. We give an explicit*
1052	error. /*
1053
1054	case CHAR_l:
1055	case CHAR_L:
1056	*errorcodeptr = ERR37;
1057	break;
1058
1059	case CHAR_u:
1060	if ((options & PCRE_JAVASCRIPT_COMPAT) != `0`)
1061	{
1062	/ In JavaScript, \u must be followed by four hexadecimal numbers.*
1063	Otherwise it is a lowercase u letter. /*
1064	if (MAX_255(ptr[`1`]) && (digitab[ptr[`1`]] & ctype_xdigit) != `0`
1065	&& MAX_255(ptr[`2`]) && (digitab[ptr[`2`]] & ctype_xdigit) != `0`
1066	&& MAX_255(ptr[`3`]) && (digitab[ptr[`3`]] & ctype_xdigit) != `0`
1067	&& MAX_255(ptr[`4`]) && (digitab[ptr[`4`]] & ctype_xdigit) != `0`)
1068	{
1069	c = `0`;
1070	for (i = `0`; i < `4`; ++i)
1071	{
1072	register pcre_uint32 cc = *(++ptr);
1073	#ifndef EBCDIC /* ASCII/UTF-8 coding */
1074	if (cc >= CHAR_a) cc -= `32`; / Convert to upper case /
1075	c = (c << `4`) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - `10`));
1076	#else /* EBCDIC coding */
1077	if (cc >= CHAR_a && cc <= CHAR_z) cc += `64`; / Convert to upper case /
1078	c = (c << `4`) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - `10`));
1079	#endif
1080	}
1081
1082	#if defined COMPILE_PCRE8
1083	if (c > (utf ? `0x10ffffU` : `0xffU`))
1084	#elif defined COMPILE_PCRE16
1085	if (c > (utf ? `0x10ffffU` : `0xffffU`))
1086	#elif defined COMPILE_PCRE32
1087	if (utf && c > `0x10ffffU`)
1088	#endif
1089	{
1090	*errorcodeptr = ERR76;
1091	}
1092	else if (utf && c >= `0xd800` && c <= `0xdfff`) *errorcodeptr = ERR73;
1093	}
1094	}
1095	else
1096	*errorcodeptr = ERR37;
1097	break;
1098
1099	case CHAR_U:
1100	/ In JavaScript, \U is an uppercase U letter. /
1101	if ((options & PCRE_JAVASCRIPT_COMPAT) == `0`) *errorcodeptr = ERR37;
1102	break;
1103
1104	/ In a character class, \g is just a literal "g". Outside a character*
1105	class, \g must be followed by one of a number of specific things:
1106
1107	(1) A number, either plain or braced. If positive, it is an absolute
1108	backreference. If negative, it is a relative backreference. This is a Perl
1109	5.10 feature.
1110
1111	(2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1112	is part of Perl's movement towards a unified syntax for back references. As
1113	this is synonymous with \k{name}, we fudge it up by pretending it really
1114	was \k.
1115
1116	(3) For Oniguruma compatibility we also support \g followed by a name or a
1117	number either in angle brackets or in single quotes. However, these are
1118	(possibly recursive) subroutine calls, _not_ backreferences. Just return
1119	the ESC_g code (cf \k). /*
1120
1121	case CHAR_g:
1122	if (isclass) break;
1123	if (ptr[`1`] == CHAR_LESS_THAN_SIGN \|\| ptr[`1`] == CHAR_APOSTROPHE)
1124	{
1125	escape = ESC_g;
1126	break;
1127	}
1128
1129	/ Handle the Perl-compatible cases /
1130
1131	if (ptr[`1`] == CHAR_LEFT_CURLY_BRACKET)
1132	{
1133	const pcre_uchar *p;
1134	for (p = ptr+`2`; p != CHAR_NULL && p != CHAR_RIGHT_CURLY_BRACKET; p++)
1135	if (p != CHAR_MINUS && !IS_DIGIT(p)) break;
1136	if (p != CHAR_NULL && p != CHAR_RIGHT_CURLY_BRACKET)
1137	{
1138	escape = ESC_k;
1139	break;
1140	}
1141	braced = TRUE;
1142	ptr++;
1143	}
1144	else braced = FALSE;
1145
1146	if (ptr[`1`] == CHAR_MINUS)
1147	{
1148	negated = TRUE;
1149	ptr++;
1150	}
1151	else negated = FALSE;
1152
1153	/ The integer range is limited by the machine's int representation. /
1154	s = `0`;
1155	overflow = FALSE;
1156	while (IS_DIGIT(ptr[`1`]))
1157	{
1158	if (s > INT_MAX / `10` - `1`) / Integer overflow /
1159	{
1160	overflow = TRUE;
1161	break;
1162	}
1163	s = s * `10` + (int)(*(++ptr) - CHAR_0);
1164	}
1165	if (overflow) / Integer overflow /
1166	{
1167	while (IS_DIGIT(ptr[`1`]))
1168	ptr++;
1169	*errorcodeptr = ERR61;
1170	break;
1171	}
1172
1173	if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1174	{
1175	*errorcodeptr = ERR57;
1176	break;
1177	}
1178
1179	if (s == `0`)
1180	{
1181	*errorcodeptr = ERR58;
1182	break;
1183	}
1184
1185	if (negated)
1186	{
1187	if (s > bracount)
1188	{
1189	*errorcodeptr = ERR15;
1190	break;
1191	}
1192	s = bracount - (s - `1`);
1193	}
1194
1195	escape = -s;
1196	break;
1197
1198	/ The handling of escape sequences consisting of a string of digits*
1199	starting with one that is not zero is not straightforward. Perl has changed
1200	over the years. Nowadays \g{} for backreferences and \o{} for octal are
1201	recommended to avoid the ambiguities in the old syntax.
1202
1203	Outside a character class, the digits are read as a decimal number. If the
1204	number is less than 8 (used to be 10), or if there are that many previous
1205	extracting left brackets, then it is a back reference. Otherwise, up to
1206	three octal digits are read to form an escaped byte. Thus \123 is likely to
1207	be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1208	the octal value is greater than 377, the least significant 8 bits are
1209	taken. \8 and \9 are treated as the literal characters 8 and 9.
1210
1211	Inside a character class, \ followed by a digit is always either a literal
1212	8 or 9 or an octal number. /*
1213
1214	case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1215	case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1216
1217	if (!isclass)
1218	{
1219	oldptr = ptr;
1220	/ The integer range is limited by the machine's int representation. /
1221	s = (int)(c -CHAR_0);
1222	overflow = FALSE;
1223	while (IS_DIGIT(ptr[`1`]))
1224	{
1225	if (s > INT_MAX / `10` - `1`) / Integer overflow /
1226	{
1227	overflow = TRUE;
1228	break;
1229	}
1230	s = s * `10` + (int)(*(++ptr) - CHAR_0);
1231	}
1232	if (overflow) / Integer overflow /
1233	{
1234	while (IS_DIGIT(ptr[`1`]))
1235	ptr++;
1236	*errorcodeptr = ERR61;
1237	break;
1238	}
1239	if (s < `8` \|\| s <= bracount) / Check for back reference /
1240	{
1241	escape = -s;
1242	break;
1243	}
1244	ptr = oldptr; / Put the pointer back and fall through /
1245	}
1246
1247	/ Handle a digit following \ when the number is not a back reference. If*
1248	the first digit is 8 or 9, Perl used to generate a binary zero byte and
1249	then treat the digit as a following literal. At least by Perl 5.18 this
1250	changed so as not to insert the binary zero. /*
1251
1252	if ((c = ptr) >= CHAR_8) break*;
1253
1254	/ Fall through with a digit less than 8 /
1255
1256	/ \0 always starts an octal number, but we may drop through to here with a*
1257	larger first octal digit. The original code used just to take the least
1258	significant 8 bits of octal numbers (I think this is what early Perls used
1259	to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1260	but no more than 3 octal digits. /*
1261
1262	case CHAR_0:
1263	c -= CHAR_0;
1264	while(i++ < `2` && ptr[`1`] >= CHAR_0 && ptr[`1`] <= CHAR_7)
1265	c = c * `8` + *(++ptr) - CHAR_0;
1266	#ifdef COMPILE_PCRE8
1267	if (!utf && c > `0xff`) *errorcodeptr = ERR51;
1268	#endif
1269	break;
1270
1271	/ \o is a relatively new Perl feature, supporting a more general way of*
1272	specifying character codes in octal. The only supported form is \o{ddd}. /*
1273
1274	case CHAR_o:
1275	if (ptr[`1`] != CHAR_LEFT_CURLY_BRACKET) errorcodeptr = ERR81; else*
1276	if (ptr[`2`] == CHAR_RIGHT_CURLY_BRACKET) errorcodeptr = ERR86; else*
1277	{
1278	ptr += `2`;
1279	c = `0`;
1280	overflow = FALSE;
1281	while (ptr >= CHAR_0 && ptr <= CHAR_7)
1282	{
1283	register pcre_uint32 cc = *ptr++;
1284	if (c == `0` && cc == CHAR_0) continue; / Leading zeroes /
1285	#ifdef COMPILE_PCRE32
1286	if (c >= `0x20000000l`) { overflow = TRUE; break; }
1287	#endif
1288	c = (c << `3`) + cc - CHAR_0 ;
1289	#if defined COMPILE_PCRE8
1290	if (c > (utf ? `0x10ffffU` : `0xffU`)) { overflow = TRUE; break; }
1291	#elif defined COMPILE_PCRE16
1292	if (c > (utf ? `0x10ffffU` : `0xffffU`)) { overflow = TRUE; break; }
1293	#elif defined COMPILE_PCRE32
1294	if (utf && c > `0x10ffffU`) { overflow = TRUE; break; }
1295	#endif
1296	}
1297	if (overflow)
1298	{
1299	while (ptr >= CHAR_0 && ptr <= CHAR_7) ptr++;
1300	*errorcodeptr = ERR34;
1301	}
1302	else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1303	{
1304	if (utf && c >= `0xd800` && c <= `0xdfff`) *errorcodeptr = ERR73;
1305	}
1306	else *errorcodeptr = ERR80;
1307	}
1308	break;
1309
1310	/ \x is complicated. In JavaScript, \x must be followed by two hexadecimal*
1311	numbers. Otherwise it is a lowercase x letter. /*
1312
1313	case CHAR_x:
1314	if ((options & PCRE_JAVASCRIPT_COMPAT) != `0`)
1315	{
1316	if (MAX_255(ptr[`1`]) && (digitab[ptr[`1`]] & ctype_xdigit) != `0`
1317	&& MAX_255(ptr[`2`]) && (digitab[ptr[`2`]] & ctype_xdigit) != `0`)
1318	{
1319	c = `0`;
1320	for (i = `0`; i < `2`; ++i)
1321	{
1322	register pcre_uint32 cc = *(++ptr);
1323	#ifndef EBCDIC /* ASCII/UTF-8 coding */
1324	if (cc >= CHAR_a) cc -= `32`; / Convert to upper case /
1325	c = (c << `4`) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - `10`));
1326	#else /* EBCDIC coding */
1327	if (cc >= CHAR_a && cc <= CHAR_z) cc += `64`; / Convert to upper case /
1328	c = (c << `4`) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - `10`));
1329	#endif
1330	}
1331	}
1332	} / End JavaScript handling /
1333
1334	/ Handle \x in Perl's style. \x{ddd} is a character number which can be*
1335	greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1336	digits. If not, { used to be treated as a data character. However, Perl
1337	seems to read hex digits up to the first non-such, and ignore the rest, so
1338	that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1339	now gives an error. /*
1340
1341	else
1342	{
1343	if (ptr[`1`] == CHAR_LEFT_CURLY_BRACKET)
1344	{
1345	ptr += `2`;
1346	if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1347	{
1348	*errorcodeptr = ERR86;
1349	break;
1350	}
1351	c = `0`;
1352	overflow = FALSE;
1353	while (MAX_255(ptr) && (digitab[ptr] & ctype_xdigit) != `0`)
1354	{
1355	register pcre_uint32 cc = *ptr++;
1356	if (c == `0` && cc == CHAR_0) continue; / Leading zeroes /
1357
1358	#ifdef COMPILE_PCRE32
1359	if (c >= `0x10000000l`) { overflow = TRUE; break; }
1360	#endif
1361
1362	#ifndef EBCDIC /* ASCII/UTF-8 coding */
1363	if (cc >= CHAR_a) cc -= `32`; / Convert to upper case /
1364	c = (c << `4`) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - `10`));
1365	#else /* EBCDIC coding */
1366	if (cc >= CHAR_a && cc <= CHAR_z) cc += `64`; / Convert to upper case /
1367	c = (c << `4`) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - `10`));
1368	#endif
1369
1370	#if defined COMPILE_PCRE8
1371	if (c > (utf ? `0x10ffffU` : `0xffU`)) { overflow = TRUE; break; }
1372	#elif defined COMPILE_PCRE16
1373	if (c > (utf ? `0x10ffffU` : `0xffffU`)) { overflow = TRUE; break; }
1374	#elif defined COMPILE_PCRE32
1375	if (utf && c > `0x10ffffU`) { overflow = TRUE; break; }
1376	#endif
1377	}
1378
1379	if (overflow)
1380	{
1381	while (MAX_255(ptr) && (digitab[ptr] & ctype_xdigit) != `0`) ptr++;
1382	*errorcodeptr = ERR34;
1383	}
1384
1385	else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1386	{
1387	if (utf && c >= `0xd800` && c <= `0xdfff`) *errorcodeptr = ERR73;
1388	}
1389
1390	/ If the sequence of hex digits does not end with '}', give an error.*
1391	We used just to recognize this construct and fall through to the normal
1392	\x handling, but nowadays Perl gives an error, which seems much more
1393	sensible, so we do too. /*
1394
1395	else *errorcodeptr = ERR79;
1396	} / End of \x{} processing /
1397
1398	/ Read a single-byte hex-defined char (up to two hex digits after \x) /
1399
1400	else
1401	{
1402	c = `0`;
1403	while (i++ < `2` && MAX_255(ptr[`1`]) && (digitab[ptr[`1`]] & ctype_xdigit) != `0`)
1404	{
1405	pcre_uint32 cc; / Some compilers don't like /
1406	cc = (++ptr); /* ++ in initializers /
1407	#ifndef EBCDIC /* ASCII/UTF-8 coding */
1408	if (cc >= CHAR_a) cc -= `32`; / Convert to upper case /
1409	c = c * `16` + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - `10`));
1410	#else /* EBCDIC coding */
1411	if (cc <= CHAR_z) cc += `64`; / Convert to upper case /
1412	c = c * `16` + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - `10`));
1413	#endif
1414	}
1415	} / End of \xdd handling /
1416	} / End of Perl-style \x handling /
1417	break;
1418
1419	/ For \c, a following letter is upper-cased; then the 0x40 bit is flipped.*
1420	An error is given if the byte following \c is not an ASCII character. This
1421	coding is ASCII-specific, but then the whole concept of \cx is
1422	ASCII-specific. (However, an EBCDIC equivalent has now been added.) /*
1423
1424	case CHAR_c:
1425	c = *(++ptr);
1426	if (c == CHAR_NULL)
1427	{
1428	*errorcodeptr = ERR2;
1429	break;
1430	}
1431	#ifndef EBCDIC /* ASCII/UTF-8 coding */
1432	if (c > `127`) / Excludes all non-ASCII in either mode /
1433	{
1434	*errorcodeptr = ERR68;
1435	break;
1436	}
1437	if (c >= CHAR_a && c <= CHAR_z) c -= `32`;
1438	c ^= `0x40`;
1439	#else /* EBCDIC coding */
1440	if (c >= CHAR_a && c <= CHAR_z) c += `64`;
1441	if (c == CHAR_QUESTION_MARK)
1442	c = (`'\\'` == `188` && '`' == `74`)? `0x5f` : `0xff`;
1443	else
1444	{
1445	for (i = `0`; i < `32`; i++)
1446	{
1447	if (c == ebcdic_escape_c[i]) break;
1448	}
1449	if (i < `32`) c = i; else *errorcodeptr = ERR68;
1450	}
1451	#endif
1452	break;
1453
1454	/ PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any*
1455	other alphanumeric following \ is an error if PCRE_EXTRA was set;
1456	otherwise, for Perl compatibility, it is a literal. This code looks a bit
1457	odd, but there used to be some cases other than the default, and there may
1458	be again in future, so I haven't "optimized" it. /*
1459
1460	default:
1461	if ((options & PCRE_EXTRA) != `0`) switch(c)
1462	{
1463	default:
1464	*errorcodeptr = ERR3;
1465	break;
1466	}
1467	break;
1468	}
1469	}
1470
1471	/ Perl supports \N{name} for character names, as well as plain \N for "not*
1472	newline". PCRE does not support \N{name}. However, it does support
1473	quantification such as \N{2,3}. /*
1474
1475	if (escape == ESC_N && ptr[`1`] == CHAR_LEFT_CURLY_BRACKET &&
1476	!is_counted_repeat(ptr+`2`))
1477	*errorcodeptr = ERR37;
1478
1479	/ If PCRE_UCP is set, we change the values for \d etc. /
1480
1481	if ((options & PCRE_UCP) != `0` && escape >= ESC_D && escape <= ESC_w)
1482	escape += (ESC_DU - ESC_D);
1483
1484	/ Set the pointer to the final character before returning. /
1485
1486	*ptrptr = ptr;
1487	*chptr = c;
1488	return escape;
1489	}
1490
1491
1492
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, *ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence that was consumed (or at the character where scanning stopped
if an error was detected).

The property name is either a single character, or a name enclosed in curly
brackets which may be preceded by a circumflex to request negation. The name
is looked up in the sorted property table by binary chop.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  ptypeptr       points to an unsigned int that is set to the type value
  pdataptr       points to an unsigned int that is set to the detailed
                   property value
  errorcodeptr   points to the error code variable; set to ERR46 when the
                   sequence is malformed and to ERR47 when the name is not
                   found in the table

Returns:         TRUE if the type value was found, or FALSE for an invalid type
*/

static BOOL
get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
  unsigned int *pdataptr, int *errorcodeptr)
{
pcre_uchar c;
int i, bot, top;
const pcre_uchar *ptr = *ptrptr;
pcre_uchar name[32];

c = *(++ptr);
if (c == CHAR_NULL) goto ERROR_RETURN;

*negptr = FALSE;

/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
negation. */

if (c == CHAR_LEFT_CURLY_BRACKET)
  {
  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    ptr++;
    }

  /* Copy at most sizeof(name)-1 characters, leaving room for the terminating
  zero. If the loop runs to completion without seeing the closing bracket, c
  is not '}' and the check after the loop rejects the over-long name. */

  for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
    {
    c = *(++ptr);
    if (c == CHAR_NULL) goto ERROR_RETURN;
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
    name[i] = c;
    }
  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = c;
  name[1] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = PRIV(utt_size);

while (bot < top)
  {
  int r;
  i = (bot + top) >> 1;
  r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
  if (r == 0)
    {
    *ptypeptr = PRIV(utt)[i].type;
    *pdataptr = PRIV(utt)[i].value;
    return TRUE;
    }
  if (r > 0) bot = i + 1; else top = i;
  }

/* The name was well formed but is not in the table. */

*errorcodeptr = ERR47;
*ptrptr = ptr;
return FALSE;

/* The sequence itself was malformed. */

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return FALSE;
}
#endif
1587
1588
1589
1590	/*************************************************
1591	* Read repeat counts *
1592	*************************************************/
1593
1594	/ Read an item of the form {n,m} and return the values. This is called only*
1595	after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1596	so the syntax is guaranteed to be correct, but we need to check the values.
1597
1598	Arguments:
1599	p pointer to first char after '{'
1600	minp pointer to int for min
1601	maxp pointer to int for max
1602	returned as -1 if no max
1603	errorcodeptr points to error code variable
1604
1605	Returns: pointer to '}' on success;
1606	current ptr on error, with errorcodeptr set non-zero
1607	*/
1608
1609	static const pcre_uchar *
1610	read_repeat_counts(const pcre_uchar p, int* minp, int* maxp, int* *errorcodeptr)
1611	{
1612	int min = `0`;
1613	int max = -`1`;
1614
1615	while (IS_DIGIT(*p))
1616	{
1617	min = min * `10` + (int)(*p++ - CHAR_0);
1618	if (min > `65535`)
1619	{
1620	*errorcodeptr = ERR5;
1621	return p;
1622	}
1623	}
1624
1625	if (p == CHAR_RIGHT_CURLY_BRACKET) max = min; else*
1626	{
1627	if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1628	{
1629	max = `0`;
1630	while(IS_DIGIT(*p))
1631	{
1632	max = max * `10` + (int)(*p++ - CHAR_0);
1633	if (max > `65535`)
1634	{
1635	*errorcodeptr = ERR5;
1636	return p;
1637	}
1638	}
1639	if (max < min)
1640	{
1641	*errorcodeptr = ERR4;
1642	return p;
1643	}
1644	}
1645	}
1646
1647	*minp = min;
1648	*maxp = max;
1649	return p;
1650	}
1651
1652
1653
1654	/*************************************************
1655	* Find first significant op code *
1656	*************************************************/
1657
1658	/ This is called by several functions that scan a compiled expression looking*
1659	for a fixed first character, or an anchoring op code etc. It skips over things
1660	that do not influence this. For some calls, it makes sense to skip negative
1661	forward and all backward assertions, and also the \b assertion; for others it
1662	does not.
1663
1664	Arguments:
1665	code pointer to the start of the group
1666	skipassert TRUE if certain assertions are to be skipped
1667
1668	Returns: pointer to the first significant opcode
1669	*/
1670
1671	static const pcre_uchar*
1672	first_significant_code(const pcre_uchar *code, BOOL skipassert)
1673	{
1674	for (;;)
1675	{
1676	switch ((int)*code)
1677	{
1678	case OP_ASSERT_NOT:
1679	case OP_ASSERTBACK:
1680	case OP_ASSERTBACK_NOT:
1681	if (!skipassert) return code;
1682	do code += GET(code, `1`); while (*code == OP_ALT);
1683	code += PRIV(OP_lengths)[*code];
1684	break;
1685
1686	case OP_WORD_BOUNDARY:
1687	case OP_NOT_WORD_BOUNDARY:
1688	if (!skipassert) return code;
1689	/ Fall through /
1690
1691	case OP_CALLOUT:
1692	case OP_CREF:
1693	case OP_DNCREF:
1694	case OP_RREF:
1695	case OP_DNRREF:
1696	case OP_DEF:
1697	code += PRIV(OP_lengths)[*code];
1698	break;
1699
1700	default:
1701	return code;
1702	}
1703	}
1704	/ Control never reaches here /
1705	}
1706
1707
1708
1709	/*************************************************
1710	* Find the fixed length of a branch *
1711	*************************************************/
1712
1713	/ Scan a branch and compute the fixed length of subject that will match it,*
1714	if the length is fixed. This is needed for dealing with backward assertions.
1715	In UTF8 mode, the result is in characters rather than bytes. The branch is
1716	temporarily terminated with OP_END when this function is called.
1717
1718	This function is called when a backward assertion is encountered, so that if it
1719	fails, the error message can point to the correct place in the pattern.
1720	However, we cannot do this when the assertion contains subroutine calls,
1721	because they can be forward references. We solve this by remembering this case
1722	and doing the check at the end; a flag specifies which mode we are running in.
1723
1724	Arguments:
1725	code points to the start of the pattern (the bracket)
1726	utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1727	atend TRUE if called when the pattern is complete
1728	cd the "compile data" structure
1729	recurses chain of recurse_check to catch mutual recursion
1730
1731	Returns: the fixed length,
1732	or -1 if there is no fixed length,
1733	or -2 if \C was encountered (in UTF-8 mode only)
1734	or -3 if an OP_RECURSE item was encountered and atend is FALSE
1735	or -4 if an unknown opcode was encountered (internal error)
1736	*/
1737
1738	static int
1739	find_fixedlength(pcre_uchar code, BOOL utf, BOOL atend, compile_data cd,
1740	recurse_check *recurses)
1741	{
1742	int length = -`1`;
1743	recurse_check this_recurse;
1744	register int branchlength = `0`;
1745	register pcre_uchar *cc = code + `1` + LINK_SIZE;
1746
1747	/ Scan along the opcodes for this branch. If we get to the end of the*
1748	branch, check the length against that of the other branches. /*
1749
1750	for (;;)
1751	{
1752	int d;
1753	pcre_uchar ce, cs;
1754	register pcre_uchar op = *cc;
1755
1756	switch (op)
1757	{
1758	/ We only need to continue for OP_CBRA (normal capturing bracket) and*
1759	OP_BRA (normal non-capturing bracket) because the other variants of these
1760	opcodes are all concerned with unlimited repeated groups, which of course
1761	are not of fixed length. /*
1762
1763	case OP_CBRA:
1764	case OP_BRA:
1765	case OP_ONCE:
1766	case OP_ONCE_NC:
1767	case OP_COND:
1768	d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : `0`), utf, atend, cd,
1769	recurses);
1770	if (d < `0`) return d;
1771	branchlength += d;
1772	do cc += GET(cc, `1`); while (*cc == OP_ALT);
1773	cc += `1` + LINK_SIZE;
1774	break;
1775
1776	/ Reached end of a branch; if it's a ket it is the end of a nested call.*
1777	If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1778	an ALT. If it is END it's the end of the outer call. All can be handled by
1779	the same code. Note that we must not include the OP_KETRxxx opcodes here,
1780	because they all imply an unlimited repeat. /*
1781
1782	case OP_ALT:
1783	case OP_KET:
1784	case OP_END:
1785	case OP_ACCEPT:
1786	case OP_ASSERT_ACCEPT:
1787	if (length < `0`) length = branchlength;
1788	else if (length != branchlength) return -`1`;
1789	if (cc != OP_ALT) return* length;
1790	cc += `1` + LINK_SIZE;
1791	branchlength = `0`;
1792	break;
1793
1794	/ A true recursion implies not fixed length, but a subroutine call may*
1795	be OK. If the subroutine is a forward reference, we can't deal with
1796	it until the end of the pattern, so return -3. /*
1797
1798	case OP_RECURSE:
1799	if (!atend) return -`3`;
1800	cs = ce = (pcre_uchar )cd->start_code + GET(cc, `1`); /* Start subpattern /
1801	do ce += GET(ce, `1`); while (ce == OP_ALT); /* End subpattern /
1802	if (cc > cs && cc < ce) return -`1`; / Recursion /
1803	else / Check for mutual recursion /
1804	{
1805	recurse_check *r = recurses;
1806	for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
1807	if (r != NULL) return -`1`; / Mutual recursion /
1808	}
1809	this_recurse.prev = recurses;
1810	this_recurse.group = cs;
1811	d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
1812	if (d < `0`) return d;
1813	branchlength += d;
1814	cc += `1` + LINK_SIZE;
1815	break;
1816
1817	/ Skip over assertive subpatterns /
1818
1819	case OP_ASSERT:
1820	case OP_ASSERT_NOT:
1821	case OP_ASSERTBACK:
1822	case OP_ASSERTBACK_NOT:
1823	do cc += GET(cc, `1`); while (*cc == OP_ALT);
1824	cc += `1` + LINK_SIZE;
1825	break;
1826
1827	/ Skip over things that don't match chars /
1828
1829	case OP_MARK:
1830	case OP_PRUNE_ARG:
1831	case OP_SKIP_ARG:
1832	case OP_THEN_ARG:
1833	cc += cc[`1`] + PRIV(OP_lengths)[*cc];
1834	break;
1835
1836	case OP_CALLOUT:
1837	case OP_CIRC:
1838	case OP_CIRCM:
1839	case OP_CLOSE:
1840	case OP_COMMIT:
1841	case OP_CREF:
1842	case OP_DEF:
1843	case OP_DNCREF:
1844	case OP_DNRREF:
1845	case OP_DOLL:
1846	case OP_DOLLM:
1847	case OP_EOD:
1848	case OP_EODN:
1849	case OP_FAIL:
1850	case OP_NOT_WORD_BOUNDARY:
1851	case OP_PRUNE:
1852	case OP_REVERSE:
1853	case OP_RREF:
1854	case OP_SET_SOM:
1855	case OP_SKIP:
1856	case OP_SOD:
1857	case OP_SOM:
1858	case OP_THEN:
1859	case OP_WORD_BOUNDARY:
1860	cc += PRIV(OP_lengths)[*cc];
1861	break;
1862
1863	/ Handle literal characters /
1864
1865	case OP_CHAR:
1866	case OP_CHARI:
1867	case OP_NOT:
1868	case OP_NOTI:
1869	branchlength++;
1870	cc += `2`;
1871	#ifdef SUPPORT_UTF
1872	if (utf && HAS_EXTRALEN(cc[-`1`])) cc += GET_EXTRALEN(cc[-`1`]);
1873	#endif
1874	break;
1875
1876	/ Handle exact repetitions. The count is already in characters, but we*
1877	need to skip over a multibyte character in UTF8 mode. /*
1878
1879	case OP_EXACT:
1880	case OP_EXACTI:
1881	case OP_NOTEXACT:
1882	case OP_NOTEXACTI:
1883	branchlength += (int)GET2(cc,`1`);
1884	cc += `2` + IMM2_SIZE;
1885	#ifdef SUPPORT_UTF
1886	if (utf && HAS_EXTRALEN(cc[-`1`])) cc += GET_EXTRALEN(cc[-`1`]);
1887	#endif
1888	break;
1889
1890	case OP_TYPEEXACT:
1891	branchlength += GET2(cc,`1`);
1892	if (cc[`1` + IMM2_SIZE] == OP_PROP \|\| cc[`1` + IMM2_SIZE] == OP_NOTPROP)
1893	cc += `2`;
1894	cc += `1` + IMM2_SIZE + `1`;
1895	break;
1896
1897	/ Handle single-char matchers /
1898
1899	case OP_PROP:
1900	case OP_NOTPROP:
1901	cc += `2`;
1902	/ Fall through /
1903
1904	case OP_HSPACE:
1905	case OP_VSPACE:
1906	case OP_NOT_HSPACE:
1907	case OP_NOT_VSPACE:
1908	case OP_NOT_DIGIT:
1909	case OP_DIGIT:
1910	case OP_NOT_WHITESPACE:
1911	case OP_WHITESPACE:
1912	case OP_NOT_WORDCHAR:
1913	case OP_WORDCHAR:
1914	case OP_ANY:
1915	case OP_ALLANY:
1916	branchlength++;
1917	cc++;
1918	break;
1919
1920	/ The single-byte matcher isn't allowed. This only happens in UTF-8 mode;*
1921	otherwise \C is coded as OP_ALLANY. /*
1922
1923	case OP_ANYBYTE:
1924	return -`2`;
1925
1926	/ Check a class for variable quantification /
1927
1928	case OP_CLASS:
1929	case OP_NCLASS:
1930	#if defined SUPPORT_UTF \|\| defined COMPILE_PCRE16 \|\| defined COMPILE_PCRE32
1931	case OP_XCLASS:
1932	/ The original code caused an unsigned overflow in 64 bit systems,*
1933	so now we use a conditional statement. /*
1934	if (op == OP_XCLASS)
1935	cc += GET(cc, `1`);
1936	else
1937	cc += PRIV(OP_lengths)[OP_CLASS];
1938	#else
1939	cc += PRIV(OP_lengths)[OP_CLASS];
1940	#endif
1941
1942	switch (*cc)
1943	{
1944	case OP_CRSTAR:
1945	case OP_CRMINSTAR:
1946	case OP_CRPLUS:
1947	case OP_CRMINPLUS:
1948	case OP_CRQUERY:
1949	case OP_CRMINQUERY:
1950	case OP_CRPOSSTAR:
1951	case OP_CRPOSPLUS:
1952	case OP_CRPOSQUERY:
1953	return -`1`;
1954
1955	case OP_CRRANGE:
1956	case OP_CRMINRANGE:
1957	case OP_CRPOSRANGE:
1958	if (GET2(cc,`1`) != GET2(cc,`1`+IMM2_SIZE)) return -`1`;
1959	branchlength += (int)GET2(cc,`1`);
1960	cc += `1` + `2` * IMM2_SIZE;
1961	break;
1962
1963	default:
1964	branchlength++;
1965	}
1966	break;
1967
1968	/ Anything else is variable length /
1969
1970	case OP_ANYNL:
1971	case OP_BRAMINZERO:
1972	case OP_BRAPOS:
1973	case OP_BRAPOSZERO:
1974	case OP_BRAZERO:
1975	case OP_CBRAPOS:
1976	case OP_EXTUNI:
1977	case OP_KETRMAX:
1978	case OP_KETRMIN:
1979	case OP_KETRPOS:
1980	case OP_MINPLUS:
1981	case OP_MINPLUSI:
1982	case OP_MINQUERY:
1983	case OP_MINQUERYI:
1984	case OP_MINSTAR:
1985	case OP_MINSTARI:
1986	case OP_MINUPTO:
1987	case OP_MINUPTOI:
1988	case OP_NOTMINPLUS:
1989	case OP_NOTMINPLUSI:
1990	case OP_NOTMINQUERY:
1991	case OP_NOTMINQUERYI:
1992	case OP_NOTMINSTAR:
1993	case OP_NOTMINSTARI:
1994	case OP_NOTMINUPTO:
1995	case OP_NOTMINUPTOI:
1996	case OP_NOTPLUS:
1997	case OP_NOTPLUSI:
1998	case OP_NOTPOSPLUS:
1999	case OP_NOTPOSPLUSI:
2000	case OP_NOTPOSQUERY:
2001	case OP_NOTPOSQUERYI:
2002	case OP_NOTPOSSTAR:
2003	case OP_NOTPOSSTARI:
2004	case OP_NOTPOSUPTO:
2005	case OP_NOTPOSUPTOI:
2006	case OP_NOTQUERY:
2007	case OP_NOTQUERYI:
2008	case OP_NOTSTAR:
2009	case OP_NOTSTARI:
2010	case OP_NOTUPTO:
2011	case OP_NOTUPTOI:
2012	case OP_PLUS:
2013	case OP_PLUSI:
2014	case OP_POSPLUS:
2015	case OP_POSPLUSI:
2016	case OP_POSQUERY:
2017	case OP_POSQUERYI:
2018	case OP_POSSTAR:
2019	case OP_POSSTARI:
2020	case OP_POSUPTO:
2021	case OP_POSUPTOI:
2022	case OP_QUERY:
2023	case OP_QUERYI:
2024	case OP_REF:
2025	case OP_REFI:
2026	case OP_DNREF:
2027	case OP_DNREFI:
2028	case OP_SBRA:
2029	case OP_SBRAPOS:
2030	case OP_SCBRA:
2031	case OP_SCBRAPOS:
2032	case OP_SCOND:
2033	case OP_SKIPZERO:
2034	case OP_STAR:
2035	case OP_STARI:
2036	case OP_TYPEMINPLUS:
2037	case OP_TYPEMINQUERY:
2038	case OP_TYPEMINSTAR:
2039	case OP_TYPEMINUPTO:
2040	case OP_TYPEPLUS:
2041	case OP_TYPEPOSPLUS:
2042	case OP_TYPEPOSQUERY:
2043	case OP_TYPEPOSSTAR:
2044	case OP_TYPEPOSUPTO:
2045	case OP_TYPEQUERY:
2046	case OP_TYPESTAR:
2047	case OP_TYPEUPTO:
2048	case OP_UPTO:
2049	case OP_UPTOI:
2050	return -`1`;
2051
2052	/ Catch unrecognized opcodes so that when new ones are added they*
2053	are not forgotten, as has happened in the past. /*
2054
2055	default:
2056	return -`4`;
2057	}
2058	}
2059	/ Control never gets here /
2060	}
2061
2062
2063
2064	/*************************************************
2065	* Scan compiled regex for specific bracket *
2066	*************************************************/
2067
2068	/ This little function scans through a compiled pattern until it finds a*
2069	capturing bracket with the given number, or, if the number is negative, an
2070	instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2071	so that it can be called from pcre_study() when finding the minimum matching
2072	length.
2073
2074	Arguments:
2075	code points to start of expression
2076	utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2077	number the required bracket number or negative to find a lookbehind
2078
2079	Returns: pointer to the opcode for the bracket, or NULL if not found
2080	*/
2081
2082	const pcre_uchar *
2083	PRIV(find_bracket)(const pcre_uchar code, BOOL utf, int* number)
2084	{
2085	for (;;)
2086	{
2087	register pcre_uchar c = *code;
2088
2089	if (c == OP_END) return NULL;
2090
2091	/ XCLASS is used for classes that cannot be represented just by a bit*
2092	map. This includes negated single high-valued characters. The length in
2093	the table is zero; the actual length is stored in the compiled code. /*
2094
2095	if (c == OP_XCLASS) code += GET(code, `1`);
2096
2097	/ Handle recursion /
2098
2099	else if (c == OP_REVERSE)
2100	{
2101	if (number < `0`) return (pcre_uchar *)code;
2102	code += PRIV(OP_lengths)[c];
2103	}
2104
2105	/ Handle capturing bracket /
2106
2107	else if (c == OP_CBRA \|\| c == OP_SCBRA \|\|
2108	c == OP_CBRAPOS \|\| c == OP_SCBRAPOS)
2109	{
2110	int n = (int)GET2(code, `1`+LINK_SIZE);
2111	if (n == number) return (pcre_uchar *)code;
2112	code += PRIV(OP_lengths)[c];
2113	}
2114
2115	/ Otherwise, we can get the item's length from the table, except that for*
2116	repeated character types, we have to test for \p and \P, which have an extra
2117	two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2118	must add in its length. /*
2119
2120	else
2121	{
2122	switch(c)
2123	{
2124	case OP_TYPESTAR:
2125	case OP_TYPEMINSTAR:
2126	case OP_TYPEPLUS:
2127	case OP_TYPEMINPLUS:
2128	case OP_TYPEQUERY:
2129	case OP_TYPEMINQUERY:
2130	case OP_TYPEPOSSTAR:
2131	case OP_TYPEPOSPLUS:
2132	case OP_TYPEPOSQUERY:
2133	if (code[`1`] == OP_PROP \|\| code[`1`] == OP_NOTPROP) code += `2`;
2134	break;
2135
2136	case OP_TYPEUPTO:
2137	case OP_TYPEMINUPTO:
2138	case OP_TYPEEXACT:
2139	case OP_TYPEPOSUPTO:
2140	if (code[`1` + IMM2_SIZE] == OP_PROP \|\| code[`1` + IMM2_SIZE] == OP_NOTPROP)
2141	code += `2`;
2142	break;
2143
2144	case OP_MARK:
2145	case OP_PRUNE_ARG:
2146	case OP_SKIP_ARG:
2147	case OP_THEN_ARG:
2148	code += code[`1`];
2149	break;
2150	}
2151
2152	/ Add in the fixed length from the table /
2153
2154	code += PRIV(OP_lengths)[c];
2155
2156	/ In UTF-8 mode, opcodes that are followed by a character may be followed by*
2157	a multi-byte character. The length in the table is a minimum, so we have to
2158	arrange to skip the extra bytes. /*
2159
2160	#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2161	if (utf) switch(c)
2162	{
2163	case OP_CHAR:
2164	case OP_CHARI:
2165	case OP_NOT:
2166	case OP_NOTI:
2167	case OP_EXACT:
2168	case OP_EXACTI:
2169	case OP_NOTEXACT:
2170	case OP_NOTEXACTI:
2171	case OP_UPTO:
2172	case OP_UPTOI:
2173	case OP_NOTUPTO:
2174	case OP_NOTUPTOI:
2175	case OP_MINUPTO:
2176	case OP_MINUPTOI:
2177	case OP_NOTMINUPTO:
2178	case OP_NOTMINUPTOI:
2179	case OP_POSUPTO:
2180	case OP_POSUPTOI:
2181	case OP_NOTPOSUPTO:
2182	case OP_NOTPOSUPTOI:
2183	case OP_STAR:
2184	case OP_STARI:
2185	case OP_NOTSTAR:
2186	case OP_NOTSTARI:
2187	case OP_MINSTAR:
2188	case OP_MINSTARI:
2189	case OP_NOTMINSTAR:
2190	case OP_NOTMINSTARI:
2191	case OP_POSSTAR:
2192	case OP_POSSTARI:
2193	case OP_NOTPOSSTAR:
2194	case OP_NOTPOSSTARI:
2195	case OP_PLUS:
2196	case OP_PLUSI:
2197	case OP_NOTPLUS:
2198	case OP_NOTPLUSI:
2199	case OP_MINPLUS:
2200	case OP_MINPLUSI:
2201	case OP_NOTMINPLUS:
2202	case OP_NOTMINPLUSI:
2203	case OP_POSPLUS:
2204	case OP_POSPLUSI:
2205	case OP_NOTPOSPLUS:
2206	case OP_NOTPOSPLUSI:
2207	case OP_QUERY:
2208	case OP_QUERYI:
2209	case OP_NOTQUERY:
2210	case OP_NOTQUERYI:
2211	case OP_MINQUERY:
2212	case OP_MINQUERYI:
2213	case OP_NOTMINQUERY:
2214	case OP_NOTMINQUERYI:
2215	case OP_POSQUERY:
2216	case OP_POSQUERYI:
2217	case OP_NOTPOSQUERY:
2218	case OP_NOTPOSQUERYI:
2219	if (HAS_EXTRALEN(code[-`1`])) code += GET_EXTRALEN(code[-`1`]);
2220	break;
2221	}
2222	#else
2223	(void)(utf); / Keep compiler happy by referencing function argument /
2224	#endif
2225	}
2226	}
2227	}
2228
2229
2230
2231	/*************************************************
2232	* Scan compiled regex for recursion reference *
2233	*************************************************/
2234
2235	/ This little function scans through a compiled pattern until it finds an*
2236	instance of OP_RECURSE.
2237
2238	Arguments:
2239	code points to start of expression
2240	utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2241
2242	Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2243	*/
2244
2245	static const pcre_uchar *
2246	find_recurse(const pcre_uchar *code, BOOL utf)
2247	{
2248	for (;;)
2249	{
2250	register pcre_uchar c = *code;
2251	if (c == OP_END) return NULL;
2252	if (c == OP_RECURSE) return code;
2253
2254	/ XCLASS is used for classes that cannot be represented just by a bit*
2255	map. This includes negated single high-valued characters. The length in
2256	the table is zero; the actual length is stored in the compiled code. /*
2257
2258	if (c == OP_XCLASS) code += GET(code, `1`);
2259
2260	/ Otherwise, we can get the item's length from the table, except that for*
2261	repeated character types, we have to test for \p and \P, which have an extra
2262	two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2263	must add in its length. /*
2264
2265	else
2266	{
2267	switch(c)
2268	{
2269	case OP_TYPESTAR:
2270	case OP_TYPEMINSTAR:
2271	case OP_TYPEPLUS:
2272	case OP_TYPEMINPLUS:
2273	case OP_TYPEQUERY:
2274	case OP_TYPEMINQUERY:
2275	case OP_TYPEPOSSTAR:
2276	case OP_TYPEPOSPLUS:
2277	case OP_TYPEPOSQUERY:
2278	if (code[`1`] == OP_PROP \|\| code[`1`] == OP_NOTPROP) code += `2`;
2279	break;
2280
2281	case OP_TYPEPOSUPTO:
2282	case OP_TYPEUPTO:
2283	case OP_TYPEMINUPTO:
2284	case OP_TYPEEXACT:
2285	if (code[`1` + IMM2_SIZE] == OP_PROP \|\| code[`1` + IMM2_SIZE] == OP_NOTPROP)
2286	code += `2`;
2287	break;
2288
2289	case OP_MARK:
2290	case OP_PRUNE_ARG:
2291	case OP_SKIP_ARG:
2292	case OP_THEN_ARG:
2293	code += code[`1`];
2294	break;
2295	}
2296
2297	/ Add in the fixed length from the table /
2298
2299	code += PRIV(OP_lengths)[c];
2300
2301	/ In UTF-8 mode, opcodes that are followed by a character may be followed*
2302	by a multi-byte character. The length in the table is a minimum, so we have
2303	to arrange to skip the extra bytes. /*
2304
2305	#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2306	if (utf) switch(c)
2307	{
2308	case OP_CHAR:
2309	case OP_CHARI:
2310	case OP_NOT:
2311	case OP_NOTI:
2312	case OP_EXACT:
2313	case OP_EXACTI:
2314	case OP_NOTEXACT:
2315	case OP_NOTEXACTI:
2316	case OP_UPTO:
2317	case OP_UPTOI:
2318	case OP_NOTUPTO:
2319	case OP_NOTUPTOI:
2320	case OP_MINUPTO:
2321	case OP_MINUPTOI:
2322	case OP_NOTMINUPTO:
2323	case OP_NOTMINUPTOI:
2324	case OP_POSUPTO:
2325	case OP_POSUPTOI:
2326	case OP_NOTPOSUPTO:
2327	case OP_NOTPOSUPTOI:
2328	case OP_STAR:
2329	case OP_STARI:
2330	case OP_NOTSTAR:
2331	case OP_NOTSTARI:
2332	case OP_MINSTAR:
2333	case OP_MINSTARI:
2334	case OP_NOTMINSTAR:
2335	case OP_NOTMINSTARI:
2336	case OP_POSSTAR:
2337	case OP_POSSTARI:
2338	case OP_NOTPOSSTAR:
2339	case OP_NOTPOSSTARI:
2340	case OP_PLUS:
2341	case OP_PLUSI:
2342	case OP_NOTPLUS:
2343	case OP_NOTPLUSI:
2344	case OP_MINPLUS:
2345	case OP_MINPLUSI:
2346	case OP_NOTMINPLUS:
2347	case OP_NOTMINPLUSI:
2348	case OP_POSPLUS:
2349	case OP_POSPLUSI:
2350	case OP_NOTPOSPLUS:
2351	case OP_NOTPOSPLUSI:
2352	case OP_QUERY:
2353	case OP_QUERYI:
2354	case OP_NOTQUERY:
2355	case OP_NOTQUERYI:
2356	case OP_MINQUERY:
2357	case OP_MINQUERYI:
2358	case OP_NOTMINQUERY:
2359	case OP_NOTMINQUERYI:
2360	case OP_POSQUERY:
2361	case OP_POSQUERYI:
2362	case OP_NOTPOSQUERY:
2363	case OP_NOTPOSQUERYI:
2364	if (HAS_EXTRALEN(code[-`1`])) code += GET_EXTRALEN(code[-`1`]);
2365	break;
2366	}
2367	#else
2368	(void)(utf); / Keep compiler happy by referencing function argument /
2369	#endif
2370	}
2371	}
2372	}
2373
2374
2375
2376	/*************************************************
2377	* Scan compiled branch for non-emptiness *
2378	*************************************************/
2379
2380	/ This function scans through a branch of a compiled pattern to see whether it*
2381	can match the empty string or not. It is called from could_be_empty()
2382	below and from compile_branch() when checking for an unlimited repeat of a
2383	group that can match nothing. Note that first_significant_code() skips over
2384	backward and negative forward assertions when its final argument is TRUE. If we
2385	hit an unclosed bracket, we return "empty" - this means we've struck an inner
2386	bracket whose current branch will already have been scanned.
2387
2388	Arguments:
2389	code points to start of search
2390	endcode points to where to stop
2391	utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2392	cd contains pointers to tables etc.
2393	recurses chain of recurse_check to catch mutual recursion
2394
2395	Returns: TRUE if what is matched could be empty
2396	*/
2397
2398	static BOOL
2399	could_be_empty_branch(const pcre_uchar code, const* pcre_uchar *endcode,
2400	BOOL utf, compile_data cd, recurse_check recurses)
2401	{
2402	register pcre_uchar c;
2403	recurse_check this_recurse;
2404
2405	for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2406	code < endcode;
2407	code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2408	{
2409	const pcre_uchar *ccode;
2410
2411	c = *code;
2412
2413	/ Skip over forward assertions; the other assertions are skipped by*
2414	first_significant_code() with a TRUE final argument. /*
2415
2416	if (c == OP_ASSERT)
2417	{
2418	do code += GET(code, `1`); while (*code == OP_ALT);
2419	c = *code;
2420	continue;
2421	}
2422
2423	/ For a recursion/subroutine call, if its end has been reached, which*
2424	implies a backward reference subroutine call, we can scan it. If it's a
2425	forward reference subroutine call, we can't. To detect forward reference
2426	we have to scan up the list that is kept in the workspace. This function is
2427	called only when doing the real compile, not during the pre-compile that
2428	measures the size of the compiled pattern. /*
2429
2430	if (c == OP_RECURSE)
2431	{
2432	const pcre_uchar *scode = cd->start_code + GET(code, `1`);
2433	const pcre_uchar *endgroup = scode;
2434	BOOL empty_branch;
2435
2436	/ Test for forward reference or uncompleted reference. This is disabled*
2437	when called to scan a completed pattern by setting cd->start_workspace to
2438	NULL. /*
2439
2440	if (cd->start_workspace != NULL)
2441	{
2442	const pcre_uchar *tcode;
2443	for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2444	if ((int)GET(tcode, `0`) == (int)(code + `1` - cd->start_code)) return TRUE;
2445	if (GET(scode, `1`) == `0`) return TRUE; / Unclosed /
2446	}
2447
2448	/ If the reference is to a completed group, we need to detect whether this*
2449	is a recursive call, as otherwise there will be an infinite loop. If it is
2450	a recursion, just skip over it. Simple recursions are easily detected. For
2451	mutual recursions we keep a chain on the stack. /*
2452
2453	do endgroup += GET(endgroup, `1`); while (*endgroup == OP_ALT);
2454	if (code >= scode && code <= endgroup) continue; / Simple recursion /
2455	else
2456	{
2457	recurse_check *r = recurses;
2458	for (r = recurses; r != NULL; r = r->prev)
2459	if (r->group == scode) break;
2460	if (r != NULL) continue; / Mutual recursion /
2461	}
2462
2463	/ Completed reference; scan the referenced group, remembering it on the*
2464	stack chain to detect mutual recursions. /*
2465
2466	empty_branch = FALSE;
2467	this_recurse.prev = recurses;
2468	this_recurse.group = scode;
2469
2470	do
2471	{
2472	if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2473	{
2474	empty_branch = TRUE;
2475	break;
2476	}
2477	scode += GET(scode, `1`);
2478	}
2479	while (*scode == OP_ALT);
2480
2481	if (!empty_branch) return FALSE; / All branches are non-empty /
2482	continue;
2483	}
2484
2485	/ Groups with zero repeats can of course be empty; skip them. /
2486
2487	if (c == OP_BRAZERO \|\| c == OP_BRAMINZERO \|\| c == OP_SKIPZERO \|\|
2488	c == OP_BRAPOSZERO)
2489	{
2490	code += PRIV(OP_lengths)[c];
2491	do code += GET(code, `1`); while (*code == OP_ALT);
2492	c = *code;
2493	continue;
2494	}
2495
2496	/ A nested group that is already marked as "could be empty" can just be*
2497	skipped. /*
2498
2499	if (c == OP_SBRA \|\| c == OP_SBRAPOS \|\|
2500	c == OP_SCBRA \|\| c == OP_SCBRAPOS)
2501	{
2502	do code += GET(code, `1`); while (*code == OP_ALT);
2503	c = *code;
2504	continue;
2505	}
2506
2507	/ For other groups, scan the branches. /
2508
2509	if (c == OP_BRA \|\| c == OP_BRAPOS \|\|
2510	c == OP_CBRA \|\| c == OP_CBRAPOS \|\|
2511	c == OP_ONCE \|\| c == OP_ONCE_NC \|\|
2512	c == OP_COND \|\| c == OP_SCOND)
2513	{
2514	BOOL empty_branch;
2515	if (GET(code, `1`) == `0`) return TRUE; / Hit unclosed bracket /
2516
2517	/ If a conditional group has only one branch, there is a second, implied,*
2518	empty branch, so just skip over the conditional, because it could be empty.
2519	Otherwise, scan the individual branches of the group. /*
2520
2521	if (c == OP_COND && code[GET(code, `1`)] != OP_ALT)
2522	code += GET(code, `1`);
2523	else
2524	{
2525	empty_branch = FALSE;
2526	do
2527	{
2528	if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
2529	recurses)) empty_branch = TRUE;
2530	code += GET(code, `1`);
2531	}
2532	while (*code == OP_ALT);
2533	if (!empty_branch) return FALSE; / All branches are non-empty /
2534	}
2535
2536	c = *code;
2537	continue;
2538	}
2539
2540	/ Handle the other opcodes /
2541
2542	switch (c)
2543	{
2544	/ Check for quantifiers after a class. XCLASS is used for classes that*
2545	cannot be represented just by a bit map. This includes negated single
2546	high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2547	actual length is stored in the compiled code, so we must update "code"
2548	here. /*
2549
2550	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
2551	case OP_XCLASS:
2552	ccode = code += GET(code, `1`);
2553	goto CHECK_CLASS_REPEAT;
2554	#endif
2555
2556	case OP_CLASS:
2557	case OP_NCLASS:
2558	ccode = code + PRIV(OP_lengths)[OP_CLASS];
2559
2560	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
2561	CHECK_CLASS_REPEAT:
2562	#endif
2563
2564	switch (*ccode)
2565	{
2566	case OP_CRSTAR: / These could be empty; continue /
2567	case OP_CRMINSTAR:
2568	case OP_CRQUERY:
2569	case OP_CRMINQUERY:
2570	case OP_CRPOSSTAR:
2571	case OP_CRPOSQUERY:
2572	break;
2573
2574	default: / Non-repeat => class must match /
2575	case OP_CRPLUS: / These repeats aren't empty /
2576	case OP_CRMINPLUS:
2577	case OP_CRPOSPLUS:
2578	return FALSE;
2579
2580	case OP_CRRANGE:
2581	case OP_CRMINRANGE:
2582	case OP_CRPOSRANGE:
2583	if (GET2(ccode, `1`) > `0`) return FALSE; / Minimum > 0 /
2584	break;
2585	}
2586	break;
2587
2588	/ Opcodes that must match a character /
2589
2590	case OP_ANY:
2591	case OP_ALLANY:
2592	case OP_ANYBYTE:
2593
2594	case OP_PROP:
2595	case OP_NOTPROP:
2596	case OP_ANYNL:
2597
2598	case OP_NOT_HSPACE:
2599	case OP_HSPACE:
2600	case OP_NOT_VSPACE:
2601	case OP_VSPACE:
2602	case OP_EXTUNI:
2603
2604	case OP_NOT_DIGIT:
2605	case OP_DIGIT:
2606	case OP_NOT_WHITESPACE:
2607	case OP_WHITESPACE:
2608	case OP_NOT_WORDCHAR:
2609	case OP_WORDCHAR:
2610
2611	case OP_CHAR:
2612	case OP_CHARI:
2613	case OP_NOT:
2614	case OP_NOTI:
2615
2616	case OP_PLUS:
2617	case OP_PLUSI:
2618	case OP_MINPLUS:
2619	case OP_MINPLUSI:
2620
2621	case OP_NOTPLUS:
2622	case OP_NOTPLUSI:
2623	case OP_NOTMINPLUS:
2624	case OP_NOTMINPLUSI:
2625
2626	case OP_POSPLUS:
2627	case OP_POSPLUSI:
2628	case OP_NOTPOSPLUS:
2629	case OP_NOTPOSPLUSI:
2630
2631	case OP_EXACT:
2632	case OP_EXACTI:
2633	case OP_NOTEXACT:
2634	case OP_NOTEXACTI:
2635
2636	case OP_TYPEPLUS:
2637	case OP_TYPEMINPLUS:
2638	case OP_TYPEPOSPLUS:
2639	case OP_TYPEEXACT:
2640
2641	return FALSE;
2642
2643	/ These are going to continue, as they may be empty, but we have to*
2644	fudge the length for the \p and \P cases. /*
2645
2646	case OP_TYPESTAR:
2647	case OP_TYPEMINSTAR:
2648	case OP_TYPEPOSSTAR:
2649	case OP_TYPEQUERY:
2650	case OP_TYPEMINQUERY:
2651	case OP_TYPEPOSQUERY:
2652	if (code[`1`] == OP_PROP \|\| code[`1`] == OP_NOTPROP) code += `2`;
2653	break;
2654
2655	/ Same for these /
2656
2657	case OP_TYPEUPTO:
2658	case OP_TYPEMINUPTO:
2659	case OP_TYPEPOSUPTO:
2660	if (code[`1` + IMM2_SIZE] == OP_PROP \|\| code[`1` + IMM2_SIZE] == OP_NOTPROP)
2661	code += `2`;
2662	break;
2663
2664	/ End of branch /
2665
2666	case OP_KET:
2667	case OP_KETRMAX:
2668	case OP_KETRMIN:
2669	case OP_KETRPOS:
2670	case OP_ALT:
2671	return TRUE;
2672
2673	/ In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,*
2674	MINUPTO, and POSUPTO and their caseless and negative versions may be
2675	followed by a multibyte character. /*
2676
2677	#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2678	case OP_STAR:
2679	case OP_STARI:
2680	case OP_NOTSTAR:
2681	case OP_NOTSTARI:
2682
2683	case OP_MINSTAR:
2684	case OP_MINSTARI:
2685	case OP_NOTMINSTAR:
2686	case OP_NOTMINSTARI:
2687
2688	case OP_POSSTAR:
2689	case OP_POSSTARI:
2690	case OP_NOTPOSSTAR:
2691	case OP_NOTPOSSTARI:
2692
2693	case OP_QUERY:
2694	case OP_QUERYI:
2695	case OP_NOTQUERY:
2696	case OP_NOTQUERYI:
2697
2698	case OP_MINQUERY:
2699	case OP_MINQUERYI:
2700	case OP_NOTMINQUERY:
2701	case OP_NOTMINQUERYI:
2702
2703	case OP_POSQUERY:
2704	case OP_POSQUERYI:
2705	case OP_NOTPOSQUERY:
2706	case OP_NOTPOSQUERYI:
2707
2708	if (utf && HAS_EXTRALEN(code[`1`])) code += GET_EXTRALEN(code[`1`]);
2709	break;
2710
2711	case OP_UPTO:
2712	case OP_UPTOI:
2713	case OP_NOTUPTO:
2714	case OP_NOTUPTOI:
2715
2716	case OP_MINUPTO:
2717	case OP_MINUPTOI:
2718	case OP_NOTMINUPTO:
2719	case OP_NOTMINUPTOI:
2720
2721	case OP_POSUPTO:
2722	case OP_POSUPTOI:
2723	case OP_NOTPOSUPTO:
2724	case OP_NOTPOSUPTOI:
2725
2726	if (utf && HAS_EXTRALEN(code[`1` + IMM2_SIZE])) code += GET_EXTRALEN(code[`1` + IMM2_SIZE]);
2727	break;
2728	#endif
2729
2730	/ MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument*
2731	string. /*
2732
2733	case OP_MARK:
2734	case OP_PRUNE_ARG:
2735	case OP_SKIP_ARG:
2736	case OP_THEN_ARG:
2737	code += code[`1`];
2738	break;
2739
2740	/ None of the remaining opcodes are required to match a character. /
2741
2742	default:
2743	break;
2744	}
2745	}
2746
2747	return TRUE;
2748	}
2749
2750
2751
2752	/*************************************************
2753	* Scan compiled regex for non-emptiness *
2754	*************************************************/
2755
2756	/ This function is called to check for left recursive calls. We want to check*
2757	the current branch of the current pattern to see if it could match the empty
2758	string. If it could, we must look outwards for branches at other levels,
2759	stopping when we pass beyond the bracket which is the subject of the recursion.
2760	This function is called only during the real compile, not during the
2761	pre-compile.
2762
2763	Arguments:
2764	code points to start of the recursion
2765	endcode points to where to stop (current RECURSE item)
2766	bcptr points to the chain of current (unclosed) branch starts
2767	utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2768	cd pointers to tables etc
2769
2770	Returns: TRUE if what is matched could be empty
2771	*/
2772
2773	static BOOL
2774	could_be_empty(const pcre_uchar code, const* pcre_uchar *endcode,
2775	branch_chain bcptr, BOOL utf, compile_data cd)
2776	{
2777	while (bcptr != NULL && bcptr->current_branch >= code)
2778	{
2779	if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2780	return FALSE;
2781	bcptr = bcptr->outer;
2782	}
2783	return TRUE;
2784	}
2785
2786
2787
2788	/*************************************************
2789	* Base opcode of repeated opcodes *
2790	*************************************************/
2791
2792	/ Returns the base opcode for repeated single character type opcodes. If the*
2793	opcode is not a repeated character type, it returns with the original value.
2794
2795	Arguments: c opcode
2796	Returns: base opcode for the type
2797	*/
2798
2799	static pcre_uchar
2800	get_repeat_base(pcre_uchar c)
2801	{
2802	return (c > OP_TYPEPOSUPTO)? c :
2803	(c >= OP_TYPESTAR)? OP_TYPESTAR :
2804	(c >= OP_NOTSTARI)? OP_NOTSTARI :
2805	(c >= OP_NOTSTAR)? OP_NOTSTAR :
2806	(c >= OP_STARI)? OP_STARI :
2807	OP_STAR;
2808	}
2809
2810
2811
#ifdef SUPPORT_UCP
/*************************************************
*        Check a character and a property        *
*************************************************/

/* This function is called by check_auto_possessive() when a property item
is adjacent to a fixed character. For every property type the result is
computed as "(character has the property) == negated", so TRUE means that the
character cannot be matched by the (possibly negated) property item.

Arguments:
  c            the character
  ptype        the property type
  pdata        the data for the type
  negated      TRUE if it's a negated property (\P or \p{^)

Returns:       TRUE if auto-possessifying is OK
*/

static BOOL
check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
  BOOL negated)
{
const pcre_uint32 *p;
const ucd_record *prop = GET_UCD(c);

switch(ptype)
  {
  case PT_LAMP:
  return (prop->chartype == ucp_Lu ||
          prop->chartype == ucp_Ll ||
          prop->chartype == ucp_Lt) == negated;

  case PT_GC:
  return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;

  case PT_PC:
  return (pdata == prop->chartype) == negated;

  case PT_SC:
  return (pdata == prop->script) == negated;

  /* These are specials */

  case PT_ALNUM:
  return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
          PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;

  /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
  means that Perl space and POSIX space are now identical. PCRE was changed
  at release 8.34. */

  case PT_SPACE:    /* Perl space */
  case PT_PXSPACE:  /* POSIX space */
  switch(c)
    {
    HSPACE_CASES:
    VSPACE_CASES:
    return negated;

    default:
    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
    }
  break;  /* Control never reaches here */

  case PT_WORD:
  return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
          PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
          c == CHAR_UNDERSCORE) == negated;

  /* Scan the caseless set of the character. The loop relies on the set being
  in ascending order: it stops with "not found" as soon as an entry greater
  than c is seen. The entries must be read through the pointer; comparing c
  with the pointer itself is meaningless. */

  case PT_CLIST:
  p = PRIV(ucd_caseless_sets) + prop->caseset;
  for (;;)
    {
    if (c < *p) return !negated;
    if (c == *p++) return negated;
    }
  break;  /* Control never reaches here */
  }

return FALSE;
}
#endif  /* SUPPORT_UCP */
2893
2894
2895
2896	/*************************************************
2897	* Fill the character property list *
2898	*************************************************/
2899
2900	/ Checks whether the code points to an opcode that can take part in auto-*
2901	possessification, and if so, fills a list with its properties.
2902
2903	Arguments:
2904	code points to start of expression
2905	utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2906	fcc points to case-flipping table
2907	list points to output list
2908	list[0] will be filled with the opcode
2909	list[1] will be non-zero if this opcode
2910	can match an empty character string
2911	list[2..7] depends on the opcode
2912
2913	Returns: points to the start of the next opcode if code is accepted*
2914	NULL if code is not accepted*
2915	*/
2916
2917	static const pcre_uchar *
2918	get_chr_property_list(const pcre_uchar *code, BOOL utf,
2919	const pcre_uint8 fcc, pcre_uint32 list)
2920	{
2921	pcre_uchar c = *code;
2922	pcre_uchar base;
2923	const pcre_uchar *end;
2924	pcre_uint32 chr;
2925
2926	#ifdef SUPPORT_UCP
2927	pcre_uint32 *clist_dest;
2928	const pcre_uint32 *clist_src;
2929	#else
2930	utf = utf; / Suppress "unused parameter" compiler warning /
2931	#endif
2932
2933	list[`0`] = c;
2934	list[`1`] = FALSE;
2935	code++;
2936
2937	if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2938	{
2939	base = get_repeat_base(c);
2940	c -= (base - OP_STAR);
2941
2942	if (c == OP_UPTO \|\| c == OP_MINUPTO \|\| c == OP_EXACT \|\| c == OP_POSUPTO)
2943	code += IMM2_SIZE;
2944
2945	list[`1`] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2946
2947	switch(base)
2948	{
2949	case OP_STAR:
2950	list[`0`] = OP_CHAR;
2951	break;
2952
2953	case OP_STARI:
2954	list[`0`] = OP_CHARI;
2955	break;
2956
2957	case OP_NOTSTAR:
2958	list[`0`] = OP_NOT;
2959	break;
2960
2961	case OP_NOTSTARI:
2962	list[`0`] = OP_NOTI;
2963	break;
2964
2965	case OP_TYPESTAR:
2966	list[`0`] = *code;
2967	code++;
2968	break;
2969	}
2970	c = list[`0`];
2971	}
2972
2973	switch(c)
2974	{
2975	case OP_NOT_DIGIT:
2976	case OP_DIGIT:
2977	case OP_NOT_WHITESPACE:
2978	case OP_WHITESPACE:
2979	case OP_NOT_WORDCHAR:
2980	case OP_WORDCHAR:
2981	case OP_ANY:
2982	case OP_ALLANY:
2983	case OP_ANYNL:
2984	case OP_NOT_HSPACE:
2985	case OP_HSPACE:
2986	case OP_NOT_VSPACE:
2987	case OP_VSPACE:
2988	case OP_EXTUNI:
2989	case OP_EODN:
2990	case OP_EOD:
2991	case OP_DOLL:
2992	case OP_DOLLM:
2993	return code;
2994
2995	case OP_CHAR:
2996	case OP_NOT:
2997	GETCHARINCTEST(chr, code);
2998	list[`2`] = chr;
2999	list[`3`] = NOTACHAR;
3000	return code;
3001
3002	case OP_CHARI:
3003	case OP_NOTI:
3004	list[`0`] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
3005	GETCHARINCTEST(chr, code);
3006	list[`2`] = chr;
3007
3008	#ifdef SUPPORT_UCP
3009	if (chr < `128` \|\| (chr < `256` && !utf))
3010	list[`3`] = fcc[chr];
3011	else
3012	list[`3`] = UCD_OTHERCASE(chr);
3013	#elif defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
3014	list[`3`] = (chr < `256`) ? fcc[chr] : chr;
3015	#else
3016	list[`3`] = fcc[chr];
3017	#endif
3018
3019	/ The othercase might be the same value. /
3020
3021	if (chr == list[`3`])
3022	list[`3`] = NOTACHAR;
3023	else
3024	list[`4`] = NOTACHAR;
3025	return code;
3026
3027	#ifdef SUPPORT_UCP
3028	case OP_PROP:
3029	case OP_NOTPROP:
3030	if (code[`0`] != PT_CLIST)
3031	{
3032	list[`2`] = code[`0`];
3033	list[`3`] = code[`1`];
3034	return code + `2`;
3035	}
3036
3037	/ Convert only if we have enough space. /
3038
3039	clist_src = PRIV(ucd_caseless_sets) + code[`1`];
3040	clist_dest = list + `2`;
3041	code += `2`;
3042
3043	do {
3044	if (clist_dest >= list + `8`)
3045	{
3046	/ Early return if there is not enough space. This should never*
3047	happen, since all clists are shorter than 5 character now. /*
3048	list[`2`] = code[`0`];
3049	list[`3`] = code[`1`];
3050	return code;
3051	}
3052	clist_dest++ = clist_src;
3053	}
3054	while(*clist_src++ != NOTACHAR);
3055
3056	/ All characters are stored. The terminating NOTACHAR*
3057	is copied form the clist itself. /*
3058
3059	list[`0`] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3060	return code;
3061	#endif
3062
3063	case OP_NCLASS:
3064	case OP_CLASS:
3065	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
3066	case OP_XCLASS:
3067	if (c == OP_XCLASS)
3068	end = code + GET(code, `0`) - `1`;
3069	else
3070	#endif
3071	end = code + `32` / sizeof(pcre_uchar);
3072
3073	switch(*end)
3074	{
3075	case OP_CRSTAR:
3076	case OP_CRMINSTAR:
3077	case OP_CRQUERY:
3078	case OP_CRMINQUERY:
3079	case OP_CRPOSSTAR:
3080	case OP_CRPOSQUERY:
3081	list[`1`] = TRUE;
3082	end++;
3083	break;
3084
3085	case OP_CRPLUS:
3086	case OP_CRMINPLUS:
3087	case OP_CRPOSPLUS:
3088	end++;
3089	break;
3090
3091	case OP_CRRANGE:
3092	case OP_CRMINRANGE:
3093	case OP_CRPOSRANGE:
3094	list[`1`] = (GET2(end, `1`) == `0`);
3095	end += `1` + `2` * IMM2_SIZE;
3096	break;
3097	}
3098	list[`2`] = (pcre_uint32)(end - code);
3099	return end;
3100	}
3101	return NULL; / Opcode not accepted /
3102	}
3103
3104
3105
3106	/*************************************************
3107	* Scan further character sets for match *
3108	*************************************************/
3109
3110	/ Checks whether the base and the current opcode have a common character, in*
3111	which case the base cannot be possessified.
3112
3113	Arguments:
3114	code points to the byte code
3115	utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3116	cd static compile data
3117	base_list the data list of the base opcode
3118
3119	Returns: TRUE if the auto-possessification is possible
3120	*/
3121
3122	static BOOL
3123	compare_opcodes(const pcre_uchar code, BOOL utf, const* compile_data *cd,
3124	const pcre_uint32 base_list, const* pcre_uchar base_end, int* *rec_limit)
3125	{
3126	pcre_uchar c;
3127	pcre_uint32 list[`8`];
3128	const pcre_uint32 *chr_ptr;
3129	const pcre_uint32 *ochr_ptr;
3130	const pcre_uint32 *list_ptr;
3131	const pcre_uchar *next_code;
3132	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
3133	const pcre_uchar *xclass_flags;
3134	#endif
3135	const pcre_uint8 *class_bitset;
3136	const pcre_uint8 set1, set2, *set_end;
3137	pcre_uint32 chr;
3138	BOOL accepted, invert_bits;
3139	BOOL entered_a_group = FALSE;
3140
3141	if (rec_limit == `0`) return* FALSE;
3142	--(*rec_limit);
3143
3144	/ Note: the base_list[1] contains whether the current opcode has greedy*
3145	(represented by a non-zero value) quantifier. This is a different from
3146	other character type lists, which stores here that the character iterator
3147	matches to an empty string (also represented by a non-zero value). /*
3148
3149	for(;;)
3150	{
3151	/ All operations move the code pointer forward.*
3152	Therefore infinite recursions are not possible. /*
3153
3154	c = *code;
3155
3156	/ Skip over callouts /
3157
3158	if (c == OP_CALLOUT)
3159	{
3160	code += PRIV(OP_lengths)[c];
3161	continue;
3162	}
3163
3164	if (c == OP_ALT)
3165	{
3166	do code += GET(code, `1`); while (*code == OP_ALT);
3167	c = *code;
3168	}
3169
3170	switch(c)
3171	{
3172	case OP_END:
3173	case OP_KETRPOS:
3174	/ TRUE only in greedy case. The non-greedy case could be replaced by*
3175	an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3176	uses more memory, which we cannot get at this stage.) /*
3177
3178	return base_list[`1`] != `0`;
3179
3180	case OP_KET:
3181	/ If the bracket is capturing, and referenced by an OP_RECURSE, or*
3182	it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3183	cannot be converted to a possessive form. /*
3184
3185	if (base_list[`1`] == `0`) return FALSE;
3186
3187	switch(*(code - GET(code, `1`)))
3188	{
3189	case OP_ASSERT:
3190	case OP_ASSERT_NOT:
3191	case OP_ASSERTBACK:
3192	case OP_ASSERTBACK_NOT:
3193	case OP_ONCE:
3194	case OP_ONCE_NC:
3195	/ Atomic sub-patterns and assertions can always auto-possessify their*
3196	last iterator. However, if the group was entered as a result of checking
3197	a previous iterator, this is not possible. /*
3198
3199	return !entered_a_group;
3200	}
3201
3202	code += PRIV(OP_lengths)[c];
3203	continue;
3204
3205	case OP_ONCE:
3206	case OP_ONCE_NC:
3207	case OP_BRA:
3208	case OP_CBRA:
3209	next_code = code + GET(code, `1`);
3210	code += PRIV(OP_lengths)[c];
3211
3212	while (*next_code == OP_ALT)
3213	{
3214	if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
3215	return FALSE;
3216	code = next_code + `1` + LINK_SIZE;
3217	next_code += GET(next_code, `1`);
3218	}
3219
3220	entered_a_group = TRUE;
3221	continue;
3222
3223	case OP_BRAZERO:
3224	case OP_BRAMINZERO:
3225
3226	next_code = code + `1`;
3227	if (next_code != OP_BRA && next_code != OP_CBRA
3228	&& next_code != OP_ONCE && next_code != OP_ONCE_NC) return FALSE;
3229
3230	do next_code += GET(next_code, `1`); while (*next_code == OP_ALT);
3231
3232	/ The bracket content will be checked by the*
3233	OP_BRA/OP_CBRA case above. /*
3234	next_code += `1` + LINK_SIZE;
3235	if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
3236	return FALSE;
3237
3238	code += PRIV(OP_lengths)[c];
3239	continue;
3240
3241	default:
3242	break;
3243	}
3244
3245	/ Check for a supported opcode, and load its properties. /
3246
3247	code = get_chr_property_list(code, utf, cd->fcc, list);
3248	if (code == NULL) return FALSE; / Unsupported /
3249
3250	/ If either opcode is a small character list, set pointers for comparing*
3251	characters from that list with another list, or with a property. /*
3252
3253	if (base_list[`0`] == OP_CHAR)
3254	{
3255	chr_ptr = base_list + `2`;
3256	list_ptr = list;
3257	}
3258	else if (list[`0`] == OP_CHAR)
3259	{
3260	chr_ptr = list + `2`;
3261	list_ptr = base_list;
3262	}
3263
3264	/ Character bitsets can also be compared to certain opcodes. /
3265
3266	else if (base_list[`0`] == OP_CLASS \|\| list[`0`] == OP_CLASS
3267	#ifdef COMPILE_PCRE8
3268	/ In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. /
3269	\|\| (!utf && (base_list[`0`] == OP_NCLASS \|\| list[`0`] == OP_NCLASS))
3270	#endif
3271	)
3272	{
3273	#ifdef COMPILE_PCRE8
3274	if (base_list[`0`] == OP_CLASS \|\| (!utf && base_list[`0`] == OP_NCLASS))
3275	#else
3276	if (base_list[`0`] == OP_CLASS)
3277	#endif
3278	{
3279	set1 = (pcre_uint8 *)(base_end - base_list[`2`]);
3280	list_ptr = list;
3281	}
3282	else
3283	{
3284	set1 = (pcre_uint8 *)(code - list[`2`]);
3285	list_ptr = base_list;
3286	}
3287
3288	invert_bits = FALSE;
3289	switch(list_ptr[`0`])
3290	{
3291	case OP_CLASS:
3292	case OP_NCLASS:
3293	set2 = (pcre_uint8 *)
3294	((list_ptr == list ? code : base_end) - list_ptr[`2`]);
3295	break;
3296
3297	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
3298	case OP_XCLASS:
3299	xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[`2`] + LINK_SIZE;
3300	if ((xclass_flags & XCL_HASPROP) != `0`) return* FALSE;
3301	if ((*xclass_flags & XCL_MAP) == `0`)
3302	{
3303	/ No bits are set for characters < 256. /
3304	if (list[`1`] == `0`) return TRUE;
3305	/ Might be an empty repeat. /
3306	continue;
3307	}
3308	set2 = (pcre_uint8 *)(xclass_flags + `1`);
3309	break;
3310	#endif
3311
3312	case OP_NOT_DIGIT:
3313	invert_bits = TRUE;
3314	/ Fall through /
3315	case OP_DIGIT:
3316	set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
3317	break;
3318
3319	case OP_NOT_WHITESPACE:
3320	invert_bits = TRUE;
3321	/ Fall through /
3322	case OP_WHITESPACE:
3323	set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
3324	break;
3325
3326	case OP_NOT_WORDCHAR:
3327	invert_bits = TRUE;
3328	/ Fall through /
3329	case OP_WORDCHAR:
3330	set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
3331	break;
3332
3333	default:
3334	return FALSE;
3335	}
3336
3337	/ Because the sets are unaligned, we need*
3338	to perform byte comparison here. /*
3339	set_end = set1 + `32`;
3340	if (invert_bits)
3341	{
3342	do
3343	{
3344	if ((set1++ & ~(set2++)) != `0`) return FALSE;
3345	}
3346	while (set1 < set_end);
3347	}
3348	else
3349	{
3350	do
3351	{
3352	if ((set1++ & set2++) != `0`) return FALSE;
3353	}
3354	while (set1 < set_end);
3355	}
3356
3357	if (list[`1`] == `0`) return TRUE;
3358	/ Might be an empty repeat. /
3359	continue;
3360	}
3361
3362	/ Some property combinations also acceptable. Unicode property opcodes are*
3363	processed specially; the rest can be handled with a lookup table. /*
3364
3365	else
3366	{
3367	pcre_uint32 leftop, rightop;
3368
3369	leftop = base_list[`0`];
3370	rightop = list[`0`];
3371
3372	#ifdef SUPPORT_UCP
3373	accepted = FALSE; / Always set in non-unicode case. /
3374	if (leftop == OP_PROP \|\| leftop == OP_NOTPROP)
3375	{
3376	if (rightop == OP_EOD)
3377	accepted = TRUE;
3378	else if (rightop == OP_PROP \|\| rightop == OP_NOTPROP)
3379	{
3380	int n;
3381	const pcre_uint8 *p;
3382	BOOL same = leftop == rightop;
3383	BOOL lisprop = leftop == OP_PROP;
3384	BOOL risprop = rightop == OP_PROP;
3385	BOOL bothprop = lisprop && risprop;
3386
3387	/ There's a table that specifies how each combination is to be*
3388	processed:
3389	0 Always return FALSE (never auto-possessify)
3390	1 Character groups are distinct (possessify if both are OP_PROP)
3391	2 Check character categories in the same group (general or particular)
3392	3 Return TRUE if the two opcodes are not the same
3393	... see comments below
3394	*/
3395
3396	n = propposstab[base_list[`2`]][list[`2`]];
3397	switch(n)
3398	{
3399	case `0`: break;
3400	case `1`: accepted = bothprop; break;
3401	case `2`: accepted = (base_list[`3`] == list[`3`]) != same; break;
3402	case `3`: accepted = !same; break;
3403
3404	case `4`: / Left general category, right particular category /
3405	accepted = risprop && catposstab[base_list[`3`]][list[`3`]] == same;
3406	break;
3407
3408	case `5`: / Right general category, left particular category /
3409	accepted = lisprop && catposstab[list[`3`]][base_list[`3`]] == same;
3410	break;
3411
3412	/ This code is logically tricky. Think hard before fiddling with it.*
3413	The posspropstab table has four entries per row. Each row relates to
3414	one of PCRE's special properties such as ALNUM or SPACE or WORD.
3415	Only WORD actually needs all four entries, but using repeats for the
3416	others means they can all use the same code below.
3417
3418	The first two entries in each row are Unicode general categories, and
3419	apply always, because all the characters they include are part of the
3420	PCRE character set. The third and fourth entries are a general and a
3421	particular category, respectively, that include one or more relevant
3422	characters. One or the other is used, depending on whether the check
3423	is for a general or a particular category. However, in both cases the
3424	category contains more characters than the specials that are defined
3425	for the property being tested against. Therefore, it cannot be used
3426	in a NOTPROP case.
3427
3428	Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3429	Underscore is covered by ucp_P or ucp_Po. /*
3430
3431	case `6`: / Left alphanum vs right general category /
3432	case `7`: / Left space vs right general category /
3433	case `8`: / Left word vs right general category /
3434	p = posspropstab[n-`6`];
3435	accepted = risprop && lisprop ==
3436	(list[`3`] != p[`0`] &&
3437	list[`3`] != p[`1`] &&
3438	(list[`3`] != p[`2`] \|\| !lisprop));
3439	break;
3440
3441	case `9`: / Right alphanum vs left general category /
3442	case `10`: / Right space vs left general category /
3443	case `11`: / Right word vs left general category /
3444	p = posspropstab[n-`9`];
3445	accepted = lisprop && risprop ==
3446	(base_list[`3`] != p[`0`] &&
3447	base_list[`3`] != p[`1`] &&
3448	(base_list[`3`] != p[`2`] \|\| !risprop));
3449	break;
3450
3451	case `12`: / Left alphanum vs right particular category /
3452	case `13`: / Left space vs right particular category /
3453	case `14`: / Left word vs right particular category /
3454	p = posspropstab[n-`12`];
3455	accepted = risprop && lisprop ==
3456	(catposstab[p[`0`]][list[`3`]] &&
3457	catposstab[p[`1`]][list[`3`]] &&
3458	(list[`3`] != p[`3`] \|\| !lisprop));
3459	break;
3460
3461	case `15`: / Right alphanum vs left particular category /
3462	case `16`: / Right space vs left particular category /
3463	case `17`: / Right word vs left particular category /
3464	p = posspropstab[n-`15`];
3465	accepted = lisprop && risprop ==
3466	(catposstab[p[`0`]][base_list[`3`]] &&
3467	catposstab[p[`1`]][base_list[`3`]] &&
3468	(base_list[`3`] != p[`3`] \|\| !risprop));
3469	break;
3470	}
3471	}
3472	}
3473
3474	else
3475	#endif /* SUPPORT_UCP */
3476
3477	accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3478	rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3479	autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3480
3481	if (!accepted) return FALSE;
3482
3483	if (list[`1`] == `0`) return TRUE;
3484	/ Might be an empty repeat. /
3485	continue;
3486	}
3487
3488	/ Control reaches here only if one of the items is a small character list.*
3489	All characters are checked against the other side. /*
3490
3491	do
3492	{
3493	chr = *chr_ptr;
3494
3495	switch(list_ptr[`0`])
3496	{
3497	case OP_CHAR:
3498	ochr_ptr = list_ptr + `2`;
3499	do
3500	{
3501	if (chr == ochr_ptr) return* FALSE;
3502	ochr_ptr++;
3503	}
3504	while(*ochr_ptr != NOTACHAR);
3505	break;
3506
3507	case OP_NOT:
3508	ochr_ptr = list_ptr + `2`;
3509	do
3510	{
3511	if (chr == *ochr_ptr)
3512	break;
3513	ochr_ptr++;
3514	}
3515	while(*ochr_ptr != NOTACHAR);
3516	if (ochr_ptr == NOTACHAR) return* FALSE; / Not found /
3517	break;
3518
3519	/* Note that OP_DIGIT etc. are generated only when PCRE_UCP is not
3520	set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. /*
3521
3522	case OP_DIGIT:
3523	if (chr < `256` && (cd->ctypes[chr] & ctype_digit) != `0`) return FALSE;
3524	break;
3525
3526	case OP_NOT_DIGIT:
3527	if (chr > `255` \|\| (cd->ctypes[chr] & ctype_digit) == `0`) return FALSE;
3528	break;
3529
3530	case OP_WHITESPACE:
3531	if (chr < `256` && (cd->ctypes[chr] & ctype_space) != `0`) return FALSE;
3532	break;
3533
3534	case OP_NOT_WHITESPACE:
3535	if (chr > `255` \|\| (cd->ctypes[chr] & ctype_space) == `0`) return FALSE;
3536	break;
3537
3538	case OP_WORDCHAR:
3539	if (chr < `255` && (cd->ctypes[chr] & ctype_word) != `0`) return FALSE;
3540	break;
3541
3542	case OP_NOT_WORDCHAR:
3543	if (chr > `255` \|\| (cd->ctypes[chr] & ctype_word) == `0`) return FALSE;
3544	break;
3545
3546	case OP_HSPACE:
3547	switch(chr)
3548	{
3549	HSPACE_CASES: return FALSE;
3550	default: break;
3551	}
3552	break;
3553
3554	case OP_NOT_HSPACE:
3555	switch(chr)
3556	{
3557	HSPACE_CASES: break;
3558	default: return FALSE;
3559	}
3560	break;
3561
3562	case OP_ANYNL:
3563	case OP_VSPACE:
3564	switch(chr)
3565	{
3566	VSPACE_CASES: return FALSE;
3567	default: break;
3568	}
3569	break;
3570
3571	case OP_NOT_VSPACE:
3572	switch(chr)
3573	{
3574	VSPACE_CASES: break;
3575	default: return FALSE;
3576	}
3577	break;
3578
3579	case OP_DOLL:
3580	case OP_EODN:
3581	switch (chr)
3582	{
3583	case CHAR_CR:
3584	case CHAR_LF:
3585	case CHAR_VT:
3586	case CHAR_FF:
3587	case CHAR_NEL:
3588	#ifndef EBCDIC
3589	case `0x2028`:
3590	case `0x2029`:
3591	#endif /* Not EBCDIC */
3592	return FALSE;
3593	}
3594	break;
3595
3596	case OP_EOD: / Can always possessify before \z /
3597	break;
3598
3599	#ifdef SUPPORT_UCP
3600	case OP_PROP:
3601	case OP_NOTPROP:
3602	if (!check_char_prop(chr, list_ptr[`2`], list_ptr[`3`],
3603	list_ptr[`0`] == OP_NOTPROP))
3604	return FALSE;
3605	break;
3606	#endif
3607
3608	case OP_NCLASS:
3609	if (chr > `255`) return FALSE;
3610	/ Fall through /
3611
3612	case OP_CLASS:
3613	if (chr > `255`) break;
3614	class_bitset = (pcre_uint8 *)
3615	((list_ptr == list ? code : base_end) - list_ptr[`2`]);
3616	if ((class_bitset[chr >> `3`] & (`1` << (chr & `7`))) != `0`) return FALSE;
3617	break;
3618
3619	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
3620	case OP_XCLASS:
3621	if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3622	list_ptr[`2`] + LINK_SIZE, utf)) return FALSE;
3623	break;
3624	#endif
3625
3626	default:
3627	return FALSE;
3628	}
3629
3630	chr_ptr++;
3631	}
3632	while(*chr_ptr != NOTACHAR);
3633
3634	/ At least one character must be matched from this opcode. /
3635
3636	if (list[`1`] == `0`) return TRUE;
3637	}
3638
3639	/ Control never reaches here. There used to be a fail-save return FALSE; here,*
3640	but some compilers complain about an unreachable statement. /*
3641
3642	}
3643
3644
3645
3646	/*************************************************
3647	* Scan compiled regex for auto-possession *
3648	*************************************************/
3649
3650	/ Replaces single character iterations with their possessive alternatives*
3651	if appropriate. This function modifies the compiled opcode!
3652
3653	Arguments:
3654	code points to start of the byte code
3655	utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3656	cd static compile data
3657
3658	Returns: nothing
3659	*/
3660
3661	static void
3662	auto_possessify(pcre_uchar code, BOOL utf, const* compile_data *cd)
3663	{
3664	register pcre_uchar c;
3665	const pcre_uchar *end;
3666	pcre_uchar *repeat_opcode;
3667	pcre_uint32 list[`8`];
3668	int rec_limit;
3669
3670	for (;;)
3671	{
3672	c = *code;
3673
3674	/ When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,*
3675	it may compile without complaining, but may get into a loop here if the code
3676	pointer points to a bad value. This is, of course a documentated possibility,
3677	when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
3678	just give up on this optimization. /*
3679
3680	if (c >= OP_TABLE_LENGTH) return;
3681
3682	if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3683	{
3684	c -= get_repeat_base(c) - OP_STAR;
3685	end = (c <= OP_MINUPTO) ?
3686	get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3687	list[`1`] = c == OP_STAR \|\| c == OP_PLUS \|\| c == OP_QUERY \|\| c == OP_UPTO;
3688
3689	rec_limit = `1000`;
3690	if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
3691	{
3692	switch(c)
3693	{
3694	case OP_STAR:
3695	*code += OP_POSSTAR - OP_STAR;
3696	break;
3697
3698	case OP_MINSTAR:
3699	*code += OP_POSSTAR - OP_MINSTAR;
3700	break;
3701
3702	case OP_PLUS:
3703	*code += OP_POSPLUS - OP_PLUS;
3704	break;
3705
3706	case OP_MINPLUS:
3707	*code += OP_POSPLUS - OP_MINPLUS;
3708	break;
3709
3710	case OP_QUERY:
3711	*code += OP_POSQUERY - OP_QUERY;
3712	break;
3713
3714	case OP_MINQUERY:
3715	*code += OP_POSQUERY - OP_MINQUERY;
3716	break;
3717
3718	case OP_UPTO:
3719	*code += OP_POSUPTO - OP_UPTO;
3720	break;
3721
3722	case OP_MINUPTO:
3723	*code += OP_POSUPTO - OP_MINUPTO;
3724	break;
3725	}
3726	}
3727	c = *code;
3728	}
3729	else if (c == OP_CLASS \|\| c == OP_NCLASS \|\| c == OP_XCLASS)
3730	{
3731	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
3732	if (c == OP_XCLASS)
3733	repeat_opcode = code + GET(code, `1`);
3734	else
3735	#endif
3736	repeat_opcode = code + `1` + (`32` / sizeof(pcre_uchar));
3737
3738	c = *repeat_opcode;
3739	if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3740	{
3741	/ end must not be NULL. /
3742	end = get_chr_property_list(code, utf, cd->fcc, list);
3743
3744	list[`1`] = (c & `1`) == `0`;
3745
3746	rec_limit = `1000`;
3747	if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
3748	{
3749	switch (c)
3750	{
3751	case OP_CRSTAR:
3752	case OP_CRMINSTAR:
3753	*repeat_opcode = OP_CRPOSSTAR;
3754	break;
3755
3756	case OP_CRPLUS:
3757	case OP_CRMINPLUS:
3758	*repeat_opcode = OP_CRPOSPLUS;
3759	break;
3760
3761	case OP_CRQUERY:
3762	case OP_CRMINQUERY:
3763	*repeat_opcode = OP_CRPOSQUERY;
3764	break;
3765
3766	case OP_CRRANGE:
3767	case OP_CRMINRANGE:
3768	*repeat_opcode = OP_CRPOSRANGE;
3769	break;
3770	}
3771	}
3772	}
3773	c = *code;
3774	}
3775
3776	switch(c)
3777	{
3778	case OP_END:
3779	return;
3780
3781	case OP_TYPESTAR:
3782	case OP_TYPEMINSTAR:
3783	case OP_TYPEPLUS:
3784	case OP_TYPEMINPLUS:
3785	case OP_TYPEQUERY:
3786	case OP_TYPEMINQUERY:
3787	case OP_TYPEPOSSTAR:
3788	case OP_TYPEPOSPLUS:
3789	case OP_TYPEPOSQUERY:
3790	if (code[`1`] == OP_PROP \|\| code[`1`] == OP_NOTPROP) code += `2`;
3791	break;
3792
3793	case OP_TYPEUPTO:
3794	case OP_TYPEMINUPTO:
3795	case OP_TYPEEXACT:
3796	case OP_TYPEPOSUPTO:
3797	if (code[`1` + IMM2_SIZE] == OP_PROP \|\| code[`1` + IMM2_SIZE] == OP_NOTPROP)
3798	code += `2`;
3799	break;
3800
3801	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
3802	case OP_XCLASS:
3803	code += GET(code, `1`);
3804	break;
3805	#endif
3806
3807	case OP_MARK:
3808	case OP_PRUNE_ARG:
3809	case OP_SKIP_ARG:
3810	case OP_THEN_ARG:
3811	code += code[`1`];
3812	break;
3813	}
3814
3815	/ Add in the fixed length from the table /
3816
3817	code += PRIV(OP_lengths)[c];
3818
3819	/ In UTF-8 mode, opcodes that are followed by a character may be followed by*
3820	a multi-byte character. The length in the table is a minimum, so we have to
3821	arrange to skip the extra bytes. /*
3822
3823	#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3824	if (utf) switch(c)
3825	{
3826	case OP_CHAR:
3827	case OP_CHARI:
3828	case OP_NOT:
3829	case OP_NOTI:
3830	case OP_STAR:
3831	case OP_MINSTAR:
3832	case OP_PLUS:
3833	case OP_MINPLUS:
3834	case OP_QUERY:
3835	case OP_MINQUERY:
3836	case OP_UPTO:
3837	case OP_MINUPTO:
3838	case OP_EXACT:
3839	case OP_POSSTAR:
3840	case OP_POSPLUS:
3841	case OP_POSQUERY:
3842	case OP_POSUPTO:
3843	case OP_STARI:
3844	case OP_MINSTARI:
3845	case OP_PLUSI:
3846	case OP_MINPLUSI:
3847	case OP_QUERYI:
3848	case OP_MINQUERYI:
3849	case OP_UPTOI:
3850	case OP_MINUPTOI:
3851	case OP_EXACTI:
3852	case OP_POSSTARI:
3853	case OP_POSPLUSI:
3854	case OP_POSQUERYI:
3855	case OP_POSUPTOI:
3856	case OP_NOTSTAR:
3857	case OP_NOTMINSTAR:
3858	case OP_NOTPLUS:
3859	case OP_NOTMINPLUS:
3860	case OP_NOTQUERY:
3861	case OP_NOTMINQUERY:
3862	case OP_NOTUPTO:
3863	case OP_NOTMINUPTO:
3864	case OP_NOTEXACT:
3865	case OP_NOTPOSSTAR:
3866	case OP_NOTPOSPLUS:
3867	case OP_NOTPOSQUERY:
3868	case OP_NOTPOSUPTO:
3869	case OP_NOTSTARI:
3870	case OP_NOTMINSTARI:
3871	case OP_NOTPLUSI:
3872	case OP_NOTMINPLUSI:
3873	case OP_NOTQUERYI:
3874	case OP_NOTMINQUERYI:
3875	case OP_NOTUPTOI:
3876	case OP_NOTMINUPTOI:
3877	case OP_NOTEXACTI:
3878	case OP_NOTPOSSTARI:
3879	case OP_NOTPOSPLUSI:
3880	case OP_NOTPOSQUERYI:
3881	case OP_NOTPOSUPTOI:
3882	if (HAS_EXTRALEN(code[-`1`])) code += GET_EXTRALEN(code[-`1`]);
3883	break;
3884	}
3885	#else
3886	(void)(utf); / Keep compiler happy by referencing function argument /
3887	#endif
3888	}
3889	}
3890
3891
3892
/*************************************************
*           Check for POSIX class syntax         *
*************************************************/

/* This function is called when the sequence "[:" or "[." or "[=" is
encountered in a character class. It checks whether this is followed by a
sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
reach an unescaped ']' without the special preceding character, return FALSE.

Originally, this function only recognized a sequence of letters between the
terminators, but it seems that Perl recognizes any sequence of characters,
though of course unknown POSIX names are subsequently rejected. Perl gives an
"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
didn't consider this to be a POSIX class. Likewise for [:1234:].

The problem in trying to be exactly like Perl is in the handling of escapes. We
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
below handles the special cases \\ and \], but does not try to do any other
escape processing. This makes it different from Perl for cases such as
[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
when Perl does, I think.

A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
It seems that the appearance of a nested POSIX class supersedes an apparent
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
a digit.

In Perl, unescaped square brackets may also appear as part of class names. For
example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
seem right at all. PCRE does not allow closing square brackets in POSIX class
names.

Arguments:
  ptr      pointer to the initial [
  endptr   where to return the end pointer

Returns:   TRUE or FALSE
*/
3934
3935	static BOOL
3936	check_posix_syntax(const pcre_uchar ptr, const* pcre_uchar **endptr)
3937	{
3938	pcre_uchar terminator; / Don't combine these lines; the Solaris cc /
3939	terminator = (++ptr); /* compiler warns about "non-constant" initializer. /
3940	for (++ptr; *ptr != CHAR_NULL; ptr++)
3941	{
3942	if (*ptr == CHAR_BACKSLASH &&
3943	(ptr[`1`] == CHAR_RIGHT_SQUARE_BRACKET \|\|
3944	ptr[`1`] == CHAR_BACKSLASH))
3945	ptr++;
3946	else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[`1`] == terminator) \|\|
3947	ptr == CHAR_RIGHT_SQUARE_BRACKET) return* FALSE;
3948	else if (*ptr == terminator && ptr[`1`] == CHAR_RIGHT_SQUARE_BRACKET)
3949	{
3950	*endptr = ptr;
3951	return TRUE;
3952	}
3953	}
3954	return FALSE;
3955	}
3956
3957
3958
3959
3960	/*************************************************
3961	* Check POSIX class name *
3962	*************************************************/
3963
3964	/ This function is called to check the name given in a POSIX-style class entry*
3965	such as [:alnum:].
3966
3967	Arguments:
3968	ptr points to the first letter
3969	len the length of the name
3970
3971	Returns: a value representing the name, or -1 if unknown
3972	*/
3973
3974	static int
3975	check_posix_name(const pcre_uchar ptr, int* len)
3976	{
3977	const char *pn = posix_names;
3978	register int yield = `0`;
3979	while (posix_name_lengths[yield] != `0`)
3980	{
3981	if (len == posix_name_lengths[yield] &&
3982	STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == `0`) return yield;
3983	pn += posix_name_lengths[yield] + `1`;
3984	yield++;
3985	}
3986	return -`1`;
3987	}
3988
3989
3990	/*************************************************
3991	* Adjust OP_RECURSE items in repeated group *
3992	*************************************************/
3993
3994	/ OP_RECURSE items contain an offset from the start of the regex to the group*
3995	that is referenced. This means that groups can be replicated for fixed
3996	repetition simply by copying (because the recursion is allowed to refer to
3997	earlier groups that are outside the current group). However, when a group is
3998	optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3999	inserted before it, after it has been compiled. This means that any OP_RECURSE
4000	items within it that refer to the group itself or any contained groups have to
4001	have their offsets adjusted. That one of the jobs of this function. Before it
4002	is called, the partially compiled regex must be temporarily terminated with
4003	OP_END.
4004
4005	This function has been extended to cope with forward references for recursions
4006	and subroutine calls. It must check the list of such references for the
4007	group we are dealing with. If it finds that one of the recursions in the
4008	current group is on this list, it does not adjust the value in the reference
4009	(which is a group number). After the group has been scanned, all the offsets in
4010	the forward reference list for the group are adjusted.
4011
4012	Arguments:
4013	group points to the start of the group
4014	adjust the amount by which the group is to be moved
4015	utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
4016	cd contains pointers to tables etc.
4017	save_hwm_offset the hwm forward reference offset at the start of the group
4018
4019	Returns: nothing
4020	*/
4021
4022	static void
4023	adjust_recurse(pcre_uchar group, int* adjust, BOOL utf, compile_data *cd,
4024	size_t save_hwm_offset)
4025	{
4026	int offset;
4027	pcre_uchar *hc;
4028	pcre_uchar *ptr = group;
4029
4030	while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
4031	{
4032	for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4033	hc += LINK_SIZE)
4034	{
4035	offset = (int)GET(hc, `0`);
4036	if (cd->start_code + offset == ptr + `1`) break;
4037	}
4038
4039	/ If we have not found this recursion on the forward reference list, adjust*
4040	the recursion's offset if it's after the start of this group. /*
4041
4042	if (hc >= cd->hwm)
4043	{
4044	offset = (int)GET(ptr, `1`);
4045	if (cd->start_code + offset >= group) PUT(ptr, `1`, offset + adjust);
4046	}
4047
4048	ptr += `1` + LINK_SIZE;
4049	}
4050
4051	/ Now adjust all forward reference offsets for the group. /
4052
4053	for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4054	hc += LINK_SIZE)
4055	{
4056	offset = (int)GET(hc, `0`);
4057	PUT(hc, `0`, offset + adjust);
4058	}
4059	}
4060
4061
4062
4063	/*************************************************
4064	* Insert an automatic callout point *
4065	*************************************************/
4066
4067	/ This function is called when the PCRE_AUTO_CALLOUT option is set, to insert*
4068	callout points before each pattern item.
4069
4070	Arguments:
4071	code current code pointer
4072	ptr current pattern pointer
4073	cd pointers to tables etc
4074
4075	Returns: new code pointer
4076	*/
4077
4078	static pcre_uchar *
4079	auto_callout(pcre_uchar code, const* pcre_uchar ptr, compile_data cd)
4080	{
4081	*code++ = OP_CALLOUT;
4082	*code++ = `255`;
4083	PUT(code, `0`, (int)(ptr - cd->start_pattern)); / Pattern offset /
4084	PUT(code, LINK_SIZE, `0`); / Default length /
4085	return code + `2` * LINK_SIZE;
4086	}
4087
4088
4089
4090	/*************************************************
4091	* Complete a callout item *
4092	*************************************************/
4093
4094	/ A callout item contains the length of the next item in the pattern, which*
4095	we can't fill in till after we have reached the relevant point. This is used
4096	for both automatic and manual callouts.
4097
4098	Arguments:
4099	previous_callout points to previous callout item
4100	ptr current pattern pointer
4101	cd pointers to tables etc
4102
4103	Returns: nothing
4104	*/
4105
4106	static void
4107	complete_callout(pcre_uchar previous_callout, const* pcre_uchar ptr, compile_data cd)
4108	{
4109	int length = (int)(ptr - cd->start_pattern - GET(previous_callout, `2`));
4110	PUT(previous_callout, `2` + LINK_SIZE, length);
4111	}
4112
4113
4114
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for ranges of
characters in the "other" case. Each call returns the next one, updating the
start address. A character with multiple other cases is returned on its own
with a special return value.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original
*/

static int
get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
  pcre_uint32 *odptr)
{
pcre_uint32 c, othercase, next;
unsigned int co;

/* Find the first character that has an other case. If it has multiple other
cases, return its case offset value. */

for (c = *cptr; c <= d; c++)
  {
  if ((co = UCD_CASESET(c)) != 0)
    {
    *ocptr = c++;   /* Character that has the set */
    *cptr = c;      /* Rest of input range */
    return (int)co;
    }
  if ((othercase = UCD_OTHERCASE(c)) != c) break;
  }

if (c > d) return -1;  /* Reached end of range */

/* Found a character that has a single other case. Search for the end of the
range, which is either the end of the input range, or a character that has zero
or more than one other cases. */

*ocptr = othercase;
next = othercase + 1;

for (++c; c <= d; c++)
  {
  if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
  next++;
  }

*odptr = next - 1;     /* End of othercase range */
*cptr = c;             /* Rest of input range */
return 0;
}
#endif  /* SUPPORT_UCP */
4179
4180
4181
4182	/*************************************************
4183	* Add a character or range to a class *
4184	*************************************************/
4185
4186	/ This function packages up the logic of adding a character or range of*
4187	characters to a class. The character values in the arguments will be within the
4188	valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4189	mutually recursive with the function immediately below.
4190
4191	Arguments:
4192	classbits the bit map for characters < 256
4193	uchardptr points to the pointer for extra data
4194	options the options word
4195	cd contains pointers to tables etc.
4196	start start of range character
4197	end end of range character
4198
4199	Returns: the number of < 256 characters added
4200	the pointer to extra data is updated
4201	*/
4202
4203	static int
4204	add_to_class(pcre_uint8 classbits, pcre_uchar uchardptr, int* options,
4205	compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4206	{
4207	pcre_uint32 c;
4208	pcre_uint32 classbits_end = (end <= `0xff` ? end : `0xff`);
4209	int n8 = `0`;
4210
4211	/ If caseless matching is required, scan the range and process alternate*
4212	cases. In Unicode, there are 8-bit characters that have alternate cases that
4213	are greater than 255 and vice-versa. Sometimes we can just extend the original
4214	range. /*
4215
4216	if ((options & PCRE_CASELESS) != `0`)
4217	{
4218	#ifdef SUPPORT_UCP
4219	if ((options & PCRE_UTF8) != `0`)
4220	{
4221	int rc;
4222	pcre_uint32 oc, od;
4223
4224	options &= ~PCRE_CASELESS; / Remove for recursive calls /
4225	c = start;
4226
4227	while ((rc = get_othercase_range(&c, end, &oc, &od)) >= `0`)
4228	{
4229	/ Handle a single character that has more than one other case. /
4230
4231	if (rc > `0`) n8 += add_list_to_class(classbits, uchardptr, options, cd,
4232	PRIV(ucd_caseless_sets) + rc, oc);
4233
4234	/ Do nothing if the other case range is within the original range. /
4235
4236	else if (oc >= start && od <= end) continue;
4237
4238	/ Extend the original range if there is overlap, noting that if oc < c, we*
4239	can't have od > end because a subrange is always shorter than the basic
4240	range. Otherwise, use a recursive call to add the additional range. /*
4241
4242	else if (oc < start && od >= start - `1`) start = oc; / Extend downwards /
4243	else if (od > end && oc <= end + `1`)
4244	{
4245	end = od; / Extend upwards /
4246	if (end > classbits_end) classbits_end = (end <= `0xff` ? end : `0xff`);
4247	}
4248	else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
4249	}
4250	}
4251	else
4252	#endif /* SUPPORT_UCP */
4253
4254	/ Not UTF-mode, or no UCP /
4255
4256	for (c = start; c <= classbits_end; c++)
4257	{
4258	SETBIT(classbits, cd->fcc[c]);
4259	n8++;
4260	}
4261	}
4262
4263	/ Now handle the original range. Adjust the final value according to the bit*
4264	length - this means that the same lists of (e.g.) horizontal spaces can be used
4265	in all cases. /*
4266
4267	#if defined COMPILE_PCRE8
4268	#ifdef SUPPORT_UTF
4269	if ((options & PCRE_UTF8) == `0`)
4270	#endif
4271	if (end > `0xff`) end = `0xff`;
4272
4273	#elif defined COMPILE_PCRE16
4274	#ifdef SUPPORT_UTF
4275	if ((options & PCRE_UTF16) == `0`)
4276	#endif
4277	if (end > `0xffff`) end = `0xffff`;
4278
4279	#endif /* COMPILE_PCRE[8\|16] */
4280
4281	/ Use the bitmap for characters < 256. Otherwise use extra data./
4282
4283	for (c = start; c <= classbits_end; c++)
4284	{
4285	/ Regardless of start, c will always be <= 255. /
4286	SETBIT(classbits, c);
4287	n8++;
4288	}
4289
4290	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
4291	if (start <= `0xff`) start = `0xff` + `1`;
4292
4293	if (end >= start)
4294	{
4295	pcre_uchar uchardata = uchardptr;
4296	#ifdef SUPPORT_UTF
4297	if ((options & PCRE_UTF8) != `0`) / All UTFs use the same flag bit /
4298	{
4299	if (start < end)
4300	{
4301	*uchardata++ = XCL_RANGE;
4302	uchardata += PRIV(ord2utf)(start, uchardata);
4303	uchardata += PRIV(ord2utf)(end, uchardata);
4304	}
4305	else if (start == end)
4306	{
4307	*uchardata++ = XCL_SINGLE;
4308	uchardata += PRIV(ord2utf)(start, uchardata);
4309	}
4310	}
4311	else
4312	#endif /* SUPPORT_UTF */
4313
4314	/ Without UTF support, character values are constrained by the bit length,*
4315	and can only be > 256 for 16-bit and 32-bit libraries. /*
4316
4317	#ifdef COMPILE_PCRE8
4318	{}
4319	#else
4320	if (start < end)
4321	{
4322	*uchardata++ = XCL_RANGE;
4323	*uchardata++ = start;
4324	*uchardata++ = end;
4325	}
4326	else if (start == end)
4327	{
4328	*uchardata++ = XCL_SINGLE;
4329	*uchardata++ = start;
4330	}
4331	#endif
4332
4333	uchardptr = uchardata; /* Updata extra data pointer /
4334	}
4335	#endif /* SUPPORT_UTF \|\| !COMPILE_PCRE8 */
4336
4337	return n8; / Number of 8-bit characters /
4338	}
4339
4340
4341
4342
4343	/*************************************************
4344	* Add a list of characters to a class *
4345	*************************************************/
4346
4347	/ This function is used for adding a list of case-equivalent characters to a*
4348	class, and also for adding a list of horizontal or vertical whitespace. If the
4349	list is in order (which it should be), ranges of characters are detected and
4350	handled appropriately. This function is mutually recursive with the function
4351	above.
4352
4353	Arguments:
4354	classbits the bit map for characters < 256
4355	uchardptr points to the pointer for extra data
4356	options the options word
4357	cd contains pointers to tables etc.
4358	p points to row of 32-bit values, terminated by NOTACHAR
4359	except character to omit; this is used when adding lists of
4360	case-equivalent characters to avoid including the one we
4361	already know about
4362
4363	Returns: the number of < 256 characters added
4364	the pointer to extra data is updated
4365	*/
4366
4367	static int
4368	add_list_to_class(pcre_uint8 classbits, pcre_uchar uchardptr, int* options,
4369	compile_data cd, const* pcre_uint32 p, unsigned* int except)
4370	{
4371	int n8 = `0`;
4372	while (p[`0`] < NOTACHAR)
4373	{
4374	int n = `0`;
4375	if (p[`0`] != except)
4376	{
4377	while(p[n+`1`] == p[`0`] + n + `1`) n++;
4378	n8 += add_to_class(classbits, uchardptr, options, cd, p[`0`], p[n]);
4379	}
4380	p += n + `1`;
4381	}
4382	return n8;
4383	}
4384
4385
4386
4387	/*************************************************
4388	* Add characters not in a list to a class *
4389	*************************************************/
4390
4391	/ This function is used for adding the complement of a list of horizontal or*
4392	vertical whitespace to a class. The list must be in order.
4393
4394	Arguments:
4395	classbits the bit map for characters < 256
4396	uchardptr points to the pointer for extra data
4397	options the options word
4398	cd contains pointers to tables etc.
4399	p points to row of 32-bit values, terminated by NOTACHAR
4400
4401	Returns: the number of < 256 characters added
4402	the pointer to extra data is updated
4403	*/
4404
4405	static int
4406	add_not_list_to_class(pcre_uint8 classbits, pcre_uchar *uchardptr,
4407	int options, compile_data cd, const* pcre_uint32 *p)
4408	{
4409	BOOL utf = (options & PCRE_UTF8) != `0`;
4410	int n8 = `0`;
4411	if (p[`0`] > `0`)
4412	n8 += add_to_class(classbits, uchardptr, options, cd, `0`, p[`0`] - `1`);
4413	while (p[`0`] < NOTACHAR)
4414	{
4415	while (p[`1`] == p[`0`] + `1`) p++;
4416	n8 += add_to_class(classbits, uchardptr, options, cd, p[`0`] + `1`,
4417	(p[`1`] == NOTACHAR) ? (utf ? `0x10ffffu` : `0xffffffffu`) : p[`1`] - `1`);
4418	p++;
4419	}
4420	return n8;
4421	}
4422
4423
4424
4425	/*************************************************
4426	* Compile one branch *
4427	*************************************************/
4428
/* Scan the pattern, compiling it into a vector. If the options are
changed during the branch, the pointer is used to change the external options
bits. This function is used during the pre-compile phase when we are trying
to find out the amount of memory needed, as well as during the real compile
phase. The value of lengthptr distinguishes the two phases.

Arguments:
  optionsptr        pointer to the option bits
  codeptr           points to the pointer to the current code point
  ptrptr            points to the current pattern pointer
  errorcodeptr      points to error code variable
  firstcharptr      place to put the first required character
  firstcharflagsptr place to put the first character flags, or a negative number
  reqcharptr        place to put the last required character
  reqcharflagsptr   place to put the last required character flags, or a negative number
  bcptr             points to current branch chain
  cond_depth        conditional nesting depth
  cd                contains pointers to tables etc.
  lengthptr         NULL during the real compile phase
                    points to length accumulator during pre-compile phase

Returns:            TRUE on success
                    FALSE, with *errorcodeptr set non-zero on error
*/
4453
4454	static BOOL
4455	compile_branch(int optionsptr, pcre_uchar *codeptr,
4456	const pcre_uchar *ptrptr, int* *errorcodeptr,
4457	pcre_uint32 firstcharptr, pcre_int32 firstcharflagsptr,
4458	pcre_uint32 reqcharptr, pcre_int32 reqcharflagsptr,
4459	branch_chain bcptr, int* cond_depth,
4460	compile_data cd, int* *lengthptr)
4461	{
4462	int repeat_type, op_type;
4463	int repeat_min = `0`, repeat_max = `0`; / To please picky compilers /
4464	int bravalue = `0`;
4465	int greedy_default, greedy_non_default;
4466	pcre_uint32 firstchar, reqchar;
4467	pcre_int32 firstcharflags, reqcharflags;
4468	pcre_uint32 zeroreqchar, zerofirstchar;
4469	pcre_int32 zeroreqcharflags, zerofirstcharflags;
4470	pcre_int32 req_caseopt, reqvary, tempreqvary;
4471	int options = optionsptr; /* May change dynamically /
4472	int after_manual_callout = `0`;
4473	int length_prevgroup = `0`;
4474	register pcre_uint32 c;
4475	int escape;
4476	register pcre_uchar code = codeptr;
4477	pcre_uchar *last_code = code;
4478	pcre_uchar *orig_code = code;
4479	pcre_uchar *tempcode;
4480	BOOL inescq = FALSE;
4481	BOOL groupsetfirstchar = FALSE;
4482	const pcre_uchar ptr = ptrptr;
4483	const pcre_uchar *tempptr;
4484	const pcre_uchar *nestptr = NULL;
4485	pcre_uchar *previous = NULL;
4486	pcre_uchar *previous_callout = NULL;
4487	size_t item_hwm_offset = `0`;
4488	pcre_uint8 classbits[`32`];
4489
4490	/ We can fish out the UTF-8 setting once and for all into a BOOL, but we*
4491	must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4492	dynamically as we process the pattern. /*
4493
4494	#ifdef SUPPORT_UTF
4495	/ PCRE_UTF[16\|32] have the same value as PCRE_UTF8. /
4496	BOOL utf = (options & PCRE_UTF8) != `0`;
4497	#ifndef COMPILE_PCRE32
4498	pcre_uchar utf_chars[`6`];
4499	#endif
4500	#else
4501	BOOL utf = FALSE;
4502	#endif
4503
4504	/ Helper variables for OP_XCLASS opcode (for characters > 255). We define*
4505	class_uchardata always so that it can be passed to add_to_class() always,
4506	though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4507	alternative calls for the different cases. /*
4508
4509	pcre_uchar *class_uchardata;
4510	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
4511	BOOL xclass;
4512	pcre_uchar *class_uchardata_base;
4513	#endif
4514
4515	#ifdef PCRE_DEBUG
4516	if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4517	#endif
4518
4519	/ Set up the default and non-default settings for greediness /
4520
4521	greedy_default = ((options & PCRE_UNGREEDY) != `0`);
4522	greedy_non_default = greedy_default ^ `1`;
4523
4524	/ Initialize no first byte, no required byte. REQ_UNSET means "no char*
4525	matching encountered yet". It gets changed to REQ_NONE if we hit something that
4526	matches a non-fixed char first char; reqchar just remains unset if we never
4527	find one.
4528
4529	When we hit a repeat whose minimum is zero, we may have to adjust these values
4530	to take the zero repeat into account. This is implemented by setting them to
4531	zerofirstbyte and zeroreqchar when such a repeat is encountered. The individual
4532	item types that can be repeated set these backoff variables appropriately. /*
4533
4534	firstchar = reqchar = zerofirstchar = zeroreqchar = `0`;
4535	firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4536
4537	/ The variable req_caseopt contains either the REQ_CASELESS value*
4538	or zero, according to the current setting of the caseless flag. The
4539	REQ_CASELESS leaves the lower 28 bit empty. It is added into the
4540	firstchar or reqchar variables to record the case status of the
4541	value. This is used only for ASCII characters. /*
4542
4543	req_caseopt = ((options & PCRE_CASELESS) != `0`)? REQ_CASELESS:`0`;
4544
4545	/ Switch on next character until the end of the branch /
4546
4547	for (;; ptr++)
4548	{
4549	BOOL negate_class;
4550	BOOL should_flip_negation;
4551	BOOL possessive_quantifier;
4552	BOOL is_quantifier;
4553	BOOL is_recurse;
4554	BOOL reset_bracount;
4555	int class_has_8bitchar;
4556	int class_one_char;
4557	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
4558	BOOL xclass_has_prop;
4559	#endif
4560	int newoptions;
4561	int recno;
4562	int refsign;
4563	int skipbytes;
4564	pcre_uint32 subreqchar, subfirstchar;
4565	pcre_int32 subreqcharflags, subfirstcharflags;
4566	int terminator;
4567	unsigned int mclength;
4568	unsigned int tempbracount;
4569	pcre_uint32 ec;
4570	pcre_uchar mcbuffer[`8`];
4571
4572	/ Come here to restart the loop without advancing the pointer. /
4573
4574	REDO_LOOP:
4575
4576	/ Get next character in the pattern /
4577
4578	c = *ptr;
4579
4580	/ If we are at the end of a nested substitution, revert to the outer level*
4581	string. Nesting only happens one level deep. /*
4582
4583	if (c == CHAR_NULL && nestptr != NULL)
4584	{
4585	ptr = nestptr;
4586	nestptr = NULL;
4587	c = *ptr;
4588	}
4589
4590	/ If we are in the pre-compile phase, accumulate the length used for the*
4591	previous cycle of this loop. /*
4592
4593	if (lengthptr != NULL)
4594	{
4595	#ifdef PCRE_DEBUG
4596	if (code > cd->hwm) cd->hwm = code; / High water info /
4597	#endif
4598	if (code > cd->start_workspace + cd->workspace_size -
4599	WORK_SIZE_SAFETY_MARGIN) / Check for overrun /
4600	{
4601	*errorcodeptr = (code >= cd->start_workspace + cd->workspace_size)?
4602	ERR52 : ERR87;
4603	goto FAILED;
4604	}
4605
4606	/ There is at least one situation where code goes backwards: this is the*
4607	case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4608	the class is simply eliminated. However, it is created first, so we have to
4609	allow memory for it. Therefore, don't ever reduce the length at this point.
4610	*/
4611
4612	if (code < last_code) code = last_code;
4613
4614	/ Paranoid check for integer overflow /
4615
4616	if (OFLOW_MAX - *lengthptr < code - last_code)
4617	{
4618	*errorcodeptr = ERR20;
4619	goto FAILED;
4620	}
4621
4622	lengthptr += (int*)(code - last_code);
4623	DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4624	(int)(code - last_code), c, c));
4625
4626	/ If "previous" is set and it is not at the start of the work space, move*
4627	it back to there, in order to avoid filling up the work space. Otherwise,
4628	if "previous" is NULL, reset the current code pointer to the start. /*
4629
4630	if (previous != NULL)
4631	{
4632	if (previous > orig_code)
4633	{
4634	memmove(orig_code, previous, IN_UCHARS(code - previous));
4635	code -= previous - orig_code;
4636	previous = orig_code;
4637	}
4638	}
4639	else code = orig_code;
4640
4641	/ Remember where this code item starts so we can pick up the length*
4642	next time round. /*
4643
4644	last_code = code;
4645	}
4646
4647	/ In the real compile phase, just check the workspace used by the forward*
4648	reference list. /*
4649
4650	else if (cd->hwm > cd->start_workspace + cd->workspace_size)
4651	{
4652	*errorcodeptr = ERR52;
4653	goto FAILED;
4654	}
4655
4656	/ If in \Q...\E, check for the end; if not, we have a literal. Otherwise an*
4657	isolated \E is ignored. /*
4658
4659	if (c != CHAR_NULL)
4660	{
4661	if (c == CHAR_BACKSLASH && ptr[`1`] == CHAR_E)
4662	{
4663	inescq = FALSE;
4664	ptr++;
4665	continue;
4666	}
4667	else if (inescq)
4668	{
4669	if (previous_callout != NULL)
4670	{
4671	if (lengthptr == NULL) / Don't attempt in pre-compile phase /
4672	complete_callout(previous_callout, ptr, cd);
4673	previous_callout = NULL;
4674	}
4675	if ((options & PCRE_AUTO_CALLOUT) != `0`)
4676	{
4677	previous_callout = code;
4678	code = auto_callout(code, ptr, cd);
4679	}
4680	goto NORMAL_CHAR;
4681	}
4682
4683	/ Check for the start of a \Q...\E sequence. We must do this here rather*
4684	than later in case it is immediately followed by \E, which turns it into a
4685	"do nothing" sequence. /*
4686
4687	if (c == CHAR_BACKSLASH && ptr[`1`] == CHAR_Q)
4688	{
4689	inescq = TRUE;
4690	ptr++;
4691	continue;
4692	}
4693	}
4694
4695	/ In extended mode, skip white space and comments. /
4696
4697	if ((options & PCRE_EXTENDED) != `0`)
4698	{
4699	const pcre_uchar *wscptr = ptr;
4700	while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != `0`) c = *(++ptr);
4701	if (c == CHAR_NUMBER_SIGN)
4702	{
4703	ptr++;
4704	while (*ptr != CHAR_NULL)
4705	{
4706	if (IS_NEWLINE(ptr)) / For non-fixed-length newline cases, /
4707	{ / IS_NEWLINE sets cd->nllen. /
4708	ptr += cd->nllen;
4709	break;
4710	}
4711	ptr++;
4712	#ifdef SUPPORT_UTF
4713	if (utf) FORWARDCHAR(ptr);
4714	#endif
4715	}
4716	}
4717
4718	/ If we skipped any characters, restart the loop. Otherwise, we didn't see*
4719	a comment. /*
4720
4721	if (ptr > wscptr) goto REDO_LOOP;
4722	}
4723
4724	/ Skip over (?# comments. We need to do this here because we want to know if*
4725	the next thing is a quantifier, and these comments may come between an item
4726	and its quantifier. /*
4727
4728	if (c == CHAR_LEFT_PARENTHESIS && ptr[`1`] == CHAR_QUESTION_MARK &&
4729	ptr[`2`] == CHAR_NUMBER_SIGN)
4730	{
4731	ptr += `3`;
4732	while (ptr != CHAR_NULL && ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4733	if (*ptr == CHAR_NULL)
4734	{
4735	*errorcodeptr = ERR18;
4736	goto FAILED;
4737	}
4738	continue;
4739	}
4740
4741	/ See if the next thing is a quantifier. /
4742
4743	is_quantifier =
4744	c == CHAR_ASTERISK \|\| c == CHAR_PLUS \|\| c == CHAR_QUESTION_MARK \|\|
4745	(c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+`1`));
4746
4747	/ Fill in length of a previous callout, except when the next thing is a*
4748	quantifier or when processing a property substitution string in UCP mode. /*
4749
4750	if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4751	after_manual_callout-- <= `0`)
4752	{
4753	if (lengthptr == NULL) / Don't attempt in pre-compile phase /
4754	complete_callout(previous_callout, ptr, cd);
4755	previous_callout = NULL;
4756	}
4757
4758	/ Create auto callout, except for quantifiers, or while processing property*
4759	strings that are substituted for \w etc in UCP mode. /*
4760
4761	if ((options & PCRE_AUTO_CALLOUT) != `0` && !is_quantifier && nestptr == NULL)
4762	{
4763	previous_callout = code;
4764	code = auto_callout(code, ptr, cd);
4765	}
4766
4767	/ Process the next pattern item. /
4768
4769	switch(c)
4770	{
4771	/ ===================================================================/
4772	case CHAR_NULL: / The branch terminates at string end /
4773	case CHAR_VERTICAL_LINE: / or \| or ) /
4774	case CHAR_RIGHT_PARENTHESIS:
4775	*firstcharptr = firstchar;
4776	*firstcharflagsptr = firstcharflags;
4777	*reqcharptr = reqchar;
4778	*reqcharflagsptr = reqcharflags;
4779	*codeptr = code;
4780	*ptrptr = ptr;
4781	if (lengthptr != NULL)
4782	{
4783	if (OFLOW_MAX - *lengthptr < code - last_code)
4784	{
4785	*errorcodeptr = ERR20;
4786	goto FAILED;
4787	}
4788	lengthptr += (int)(code - last_code); /* To include callout length /
4789	DPRINTF((">> end branch\n"));
4790	}
4791	return TRUE;
4792
4793
4794	/ ===================================================================/
4795	/ Handle single-character metacharacters. In multiline mode, ^ disables*
4796	the setting of any following char as a first character. /*
4797
4798	case CHAR_CIRCUMFLEX_ACCENT:
4799	previous = NULL;
4800	if ((options & PCRE_MULTILINE) != `0`)
4801	{
4802	if (firstcharflags == REQ_UNSET)
4803	zerofirstcharflags = firstcharflags = REQ_NONE;
4804	*code++ = OP_CIRCM;
4805	}
4806	else *code++ = OP_CIRC;
4807	break;
4808
4809	case CHAR_DOLLAR_SIGN:
4810	previous = NULL;
4811	*code++ = ((options & PCRE_MULTILINE) != `0`)? OP_DOLLM : OP_DOLL;
4812	break;
4813
4814	/ There can never be a first char if '.' is first, whatever happens about*
4815	repeats. The value of reqchar doesn't change either. /*
4816
4817	case CHAR_DOT:
4818	if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4819	zerofirstchar = firstchar;
4820	zerofirstcharflags = firstcharflags;
4821	zeroreqchar = reqchar;
4822	zeroreqcharflags = reqcharflags;
4823	previous = code;
4824	item_hwm_offset = cd->hwm - cd->start_workspace;
4825	*code++ = ((options & PCRE_DOTALL) != `0`)? OP_ALLANY: OP_ANY;
4826	break;
4827
4828
4829	/ ===================================================================/
4830	/ Character classes. If the included characters are all < 256, we build a*
4831	32-byte bitmap of the permitted characters, except in the special case
4832	where there is only one such character. For negated classes, we build the
4833	map as usual, then invert it at the end. However, we use a different opcode
4834	so that data characters > 255 can be handled correctly.
4835
4836	If the class contains characters outside the 0-255 range, a different
4837	opcode is compiled. It may optionally have a bit map for characters < 256,
4838	but those above are are explicitly listed afterwards. A flag byte tells
4839	whether the bitmap is present, and whether this is a negated class or not.
4840
4841	In JavaScript compatibility mode, an isolated ']' causes an error. In
4842	default (Perl) mode, it is treated as a data character. /*
4843
4844	case CHAR_RIGHT_SQUARE_BRACKET:
4845	if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != `0`)
4846	{
4847	*errorcodeptr = ERR64;
4848	goto FAILED;
4849	}
4850	goto NORMAL_CHAR;
4851
4852	/ In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is*
4853	used for "start of word" and "end of word". As these are otherwise illegal
4854	sequences, we don't break anything by recognizing them. They are replaced
4855	by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4856	erroneous and are handled by the normal code below. /*
4857
4858	case CHAR_LEFT_SQUARE_BRACKET:
4859	if (STRNCMP_UC_C8(ptr+`1`, STRING_WEIRD_STARTWORD, `6`) == `0`)
4860	{
4861	nestptr = ptr + `7`;
4862	ptr = sub_start_of_word;
4863	goto REDO_LOOP;
4864	}
4865
4866	if (STRNCMP_UC_C8(ptr+`1`, STRING_WEIRD_ENDWORD, `6`) == `0`)
4867	{
4868	nestptr = ptr + `7`;
4869	ptr = sub_end_of_word;
4870	goto REDO_LOOP;
4871	}
4872
4873	/ Handle a real character class. /
4874
4875	previous = code;
4876	item_hwm_offset = cd->hwm - cd->start_workspace;
4877
4878	/ PCRE supports POSIX class stuff inside a class. Perl gives an error if*
4879	they are encountered at the top level, so we'll do that too. /*
4880
4881	if ((ptr[`1`] == CHAR_COLON \|\| ptr[`1`] == CHAR_DOT \|\|
4882	ptr[`1`] == CHAR_EQUALS_SIGN) &&
4883	check_posix_syntax(ptr, &tempptr))
4884	{
4885	*errorcodeptr = (ptr[`1`] == CHAR_COLON)? ERR13 : ERR31;
4886	goto FAILED;
4887	}
4888
4889	/ If the first character is '^', set the negation flag and skip it. Also,*
4890	if the first few characters (either before or after ^) are \Q\E or \E we
4891	skip them too. This makes for compatibility with Perl. /*
4892
4893	negate_class = FALSE;
4894	for (;;)
4895	{
4896	c = *(++ptr);
4897	if (c == CHAR_BACKSLASH)
4898	{
4899	if (ptr[`1`] == CHAR_E)
4900	ptr++;
4901	else if (STRNCMP_UC_C8(ptr + `1`, STR_Q STR_BACKSLASH STR_E, `3`) == `0`)
4902	ptr += `3`;
4903	else
4904	break;
4905	}
4906	else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4907	negate_class = TRUE;
4908	else break;
4909	}
4910
4911	/ Empty classes are allowed in JavaScript compatibility mode. Otherwise,*
4912	an initial ']' is taken as a data character -- the code below handles
4913	that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4914	[^] must match any character, so generate OP_ALLANY. /*
4915
4916	if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4917	(cd->external_options & PCRE_JAVASCRIPT_COMPAT) != `0`)
4918	{
4919	*code++ = negate_class? OP_ALLANY : OP_FAIL;
4920	if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4921	zerofirstchar = firstchar;
4922	zerofirstcharflags = firstcharflags;
4923	break;
4924	}
4925
4926	/ If a class contains a negative special such as \S, we need to flip the*
4927	negation flag at the end, so that support for characters > 255 works
4928	correctly (they are all included in the class). /*
4929
4930	should_flip_negation = FALSE;
4931
4932	/ Extended class (xclass) will be used when characters > 255*
4933	might match. /*
4934
4935	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
4936	xclass = FALSE;
4937	class_uchardata = code + LINK_SIZE + `2`; / For XCLASS items /
4938	class_uchardata_base = class_uchardata; / Save the start /
4939	#endif
4940
4941	/ For optimization purposes, we track some properties of the class:*
4942	class_has_8bitchar will be non-zero if the class contains at least one <
4943	256 character; class_one_char will be 1 if the class contains just one
4944	character; xclass_has_prop will be TRUE if unicode property checks
4945	are present in the class. /*
4946
4947	class_has_8bitchar = `0`;
4948	class_one_char = `0`;
4949	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
4950	xclass_has_prop = FALSE;
4951	#endif
4952
4953	/ Initialize the 32-char bit map to all zeros. We build the map in a*
4954	temporary bit of memory, in case the class contains fewer than two
4955	8-bit characters because in that case the compiled code doesn't use the bit
4956	map. /*
4957
4958	memset(classbits, `0`, `32` * sizeof(pcre_uint8));
4959
4960	/ Process characters until ] is reached. By writing this as a "do" it*
4961	means that an initial ] is taken as a data character. At the start of the
4962	loop, c contains the first byte of the character. /*
4963
4964	if (c != CHAR_NULL) do
4965	{
4966	const pcre_uchar *oldptr;
4967
4968	#ifdef SUPPORT_UTF
4969	if (utf && HAS_EXTRALEN(c))
4970	{ / Braces are required because the /
4971	GETCHARLEN(c, ptr, ptr); / macro generates multiple statements /
4972	}
4973	#endif
4974
4975	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
4976	/ In the pre-compile phase, accumulate the length of any extra*
4977	data and reset the pointer. This is so that very large classes that
4978	contain a zillion > 255 characters no longer overwrite the work space
4979	(which is on the stack). We have to remember that there was XCLASS data,
4980	however. /*
4981
4982	if (class_uchardata > class_uchardata_base) xclass = TRUE;
4983
4984	if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4985	{
4986	lengthptr += (int*)(class_uchardata - class_uchardata_base);
4987	class_uchardata = class_uchardata_base;
4988	}
4989	#endif
4990
4991	/ Inside \Q...\E everything is literal except \E /
4992
4993	if (inescq)
4994	{
4995	if (c == CHAR_BACKSLASH && ptr[`1`] == CHAR_E) / If we are at \E /
4996	{
4997	inescq = FALSE; / Reset literal state /
4998	ptr++; / Skip the 'E' /
4999	continue; / Carry on with next /
5000	}
5001	goto CHECK_RANGE; / Could be range if \E follows /
5002	}
5003
5004	/ Handle POSIX class names. Perl allows a negation extension of the*
5005	form [:^name:]. A square bracket that doesn't match the syntax is
5006	treated as a literal. We also recognize the POSIX constructions
5007	[.ch.] and [=ch=] ("collating elements") and fault them, as Perl
5008	5.6 and 5.8 do. /*
5009
5010	if (c == CHAR_LEFT_SQUARE_BRACKET &&
5011	(ptr[`1`] == CHAR_COLON \|\| ptr[`1`] == CHAR_DOT \|\|
5012	ptr[`1`] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
5013	{
5014	BOOL local_negate = FALSE;
5015	int posix_class, taboffset, tabopt;
5016	register const pcre_uint8 *cbits = cd->cbits;
5017	pcre_uint8 pbits[`32`];
5018
5019	if (ptr[`1`] != CHAR_COLON)
5020	{
5021	*errorcodeptr = ERR31;
5022	goto FAILED;
5023	}
5024
5025	ptr += `2`;
5026	if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
5027	{
5028	local_negate = TRUE;
5029	should_flip_negation = TRUE; / Note negative special /
5030	ptr++;
5031	}
5032
5033	posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
5034	if (posix_class < `0`)
5035	{
5036	*errorcodeptr = ERR30;
5037	goto FAILED;
5038	}
5039
5040	/ If matching is caseless, upper and lower are converted to*
5041	alpha. This relies on the fact that the class table starts with
5042	alpha, lower, upper as the first 3 entries. /*
5043
5044	if ((options & PCRE_CASELESS) != `0` && posix_class <= `2`)
5045	posix_class = `0`;
5046
5047	/ When PCRE_UCP is set, some of the POSIX classes are converted to*
5048	different escape sequences that use Unicode properties \p or \P. Others
5049	that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
5050	directly. /*
5051
5052	#ifdef SUPPORT_UCP
5053	if ((options & PCRE_UCP) != `0`)
5054	{
5055	unsigned int ptype = `0`;
5056	int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/`2` : `0`);
5057
5058	/ The posix_substitutes table specifies which POSIX classes can be*
5059	converted to \p or \P items. /*
5060
5061	if (posix_substitutes[pc] != NULL)
5062	{
5063	nestptr = tempptr + `1`;
5064	ptr = posix_substitutes[pc] - `1`;
5065	continue;
5066	}
5067
5068	/ There are three other classes that generate special property calls*
5069	that are recognized only in an XCLASS. /*
5070
5071	else switch(posix_class)
5072	{
5073	case PC_GRAPH:
5074	ptype = PT_PXGRAPH;
5075	/ Fall through /
5076	case PC_PRINT:
5077	if (ptype == `0`) ptype = PT_PXPRINT;
5078	/ Fall through /
5079	case PC_PUNCT:
5080	if (ptype == `0`) ptype = PT_PXPUNCT;
5081	*class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5082	*class_uchardata++ = ptype;
5083	*class_uchardata++ = `0`;
5084	xclass_has_prop = TRUE;
5085	ptr = tempptr + `1`;
5086	continue;
5087
5088	/ For the other POSIX classes (ascii, cntrl, xdigit) we are going*
5089	to fall through to the non-UCP case and build a bit map for
5090	characters with code points less than 256. If we are in a negated
5091	POSIX class, characters with code points greater than 255 must
5092	either all match or all not match. In the special case where we
5093	have not yet generated any xclass data, and this is the final item
5094	in the overall class, we need do nothing: later on, the opcode
5095	OP_NCLASS will be used to indicate that characters greater than 255
5096	are acceptable. If we have already seen an xclass item or one may
5097	follow (we have to assume that it might if this is not the end of
5098	the class), explicitly list all wide codepoints, which will then
5099	either not match or match, depending on whether the class is or is
5100	not negated. /*
5101
5102	default:
5103	if (local_negate &&
5104	(xclass \|\| tempptr[`2`] != CHAR_RIGHT_SQUARE_BRACKET))
5105	{
5106	*class_uchardata++ = XCL_RANGE;
5107	class_uchardata += PRIV(ord2utf)(`0x100`, class_uchardata);
5108	class_uchardata += PRIV(ord2utf)(`0x10ffff`, class_uchardata);
5109	}
5110	break;
5111	}
5112	}
5113	#endif
5114	/ In the non-UCP case, or when UCP makes no difference, we build the*
5115	bit map for the POSIX class in a chunk of local store because we may be
5116	adding and subtracting from it, and we don't want to subtract bits that
5117	may be in the main map already. At the end we or the result into the
5118	bit map that is being built. /*
5119
5120	posix_class *= `3`;
5121
5122	/ Copy in the first table (always present) /
5123
5124	memcpy(pbits, cbits + posix_class_maps[posix_class],
5125	`32` * sizeof(pcre_uint8));
5126
5127	/ If there is a second table, add or remove it as required. /
5128
5129	taboffset = posix_class_maps[posix_class + `1`];
5130	tabopt = posix_class_maps[posix_class + `2`];
5131
5132	if (taboffset >= `0`)
5133	{
5134	if (tabopt >= `0`)
5135	for (c = `0`; c < `32`; c++) pbits[c] \|= cbits[c + taboffset];
5136	else
5137	for (c = `0`; c < `32`; c++) pbits[c] &= ~cbits[c + taboffset];
5138	}
5139
5140	/ Now see if we need to remove any special characters. An option*
5141	value of 1 removes vertical space and 2 removes underscore. /*
5142
5143	if (tabopt < `0`) tabopt = -tabopt;
5144	if (tabopt == `1`) pbits[`1`] &= ~`0x3c`;
5145	else if (tabopt == `2`) pbits[`11`] &= `0x7f`;
5146
5147	/ Add the POSIX table or its complement into the main table that is*
5148	being built and we are done. /*
5149
5150	if (local_negate)
5151	for (c = `0`; c < `32`; c++) classbits[c] \|= ~pbits[c];
5152	else
5153	for (c = `0`; c < `32`; c++) classbits[c] \|= pbits[c];
5154
5155	ptr = tempptr + `1`;
5156	/ Every class contains at least one < 256 character. /
5157	class_has_8bitchar = `1`;
5158	/ Every class contains at least two characters. /
5159	class_one_char = `2`;
5160	continue; / End of POSIX syntax handling /
5161	}
5162
5163	/ Backslash may introduce a single character, or it may introduce one*
5164	of the specials, which just set a flag. The sequence \b is a special
5165	case. Inside a class (and only there) it is treated as backspace. We
5166	assume that other escapes have more than one character in them, so
5167	speculatively set both class_has_8bitchar and class_one_char bigger
5168	than one. Unrecognized escapes fall through and are either treated
5169	as literal characters (by default), or are faulted if
5170	PCRE_EXTRA is set. /*
5171
5172	if (c == CHAR_BACKSLASH)
5173	{
5174	escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
5175	TRUE);
5176	if (errorcodeptr != `0`) goto* FAILED;
5177	if (escape == `0`) c = ec;
5178	else if (escape == ESC_b) c = CHAR_BS; / \b is backspace in a class /
5179	else if (escape == ESC_N) / \N is not supported in a class /
5180	{
5181	*errorcodeptr = ERR71;
5182	goto FAILED;
5183	}
5184	else if (escape == ESC_Q) / Handle start of quoted string /
5185	{
5186	if (ptr[`1`] == CHAR_BACKSLASH && ptr[`2`] == CHAR_E)
5187	{
5188	ptr += `2`; / avoid empty string /
5189	}
5190	else inescq = TRUE;
5191	continue;
5192	}
5193	else if (escape == ESC_E) continue; / Ignore orphan \E /
5194
5195	else
5196	{
5197	register const pcre_uint8 *cbits = cd->cbits;
5198	/ Every class contains at least two < 256 characters. /
5199	class_has_8bitchar++;
5200	/ Every class contains at least two characters. /
5201	class_one_char += `2`;
5202
5203	switch (escape)
5204	{
5205	#ifdef SUPPORT_UCP
5206	case ESC_du: / These are the values given for \d etc /
5207	case ESC_DU: / when PCRE_UCP is set. We replace the /
5208	case ESC_wu: / escape sequence with an appropriate \p /
5209	case ESC_WU: / or \P to test Unicode properties instead /
5210	case ESC_su: / of the default ASCII testing. /
5211	case ESC_SU:
5212	nestptr = ptr;
5213	ptr = substitutes[escape - ESC_DU] - `1`; / Just before substitute /
5214	class_has_8bitchar--; / Undo! /
5215	continue;
5216	#endif
5217	case ESC_d:
5218	for (c = `0`; c < `32`; c++) classbits[c] \|= cbits[c+cbit_digit];
5219	continue;
5220
5221	case ESC_D:
5222	should_flip_negation = TRUE;
5223	for (c = `0`; c < `32`; c++) classbits[c] \|= ~cbits[c+cbit_digit];
5224	continue;
5225
5226	case ESC_w:
5227	for (c = `0`; c < `32`; c++) classbits[c] \|= cbits[c+cbit_word];
5228	continue;
5229
5230	case ESC_W:
5231	should_flip_negation = TRUE;
5232	for (c = `0`; c < `32`; c++) classbits[c] \|= ~cbits[c+cbit_word];
5233	continue;
5234
5235	/ Perl 5.004 onwards omitted VT from \s, but restored it at Perl*
5236	5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5237	previously set by something earlier in the character class.
5238	Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5239	we could just adjust the appropriate bit. From PCRE 8.34 we no
5240	longer treat \s and \S specially. /*
5241
5242	case ESC_s:
5243	for (c = `0`; c < `32`; c++) classbits[c] \|= cbits[c+cbit_space];
5244	continue;
5245
5246	case ESC_S:
5247	should_flip_negation = TRUE;
5248	for (c = `0`; c < `32`; c++) classbits[c] \|= ~cbits[c+cbit_space];
5249	continue;
5250
5251	/ The rest apply in both UCP and non-UCP cases. /
5252
5253	case ESC_h:
5254	(void)add_list_to_class(classbits, &class_uchardata, options, cd,
5255	PRIV(hspace_list), NOTACHAR);
5256	continue;
5257
5258	case ESC_H:
5259	(void)add_not_list_to_class(classbits, &class_uchardata, options,
5260	cd, PRIV(hspace_list));
5261	continue;
5262
5263	case ESC_v:
5264	(void)add_list_to_class(classbits, &class_uchardata, options, cd,
5265	PRIV(vspace_list), NOTACHAR);
5266	continue;
5267
5268	case ESC_V:
5269	(void)add_not_list_to_class(classbits, &class_uchardata, options,
5270	cd, PRIV(vspace_list));
5271	continue;
5272
5273	case ESC_p:
5274	case ESC_P:
5275	#ifdef SUPPORT_UCP
5276	{
5277	BOOL negated;
5278	unsigned int ptype = `0`, pdata = `0`;
5279	if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5280	goto FAILED;
5281	*class_uchardata++ = ((escape == ESC_p) != negated)?
5282	XCL_PROP : XCL_NOTPROP;
5283	*class_uchardata++ = ptype;
5284	*class_uchardata++ = pdata;
5285	xclass_has_prop = TRUE;
5286	class_has_8bitchar--; / Undo! /
5287	continue;
5288	}
5289	#else
5290	*errorcodeptr = ERR45;
5291	goto FAILED;
5292	#endif
5293	/ Unrecognized escapes are faulted if PCRE is running in its*
5294	strict mode. By default, for compatibility with Perl, they are
5295	treated as literals. /*
5296
5297	default:
5298	if ((options & PCRE_EXTRA) != `0`)
5299	{
5300	*errorcodeptr = ERR7;
5301	goto FAILED;
5302	}
5303	class_has_8bitchar--; / Undo the speculative increase. /
5304	class_one_char -= `2`; / Undo the speculative increase. /
5305	c = ptr; /* Get the final character and fall through /
5306	break;
5307	}
5308	}
5309
5310	/ Fall through if the escape just defined a single character (c >= 0).*
5311	This may be greater than 256. /*
5312
5313	escape = `0`;
5314
5315	} / End of backslash handling /
5316
5317	/ A character may be followed by '-' to form a range. However, Perl does*
5318	not permit ']' to be the end of the range. A '-' character at the end is
5319	treated as a literal. Perl ignores orphaned \E sequences entirely. The
5320	code for handling \Q and \E is messy. /*
5321
5322	CHECK_RANGE:
5323	while (ptr[`1`] == CHAR_BACKSLASH && ptr[`2`] == CHAR_E)
5324	{
5325	inescq = FALSE;
5326	ptr += `2`;
5327	}
5328	oldptr = ptr;
5329
5330	/ Remember if \r or \n were explicitly used /
5331
5332	if (c == CHAR_CR \|\| c == CHAR_NL) cd->external_flags \|= PCRE_HASCRORLF;
5333
5334	/ Check for range /
5335
5336	if (!inescq && ptr[`1`] == CHAR_MINUS)
5337	{
5338	pcre_uint32 d;
5339	ptr += `2`;
5340	while (*ptr == CHAR_BACKSLASH && ptr[`1`] == CHAR_E) ptr += `2`;
5341
5342	/ If we hit \Q (not followed by \E) at this point, go into escaped*
5343	mode. /*
5344
5345	while (*ptr == CHAR_BACKSLASH && ptr[`1`] == CHAR_Q)
5346	{
5347	ptr += `2`;
5348	if (*ptr == CHAR_BACKSLASH && ptr[`1`] == CHAR_E)
5349	{ ptr += `2`; continue; }
5350	inescq = TRUE;
5351	break;
5352	}
5353
5354	/ Minus (hyphen) at the end of a class is treated as a literal, so put*
5355	back the pointer and jump to handle the character that preceded it. /*
5356
5357	if (ptr == CHAR_NULL \|\| (!inescq && ptr == CHAR_RIGHT_SQUARE_BRACKET))
5358	{
5359	ptr = oldptr;
5360	goto CLASS_SINGLE_CHARACTER;
5361	}
5362
5363	/ Otherwise, we have a potential range; pick up the next character /
5364
5365	#ifdef SUPPORT_UTF
5366	if (utf)
5367	{ / Braces are required because the /
5368	GETCHARLEN(d, ptr, ptr); / macro generates multiple statements /
5369	}
5370	else
5371	#endif
5372	d = ptr; /* Not UTF-8 mode /
5373
5374	/ The second part of a range can be a single-character escape*
5375	sequence, but not any of the other escapes. Perl treats a hyphen as a
5376	literal in such circumstances. However, in Perl's warning mode, a
5377	warning is given, so PCRE now faults it as it is almost certainly a
5378	mistake on the user's part. /*
5379
5380	if (!inescq)
5381	{
5382	if (d == CHAR_BACKSLASH)
5383	{
5384	int descape;
5385	descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5386	if (errorcodeptr != `0`) goto* FAILED;
5387
5388	/ 0 means a character was put into d; \b is backspace; any other*
5389	special causes an error. /*
5390
5391	if (descape != `0`)
5392	{
5393	if (descape == ESC_b) d = CHAR_BS; else
5394	{
5395	*errorcodeptr = ERR83;
5396	goto FAILED;
5397	}
5398	}
5399	}
5400
5401	/ A hyphen followed by a POSIX class is treated in the same way. /
5402
5403	else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5404	(ptr[`1`] == CHAR_COLON \|\| ptr[`1`] == CHAR_DOT \|\|
5405	ptr[`1`] == CHAR_EQUALS_SIGN) &&
5406	check_posix_syntax(ptr, &tempptr))
5407	{
5408	*errorcodeptr = ERR83;
5409	goto FAILED;
5410	}
5411	}
5412
5413	/ Check that the two values are in the correct order. Optimize*
5414	one-character ranges. /*
5415
5416	if (d < c)
5417	{
5418	*errorcodeptr = ERR8;
5419	goto FAILED;
5420	}
5421	if (d == c) goto CLASS_SINGLE_CHARACTER; / A few lines below /
5422
5423	/ We have found a character range, so single character optimizations*
5424	cannot be done anymore. Any value greater than 1 indicates that there
5425	is more than one character. /*
5426
5427	class_one_char = `2`;
5428
5429	/ Remember an explicit \r or \n, and add the range to the class. /
5430
5431	if (d == CHAR_CR \|\| d == CHAR_NL) cd->external_flags \|= PCRE_HASCRORLF;
5432
5433	class_has_8bitchar +=
5434	add_to_class(classbits, &class_uchardata, options, cd, c, d);
5435
5436	continue; / Go get the next char in the class /
5437	}
5438
5439	/ Handle a single character - we can get here for a normal non-escape*
5440	char, or after \ that introduces a single character or for an apparent
5441	range that isn't. Only the value 1 matters for class_one_char, so don't
5442	increase it if it is already 2 or more ... just in case there's a class
5443	with a zillion characters in it. /*
5444
5445	CLASS_SINGLE_CHARACTER:
5446	if (class_one_char < `2`) class_one_char++;
5447
5448	/ If xclass_has_prop is false and class_one_char is 1, we have the first*
5449	single character in the class, and there have been no prior ranges, or
5450	XCLASS items generated by escapes. If this is the final character in the
5451	class, we can optimize by turning the item into a 1-character OP_CHAR[I]
5452	if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
5453	can cause firstchar to be set. Otherwise, there can be no first char if
5454	this item is first, whatever repeat count may follow. In the case of
5455	reqchar, save the previous value for reinstating. /*
5456
5457	if (!inescq &&
5458	#ifdef SUPPORT_UCP
5459	!xclass_has_prop &&
5460	#endif
5461	class_one_char == `1` && ptr[`1`] == CHAR_RIGHT_SQUARE_BRACKET)
5462	{
5463	ptr++;
5464	zeroreqchar = reqchar;
5465	zeroreqcharflags = reqcharflags;
5466
5467	if (negate_class)
5468	{
5469	#ifdef SUPPORT_UCP
5470	int d;
5471	#endif
5472	if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5473	zerofirstchar = firstchar;
5474	zerofirstcharflags = firstcharflags;
5475
5476	/ For caseless UTF-8 mode when UCP support is available, check*
5477	whether this character has more than one other case. If so, generate
5478	a special OP_NOTPROP item instead of OP_NOTI. /*
5479
5480	#ifdef SUPPORT_UCP
5481	if (utf && (options & PCRE_CASELESS) != `0` &&
5482	(d = UCD_CASESET(c)) != `0`)
5483	{
5484	*code++ = OP_NOTPROP;
5485	*code++ = PT_CLIST;
5486	*code++ = d;
5487	}
5488	else
5489	#endif
5490	/ Char has only one other case, or UCP not available /
5491
5492	{
5493	*code++ = ((options & PCRE_CASELESS) != `0`)? OP_NOTI: OP_NOT;
5494	#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5495	if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5496	code += PRIV(ord2utf)(c, code);
5497	else
5498	#endif
5499	*code++ = c;
5500	}
5501
5502	/ We are finished with this character class /
5503
5504	goto END_CLASS;
5505	}
5506
5507	/ For a single, positive character, get the value into mcbuffer, and*
5508	then we can handle this with the normal one-character code. /*
5509
5510	#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5511	if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5512	mclength = PRIV(ord2utf)(c, mcbuffer);
5513	else
5514	#endif
5515	{
5516	mcbuffer[`0`] = c;
5517	mclength = `1`;
5518	}
5519	goto ONE_CHAR;
5520	} / End of 1-char optimization /
5521
5522	/ There is more than one character in the class, or an XCLASS item*
5523	has been generated. Add this character to the class. /*
5524
5525	class_has_8bitchar +=
5526	add_to_class(classbits, &class_uchardata, options, cd, c, c);
5527	}
5528
5529	/ Loop until ']' reached. This "while" is the end of the "do" far above.*
5530	If we are at the end of an internal nested string, revert to the outer
5531	string. /*
5532
5533	while (((c = *(++ptr)) != CHAR_NULL \|\|
5534	(nestptr != NULL &&
5535	(ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5536	(c != CHAR_RIGHT_SQUARE_BRACKET \|\| inescq));
5537
5538	/ Check for missing terminating ']' /
5539
5540	if (c == CHAR_NULL)
5541	{
5542	*errorcodeptr = ERR6;
5543	goto FAILED;
5544	}
5545
5546	/ We will need an XCLASS if data has been placed in class_uchardata. In*
5547	the second phase this is a sufficient test. However, in the pre-compile
5548	phase, class_uchardata gets emptied to prevent workspace overflow, so
5549	only if the very last character in the class needs XCLASS will it contain
5550	anything at this point. For this reason, xclass gets set TRUE above when
5551	class_uchardata is emptied, and that's why this code is the way it is here
5552	instead of just doing a test on class_uchardata below. /*
5553
5554	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
5555	if (class_uchardata > class_uchardata_base) xclass = TRUE;
5556	#endif
5557
5558	/ If this is the first thing in the branch, there can be no first char*
5559	setting, whatever the repeat count. Any reqchar setting must remain
5560	unchanged after any kind of repeat. /*
5561
5562	if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5563	zerofirstchar = firstchar;
5564	zerofirstcharflags = firstcharflags;
5565	zeroreqchar = reqchar;
5566	zeroreqcharflags = reqcharflags;
5567
5568	/ If there are characters with values > 255, we have to compile an*
5569	extended class, with its own opcode, unless there was a negated special
5570	such as \S in the class, and PCRE_UCP is not set, because in that case all
5571	characters > 255 are in the class, so any that were explicitly given as
5572	well can be ignored. If (when there are explicit characters > 255 that must
5573	be listed) there are no characters < 256, we can omit the bitmap in the
5574	actual compiled code. /*
5575
5576	#ifdef SUPPORT_UTF
5577	if (xclass && (xclass_has_prop \|\| !should_flip_negation \|\|
5578	(options & PCRE_UCP) != `0`))
5579	#elif !defined COMPILE_PCRE8
5580	if (xclass && (xclass_has_prop \|\| !should_flip_negation))
5581	#endif
5582	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
5583	{
5584	/ For non-UCP wide characters, in a non-negative class containing \S or*
5585	similar (should_flip_negation is set), all characters greater than 255
5586	must be in the class. /*
5587
5588	if (
5589	#if defined COMPILE_PCRE8
5590	utf &&
5591	#endif
5592	should_flip_negation && !negate_class && (options & PCRE_UCP) == `0`)
5593	{
5594	*class_uchardata++ = XCL_RANGE;
5595	if (utf) / Will always be utf in the 8-bit library /
5596	{
5597	class_uchardata += PRIV(ord2utf)(`0x100`, class_uchardata);
5598	class_uchardata += PRIV(ord2utf)(`0x10ffff`, class_uchardata);
5599	}
5600	else / Can only happen for the 16-bit & 32-bit libraries /
5601	{
5602	#if defined COMPILE_PCRE16
5603	*class_uchardata++ = `0x100`;
5604	*class_uchardata++ = `0xffffu`;
5605	#elif defined COMPILE_PCRE32
5606	*class_uchardata++ = `0x100`;
5607	*class_uchardata++ = `0xffffffffu`;
5608	#endif
5609	}
5610	}
5611
5612	class_uchardata++ = XCL_END; /* Marks the end of extra data /
5613	*code++ = OP_XCLASS;
5614	code += LINK_SIZE;
5615	*code = negate_class? XCL_NOT:`0`;
5616	if (xclass_has_prop) *code \|= XCL_HASPROP;
5617
5618	/ If the map is required, move up the extra data to make room for it;*
5619	otherwise just move the code pointer to the end of the extra data. /*
5620
5621	if (class_has_8bitchar > `0`)
5622	{
5623	*code++ \|= XCL_MAP;
5624	memmove(code + (`32` / sizeof(pcre_uchar)), code,
5625	IN_UCHARS(class_uchardata - code));
5626	if (negate_class && !xclass_has_prop)
5627	for (c = `0`; c < `32`; c++) classbits[c] = ~classbits[c];
5628	memcpy(code, classbits, `32`);
5629	code = class_uchardata + (`32` / sizeof(pcre_uchar));
5630	}
5631	else code = class_uchardata;
5632
5633	/ Now fill in the complete length of the item /
5634
5635	PUT(previous, `1`, (int)(code - previous));
5636	break; / End of class handling /
5637	}
5638
5639	/ Even though any XCLASS list is now discarded, we must allow for*
5640	its memory. /*
5641
5642	if (lengthptr != NULL)
5643	lengthptr += (int*)(class_uchardata - class_uchardata_base);
5644	#endif
5645
5646	/ If there are no characters > 255, or they are all to be included or*
5647	excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5648	whole class was negated and whether there were negative specials such as \S
5649	(non-UCP) in the class. Then copy the 32-byte map into the code vector,
5650	negating it if necessary. /*
5651
5652	*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5653	if (lengthptr == NULL) / Save time in the pre-compile phase /
5654	{
5655	if (negate_class)
5656	for (c = `0`; c < `32`; c++) classbits[c] = ~classbits[c];
5657	memcpy(code, classbits, `32`);
5658	}
5659	code += `32` / sizeof(pcre_uchar);
5660
5661	END_CLASS:
5662	break;
5663
5664
5665	/ ===================================================================/
5666	/ Various kinds of repeat; '{' is not necessarily a quantifier, but this*
5667	has been tested above. /*
5668
5669	case CHAR_LEFT_CURLY_BRACKET:
5670	if (!is_quantifier) goto NORMAL_CHAR;
5671	ptr = read_repeat_counts(ptr+`1`, &repeat_min, &repeat_max, errorcodeptr);
5672	if (errorcodeptr != `0`) goto* FAILED;
5673	goto REPEAT;
5674
5675	case CHAR_ASTERISK:
5676	repeat_min = `0`;
5677	repeat_max = -`1`;
5678	goto REPEAT;
5679
5680	case CHAR_PLUS:
5681	repeat_min = `1`;
5682	repeat_max = -`1`;
5683	goto REPEAT;
5684
5685	case CHAR_QUESTION_MARK:
5686	repeat_min = `0`;
5687	repeat_max = `1`;
5688
5689	REPEAT:
5690	if (previous == NULL)
5691	{
5692	*errorcodeptr = ERR9;
5693	goto FAILED;
5694	}
5695
5696	if (repeat_min == `0`)
5697	{
5698	firstchar = zerofirstchar; / Adjust for zero repeat /
5699	firstcharflags = zerofirstcharflags;
5700	reqchar = zeroreqchar; / Ditto /
5701	reqcharflags = zeroreqcharflags;
5702	}
5703
5704	/ Remember whether this is a variable length repeat /
5705
5706	reqvary = (repeat_min == repeat_max)? `0` : REQ_VARY;
5707
5708	op_type = `0`; / Default single-char op codes /
5709	possessive_quantifier = FALSE; / Default not possessive quantifier /
5710
5711	/ Save start of previous item, in case we have to move it up in order to*
5712	insert something before it. /*
5713
5714	tempcode = previous;
5715
5716	/ Before checking for a possessive quantifier, we must skip over*
5717	whitespace and comments in extended mode because Perl allows white space at
5718	this point. /*
5719
5720	if ((options & PCRE_EXTENDED) != `0`)
5721	{
5722	const pcre_uchar *p = ptr + `1`;
5723	for (;;)
5724	{
5725	while (MAX_255(p) && (cd->ctypes[p] & ctype_space) != `0`) p++;
5726	if (p != CHAR_NUMBER_SIGN) break*;
5727	p++;
5728	while (*p != CHAR_NULL)
5729	{
5730	if (IS_NEWLINE(p)) / For non-fixed-length newline cases, /
5731	{ / IS_NEWLINE sets cd->nllen. /
5732	p += cd->nllen;
5733	break;
5734	}
5735	p++;
5736	#ifdef SUPPORT_UTF
5737	if (utf) FORWARDCHAR(p);
5738	#endif
5739	} / Loop for comment characters /
5740	} / Loop for multiple comments /
5741	ptr = p - `1`; / Character before the next significant one. /
5742	}
5743
5744	/ If the next character is '+', we have a possessive quantifier. This*
5745	implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5746	If the next character is '?' this is a minimizing repeat, by default,
5747	but if PCRE_UNGREEDY is set, it works the other way round. We change the
5748	repeat type to the non-default. /*
5749
5750	if (ptr[`1`] == CHAR_PLUS)
5751	{
5752	repeat_type = `0`; / Force greedy /
5753	possessive_quantifier = TRUE;
5754	ptr++;
5755	}
5756	else if (ptr[`1`] == CHAR_QUESTION_MARK)
5757	{
5758	repeat_type = greedy_non_default;
5759	ptr++;
5760	}
5761	else repeat_type = greedy_default;
5762
5763	/ If previous was a recursion call, wrap it in atomic brackets so that*
5764	previous becomes the atomic group. All recursions were so wrapped in the
5765	past, but it no longer happens for non-repeated recursions. In fact, the
5766	repeated ones could be re-implemented independently so as not to need this,
5767	but for the moment we rely on the code for repeating groups. /*
5768
5769	if (*previous == OP_RECURSE)
5770	{
5771	memmove(previous + `1` + LINK_SIZE, previous, IN_UCHARS(`1` + LINK_SIZE));
5772	*previous = OP_ONCE;
5773	PUT(previous, `1`, `2` + `2`*LINK_SIZE);
5774	previous[`2` + `2`*LINK_SIZE] = OP_KET;
5775	PUT(previous, `3` + `2`LINK_SIZE, `2` + `2`LINK_SIZE);
5776	code += `2` + `2` * LINK_SIZE;
5777	length_prevgroup = `3` + `3`*LINK_SIZE;
5778
5779	/ When actually compiling, we need to check whether this was a forward*
5780	reference, and if so, adjust the offset. /*
5781
5782	if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5783	{
5784	int offset = GET(cd->hwm, -LINK_SIZE);
5785	if (offset == previous + `1` - cd->start_code)
5786	PUT(cd->hwm, -LINK_SIZE, offset + `1` + LINK_SIZE);
5787	}
5788	}
5789
5790	/ Now handle repetition for the different types of item. /
5791
5792	/ If previous was a character or negated character match, abolish the item*
5793	and generate a repeat item instead. If a char item has a minimum of more
5794	than one, ensure that it is set in reqchar - it might not be if a sequence
5795	such as x{3} is the first thing in a branch because the x will have gone
5796	into firstchar instead. /*
5797
5798	if (previous == OP_CHAR \|\| previous == OP_CHARI
5799	\|\| previous == OP_NOT \|\| previous == OP_NOTI)
5800	{
5801	switch (*previous)
5802	{
5803	default: / Make compiler happy. /
5804	case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
5805	case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5806	case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
5807	case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
5808	}
5809
5810	/ Deal with UTF characters that take up more than one character. It's*
5811	easier to write this out separately than try to macrify it. Use c to
5812	hold the length of the character in bytes, plus UTF_LENGTH to flag that
5813	it's a length rather than a small character. /*
5814
5815	#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5816	if (utf && NOT_FIRSTCHAR(code[-`1`]))
5817	{
5818	pcre_uchar *lastchar = code - `1`;
5819	BACKCHAR(lastchar);
5820	c = (int)(code - lastchar); / Length of UTF-8 character /
5821	memcpy(utf_chars, lastchar, IN_UCHARS(c)); / Save the char /
5822	c \|= UTF_LENGTH; / Flag c as a length /
5823	}
5824	else
5825	#endif /* SUPPORT_UTF */
5826
5827	/* Handle the case of a single character - either with no UTF support, or
5828	with UTF disabled, or for a single character UTF character. */
5829	{
5830	c = code[-`1`];
5831	if (*previous <= OP_CHARI && repeat_min > `1`)
5832	{
5833	reqchar = c;
5834	reqcharflags = req_caseopt \| cd->req_varyopt;
5835	}
5836	}
5837
5838	goto OUTPUT_SINGLE_REPEAT; / Code shared with single character types /
5839	}
5840
5841	/* If previous was a character type match (\d or similar), abolish it and
5842	create a suitable repeat item. The code is shared with single-character
5843	repeats by setting op_type to add a suitable offset into repeat_type. Note
5844	that the Unicode property types will be present only when SUPPORT_UCP is
5845	defined, but we don't wrap the little bits of code here because it just
5846	makes it horribly messy. */
5847
5848	else if (*previous < OP_EODN)
5849	{
5850	pcre_uchar *oldcode;
5851	int prop_type, prop_value;
5852	op_type = OP_TYPESTAR - OP_STAR; / Use type opcodes /
5853	c = *previous;
5854
5855	OUTPUT_SINGLE_REPEAT:
5856	if (previous == OP_PROP \|\| previous == OP_NOTPROP)
5857	{
5858	prop_type = previous[`1`];
5859	prop_value = previous[`2`];
5860	}
5861	else prop_type = prop_value = -`1`;
5862
5863	oldcode = code;
5864	code = previous; / Usually overwrite previous item /
5865
5866	/ If the maximum is zero then the minimum must also be zero; Perl allows*
5867	this case, so we do too - by simply omitting the item altogether. /*
5868
5869	if (repeat_max == `0`) goto END_REPEAT;
5870
5871	/ Combine the op_type with the repeat_type /
5872
5873	repeat_type += op_type;
5874
5875	/ A minimum of zero is handled either as the special case * or ?, or as*
5876	an UPTO, with the maximum given. /*
5877
5878	if (repeat_min == `0`)
5879	{
5880	if (repeat_max == -`1`) *code++ = OP_STAR + repeat_type;
5881	else if (repeat_max == `1`) *code++ = OP_QUERY + repeat_type;
5882	else
5883	{
5884	*code++ = OP_UPTO + repeat_type;
5885	PUT2INC(code, `0`, repeat_max);
5886	}
5887	}
5888
5889	/ A repeat minimum of 1 is optimized into some special cases. If the*
5890	maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5891	left in place and, if the maximum is greater than 1, we use OP_UPTO with
5892	one less than the maximum. /*
5893
5894	else if (repeat_min == `1`)
5895	{
5896	if (repeat_max == -`1`)
5897	*code++ = OP_PLUS + repeat_type;
5898	else
5899	{
5900	code = oldcode; / leave previous item in place /
5901	if (repeat_max == `1`) goto END_REPEAT;
5902	*code++ = OP_UPTO + repeat_type;
5903	PUT2INC(code, `0`, repeat_max - `1`);
5904	}
5905	}
5906
5907	/ The case {n,n} is just an EXACT, while the general case {n,m} is*
5908	handled as an EXACT followed by an UPTO. /*
5909
5910	else
5911	{
5912	code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type /
5913	PUT2INC(code, `0`, repeat_min);
5914
5915	/ If the maximum is unlimited, insert an OP_STAR. Before doing so,*
5916	we have to insert the character for the previous code. For a repeated
5917	Unicode property match, there are two extra bytes that define the
5918	required property. In UTF-8 mode, long characters have their length in
5919	c, with the UTF_LENGTH bit as a flag. /*
5920
5921	if (repeat_max < `0`)
5922	{
5923	#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5924	if (utf && (c & UTF_LENGTH) != `0`)
5925	{
5926	memcpy(code, utf_chars, IN_UCHARS(c & `7`));
5927	code += c & `7`;
5928	}
5929	else
5930	#endif
5931	{
5932	*code++ = c;
5933	if (prop_type >= `0`)
5934	{
5935	*code++ = prop_type;
5936	*code++ = prop_value;
5937	}
5938	}
5939	*code++ = OP_STAR + repeat_type;
5940	}
5941
5942	/ Else insert an UPTO if the max is greater than the min, again*
5943	preceded by the character, for the previously inserted code. If the
5944	UPTO is just for 1 instance, we can use QUERY instead. /*
5945
5946	else if (repeat_max != repeat_min)
5947	{
5948	#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5949	if (utf && (c & UTF_LENGTH) != `0`)
5950	{
5951	memcpy(code, utf_chars, IN_UCHARS(c & `7`));
5952	code += c & `7`;
5953	}
5954	else
5955	#endif
5956	*code++ = c;
5957	if (prop_type >= `0`)
5958	{
5959	*code++ = prop_type;
5960	*code++ = prop_value;
5961	}
5962	repeat_max -= repeat_min;
5963
5964	if (repeat_max == `1`)
5965	{
5966	*code++ = OP_QUERY + repeat_type;
5967	}
5968	else
5969	{
5970	*code++ = OP_UPTO + repeat_type;
5971	PUT2INC(code, `0`, repeat_max);
5972	}
5973	}
5974	}
5975
5976	/ The character or character type itself comes last in all cases. /
5977
5978	#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5979	if (utf && (c & UTF_LENGTH) != `0`)
5980	{
5981	memcpy(code, utf_chars, IN_UCHARS(c & `7`));
5982	code += c & `7`;
5983	}
5984	else
5985	#endif
5986	*code++ = c;
5987
5988	/ For a repeated Unicode property match, there are two extra bytes that*
5989	define the required property. /*
5990
5991	#ifdef SUPPORT_UCP
5992	if (prop_type >= `0`)
5993	{
5994	*code++ = prop_type;
5995	*code++ = prop_value;
5996	}
5997	#endif
5998	}
5999
6000	/ If previous was a character class or a back reference, we put the repeat*
6001	stuff after it, but just skip the item if the repeat was {0,0}. /*
6002
6003	else if (previous == OP_CLASS \|\| previous == OP_NCLASS \|\|
6004	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
6005	*previous == OP_XCLASS \|\|
6006	#endif
6007	previous == OP_REF \|\| previous == OP_REFI \|\|
6008	previous == OP_DNREF \|\| previous == OP_DNREFI)
6009	{
6010	if (repeat_max == `0`)
6011	{
6012	code = previous;
6013	goto END_REPEAT;
6014	}
6015
6016	if (repeat_min == `0` && repeat_max == -`1`)
6017	*code++ = OP_CRSTAR + repeat_type;
6018	else if (repeat_min == `1` && repeat_max == -`1`)
6019	*code++ = OP_CRPLUS + repeat_type;
6020	else if (repeat_min == `0` && repeat_max == `1`)
6021	*code++ = OP_CRQUERY + repeat_type;
6022	else
6023	{
6024	*code++ = OP_CRRANGE + repeat_type;
6025	PUT2INC(code, `0`, repeat_min);
6026	if (repeat_max == -`1`) repeat_max = `0`; / 2-byte encoding for max /
6027	PUT2INC(code, `0`, repeat_max);
6028	}
6029	}
6030
6031	/ If previous was a bracket group, we may have to replicate it in certain*
6032	cases. Note that at this point we can encounter only the "basic" bracket
6033	opcodes such as BRA and CBRA, as this is the place where they get converted
6034	into the more special varieties such as BRAPOS and SBRA. A test for >=
6035	OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
6036	ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
6037	Originally, PCRE did not allow repetition of assertions, but now it does,
6038	for Perl compatibility. /*
6039
6040	else if (previous >= OP_ASSERT && previous <= OP_COND)
6041	{
6042	register int i;
6043	int len = (int)(code - previous);
6044	size_t base_hwm_offset = item_hwm_offset;
6045	pcre_uchar *bralink = NULL;
6046	pcre_uchar *brazeroptr = NULL;
6047
6048	/ Repeating a DEFINE group is pointless, but Perl allows the syntax, so*
6049	we just ignore the repeat. /*
6050
6051	if (*previous == OP_COND && previous[LINK_SIZE+`1`] == OP_DEF)
6052	goto END_REPEAT;
6053
6054	/ There is no sense in actually repeating assertions. The only potential*
6055	use of repetition is in cases when the assertion is optional. Therefore,
6056	if the minimum is greater than zero, just ignore the repeat. If the
6057	maximum is not zero or one, set it to 1. /*
6058
6059	if (previous < OP_ONCE) /* Assertion /
6060	{
6061	if (repeat_min > `0`) goto END_REPEAT;
6062	if (repeat_max < `0` \|\| repeat_max > `1`) repeat_max = `1`;
6063	}
6064
6065	/ The case of a zero minimum is special because of the need to stick*
6066	OP_BRAZERO in front of it, and because the group appears once in the
6067	data, whereas in other cases it appears the minimum number of times. For
6068	this reason, it is simplest to treat this case separately, as otherwise
6069	the code gets far too messy. There are several special subcases when the
6070	minimum is zero. /*
6071
6072	if (repeat_min == `0`)
6073	{
6074	/ If the maximum is also zero, we used to just omit the group from the*
6075	output altogether, like this:
6076
6077	** if (repeat_max == 0)
6078	** {
6079	** code = previous;
6080	** goto END_REPEAT;
6081	** }
6082
6083	However, that fails when a group or a subgroup within it is referenced
6084	as a subroutine from elsewhere in the pattern, so now we stick in
6085	OP_SKIPZERO in front of it so that it is skipped on execution. As we
6086	don't have a list of which groups are referenced, we cannot do this
6087	selectively.
6088
6089	If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
6090	and do no more at this point. However, we do need to adjust any
6091	OP_RECURSE calls inside the group that refer to the group itself or any
6092	internal or forward referenced group, because the offset is from the
6093	start of the whole regex. Temporarily terminate the pattern while doing
6094	this. /*
6095
6096	if (repeat_max <= `1`) / Covers 0, 1, and unlimited /
6097	{
6098	*code = OP_END;
6099	adjust_recurse(previous, `1`, utf, cd, item_hwm_offset);
6100	memmove(previous + `1`, previous, IN_UCHARS(len));
6101	code++;
6102	if (repeat_max == `0`)
6103	{
6104	*previous++ = OP_SKIPZERO;
6105	goto END_REPEAT;
6106	}
6107	brazeroptr = previous; / Save for possessive optimizing /
6108	*previous++ = OP_BRAZERO + repeat_type;
6109	}
6110
6111	/* If the maximum is greater than 1 and limited, we have to replicate
6112	in a nested fashion, sticking OP_BRAZERO before each set of brackets.
6113	The first one has to be handled carefully because it's the original
6114	copy, which has to be moved up. The remainder can be handled by code
6115	that is common with the non-zero minimum case below. We have to
6116	adjust the value of repeat_max, since one less copy is required. Once
6117	again, we may have to adjust any OP_RECURSE calls inside the group. */
6118
6119	else
6120	{
6121	int offset;
6122	*code = OP_END;
6123	adjust_recurse(previous, `2` + LINK_SIZE, utf, cd, item_hwm_offset);
6124	memmove(previous + `2` + LINK_SIZE, previous, IN_UCHARS(len));
6125	code += `2` + LINK_SIZE;
6126	*previous++ = OP_BRAZERO + repeat_type;
6127	*previous++ = OP_BRA;
6128
6129	/ We chain together the bracket offset fields that have to be*
6130	filled in later when the ends of the brackets are reached. /*
6131
6132	offset = (bralink == NULL)? `0` : (int)(previous - bralink);
6133	bralink = previous;
6134	PUTINC(previous, `0`, offset);
6135	}
6136
6137	repeat_max--;
6138	}
6139
6140	/ If the minimum is greater than zero, replicate the group as many*
6141	times as necessary, and adjust the maximum to the number of subsequent
6142	copies that we need. If we set a first char from the group, and didn't
6143	set a required char, copy the latter from the former. If there are any
6144	forward reference subroutine calls in the group, there will be entries on
6145	the workspace list; replicate these with an appropriate increment. /*
6146
6147	else
6148	{
6149	if (repeat_min > `1`)
6150	{
6151	/ In the pre-compile phase, we don't actually do the replication. We*
6152	just adjust the length as if we had. Do some paranoid checks for
6153	potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6154	integer type when available, otherwise double. /*
6155
6156	if (lengthptr != NULL)
6157	{
6158	int delta = (repeat_min - `1`)*length_prevgroup;
6159	if ((INT64_OR_DOUBLE)(repeat_min - `1`)*
6160	(INT64_OR_DOUBLE)length_prevgroup >
6161	(INT64_OR_DOUBLE)INT_MAX \|\|
6162	OFLOW_MAX - *lengthptr < delta)
6163	{
6164	*errorcodeptr = ERR20;
6165	goto FAILED;
6166	}
6167	*lengthptr += delta;
6168	}
6169
6170	/ This is compiling for real. If there is a set first byte for*
6171	the group, and we have not yet set a "required byte", set it. Make
6172	sure there is enough workspace for copying forward references before
6173	doing the copy. /*
6174
6175	else
6176	{
6177	if (groupsetfirstchar && reqcharflags < `0`)
6178	{
6179	reqchar = firstchar;
6180	reqcharflags = firstcharflags;
6181	}
6182
6183	for (i = `1`; i < repeat_min; i++)
6184	{
6185	pcre_uchar *hc;
6186	size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6187	memcpy(code, previous, IN_UCHARS(len));
6188
6189	while (cd->hwm > cd->start_workspace + cd->workspace_size -
6190	WORK_SIZE_SAFETY_MARGIN -
6191	(this_hwm_offset - base_hwm_offset))
6192	{
6193	*errorcodeptr = expand_workspace(cd);
6194	if (errorcodeptr != `0`) goto* FAILED;
6195	}
6196
6197	for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6198	hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6199	hc += LINK_SIZE)
6200	{
6201	PUT(cd->hwm, `0`, GET(hc, `0`) + len);
6202	cd->hwm += LINK_SIZE;
6203	}
6204	base_hwm_offset = this_hwm_offset;
6205	code += len;
6206	}
6207	}
6208	}
6209
6210	if (repeat_max > `0`) repeat_max -= repeat_min;
6211	}
6212
6213	/ This code is common to both the zero and non-zero minimum cases. If*
6214	the maximum is limited, it replicates the group in a nested fashion,
6215	remembering the bracket starts on a stack. In the case of a zero minimum,
6216	the first one was set up above. In all cases the repeat_max now specifies
6217	the number of additional copies needed. Again, we must remember to
6218	replicate entries on the forward reference list. /*
6219
6220	if (repeat_max >= `0`)
6221	{
6222	/* In the pre-compile phase, we don't actually do the replication. We
6223	just adjust the length as if we had. For each repetition we must add 1
6224	to the length for BRAZERO and for all but the last repetition we must
6225	add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
6226	paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
6227	a 64-bit integer type when available, otherwise double. */
6228
6229	if (lengthptr != NULL && repeat_max > `0`)
6230	{
6231	int delta = repeat_max * (length_prevgroup + `1` + `2` + `2`*LINK_SIZE) -
6232	`2` - `2`LINK_SIZE; /* Last one doesn't nest /
6233	if ((INT64_OR_DOUBLE)repeat_max *
6234	(INT64_OR_DOUBLE)(length_prevgroup + `1` + `2` + `2`*LINK_SIZE)
6235	> (INT64_OR_DOUBLE)INT_MAX \|\|
6236	OFLOW_MAX - *lengthptr < delta)
6237	{
6238	*errorcodeptr = ERR20;
6239	goto FAILED;
6240	}
6241	*lengthptr += delta;
6242	}
6243
6244	/ This is compiling for real /
6245
6246	else for (i = repeat_max - `1`; i >= `0`; i--)
6247	{
6248	pcre_uchar *hc;
6249	size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6250
6251	*code++ = OP_BRAZERO + repeat_type;
6252
6253	/ All but the final copy start a new nesting, maintaining the*
6254	chain of brackets outstanding. /*
6255
6256	if (i != `0`)
6257	{
6258	int offset;
6259	*code++ = OP_BRA;
6260	offset = (bralink == NULL)? `0` : (int)(code - bralink);
6261	bralink = code;
6262	PUTINC(code, `0`, offset);
6263	}
6264
6265	memcpy(code, previous, IN_UCHARS(len));
6266
6267	/ Ensure there is enough workspace for forward references before*
6268	copying them. /*
6269
6270	while (cd->hwm > cd->start_workspace + cd->workspace_size -
6271	WORK_SIZE_SAFETY_MARGIN -
6272	(this_hwm_offset - base_hwm_offset))
6273	{
6274	*errorcodeptr = expand_workspace(cd);
6275	if (errorcodeptr != `0`) goto* FAILED;
6276	}
6277
6278	for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6279	hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6280	hc += LINK_SIZE)
6281	{
6282	PUT(cd->hwm, `0`, GET(hc, `0`) + len + ((i != `0`)? `2`+LINK_SIZE : `1`));
6283	cd->hwm += LINK_SIZE;
6284	}
6285	base_hwm_offset = this_hwm_offset;
6286	code += len;
6287	}
6288
6289	/ Now chain through the pending brackets, and fill in their length*
6290	fields (which are holding the chain links pro tem). /*
6291
6292	while (bralink != NULL)
6293	{
6294	int oldlinkoffset;
6295	int offset = (int)(code - bralink + `1`);
6296	pcre_uchar *bra = code - offset;
6297	oldlinkoffset = GET(bra, `1`);
6298	bralink = (oldlinkoffset == `0`)? NULL : bralink - oldlinkoffset;
6299	*code++ = OP_KET;
6300	PUTINC(code, `0`, offset);
6301	PUT(bra, `1`, offset);
6302	}
6303	}
6304
6305	/ If the maximum is unlimited, set a repeater in the final copy. For*
6306	ONCE brackets, that's all we need to do. However, possessively repeated
6307	ONCE brackets can be converted into non-capturing brackets, as the
6308	behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6309	deal with possessive ONCEs specially.
6310
6311	Otherwise, when we are doing the actual compile phase, check to see
6312	whether this group is one that could match an empty string. If so,
6313	convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6314	that runtime checking can be done. [This check is also applied to ONCE
6315	groups at runtime, but in a different way.]
6316
6317	Then, if the quantifier was possessive and the bracket is not a
6318	conditional, we convert the BRA code to the POS form, and the KET code to
6319	KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6320	subpattern at both the start and at the end.) The use of special opcodes
6321	makes it possible to reduce greatly the stack usage in pcre_exec(). If
6322	the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6323
6324	Then, if the minimum number of matches is 1 or 0, cancel the possessive
6325	flag so that the default action below, of wrapping everything inside
6326	atomic brackets, does not happen. When the minimum is greater than 1,
6327	there will be earlier copies of the group, and so we still have to wrap
6328	the whole thing. /*
6329
6330	else
6331	{
6332	pcre_uchar *ketcode = code - `1` - LINK_SIZE;
6333	pcre_uchar *bracode = ketcode - GET(ketcode, `1`);
6334
6335	/ Convert possessive ONCE brackets to non-capturing /
6336
6337	if ((bracode == OP_ONCE \|\| bracode == OP_ONCE_NC) &&
6338	possessive_quantifier) *bracode = OP_BRA;
6339
6340	/ For non-possessive ONCE brackets, all we need to do is to*
6341	set the KET. /*
6342
6343	if (bracode == OP_ONCE \|\| bracode == OP_ONCE_NC)
6344	*ketcode = OP_KETRMAX + repeat_type;
6345
6346	/ Handle non-ONCE brackets and possessive ONCEs (which have been*
6347	converted to non-capturing above). /*
6348
6349	else
6350	{
6351	/ In the compile phase, check for empty string matching. /
6352
6353	if (lengthptr == NULL)
6354	{
6355	pcre_uchar *scode = bracode;
6356	do
6357	{
6358	if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6359	{
6360	*bracode += OP_SBRA - OP_BRA;
6361	break;
6362	}
6363	scode += GET(scode, `1`);
6364	}
6365	while (*scode == OP_ALT);
6366	}
6367
6368	/ A conditional group with only one branch has an implicit empty*
6369	alternative branch. /*
6370
6371	if (*bracode == OP_COND && bracode[GET(bracode,`1`)] != OP_ALT)
6372	*bracode = OP_SCOND;
6373
6374	/ Handle possessive quantifiers. /
6375
6376	if (possessive_quantifier)
6377	{
6378	/ For COND brackets, we wrap the whole thing in a possessively*
6379	repeated non-capturing bracket, because we have not invented POS
6380	versions of the COND opcodes. Because we are moving code along, we
6381	must ensure that any pending recursive references are updated. /*
6382
6383	if (bracode == OP_COND \|\| bracode == OP_SCOND)
6384	{
6385	int nlen = (int)(code - bracode);
6386	*code = OP_END;
6387	adjust_recurse(bracode, `1` + LINK_SIZE, utf, cd, item_hwm_offset);
6388	memmove(bracode + `1` + LINK_SIZE, bracode, IN_UCHARS(nlen));
6389	code += `1` + LINK_SIZE;
6390	nlen += `1` + LINK_SIZE;
6391	bracode = (bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
6392	*code++ = OP_KETRPOS;
6393	PUTINC(code, `0`, nlen);
6394	PUT(bracode, `1`, nlen);
6395	}
6396
6397	/ For non-COND brackets, we modify the BRA code and use KETRPOS. /
6398
6399	else
6400	{
6401	bracode += `1`; /* Switch to xxxPOS opcodes /
6402	*ketcode = OP_KETRPOS;
6403	}
6404
6405	/ If the minimum is zero, mark it as possessive, then unset the*
6406	possessive flag when the minimum is 0 or 1. /*
6407
6408	if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6409	if (repeat_min < `2`) possessive_quantifier = FALSE;
6410	}
6411
6412	/ Non-possessive quantifier /
6413
6414	else *ketcode = OP_KETRMAX + repeat_type;
6415	}
6416	}
6417	}
6418
6419	/* If previous is OP_FAIL, it was generated by an empty class [] in
6420	JavaScript mode. The other ways in which OP_FAIL can be generated, that is
6421	by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
6422	error above. We can just ignore the repeat in JS case. */
6423
6424	else if (previous == OP_FAIL) goto* END_REPEAT;
6425
6426	/ Else there's some kind of shambles /
6427
6428	else
6429	{
6430	*errorcodeptr = ERR11;
6431	goto FAILED;
6432	}
6433
6434	/ If the character following a repeat is '+', possessive_quantifier is*
6435	TRUE. For some opcodes, there are special alternative opcodes for this
6436	case. For anything else, we wrap the entire repeated item inside OP_ONCE
6437	brackets. Logically, the '+' notation is just syntactic sugar, taken from
6438	Sun's Java package, but the special opcodes can optimize it.
6439
6440	Some (but not all) possessively repeated subpatterns have already been
6441	completely handled in the code just above. For them, possessive_quantifier
6442	is always FALSE at this stage. Note that the repeated item starts at
6443	tempcode, not at previous, which might be the first part of a string whose
6444	(former) last char we repeated. /*
6445
6446	if (possessive_quantifier)
6447	{
6448	int len;
6449
6450	/ Possessifying an EXACT quantifier has no effect, so we can ignore it.*
6451	However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6452	{5,}, or {5,10}). We skip over an EXACT item; if the length of what
6453	remains is greater than zero, there's a further opcode that can be
6454	handled. If not, do nothing, leaving the EXACT alone. /*
6455
6456	switch(*tempcode)
6457	{
6458	case OP_TYPEEXACT:
6459	tempcode += PRIV(OP_lengths)[*tempcode] +
6460	((tempcode[`1` + IMM2_SIZE] == OP_PROP
6461	\|\| tempcode[`1` + IMM2_SIZE] == OP_NOTPROP)? `2` : `0`);
6462	break;
6463
6464	/ CHAR opcodes are used for exacts whose count is 1. /
6465
6466	case OP_CHAR:
6467	case OP_CHARI:
6468	case OP_NOT:
6469	case OP_NOTI:
6470	case OP_EXACT:
6471	case OP_EXACTI:
6472	case OP_NOTEXACT:
6473	case OP_NOTEXACTI:
6474	tempcode += PRIV(OP_lengths)[*tempcode];
6475	#ifdef SUPPORT_UTF
6476	if (utf && HAS_EXTRALEN(tempcode[-`1`]))
6477	tempcode += GET_EXTRALEN(tempcode[-`1`]);
6478	#endif
6479	break;
6480
6481	/ For the class opcodes, the repeat operator appears at the end;*
6482	adjust tempcode to point to it. /*
6483
6484	case OP_CLASS:
6485	case OP_NCLASS:
6486	tempcode += `1` + `32`/sizeof(pcre_uchar);
6487	break;
6488
6489	#if defined SUPPORT_UTF \|\| !defined COMPILE_PCRE8
6490	case OP_XCLASS:
6491	tempcode += GET(tempcode, `1`);
6492	break;
6493	#endif
6494	}
6495
6496	/ If tempcode is equal to code (which points to the end of the repeated*
6497	item), it means we have skipped an EXACT item but there is no following
6498	QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6499	all other cases, tempcode will be pointing to the repeat opcode, and will
6500	be less than code, so the value of len will be greater than 0. /*
6501
6502	len = (int)(code - tempcode);
6503	if (len > `0`)
6504	{
6505	unsigned int repcode = *tempcode;
6506
6507	/ There is a table for possessifying opcodes, all of which are less*
6508	than OP_CALLOUT. A zero entry means there is no possessified version.
6509	*/
6510
6511	if (repcode < OP_CALLOUT && opcode_possessify[repcode] > `0`)
6512	*tempcode = opcode_possessify[repcode];
6513
6514	/ For opcode without a special possessified version, wrap the item in*
6515	ONCE brackets. Because we are moving code along, we must ensure that any
6516	pending recursive references are updated. /*
6517
6518	else
6519	{
6520	*code = OP_END;
6521	adjust_recurse(tempcode, `1` + LINK_SIZE, utf, cd, item_hwm_offset);
6522	memmove(tempcode + `1` + LINK_SIZE, tempcode, IN_UCHARS(len));
6523	code += `1` + LINK_SIZE;
6524	len += `1` + LINK_SIZE;
6525	tempcode[`0`] = OP_ONCE;
6526	*code++ = OP_KET;
6527	PUTINC(code, `0`, len);
6528	PUT(tempcode, `1`, len);
6529	}
6530	}
6531
6532	#ifdef NEVER
6533	if (len > `0`) switch (*tempcode)
6534	{
6535	case OP_STAR: tempcode = OP_POSSTAR; break*;
6536	case OP_PLUS: tempcode = OP_POSPLUS; break*;
6537	case OP_QUERY: tempcode = OP_POSQUERY; break*;
6538	case OP_UPTO: tempcode = OP_POSUPTO; break*;
6539
6540	case OP_STARI: tempcode = OP_POSSTARI; break*;
6541	case OP_PLUSI: tempcode = OP_POSPLUSI; break*;
6542	case OP_QUERYI: tempcode = OP_POSQUERYI; break*;
6543	case OP_UPTOI: tempcode = OP_POSUPTOI; break*;
6544
6545	case OP_NOTSTAR: tempcode = OP_NOTPOSSTAR; break*;
6546	case OP_NOTPLUS: tempcode = OP_NOTPOSPLUS; break*;
6547	case OP_NOTQUERY: tempcode = OP_NOTPOSQUERY; break*;
6548	case OP_NOTUPTO: tempcode = OP_NOTPOSUPTO; break*;
6549
6550	case OP_NOTSTARI: tempcode = OP_NOTPOSSTARI; break*;
6551	case OP_NOTPLUSI: tempcode = OP_NOTPOSPLUSI; break*;
6552	case OP_NOTQUERYI: tempcode = OP_NOTPOSQUERYI; break*;
6553	case OP_NOTUPTOI: tempcode = OP_NOTPOSUPTOI; break*;
6554
6555	case OP_TYPESTAR: tempcode = OP_TYPEPOSSTAR; break*;
6556	case OP_TYPEPLUS: tempcode = OP_TYPEPOSPLUS; break*;
6557	case OP_TYPEQUERY: tempcode = OP_TYPEPOSQUERY; break*;
6558	case OP_TYPEUPTO: tempcode = OP_TYPEPOSUPTO; break*;
6559
6560	case OP_CRSTAR: tempcode = OP_CRPOSSTAR; break*;
6561	case OP_CRPLUS: tempcode = OP_CRPOSPLUS; break*;
6562	case OP_CRQUERY: tempcode = OP_CRPOSQUERY; break*;
6563	case OP_CRRANGE: tempcode = OP_CRPOSRANGE; break*;
6564
6565	/ Because we are moving code along, we must ensure that any*
6566	pending recursive references are updated. /*
6567
6568	default:
6569	*code = OP_END;
6570	adjust_recurse(tempcode, `1` + LINK_SIZE, utf, cd, item_hwm_offset);
6571	memmove(tempcode + `1` + LINK_SIZE, tempcode, IN_UCHARS(len));
6572	code += `1` + LINK_SIZE;
6573	len += `1` + LINK_SIZE;
6574	tempcode[`0`] = OP_ONCE;
6575	*code++ = OP_KET;
6576	PUTINC(code, `0`, len);
6577	PUT(tempcode, `1`, len);
6578	break;
6579	}
6580	#endif
6581	}
6582
6583	/* In all cases we no longer have a previous item. We also set the
6584	"follows varying string" flag for subsequently encountered reqchars if
6585	it isn't already set and we have just passed a varying length item. */
6586
6587	END_REPEAT:
6588	previous = NULL;
6589	cd->req_varyopt \|= reqvary;
6590	break;
6591
6592
6593	/ ===================================================================/
6594	/ Start of nested parenthesized sub-expression, or comment or lookahead or*
6595	lookbehind or option setting or condition or all the other extended
6596	parenthesis forms. /*
6597
6598	case CHAR_LEFT_PARENTHESIS:
6599	ptr++;
6600
6601	/* Now deal with various "verbs" that can be introduced by '*'. */
6602
6603	if (ptr[`0`] == CHAR_ASTERISK && (ptr[`1`] == `':'`
6604	\|\| (MAX_255(ptr[`1`]) && ((cd->ctypes[ptr[`1`]] & ctype_letter) != `0`))))
6605	{
6606	int i, namelen;
6607	int arglen = `0`;
6608	const char *vn = verbnames;
6609	const pcre_uchar *name = ptr + `1`;
6610	const pcre_uchar *arg = NULL;
6611	previous = NULL;
6612	ptr++;
6613	while (MAX_255(ptr) && (cd->ctypes[ptr] & ctype_letter) != `0`) ptr++;
6614	namelen = (int)(ptr - name);
6615
6616	/ It appears that Perl allows any characters whatsoever, other than*
6617	a closing parenthesis, to appear in arguments, so we no longer insist on
6618	letters, digits, and underscores. /*
6619
6620	if (*ptr == CHAR_COLON)
6621	{
6622	arg = ++ptr;
6623	while (ptr != CHAR_NULL && ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6624	arglen = (int)(ptr - arg);
6625	if ((unsigned int)arglen > MAX_MARK)
6626	{
6627	*errorcodeptr = ERR75;
6628	goto FAILED;
6629	}
6630	}
6631
6632	if (*ptr != CHAR_RIGHT_PARENTHESIS)
6633	{
6634	*errorcodeptr = ERR60;
6635	goto FAILED;
6636	}
6637
6638	/ Scan the table of verb names /
6639
6640	for (i = `0`; i < verbcount; i++)
6641	{
6642	if (namelen == verbs[i].len &&
6643	STRNCMP_UC_C8(name, vn, namelen) == `0`)
6644	{
6645	int setverb;
6646
6647	/ Check for open captures before ACCEPT and convert it to*
6648	ASSERT_ACCEPT if in an assertion. /*
6649
6650	if (verbs[i].op == OP_ACCEPT)
6651	{
6652	open_capitem *oc;
6653	if (arglen != `0`)
6654	{
6655	*errorcodeptr = ERR59;
6656	goto FAILED;
6657	}
6658	cd->had_accept = TRUE;
6659	for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6660	{
6661	if (lengthptr != NULL)
6662	{
6663	#ifdef COMPILE_PCRE8
6664	*lengthptr += `1` + IMM2_SIZE;
6665	#elif defined COMPILE_PCRE16
6666	*lengthptr += `2` + IMM2_SIZE;
6667	#elif defined COMPILE_PCRE32
6668	*lengthptr += `4` + IMM2_SIZE;
6669	#endif
6670	}
6671	else
6672	{
6673	*code++ = OP_CLOSE;
6674	PUT2INC(code, `0`, oc->number);
6675	}
6676	}
6677	setverb = *code++ =
6678	(cd->assert_depth > `0`)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6679
6680	/ Do not set firstchar after ACCEPT /*
6681	if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6682	}
6683
6684	/ Handle other cases with/without an argument /
6685
6686	else if (arglen == `0`)
6687	{
6688	if (verbs[i].op < `0`) / Argument is mandatory /
6689	{
6690	*errorcodeptr = ERR66;
6691	goto FAILED;
6692	}
6693	setverb = *code++ = verbs[i].op;
6694	}
6695
6696	else
6697	{
6698	if (verbs[i].op_arg < `0`) / Argument is forbidden /
6699	{
6700	*errorcodeptr = ERR59;
6701	goto FAILED;
6702	}
6703	setverb = *code++ = verbs[i].op_arg;
6704	if (lengthptr != NULL) / In pass 1 just add in the length /
6705	{ / to avoid potential workspace /
6706	lengthptr += arglen; /* overflow. /
6707	*code++ = `0`;
6708	}
6709	else
6710	{
6711	*code++ = arglen;
6712	memcpy(code, arg, IN_UCHARS(arglen));
6713	code += arglen;
6714	}
6715	*code++ = `0`;
6716	}
6717
6718	switch (setverb)
6719	{
6720	case OP_THEN:
6721	case OP_THEN_ARG:
6722	cd->external_flags \|= PCRE_HASTHEN;
6723	break;
6724
6725	case OP_PRUNE:
6726	case OP_PRUNE_ARG:
6727	case OP_SKIP:
6728	case OP_SKIP_ARG:
6729	cd->had_pruneorskip = TRUE;
6730	break;
6731	}
6732
6733	break; / Found verb, exit loop /
6734	}
6735
6736	vn += verbs[i].len + `1`;
6737	}
6738
6739	if (i < verbcount) continue; / Successfully handled a verb /
6740	errorcodeptr = ERR60; /* Verb not recognized /
6741	goto FAILED;
6742	}
6743
6744	/ Initialize for "real" parentheses /
6745
6746	newoptions = options;
6747	skipbytes = `0`;
6748	bravalue = OP_CBRA;
6749	item_hwm_offset = cd->hwm - cd->start_workspace;
6750	reset_bracount = FALSE;
6751
6752	/ Deal with the extended parentheses; all are introduced by '?', and the*
6753	appearance of any of them means that this is not a capturing group. /*
6754
6755	if (*ptr == CHAR_QUESTION_MARK)
6756	{
6757	int i, set, unset, namelen;
6758	int *optset;
6759	const pcre_uchar *name;
6760	pcre_uchar *slot;
6761
6762	switch (*(++ptr))
6763	{
6764	/ ------------------------------------------------------------ /
6765	case CHAR_VERTICAL_LINE: / Reset capture count for each branch /
6766	reset_bracount = TRUE;
6767	cd->dupgroups = TRUE; / Record (?\| encountered /
6768	/ Fall through /
6769
6770	/ ------------------------------------------------------------ /
6771	case CHAR_COLON: / Non-capturing bracket /
6772	bravalue = OP_BRA;
6773	ptr++;
6774	break;
6775
6776
6777	/ ------------------------------------------------------------ /
6778	case CHAR_LEFT_PARENTHESIS:
6779	bravalue = OP_COND; / Conditional group /
6780	tempptr = ptr;
6781
6782	/ A condition can be an assertion, a number (referring to a numbered*
6783	group's having been set), a name (referring to a named group), or 'R',
6784	referring to recursion. R<digits> and R&name are also permitted for
6785	recursion tests.
6786
6787	There are ways of testing a named group: (?(name)) is used by Python;
6788	Perl 5.10 onwards uses (?(<name>) or (?('name')).
6789
6790	There is one unfortunate ambiguity, caused by history. 'R' can be the
6791	recursive thing or the name 'R' (and similarly for 'R' followed by
6792	digits). We look for a name first; if not found, we try the other case.
6793
6794	For compatibility with auto-callouts, we allow a callout to be
6795	specified before a condition that is an assertion. First, check for the
6796	syntax of a callout; if found, adjust the temporary pointer that is
6797	used to check for an assertion condition. That's all that is needed! /*
6798
6799	if (ptr[`1`] == CHAR_QUESTION_MARK && ptr[`2`] == CHAR_C)
6800	{
6801	for (i = `3`;; i++) if (!IS_DIGIT(ptr[i])) break;
6802	if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6803	tempptr += i + `1`;
6804
6805	/ tempptr should now be pointing to the opening parenthesis of the*
6806	assertion condition. /*
6807
6808	if (*tempptr != CHAR_LEFT_PARENTHESIS)
6809	{
6810	*errorcodeptr = ERR28;
6811	goto FAILED;
6812	}
6813	}
6814
6815	/ For conditions that are assertions, check the syntax, and then exit*
6816	the switch. This will take control down to where bracketed groups,
6817	including assertions, are processed. /*
6818
6819	if (tempptr[`1`] == CHAR_QUESTION_MARK &&
6820	(tempptr[`2`] == CHAR_EQUALS_SIGN \|\|
6821	tempptr[`2`] == CHAR_EXCLAMATION_MARK \|\|
6822	(tempptr[`2`] == CHAR_LESS_THAN_SIGN &&
6823	(tempptr[`3`] == CHAR_EQUALS_SIGN \|\|
6824	tempptr[`3`] == CHAR_EXCLAMATION_MARK))))
6825	{
6826	cd->iscondassert = TRUE;
6827	break;
6828	}
6829
6830	/ Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all*
6831	need to skip at least 1+IMM2_SIZE bytes at the start of the group. /*
6832
6833	code[`1`+LINK_SIZE] = OP_CREF;
6834	skipbytes = `1`+IMM2_SIZE;
6835	refsign = -`1`; / => not a number /
6836	namelen = -`1`; / => not a name; must set to avoid warning /
6837	name = NULL; / Always set to avoid warning /
6838	recno = `0`; / Always set to avoid warning /
6839
6840	/ Check for a test for recursion in a named group. /
6841
6842	ptr++;
6843	if (*ptr == CHAR_R && ptr[`1`] == CHAR_AMPERSAND)
6844	{
6845	terminator = -`1`;
6846	ptr += `2`;
6847	code[`1`+LINK_SIZE] = OP_RREF; / Change the type of test /
6848	}
6849
6850	/ Check for a test for a named group's having been set, using the Perl*
6851	syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6852	syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). /*
6853
6854	else if (*ptr == CHAR_LESS_THAN_SIGN)
6855	{
6856	terminator = CHAR_GREATER_THAN_SIGN;
6857	ptr++;
6858	}
6859	else if (*ptr == CHAR_APOSTROPHE)
6860	{
6861	terminator = CHAR_APOSTROPHE;
6862	ptr++;
6863	}
6864	else
6865	{
6866	terminator = CHAR_NULL;
6867	if (ptr == CHAR_MINUS \|\| ptr == CHAR_PLUS) refsign = *ptr++;
6868	else if (IS_DIGIT(*ptr)) refsign = `0`;
6869	}
6870
6871	/ Handle a number /
6872
6873	if (refsign >= `0`)
6874	{
6875	while (IS_DIGIT(*ptr))
6876	{
6877	if (recno > INT_MAX / `10` - `1`) / Integer overflow /
6878	{
6879	while (IS_DIGIT(*ptr)) ptr++;
6880	*errorcodeptr = ERR61;
6881	goto FAILED;
6882	}
6883	recno = recno * `10` + (int)(*ptr - CHAR_0);
6884	ptr++;
6885	}
6886	}
6887
6888	/ Otherwise we expect to read a name; anything else is an error. When*
6889	a name is one of a number of duplicates, a different opcode is used and
6890	it needs more memory. Unfortunately we cannot tell whether a name is a
6891	duplicate in the first pass, so we have to allow for more memory. /*
6892
6893	else
6894	{
6895	if (IS_DIGIT(*ptr))
6896	{
6897	*errorcodeptr = ERR84;
6898	goto FAILED;
6899	}
6900	if (!MAX_255(ptr) \|\| (cd->ctypes[ptr] & ctype_word) == `0`)
6901	{
6902	errorcodeptr = ERR28; /* Assertion expected /
6903	goto FAILED;
6904	}
6905	name = ptr++;
6906	while (MAX_255(ptr) && (cd->ctypes[ptr] & ctype_word) != `0`)
6907	{
6908	ptr++;
6909	}
6910	namelen = (int)(ptr - name);
6911	if (lengthptr != NULL) skipbytes += IMM2_SIZE;
6912	}
6913
6914	/ Check the terminator /
6915
6916	if ((terminator > `0` && *ptr++ != (pcre_uchar)terminator) \|\|
6917	*ptr++ != CHAR_RIGHT_PARENTHESIS)
6918	{
6919	ptr--; / Error offset /
6920	errorcodeptr = ERR26; /* Malformed number or name /
6921	goto FAILED;
6922	}
6923
6924	/ Do no further checking in the pre-compile phase. /
6925
6926	if (lengthptr != NULL) break;
6927
6928	/ In the real compile we do the work of looking for the actual*
6929	reference. If refsign is not negative, it means we have a number in
6930	recno. /*
6931
6932	if (refsign >= `0`)
6933	{
6934	if (recno <= `0`)
6935	{
6936	*errorcodeptr = ERR35;
6937	goto FAILED;
6938	}
6939	if (refsign != `0`) recno = (refsign == CHAR_MINUS)?
6940	cd->bracount - recno + `1` : recno + cd->bracount;
6941	if (recno <= `0` \|\| recno > cd->final_bracount)
6942	{
6943	*errorcodeptr = ERR15;
6944	goto FAILED;
6945	}
6946	PUT2(code, `2`+LINK_SIZE, recno);
6947	if (recno > cd->top_backref) cd->top_backref = recno;
6948	break;
6949	}
6950
6951	/ Otherwise look for the name. /
6952
6953	slot = cd->name_table;
6954	for (i = `0`; i < cd->names_found; i++)
6955	{
6956	if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == `0` &&
6957	slot[IMM2_SIZE+namelen] == `0`) break;
6958	slot += cd->name_entry_size;
6959	}
6960
6961	/ Found the named subpattern. If the name is duplicated, add one to*
6962	the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6963	appropriate data values. Otherwise, just insert the unique subpattern
6964	number. /*
6965
6966	if (i < cd->names_found)
6967	{
6968	int offset = i++;
6969	int count = `1`;
6970	recno = GET2(slot, `0`); / Number from first found /
6971	if (recno > cd->top_backref) cd->top_backref = recno;
6972	for (; i < cd->names_found; i++)
6973	{
6974	slot += cd->name_entry_size;
6975	if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != `0` \|\|
6976	(slot+IMM2_SIZE)[namelen] != `0`) break;
6977	count++;
6978	}
6979
6980	if (count > `1`)
6981	{
6982	PUT2(code, `2`+LINK_SIZE, offset);
6983	PUT2(code, `2`+LINK_SIZE+IMM2_SIZE, count);
6984	skipbytes += IMM2_SIZE;
6985	code[`1`+LINK_SIZE]++;
6986	}
6987	else / Not a duplicated name /
6988	{
6989	PUT2(code, `2`+LINK_SIZE, recno);
6990	}
6991	}
6992
6993	/ If terminator == CHAR_NULL it means that the name followed directly*
6994	after the opening parenthesis [e.g. (?(abc)...] and in this case there
6995	are some further alternatives to try. For the cases where terminator !=
6996	CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
6997	we have now checked all the possibilities, so give an error. /*
6998
6999	else if (terminator != CHAR_NULL)
7000	{
7001	*errorcodeptr = ERR15;
7002	goto FAILED;
7003	}
7004
7005	/ Check for (?(R) for recursion. Allow digits after R to specify a*
7006	specific group number. /*
7007
7008	else if (*name == CHAR_R)
7009	{
7010	recno = `0`;
7011	for (i = `1`; i < namelen; i++)
7012	{
7013	if (!IS_DIGIT(name[i]))
7014	{
7015	*errorcodeptr = ERR15;
7016	goto FAILED;
7017	}
7018	if (recno > INT_MAX / `10` - `1`) / Integer overflow /
7019	{
7020	*errorcodeptr = ERR61;
7021	goto FAILED;
7022	}
7023	recno = recno * `10` + name[i] - CHAR_0;
7024	}
7025	if (recno == `0`) recno = RREF_ANY;
7026	code[`1`+LINK_SIZE] = OP_RREF; / Change test type /
7027	PUT2(code, `2`+LINK_SIZE, recno);
7028	}
7029
7030	/ Similarly, check for the (?(DEFINE) "condition", which is always*
7031	false. /*
7032
7033	else if (namelen == `6` && STRNCMP_UC_C8(name, STRING_DEFINE, `6`) == `0`)
7034	{
7035	code[`1`+LINK_SIZE] = OP_DEF;
7036	skipbytes = `1`;
7037	}
7038
7039	/ Reference to an unidentified subpattern. /
7040
7041	else
7042	{
7043	*errorcodeptr = ERR15;
7044	goto FAILED;
7045	}
7046	break;
7047
7048
7049	/ ------------------------------------------------------------ /
7050	case CHAR_EQUALS_SIGN: / Positive lookahead /
7051	bravalue = OP_ASSERT;
7052	cd->assert_depth += `1`;
7053	ptr++;
7054	break;
7055
7056	/* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
7057	thing to do, but Perl allows all assertions to be quantified, and when
7058	they contain capturing parentheses there may be a potential use for
7059	this feature. Not that that applies to a quantified (?!) but we allow
7060	it for uniformity. */
7061
7062	/ ------------------------------------------------------------ /
7063	case CHAR_EXCLAMATION_MARK: / Negative lookahead /
7064	ptr++;
7065	if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[`1`] != CHAR_ASTERISK &&
7066	ptr[`1`] != CHAR_PLUS && ptr[`1`] != CHAR_QUESTION_MARK &&
7067	(ptr[`1`] != CHAR_LEFT_CURLY_BRACKET \|\| !is_counted_repeat(ptr+`2`)))
7068	{
7069	*code++ = OP_FAIL;
7070	previous = NULL;
7071	continue;
7072	}
7073	bravalue = OP_ASSERT_NOT;
7074	cd->assert_depth += `1`;
7075	break;
7076
7077
7078	/ ------------------------------------------------------------ /
7079	case CHAR_LESS_THAN_SIGN: / Lookbehind or named define /
7080	switch (ptr[`1`])
7081	{
7082	case CHAR_EQUALS_SIGN: / Positive lookbehind /
7083	bravalue = OP_ASSERTBACK;
7084	cd->assert_depth += `1`;
7085	ptr += `2`;
7086	break;
7087
7088	case CHAR_EXCLAMATION_MARK: / Negative lookbehind /
7089	bravalue = OP_ASSERTBACK_NOT;
7090	cd->assert_depth += `1`;
7091	ptr += `2`;
7092	break;
7093
7094	default: / Could be name define, else bad /
7095	if (MAX_255(ptr[`1`]) && (cd->ctypes[ptr[`1`]] & ctype_word) != `0`)
7096	goto DEFINE_NAME;
7097	ptr++; / Correct offset for error /
7098	*errorcodeptr = ERR24;
7099	goto FAILED;
7100	}
7101	break;
7102
7103
7104	/ ------------------------------------------------------------ /
7105	case CHAR_GREATER_THAN_SIGN: / One-time brackets /
7106	bravalue = OP_ONCE;
7107	ptr++;
7108	break;
7109
7110
7111	/ ------------------------------------------------------------ /
7112	case CHAR_C: / Callout - may be followed by digits; /
7113	previous_callout = code; / Save for later completion /
7114	after_manual_callout = `1`; / Skip one item before completing /
7115	*code++ = OP_CALLOUT;
7116	{
7117	int n = `0`;
7118	ptr++;
7119	while(IS_DIGIT(*ptr))
7120	n = n * `10` + *ptr++ - CHAR_0;
7121	if (*ptr != CHAR_RIGHT_PARENTHESIS)
7122	{
7123	*errorcodeptr = ERR39;
7124	goto FAILED;
7125	}
7126	if (n > `255`)
7127	{
7128	*errorcodeptr = ERR38;
7129	goto FAILED;
7130	}
7131	*code++ = n;
7132	PUT(code, `0`, (int)(ptr - cd->start_pattern + `1`)); / Pattern offset /
7133	PUT(code, LINK_SIZE, `0`); / Default length /
7134	code += `2` * LINK_SIZE;
7135	}
7136	previous = NULL;
7137	continue;
7138
7139
7140	/ ------------------------------------------------------------ /
7141	case CHAR_P: / Python-style named subpattern handling /
7142	if (*(++ptr) == CHAR_EQUALS_SIGN \|\|
7143	ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion /
7144	{
7145	is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
7146	terminator = CHAR_RIGHT_PARENTHESIS;
7147	goto NAMED_REF_OR_RECURSE;
7148	}
7149	else if (ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn /
7150	{
7151	*errorcodeptr = ERR41;
7152	goto FAILED;
7153	}
7154	/ Fall through to handle (?P< as (?< is handled /
7155
7156
7157	/ ------------------------------------------------------------ /
7158	DEFINE_NAME: / Come here from (?< handling /
7159	case CHAR_APOSTROPHE:
7160	terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
7161	CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7162	name = ++ptr;
7163	if (IS_DIGIT(*ptr))
7164	{
7165	errorcodeptr = ERR84; /* Group name must start with non-digit /
7166	goto FAILED;
7167	}
7168	while (MAX_255(ptr) && (cd->ctypes[ptr] & ctype_word) != `0`) ptr++;
7169	namelen = (int)(ptr - name);
7170
7171	/ In the pre-compile phase, do a syntax check, remember the longest*
7172	name, and then remember the group in a vector, expanding it if
7173	necessary. Duplicates for the same number are skipped; other duplicates
7174	are checked for validity. In the actual compile, there is nothing to
7175	do. /*
7176
7177	if (lengthptr != NULL)
7178	{
7179	named_group *ng;
7180	pcre_uint32 number = cd->bracount + `1`;
7181
7182	if (*ptr != (pcre_uchar)terminator)
7183	{
7184	*errorcodeptr = ERR42;
7185	goto FAILED;
7186	}
7187
7188	if (cd->names_found >= MAX_NAME_COUNT)
7189	{
7190	*errorcodeptr = ERR49;
7191	goto FAILED;
7192	}
7193
7194	if (namelen + IMM2_SIZE + `1` > cd->name_entry_size)
7195	{
7196	cd->name_entry_size = namelen + IMM2_SIZE + `1`;
7197	if (namelen > MAX_NAME_SIZE)
7198	{
7199	*errorcodeptr = ERR48;
7200	goto FAILED;
7201	}
7202	}
7203
7204	/ Scan the list to check for duplicates. For duplicate names, if the*
7205	number is the same, break the loop, which causes the name to be
7206	discarded; otherwise, if DUPNAMES is not set, give an error.
7207	If it is set, allow the name with a different number, but continue
7208	scanning in case this is a duplicate with the same number. For
7209	non-duplicate names, give an error if the number is duplicated. /*
7210
7211	ng = cd->named_groups;
7212	for (i = `0`; i < cd->names_found; i++, ng++)
7213	{
7214	if (namelen == ng->length &&
7215	STRNCMP_UC_UC(name, ng->name, namelen) == `0`)
7216	{
7217	if (ng->number == number) break;
7218	if ((options & PCRE_DUPNAMES) == `0`)
7219	{
7220	*errorcodeptr = ERR43;
7221	goto FAILED;
7222	}
7223	cd->dupnames = TRUE; / Duplicate names exist /
7224	}
7225	else if (ng->number == number)
7226	{
7227	*errorcodeptr = ERR65;
7228	goto FAILED;
7229	}
7230	}
7231
7232	if (i >= cd->names_found) / Not a duplicate with same number /
7233	{
7234	/ Increase the list size if necessary /
7235
7236	if (cd->names_found >= cd->named_group_list_size)
7237	{
7238	int newsize = cd->named_group_list_size * `2`;
7239	named_group *newspace = (PUBL(malloc))
7240	(newsize * sizeof(named_group));
7241
7242	if (newspace == NULL)
7243	{
7244	*errorcodeptr = ERR21;
7245	goto FAILED;
7246	}
7247
7248	memcpy(newspace, cd->named_groups,
7249	cd->named_group_list_size * sizeof(named_group));
7250	if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
7251	(PUBL(free))((void *)cd->named_groups);
7252	cd->named_groups = newspace;
7253	cd->named_group_list_size = newsize;
7254	}
7255
7256	cd->named_groups[cd->names_found].name = name;
7257	cd->named_groups[cd->names_found].length = namelen;
7258	cd->named_groups[cd->names_found].number = number;
7259	cd->names_found++;
7260	}
7261	}
7262
7263	ptr++; / Move past > or ' in both passes. /
7264	goto NUMBERED_GROUP;
7265
7266
7267	/ ------------------------------------------------------------ /
7268	case CHAR_AMPERSAND: / Perl recursion/subroutine syntax /
7269	terminator = CHAR_RIGHT_PARENTHESIS;
7270	is_recurse = TRUE;
7271	/ Fall through /
7272
7273	/ We come here from the Python syntax above that handles both*
7274	references (?P=name) and recursion (?P>name), as well as falling
7275	through from the Perl recursion syntax (?&name). We also come here from
7276	the Perl \k<name> or \k'name' back reference syntax and the \k{name}
7277	.NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. /*
7278
7279	NAMED_REF_OR_RECURSE:
7280	name = ++ptr;
7281	if (IS_DIGIT(*ptr))
7282	{
7283	errorcodeptr = ERR84; /* Group name must start with non-digit /
7284	goto FAILED;
7285	}
7286	while (MAX_255(ptr) && (cd->ctypes[ptr] & ctype_word) != `0`) ptr++;
7287	namelen = (int)(ptr - name);
7288
7289	/ In the pre-compile phase, do a syntax check. We used to just set*
7290	a dummy reference number, because it was not used in the first pass.
7291	However, with the change of recursive back references to be atomic,
7292	we have to look for the number so that this state can be identified, as
7293	otherwise the incorrect length is computed. If it's not a backwards
7294	reference, the dummy number will do. /*
7295
7296	if (lengthptr != NULL)
7297	{
7298	named_group *ng;
7299	recno = `0`;
7300
7301	if (namelen == `0`)
7302	{
7303	*errorcodeptr = ERR62;
7304	goto FAILED;
7305	}
7306	if (*ptr != (pcre_uchar)terminator)
7307	{
7308	*errorcodeptr = ERR42;
7309	goto FAILED;
7310	}
7311	if (namelen > MAX_NAME_SIZE)
7312	{
7313	*errorcodeptr = ERR48;
7314	goto FAILED;
7315	}
7316
7317	/ Count named back references. /
7318
7319	if (!is_recurse) cd->namedrefcount++;
7320
7321	/ We have to allow for a named reference to a duplicated name (this*
7322	cannot be determined until the second pass). This needs an extra
7323	16-bit data item. /*
7324
7325	*lengthptr += IMM2_SIZE;
7326
7327	/* If this is a forward reference and we are within a (?|...) group,
7328	the reference may end up as the number of a group which we are
7329	currently inside, that is, it could be a recursive reference. In the
7330	real compile this will be picked up and the reference wrapped with
7331	OP_ONCE to make it atomic, so we must allow space in case this occurs. */
7332
7333	/ In fact, this can happen for a non-forward reference because*
7334	another group with the same number might be created later. This
7335	issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance
7336	only mode, we finesse the bug by allowing more memory always. /*
7337
7338	lengthptr += `4` + `4`LINK_SIZE;
7339
7340	/* It is even worse than that. The current reference may be to an
7341	existing named group with a different number (so apparently not
7342	recursive) but which later on is also attached to a group with the
7343	current number. This can only happen if (?| has been previously
7344	encountered. In that case, we allow yet more memory, just in case.
7345	(Again, this is fixed "properly" in PCRE2.) */
7346
7347	if (cd->dupgroups) lengthptr += `4` + `4`LINK_SIZE;
7348
7349	/ Otherwise, check for recursion here. The name table does not exist*
7350	in the first pass; instead we must scan the list of names encountered
7351	so far in order to get the number. If the name is not found, leave
7352	the value of recno as 0 for a forward reference. /*
7353
7354	/ This patch (removing "else") fixes a problem when a reference is*
7355	to multiple identically named nested groups from within the nest.
7356	Once again, it is not the "proper" fix, and it results in an
7357	over-allocation of memory. /*
7358
7359	/ else /
7360	{
7361	ng = cd->named_groups;
7362	for (i = `0`; i < cd->names_found; i++, ng++)
7363	{
7364	if (namelen == ng->length &&
7365	STRNCMP_UC_UC(name, ng->name, namelen) == `0`)
7366	{
7367	open_capitem *oc;
7368	recno = ng->number;
7369	if (is_recurse) break;
7370	for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7371	{
7372	if (oc->number == recno)
7373	{
7374	oc->flag = TRUE;
7375	break;
7376	}
7377	}
7378	}
7379	}
7380	}
7381	}
7382
7383	/ In the real compile, search the name table. We check the name*
7384	first, and then check that we have reached the end of the name in the
7385	table. That way, if the name is longer than any in the table, the
7386	comparison will fail without reading beyond the table entry. /*
7387
7388	else
7389	{
7390	slot = cd->name_table;
7391	for (i = `0`; i < cd->names_found; i++)
7392	{
7393	if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == `0` &&
7394	slot[IMM2_SIZE+namelen] == `0`)
7395	break;
7396	slot += cd->name_entry_size;
7397	}
7398
7399	if (i < cd->names_found)
7400	{
7401	recno = GET2(slot, `0`);
7402	}
7403	else
7404	{
7405	*errorcodeptr = ERR15;
7406	goto FAILED;
7407	}
7408	}
7409
7410	/* In both phases, for recursions, we can now go to the code that
7411	handles numerical recursion. */
7412
7413	if (is_recurse) goto HANDLE_RECURSION;
7414
7415	/ In the second pass we must see if the name is duplicated. If so, we*
7416	generate a different opcode. /*
7417
7418	if (lengthptr == NULL && cd->dupnames)
7419	{
7420	int count = `1`;
7421	unsigned int index = i;
7422	pcre_uchar *cslot = slot + cd->name_entry_size;
7423
7424	for (i++; i < cd->names_found; i++)
7425	{
7426	if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != `0`) break;
7427	count++;
7428	cslot += cd->name_entry_size;
7429	}
7430
7431	if (count > `1`)
7432	{
7433	if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7434	previous = code;
7435	item_hwm_offset = cd->hwm - cd->start_workspace;
7436	*code++ = ((options & PCRE_CASELESS) != `0`)? OP_DNREFI : OP_DNREF;
7437	PUT2INC(code, `0`, index);
7438	PUT2INC(code, `0`, count);
7439
7440	/ Process each potentially referenced group. /
7441
7442	for (; slot < cslot; slot += cd->name_entry_size)
7443	{
7444	open_capitem *oc;
7445	recno = GET2(slot, `0`);
7446	cd->backref_map \|= (recno < `32`)? (`1` << recno) : `1`;
7447	if (recno > cd->top_backref) cd->top_backref = recno;
7448
7449	/* Check to see if this back reference is recursive, that is, it
7450	is inside the group that it references. A flag is set so that the
7451	group can be made atomic. */
7452
7453	for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7454	{
7455	if (oc->number == recno)
7456	{
7457	oc->flag = TRUE;
7458	break;
7459	}
7460	}
7461	}
7462
7463	continue; / End of back ref handling /
7464	}
7465	}
7466
7467	/ First pass, or a non-duplicated name. /
7468
7469	goto HANDLE_REFERENCE;
7470
7471
7472	/ ------------------------------------------------------------ /
7473	case CHAR_R: / Recursion, same as (?0) /
7474	recno = `0`;
7475	if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
7476	{
7477	*errorcodeptr = ERR29;
7478	goto FAILED;
7479	}
7480	goto HANDLE_RECURSION;
7481
7482
7483	/ ------------------------------------------------------------ /
7484	case CHAR_MINUS: case CHAR_PLUS: / Recursion or subroutine /
7485	case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
7486	case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
7487	{
7488	const pcre_uchar *called;
7489	terminator = CHAR_RIGHT_PARENTHESIS;
7490
7491	/* Come here from the \g<...> and \g'...' code (Oniguruma
7492	compatibility). However, the syntax has been checked to ensure that
7493	the ... are a (signed) number, so that neither ERR63 nor ERR29 will
7494	be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
7495	ever be taken. */
7496
7497	HANDLE_NUMERICAL_RECURSION:
7498
7499	if ((refsign = *ptr) == CHAR_PLUS)
7500	{
7501	ptr++;
7502	if (!IS_DIGIT(*ptr))
7503	{
7504	*errorcodeptr = ERR63;
7505	goto FAILED;
7506	}
7507	}
7508	else if (refsign == CHAR_MINUS)
7509	{
7510	if (!IS_DIGIT(ptr[`1`]))
7511	goto OTHER_CHAR_AFTER_QUERY;
7512	ptr++;
7513	}
7514
7515	recno = `0`;
7516	while(IS_DIGIT(*ptr))
7517	{
7518	if (recno > INT_MAX / `10` - `1`) / Integer overflow /
7519	{
7520	while (IS_DIGIT(*ptr)) ptr++;
7521	*errorcodeptr = ERR61;
7522	goto FAILED;
7523	}
7524	recno = recno * `10` + *ptr++ - CHAR_0;
7525	}
7526
7527	if (*ptr != (pcre_uchar)terminator)
7528	{
7529	*errorcodeptr = ERR29;
7530	goto FAILED;
7531	}
7532
7533	if (refsign == CHAR_MINUS)
7534	{
7535	if (recno == `0`)
7536	{
7537	*errorcodeptr = ERR58;
7538	goto FAILED;
7539	}
7540	recno = cd->bracount - recno + `1`;
7541	if (recno <= `0`)
7542	{
7543	*errorcodeptr = ERR15;
7544	goto FAILED;
7545	}
7546	}
7547	else if (refsign == CHAR_PLUS)
7548	{
7549	if (recno == `0`)
7550	{
7551	*errorcodeptr = ERR58;
7552	goto FAILED;
7553	}
7554	recno += cd->bracount;
7555	}
7556
7557	/ Come here from code above that handles a named recursion /
7558
7559	HANDLE_RECURSION:
7560
7561	previous = code;
7562	item_hwm_offset = cd->hwm - cd->start_workspace;
7563	called = cd->start_code;
7564
7565	/ When we are actually compiling, find the bracket that is being*
7566	referenced. Temporarily end the regex in case it doesn't exist before
7567	this point. If we end up with a forward reference, first check that
7568	the bracket does occur later so we can give the error (and position)
7569	now. Then remember this forward reference in the workspace so it can
7570	be filled in at the end. /*
7571
7572	if (lengthptr == NULL)
7573	{
7574	*code = OP_END;
7575	if (recno != `0`)
7576	called = PRIV(find_bracket)(cd->start_code, utf, recno);
7577
7578	/ Forward reference /
7579
7580	if (called == NULL)
7581	{
7582	if (recno > cd->final_bracount)
7583	{
7584	*errorcodeptr = ERR15;
7585	goto FAILED;
7586	}
7587
7588	/ Fudge the value of "called" so that when it is inserted as an*
7589	offset below, what it actually inserted is the reference number
7590	of the group. Then remember the forward reference. /*
7591
7592	called = cd->start_code + recno;
7593	if (cd->hwm >= cd->start_workspace + cd->workspace_size -
7594	WORK_SIZE_SAFETY_MARGIN)
7595	{
7596	*errorcodeptr = expand_workspace(cd);
7597	if (errorcodeptr != `0`) goto* FAILED;
7598	}
7599	PUTINC(cd->hwm, `0`, (int)(code + `1` - cd->start_code));
7600	}
7601
7602	/ If not a forward reference, and the subpattern is still open,*
7603	this is a recursive call. We check to see if this is a left
7604	recursion that could loop for ever, and diagnose that case. We
7605	must not, however, do this check if we are in a conditional
7606	subpattern because the condition might be testing for recursion in
7607	a pattern such as /(?(R)a+\|(?R)b)/, which is perfectly valid.
7608	Forever loops are also detected at runtime, so those that occur in
7609	conditional subpatterns will be picked up then. /*
7610
7611	else if (GET(called, `1`) == `0` && cond_depth <= `0` &&
7612	could_be_empty(called, code, bcptr, utf, cd))
7613	{
7614	*errorcodeptr = ERR40;
7615	goto FAILED;
7616	}
7617	}
7618
7619	/ Insert the recursion/subroutine item. It does not have a set first*
7620	character (relevant if it is repeated, because it will then be
7621	wrapped with ONCE brackets). /*
7622
7623	*code = OP_RECURSE;
7624	PUT(code, `1`, (int)(called - cd->start_code));
7625	code += `1` + LINK_SIZE;
7626	groupsetfirstchar = FALSE;
7627	}
7628
7629	/ Can't determine a first byte now /
7630
7631	if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7632	continue;
7633
7634
7635	/ ------------------------------------------------------------ /
7636	default: / Other characters: check option setting /
7637	OTHER_CHAR_AFTER_QUERY:
7638	set = unset = `0`;
7639	optset = &set;
7640
7641	while (ptr != CHAR_RIGHT_PARENTHESIS && ptr != CHAR_COLON)
7642	{
7643	switch (*ptr++)
7644	{
7645	case CHAR_MINUS: optset = &unset; break;
7646
7647	case CHAR_J: / Record that it changed in the external options /
7648	*optset \|= PCRE_DUPNAMES;
7649	cd->external_flags \|= PCRE_JCHANGED;
7650	break;
7651
7652	case CHAR_i: optset \|= PCRE_CASELESS; break*;
7653	case CHAR_m: optset \|= PCRE_MULTILINE; break*;
7654	case CHAR_s: optset \|= PCRE_DOTALL; break*;
7655	case CHAR_x: optset \|= PCRE_EXTENDED; break*;
7656	case CHAR_U: optset \|= PCRE_UNGREEDY; break*;
7657	case CHAR_X: optset \|= PCRE_EXTRA; break*;
7658
7659	default: *errorcodeptr = ERR12;
7660	ptr--; / Correct the offset /
7661	goto FAILED;
7662	}
7663	}
7664
7665	/ Set up the changed option bits, but don't change anything yet. /
7666
7667	newoptions = (options \| set) & (~unset);
7668
7669	/ If the options ended with ')' this is not the start of a nested*
7670	group with option changes, so the options change at this level.
7671	If we are not at the pattern start, reset the greedy defaults and the
7672	case value for firstchar and reqchar. /*
7673
7674	if (*ptr == CHAR_RIGHT_PARENTHESIS)
7675	{
7676	greedy_default = ((newoptions & PCRE_UNGREEDY) != `0`);
7677	greedy_non_default = greedy_default ^ `1`;
7678	req_caseopt = ((newoptions & PCRE_CASELESS) != `0`)? REQ_CASELESS:`0`;
7679
7680	/ Change options at this level, and pass them back for use*
7681	in subsequent branches. /*
7682
7683	*optionsptr = options = newoptions;
7684	previous = NULL; / This item can't be repeated /
7685	continue; / It is complete /
7686	}
7687
7688	/ If the options ended with ':' we are heading into a nested group*
7689	with possible change of options. Such groups are non-capturing and are
7690	not assertions of any kind. All we need to do is skip over the ':';
7691	the newoptions value is handled below. /*
7692
7693	bravalue = OP_BRA;
7694	ptr++;
7695	} / End of switch for character following (? /
7696	} / End of (? handling /
7697
7698	/* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
7699	is set, all unadorned brackets become non-capturing and behave like (?:...)
7700	brackets. */
7701
7702	else if ((options & PCRE_NO_AUTO_CAPTURE) != `0`)
7703	{
7704	bravalue = OP_BRA;
7705	}
7706
7707	/ Else we have a capturing group. /
7708
7709	else
7710	{
7711	NUMBERED_GROUP:
7712	cd->bracount += `1`;
7713	PUT2(code, `1`+LINK_SIZE, cd->bracount);
7714	skipbytes = IMM2_SIZE;
7715	}
7716
7717	/ Process nested bracketed regex. First check for parentheses nested too*
7718	deeply. /*
7719
7720	if ((cd->parens_depth += `1`) > PARENS_NEST_LIMIT)
7721	{
7722	*errorcodeptr = ERR82;
7723	goto FAILED;
7724	}
7725
7726	/ All assertions used not to be repeatable, but this was changed for Perl*
7727	compatibility. All kinds can now be repeated except for assertions that are
7728	conditions (Perl also forbids these to be repeated). We copy code into a
7729	non-register variable (tempcode) in order to be able to pass its address
7730	because some compilers complain otherwise. At the start of a conditional
7731	group whose condition is an assertion, cd->iscondassert is set. We unset it
7732	here so as to allow assertions later in the group to be quantified. /*
7733
7734	if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
7735	cd->iscondassert)
7736	{
7737	previous = NULL;
7738	cd->iscondassert = FALSE;
7739	}
7740	else
7741	{
7742	previous = code;
7743	item_hwm_offset = cd->hwm - cd->start_workspace;
7744	}
7745
7746	*code = bravalue;
7747	tempcode = code;
7748	tempreqvary = cd->req_varyopt; / Save value before bracket /
7749	tempbracount = cd->bracount; / Save value before bracket /
7750	length_prevgroup = `0`; / Initialize for pre-compile phase /
7751
7752	if (!compile_regex(
7753	newoptions, / The complete new option state /
7754	&tempcode, / Where to put code (updated) /
7755	&ptr, / Input pointer (updated) /
7756	errorcodeptr, / Where to put an error message /
7757	(bravalue == OP_ASSERTBACK \|\|
7758	bravalue == OP_ASSERTBACK_NOT), / TRUE if back assert /
7759	reset_bracount, / True if (?\| group /
7760	skipbytes, / Skip over bracket number /
7761	cond_depth +
7762	((bravalue == OP_COND)?`1`:`0`), / Depth of condition subpatterns /
7763	&subfirstchar, / For possible first char /
7764	&subfirstcharflags,
7765	&subreqchar, / For possible last char /
7766	&subreqcharflags,
7767	bcptr, / Current branch chain /
7768	cd, / Tables block /
7769	(lengthptr == NULL)? NULL : / Actual compile phase /
7770	&length_prevgroup / Pre-compile phase /
7771	))
7772	goto FAILED;
7773
7774	cd->parens_depth -= `1`;
7775
7776	/ If this was an atomic group and there are no capturing groups within it,*
7777	generate OP_ONCE_NC instead of OP_ONCE. /*
7778
7779	if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
7780	*code = OP_ONCE_NC;
7781
7782	if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
7783	cd->assert_depth -= `1`;
7784
7785	/ At the end of compiling, code is still pointing to the start of the*
7786	group, while tempcode has been updated to point past the end of the group.
7787	The pattern pointer (ptr) is on the bracket.
7788
7789	If this is a conditional bracket, check that there are no more than
7790	two branches in the group, or just one if it's a DEFINE group. We do this
7791	in the real compile phase, not in the pre-pass, where the whole group may
7792	not be available. /*
7793
7794	if (bravalue == OP_COND && lengthptr == NULL)
7795	{
7796	pcre_uchar *tc = code;
7797	int condcount = `0`;
7798
7799	do {
7800	condcount++;
7801	tc += GET(tc,`1`);
7802	}
7803	while (*tc != OP_KET);
7804
7805	/ A DEFINE group is never obeyed inline (the "condition" is always*
7806	false). It must have only one branch. /*
7807
7808	if (code[LINK_SIZE+`1`] == OP_DEF)
7809	{
7810	if (condcount > `1`)
7811	{
7812	*errorcodeptr = ERR54;
7813	goto FAILED;
7814	}
7815	bravalue = OP_DEF; / Just a flag to suppress char handling below /
7816	}
7817
7818	/ A "normal" conditional group. If there is just one branch, we must not*
7819	make use of its firstchar or reqchar, because this is equivalent to an
7820	empty second branch. /*
7821
7822	else
7823	{
7824	if (condcount > `2`)
7825	{
7826	*errorcodeptr = ERR27;
7827	goto FAILED;
7828	}
7829	if (condcount == `1`) subfirstcharflags = subreqcharflags = REQ_NONE;
7830	}
7831	}
7832
7833	/ Error if hit end of pattern /
7834
7835	if (*ptr != CHAR_RIGHT_PARENTHESIS)
7836	{
7837	*errorcodeptr = ERR14;
7838	goto FAILED;
7839	}
7840
7841	/ In the pre-compile phase, update the length by the length of the group,*
7842	less the brackets at either end. Then reduce the compiled code to just a
7843	set of non-capturing brackets so that it doesn't use much memory if it is
7844	duplicated by a quantifier./*
7845
7846	if (lengthptr != NULL)
7847	{
7848	if (OFLOW_MAX - lengthptr < length_prevgroup - `2` - `2`LINK_SIZE)
7849	{
7850	*errorcodeptr = ERR20;
7851	goto FAILED;
7852	}
7853	lengthptr += length_prevgroup - `2` - `2`LINK_SIZE;
7854	code++; / This already contains bravalue /
7855	PUTINC(code, `0`, `1` + LINK_SIZE);
7856	*code++ = OP_KET;
7857	PUTINC(code, `0`, `1` + LINK_SIZE);
7858	break; / No need to waste time with special character handling /
7859	}
7860
7861	/ Otherwise update the main code pointer to the end of the group. /
7862
7863	code = tempcode;
7864
7865	/ For a DEFINE group, required and first character settings are not*
7866	relevant. /*
7867
7868	if (bravalue == OP_DEF) break;
7869
7870	/ Handle updating of the required and first characters for other types of*
7871	group. Update for normal brackets of all kinds, and conditions with two
7872	branches (see code above). If the bracket is followed by a quantifier with
7873	zero repeat, we have to back off. Hence the definition of zeroreqchar and
7874	zerofirstchar outside the main loop so that they can be accessed for the
7875	back off. /*
7876
7877	zeroreqchar = reqchar;
7878	zeroreqcharflags = reqcharflags;
7879	zerofirstchar = firstchar;
7880	zerofirstcharflags = firstcharflags;
7881	groupsetfirstchar = FALSE;
7882
7883	if (bravalue >= OP_ONCE)
7884	{
7885	/ If we have not yet set a firstchar in this branch, take it from the*
7886	subpattern, remembering that it was set here so that a repeat of more
7887	than one can replicate it as reqchar if necessary. If the subpattern has
7888	no firstchar, set "none" for the whole branch. In both cases, a zero
7889	repeat forces firstchar to "none". /*
7890
7891	if (firstcharflags == REQ_UNSET)
7892	{
7893	if (subfirstcharflags >= `0`)
7894	{
7895	firstchar = subfirstchar;
7896	firstcharflags = subfirstcharflags;
7897	groupsetfirstchar = TRUE;
7898	}
7899	else firstcharflags = REQ_NONE;
7900	zerofirstcharflags = REQ_NONE;
7901	}
7902
7903	/ If firstchar was previously set, convert the subpattern's firstchar*
7904	into reqchar if there wasn't one, using the vary flag that was in
7905	existence beforehand. /*
7906
7907	else if (subfirstcharflags >= `0` && subreqcharflags < `0`)
7908	{
7909	subreqchar = subfirstchar;
7910	subreqcharflags = subfirstcharflags \| tempreqvary;
7911	}
7912
7913	/ If the subpattern set a required byte (or set a first byte that isn't*
7914	really the first byte - see above), set it. /*
7915
7916	if (subreqcharflags >= `0`)
7917	{
7918	reqchar = subreqchar;
7919	reqcharflags = subreqcharflags;
7920	}
7921	}
7922
7923	/ For a forward assertion, we take the reqchar, if set, provided that the*
7924	group has also set a first char. This can be helpful if the pattern that
7925	follows the assertion doesn't set a different char. For example, it's
7926	useful for /(?=abcde).+/. We can't set firstchar for an assertion, however
7927	because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
7928	the "real" "a" would then become a reqchar instead of a firstchar. This is
7929	overcome by a scan at the end if there's no firstchar, looking for an
7930	asserted first char. /*
7931
7932	else if (bravalue == OP_ASSERT && subreqcharflags >= `0` &&
7933	subfirstcharflags >= `0`)
7934	{
7935	reqchar = subreqchar;
7936	reqcharflags = subreqcharflags;
7937	}
7938	break; / End of processing '(' /
7939
7940
7941	/ ===================================================================/
7942	/ Handle metasequences introduced by \. For ones like \d, the ESC_ values*
7943	are arranged to be the negation of the corresponding OP_values in the
7944	default case when PCRE_UCP is not set. For the back references, the values
7945	are negative the reference number. Only back references and those types
7946	that consume a character may be repeated. We can test for values between
7947	ESC_b and ESC_Z for the latter; this may have to change if any new ones are
7948	ever created. /*
7949
7950	case CHAR_BACKSLASH:
7951	tempptr = ptr;
7952	escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
7953	if (errorcodeptr != `0`) goto* FAILED;
7954
7955	if (escape == `0`) / The escape coded a single character /
7956	c = ec;
7957	else
7958	{
7959	/ For metasequences that actually match a character, we disable the*
7960	setting of a first character if it hasn't already been set. /*
7961
7962	if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
7963	firstcharflags = REQ_NONE;
7964
7965	/ Set values to reset to if this is followed by a zero repeat. /
7966
7967	zerofirstchar = firstchar;
7968	zerofirstcharflags = firstcharflags;
7969	zeroreqchar = reqchar;
7970	zeroreqcharflags = reqcharflags;
7971
7972	/ \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'*
7973	is a subroutine call by number (Oniguruma syntax). In fact, the value
7974	ESC_g is returned only for these cases. So we don't need to check for <
7975	or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
7976	-n, and for the Perl syntax \g{name} the result is ESC_k (as
7977	that is a synonym for a named back reference). /*
7978
7979	if (escape == ESC_g)
7980	{
7981	const pcre_uchar *p;
7982	pcre_uint32 cf;
7983
7984	item_hwm_offset = cd->hwm - cd->start_workspace; / Normally this is set when '(' is read /
7985	terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7986	CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7987
7988	/ These two statements stop the compiler for warning about possibly*
7989	unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
7990	fact, because we do the check for a number below, the paths that
7991	would actually be in error are never taken. /*
7992
7993	skipbytes = `0`;
7994	reset_bracount = FALSE;
7995
7996	/ If it's not a signed or unsigned number, treat it as a name. /
7997
7998	cf = ptr[`1`];
7999	if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
8000	{
8001	is_recurse = TRUE;
8002	goto NAMED_REF_OR_RECURSE;
8003	}
8004
8005	/ Signed or unsigned number (cf = ptr[1]) is known to be plus or minus*
8006	or a digit. /*
8007
8008	p = ptr + `2`;
8009	while (IS_DIGIT(*p)) p++;
8010	if (*p != (pcre_uchar)terminator)
8011	{
8012	*errorcodeptr = ERR57;
8013	goto FAILED;
8014	}
8015	ptr++;
8016	goto HANDLE_NUMERICAL_RECURSION;
8017	}
8018
8019	/ \k<name> or \k'name' is a back reference by name (Perl syntax).*
8020	We also support \k{name} (.NET syntax). /*
8021
8022	if (escape == ESC_k)
8023	{
8024	if ((ptr[`1`] != CHAR_LESS_THAN_SIGN &&
8025	ptr[`1`] != CHAR_APOSTROPHE && ptr[`1`] != CHAR_LEFT_CURLY_BRACKET))
8026	{
8027	*errorcodeptr = ERR69;
8028	goto FAILED;
8029	}
8030	is_recurse = FALSE;
8031	terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
8032	CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
8033	CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
8034	goto NAMED_REF_OR_RECURSE;
8035	}
8036
8037	/ Back references are handled specially; must disable firstchar if*
8038	not set to cope with cases like (?=(\w+))\1: which would otherwise set
8039	':' later. /*
8040
8041	if (escape < `0`)
8042	{
8043	open_capitem *oc;
8044	recno = -escape;
8045
8046	/ Come here from named backref handling when the reference is to a*
8047	single group (i.e. not to a duplicated name. /*
8048
8049	HANDLE_REFERENCE:
8050	if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
8051	previous = code;
8052	item_hwm_offset = cd->hwm - cd->start_workspace;
8053	*code++ = ((options & PCRE_CASELESS) != `0`)? OP_REFI : OP_REF;
8054	PUT2INC(code, `0`, recno);
8055	cd->backref_map \|= (recno < `32`)? (`1` << recno) : `1`;
8056	if (recno > cd->top_backref) cd->top_backref = recno;
8057
8058	/ Check to see if this back reference is recursive, that it, it*
8059	is inside the group that it references. A flag is set so that the
8060	group can be made atomic. /*
8061
8062	for (oc = cd->open_caps; oc != NULL; oc = oc->next)
8063	{
8064	if (oc->number == recno)
8065	{
8066	oc->flag = TRUE;
8067	break;
8068	}
8069	}
8070	}
8071
8072	/ So are Unicode property matches, if supported. /
8073
8074	#ifdef SUPPORT_UCP
8075	else if (escape == ESC_P \|\| escape == ESC_p)
8076	{
8077	BOOL negated;
8078	unsigned int ptype = `0`, pdata = `0`;
8079	if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
8080	goto FAILED;
8081	previous = code;
8082	item_hwm_offset = cd->hwm - cd->start_workspace;
8083	*code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
8084	*code++ = ptype;
8085	*code++ = pdata;
8086	}
8087	#else
8088
8089	/ If Unicode properties are not supported, \X, \P, and \p are not*
8090	allowed. /*
8091
8092	else if (escape == ESC_X \|\| escape == ESC_P \|\| escape == ESC_p)
8093	{
8094	*errorcodeptr = ERR45;
8095	goto FAILED;
8096	}
8097	#endif
8098
8099	/ For the rest (including \X when Unicode properties are supported), we*
8100	can obtain the OP value by negating the escape value in the default
8101	situation when PCRE_UCP is not set. When it is* set, we substitute*
8102	Unicode property tests. Note that \b and \B do a one-character
8103	lookbehind, and \A also behaves as if it does. /*
8104
8105	else
8106	{
8107	if ((escape == ESC_b \|\| escape == ESC_B \|\| escape == ESC_A) &&
8108	cd->max_lookbehind == `0`)
8109	cd->max_lookbehind = `1`;
8110	#ifdef SUPPORT_UCP
8111	if (escape >= ESC_DU && escape <= ESC_wu)
8112	{
8113	nestptr = ptr + `1`; / Where to resume /
8114	ptr = substitutes[escape - ESC_DU] - `1`; / Just before substitute /
8115	}
8116	else
8117	#endif
8118	/ In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE*
8119	so that it works in DFA mode and in lookbehinds. /*
8120
8121	{
8122	previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
8123	item_hwm_offset = cd->hwm - cd->start_workspace;
8124	*code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
8125	}
8126	}
8127	continue;
8128	}
8129
8130	/ We have a data character whose value is in c. In UTF-8 mode it may have*
8131	a value > 127. We set its representation in the length/buffer, and then
8132	handle it as a data character. /*
8133
8134	#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
8135	if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
8136	mclength = PRIV(ord2utf)(c, mcbuffer);
8137	else
8138	#endif
8139
8140	{
8141	mcbuffer[`0`] = c;
8142	mclength = `1`;
8143	}
8144	goto ONE_CHAR;
8145
8146
8147	/ ===================================================================/
8148	/ Handle a literal character. It is guaranteed not to be whitespace or #*
8149	when the extended flag is set. If we are in a UTF mode, it may be a
8150	multi-unit literal character. /*
8151
8152	default:
8153	NORMAL_CHAR:
8154	mclength = `1`;
8155	mcbuffer[`0`] = c;
8156
8157	#ifdef SUPPORT_UTF
8158	if (utf && HAS_EXTRALEN(c))
8159	ACROSSCHAR(TRUE, ptr[`1`], mcbuffer[mclength++] = *(++ptr));
8160	#endif
8161
8162	/ At this point we have the character's bytes in mcbuffer, and the length*
8163	in mclength. When not in UTF-8 mode, the length is always 1. /*
8164
8165	ONE_CHAR:
8166	previous = code;
8167	item_hwm_offset = cd->hwm - cd->start_workspace;
8168
8169	/ For caseless UTF-8 mode when UCP support is available, check whether*
8170	this character has more than one other case. If so, generate a special
8171	OP_PROP item instead of OP_CHARI. /*
8172
8173	#ifdef SUPPORT_UCP
8174	if (utf && (options & PCRE_CASELESS) != `0`)
8175	{
8176	GETCHAR(c, mcbuffer);
8177	if ((c = UCD_CASESET(c)) != `0`)
8178	{
8179	*code++ = OP_PROP;
8180	*code++ = PT_CLIST;
8181	*code++ = c;
8182	if (firstcharflags == REQ_UNSET)
8183	firstcharflags = zerofirstcharflags = REQ_NONE;
8184	break;
8185	}
8186	}
8187	#endif
8188
8189	/ Caseful matches, or not one of the multicase characters. /
8190
8191	*code++ = ((options & PCRE_CASELESS) != `0`)? OP_CHARI : OP_CHAR;
8192	for (c = `0`; c < mclength; c++) *code++ = mcbuffer[c];
8193
8194	/ Remember if \r or \n were seen /
8195
8196	if (mcbuffer[`0`] == CHAR_CR \|\| mcbuffer[`0`] == CHAR_NL)
8197	cd->external_flags \|= PCRE_HASCRORLF;
8198
8199	/ Set the first and required bytes appropriately. If no previous first*
8200	byte, set it from this character, but revert to none on a zero repeat.
8201	Otherwise, leave the firstchar value alone, and don't change it on a zero
8202	repeat. /*
8203
8204	if (firstcharflags == REQ_UNSET)
8205	{
8206	zerofirstcharflags = REQ_NONE;
8207	zeroreqchar = reqchar;
8208	zeroreqcharflags = reqcharflags;
8209
8210	/ If the character is more than one byte long, we can set firstchar*
8211	only if it is not to be matched caselessly. /*
8212
8213	if (mclength == `1` \|\| req_caseopt == `0`)
8214	{
8215	firstchar = mcbuffer[`0`] \| req_caseopt;
8216	firstchar = mcbuffer[`0`];
8217	firstcharflags = req_caseopt;
8218
8219	if (mclength != `1`)
8220	{
8221	reqchar = code[-`1`];
8222	reqcharflags = cd->req_varyopt;
8223	}
8224	}
8225	else firstcharflags = reqcharflags = REQ_NONE;
8226	}
8227
8228	/ firstchar was previously set; we can set reqchar only if the length is*
8229	1 or the matching is caseful. /*
8230
8231	else
8232	{
8233	zerofirstchar = firstchar;
8234	zerofirstcharflags = firstcharflags;
8235	zeroreqchar = reqchar;
8236	zeroreqcharflags = reqcharflags;
8237	if (mclength == `1` \|\| req_caseopt == `0`)
8238	{
8239	reqchar = code[-`1`];
8240	reqcharflags = req_caseopt \| cd->req_varyopt;
8241	}
8242	}
8243
8244	break; / End of literal character handling /
8245	}
8246	} / end of big loop /
8247
8248
8249	/ Control never reaches here by falling through, only by a goto for all the*
8250	error states. Pass back the position in the pattern so that it can be displayed
8251	to the user for diagnosing the error. /*
8252
8253	FAILED:
8254	*ptrptr = ptr;
8255	return FALSE;
8256	}
8257
8258
8259
8260	/*************************************************
8261	* Compile sequence of alternatives *
8262	*************************************************/
8263
8264	/ On entry, ptr is pointing past the bracket character, but on return it*
8265	points to the closing bracket, or vertical bar, or end of string. The code
8266	variable is pointing at the byte into which the BRA operator has been stored.
8267	This function is used during the pre-compile phase when we are trying to find
8268	out the amount of memory needed, as well as during the real compile phase. The
8269	value of lengthptr distinguishes the two phases.
8270
8271	Arguments:
8272	options option bits, including any changes for this subpattern
8273	codeptr -> the address of the current code pointer
8274	ptrptr -> the address of the current pattern pointer
8275	errorcodeptr -> pointer to error code variable
8276	lookbehind TRUE if this is a lookbehind assertion
8277	reset_bracount TRUE to reset the count for each branch
8278	skipbytes skip this many bytes at start (for brackets and OP_COND)
8279	cond_depth depth of nesting for conditional subpatterns
8280	firstcharptr place to put the first required character
8281	firstcharflagsptr place to put the first character flags, or a negative number
8282	reqcharptr place to put the last required character
8283	reqcharflagsptr place to put the last required character flags, or a negative number
8284	bcptr pointer to the chain of currently open branches
8285	cd points to the data block with tables pointers etc.
8286	lengthptr NULL during the real compile phase
8287	points to length accumulator during pre-compile phase
8288
8289	Returns: TRUE on success
8290	*/
8291
8292	static BOOL
8293	compile_regex(int options, pcre_uchar *codeptr, const* pcre_uchar **ptrptr,
8294	int errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int* skipbytes,
8295	int cond_depth,
8296	pcre_uint32 firstcharptr, pcre_int32 firstcharflagsptr,
8297	pcre_uint32 reqcharptr, pcre_int32 reqcharflagsptr,
8298	branch_chain bcptr, compile_data cd, int *lengthptr)
8299	{
8300	const pcre_uchar ptr = ptrptr;
8301	pcre_uchar code = codeptr;
8302	pcre_uchar *last_branch = code;
8303	pcre_uchar *start_bracket = code;
8304	pcre_uchar *reverse_count = NULL;
8305	open_capitem capitem;
8306	int capnumber = `0`;
8307	pcre_uint32 firstchar, reqchar;
8308	pcre_int32 firstcharflags, reqcharflags;
8309	pcre_uint32 branchfirstchar, branchreqchar;
8310	pcre_int32 branchfirstcharflags, branchreqcharflags;
8311	int length;
8312	unsigned int orig_bracount;
8313	unsigned int max_bracount;
8314	branch_chain bc;
8315	size_t save_hwm_offset;
8316
8317	/ If set, call the external function that checks for stack availability. /
8318
8319	if (PUBL(stack_guard) != NULL && PUBL(stack_guard)())
8320	{
8321	*errorcodeptr= ERR85;
8322	return FALSE;
8323	}
8324
8325	/ Miscellaneous initialization /
8326
8327	bc.outer = bcptr;
8328	bc.current_branch = code;
8329
8330	firstchar = reqchar = `0`;
8331	firstcharflags = reqcharflags = REQ_UNSET;
8332
8333	save_hwm_offset = cd->hwm - cd->start_workspace;
8334
8335	/ Accumulate the length for use in the pre-compile phase. Start with the*
8336	length of the BRA and KET and any extra bytes that are required at the
8337	beginning. We accumulate in a local variable to save frequent testing of
8338	lenthptr for NULL. We cannot do this by looking at the value of code at the
8339	start and end of each alternative, because compiled items are discarded during
8340	the pre-compile phase so that the work space is not exceeded. /*
8341
8342	length = `2` + `2`*LINK_SIZE + skipbytes;
8343
8344	/ WARNING: If the above line is changed for any reason, you must also change*
8345	the code that abstracts option settings at the start of the pattern and makes
8346	them global. It tests the value of length for (2 + 2LINK_SIZE) in the*
8347	pre-compile phase to find out whether anything has yet been compiled or not. /*
8348
8349	/ If this is a capturing subpattern, add to the chain of open capturing items*
8350	so that we can detect them if (ACCEPT) is encountered. This is also used to*
8351	detect groups that contain recursive back references to themselves. Note that
8352	only OP_CBRA need be tested here; changing this opcode to one of its variants,
8353	e.g. OP_SCBRAPOS, happens later, after the group has been compiled. /*
8354
8355	if (*code == OP_CBRA)
8356	{
8357	capnumber = GET2(code, `1` + LINK_SIZE);
8358	capitem.number = capnumber;
8359	capitem.next = cd->open_caps;
8360	capitem.flag = FALSE;
8361	cd->open_caps = &capitem;
8362	}
8363
8364	/ Offset is set zero to mark that this bracket is still open /
8365
8366	PUT(code, `1`, `0`);
8367	code += `1` + LINK_SIZE + skipbytes;
8368
8369	/ Loop for each alternative branch /
8370
8371	orig_bracount = max_bracount = cd->bracount;
8372	for (;;)
8373	{
8374	/ For a (?\| group, reset the capturing bracket count so that each branch*
8375	uses the same numbers. /*
8376
8377	if (reset_bracount) cd->bracount = orig_bracount;
8378
8379	/ Set up dummy OP_REVERSE if lookbehind assertion /
8380
8381	if (lookbehind)
8382	{
8383	*code++ = OP_REVERSE;
8384	reverse_count = code;
8385	PUTINC(code, `0`, `0`);
8386	length += `1` + LINK_SIZE;
8387	}
8388
8389	/ Now compile the branch; in the pre-compile phase its length gets added*
8390	into the length. /*
8391
8392	if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
8393	&branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
8394	cond_depth, cd, (lengthptr == NULL)? NULL : &length))
8395	{
8396	*ptrptr = ptr;
8397	return FALSE;
8398	}
8399
8400	/ Keep the highest bracket count in case (?\| was used and some branch*
8401	has fewer than the rest. /*
8402
8403	if (cd->bracount > max_bracount) max_bracount = cd->bracount;
8404
8405	/ In the real compile phase, there is some post-processing to be done. /
8406
8407	if (lengthptr == NULL)
8408	{
8409	/ If this is the first branch, the firstchar and reqchar values for the*
8410	branch become the values for the regex. /*
8411
8412	if (*last_branch != OP_ALT)
8413	{
8414	firstchar = branchfirstchar;
8415	firstcharflags = branchfirstcharflags;
8416	reqchar = branchreqchar;
8417	reqcharflags = branchreqcharflags;
8418	}
8419
8420	/ If this is not the first branch, the first char and reqchar have to*
8421	match the values from all the previous branches, except that if the
8422	previous value for reqchar didn't have REQ_VARY set, it can still match,
8423	and we set REQ_VARY for the regex. /*
8424
8425	else
8426	{
8427	/ If we previously had a firstchar, but it doesn't match the new branch,*
8428	we have to abandon the firstchar for the regex, but if there was
8429	previously no reqchar, it takes on the value of the old firstchar. /*
8430
8431	if (firstcharflags >= `0` &&
8432	(firstcharflags != branchfirstcharflags \|\| firstchar != branchfirstchar))
8433	{
8434	if (reqcharflags < `0`)
8435	{
8436	reqchar = firstchar;
8437	reqcharflags = firstcharflags;
8438	}
8439	firstcharflags = REQ_NONE;
8440	}
8441
8442	/ If we (now or from before) have no firstchar, a firstchar from the*
8443	branch becomes a reqchar if there isn't a branch reqchar. /*
8444
8445	if (firstcharflags < `0` && branchfirstcharflags >= `0` && branchreqcharflags < `0`)
8446	{
8447	branchreqchar = branchfirstchar;
8448	branchreqcharflags = branchfirstcharflags;
8449	}
8450
8451	/ Now ensure that the reqchars match /
8452
8453	if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) \|\|
8454	reqchar != branchreqchar)
8455	reqcharflags = REQ_NONE;
8456	else
8457	{
8458	reqchar = branchreqchar;
8459	reqcharflags \|= branchreqcharflags; / To "or" REQ_VARY /
8460	}
8461	}
8462
8463	/ If lookbehind, check that this branch matches a fixed-length string, and*
8464	put the length into the OP_REVERSE item. Temporarily mark the end of the
8465	branch with OP_END. If the branch contains OP_RECURSE, the result is -3
8466	because there may be forward references that we can't check here. Set a
8467	flag to cause another lookbehind check at the end. Why not do it all at the
8468	end? Because common, erroneous checks are picked up here and the offset of
8469	the problem can be shown. /*
8470
8471	if (lookbehind)
8472	{
8473	int fixed_length;
8474	*code = OP_END;
8475	fixed_length = find_fixedlength(last_branch, (options & PCRE_UTF8) != `0`,
8476	FALSE, cd, NULL);
8477	DPRINTF(("fixed length = %d\n", fixed_length));
8478	if (fixed_length == -`3`)
8479	{
8480	cd->check_lookbehind = TRUE;
8481	}
8482	else if (fixed_length < `0`)
8483	{
8484	*errorcodeptr = (fixed_length == -`2`)? ERR36 :
8485	(fixed_length == -`4`)? ERR70: ERR25;
8486	*ptrptr = ptr;
8487	return FALSE;
8488	}
8489	else
8490	{
8491	if (fixed_length > cd->max_lookbehind)
8492	cd->max_lookbehind = fixed_length;
8493	PUT(reverse_count, `0`, fixed_length);
8494	}
8495	}
8496	}
8497
8498	/ Reached end of expression, either ')' or end of pattern. In the real*
8499	compile phase, go back through the alternative branches and reverse the chain
8500	of offsets, with the field in the BRA item now becoming an offset to the
8501	first alternative. If there are no alternatives, it points to the end of the
8502	group. The length in the terminating ket is always the length of the whole
8503	bracketed item. Return leaving the pointer at the terminating char. /*
8504
8505	if (*ptr != CHAR_VERTICAL_LINE)
8506	{
8507	if (lengthptr == NULL)
8508	{
8509	int branch_length = (int)(code - last_branch);
8510	do
8511	{
8512	int prev_length = GET(last_branch, `1`);
8513	PUT(last_branch, `1`, branch_length);
8514	branch_length = prev_length;
8515	last_branch -= branch_length;
8516	}
8517	while (branch_length > `0`);
8518	}
8519
8520	/ Fill in the ket /
8521
8522	*code = OP_KET;
8523	PUT(code, `1`, (int)(code - start_bracket));
8524	code += `1` + LINK_SIZE;
8525
8526	/ If it was a capturing subpattern, check to see if it contained any*
8527	recursive back references. If so, we must wrap it in atomic brackets.
8528	Because we are moving code along, we must ensure that any pending recursive
8529	references are updated. In any event, remove the block from the chain. /*
8530
8531	if (capnumber > `0`)
8532	{
8533	if (cd->open_caps->flag)
8534	{
8535	*code = OP_END;
8536	adjust_recurse(start_bracket, `1` + LINK_SIZE,
8537	(options & PCRE_UTF8) != `0`, cd, save_hwm_offset);
8538	memmove(start_bracket + `1` + LINK_SIZE, start_bracket,
8539	IN_UCHARS(code - start_bracket));
8540	*start_bracket = OP_ONCE;
8541	code += `1` + LINK_SIZE;
8542	PUT(start_bracket, `1`, (int)(code - start_bracket));
8543	*code = OP_KET;
8544	PUT(code, `1`, (int)(code - start_bracket));
8545	code += `1` + LINK_SIZE;
8546	length += `2` + `2`*LINK_SIZE;
8547	}
8548	cd->open_caps = cd->open_caps->next;
8549	}
8550
8551	/ Retain the highest bracket number, in case resetting was used. /
8552
8553	cd->bracount = max_bracount;
8554
8555	/ Set values to pass back /
8556
8557	*codeptr = code;
8558	*ptrptr = ptr;
8559	*firstcharptr = firstchar;
8560	*firstcharflagsptr = firstcharflags;
8561	*reqcharptr = reqchar;
8562	*reqcharflagsptr = reqcharflags;
8563	if (lengthptr != NULL)
8564	{
8565	if (OFLOW_MAX - *lengthptr < length)
8566	{
8567	*errorcodeptr = ERR20;
8568	return FALSE;
8569	}
8570	*lengthptr += length;
8571	}
8572	return TRUE;
8573	}
8574
8575	/ Another branch follows. In the pre-compile phase, we can move the code*
8576	pointer back to where it was for the start of the first branch. (That is,
8577	pretend that each branch is the only one.)
8578
8579	In the real compile phase, insert an ALT node. Its length field points back
8580	to the previous branch while the bracket remains open. At the end the chain
8581	is reversed. It's done like this so that the start of the bracket has a
8582	zero offset until it is closed, making it possible to detect recursion. /*
8583
8584	if (lengthptr != NULL)
8585	{
8586	code = *codeptr + `1` + LINK_SIZE + skipbytes;
8587	length += `1` + LINK_SIZE;
8588	}
8589	else
8590	{
8591	*code = OP_ALT;
8592	PUT(code, `1`, (int)(code - last_branch));
8593	bc.current_branch = last_branch = code;
8594	code += `1` + LINK_SIZE;
8595	}
8596
8597	ptr++;
8598	}
8599	/ Control never reaches here /
8600	}
8601
8602
8603
8604
8605	/*************************************************
8606	* Check for anchored expression *
8607	*************************************************/
8608
8609	/ Try to find out if this is an anchored regular expression. Consider each*
8610	alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8611	all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8612	it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8613	be found, because ^ generates OP_CIRCM in that mode.
8614
8615	We can also consider a regex to be anchored if OP_SOM starts all its branches.
8616	This is the code for \G, which means "match at start of match position, taking
8617	into account the match offset".
8618
8619	A branch is also implicitly anchored if it starts with . and DOTALL is set,*
8620	because that will try the rest of the pattern at all possible matching points,
8621	so there is no point trying again.... er ....
8622
8623	.... except when the . appears inside capturing parentheses, and there is a*
8624	subsequent back reference to those parentheses. We haven't enough information
8625	to catch that case precisely.
8626
8627	At first, the best we could do was to detect when . was in capturing brackets*
8628	and the highest back reference was greater than or equal to that level.
8629	However, by keeping a bitmap of the first 31 back references, we can catch some
8630	of the more common cases more precisely.
8631
8632	... A second exception is when the . appears inside an atomic group, because*
8633	this prevents the number of characters it matches from being adjusted.
8634
8635	Arguments:
8636	code points to start of expression (the bracket)
8637	bracket_map a bitmap of which brackets we are inside while testing; this
8638	handles up to substring 31; after that we just have to take
8639	the less precise approach
8640	cd points to the compile data block
8641	atomcount atomic group level
8642
8643	Returns: TRUE or FALSE
8644	*/
8645
8646	static BOOL
8647	is_anchored(register const pcre_uchar code, unsigned* int bracket_map,
8648	compile_data cd, int* atomcount)
8649	{
8650	do {
8651	const pcre_uchar *scode = first_significant_code(
8652	code + PRIV(OP_lengths)[*code], FALSE);
8653	register int op = *scode;
8654
8655	/ Non-capturing brackets /
8656
8657	if (op == OP_BRA \|\| op == OP_BRAPOS \|\|
8658	op == OP_SBRA \|\| op == OP_SBRAPOS)
8659	{
8660	if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8661	}
8662
8663	/ Capturing brackets /
8664
8665	else if (op == OP_CBRA \|\| op == OP_CBRAPOS \|\|
8666	op == OP_SCBRA \|\| op == OP_SCBRAPOS)
8667	{
8668	int n = GET2(scode, `1`+LINK_SIZE);
8669	int new_map = bracket_map \| ((n < `32`)? (`1` << n) : `1`);
8670	if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
8671	}
8672
8673	/ Positive forward assertions and conditions /
8674
8675	else if (op == OP_ASSERT \|\| op == OP_COND)
8676	{
8677	if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8678	}
8679
8680	/ Atomic groups /
8681
8682	else if (op == OP_ONCE \|\| op == OP_ONCE_NC)
8683	{
8684	if (!is_anchored(scode, bracket_map, cd, atomcount + `1`))
8685	return FALSE;
8686	}
8687
8688	/ .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and*
8689	it isn't in brackets that are or may be referenced or inside an atomic
8690	group. /*
8691
8692	else if ((op == OP_TYPESTAR \|\| op == OP_TYPEMINSTAR \|\|
8693	op == OP_TYPEPOSSTAR))
8694	{
8695	if (scode[`1`] != OP_ALLANY \|\| (bracket_map & cd->backref_map) != `0` \|\|
8696	atomcount > `0` \|\| cd->had_pruneorskip)
8697	return FALSE;
8698	}
8699
8700	/ Check for explicit anchoring /
8701
8702	else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8703
8704	code += GET(code, `1`);
8705	}
8706	while (code == OP_ALT); /* Loop for each alternative /
8707	return TRUE;
8708	}
8709
8710
8711
/*************************************************
*         Check for starting with ^ or .*        *
*************************************************/

/* This is called to find out if every branch starts with ^ or .* so that
"first char" processing can be done to speed things up in multiline
matching and for non-DOTALL patterns that start with .* (which must start at
the beginning or after \n). As in the case of is_anchored() (see above), we
have to take account of back references to capturing brackets that contain .*
because in that case we can't make the assumption. Also, the appearance of .*
inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
or *SKIP does not count, because once again the assumption no longer holds.

Arguments:
  code           points to start of expression (the bracket)
  bracket_map    a bitmap of which brackets we are inside while testing; this
                  handles up to substring 31; after that we just have to take
                  the less precise approach
  cd             points to the compile data
  atomcount      atomic group level
  inassert       TRUE if in an assertion

Returns:         TRUE or FALSE
*/
8736
8737	static BOOL
8738	is_startline(const pcre_uchar code, unsigned* int bracket_map,
8739	compile_data cd, int* atomcount, BOOL inassert)
8740	{
8741	do {
8742	const pcre_uchar *scode = first_significant_code(
8743	code + PRIV(OP_lengths)[*code], FALSE);
8744	register int op = *scode;
8745
8746	/ If we are at the start of a conditional assertion group, both the*
8747	conditional assertion and* what follows the condition must satisfy the test*
8748	for start of line. Other kinds of condition fail. Note that there may be an
8749	auto-callout at the start of a condition. /*
8750
8751	if (op == OP_COND)
8752	{
8753	scode += `1` + LINK_SIZE;
8754	if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8755	switch (*scode)
8756	{
8757	case OP_CREF:
8758	case OP_DNCREF:
8759	case OP_RREF:
8760	case OP_DNRREF:
8761	case OP_DEF:
8762	case OP_FAIL:
8763	return FALSE;
8764
8765	default: / Assertion /
8766	if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
8767	do scode += GET(scode, `1`); while (*scode == OP_ALT);
8768	scode += `1` + LINK_SIZE;
8769	break;
8770	}
8771	scode = first_significant_code(scode, FALSE);
8772	op = *scode;
8773	}
8774
8775	/ Non-capturing brackets /
8776
8777	if (op == OP_BRA \|\| op == OP_BRAPOS \|\|
8778	op == OP_SBRA \|\| op == OP_SBRAPOS)
8779	{
8780	if (!is_startline(scode, bracket_map, cd, atomcount, inassert)) return FALSE;
8781	}
8782
8783	/ Capturing brackets /
8784
8785	else if (op == OP_CBRA \|\| op == OP_CBRAPOS \|\|
8786	op == OP_SCBRA \|\| op == OP_SCBRAPOS)
8787	{
8788	int n = GET2(scode, `1`+LINK_SIZE);
8789	int new_map = bracket_map \| ((n < `32`)? (`1` << n) : `1`);
8790	if (!is_startline(scode, new_map, cd, atomcount, inassert)) return FALSE;
8791	}
8792
8793	/ Positive forward assertions /
8794
8795	else if (op == OP_ASSERT)
8796	{
8797	if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
8798	}
8799
8800	/ Atomic brackets /
8801
8802	else if (op == OP_ONCE \|\| op == OP_ONCE_NC)
8803	{
8804	if (!is_startline(scode, bracket_map, cd, atomcount + `1`, inassert)) return FALSE;
8805	}
8806
8807	/ .* means "start at start or after \n" if it isn't in atomic brackets or*
8808	brackets that may be referenced or an assertion, as long as the pattern does
8809	not contain PRUNE or SKIP, because these break the feature. Consider, for
8810	example, /.?a(PRUNE)b/ with the subject "aab", which matches "ab", i.e.
8811	not at the start of a line. /*
8812
8813	else if (op == OP_TYPESTAR \|\| op == OP_TYPEMINSTAR \|\| op == OP_TYPEPOSSTAR)
8814	{
8815	if (scode[`1`] != OP_ANY \|\| (bracket_map & cd->backref_map) != `0` \|\|
8816	atomcount > `0` \|\| cd->had_pruneorskip \|\| inassert)
8817	return FALSE;
8818	}
8819
8820	/ Check for explicit circumflex; anything else gives a FALSE result. Note*
8821	in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
8822	because the number of characters matched by . cannot be adjusted inside*
8823	them. /*
8824
8825	else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8826
8827	/ Move on to the next alternative /
8828
8829	code += GET(code, `1`);
8830	}
8831	while (code == OP_ALT); /* Loop for each alternative /
8832	return TRUE;
8833	}
8834
8835
8836
8837	/*************************************************
8838	* Check for asserted fixed first char *
8839	*************************************************/
8840
8841	/ During compilation, the "first char" settings from forward assertions are*
8842	discarded, because they can cause conflicts with actual literals that follow.
8843	However, if we end up without a first char setting for an unanchored pattern,
8844	it is worth scanning the regex to see if there is an initial asserted first
8845	char. If all branches start with the same asserted char, or with a
8846	non-conditional bracket all of whose alternatives start with the same asserted
8847	char (recurse ad lib), then we return that char, with the flags set to zero or
8848	REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
8849
8850	Arguments:
8851	code points to start of expression (the bracket)
8852	flags points to the first char flags, or to REQ_NONE
8853	inassert TRUE if in an assertion
8854
8855	Returns: the fixed first char, or 0 with REQ_NONE in flags
8856	*/
8857
8858	static pcre_uint32
8859	find_firstassertedchar(const pcre_uchar code, pcre_int32 flags,
8860	BOOL inassert)
8861	{
8862	register pcre_uint32 c = `0`;
8863	int cflags = REQ_NONE;
8864
8865	*flags = REQ_NONE;
8866	do {
8867	pcre_uint32 d;
8868	int dflags;
8869	int xl = (code == OP_CBRA \|\| code == OP_SCBRA \|\|
8870	code == OP_CBRAPOS \|\| code == OP_SCBRAPOS)? IMM2_SIZE:`0`;
8871	const pcre_uchar *scode = first_significant_code(code + `1`+LINK_SIZE + xl,
8872	TRUE);
8873	register pcre_uchar op = *scode;
8874
8875	switch(op)
8876	{
8877	default:
8878	return `0`;
8879
8880	case OP_BRA:
8881	case OP_BRAPOS:
8882	case OP_CBRA:
8883	case OP_SCBRA:
8884	case OP_CBRAPOS:
8885	case OP_SCBRAPOS:
8886	case OP_ASSERT:
8887	case OP_ONCE:
8888	case OP_ONCE_NC:
8889	d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
8890	if (dflags < `0`)
8891	return `0`;
8892	if (cflags < `0`) { c = d; cflags = dflags; } else if (c != d \|\| cflags != dflags) return `0`;
8893	break;
8894
8895	case OP_EXACT:
8896	scode += IMM2_SIZE;
8897	/ Fall through /
8898
8899	case OP_CHAR:
8900	case OP_PLUS:
8901	case OP_MINPLUS:
8902	case OP_POSPLUS:
8903	if (!inassert) return `0`;
8904	if (cflags < `0`) { c = scode[`1`]; cflags = `0`; }
8905	else if (c != scode[`1`]) return `0`;
8906	break;
8907
8908	case OP_EXACTI:
8909	scode += IMM2_SIZE;
8910	/ Fall through /
8911
8912	case OP_CHARI:
8913	case OP_PLUSI:
8914	case OP_MINPLUSI:
8915	case OP_POSPLUSI:
8916	if (!inassert) return `0`;
8917	if (cflags < `0`) { c = scode[`1`]; cflags = REQ_CASELESS; }
8918	else if (c != scode[`1`]) return `0`;
8919	break;
8920	}
8921
8922	code += GET(code, `1`);
8923	}
8924	while (*code == OP_ALT);
8925
8926	*flags = cflags;
8927	return c;
8928	}
8929
8930
8931
8932	/*************************************************
8933	* Add an entry to the name/number table *
8934	*************************************************/
8935
8936	/ This function is called between compiling passes to add an entry to the*
8937	name/number table, maintaining alphabetical order. Checking for permitted
8938	and forbidden duplicates has already been done.
8939
8940	Arguments:
8941	cd the compile data block
8942	name the name to add
8943	length the length of the name
8944	groupno the group number
8945
8946	Returns: nothing
8947	*/
8948
8949	static void
8950	add_name(compile_data cd, const* pcre_uchar name, int* length,
8951	unsigned int groupno)
8952	{
8953	int i;
8954	pcre_uchar *slot = cd->name_table;
8955
8956	for (i = `0`; i < cd->names_found; i++)
8957	{
8958	int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
8959	if (crc == `0` && slot[IMM2_SIZE+length] != `0`)
8960	crc = -`1`; / Current name is a substring /
8961
8962	/ Make space in the table and break the loop for an earlier name. For a*
8963	duplicate or later name, carry on. We do this for duplicates so that in the
8964	simple case (when ?(\| is not used) they are in order of their numbers. In all
8965	cases they are in the order in which they appear in the pattern. /*
8966
8967	if (crc < `0`)
8968	{
8969	memmove(slot + cd->name_entry_size, slot,
8970	IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
8971	break;
8972	}
8973
8974	/ Continue the loop for a later or duplicate name /
8975
8976	slot += cd->name_entry_size;
8977	}
8978
8979	PUT2(slot, `0`, groupno);
8980	memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
8981	slot[IMM2_SIZE + length] = `0`;
8982	cd->names_found++;
8983	}
8984
8985
8986
8987	/*************************************************
8988	* Compile a Regular Expression *
8989	*************************************************/
8990
8991	/ This function takes a string and returns a pointer to a block of store*
8992	holding a compiled version of the expression. The original API for this
8993	function had no error code return variable; it is retained for backwards
8994	compatibility. The new function is given a new name.
8995
8996	Arguments:
8997	pattern the regular expression
8998	options various option bits
8999	errorcodeptr pointer to error code variable (pcre_compile2() only)
9000	can be NULL if you don't want a code value
9001	errorptr pointer to pointer to error text
9002	erroroffset ptr offset in pattern where error was detected
9003	tables pointer to character tables or NULL
9004
9005	Returns: pointer to compiled data block, or NULL on error,
9006	with errorptr and erroroffset set
9007	*/
9008
9009	#if defined COMPILE_PCRE8
9010	PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
9011	pcre_compile(const char pattern, int* options, const char **errorptr,
9012	int erroroffset, const* unsigned char *tables)
9013	#elif defined COMPILE_PCRE16
9014	PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
9015	pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
9016	int erroroffset, const* unsigned char *tables)
9017	#elif defined COMPILE_PCRE32
9018	PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
9019	pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr,
9020	int erroroffset, const* unsigned char *tables)
9021	#endif
9022	{
9023	#if defined COMPILE_PCRE8
9024	return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
9025	#elif defined COMPILE_PCRE16
9026	return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
9027	#elif defined COMPILE_PCRE32
9028	return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
9029	#endif
9030	}
9031
9032
9033	#if defined COMPILE_PCRE8
9034	PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
9035	pcre_compile2(const char pattern, int* options, int *errorcodeptr,
9036	const char *errorptr, int* erroroffset, const* unsigned char *tables)
9037	#elif defined COMPILE_PCRE16
9038	PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
9039	pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
9040	const char *errorptr, int* erroroffset, const* unsigned char *tables)
9041	#elif defined COMPILE_PCRE32
9042	PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
9043	pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr,
9044	const char *errorptr, int* erroroffset, const* unsigned char *tables)
9045	#endif
9046	{
9047	REAL_PCRE *re;
9048	int length = `1`; / For final END opcode /
9049	pcre_int32 firstcharflags, reqcharflags;
9050	pcre_uint32 firstchar, reqchar;
9051	pcre_uint32 limit_match = PCRE_UINT32_MAX;
9052	pcre_uint32 limit_recursion = PCRE_UINT32_MAX;
9053	int newline;
9054	int errorcode = `0`;
9055	int skipatstart = `0`;
9056	BOOL utf;
9057	BOOL never_utf = FALSE;
9058	size_t size;
9059	pcre_uchar *code;
9060	const pcre_uchar *codestart;
9061	const pcre_uchar *ptr;
9062	compile_data compile_block;
9063	compile_data *cd = &compile_block;
9064
9065	/ This space is used for "compiling" into during the first phase, when we are*
9066	computing the amount of memory that is needed. Compiled items are thrown away
9067	as soon as possible, so that a fairly large buffer should be sufficient for
9068	this purpose. The same space is used in the second phase for remembering where
9069	to fill in forward references to subpatterns. That may overflow, in which case
9070	new memory is obtained from malloc(). /*
9071
9072	pcre_uchar cworkspace[COMPILE_WORK_SIZE];
9073
9074	/ This vector is used for remembering name groups during the pre-compile. In a*
9075	similar way to cworkspace, it can be expanded using malloc() if necessary. /*
9076
9077	named_group named_groups[NAMED_GROUP_LIST_SIZE];
9078
9079	/ Set this early so that early errors get offset 0. /
9080
9081	ptr = (const pcre_uchar *)pattern;
9082
9083	/ We can't pass back an error message if errorptr is NULL; I guess the best we*
9084	can do is just return NULL, but we can set a code value if there is a code
9085	pointer. /*
9086
9087	if (errorptr == NULL)
9088	{
9089	if (errorcodeptr != NULL) *errorcodeptr = `99`;
9090	return NULL;
9091	}
9092
9093	*errorptr = NULL;
9094	if (errorcodeptr != NULL) *errorcodeptr = ERR0;
9095
9096	/ However, we can give a message for this error /
9097
9098	if (erroroffset == NULL)
9099	{
9100	errorcode = ERR16;
9101	goto PCRE_EARLY_ERROR_RETURN2;
9102	}
9103
9104	*erroroffset = `0`;
9105
9106	/ Set up pointers to the individual character tables /
9107
9108	if (tables == NULL) tables = PRIV(default_tables);
9109	cd->lcc = tables + lcc_offset;
9110	cd->fcc = tables + fcc_offset;
9111	cd->cbits = tables + cbits_offset;
9112	cd->ctypes = tables + ctypes_offset;
9113
9114	/ Check that all undefined public option bits are zero /
9115
9116	if ((options & ~PUBLIC_COMPILE_OPTIONS) != `0`)
9117	{
9118	errorcode = ERR17;
9119	goto PCRE_EARLY_ERROR_RETURN;
9120	}
9121
9122	/ If PCRE_NEVER_UTF is set, remember it. /
9123
9124	if ((options & PCRE_NEVER_UTF) != `0`) never_utf = TRUE;
9125
9126	/ Check for global one-time settings at the start of the pattern, and remember*
9127	the offset for later. /*
9128
9129	cd->external_flags = `0`; / Initialize here for LIMIT_MATCH/RECURSION /
9130
9131	while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9132	ptr[skipatstart+`1`] == CHAR_ASTERISK)
9133	{
9134	int newnl = `0`;
9135	int newbsr = `0`;
9136
9137	/ For completeness and backward compatibility, (UTFn) is supported in the
9138	relevant libraries, but (UTF) is generic and always supported. Note that*
9139	PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. /*
9140
9141	#ifdef COMPILE_PCRE8
9142	if (STRNCMP_UC_C8(ptr+skipatstart+`2`, STRING_UTF8_RIGHTPAR, `5`) == `0`)
9143	{ skipatstart += `7`; options \|= PCRE_UTF8; continue; }
9144	#endif
9145	#ifdef COMPILE_PCRE16
9146	if (STRNCMP_UC_C8(ptr+skipatstart+`2`, STRING_UTF16_RIGHTPAR, `6`) == `0`)
9147	{ skipatstart += `8`; options \|= PCRE_UTF16; continue; }
9148	#endif
9149	#ifdef COMPILE_PCRE32
9150	if (STRNCMP_UC_C8(ptr+skipatstart+`2`, STRING_UTF32_RIGHTPAR, `6`) == `0`)
9151	{ skipatstart += `8`; options \|= PCRE_UTF32; continue; }
9152	#endif
9153
9154	else if (STRNCMP_UC_C8(ptr+skipatstart+`2`, STRING_UTF_RIGHTPAR, `4`) == `0`)
9155	{ skipatstart += `6`; options \|= PCRE_UTF8; continue; }
9156	else if (STRNCMP_UC_C8(ptr+skipatstart+`2`, STRING_UCP_RIGHTPAR, `4`) == `0`)
9157	{ skipatstart += `6`; options \|= PCRE_UCP; continue; }
9158	else if (STRNCMP_UC_C8(ptr+skipatstart+`2`, STRING_NO_AUTO_POSSESS_RIGHTPAR, `16`) == `0`)
9159	{ skipatstart += `18`; options \|= PCRE_NO_AUTO_POSSESS; continue; }
9160	else if (STRNCMP_UC_C8(ptr+skipatstart+`2`, STRING_NO_START_OPT_RIGHTPAR, `13`) == `0`)
9161	{ skipatstart += `15`; options \|= PCRE_NO_START_OPTIMIZE; continue; }
9162
9163	else if (STRNCMP_UC_C8(ptr+skipatstart+`2`, STRING_LIMIT_MATCH_EQ, `12`) == `0`)
9164	{
9165	pcre_uint32 c = `0`;
9166	int p = skipatstart + `14`;
9167	while (isdigit(ptr[p]))
9168	{
9169	if (c > PCRE_UINT32_MAX / `10` - `1`) break; / Integer overflow /
9170	c = c*`10` + ptr[p++] - CHAR_0;
9171	}
9172	if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9173	if (c < limit_match)
9174	{
9175	limit_match = c;
9176	cd->external_flags \|= PCRE_MLSET;
9177	}
9178	skipatstart = p;
9179	continue;
9180	}
9181
9182	else if (STRNCMP_UC_C8(ptr+skipatstart+`2`, STRING_LIMIT_RECURSION_EQ, `16`) == `0`)
9183	{
9184	pcre_uint32 c = `0`;
9185	int p = skipatstart + `18`;
9186	while (isdigit(ptr[p]))
9187	{
9188	if (c > PCRE_UINT32_MAX / `10` - `1`) break; / Integer overflow check /
9189	c = c*`10` + ptr[p++] - CHAR_0;
9190	}
9191	if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9192	if (c < limit_recursion)
9193	{
9194	limit_recursion = c;
9195	cd->external_flags \|= PCRE_RLSET;
9196	}
9197	skipatstart = p;
9198	continue;
9199	}
9200
9201	if (STRNCMP_UC_C8(ptr+skipatstart+`2`, STRING_CR_RIGHTPAR, `3`) == `0`)
9202	{ skipatstart += `5`; newnl = PCRE_NEWLINE_CR; }
9203	else if (STRNCMP_UC_C8(ptr+skipatstart+`2`, STRING_LF_RIGHTPAR, `3`) == `0`)
9204	{ skipatstart += `5`; newnl = PCRE_NEWLINE_LF; }
9205	else if (STRNCMP_UC_C8(ptr+skipatstart+`2`, STRING_CRLF_RIGHTPAR, `5`) == `0`)
9206	{ skipatstart += `7`; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
9207	else if (STRNCMP_UC_C8(ptr+skipatstart+`2`, STRING_ANY_RIGHTPAR, `4`) == `0`)
9208	{ skipatstart += `6`; newnl = PCRE_NEWLINE_ANY; }
9209	else if (STRNCMP_UC_C8(ptr+skipatstart+`2`, STRING_ANYCRLF_RIGHTPAR, `8`) == `0`)
9210	{ skipatstart += `10`; newnl = PCRE_NEWLINE_ANYCRLF; }
9211
9212	else if (STRNCMP_UC_C8(ptr+skipatstart+`2`, STRING_BSR_ANYCRLF_RIGHTPAR, `12`) == `0`)
9213	{ skipatstart += `14`; newbsr = PCRE_BSR_ANYCRLF; }
9214	else if (STRNCMP_UC_C8(ptr+skipatstart+`2`, STRING_BSR_UNICODE_RIGHTPAR, `12`) == `0`)
9215	{ skipatstart += `14`; newbsr = PCRE_BSR_UNICODE; }
9216
9217	if (newnl != `0`)
9218	options = (options & ~PCRE_NEWLINE_BITS) \| newnl;
9219	else if (newbsr != `0`)
9220	options = (options & ~(PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE)) \| newbsr;
9221	else break;
9222	}
9223
9224	/ PCRE_UTF(16\|32) have the same value as PCRE_UTF8. /
9225	utf = (options & PCRE_UTF8) != `0`;
9226	if (utf && never_utf)
9227	{
9228	errorcode = ERR78;
9229	goto PCRE_EARLY_ERROR_RETURN2;
9230	}
9231
9232	/ Can't support UTF unless PCRE has been compiled to include the code. The*
9233	return of an error code from PRIV(valid_utf)() is a new feature, introduced in
9234	release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
9235	not used here. /*
9236
9237	#ifdef SUPPORT_UTF
9238	if (utf && (options & PCRE_NO_UTF8_CHECK) == `0` &&
9239	(errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -`1`, erroroffset)) != `0`)
9240	{
9241	#if defined COMPILE_PCRE8
9242	errorcode = ERR44;
9243	#elif defined COMPILE_PCRE16
9244	errorcode = ERR74;
9245	#elif defined COMPILE_PCRE32
9246	errorcode = ERR77;
9247	#endif
9248	goto PCRE_EARLY_ERROR_RETURN2;
9249	}
9250	#else
9251	if (utf)
9252	{
9253	errorcode = ERR32;
9254	goto PCRE_EARLY_ERROR_RETURN;
9255	}
9256	#endif
9257
9258	/ Can't support UCP unless PCRE has been compiled to include the code. /
9259
9260	#ifndef SUPPORT_UCP
9261	if ((options & PCRE_UCP) != `0`)
9262	{
9263	errorcode = ERR67;
9264	goto PCRE_EARLY_ERROR_RETURN;
9265	}
9266	#endif
9267
9268	/ Check validity of \R options. /
9269
9270	if ((options & (PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE)) ==
9271	(PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE))
9272	{
9273	errorcode = ERR56;
9274	goto PCRE_EARLY_ERROR_RETURN;
9275	}
9276
9277	/ Handle different types of newline. The three bits give seven cases. The*
9278	current code allows for fixed one- or two-byte sequences, plus "any" and
9279	"anycrlf". /*
9280
9281	switch (options & PCRE_NEWLINE_BITS)
9282	{
9283	case `0`: newline = NEWLINE; break; / Build-time default /
9284	case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
9285	case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
9286	case PCRE_NEWLINE_CR+
9287	PCRE_NEWLINE_LF: newline = (CHAR_CR << `8`) \| CHAR_NL; break;
9288	case PCRE_NEWLINE_ANY: newline = -`1`; break;
9289	case PCRE_NEWLINE_ANYCRLF: newline = -`2`; break;
9290	default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
9291	}
9292
9293	if (newline == -`2`)
9294	{
9295	cd->nltype = NLTYPE_ANYCRLF;
9296	}
9297	else if (newline < `0`)
9298	{
9299	cd->nltype = NLTYPE_ANY;
9300	}
9301	else
9302	{
9303	cd->nltype = NLTYPE_FIXED;
9304	if (newline > `255`)
9305	{
9306	cd->nllen = `2`;
9307	cd->nl[`0`] = (newline >> `8`) & `255`;
9308	cd->nl[`1`] = newline & `255`;
9309	}
9310	else
9311	{
9312	cd->nllen = `1`;
9313	cd->nl[`0`] = newline;
9314	}
9315	}
9316
9317	/ Maximum back reference and backref bitmap. The bitmap records up to 31 back*
9318	references to help in deciding whether (.) can be treated as anchored or not.*
9319	*/
9320
9321	cd->top_backref = `0`;
9322	cd->backref_map = `0`;
9323
9324	/ Reflect pattern for debugging output /
9325
9326	DPRINTF(("------------------------------------------------------------------\n"));
9327	#ifdef PCRE_DEBUG
9328	print_puchar(stdout, (PCRE_PUCHAR)pattern);
9329	#endif
9330	DPRINTF(("\n"));
9331
9332	/ Pretend to compile the pattern while actually just accumulating the length*
9333	of memory required. This behaviour is triggered by passing a non-NULL final
9334	argument to compile_regex(). We pass a block of workspace (cworkspace) for it
9335	to compile parts of the pattern into; the compiled code is discarded when it is
9336	no longer needed, so hopefully this workspace will never overflow, though there
9337	is a test for its doing so. /*
9338
9339	cd->bracount = cd->final_bracount = `0`;
9340	cd->names_found = `0`;
9341	cd->name_entry_size = `0`;
9342	cd->name_table = NULL;
9343	cd->dupnames = FALSE;
9344	cd->dupgroups = FALSE;
9345	cd->namedrefcount = `0`;
9346	cd->start_code = cworkspace;
9347	cd->hwm = cworkspace;
9348	cd->iscondassert = FALSE;
9349	cd->start_workspace = cworkspace;
9350	cd->workspace_size = COMPILE_WORK_SIZE;
9351	cd->named_groups = named_groups;
9352	cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
9353	cd->start_pattern = (const pcre_uchar *)pattern;
9354	cd->end_pattern = (const pcre_uchar )(pattern + STRLEN_UC((const* pcre_uchar *)pattern));
9355	cd->req_varyopt = `0`;
9356	cd->parens_depth = `0`;
9357	cd->assert_depth = `0`;
9358	cd->max_lookbehind = `0`;
9359	cd->external_options = options;
9360	cd->open_caps = NULL;
9361
9362	/ Now do the pre-compile. On error, errorcode will be set non-zero, so we*
9363	don't need to look at the result of the function here. The initial options have
9364	been put into the cd block so that they can be changed if an option setting is
9365	found within the regex right at the beginning. Bringing initial option settings
9366	outside can help speed up starting point checks. /*
9367
9368	ptr += skipatstart;
9369	code = cworkspace;
9370	*code = OP_BRA;
9371
9372	(void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
9373	FALSE, `0`, `0`, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
9374	cd, &length);
9375	if (errorcode != `0`) goto PCRE_EARLY_ERROR_RETURN;
9376
9377	DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
9378	(int)(cd->hwm - cworkspace)));
9379
9380	if (length > MAX_PATTERN_SIZE)
9381	{
9382	errorcode = ERR20;
9383	goto PCRE_EARLY_ERROR_RETURN;
9384	}
9385
9386	/ Compute the size of the data block for storing the compiled pattern. Integer*
9387	overflow should no longer be possible because nowadays we limit the maximum
9388	value of cd->names_found and cd->name_entry_size. /*
9389
9390	size = sizeof(REAL_PCRE) +
9391	(length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
9392
9393	/ Get the memory. /
9394
9395	re = (REAL_PCRE *)(PUBL(malloc))(size);
9396	if (re == NULL)
9397	{
9398	errorcode = ERR21;
9399	goto PCRE_EARLY_ERROR_RETURN;
9400	}
9401
9402	/ Put in the magic number, and save the sizes, initial options, internal*
9403	flags, and character table pointer. NULL is used for the default character
9404	tables. The nullpad field is at the end; it's there to help in the case when a
9405	regex compiled on a system with 4-byte pointers is run on another with 8-byte
9406	pointers. /*
9407
9408	re->magic_number = MAGIC_NUMBER;
9409	re->size = (int)size;
9410	re->options = cd->external_options;
9411	re->flags = cd->external_flags;
9412	re->limit_match = limit_match;
9413	re->limit_recursion = limit_recursion;
9414	re->first_char = `0`;
9415	re->req_char = `0`;
9416	re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
9417	re->name_entry_size = cd->name_entry_size;
9418	re->name_count = cd->names_found;
9419	re->ref_count = `0`;
9420	re->tables = (tables == PRIV(default_tables))? NULL : tables;
9421	re->nullpad = NULL;
9422	#ifdef COMPILE_PCRE32
9423	re->dummy = `0`;
9424	#else
9425	re->dummy1 = re->dummy2 = re->dummy3 = `0`;
9426	#endif
9427
9428	/ The starting points of the name/number translation table and of the code are*
9429	passed around in the compile data block. The start/end pattern and initial
9430	options are already set from the pre-compile phase, as is the name_entry_size
9431	field. Reset the bracket count and the names_found field. Also reset the hwm
9432	field; this time it's used for remembering forward references to subpatterns.
9433	*/
9434
9435	cd->final_bracount = cd->bracount; / Save for checking forward references /
9436	cd->parens_depth = `0`;
9437	cd->assert_depth = `0`;
9438	cd->bracount = `0`;
9439	cd->max_lookbehind = `0`;
9440	cd->name_table = (pcre_uchar *)re + re->name_table_offset;
9441	codestart = cd->name_table + re->name_entry_size * re->name_count;
9442	cd->start_code = codestart;
9443	cd->hwm = (pcre_uchar *)(cd->start_workspace);
9444	cd->iscondassert = FALSE;
9445	cd->req_varyopt = `0`;
9446	cd->had_accept = FALSE;
9447	cd->had_pruneorskip = FALSE;
9448	cd->check_lookbehind = FALSE;
9449	cd->open_caps = NULL;
9450
9451	/ If any named groups were found, create the name/number table from the list*
9452	created in the first pass. /*
9453
9454	if (cd->names_found > `0`)
9455	{
9456	int i = cd->names_found;
9457	named_group *ng = cd->named_groups;
9458	cd->names_found = `0`;
9459	for (; i > `0`; i--, ng++)
9460	add_name(cd, ng->name, ng->length, ng->number);
9461	if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
9462	(PUBL(free))((void *)cd->named_groups);
9463	}
9464
9465	/ Set up a starting, non-extracting bracket, then compile the expression. On*
9466	error, errorcode will be set non-zero, so we don't need to look at the result
9467	of the function here. /*
9468
9469	ptr = (const pcre_uchar *)pattern + skipatstart;
9470	code = (pcre_uchar *)codestart;
9471	*code = OP_BRA;
9472	(void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, `0`, `0`,
9473	&firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL);
9474	re->top_bracket = cd->bracount;
9475	re->top_backref = cd->top_backref;
9476	re->max_lookbehind = cd->max_lookbehind;
9477	re->flags = cd->external_flags \| PCRE_MODE;
9478
9479	if (cd->had_accept)
9480	{
9481	reqchar = `0`; / Must disable after (ACCEPT) /*
9482	reqcharflags = REQ_NONE;
9483	}
9484
9485	/ If not reached end of pattern on success, there's an excess bracket. /
9486
9487	if (errorcode == `0` && *ptr != CHAR_NULL) errorcode = ERR22;
9488
9489	/ Fill in the terminating state and check for disastrous overflow, but*
9490	if debugging, leave the test till after things are printed out. /*
9491
9492	*code++ = OP_END;
9493
9494	#ifndef PCRE_DEBUG
9495	if (code - codestart > length) errorcode = ERR23;
9496	#endif
9497
9498	#ifdef SUPPORT_VALGRIND
9499	/ If the estimated length exceeds the really used length, mark the extra*
9500	allocated memory as unaddressable, so that any out-of-bound reads can be
9501	detected. /*
9502	VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar));
9503	#endif
9504
9505	/ Fill in any forward references that are required. There may be repeated*
9506	references; optimize for them, as searching a large regex takes time. /*
9507
9508	if (cd->hwm > cd->start_workspace)
9509	{
9510	int prev_recno = -`1`;
9511	const pcre_uchar *groupptr = NULL;
9512	while (errorcode == `0` && cd->hwm > cd->start_workspace)
9513	{
9514	int offset, recno;
9515	cd->hwm -= LINK_SIZE;
9516	offset = GET(cd->hwm, `0`);
9517
9518	/ Check that the hwm handling hasn't gone wrong. This whole area is*
9519	rewritten in PCRE2 because there are some obscure cases. /*
9520
9521	if (offset == `0` \|\| codestart[offset-`1`] != OP_RECURSE)
9522	{
9523	errorcode = ERR10;
9524	break;
9525	}
9526
9527	recno = GET(codestart, offset);
9528	if (recno != prev_recno)
9529	{
9530	groupptr = PRIV(find_bracket)(codestart, utf, recno);
9531	prev_recno = recno;
9532	}
9533	if (groupptr == NULL) errorcode = ERR53;
9534	else PUT(((pcre_uchar )codestart), offset, (int*)(groupptr - codestart));
9535	}
9536	}
9537
9538	/ If the workspace had to be expanded, free the new memory. Set the pointer to*
9539	NULL to indicate that forward references have been filled in. /*
9540
9541	if (cd->workspace_size > COMPILE_WORK_SIZE)
9542	(PUBL(free))((void *)cd->start_workspace);
9543	cd->start_workspace = NULL;
9544
9545	/ Give an error if there's back reference to a non-existent capturing*
9546	subpattern. /*
9547
9548	if (errorcode == `0` && re->top_backref > re->top_bracket) errorcode = ERR15;
9549
9550	/ Unless disabled, check whether any single character iterators can be*
9551	auto-possessified. The function overwrites the appropriate opcode values, so
9552	the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
9553	used in this code because at least one compiler gives a warning about loss of
9554	"const" attribute if the cast (pcre_uchar )codestart is used directly in the*
9555	function call. /*
9556
9557	if (errorcode == `0` && (options & PCRE_NO_AUTO_POSSESS) == `0`)
9558	{
9559	pcre_uchar temp = (pcre_uchar )codestart;
9560	auto_possessify(temp, utf, cd);
9561	}
9562
9563	/ If there were any lookbehind assertions that contained OP_RECURSE*
9564	(recursions or subroutine calls), a flag is set for them to be checked here,
9565	because they may contain forward references. Actual recursions cannot be fixed
9566	length, but subroutine calls can. It is done like this so that those without
9567	OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The
9568	exceptional ones forgo this. We scan the pattern to check that they are fixed
9569	length, and set their lengths. /*
9570
9571	if (errorcode == `0` && cd->check_lookbehind)
9572	{
9573	pcre_uchar cc = (pcre_uchar )codestart;
9574
9575	/ Loop, searching for OP_REVERSE items, and process those that do not have*
9576	their length set. (Actually, it will also re-process any that have a length
9577	of zero, but that is a pathological case, and it does no harm.) When we find
9578	one, we temporarily terminate the branch it is in while we scan it. /*
9579
9580	for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -`1`);
9581	cc != NULL;
9582	cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -`1`))
9583	{
9584	if (GET(cc, `1`) == `0`)
9585	{
9586	int fixed_length;
9587	pcre_uchar *be = cc - `1` - LINK_SIZE + GET(cc, -LINK_SIZE);
9588	int end_op = *be;
9589	*be = OP_END;
9590	fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != `0`, TRUE,
9591	cd, NULL);
9592	*be = end_op;
9593	DPRINTF(("fixed length = %d\n", fixed_length));
9594	if (fixed_length < `0`)
9595	{
9596	errorcode = (fixed_length == -`2`)? ERR36 :
9597	(fixed_length == -`4`)? ERR70 : ERR25;
9598	break;
9599	}
9600	if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
9601	PUT(cc, `1`, fixed_length);
9602	}
9603	cc += `1` + LINK_SIZE;
9604	}
9605	}
9606
9607	/ Failed to compile, or error while post-processing /
9608
9609	if (errorcode != `0`)
9610	{
9611	(PUBL(free))(re);
9612	PCRE_EARLY_ERROR_RETURN:
9613	erroroffset = (int)(ptr - (const* pcre_uchar *)pattern);
9614	PCRE_EARLY_ERROR_RETURN2:
9615	*errorptr = find_error_text(errorcode);
9616	if (errorcodeptr != NULL) *errorcodeptr = errorcode;
9617	return NULL;
9618	}
9619
9620	/* If the anchored option was not passed, set the flag if we can determine that
9621	the pattern is anchored by virtue of ^ characters or \A or anything else, such
9622	as starting with non-atomic .* when DOTALL is set and there are no occurrences
9623	of *PRUNE or *SKIP.
9624
9625	Otherwise, if we know what the first byte has to be, save it, because that
9626	speeds up unanchored matches no end. If not, see if we can set the
9627	PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
9628	start with ^. and also when all branches start with non-atomic .* for
9629	non-DOTALL matches when *PRUNE and *SKIP are not present. */
9630
9631	if ((re->options & PCRE_ANCHORED) == `0`)
9632	{
9633	if (is_anchored(codestart, `0`, cd, `0`)) re->options \|= PCRE_ANCHORED;
9634	else
9635	{
9636	if (firstcharflags < `0`)
9637	firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE);
9638	if (firstcharflags >= `0`) / Remove caseless flag for non-caseable chars /
9639	{
9640	#if defined COMPILE_PCRE8
9641	re->first_char = firstchar & `0xff`;
9642	#elif defined COMPILE_PCRE16
9643	re->first_char = firstchar & `0xffff`;
9644	#elif defined COMPILE_PCRE32
9645	re->first_char = firstchar;
9646	#endif
9647	if ((firstcharflags & REQ_CASELESS) != `0`)
9648	{
9649	#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9650	/* We ignore non-ASCII first chars in 8 bit mode. */
9651	if (utf)
9652	{
9653	if (re->first_char < `128`)
9654	{
9655	if (cd->fcc[re->first_char] != re->first_char)
9656	re->flags \|= PCRE_FCH_CASELESS;
9657	}
9658	else if (UCD_OTHERCASE(re->first_char) != re->first_char)
9659	re->flags \|= PCRE_FCH_CASELESS;
9660	}
9661	else
9662	#endif
9663	if (MAX_255(re->first_char)
9664	&& cd->fcc[re->first_char] != re->first_char)
9665	re->flags \|= PCRE_FCH_CASELESS;
9666	}
9667
9668	re->flags \|= PCRE_FIRSTSET;
9669	}
9670
9671	else if (is_startline(codestart, `0`, cd, `0`, FALSE)) re->flags \|= PCRE_STARTLINE;
9672	}
9673	}
9674
9675	/* For an anchored pattern, we use the "required byte" only if it follows a
9676	variable length item in the regex. Remove the caseless flag for non-caseable
9677	bytes. */
9678
9679	if (reqcharflags >= `0` &&
9680	((re->options & PCRE_ANCHORED) == `0` \|\| (reqcharflags & REQ_VARY) != `0`))
9681	{
9682	#if defined COMPILE_PCRE8
9683	re->req_char = reqchar & `0xff`;
9684	#elif defined COMPILE_PCRE16
9685	re->req_char = reqchar & `0xffff`;
9686	#elif defined COMPILE_PCRE32
9687	re->req_char = reqchar;
9688	#endif
9689	if ((reqcharflags & REQ_CASELESS) != `0`)
9690	{
9691	#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9692	/* We ignore non-ASCII first chars in 8 bit mode. */
9693	if (utf)
9694	{
9695	if (re->req_char < `128`)
9696	{
9697	if (cd->fcc[re->req_char] != re->req_char)
9698	re->flags \|= PCRE_RCH_CASELESS;
9699	}
9700	else if (UCD_OTHERCASE(re->req_char) != re->req_char)
9701	re->flags \|= PCRE_RCH_CASELESS;
9702	}
9703	else
9704	#endif
9705	if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
9706	re->flags \|= PCRE_RCH_CASELESS;
9707	}
9708
9709	re->flags \|= PCRE_REQCHSET;
9710	}
9711
9712	/* Print out the compiled data if debugging is enabled. This is never the
9713	case when building a production library. */
9714
9715	#ifdef PCRE_DEBUG
9716	printf("Length = %d top_bracket = %d top_backref = %d\n",
9717	length, re->top_bracket, re->top_backref);
9718
9719	printf("Options=%08x\n", re->options);
9720
9721	if ((re->flags & PCRE_FIRSTSET) != `0`)
9722	{
9723	pcre_uchar ch = re->first_char;
9724	const char *caseless =
9725	((re->flags & PCRE_FCH_CASELESS) == `0`)? "" : " (caseless)";
9726	if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
9727	else printf("First char = \\x%02x%s\n", ch, caseless);
9728	}
9729
9730	if ((re->flags & PCRE_REQCHSET) != `0`)
9731	{
9732	pcre_uchar ch = re->req_char;
9733	const char *caseless =
9734	((re->flags & PCRE_RCH_CASELESS) == `0`)? "" : " (caseless)";
9735	if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
9736	else printf("Req char = \\x%02x%s\n", ch, caseless);
9737	}
9738
9739	#if defined COMPILE_PCRE8
9740	pcre_printint((pcre *)re, stdout, TRUE);
9741	#elif defined COMPILE_PCRE16
9742	pcre16_printint((pcre *)re, stdout, TRUE);
9743	#elif defined COMPILE_PCRE32
9744	pcre32_printint((pcre *)re, stdout, TRUE);
9745	#endif
9746
9747	/* This check is done here in the debugging case so that the code that
9748	was compiled can be seen. */
9749
9750	if (code - codestart > length)
9751	{
9752	(PUBL(free))(re);
9753	*errorptr = find_error_text(ERR23);
9754	erroroffset = ptr - (pcre_uchar )pattern;
9755	if (errorcodeptr != NULL) *errorcodeptr = ERR23;
9756	return NULL;
9757	}
9758	#endif /* PCRE_DEBUG */
9759
9760	/* Check for a pattern that can match an empty string, so that this information
9761	can be provided to applications. */
9762
9763	do
9764	{
9765	if (could_be_empty_branch(codestart, code, utf, cd, NULL))
9766	{
9767	re->flags \|= PCRE_MATCH_EMPTY;
9768	break;
9769	}
9770	codestart += GET(codestart, `1`);
9771	}
9772	while (*codestart == OP_ALT);
9773
9774	#if defined COMPILE_PCRE8
9775	return (pcre *)re;
9776	#elif defined COMPILE_PCRE16
9777	return (pcre16 *)re;
9778	#elif defined COMPILE_PCRE32
9779	return (pcre32 *)re;
9780	#endif
9781	}
9782
9783	/* End of pcre_compile.c */
9784
9785

Browse the source code of ClickHouse/contrib/poco/Foundation/src/pcre_compile.c